前端之家收集整理的这篇文章主要介绍了 crawler,前端之家小编觉得挺不错的,现在分享给大家,也给大家做个参考。
#!/usr/bin/perl
use
strict;
use
warnings;
use
threads;
use
threads::shared;
use
Thread::Queue;
use
Thread::Semaphore;
use
Bloom::Filter;
use
URI;
use
URI::URL;
use
Web::Scraper;
use
LWP::Simple;
use
LWP::UserAgent;
use
HTTP::Cookies;
use
String::Diff;
use
String::Diff
qw(diff_fully diff diff_merge diff_regexp)
;
use
URI::Split
qw(uri_split uri_join)
;
# Counter shared between all threads, starting at zero. The :shared attribute
# already marks the variable as shared, so no separate share() call is needed.
my $fid :shared = 0;
# Cookie file location, relative to $HOME (a path inside a Firefox profile).
# NOTE(review): HTTP::Cookies reads and writes its own text format; with
# autosave enabled it may overwrite this SQLite file when the jar is
# destroyed -- confirm this is intended before running against a real profile.
my $cookie_jar = '.mozilla/firefox/bg146ia6.default/cookies.sqlite';

# HTTP client: 15-second timeout, http/https only, fixed MSIE 6 agent string.
my $tmp_ua = LWP::UserAgent->new(
    timeout           => 15,
    protocols_allowed => [ 'http', 'https' ],
    agent             => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727;.NET CLR 3.0.04506.30; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)',
);

# Attach a file-backed cookie jar that saves itself automatically.
$tmp_ua->cookie_jar(
    HTTP::Cookies->new(
        'file'     => "$ENV{'HOME'}/$cookie_jar",
        'autosave' => 1,
    )
);

# Also follow redirects issued in response to POST requests.
push @{ $tmp_ua->requests_redirectable }, 'POST';
# Crawl configuration and the state handed to the worker threads.

# Value used to size $semaphore below (presumably the cap on concurrently
# running workers -- the code that acquires it is not in this section).
my $max_threads = 5;

# Seed URL of the crawl. It was previously used without ever being declared,
# which `use strict` rejects at compile time; it is now taken from the first
# command-line argument.
my $base_url = shift @ARGV;
die "Usage: $0 <base_url>\n"
    unless defined $base_url && length $base_url;

# Host part of the seed URL, reported once at start-up.
my $host = URI::URL->new($base_url)->host;
print "Host Name: $host.\n";

# Queue of URLs, seeded below with the base URL.
my $queue = Thread::Queue->new();

# Counting semaphore initialised to $max_threads, and a second semaphore with
# a count of one.
my $semaphore = Thread::Semaphore->new($max_threads);
my $mutex     = Thread::Semaphore->new(1);

# Shared Bloom filter (capacity 1,000,000, error rate 0.001); presumably used
# to avoid re-queueing URLs that were already seen.
my $filter = shared_clone(
    Bloom::Filter->new(
        capacity   => 1000000,
        error_rate => 0.001,
    )
);

# Seed the queue and the filter with the starting URL.
$queue->enqueue($base_url);
$filter->add($base_url);

# List that starts out holding only the base URL.
my @tmp_url = ();
push(@tmp_url, $base_url);
while
( 1 )
{
foreach
( threads->list(threads::joinable) )
{
@H_633_
502@
$_
->
join
( );
}
@H_
502_526@