PHP curl 多线程爬虫


几天没写了,主要都是自己的学习过程,贴一下curl / curl_multi_exec的一些代码,mark一下。




<?php
/**
* Created by PhpStorm.
* Date: 2017/2/23
* Time: 10:46
*/

//curl_multi_exec,proxy写一个简单多线程爬虫。




/**
 * Fetch a single URL with cURL, optionally through a proxy.
 *
 * @param string      $url   URL to request.
 * @param string|null $proxy Proxy address (e.g. "host:port"). When null (the
 *                           default) CURLOPT_PROXY is not set at all.
 * @param array       $auth  Proxy credentials; the elements are joined with
 *                           ':' (e.g. array('user', 'pass')). Ignored if empty.
 *
 * @return string|false The response body, or false if the transfer failed.
 */
function curl_crawl($url, $proxy = null, $auth = array()){
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_TIMEOUT, 10);
    // Request headers are sent via CURLOPT_HTTPHEADER. CURLOPT_HEADER is a
    // boolean that would instead prepend the response headers to the output.
    curl_setopt($ch, CURLOPT_HTTPHEADER, array('Connection: close'));
    if (isset($proxy)) {
        curl_setopt($ch, CURLOPT_PROXY, $proxy);
    }
    if (!empty($auth)) {
        curl_setopt($ch, CURLOPT_PROXYUSERPWD, join(':', $auth));
    }
    $content = curl_exec($ch);
    curl_close($ch);
    return $content;
}

/*//$proxy = '200.255.220.211:8080';
$url = 'http://demo.com:8080/2.23/server.php';
$content = curl_crawl($url, null);
echo $content;
*/
/**
 * Fetch several URLs concurrently through the curl_multi API.
 *
 * @param array $url     List of URLs to fetch. Duplicate URLs are requested
 *                       only once, since results are keyed by URL.
 * @param array $proxy   Optional pool of proxy addresses; one entry is picked
 *                       at random for every request.
 * @param array $auth    Optional proxy credentials, joined with ':'. Only
 *                       applied when a proxy is used.
 * @param int   $threads Only the value 1 (one request per URL) is handled.
 *                       Any other value queues no requests, so the result is
 *                       empty. NOTE(review): the "single URL, several threads"
 *                       mode hinted at by this parameter is not implemented.
 *
 * @return array Map of URL => content from curl_multi_getcontent(). The same
 *               array is also printed with var_dump(), as before.
 */
function curl_multi_craw($url = array(), $proxy = array(), $auth = array(), $threads = 1){
    $mul = curl_multi_init();
    $curl_handlers = array();
    $results = array();

    // One request per URL (not the "single URL, several threads" case).
    if ($threads === 1) {
        // De-duplicate: handles are keyed by URL, so a repeated URL would
        // overwrite its entry and leave the first handle attached forever.
        foreach (array_unique($url) as $target) {
            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $target);
            curl_setopt($ch, CURLOPT_HEADER, 0);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
            curl_setopt($ch, CURLOPT_MAXREDIRS, 5);
            if (!empty($proxy)) {
                // array_rand() works for any key layout, not only 0..n-1.
                curl_setopt($ch, CURLOPT_PROXY, $proxy[array_rand($proxy)]);
                if (!empty($auth)) {
                    curl_setopt($ch, CURLOPT_PROXYUSERPWD, join(':', $auth));
                }
            }
            $curl_handlers[$target] = $ch;
            curl_multi_add_handle($mul, $ch);
        }
    }

    // Start the transfers.
    $active = null;
    do {
        $mrc = curl_multi_exec($mul, $active);
    } while ($mrc === CURLM_CALL_MULTI_PERFORM);

    // Drive them until every handle has finished.
    while ($active && $mrc === CURLM_OK) {
        // curl_multi_select() may return -1 right away; without a pause and
        // a fresh curl_multi_exec() call the loop would spin without progress.
        if (curl_multi_select($mul) === -1) {
            usleep(100);
        }
        do {
            $mrc = curl_multi_exec($mul, $active);
        } while ($mrc === CURLM_CALL_MULTI_PERFORM);
    }

    // Collect the bodies and release every handle.
    foreach ($curl_handlers as $target => $handler) {
        $results[$target] = curl_multi_getcontent($handler);
        curl_multi_remove_handle($mul, $handler);
        curl_close($handler);
    }
    curl_multi_close($mul);

    var_dump($results);
    return $results;
}

/*
$proxy = array('127.0.0.1:8888');
$url = array('http://demo.com:8080/2.23/server.php', 'http://www.domain.com');
curl_multi_craw($url, $proxy);
*/





0 个评论

要回复文章请先登录注册