20 June 2016

做程序开发的时候,经常会遇到产品汪要求你抓取某某网站的数据,我本人是很反感这样的需求的,但是呢,既然是强需求,代码还得撸。


程序抓取有很多种方式
1.简单粗暴 :file_get_contents();file_put_contents();
2.fread(),fwrite(),fopen(),feof(),fgets(),fclose()等一系列文件操作函数。
3.curl函数。

ps:我在这里主要写了一点基于 curl_multi 的并发抓取代码(也就是常说的“curl多线程”,实际上是单线程内的并发请求),仅供参考。

$dbOperate = new DbOperate();
// Each run fetches one fixed-size page of URLs, selected by CLI arguments:
//   $argv[1] = page size (rows per run)
//   $argv[2] = 1-based page number
// The resulting $limit string has the form "<offset>,<page size>".

// Validate both arguments before doing arithmetic on them: missing or
// non-numeric values would otherwise raise notices (or a TypeError on PHP 8)
// and would place raw command-line text into the limit string.
$pageSizeArg = isset($argv[1]) ? (string) $argv[1] : '';
$pageNoArg   = isset($argv[2]) ? (string) $argv[2] : '';
if (!preg_match('/^[0-9]+$/', $pageSizeArg) || !preg_match('/^[0-9]+$/', $pageNoArg)
    || (int) $pageSizeArg < 1 || (int) $pageNoArg < 1) {
    fwrite(STDERR, 'usage: php <script> <page_size> <page_number> (both positive integers)'.PHP_EOL);
    exit(1);
}
$pageSize = (int) $pageSizeArg;
$pageNo   = (int) $pageNoArg;

$offset = ($pageNo - 1) * $pageSize;
$limit  = $offset.','.$pageSize;
// NOTE(review): presumably getUrlList() uses $limit as an SQL LIMIT clause — confirm in DbOperate.
$urllist = $dbOperate->getUrlList('cat2=20 ',$limit);   // fetch the fuel-filter product links

if(!empty($urllist)){
    main($urllist,$dbOperate);
}

/**
 * Download every URL in $urllist concurrently with curl_multi and save each
 * HTTP 200 response body to detail/<sp_id>.html in this script's directory.
 *
 * For each row a status line is echoed: the save path plus an ok/fail message
 * for 200 responses, a "not found" message for 404, and a generic error
 * message for any other status code.
 *
 * @param array $urllist   Rows that each provide 'sp_url', 'id' and 'sp_id' keys.
 * @param mixed $dbOperate Accepted for interface compatibility; not used in this function.
 * @return void
 */
function main($urllist,$dbOperate){
    if (empty($urllist)) {
        return;
    }

    // Initialise the multi handle that drives all transfers.
    $mh = curl_multi_init();
    $handle = array();

    // Create one easy handle per URL and queue it on the multi handle.
    foreach ($urllist as $i => $urldetail) {
        $url = $urldetail['sp_url'];
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_TIMEOUT, 40);
        curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)');
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); // follow 302 redirects
        curl_setopt($ch, CURLOPT_MAXREDIRS, 10);
        curl_multi_add_handle($mh, $ch);
        // Keep the handle together with the row identifiers needed after the transfer.
        $handle[$i][0] = $ch;
        $handle[$i][1] = $urldetail['id'];
        $handle[$i][2] = $urldetail['sp_id'];
    }

    // Start the transfers.
    do {
        $mrc = curl_multi_exec($mh, $active);
    } while ($mrc == CURLM_CALL_MULTI_PERFORM);

    // Pump the multi handle until no transfer is active any more.
    while ($active && $mrc == CURLM_OK) {
        if (curl_multi_select($mh) == -1) {
            // select() failed; back off briefly instead of spinning.
            usleep(1000);
        }
        do {
            $mrc = curl_multi_exec($mh, $active);
        } while ($mrc == CURLM_CALL_MULTI_PERFORM);
    }

    // Process the finished transfers.
    foreach ($handle as $ch) {
        $httpinfo = curl_getinfo($ch[0]);
        if ($httpinfo['http_code'] == 200) {
            $html = curl_multi_getcontent($ch[0]);

            /* Optional: cut out only the relevant part of the page
            $pos = strpos($html,'<div class="g_detail" id="comment">');
            $substr = strstr($html,'<div class="c_foot1">',true);
            $content = substr($substr,$pos);
            */

            // Save the product detail page HTML to a local file.
            $savePath = dirname(__FILE__).'/detail/'.$ch[2].'.html';
            echo $savePath.PHP_EOL;

            $fp = fopen($savePath,'w+');
            if ($fp === false) {
                // The file could not be opened (e.g. missing or unwritable
                // directory): report the failure instead of passing false
                // to fwrite(), which is a fatal TypeError on PHP 8.
                echo " fail to get detail .".PHP_EOL;
                continue;
            }
            $status = fwrite($fp, (string) $html);
            // Close the handle explicitly so it is released right away.
            fclose($fp);

            // A false or zero-byte write is reported as a failure.
            echo $status ? $ch[2].'-- is ok.'.PHP_EOL : " fail to get detail .".PHP_EOL;

        } elseif ($httpinfo['http_code'] == 404) {
            echo 'the goods is not found.';
        } else {
            echo 'error page.';
        }
    }

    // Detach every easy handle from the multi handle, then close it.
    foreach ($handle as $ch) {
        curl_multi_remove_handle($mh, $ch[0]);
    }
    curl_multi_close($mh);
}