基于curl封装一个爬虫

<?php  
// Stop immediately with an error message when the BASEPATH constant has not
// been defined by the time this file is loaded.
if ( ! defined('BASEPATH')) exit('No direct script access allowed');
/**
 * CodeIgniter
 *
 * An open source application development framework for PHP 5.1.6 or newer
 *
 * @package		CodeIgniter
 * @author		ExpressionEngine Dev Team
 * @copyright	Copyright (c) 2008 - 2011, EllisLab, Inc.
 * @license		http://codeigniter.com/user_guide/license.html
 * @link		http://codeigniter.com
 * @since		Version 1.0
 * @filesource
 */

// ------------------------------------------------------------------------

/**
 * Thin wrapper around the cURL extension that fetches either one URL or a
 * batch of URLs concurrently (via the curl_multi_* API).
 *
 * Typical use: $crawler->deal($url) for a single URL (returns the raw
 * response string) or $crawler->deal(array($id => $url, ...)) for a batch
 * (returns an array of raw responses keyed by the caller's ids).
 */
class Crawler {
	// URL string, or array of id => URL, last given to set_url()
	private $_url;
	// cURL easy handle in single mode, cURL multi handle in batch mode
	private $_handle;
	// Bookkeeping for batch mode; rebuilt by every init_handle() call
	private $_batch_config = array(
			'handle_array' => array(), // handle key => easy handle that is still open
			'map_array' => array()     // handle key => caller-supplied URL id
	);
	// Options applied to every easy handle through curl_setopt_array()
	private $_curl_option;
	// TRUE: one easy handle; FALSE: a batch of handles driven through curl_multi
	private $_single_or_multi = TRUE;

	/**
	 * Installs the default cURL options (60s timeout, follow redirects,
	 * return the transfer as a string, include response headers, no signals).
	 */
	function __construct() {
		$this->_curl_option = array(
				CURLOPT_TIMEOUT => 60,
				CURLOPT_FOLLOWLOCATION => TRUE,
				CURLOPT_RETURNTRANSFER => TRUE,
				CURLOPT_HEADER => TRUE,
				CURLOPT_NOSIGNAL => TRUE,
		);
	}

	/**
	 * Convenience entry point: set the URL(s), create the handle(s) and run.
	 *
	 * @param string|array $url One URL, or an array of id => URL.
	 * @return mixed Raw response string (FALSE on failure) for a single URL,
	 *               or an array of id => raw response for a batch.
	 */
	public function deal($url) {
		$this->set_url($url);
		$this->init_handle();

		return $this->get_response();
	}

	/**
	 * Stores the target URL(s) and selects single or batch mode accordingly.
	 *
	 * @param string|array $url One URL, or an array of id => URL.
	 */
	public function set_url($url) {
		$this->_url = $url;
		$this->_single_or_multi = is_array($url) ? FALSE : TRUE;
	}

	/**
	 * Replaces (does not merge with) the cURL options used for new handles.
	 *
	 * @param array $option Map of CURLOPT_* constant => value.
	 */
	public function set_curl_option($option) {
		$this->_curl_option = $option;
	}

	/**
	 * Creates the cURL handle(s) for the URL(s) given to set_url().
	 *
	 * In batch mode one easy handle per URL is created and attached to a
	 * fresh multi handle; URLs for which curl_init() fails are skipped.
	 */
	public function init_handle() {
		// Start every run from a clean slate so that handles and ids from a
		// previous deal() call cannot leak into this one.
		$this->_batch_config = array(
				'handle_array' => array(),
				'map_array' => array()
		);

		if(is_array($this->_url)) {
			$this->_handle = curl_multi_init();
			foreach ($this->_url as $id=>$s_url) {
				$ch = curl_init($s_url);
				if($ch === FALSE) {
					continue;
				}
				curl_setopt_array($ch, $this->_curl_option);
				curl_multi_add_handle($this->_handle, $ch);

				$key = $this->handle_key($ch);
				$this->_batch_config['handle_array'][$key] = $ch;
				$this->_batch_config['map_array'][$key] = $id;
			}
		}
		else {
			$this->_handle = curl_init($this->_url);
			if($this->_handle !== FALSE) {
				curl_setopt_array($this->_handle, $this->_curl_option);
			}
		}
	}

	/**
	 * Executes the prepared request(s) and releases every cURL handle.
	 *
	 * @return mixed Single mode: the curl_exec() result (FALSE when the
	 *               transfer failed or no handle was prepared).
	 *               Batch mode: array of id => content for each completed
	 *               transfer, in completion order.
	 */
	public function get_response() {
		if($this->_single_or_multi) {
			if( ! $this->_handle) {
				return FALSE;
			}
			$response = curl_exec($this->_handle);
			curl_close($this->_handle);
			$this->_handle = NULL;

			return $response;
		}

		$responses = array();
		if( ! $this->_handle) {
			return $responses;
		}

		$active = 0;
		do
		{
			while(($code = curl_multi_exec($this->_handle , $active)) == CURLM_CALL_MULTI_PERFORM);
			if($code != CURLM_OK)
			{
				break;
			}

			// Collect every transfer that has finished since the last pass
			while($done = curl_multi_info_read($this->_handle))
			{
				$key = $this->handle_key($done['handle']);
				$responses[$this->_batch_config['map_array'][$key]] = curl_multi_getcontent($done['handle']);

				// Detach and close the finished handle, and forget it so the
				// clean-up below does not touch it a second time
				curl_multi_remove_handle($this->_handle , $done['handle']);
				curl_close($done['handle']);
				unset($this->_batch_config['handle_array'][$key]);
			}

			// Wait for activity on any connection instead of spinning
			if($active > 0)
			{
				if(curl_multi_select($this->_handle , 0.5) === -1)
				{
					// select() failed; back off briefly to avoid a busy loop
					usleep(1000);
				}
			}
		}
		while($active);

		// Handles still registered here never completed (the loop was left
		// early on a multi error); release them explicitly.
		foreach ($this->_batch_config['handle_array'] as $ch) {
			curl_multi_remove_handle($this->_handle , $ch);
			curl_close($ch);
		}
		curl_multi_close($this->_handle);
		$this->_handle = NULL;
		$this->_batch_config = array(
				'handle_array' => array(),
				'map_array' => array()
		);

		return $responses;
	}

	/**
	 * Returns an integer that identifies a cURL easy handle for use as an
	 * array key. Handles are resources before PHP 8 and objects from PHP 8
	 * on, and objects cannot be cast to string, so both forms are handled.
	 *
	 * @param resource|object $ch cURL easy handle.
	 * @return int
	 */
	private function handle_key($ch) {
		return is_resource($ch) ? (int) $ch : spl_object_id($ch);
	}
}

今天在CI下写一个爬虫库,此爬虫支持单url抓取和多线程批量url抓取,以及自定义正则匹配所需要的内容。

未完成

===========2013.05.06============

完成多线程部分,类代码待完善

curl获取头信息遇到的诡异事件

My problematic code:

<?php
// Reproduces the unexpected "405 Method Not Allowed" reply: the headers are
// requested together with CURLOPT_NOBODY, which asks cURL to leave the body
// out of the transfer (per the PHP manual the request method becomes HEAD).
$ch = curl_init('http://www.tudou.com/programs/view/qyT7G6gVFSs');
curl_setopt_array($ch, array(
	CURLOPT_HEADER => 1,
	CURLOPT_NOBODY => true,
	CURLOPT_RETURNTRANSFER => 1,
));
$response = curl_exec($ch);
curl_close($ch);
var_dump($response);

the response is

string(241) “HTTP/1.1 405 Method Not Allowed Server: Tengine/1.4.0 Date: Sat, 01 Dec 2012 15:53:32 GMT Content-Type: text/html;charset=GBK Content-Length: 1085 Connection: close appSrv: itemview-app4-app_admin Vary: Accept-Encoding Allow: GET ”

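The 405 appears because CURLOPT_NOBODY makes cURL send a HEAD request, and this server only allows GET (see the "Allow: GET" header above). Without that option the request works; my corrected code is: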

<?php
// Same request as the failing variant, but without CURLOPT_NOBODY: the
// response headers are still included in the returned string.
$ch = curl_init('http://www.tudou.com/programs/view/qyT7G6gVFSs');
curl_setopt_array($ch, array(
	CURLOPT_HEADER => 1,
	CURLOPT_RETURNTRANSFER => 1,
));
$response = curl_exec($ch);
curl_close($ch);
var_dump($response);

The result is: string(313) “HTTP/1.1 302 Moved Temporarily Server: Tengine/1.4.0 Date: Sat, 01 Dec 2012 16:17:25 GMT Content-Length: 0 Connection: close appSrv: itemview-app5-app_admin Vary: Accept-Encoding Pragma: No-Cache Cache-Control: no-cache, no-store Expires: Thu, 01 Jan 1970 00:00:00 GMT Location: http://tv.tudou.com/ ”

php中的(多线程)curl批量处理

/**
 * Requests the headers of several URLs concurrently through curl_multi.
 *
 * Every transfer is configured with CURLOPT_HEADER and CURLOPT_NOBODY, so
 * only response headers are fetched; each finished response is passed
 * through $this->parseHead() before being stored.
 *
 * @param array $urls    Map of id => URL.
 * @param int   $timeout Per-transfer timeout in seconds (CURLOPT_TIMEOUT).
 * @return array Map of id => parsed header array for each completed
 *               transfer, in completion order.
 */
function getMultiUrls($urls, $timeout = 30)
	{
		$queue = curl_multi_init();
		$map = array();     // handle key => caller-supplied id
		$pending = array(); // handle key => easy handle that is still open

		foreach($urls as $id=>$url)
		{
			$ch = curl_init();
			curl_setopt($ch , CURLOPT_URL , $url);
			curl_setopt($ch , CURLOPT_TIMEOUT , $timeout);
			curl_setopt($ch , CURLOPT_RETURNTRANSFER , 1);
			curl_setopt($ch , CURLOPT_HEADER , true);
			curl_setopt($ch , CURLOPT_NOBODY , true);
			curl_setopt($ch , CURLOPT_NOSIGNAL , true);

			curl_multi_add_handle($queue , $ch);

			// cURL handles are resources before PHP 8 and objects afterwards;
			// objects cannot be cast to string, so derive an integer key.
			$key = is_resource($ch) ? (int) $ch : spl_object_id($ch);
			$map[$key] = $id;
			$pending[$key] = $ch;
		}

		$responses = array();
		$active = 0;
		do
		{
			while(($code = curl_multi_exec($queue , $active)) == CURLM_CALL_MULTI_PERFORM);
			if($code != CURLM_OK)
			{
				break;
			}

			// Collect every transfer that has finished since the last pass
			while($done = curl_multi_info_read($queue))
			{
				$finished = $done['handle'];
				$key = is_resource($finished) ? (int) $finished : spl_object_id($finished);

				$content = curl_multi_getcontent($finished);
				$responses[$map[$key]] = $this->parseHead($content);

				// Detach and close the finished handle
				curl_multi_remove_handle($queue , $finished);
				curl_close($finished);
				unset($pending[$key]);
			}

			// Wait for activity on any connection instead of spinning
			if($active > 0)
			{
				if(curl_multi_select($queue , 0.5) === -1)
				{
					// select() failed; back off briefly to avoid a busy loop
					usleep(1000);
				}
			}
		}
		while($active);

		// Release handles that never completed (loop left early on an error)
		foreach($pending as $ch)
		{
			curl_multi_remove_handle($queue , $ch);
			curl_close($ch);
		}
		curl_multi_close($queue);
		return $responses;
	}
	
	/**
	 * Parses a raw HTTP response header block into an associative array.
	 *
	 * The status line is split on spaces into "http-edition" (protocol),
	 * "http-state" (status code) and "http-describe" (reason phrase, each
	 * word prefixed with one space). Every following "Name: value" line is
	 * stored under the lower-cased name; the value is everything after the
	 * first colon, unmodified (so it normally keeps its leading space).
	 * Parsing stops at the first empty line. A repeated header name
	 * overwrites the earlier value.
	 *
	 * Missing status-line parts yield empty strings, and lines that have no
	 * colon (or start with one) are ignored.
	 *
	 * @param string $headSream Raw header text, lines separated by CRLF or LF.
	 * @return array Parsed header fields.
	 */
	function parseHead($headSream)
	{
		$headArray = array(
			"http-edition" => "",
			"http-state" => "",
			"http-describe" => "",
		);

		// Accept bare LF as well as CRLF line endings
		$lines = preg_split('/\r?\n/', (string) $headSream);
		if ( ! is_array($lines))
		{
			return $headArray;
		}

		// Status line: "<protocol> <code> <reason phrase...>"
		$statusParts = explode(" ", (string) array_shift($lines));
		$headArray["http-edition"] = trim($statusParts[0]);
		if (isset($statusParts[1]))
		{
			$headArray["http-state"] = trim($statusParts[1]);
		}
		$partCount = count($statusParts);
		for ($i = 2; $i < $partCount; $i++)
		{
			$headArray["http-describe"] .= " ".trim($statusParts[$i]);
		}

		foreach ($lines as $line)
		{
			// An empty line terminates the header block
			if ($line === '')
			{
				break;
			}

			$colon = strpos($line, ':');
			if ($colon === FALSE || $colon === 0)
			{
				// Not a "Name: value" line; skip it rather than store it
				// under an empty key
				continue;
			}

			// Cast guards against substr() returning FALSE on older PHP
			// versions when the colon is the last character
			$headArray[strtolower(substr($line, 0, $colon))] = (string) substr($line, $colon + 1);
		}

		return $headArray;
	}