采集 PHP
程序员文章站
2022-04-06 10:44:16
...
本采集类充分利用正则表达式强大的字符串处理能力,使用简单,功能相对精简,能满足一般的采集需求,并在不断完善中。使用流程:设置一个起始 URL,添加导航(列表页)规则,添加采集字段及其规则,最后运行并保存或输出结果即可。
<?php
// Scraping can take a long time; lift the execution time limit.
set_time_limit(0);
// Send the content-type header only while headers can still be modified.
if (!headers_sent()) {
    header("Content-type: text/html; charset=utf-8");
}

/**
 * Simple regular-expression driven web scraper.
 *
 * Typical usage: register one or more start (list) URLs, register the link
 * pattern for layer 0, register the fields to extract with their patterns,
 * call run(), then output or save the collected rows.
 *
 * The code is kept compatible with old PHP 5 installations (no short array
 * syntax, no type declarations) while also running on PHP 8.
 *
 * @author shooting
 * @version 1.0.0
 */
class spider
{
    /**
     * Addresses of the terminal (detail) pages. Not used by the code in this class.
     *
     * @var array
     */
    public $pages = array();

    /**
     * Collected rows; each row maps field name => extracted value.
     *
     * @var array
     */
    public $result = array();

    /**
     * First-layer (list) page URLs. A URL may contain a "{from,to}" page range.
     *
     * @var array
     */
    public $startUrls = array();

    /**
     * Connection timeout in seconds, passed to fsockopen().
     *
     * @var integer
     */
    public $timeout;

    /**
     * Raw response (headers and body, before charset conversion) of the last fetch.
     *
     * @var string
     */
    public $httpContent;

    /**
     * Information parsed from the last response; currently only the 'charset' key.
     *
     * @var array
     */
    public $httpHead = array();

    /**
     * Custom request headers (name => value) set through setHead().
     *
     * @var array
     */
    public $putHead = array();

    /**
     * Field definitions registered through addField(), keyed by field name.
     *
     * @var array
     */
    public $field_arr = array();

    /**
     * Number of layers. Not used by the code in this class.
     *
     * @var integer
     */
    public $deep;

    /**
     * Layer definitions registered through addLayer(); run() reads layer 0's 'pattern'.
     *
     * @var array
     */
    public $layout_arr = array();

    /**
     * Maximum number of detail pages collected per run(); 0 means unlimited.
     *
     * @var integer
     */
    public $limit = 0;

    /**
     * Duration of the last run() in seconds.
     *
     * @var float
     */
    public $runtime = 0;

    /**
     * Charset of the scraped pages. When it is not UTF-8 the response is
     * converted from this charset to UTF-8.
     *
     * @var string
     */
    public $charset = 'UTF-8';

    /**
     * Referer that was passed to the last getContent() call.
     *
     * @var string
     */
    public $httpreferer;

    /**
     * Next-page links of a field are only followed while this is 0.
     *
     * @var integer
     */
    public $pagelimit = 0;

    /**
     * Output directory. Not used by the code in this class.
     *
     * @var string
     */
    public $filepath = './';

    /**
     * Initialise the default timeout.
     *
     * PHP 8 no longer treats a method named after the class as a constructor,
     * so the initialisation has to live in __construct().
     */
    public function __construct()
    {
        $this->timeout = 30;
    }

    /**
     * Legacy PHP 4 style initialiser, kept so explicit spider() calls keep working.
     */
    public function spider()
    {
        $this->timeout = 30;
    }

    /**
     * Run the scraper over all start URLs.
     *
     * @return array all collected rows (also available as $this->result)
     */
    public function run()
    {
        $begintime = $this->microtime_float();

        if (!isset($this->layout_arr[0]['pattern'])) {
            $this->log('No link pattern defined for layer 0; call addLayer(0, ...) before run().');
        } else {
            $linkPattern = $this->layout_arr[0]['pattern'];
            $cnt = 1;
            foreach ($this->startUrls as $starturl) {
                foreach ($this->expandStartUrl($starturl) as $listUrl) {
                    // Stop fetching list pages as soon as the limit is exhausted.
                    if (!$this->canCollect($cnt)) {
                        break 2;
                    }
                    $cnt = $this->collectFromList($listUrl, $linkPattern, $cnt);
                }
            }
        }

        $this->runtime = $this->microtime_float() - $begintime;
        return $this->result;
    }

    /**
     * Expand a start URL containing a "{from,to}" page range into one URL per page.
     *
     * Every URL is built from the untouched template; replacing the placeholder
     * in the template itself would make all later pages equal to the first one.
     *
     * @param string $starturl
     * @return array list of concrete list-page URLs
     */
    private function expandStartUrl($starturl)
    {
        if (!preg_match('~\{(\d+),(\d+)\}~', $starturl, $pagenum)) {
            return array($starturl);
        }

        $urls = array();
        $pageend = intval($pagenum[2]);
        for ($page = intval($pagenum[1]); $page <= $pageend; $page++) {
            $urls[] = str_replace($pagenum[0], (string)$page, $starturl);
        }
        return $urls;
    }

    /**
     * Tell whether another detail page may be collected.
     *
     * @param integer $cnt 1-based number of the page about to be collected
     * @return boolean
     */
    private function canCollect($cnt)
    {
        return $this->limit == 0 || ($this->limit > 0 && $cnt <= $this->limit);
    }

    /**
     * Fetch one list page and collect every detail page linked from it.
     *
     * @param string  $listUrl     URL of the list page
     * @param string  $linkPattern layer 0 link pattern
     * @param integer $cnt         number of the next page to collect
     * @return integer the updated counter
     */
    private function collectFromList($listUrl, $linkPattern, $cnt)
    {
        $urllists = $this->getLists($linkPattern, $this->getContent($listUrl));
        foreach ($urllists as $url) {
            if (!$this->canCollect($cnt)) {
                break;
            }
            $page = $this->getContent($url, $listUrl);
            if ($page === '') {
                // The fetch failed (already logged); do not record an empty row.
                continue;
            }
            $this->filterContent($page);
            $cnt++;
        }
        return $cnt;
    }

    /**
     * Extract the list of URLs matching a pattern from a piece of text.
     *
     * "{*}" in the pattern is a wildcard for any run of characters other than
     * quotes and ">". A pattern without "{*}" is treated as a fixed URL.
     *
     * @param string $pattern
     * @param string $content
     * @return array unique matches (keys are not re-indexed)
     */
    public function getLists($pattern = '', $content = '')
    {
        if (strpos($pattern, '{*}') === false) {
            return array($pattern);
        }

        // Quote with the delimiter so a "~" inside the pattern cannot break the regex.
        $regex = str_replace(preg_quote('{*}', '~'), '([^\'">]*)', preg_quote($pattern, '~'));
        if (!preg_match_all('~' . $regex . '~is', $content, $preg_rs)) {
            return array();
        }
        return array_unique($preg_rs[0]);
    }

    /**
     * Fetch a URL and return the raw HTTP response (headers and body),
     * converted to UTF-8 when a different source charset is known.
     *
     * @param string $url     absolute URL, or a URL relative to $referer
     * @param string $referer referring page; also the base for relative URLs
     * @return string the response, or '' when the request could not be made
     */
    public function getContent($url, $referer = '')
    {
        $url = $this->urlRtoA($url, $referer);

        $parts = parse_url($url);
        $scheme = (is_array($parts) && isset($parts['scheme'])) ? strtolower($parts['scheme']) : '';
        if (!is_array($parts) || empty($parts['host']) || ($scheme !== 'http' && $scheme !== 'https')) {
            $this->log('Unsupported or malformed URL: ' . $url);
            return '';
        }

        $host = $parts['host'];
        $isHttps = ($scheme === 'https');
        $defaultPort = $isHttps ? 443 : 80;
        $port = isset($parts['port']) ? (int)$parts['port'] : $defaultPort;
        $path = (isset($parts['path']) && $parts['path'] !== '') ? $parts['path'] : '/';
        if (isset($parts['query'])) {
            $path .= '?' . $parts['query'];
        }

        $errno = 0;
        $errstr = '';
        $fsp = fsockopen(($isHttps ? 'ssl://' : '') . $host, $port, $errno, $errstr, $this->timeout);
        if (!$fsp) {
            // Without a connection there is nothing to write to or read from.
            $this->log($errstr . '(' . $errno . ')');
            return '';
        }

        // Build the headers per request so a computed Referer is never cached
        // in $this->putHead and reused for unrelated pages.
        $headers = array(
            'Accept'     => '*/*',
            'User-Agent' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2)',
            'Referer'    => ($referer == '') ? 'http://' . $host : $referer,
        );
        foreach ($this->putHead as $headname => $headvalue) {
            // "Refer" was the misspelled key used by earlier versions of this class.
            $name = (trim($headname) === 'Refer') ? 'Referer' : trim($headname);
            $headers[$name] = trim($headvalue);
        }

        // Origin-form request target; the Host header carries the authority.
        $hostHeader = ($port == $defaultPort) ? $host : $host . ':' . $port;
        $output = "GET $path HTTP/1.0\r\nHost: $hostHeader\r\n";
        foreach ($headers as $headname => $headvalue) {
            // Strip line breaks so a value cannot inject additional headers.
            $output .= $headname . ': ' . str_replace(array("\r", "\n"), '', $headvalue) . "\r\n";
        }
        $output .= "Connection: close\r\n\r\n";

        if (fwrite($fsp, $output) === false) {
            fclose($fsp);
            $this->log('Failed to send the request to ' . $host);
            return '';
        }

        // Reads until EOF; unlike a feof() loop it cannot spin on a stalled stream.
        $content = stream_get_contents($fsp);
        fclose($fsp);
        if ($content === false) {
            $content = '';
        }

        $this->getHead($content);
        $this->httpContent = $content;

        // An explicitly configured charset wins over the detected one.
        $sourceCharset = '';
        if (strtoupper($this->charset) != 'UTF-8') {
            $sourceCharset = $this->charset;
        } elseif (!empty($this->httpHead['charset']) && $this->httpHead['charset'] != 'UTF-8') {
            $sourceCharset = $this->httpHead['charset'];
        }
        if ($sourceCharset !== '' && $content !== '') {
            $converted = iconv($sourceCharset, 'utf-8', $content);
            if ($converted === false) {
                // Keep the unconverted bytes instead of losing the whole page.
                $this->log('Charset conversion from ' . $sourceCharset . ' failed for ' . $url);
            } else {
                $content = $converted;
            }
        }

        $this->httpreferer = $referer;
        return $content;
    }

    /**
     * Extract every registered field from a page and append the row to the result.
     *
     * @param string $content page content
     * @return void
     */
    public function filterContent($content = '')
    {
        $rs = array();
        foreach ($this->field_arr as $field => $fieldinfo) {
            $rs[$field] = $this->getPregField($fieldinfo, $content);
        }
        $this->result[] = $rs;
    }

    /**
     * Convert a relative URL into an absolute one.
     *
     * Handles absolute http/https/ftp URLs, scheme-relative ("//host/x"),
     * root-relative ("/x"), query-only ("?a=b") and document-relative
     * references. "." and ".." path segments are not normalised.
     *
     * @param string $relative URL found in a page
     * @param string $referer  URL of the page the reference was found on
     * @return string the absolute URL; $relative (without fragment) when the
     *                referer cannot be parsed
     */
    public function urlRtoA($relative, $referer)
    {
        $relative = (string)$relative;
        $referer = (string)$referer;

        // Drop the fragment, including one at position 0.
        $pos = strpos($relative, '#');
        if ($pos !== false) {
            $relative = substr($relative, 0, $pos);
        }

        // Already absolute.
        if (preg_match('~^(https?|ftp)://~i', $relative)) {
            return $relative;
        }

        // Base document without fragment, and without query for path resolution.
        $baseNoFragment = preg_replace('~#.*$~s', '', $referer);
        $base = preg_replace('~\?.*$~s', '', $baseNoFragment);
        if (!preg_match('~^((https?|ftp)://[^/]+)(/.*)?$~i', $base, $preg_rs)) {
            return $relative;
        }
        $origin = $preg_rs[1];
        $scheme = $preg_rs[2];
        $basePath = isset($preg_rs[3]) ? $preg_rs[3] : '/';

        if ($relative === '') {
            // An empty reference points at the base document itself.
            return $baseNoFragment;
        }
        if (strpos($relative, '//') === 0) {
            return $scheme . ':' . $relative;
        }
        if ($relative[0] === '/') {
            return $origin . $relative;
        }
        if ($relative[0] === '?') {
            return $origin . $basePath . $relative;
        }

        $parentdir = substr($basePath, 0, strrpos($basePath, '/') + 1);
        return $origin . $parentdir . $relative;
    }

    /**
     * Extract one field from a page.
     *
     * A pattern that does not contain "{fieldname}" is a fixed value and is
     * returned unchanged. Otherwise the placeholder is captured, line breaks
     * are removed, the optional replace rule is applied and next-page links
     * are followed. The optional callback is applied once to the final text.
     *
     * @param array  $fieldinfo field definition as built by addField()
     * @param string $content   page content
     * @return string
     */
    public function getPregField($fieldinfo, $content)
    {
        // Fixed value: return it as is.
        if (strpos($fieldinfo['pattern'], '{' . $fieldinfo['field'] . '}') === false) {
            return $fieldinfo['pattern'];
        }

        $fieldresult = $this->extractFieldText($fieldinfo, $content, array());

        // Apply the callback once, after all pages have been joined, so text
        // from follow-up pages is not processed several times.
        if (!empty($fieldinfo['callback'])) {
            if (is_callable($fieldinfo['callback'])) {
                $fieldresult = call_user_func($fieldinfo['callback'], $fieldresult);
            } else {
                $this->log('Callback for field ' . $fieldinfo['field'] . ' is not callable.');
            }
        }
        return $fieldresult;
    }

    /**
     * Extract the raw text of a field from one page and its follow-up pages.
     *
     * @param array  $fieldinfo field definition as built by addField()
     * @param string $content   page content
     * @param array  $visited   next-page links already followed for this field
     * @return string
     */
    private function extractFieldText($fieldinfo, $content, $visited)
    {
        $field = $fieldinfo['field'];
        $placeholder = '{' . $field . '}';
        $capture = '(?P<' . $field . '>.*?)';

        $isRegular = isset($fieldinfo['isregular'])
            && ($fieldinfo['isregular'] === true || $fieldinfo['isregular'] === 'true');
        if ($isRegular) {
            // The pattern already is a regular expression.
            $pattern = str_replace($placeholder, $capture, $fieldinfo['pattern']);
        } else {
            // Literal pattern: quote it, including the "~" delimiter.
            $pattern = str_replace(
                preg_quote($placeholder, '~'),
                $capture,
                preg_quote($fieldinfo['pattern'], '~')
            );
        }

        $fieldresult = '';
        if (preg_match('~' . $pattern . '~is', $content, $preg_rs) && isset($preg_rs[$field])) {
            $fieldresult = $preg_rs[$field];
        }

        // Remove line breaks.
        $fieldresult = str_replace(array("\r", "\n"), '', $fieldresult);

        // Optional second pass: array(regex, replacement).
        $replace_arr = isset($fieldinfo['replace']) ? $fieldinfo['replace'] : '';
        if (is_array($replace_arr) && isset($replace_arr[0], $replace_arr[1])) {
            $replaced = preg_replace('~' . $replace_arr[0] . '~s', $replace_arr[1], $fieldresult);
            if ($replaced !== null) {
                $fieldresult = $replaced;
            }
        }

        // Follow the "next page" link of this field, if one is configured.
        if ($this->pagelimit == 0 && isset($fieldinfo['nextpage']) && $fieldinfo['nextpage'] != '') {
            $nextPattern = str_replace('{nextpage}', '(?P<nextpage>[^\'">]*?)', $fieldinfo['nextpage']);
            if (preg_match('~' . $nextPattern . '~is', $content, $next_rs)
                && isset($next_rs['nextpage'])
                && $next_rs['nextpage'] != ''
                // A link that was already followed would recurse forever.
                && !in_array($next_rs['nextpage'], $visited, true)
            ) {
                $visited[] = $next_rs['nextpage'];
                $fieldresult .= $this->extractFieldText(
                    $fieldinfo,
                    $this->getContent($next_rs['nextpage'], $this->httpreferer),
                    $visited
                );
            }
        }

        return $fieldresult;
    }

    /**
     * Register a field to extract.
     *
     * @param string $field       field name; "{field}" in $pattern marks the value
     * @param string $pattern     text around the value, or a fixed value when it
     *                            contains no "{field}" placeholder
     * @param mixed  $replace_arr optional array(regex, replacement) applied to the value
     * @param string $isregular   'true' when $pattern is a regular expression
     * @param string $nextpage    optional regex with a "{nextpage}" placeholder
     *                            locating the link to the next page
     * @param mixed  $callback    optional callable applied to the final value
     * @return void
     */
    public function addField($field, $pattern, $replace_arr = '', $isregular = 'false', $nextpage = '', $callback = '')
    {
        $this->field_arr[$field] = array(
            'field'     => $field,
            'pattern'   => $pattern,
            'replace'   => $replace_arr,
            'isregular' => $isregular,
            'nextpage'  => $nextpage,
            'callback'  => $callback,
        );
    }

    /**
     * Print the collected rows as an HTML dump.
     *
     * @return void
     */
    public function output()
    {
        echo "The result is:<br/>";
        echo "runtime :$this->runtime S<br/><pre>";
        // Scraped text is untrusted; escape it so it is shown instead of rendered.
        echo htmlspecialchars(print_r($this->result, true), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
        echo "</pre>";
    }

    /**
     * Save the collected rows as tab-separated lines.
     *
     * @param string $file target file
     * @return void
     */
    public function saveXls($file = 'spider_result.xls')
    {
        $fp = fopen($file, 'w');
        if (!$fp) {
            $this->log('Cannot open ' . $file . ' for writing.');
            return;
        }

        foreach ($this->result as $result) {
            $cells = array();
            foreach ($result as $value) {
                // Tabs and line breaks inside a value would break the row layout.
                $cells[] = str_replace(array("\t", "\r", "\n"), ' ', $value);
            }
            fputs($fp, implode("\t", $cells) . "\n");
        }
        fclose($fp);

        echo 'The result has been saved to ' . $file . '.<br/>Cost time:' . $this->runtime;
    }

    /**
     * Save the collected rows as SQL INSERT statements.
     *
     * @param string $table target table name
     * @param string $file  target file
     * @return void
     */
    public function saveSql($table = 'spider_result', $file = 'spider_result.sql')
    {
        $fp = fopen($file, 'w');
        if (!$fp) {
            $this->log('Cannot open ' . $file . ' for writing.');
            return;
        }

        $columns = array();
        foreach ($this->field_arr as $fieldinfo) {
            $columns[] = '`' . str_replace('`', '``', $fieldinfo['field']) . '`';
        }
        $sql_key = implode(', ', $columns);
        $tableName = str_replace('`', '``', $table);

        foreach ($this->result as $result) {
            $sql_value = array();
            foreach ($result as $value) {
                $sql_value[] = "'" . $this->addslash($value) . "'";
            }
            $line = "INSERT INTO `$tableName` ( $sql_key ) VALUES (" . join(', ', $sql_value) . ");\r\n";
            fputs($fp, $line);
        }
        fclose($fp);

        echo 'The result has been saved to ' . $file . '.<br/>Cost time:' . $this->runtime;
    }

    /**
     * Detect the charset of a raw response.
     *
     * The Content-Type response header is checked first, then <meta> tags in
     * the content. The result is stored upper-cased in $this->httpHead['charset']
     * ('' when nothing was found).
     *
     * @param string $content raw response (headers and body)
     * @return array $this->httpHead
     */
    public function getHead($content)
    {
        $parts = explode("\r\n\r\n", $content, 2);
        $head = $parts[0];

        $charset = '';
        if (preg_match('~^Content-Type:[^\r\n]*charset\s*=\s*["\']?([\w.:-]+)~im', $head, $preg_rs)) {
            $charset = $preg_rs[1];
        } elseif (preg_match('~<meta[^>]+charset\s*=\s*["\']?([\w.:-]+)~i', $content, $preg_rs)) {
            $charset = $preg_rs[1];
        }

        $this->httpHead['charset'] = strtoupper(trim($charset));
        return $this->httpHead;
    }

    /**
     * Set the charset of the scraped pages.
     * Call this before run() when the charset cannot be detected automatically.
     *
     * @param string $charset
     * @return void
     */
    public function setCharset($charset)
    {
        $this->charset = strtoupper($charset);
    }

    /**
     * Replace the list of first-layer page URLs.
     *
     * @param array $url_arr
     * @return void
     */
    public function setStartUrls($url_arr)
    {
        $this->startUrls = $url_arr;
    }

    /**
     * Add one first-layer page URL.
     *
     * @param string $url
     * @return void
     */
    public function addStartUrl($url)
    {
        $this->startUrls[] = $url;
    }

    /**
     * Register a layer definition. run() only reads the 'pattern' of layer 0;
     * the other values are stored as given.
     *
     * @param integer $deep        layer index
     * @param string  $layout      layer type label (e.g. 'list')
     * @param string  $pattern     link pattern; "{*}" is a wildcard
     * @param string  $isSimple    stored, not read by this class
     * @param string  $isPageBreak stored, not read by this class
     * @return void
     */
    public function addLayer($deep, $layout, $pattern = '', $isSimple = 'false', $isPageBreak = 'false')
    {
        $this->layout_arr[$deep] = array(
            'layout'      => $layout,
            'isSimple'    => $isSimple,
            'isPageBreak' => $isPageBreak,
            'pattern'     => $pattern,
        );
    }

    /**
     * Set a custom request header.
     *
     * @param string $name
     * @param string $value
     * @return void
     */
    public function setHead($name, $value)
    {
        $this->putHead[$name] = $value;
    }

    /**
     * Remove the opening and closing tags of the given elements (their inner
     * text is kept).
     *
     * @param string $content
     * @param string $cleartags tag names separated by "|"
     * @return string
     */
    public function clearHtml($content, $cleartags = 'p')
    {
        foreach (explode('|', $cleartags) as $cleartag) {
            // The lookahead stops "p" from also matching <pre>, <param>, ...
            $pattern = '~</?' . $cleartag . '(?![\w-])[^>]*>~is';
            $content = preg_replace($pattern, '', $content);
        }
        return $content;
    }

    /**
     * Print a log line.
     *
     * @param string $str
     * @return void
     */
    public function log($str)
    {
        // Messages can contain scraped URLs; escape them for the HTML output.
        echo htmlspecialchars((string)$str, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') . "<br/>\n";
    }

    /**
     * Duration of the last run() in seconds.
     *
     * @return float
     */
    public function getRuntime()
    {
        return $this->runtime;
    }

    /**
     * Current Unix timestamp with microseconds.
     *
     * @return float
     */
    public function microtime_float()
    {
        return microtime(true);
    }

    /**
     * Escape a value for use inside a single-quoted SQL string literal.
     *
     * @param string $string
     * @return string
     */
    public function addslash($string)
    {
        return addslashes($string);
    }
}

// Demo: scrape the blog posts linked from list pages 0-5.
$spider = new spider();
$spider->addStartUrl('http://hi.baidu.com/shuntian/blog/index/{0,5}');
$spider->setCharset('gb2312');
$spider->addLayer(0, 'list', '/shuntian/blog/item/{*}.html');
// The apostrophe must be escaped; unescaped it ends the string and is a parse error.
$spider->addField('title', '<title>{title}</title>', array('_顺者的天空-shooting\'s sky ', ''));
$spider->addField('body', '<td><p id="blog_text" class="cnt" >{body}</p></td></tr></table>');
$spider->addField('author', 'shooting');
$spider->run();
$spider->saveSql();
以上就是采集的内容,更多相关内容请关注PHP中文网(www.php.cn)!
下一篇: 抓取源码 - php抓取网页源码的问题