php采集远程文章简单类
程序员文章站
2022-03-28 16:35:42
...
跳至
<?php
/**
 * Minimal helper for scraping a remote article list and article bodies.
 *
 * NOTE(review): the class header and constructor signature were lost when this
 * listing was extracted (only the tail "db = $db; }" survived). The class name
 * is inferred from the $gather variable used by the demo below, and the
 * optional $db argument is an assumption - confirm against the original.
 */
class Gather
{
    /** @var mixed Value handed to the constructor; none of the visible methods read it. */
    public $db;

    /**
     * @param mixed $db Stored on the instance as-is.
     */
    public function __construct($db = null)
    {
        $this->db = $db;
    }

    /**
     * Download a URL and return its body with surrounding whitespace trimmed.
     *
     * Uses cURL (following redirects, body only) when the extension is loaded,
     * otherwise falls back to file_get_contents().
     *
     * @param string $url Address to fetch; leading/trailing whitespace is ignored.
     * @return string Trimmed response body, or '' when the URL is blank or the fetch fails.
     */
    public function geturlfile($url)
    {
        $url = trim((string) $url);
        if ($url === '') {
            // Nothing to fetch; file_get_contents('') would throw on PHP 8.
            return '';
        }

        if (extension_loaded('curl')) {
            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $url);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
            curl_setopt($ch, CURLOPT_HEADER, 0);
            $content = curl_exec($ch);
            curl_close($ch);
        } else {
            $content = file_get_contents($url);
        }

        // Both transports return false on failure; report that as an empty body.
        return is_string($content) ? trim($content) : '';
    }

    /**
     * Collect the href and link text of every <a> element in an HTML fragment.
     *
     * The pattern as it appeared in the listing had lost its leading
     * "<a\s+href=" and trailing "<\/a>" (stripped as HTML tags), which left an
     * unbalanced ")" and made the expression uncompilable. It is restored here.
     * Only anchors whose first attribute is href and whose text contains no
     * nested tags are matched.
     *
     * @param string $code HTML fragment to scan.
     * @return array array('name' => list of link texts, 'url' => list of hrefs), index-aligned.
     */
    public function get_all_url($code)
    {
        $pattern = '/<a\s+href=["\']?([^>"\' ]+)["\']?\s*[^>]*>([^>]+)<\/a>/is';
        $found = preg_match_all($pattern, (string) $code, $arr);
        if ($found === false) {
            // PCRE failure (e.g. backtrack limit): behave as "no links found".
            return array('name' => array(), 'url' => array());
        }

        return array('name' => $arr[2], 'url' => $arr[1]);
    }

    /**
     * Return the text that follows the first $start marker, up to the next $end marker.
     *
     * @param string $str   Text to cut from.
     * @param string $start Opening marker (trimmed before use).
     * @param string $end   Closing marker (trimmed before use).
     * @return string $str unchanged when either marker is blank; '' when $start
     *                does not occur; otherwise the segment after the first
     *                $start, cut at $end (or at the next $start, whichever the
     *                split reaches first). If $end is absent the whole segment
     *                is returned.
     */
    public function get_sub_content($str, $start, $end)
    {
        $start = trim((string) $start);
        $end = trim((string) $end);
        if ($start === '' || $end === '') {
            return $str;
        }

        $pieces = explode($start, (string) $str);
        if (!isset($pieces[1])) {
            // Start marker missing: there is no segment to return.
            return '';
        }

        $segment = explode($end, $pieces[1], 2);

        return $segment[0];
    }

    /**
     * Dump a value for debugging, padded with CRLF line breaks.
     *
     * NOTE(review): the wrapper strings were probably HTML (e.g. <pre>) before
     * the tags were stripped from this listing; only the surviving bytes are
     * emitted here - confirm against the original.
     *
     * @param mixed $var Value to dump.
     * @return void
     */
    public function vd($var)
    {
        echo "\r\n";
        echo "\r\n";
        var_dump($var);
        echo "\r\n\r\n";
    }
}

// ---------------------------------------------------------------------------
// Usage example
// ---------------------------------------------------------------------------
// NOTE(review): the instantiation line, the list-page URL and every marker
// string below were lost in extraction (they held HTML tags). They are left
// empty here and must be filled in for the example to fetch anything.
$gather = new Gather();
$url = ''; // TODO: URL of the article list page

// Fetch the list page HTML.
$html = $gather->geturlfile($url);

// Markers that bound the article list on the page.
$start = ''; // TODO
$end = '';   // TODO

// Extract the article URLs and titles inside that range.
$code = $gather->get_sub_content($html, $start, $end);
$newsAry = $gather->get_all_url($code);

// Print the result.
//$gather->vd($newsAry);

// Take the first article; fall back to '' when the list came back empty.
$tarGetUrl = isset($newsAry['url'][0]) ? $newsAry['url'][0] : '';

// Fetch the target article HTML.
$html = $gather->geturlfile($tarGetUrl);

// Markers that bound the article body on the page.
$start = ''; // TODO
$end = '';   // TODO

// Extract the article body inside that range.
$code = $gather->get_sub_content($html, $start, $end);

// Fragments to remove from the extracted body.
$killHtml = '';  // TODO
$killHtml2 = ''; // TODO
$code = str_replace($killHtml, "", $code);
$code = str_replace($killHtml2, "", $code);

$gather->vd($code);