分享一段按关键字抓取 www.icbase.com 站点页面的代码(带 ASP.NET 参数)
程序员文章站
2023-11-23 18:23:40
复制代码 代码如下:
<?php
/**
* host: www.icbase.com
*/
//set_time_limit(0);
// base function
/**
 * Perform an HTTP GET request with cURL.
 *
 * @param string       $url     Target URL; may already contain a query string.
 * @param array|string $data    Extra query parameters; an array is encoded with http_build_query().
 * @param array        $header  Raw request header lines.
 * @param int          $timeout Connect timeout in seconds.
 * @param int          $port    Remote port; always applied via CURLOPT_PORT.
 * @param string       $reffer  Referer header value; skipped when empty.
 * @param string       $proxy   Proxy host; when non-empty the request is sent through it.
 * @return array 'result' holds the curl_exec() return value; 'error' is set only when cURL reports a failure.
 */
function curl_get($url, $data = array(), $header = array(), $timeout = 15, $port = 80, $reffer = '', $proxy = '')
{
    if (!empty($data)) {
        $query = is_array($data) ? http_build_query($data) : $data;
        // strpos() returns 0 (falsy) for a leading '?', so compare against false explicitly.
        $url .= (strpos($url, '?') !== false ? '&' : '?') . $query;
    }

    $ch = curl_init();
    // cURL option constants are case-sensitive; the lower-case spellings are undefined.
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_POST, 0);
    // NOTE(review): the port is forced even for https:// URLs unless the caller overrides $port.
    curl_setopt($ch, CURLOPT_PORT, $port);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); // follow redirects and return the final page
    if ($reffer) {
        curl_setopt($ch, CURLOPT_REFERER, $reffer);
    }
    if ($proxy) {
        // NOTE(review): proxy port and credentials are hard-coded; consider moving them to configuration.
        curl_setopt($ch, CURLOPT_PROXY, $proxy);
        curl_setopt($ch, CURLOPT_PROXYPORT, 1723);
        curl_setopt($ch, CURLOPT_PROXYUSERPWD, "andhm001:andhm123");
    }

    $result = array();
    $result['result'] = curl_exec($ch);
    if (0 != curl_errno($ch)) {
        $result['error'] = "error:\n" . curl_error($ch);
    }
    curl_close($ch);

    return $result;
}
/**
 * Perform an HTTP POST request with cURL.
 *
 * @param string       $url     Target URL.
 * @param array|string $data    POST body, handed to CURLOPT_POSTFIELDS unchanged.
 * @param array        $header  Raw request header lines; only applied when non-empty.
 * @param int          $timeout Connect timeout in seconds.
 * @param int          $port    Accepted for signature compatibility; currently not applied to the request.
 * @return array 'result' holds the curl_exec() return value; 'error' is set only when cURL reports a failure.
 */
function curl_post($url, $data = array(), $header = array(), $timeout = 5, $port = 80)
{
    $ch = curl_init();
    // cURL option constants are case-sensitive; the lower-case spellings are undefined.
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    if (!empty($header)) {
        curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
    }
    curl_setopt($ch, CURLOPT_POST, 1);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $data);

    $result = array();
    $result['result'] = curl_exec($ch);
    if (0 != curl_errno($ch)) {
        $result['error'] = "error:\n" . curl_error($ch);
    }
    curl_close($ch);

    return $result;
}
/**
 * Fetch the HTML source of one search-result list page.
 *
 * Page 1 is fetched with GET and its ASP.NET hidden form fields are remembered;
 * later pages are requested by POSTing those fields back together with the
 * 'pager' event target and the page number.
 *
 * @param string $keywords Search keywords.
 * @param int    $page     Page number; 0 is treated as 1.
 * @return bool|string HTML source, or false for a negative page, a request
 *                     error, or missing hidden form fields.
 */
function getlisthtml($keywords, $page = 1)
{
    // Hidden-field values captured by the most recent successful page-1 fetch.
    // A static is used instead of constants so a later search can refresh them.
    static $formstate = array();

    if ($page < 0) {
        return false;
    }
    $page = $page == 0 ? 1 : intval($page);

    if ($page == 1) {
        $response = curl_get('http://www.icbase.com/proresult.aspx', array('prokey' => $keywords));
        if (isset($response['error'])) {
            return false;
        }
        $html = $response['result'];

        // ASP.NET post-back data: all three hidden inputs must be present.
        $captured = array();
        foreach (array('__viewstate', '__previouspage', '__eventvalidation') as $field) {
            $pattern = '/<input\s+type="hidden"\s+name="' . $field . '"\s+id="' . $field . '"\s+value="(.[^"]+)"/isu';
            if (!preg_match($pattern, $html, $matches)) {
                return false;
            }
            $captured[$field] = $matches[1];
        }
        $formstate = $captured;

        // Still publish the values as constants for any code that reads them.
        // Constants cannot be redefined, so they keep the first captured values.
        foreach ($captured as $field => $value) {
            if (!defined($field)) {
                define($field, $value);
            }
        }

        return $html;
    }

    if (empty($formstate)) {
        // Page 1 has not been fetched successfully, so there is nothing to post back.
        return false;
    }

    $data = array(
        '__eventtarget' => 'pager',
        '__eventargument' => $page,
        '__viewstate' => $formstate['__viewstate'],
        '__previouspage' => $formstate['__previouspage'],
        '__eventvalidation' => $formstate['__eventvalidation'],
    );
    // Encode the keywords so spaces, '&' or non-ASCII text cannot corrupt the query string.
    $response = curl_post('http://www.icbase.com/proresult.aspx?prokey=' . urlencode($keywords), $data);
    if (isset($response['error'])) {
        return false;
    }

    return $response['result'];
}
/**
 * Collect the href of every image link on a list page.
 *
 * A link qualifies when it is written as <a href='...' target="_blank">
 * and is immediately followed by a self-closed <img ... /> tag.
 *
 * @param string $html HTML source of a list page.
 * @return array List of href values; empty when nothing matches.
 */
function getlisthref($html)
{
    // The U modifier makes quantifiers ungreedy. With a greedy ".+" the first
    // match would run to the last "/>" in the document and swallow every
    // following link, so only one href would ever be returned.
    $pattern = '/<a\s+href=\'(.[^\']+)\'\s+target="_blank"\s*>[\s\n]*<img.+[^>]\/>/isU';
    if (preg_match_all($pattern, (string) $html, $matches)) {
        return $matches[1];
    }

    // No matching links.
    return array();
}
/**
 * Read the page number targeted by the pager's ">" (next) link.
 *
 * @param string $html HTML source of a list page.
 * @return int Page number from the link, or -1 when no such link is found.
 */
function getlistnextpage($html)
{
    $found = array();
    $hit = preg_match(
        '/<div\s+id="pager".+[^>]>.+<a\s+href="javascript\:__dopostback\(\'pager\',\'(\d+)\'\)">><\/a>/isu',
        $html,
        $found
    );

    return $hit ? intval($found[1]) : -1;
}
/**
 * Collect the detail-page hrefs from every list page of a search.
 *
 * Starts at page 1 and keeps following the pager's "next" link.
 *
 * @param string $keywords Search keywords.
 * @return bool|array false when $keywords is empty; otherwise the list of
 *                    hrefs (empty when the first page yields none).
 */
function getlisthrefall($keywords)
{
    if (empty($keywords)) {
        return false;
    }

    $html = getlisthtml($keywords);
    if ($html === false) {
        // The first page could not be fetched or parsed: no results.
        return array();
    }

    $hreflist = getlisthref($html);
    if (empty($hreflist)) {
        // No results.
        return array();
    }

    // Remember requested pages so a pager that keeps pointing at an
    // already-seen page cannot trap the loop forever.
    $visited = array(1 => true);
    $nextpage = getlistnextpage($html);
    while ($nextpage > 0 && !isset($visited[$nextpage])) {
        $visited[$nextpage] = true;

        $html = getlisthtml($keywords, $nextpage);
        if ($html === false) {
            break;
        }

        $hreflist = array_merge($hreflist, getlisthref($html));
        $nextpage = getlistnextpage($html);
    }

    return $hreflist;
}
/**
 * Extract product information from a detail page.
 *
 * @param string $url    Either a site-relative URL or already-fetched HTML,
 *                       depending on $is_url.
 * @param int    $is_url 1: $url is a URL to fetch; 0: $url is HTML source to parse directly.
 * @return bool|array false when $url is empty or the page cannot be fetched;
 *                    an empty array when no part number is found; otherwise
 *                    the field array below. 'para' and 'price' become arrays
 *                    when data is found and stay '' otherwise.
 */
function getdetail($url, $is_url = 1)
{
    if (empty($url)) {
        return false;
    }

    $host = 'www.icbase.com';
    $html = $url;
    if ($is_url) {
        $url = '/' . ltrim($url, '/');
        $response = curl_get($host . $url);
        if (isset($response['error'])) {
            // Report the failure to the caller instead of terminating the
            // whole script, matching how the list-page fetcher behaves.
            return false;
        }
        $html = $response['result'];
    }

    $result = array(
        'sup_part' => '',   // supplier part number
        'sup_id' => '',     // supplier id
        'mfg_part' => '',   // manufacturer part number
        'mfg_name' => '',   // manufacturer name
        'cat_name' => '',   // category name
        'para' => '',       // attributes (name => value)
        'desc' => '',       // description
        'pdf_url' => '',    // datasheet URL
        'sup_stock' => '',  // stock quantity
        'min_purch' => '',  // minimum order quantity
        'price' => '',      // price breaks (quantity => price)
        'img_url' => '',    // image URL
        'createtime' => '', // creation time
        'datacode' => '',   // date/batch code
        'package' => '',    // package
        'page_url' => '',   // page URL
    );

    // All patterns use the U (ungreedy) modifier: with greedy ".+" a capture
    // would run to the last closing tag in the document instead of the nearest.

    // mfg_part
    $pattern = '/<td>产品型号<\/td><td>(.[^<]+)</isU';
    if (preg_match($pattern, $html, $matches)) {
        $result['mfg_part'] = trim($matches[1]);
    } else {
        // No part number: treated as "nothing else to extract either".
        return array();
    }

    // mfg_name
    $pattern = '/<td>厂商<\/td>[\s\n]*<td>(.+)<\/td>/isU';
    if (preg_match($pattern, $html, $matches)) {
        $result['mfg_name'] = trim($matches[1]);
    }

    // para: the first half of the cells are names, the second half their values.
    $pattern = '/<tr\s+style="background-color:#e9e9e9;color:black; font-weight:bold;">(.+)<\/tr><\/table>/isU';
    if (preg_match($pattern, $html, $matches)
        && preg_match_all('/<td>(.+)<\/td>/isU', $matches[1], $cellmatches)
    ) {
        $cells = $cellmatches[1];
        $half = intval(count($cells) / 2);
        // Collected in a real array: writing string keys into the '' default fails on PHP 7.1+.
        $para = array();
        for ($k = 0; $k < $half; $k++) {
            $name = trim($cells[$k]);
            $value = trim($cells[$half + $k]);
            if ($name === '描述') {
                // desc
                $result['desc'] = $value;
                continue;
            }
            $para[$name] = $value;
        }
        if (!empty($para)) {
            $result['para'] = $para;
        }
    }

    // pdf_url
    $pattern = '/<td>详细资料<\/td><td><a\s+href="(.[^"]+)"/isU';
    if (preg_match($pattern, $html, $matches)) {
        $result['pdf_url'] = trim($matches[1]);
    }

    // sup_stock
    $pattern = '/<td>库存数量<\/td>[\s\n]*<td>(\d+)<\/td>/isU';
    if (preg_match($pattern, $html, $matches)) {
        $result['sup_stock'] = trim($matches[1]);
    }

    // price: one row per quantity break, e.g. "10+" followed by the unit price.
    $pattern = '/<tr><td.[^>]+>(\d+)\+<\/td><td.[^>]+>.[^\d]*([\d.]+)<\/td><\/tr>/isU';
    if (preg_match_all($pattern, $html, $matches)) {
        $prices = array();
        foreach ($matches[1] as $k => $quantity) {
            $prices[$quantity] = '¥' . $matches[2][$k];
        }
        $result['price'] = $prices;
    }

    // img_url
    $pattern = '/<td>图片<\/td><td><img\s+src="(.[^"]+)"/isU';
    if (preg_match($pattern, $html, $matches)) {
        $result['img_url'] = trim($matches[1]);
    }

    // page_url
    if ($is_url) {
        $result['page_url'] = $host . $url;
    }

    return $result;
}
/**
 * Entry point: search by keywords and return the details of every hit.
 *
 * @param string $keywords Search keywords.
 * @return array One getdetail() result per href found; empty when the search
 *               yields no hrefs or the keywords are rejected.
 */
function getdata($keywords)
{
    $result = array();

    $hreflist = getlisthrefall($keywords);
    if (!is_array($hreflist)) {
        // getlisthrefall() returns false for empty keywords; iterating that would raise a warning.
        return $result;
    }

    foreach ($hreflist as $href) {
        $result[] = getdetail($href);
    }

    return $result;
}
// test script
// Superglobal names are case-sensitive: $_get is just an undefined variable.
// A missing or non-string parameter falls back to an empty keyword.
$keywords = (isset($_GET['keywords']) && is_string($_GET['keywords'])) ? trim($_GET['keywords']) : '';
$result = getdata($keywords);
print_r($result);
复制代码 代码如下:
<?php
/**
* host: www.icbase.com
*/
//set_time_limit(0);
// base function
/**
 * Perform an HTTP GET request with cURL.
 *
 * @param string       $url     Target URL; may already contain a query string.
 * @param array|string $data    Extra query parameters; an array is encoded with http_build_query().
 * @param array        $header  Raw request header lines.
 * @param int          $timeout Connect timeout in seconds.
 * @param int          $port    Remote port; always applied via CURLOPT_PORT.
 * @param string       $reffer  Referer header value; skipped when empty.
 * @param string       $proxy   Proxy host; when non-empty the request is sent through it.
 * @return array 'result' holds the curl_exec() return value; 'error' is set only when cURL reports a failure.
 */
function curl_get($url, $data = array(), $header = array(), $timeout = 15, $port = 80, $reffer = '', $proxy = '')
{
    if (!empty($data)) {
        $query = is_array($data) ? http_build_query($data) : $data;
        // strpos() returns 0 (falsy) for a leading '?', so compare against false explicitly.
        $url .= (strpos($url, '?') !== false ? '&' : '?') . $query;
    }

    $ch = curl_init();
    // cURL option constants are case-sensitive; the lower-case spellings are undefined.
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_POST, 0);
    // NOTE(review): the port is forced even for https:// URLs unless the caller overrides $port.
    curl_setopt($ch, CURLOPT_PORT, $port);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); // follow redirects and return the final page
    if ($reffer) {
        curl_setopt($ch, CURLOPT_REFERER, $reffer);
    }
    if ($proxy) {
        // NOTE(review): proxy port and credentials are hard-coded; consider moving them to configuration.
        curl_setopt($ch, CURLOPT_PROXY, $proxy);
        curl_setopt($ch, CURLOPT_PROXYPORT, 1723);
        curl_setopt($ch, CURLOPT_PROXYUSERPWD, "andhm001:andhm123");
    }

    $result = array();
    $result['result'] = curl_exec($ch);
    if (0 != curl_errno($ch)) {
        $result['error'] = "error:\n" . curl_error($ch);
    }
    curl_close($ch);

    return $result;
}
复制代码 代码如下:
/**
 * Perform an HTTP POST request with cURL.
 *
 * @param string       $url     Target URL.
 * @param array|string $data    POST body, handed to CURLOPT_POSTFIELDS unchanged.
 * @param array        $header  Raw request header lines; only applied when non-empty.
 * @param int          $timeout Connect timeout in seconds.
 * @param int          $port    Accepted for signature compatibility; currently not applied to the request.
 * @return array 'result' holds the curl_exec() return value; 'error' is set only when cURL reports a failure.
 */
function curl_post($url, $data = array(), $header = array(), $timeout = 5, $port = 80)
{
    $ch = curl_init();
    // cURL option constants are case-sensitive; the lower-case spellings are undefined.
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    if (!empty($header)) {
        curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
    }
    curl_setopt($ch, CURLOPT_POST, 1);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $data);

    $result = array();
    $result['result'] = curl_exec($ch);
    if (0 != curl_errno($ch)) {
        $result['error'] = "error:\n" . curl_error($ch);
    }
    curl_close($ch);

    return $result;
}
/**
 * Fetch the HTML source of one search-result list page.
 *
 * Page 1 is fetched with GET and its ASP.NET hidden form fields are remembered;
 * later pages are requested by POSTing those fields back together with the
 * 'pager' event target and the page number.
 *
 * @param string $keywords Search keywords.
 * @param int    $page     Page number; 0 is treated as 1.
 * @return bool|string HTML source, or false for a negative page, a request
 *                     error, or missing hidden form fields.
 */
function getlisthtml($keywords, $page = 1)
{
    // Hidden-field values captured by the most recent successful page-1 fetch.
    // A static is used instead of constants so a later search can refresh them.
    static $formstate = array();

    if ($page < 0) {
        return false;
    }
    $page = $page == 0 ? 1 : intval($page);

    if ($page == 1) {
        $response = curl_get('http://www.icbase.com/proresult.aspx', array('prokey' => $keywords));
        if (isset($response['error'])) {
            return false;
        }
        $html = $response['result'];

        // ASP.NET post-back data: all three hidden inputs must be present.
        $captured = array();
        foreach (array('__viewstate', '__previouspage', '__eventvalidation') as $field) {
            $pattern = '/<input\s+type="hidden"\s+name="' . $field . '"\s+id="' . $field . '"\s+value="(.[^"]+)"/isu';
            if (!preg_match($pattern, $html, $matches)) {
                return false;
            }
            $captured[$field] = $matches[1];
        }
        $formstate = $captured;

        // Still publish the values as constants for any code that reads them.
        // Constants cannot be redefined, so they keep the first captured values.
        foreach ($captured as $field => $value) {
            if (!defined($field)) {
                define($field, $value);
            }
        }

        return $html;
    }

    if (empty($formstate)) {
        // Page 1 has not been fetched successfully, so there is nothing to post back.
        return false;
    }

    $data = array(
        '__eventtarget' => 'pager',
        '__eventargument' => $page,
        '__viewstate' => $formstate['__viewstate'],
        '__previouspage' => $formstate['__previouspage'],
        '__eventvalidation' => $formstate['__eventvalidation'],
    );
    // Encode the keywords so spaces, '&' or non-ASCII text cannot corrupt the query string.
    $response = curl_post('http://www.icbase.com/proresult.aspx?prokey=' . urlencode($keywords), $data);
    if (isset($response['error'])) {
        return false;
    }

    return $response['result'];
}
/**
 * Collect the href of every image link on a list page.
 *
 * A link qualifies when it is written as <a href='...' target="_blank">
 * and is immediately followed by a self-closed <img ... /> tag.
 *
 * @param string $html HTML source of a list page.
 * @return array List of href values; empty when nothing matches.
 */
function getlisthref($html)
{
    // The U modifier makes quantifiers ungreedy. With a greedy ".+" the first
    // match would run to the last "/>" in the document and swallow every
    // following link, so only one href would ever be returned.
    $pattern = '/<a\s+href=\'(.[^\']+)\'\s+target="_blank"\s*>[\s\n]*<img.+[^>]\/>/isU';
    if (preg_match_all($pattern, (string) $html, $matches)) {
        return $matches[1];
    }

    // No matching links.
    return array();
}
/**
 * Read the page number targeted by the pager's ">" (next) link.
 *
 * @param string $html HTML source of a list page.
 * @return int Page number from the link, or -1 when no such link is found.
 */
function getlistnextpage($html)
{
    $found = array();
    $hit = preg_match(
        '/<div\s+id="pager".+[^>]>.+<a\s+href="javascript\:__dopostback\(\'pager\',\'(\d+)\'\)">><\/a>/isu',
        $html,
        $found
    );

    return $hit ? intval($found[1]) : -1;
}
/**
 * Collect the detail-page hrefs from every list page of a search.
 *
 * Starts at page 1 and keeps following the pager's "next" link.
 *
 * @param string $keywords Search keywords.
 * @return bool|array false when $keywords is empty; otherwise the list of
 *                    hrefs (empty when the first page yields none).
 */
function getlisthrefall($keywords)
{
    if (empty($keywords)) {
        return false;
    }

    $html = getlisthtml($keywords);
    if ($html === false) {
        // The first page could not be fetched or parsed: no results.
        return array();
    }

    $hreflist = getlisthref($html);
    if (empty($hreflist)) {
        // No results.
        return array();
    }

    // Remember requested pages so a pager that keeps pointing at an
    // already-seen page cannot trap the loop forever.
    $visited = array(1 => true);
    $nextpage = getlistnextpage($html);
    while ($nextpage > 0 && !isset($visited[$nextpage])) {
        $visited[$nextpage] = true;

        $html = getlisthtml($keywords, $nextpage);
        if ($html === false) {
            break;
        }

        $hreflist = array_merge($hreflist, getlisthref($html));
        $nextpage = getlistnextpage($html);
    }

    return $hreflist;
}
/**
 * Extract product information from a detail page.
 *
 * @param string $url    Either a site-relative URL or already-fetched HTML,
 *                       depending on $is_url.
 * @param int    $is_url 1: $url is a URL to fetch; 0: $url is HTML source to parse directly.
 * @return bool|array false when $url is empty or the page cannot be fetched;
 *                    an empty array when no part number is found; otherwise
 *                    the field array below. 'para' and 'price' become arrays
 *                    when data is found and stay '' otherwise.
 */
function getdetail($url, $is_url = 1)
{
    if (empty($url)) {
        return false;
    }

    $host = 'www.icbase.com';
    $html = $url;
    if ($is_url) {
        $url = '/' . ltrim($url, '/');
        $response = curl_get($host . $url);
        if (isset($response['error'])) {
            // Report the failure to the caller instead of terminating the
            // whole script, matching how the list-page fetcher behaves.
            return false;
        }
        $html = $response['result'];
    }

    $result = array(
        'sup_part' => '',   // supplier part number
        'sup_id' => '',     // supplier id
        'mfg_part' => '',   // manufacturer part number
        'mfg_name' => '',   // manufacturer name
        'cat_name' => '',   // category name
        'para' => '',       // attributes (name => value)
        'desc' => '',       // description
        'pdf_url' => '',    // datasheet URL
        'sup_stock' => '',  // stock quantity
        'min_purch' => '',  // minimum order quantity
        'price' => '',      // price breaks (quantity => price)
        'img_url' => '',    // image URL
        'createtime' => '', // creation time
        'datacode' => '',   // date/batch code
        'package' => '',    // package
        'page_url' => '',   // page URL
    );

    // All patterns use the U (ungreedy) modifier: with greedy ".+" a capture
    // would run to the last closing tag in the document instead of the nearest.

    // mfg_part
    $pattern = '/<td>产品型号<\/td><td>(.[^<]+)</isU';
    if (preg_match($pattern, $html, $matches)) {
        $result['mfg_part'] = trim($matches[1]);
    } else {
        // No part number: treated as "nothing else to extract either".
        return array();
    }

    // mfg_name
    $pattern = '/<td>厂商<\/td>[\s\n]*<td>(.+)<\/td>/isU';
    if (preg_match($pattern, $html, $matches)) {
        $result['mfg_name'] = trim($matches[1]);
    }

    // para: the first half of the cells are names, the second half their values.
    $pattern = '/<tr\s+style="background-color:#e9e9e9;color:black; font-weight:bold;">(.+)<\/tr><\/table>/isU';
    if (preg_match($pattern, $html, $matches)
        && preg_match_all('/<td>(.+)<\/td>/isU', $matches[1], $cellmatches)
    ) {
        $cells = $cellmatches[1];
        $half = intval(count($cells) / 2);
        // Collected in a real array: writing string keys into the '' default fails on PHP 7.1+.
        $para = array();
        for ($k = 0; $k < $half; $k++) {
            $name = trim($cells[$k]);
            $value = trim($cells[$half + $k]);
            if ($name === '描述') {
                // desc
                $result['desc'] = $value;
                continue;
            }
            $para[$name] = $value;
        }
        if (!empty($para)) {
            $result['para'] = $para;
        }
    }

    // pdf_url
    $pattern = '/<td>详细资料<\/td><td><a\s+href="(.[^"]+)"/isU';
    if (preg_match($pattern, $html, $matches)) {
        $result['pdf_url'] = trim($matches[1]);
    }

    // sup_stock
    $pattern = '/<td>库存数量<\/td>[\s\n]*<td>(\d+)<\/td>/isU';
    if (preg_match($pattern, $html, $matches)) {
        $result['sup_stock'] = trim($matches[1]);
    }

    // price: one row per quantity break, e.g. "10+" followed by the unit price.
    $pattern = '/<tr><td.[^>]+>(\d+)\+<\/td><td.[^>]+>.[^\d]*([\d.]+)<\/td><\/tr>/isU';
    if (preg_match_all($pattern, $html, $matches)) {
        $prices = array();
        foreach ($matches[1] as $k => $quantity) {
            $prices[$quantity] = '¥' . $matches[2][$k];
        }
        $result['price'] = $prices;
    }

    // img_url
    $pattern = '/<td>图片<\/td><td><img\s+src="(.[^"]+)"/isU';
    if (preg_match($pattern, $html, $matches)) {
        $result['img_url'] = trim($matches[1]);
    }

    // page_url
    if ($is_url) {
        $result['page_url'] = $host . $url;
    }

    return $result;
}
/**
 * Entry point: search by keywords and return the details of every hit.
 *
 * @param string $keywords Search keywords.
 * @return array One getdetail() result per href found; empty when the search
 *               yields no hrefs or the keywords are rejected.
 */
function getdata($keywords)
{
    $result = array();

    $hreflist = getlisthrefall($keywords);
    if (!is_array($hreflist)) {
        // getlisthrefall() returns false for empty keywords; iterating that would raise a warning.
        return $result;
    }

    foreach ($hreflist as $href) {
        $result[] = getdetail($href);
    }

    return $result;
}
// test script
// Superglobal names are case-sensitive: $_get is just an undefined variable.
// A missing or non-string parameter falls back to an empty keyword.
$keywords = (isset($_GET['keywords']) && is_string($_GET['keywords'])) ? trim($_GET['keywords']) : '';
$result = getdata($keywords);
print_r($result);