欢迎您访问程序员文章站!本站旨在为大家提供、分享程序员计算机编程知识!
您现在的位置是: 首页  >  后端开发

抓取“中国 IC 网”供应商程序_PHP教程

程序员文章站 2022-05-13 09:27:34
<?php
/**
* 抓取“中国 IC 网(http://www.ic37.com)”供应商主程序
* author Lee.
* Last modify $Date: 2012-2-9 9:32:21 $
* 注:本程序按照编码 GB2312 执行,因为“中国 IC 网”网站是GB2312编码,数据库也得保持一致
*/
class ic37 {
private $key; // 型号
private $pageNum; // 页码

/**
 * Entry point: crawl the supplier listings for one part number.
 *
 * Stores the part number on the instance, reads the number of result
 * pages via getPageNum(), then hands over to getInfo().
 *
 * @param string $key Part number (model) to search for.
 */
public function go($key) {
// Must be assigned first: getPageNum() fetches the search page for $this->key.
$this->key = $key;
// Must be assigned before getInfo(), which iterates over $this->pageNum.
$this->pageNum = $this->getPageNum();
$this->getInfo();
}

/**
 * Crawl every result page for the current part number and store the
 * suppliers found on each page.
 *
 * For each page from 1 up to and including $this->pageNum, the page is
 * fetched, supplier shop URLs are extracted, each URL is pointed at the
 * shop's contact page, and the resulting list is passed to isAddSuccess().
 * Nothing happens when no page count is available.
 *
 * @return void
 */
private function getInfo() {
    // pageNum is a regex capture, so it may be null or a numeric string;
    // a missing or non-numeric value becomes 0 and the loop is skipped.
    $lastPage = (int) $this->pageNum;

    // Pages are 1-based and the last page is included.
    for ($page = 1; $page <= $lastPage; $page++) {
        $shopUrls = $this->shopUrlMatchReArr($this->getContent($page));
        if (empty($shopUrls)) {
            continue; // no supplier links on this page
        }

        $contactUrls = $this->shopAddContact($shopUrls);
        if (!empty($contactUrls)) {
            $this->isAddSuccess($contactUrls);
        }
    }
}

/**
 * Scrape each supplier contact page, try to store the result, and print
 * the outcome of every attempt.
 *
 * @param array $arr Supplier contact-page URLs.
 * @return void Only echoes a status message per URL.
 */
private function isAddSuccess($arr) {
    foreach ($arr as $shopUrl) {
        $supplier = $this->getInfoByShopUrl($shopUrl);
        $stored = $this->execAdd($supplier);
        echo $stored ? 'Add Success!!' : 'Add Faild!!';
    }
}

/**
 * Insert one scraped supplier record into the `ic37` table.
 *
 * The record is skipped when it has no company name or when isExists()
 * reports that the company is already stored.
 *
 * @param array $infoArr Supplier fields keyed by column name
 *                       (company, person, phone, ..., shopUrl).
 * @return bool True when the row was inserted; false when it was skipped
 *              or the insert failed.
 */
private function execAdd($infoArr) {
    // Nothing to store when the scrape found no company name.
    if (empty($infoArr['company'])) {
        return false;
    }

    $mysqli = $this->getDb();
    if ($this->isExists($mysqli, $infoArr)) {
        return false; // the company is already in the table
    }

    $columns = array(
        'company', 'person', 'phone', 'mobile', 'qq', 'msn', 'fax',
        'email', 'address', 'country', 'region', 'zip', 'web', 'shopUrl',
    );

    // The values come from scraped third-party pages, so they are bound as
    // parameters instead of being interpolated into the SQL text.
    $placeholders = implode(',', array_fill(0, count($columns), '?'));
    $stmt = $mysqli->prepare(
        'INSERT INTO ic37(' . implode(',', $columns) . ') VALUES (' . $placeholders . ')'
    );
    if (!$stmt) {
        return false;
    }

    // bind_param() takes its arguments by reference, hence the reference list.
    $values = array();
    $bindArgs = array(str_repeat('s', count($columns)));
    foreach ($columns as $column) {
        // Missing fields are stored as empty strings, as before.
        $values[$column] = isset($infoArr[$column]) ? (string) $infoArr[$column] : '';
        $bindArgs[] = &$values[$column];
    }
    call_user_func_array(array($stmt, 'bind_param'), $bindArgs);

    $inserted = $stmt->execute();
    $stmt->close();

    return $inserted;
}

/**
 * Normalise a scraped field value.
 *
 * Trims surrounding whitespace, then removes every occurrence of the
 * first search string below and of the '==联系我们' marker.
 *
 * @param string $str Raw captured text.
 * @return string Cleaned text.
 */
private function formatStr($str) {
    // NOTE(review): the first search string may originally have been the
    // HTML entity for a non-breaking space - confirm against the original script.
    return str_replace(array(' ', '==联系我们'), '', trim($str));
}

/**
 * Return the MySQL connection used by the scraper.
 *
 * The connection is opened on first use and reused afterwards, so storing
 * many suppliers does not open one connection per row. The connection
 * charset is set to GB2312.
 *
 * NOTE(review): host, user, password and database name are hard-coded;
 * consider moving them to configuration.
 *
 * @return mysqli Connection handle. If connecting failed, a warning is
 *                raised and the unusable handle is returned uncached, so
 *                the next call retries.
 */
private function getDb() {
    static $connection = null;

    if ($connection !== null) {
        return $connection;
    }

    $mysqli = new mysqli('localhost', 'root', '1715544', 'weiku');
    if ($mysqli->connect_errno) {
        trigger_error('ic37: database connection failed: ' . $mysqli->connect_error, E_USER_WARNING);
        return $mysqli;
    }

    // set_charset() also informs the client library of the charset,
    // which a plain "SET NAMES" query does not.
    $mysqli->set_charset('gb2312');

    $connection = $mysqli;
    return $connection;
}

/**
 * Check whether a company is already stored in the `ic37` table.
 *
 * @param mysqli $mysqli  Open database connection.
 * @param array  $infoArr Supplier fields; only 'company' is read.
 * @return bool True when at least one row has the same company name.
 *              False when none does, or when the lookup could not be
 *              run (a warning is raised in that case).
 */
private function isExists($mysqli, $infoArr) {
    // Suppliers are stored in table `ic37`; `weiku` is the database name.
    $stmt = $mysqli->prepare('SELECT 1 FROM ic37 WHERE company = ? LIMIT 1');
    if (!$stmt) {
        trigger_error('ic37: supplier lookup failed: ' . $mysqli->error, E_USER_WARNING);
        return false;
    }

    // The company name is scraped text, so it is bound rather than interpolated.
    $company = isset($infoArr['company']) ? (string) $infoArr['company'] : '';
    $stmt->bind_param('s', $company);

    $found = false;
    if ($stmt->execute()) {
        // Buffer the result so num_rows is populated.
        $stmt->store_result();
        $found = $stmt->num_rows > 0;
    }
    $stmt->close();

    return $found;
}

/**
* 抓取信息
* @param $url
* @return ArrayObject
*/
private function getInfoByShopUrl($url) {
$re = preg_replace('/(.*)/', '\1', str_replace('', '', str_replace('', '', $this->getUrlInfo($url))));
preg_match_all('/(.*)/Usi', $re, $companyArr); <br> preg_match_all('/<strong>联系人:\s*<td.>(.*)/Usi', $re, $personArr); <br> preg_match_all('/<strong>电话:\s*<td.>(.*)/Usi', $re, $phoneArr); <br> preg_match_all('/<strong>手机:\s*<td.>(.*)/Usi', $re, $mobileArr); <br> preg_match_all('/<strong>QQ:\s*<td.>(.*)/Usi', $re, $qqArr); <br> preg_match_all('/<strong>MSN:\s*<td.>(.*)/Usi', $re, $msnArr); <br> preg_match_all('/<strong>传真:\s*<td.>(.*)/Usi', $re, $faxArr); <br> preg_match_all('/<strong>EMail:\s*<td.>(.*)/Usi', $re, $emailArr); <br> preg_match_all('/司地址[:]*[]*[<strong>]*[:]*[]*\s*<td.>(.*)/Usi', $re, $addressArr); <br> preg_match_all('/<strong>国家[:]*[<strong>]*[:]*[]*\s*<td.>(.*)/Usi', $re, $countryArr); <br> preg_match_all('/<strong>地区:\s*<td.>(.*)/Usi', $re, $regionArr); <br> preg_match_all('/<strong>邮政编码:\s*<td.>(.*)/Usi', $re, $zipArr); <br> preg_match_all('/<strong>\s*网址[1]*:\s*<td.>(.*)/Usi', $re, $webArr); <br> $infoArr = array( <br> 'company'=>$this->formatStr($companyArr[1][0]), <br> 'person'=>$this->formatStr($personArr[1][0]), <br> 'phone'=>$this->formatStr($phoneArr[1][0]), <br> 'mobile'=>$this->formatStr($mobileArr[1][0]), <br> 'qq'=>$this->formatStr($qqArr[1][0]), <br> 'msn'=>$this->formatStr($msnArr[1][0]), <br> 'fax'=>$this->formatStr($faxArr[1][0]), <br> 'email'=>$this->formatStr($emailArr[1][0]), <br> 'address'=>$this->formatStr($addressArr[1][0]), <br> 'country'=>$this->formatStr($countryArr[1][0]), <br> 'region'=>$this->formatStr($regionArr[1][0]), <br> 'zip'=>$this->formatStr($zipArr[1][0]), <br> 'web'=>$this->formatStr($webArr[1][0]), <br> 'shopUrl'=>$url <br> ); <br> return $infoArr; <br> } <br><br> /**<br> * 根据页面获取供应商 url 数组<br> * @param string $re<br> * @return ArrayObject<br> */ <br> private function shopUrlMatchReArr($re) { <br> preg_match_all('/<p class="Company"><a. 
href="%5C%22(.+)%5C%22.*">[<font color="#FF0000">]*.*[]*\s*/Usi', $re, $arr); <br> $arr = $this->formatUrlArr(array_unique($arr[1])); <br> return $arr; <br> } <br><br> /**<br> * 格式化数组<br> * @param Array $arr<br> * @return ArrayObject<br> */ <br> private function formatUrlArr($arr) { <br> $newArr = array(); <br> foreach ($arr as $key=>$value) { <br> if ($this->isExistsHttp($value)) { <br> $newArr[$key] = $value; <br> } <br> } <br> return $newArr; <br> } <br><br> /**<br> * 格式化 QQ<br> * @param string $str<br> * @return string<br> */ <br> private function formatQqMsn($str, $e='QQ') { <br> if (emptyempty($str)) return ''; <br> preg_match_all('/alt="'.$e.'\:(.+)"/Usi', $str, $arr); <br> if (count($arr[1])==1) return $arr[1][0]; <br> $newStr = null; <br> foreach ($arr[1] as $value) { <br> $newStr .= $value . ' '; <br> } <br> return rtrim($newStr, ' '); <br> } <br><br> /**<br> * 供应商店铺链接添加 contact.asp<br> * @param array $arr<br> * @return string <br> */ <br> private function shopAddContact($arr) { <br> foreach ($arr as $k=>$v) { <br> if (stristr($v, 'contact.asp')===FALSE) <br> $newArr[$k] = $this->addContact($v); <br> else <br> $newArr[$k] = $v; <br> } <br> return $newArr; <br> } <br><br> /**<br> * 链接添加 contact.asp<br> * @param string $str<br> * @return string <br> */ <br> private function addContact($str) { <br> return $str . 
'/contact.asp'; <br> } <br><br> /**<br> * 去掉网址的 A 标签<br> * @param string $site<br> * @return string<br> */ <br> private function stripATags($site) { <br> $site = preg_replace('/<a.>(.+)/', '\1', $site); <br> return $site; <br> } <br><br> /**<br> * 检查 url 是否有 http<br> * @param string $url<br> * @return bool<br> */ <br> private function isExistsHttp($url) { <br> if (stristr($url, 'http://')) { <br> return true; <br> } else { <br> return false; <br> } <br> } <br><br> /**<br> * 获取页面内容<br> * @param Number $page<br> * @return string<br> */ <br> private function getContent($page=1) { <br> $re = file_get_contents($this->getUrl($this->key, $page)); <br> return $re; <br> } <br><br> /**<br> * 获取页码<br> * @return Number<br> */ <br> private function getPageNum() { <br> preg_match_all('/共.*条记录分(.*)页显示/Usi', $this->getContent(), $arr); <br> return $arr[1][0]; <br> } <br><br> /**<br> * 获取 URL 链接<br> * @param string $str<br> * @param int $page 页码<br> * @return string<br> */ <br> private function getUrl($str, $page=1) { <br> return "http://www.ic37.com/sell/search.asp?keyword={$str}&x=86&y=22&page={$page}"; <br> } <br><br> /**<br> * 获取页面内容<br> * @param string $url<br> * @return string<br> */ <br> private function getUrlInfo($url) { <br> $re = file_get_contents($url); <br> return $re; <br> } <br> } <br><br> /*<br> 程序运行思路:根据“中国 IC 网”的IC搜索功能,输入型号进行搜索,然后抓取供应商信息<br><br> 数据库结构<br> CREATE TABLE `ic37` (<br> `id` mediumint(8) unsigned NOT NULL auto_increment,<br> `company` varchar(500) default NULL,<br> `person` varchar(500) default NULL,<br> `phone` varchar(500) default NULL,<br> `mobile` varchar(500) default NULL,<br> `qq` varchar(500) default NULL,<br> `msn` varchar(500) default NULL,<br> `fax` varchar(500) default NULL,<br> `email` varchar(500) default NULL,<br> `address` varchar(1000) default NULL,<br> `country` varchar(500) default NULL,<br> `region` varchar(500) default NULL,<br> `zip` varchar(500) default NULL,<br> `web` varchar(500) default NULL,<br> `shopUrl` varchar(500) default 
NULL,<br> PRIMARY KEY (`id`)<br> ) ENGINE=InnoDB DEFAULT CHARSET=gb2312<br> */ <br><br> $k = new ic37(); <br> $arr = array_unique(array('MAX3232', 'AML8613', 'MT6225A', 'OM8373PS/N3/A', 'PT7313', 'MAX8212ESA', 'TL431', 'S3C2440', 'TMS320F2812PGFA', 'PCM1704', 'AN6717', 'CA3162E', 'CA3161E', 'LM393N', 'DS18B20', 'SHT10', 'AML8613', 'AN6717', 'LM393N', 'CA3161E', 'CA3162E', 'PCM1704', 'STK392-040', 'K1667', 'MAX232', 'STM32F103', 'LM358', 'NE555', '78L05', 'LM324', 'TL431', 'PC817', '7805', 'LM339', 'LM317', '46A-3GRI', 'MODEL', '78L05', '93C46-3GRI', '8050', 'DS18B20', 'TDA2030', 'LM393', '74HC595', '6N137', 'SN75176BDR')); <br> foreach ($arr as $v) { <br> $k->go($v); <br> } <br> ?> <br><?php <br /> /**<br> * 抓取“中国 IC 网(http://www.ic37.com)”供应商主程序<br> * author Lee.<br> * Last modify $Date: 2012-2-9 9:32:21 $<br> * 注:本程序按照编码 GB2312 执行,因为“中国 IC 网”网站是GB2312编码,数据库也得保持一致<br> */<br> class ic37 {<br> private $key; // 型号<br> private $pageNum; // 页码</a.></font></a.></p> <p> /**<br> * 入口程序<br> */<br> public function go($key) {<br> $this->key = $key;<br> $this->pageNum = $this->getPageNum();<br> $this->getInfo();<br> }</p> <p> /**<br> * 获取供应商 url 链接数组<br> * @return ArrayObject<br> */<br> private function getInfo() {<br> if ($this->pageNum==1) { # 处理只有一页的情况<br> $arr = $this->shopAddContact($this->shopUrlMatchReArr($this->getContent()));<br> $this->isAddSuccess($arr);<br> } elseif ($this->pageNum>1) { # 多页<br> for ($i=1; $ipageNum; $i++) {<br> $arr = $this->shopAddContact($this->shopUrlMatchReArr($this->getContent($i)));<br> $this->isAddSuccess($arr);<br> }<br> } <br> }<br><br> /**<br> * 打印是否添加成功<br> * @param ArrayObject $arr<br> * @return string<br> */<br> private function isAddSuccess($arr) {<br> foreach ($arr as $k=>$v) {<br> if ($this->execAdd($this->getInfoByShopUrl($v))) {<br> echo 'Add Success!!';<br> } else {<br> echo 'Add Faild!!';<br> }<br> }<br> }</p> <p> /**<br> * 执行添加到数据库<br> * @param ArrayObject $infoArr<br> * @return Number 受影响的行数<br> */<br> private function 
execAdd($infoArr) {<br> $mysqli = $this->getDb();<br> if (!empty($infoArr['company'])) {<br> if (!$this->isExists($mysqli, $infoArr)) {<br> $num = $mysqli->query("INSERT INTO ic37(company,person,phone,mobile,qq,msn,fax,email,address,country,region,zip,web,shopUrl) VALUES ('{$infoArr['company']}','{$infoArr['person']}','{$infoArr['phone']}','{$infoArr['mobile']}','{$infoArr['qq']}','{$infoArr['msn']}','{$infoArr['fax']}','{$infoArr['email']}','{$infoArr['address']}','{$infoArr['country']}','{$infoArr['region']}','{$infoArr['zip']}','{$infoArr['web']}','{$infoArr['shopUrl']}')");<br> return $num;<br> } else {<br> return false; # 表示数据已经存在<br> }<br> } else {<br> return false;<br> }<br> }<br><br> private function formatStr($str) {<br> $str = trim($str);<br> $str = str_replace(' ', '', $str);<br> $str = str_replace('==联系我们', '', $str);<br> return $str;<br> }</p> <p> /**<br> * 连接数据库<br> */<br> private function getDb() {<br> $mysqli = new mysqli('localhost', 'root', '1715544', 'weiku');<br> $mysqli->query('SET NAMES GB2312');<br> return $mysqli;<br> }</p> <p> /**<br> * 检查公司是否已经存在<br> * @param Resource $mysqli<br> * @param ArrayObject $infoArr<br> * @return bool<br> */<br> private function isExists($mysqli, $infoArr) {<br> $mysqli->query("SELECT company FROM weiku WHERE company = '{$infoArr['company']}'");<br> if ($mysqli->affected_rows) {<br> return true;<br> } else {<br> return false;<br> }<br> }</p> <p> /**<br> * 抓取信息<br> * @param $url<br> * @return ArrayObject<br> */<br> private function getInfoByShopUrl($url) {<br> $re = preg_replace('/<a.>(.*)/', '\1', str_replace('</a.></p></td.></strong></td.></strong></td.></strong></td.></strong></strong></td.></strong></td.></strong></td.></strong></td.></strong></td.></strong></td.></strong></td.></strong></td.></strong>
', '', str_replace('', '', $this->getUrlInfo($url))));
preg_match_all('/(.*)/Usi', $re, $companyArr);<br> preg_match_all('/<strong>联系人:\s*<td.>(.*)/Usi', $re, $personArr);<br> preg_match_all('/<strong>电话:\s*<td.>(.*)/Usi', $re, $phoneArr);<br> preg_match_all('/<strong>手机:\s*<td.>(.*)/Usi', $re, $mobileArr);<br> preg_match_all('/<strong>QQ:\s*<td.>(.*)/Usi', $re, $qqArr);<br> preg_match_all('/<strong>MSN:\s*<td.>(.*)/Usi', $re, $msnArr);<br> preg_match_all('/<strong>传真:\s*<td.>(.*)/Usi', $re, $faxArr);<br> preg_match_all('/<strong>EMail:\s*<td.>(.*)/Usi', $re, $emailArr);<br> preg_match_all('/司地址[:]*[]*[<strong>]*[:]*[]*\s*<td.>(.*)/Usi', $re, $addressArr);<br> preg_match_all('/<strong>国家[:]*[<strong>]*[:]*[]*\s*<td.>(.*)/Usi', $re, $countryArr);<br> preg_match_all('/<strong>地区:\s*<td.>(.*)/Usi', $re, $regionArr);<br> preg_match_all('/<strong>邮政编码:\s*<td.>(.*)/Usi', $re, $zipArr);<br> preg_match_all('/<strong>\s*网址[1]*:\s*<td.>(.*)/Usi', $re, $webArr);<br> $infoArr = array(<br> 'company'=>$this->formatStr($companyArr[1][0]),<br> 'person'=>$this->formatStr($personArr[1][0]),<br> 'phone'=>$this->formatStr($phoneArr[1][0]),<br> 'mobile'=>$this->formatStr($mobileArr[1][0]),<br> 'qq'=>$this->formatStr($qqArr[1][0]),<br> 'msn'=>$this->formatStr($msnArr[1][0]),<br> 'fax'=>$this->formatStr($faxArr[1][0]),<br> 'email'=>$this->formatStr($emailArr[1][0]),<br> 'address'=>$this->formatStr($addressArr[1][0]),<br> 'country'=>$this->formatStr($countryArr[1][0]),<br> 'region'=>$this->formatStr($regionArr[1][0]),<br> 'zip'=>$this->formatStr($zipArr[1][0]),<br> 'web'=>$this->formatStr($webArr[1][0]),<br> 'shopUrl'=>$url<br> );<br> return $infoArr;<br> } <p> /**<br> * 根据页面获取供应商 url 数组<br> * @param string $re<br> * @return ArrayObject<br> */<br> private function shopUrlMatchReArr($re) {<br> preg_match_all('/</p> <p class="Company"><a. 
href="%5C%22(.+)%5C%22.*">[<font color="#FF0000">]*.*[]*\s*/Usi', $re, $arr);<br> $arr = $this->formatUrlArr(array_unique($arr[1]));<br> return $arr;<br> }<br><br> /**<br> * 格式化数组<br> * @param Array $arr<br> * @return ArrayObject<br> */<br> private function formatUrlArr($arr) {<br> $newArr = array();<br> foreach ($arr as $key=>$value) {<br> if ($this->isExistsHttp($value)) {<br> $newArr[$key] = $value;<br> }<br> }<br> return $newArr;<br> }<br><br> /**<br> * 格式化 QQ<br> * @param string $str<br> * @return string<br> */<br> private function formatQqMsn($str, $e='QQ') {<br> if (empty($str)) return '';<br> preg_match_all('/alt="'.$e.'\:(.+)"/Usi', $str, $arr);<br> if (count($arr[1])==1) return $arr[1][0];<br> $newStr = null;<br> foreach ($arr[1] as $value) {<br> $newStr .= $value . ' ';<br> }<br> return rtrim($newStr, ' ');<br> }</font></a.></p> <p> /**<br> * 供应商店铺链接添加 contact.asp<br> * @param array $arr<br> * @return string <br> */<br> private function shopAddContact($arr) {<br> foreach ($arr as $k=>$v) {<br> if (stristr($v, 'contact.asp')===FALSE)<br> $newArr[$k] = $this->addContact($v);<br> else<br> $newArr[$k] = $v;<br> }<br> return $newArr;<br> }<br><br> /**<br> * 链接添加 contact.asp<br> * @param string $str<br> * @return string <br> */<br> private function addContact($str) {<br> return $str . 
'/contact.asp';<br> }</p> <p> /**<br> * 去掉网址的 A 标签<br> * @param string $site<br> * @return string<br> */<br> private function stripATags($site) {<br> $site = preg_replace('/<a.>(.+)/', '\1', $site);<br> return $site;<br> }</a.></p> <p> /**<br> * 检查 url 是否有 http<br> * @param string $url<br> * @return bool<br> */<br> private function isExistsHttp($url) {<br> if (stristr($url, 'http://')) {<br> return true;<br> } else {<br> return false;<br> }<br> }<br><br> /**<br> * 获取页面内容<br> * @param Number $page<br> * @return string<br> */<br> private function getContent($page=1) {<br> $re = file_get_contents($this->getUrl($this->key, $page));<br> return $re;<br> }<br><br> /**<br> * 获取页码<br> * @return Number<br> */<br> private function getPageNum() {<br> preg_match_all('/共.*条记录分(.*)页显示/Usi', $this->getContent(), $arr);<br> return $arr[1][0];<br> }</p> <p> /**<br> * 获取 URL 链接<br> * @param string $str<br> * @param int $page 页码<br> * @return string<br> */<br> private function getUrl($str, $page=1) {<br> return "http://www.ic37.com/sell/search.asp?keyword={$str}&x=86&y=22&page={$page}";<br> }</p> <p> /**<br> * 获取页面内容<br> * @param string $url<br> * @return string<br> */<br> private function getUrlInfo($url) {<br> $re = file_get_contents($url);<br> return $re;<br> }<br> }</p> <p>/*<br> 程序运行思路:根据“中国 IC 网”的IC搜索功能,输入型号进行搜索,然后抓取供应商信息</p> <p>数据库结构<br> CREATE TABLE `ic37` (<br> `id` mediumint(8) unsigned NOT NULL auto_increment,<br> `company` varchar(500) default NULL,<br> `person` varchar(500) default NULL,<br> `phone` varchar(500) default NULL,<br> `mobile` varchar(500) default NULL,<br> `qq` varchar(500) default NULL,<br> `msn` varchar(500) default NULL,<br> `fax` varchar(500) default NULL,<br> `email` varchar(500) default NULL,<br> `address` varchar(1000) default NULL,<br> `country` varchar(500) default NULL,<br> `region` varchar(500) default NULL,<br> `zip` varchar(500) default NULL,<br> `web` varchar(500) default NULL,<br> `shopUrl` varchar(500) default NULL,<br> PRIMARY KEY (`id`)<br> 
) ENGINE=InnoDB DEFAULT CHARSET=gb2312<br> */</p> <p>$k = new ic37();<br> $arr = array_unique(array('MAX3232', 'AML8613', 'MT6225A', 'OM8373PS/N3/A', 'PT7313', 'MAX8212ESA', 'TL431', 'S3C2440', 'TMS320F2812PGFA', 'PCM1704', 'AN6717', 'CA3162E', 'CA3161E', 'LM393N', 'DS18B20', 'SHT10', 'AML8613', 'AN6717', 'LM393N', 'CA3161E', 'CA3162E', 'PCM1704', 'STK392-040', 'K1667', 'MAX232', 'STM32F103', 'LM358', 'NE555', '78L05', 'LM324', 'TL431', 'PC817', '7805', 'LM339', 'LM317', '46A-3GRI', 'MODEL', '78L05', '93C46-3GRI', '8050', 'DS18B20', 'TDA2030', 'LM393', '74HC595', '6N137', 'SN75176BDR'));<br> foreach ($arr as $v) {<br> $k->go($v);<br> }<br> ?></p> <br> 摘自 Lee.的专栏<br><p align="left"></p> <div style="display:none;"> <span id="url" itemprop="url">http://www.bkjia.com/PHPjc/478403.html</span><span id="indexUrl" itemprop="indexUrl">www.bkjia.com</span><span id="isOriginal" itemprop="isOriginal">true</span><span id="isBasedOnUrl" itemprop="isBasedOnUrl">http://www.bkjia.com/PHPjc/478403.html</span><span id="genre" itemprop="genre">TechArticle</span><span id="description" itemprop="description">?php /** * 抓取中国 IC 网(http://www.ic37.com)供应商主程序 * author Lee. * Last modify $Date: 2012-2-9 9:32:21 $ * 注:本程序按照编码 GB2312 执行,因为中...</span> </div></td.></strong></td.></strong></td.></strong></td.></strong></strong></td.></strong></td.></strong></td.></strong></td.></strong></td.></strong></td.></strong></td.></strong></td.></strong>