抓取“全球 IC 采购网”供应商程序_PHP教程
程序员文章站
2022-05-23 20:39:24
...
/**
* 抓取“全球 IC 采购网(http://www.qic.com.cn/)”供应商主程序
* author Lee.
* Last modify $Date: 2012-2-7 09:35:21 $
*/
require_once './config.inc.php';
class qic{
private $startId;
private $endId;
/**
 * Fix the range of shop ids that a run will walk.
 */
public function __construct() {
    // The two bounds are independent; both are hard-coded for this crawl.
    $this->endId = 55185;
    $this->startId = 27688;
}
/**
 * Crawl every shop id in the configured range and store new suppliers.
 *
 * For each id the shop page is fetched and parsed; pages that yield no
 * company name are skipped, companies already present in the `qic` table
 * are reported, and everything else is inserted. Progress is echoed.
 */
public function go() {
    // NOTE(review): the published listing had a garbled loop condition
    // (`$iendId`); an inclusive upper bound is assumed here — confirm.
    for ($i = $this->startId; $i <= $this->endId; $i++) {
        $infoArr = $this->getInfoByUrl($this->getUrl($i));
        if (empty($infoArr['company'])) {
            continue;
        }

        $m = new Model();

        // The company name comes from a scraped page, so it must not be
        // dropped into the SQL condition verbatim. addslashes() is a minimal
        // guard; prefer the DB layer's own escaping if Model exposes one.
        $company = addslashes($infoArr['company']);
        if ($m->isExists('qic', "company='{$company}'")) {
            echo 'Data Exists!!';
            continue;
        }

        if ($this->addInfoInDB($m, $infoArr)) {
            echo 'Add Success!!';
        } else {
            echo 'Add Faild!!';
        }
    }
}
/**
* 添加数据进数据库
* @param Object $m
* @param array $infoArr
* @return Number
*/
private function addInfoInDB($m ,$infoArr) {
    // Column order here is also the order of the values handed to insert().
    $columns = array('company', 'person', 'phone', 'mobile', 'fax', 'qq', 'msn', 'email', 'address', 'website', 'shopUrl');

    $values = array();
    foreach ($columns as $column) {
        $values[] = $infoArr[$column];
    }

    // Whatever the model's insert() returns is passed straight back.
    return $m->insert('qic', $columns, $values);
}
/**
* 根据供应商地址获取信息
* @param string $re
* @return ArrayObject
*/
private function getInfoByUrl($url) {
$re = file_get_contents($url);
preg_match_all('/
* 抓取“全球 IC 采购网(http://www.qic.com.cn/)”供应商主程序
* author Lee.
* Last modify $Date: 2012-2-7 09:35:21 $
*/
require_once './config.inc.php';
/**
 * Scraper for supplier pages of the "Global IC Purchasing Network"
 * (http://www.qic.com.cn/).
 *
 * Walks a fixed range of shop ids, extracts the contact details from each
 * shop page and stores every supplier that is not yet in the `qic` table.
 */
class qic{
    /** First shop id to fetch. */
    private $startId;
    /** Last shop id to fetch. */
    private $endId;

    /**
     * Fix the range of shop ids that a run will walk.
     */
    public function __construct() {
        $this->startId = 27688;
        $this->endId = 55185;
    }

    /**
     * Crawl every shop id in the configured range and store new suppliers.
     *
     * Pages that yield no company name are skipped, companies already present
     * in the `qic` table are reported, and everything else is inserted.
     * Progress is echoed.
     */
    public function go() {
        // NOTE(review): the published listing had a garbled loop condition
        // (`$iendId`); an inclusive upper bound is assumed here — confirm.
        for ($i = $this->startId; $i <= $this->endId; $i++) {
            $infoArr = $this->getInfoByUrl($this->getUrl($i));
            if (empty($infoArr['company'])) {
                continue;
            }

            $m = new Model();

            // The company name comes from a scraped page, so it must not be
            // dropped into the SQL condition verbatim. addslashes() is a
            // minimal guard; prefer the DB layer's own escaping if Model
            // exposes one.
            $company = addslashes($infoArr['company']);
            if ($m->isExists('qic', "company='{$company}'")) {
                echo 'Data Exists!!';
                continue;
            }

            if ($this->addInfoInDB($m, $infoArr)) {
                echo 'Add Success!!';
            } else {
                echo 'Add Faild!!';
            }
        }
    }

    /**
     * Insert one supplier record into the `qic` table.
     *
     * @param object $m       model object providing insert()
     * @param array  $infoArr supplier fields as produced by getInfoByUrl()
     * @return mixed whatever $m->insert() returns
     */
    private function addInfoInDB($m ,$infoArr) {
        // Column order here is also the order of the values handed to insert().
        $columns = array('company', 'person', 'phone', 'mobile', 'fax', 'qq', 'msn', 'email', 'address', 'website', 'shopUrl');

        $values = array();
        foreach ($columns as $column) {
            $values[] = $infoArr[$column];
        }

        return $m->insert('qic', $columns, $values);
    }

    /**
     * Fetch a supplier page and extract its contact fields.
     *
     * @param string $url address of the supplier page
     * @return array map with the keys company, person, phone, mobile, fax,
     *               qq, msn, email, address, website and shopUrl; a field
     *               that cannot be found is an empty string
     */
    private function getInfoByUrl($url) {
        $re = $this->getContent($url);
        if ($re === false) {
            // Fetch failed: parse an empty document so every field comes back
            // empty and the caller skips this id.
            $re = '';
        }

        // NOTE(review): these patterns look truncated — an ungreedy `(.*)` at
        // the very end of a pattern can only capture an empty string.
        // Presumably the HTML delimiters were stripped when the listing was
        // published; restore them against the live page markup.
        preg_match_all('/
(.*)/Usi', $re, $companyArr);
        preg_match_all('/\s*联 系 人:(.*)/Usi', $re, $personArr);
        preg_match_all('/
\s*电 话:(.*)/Usi', $re, $phoneArr);
        preg_match_all('/
\s*手 机:(.*)/Usi', $re, $mobileArr);
        preg_match_all('/
\s*传 真:(.*)/Usi', $re, $faxArr);
        preg_match_all('/
\s*QQ:(.*)/Usi', $re, $qqArr);
        preg_match_all('/
\s*MSN:(.*)/Usi', $re, $msnArr);
        preg_match_all('/
\s*邮 箱:(.*)/Usi', $re, $emailArr);
        preg_match_all('/公司地址:(.*)/Usi', $re, $addressArr);
        preg_match_all('/公司网址:(.*)/Usi', $re, $websiteArr);

        return array(
            'company' => $this->formatString($this->firstCapture($companyArr)),
            'person'  => $this->formatString($this->firstCapture($personArr)),
            'phone'   => $this->formatString($this->firstCapture($phoneArr)),
            'mobile'  => $this->formatString($this->firstCapture($mobileArr)),
            'fax'     => $this->formatString($this->firstCapture($faxArr)),
            'qq'      => $this->formatString($this->firstCapture($qqArr), 'qm'),
            'msn'     => $this->formatString($this->firstCapture($msnArr), 'qm'),
            'email'   => $this->formatString($this->firstCapture($emailArr)),
            'address' => $this->formatString($this->firstCapture($addressArr)),
            'website' => $this->formatString($this->firstCapture($websiteArr), 'a'),
            'shopUrl' => $url
        );
    }

    /**
     * Return the first value of capture group 1 from a preg_match_all()
     * result, or an empty string when the pattern did not match.
     *
     * @param array $matches result array filled by preg_match_all()
     * @return string
     */
    private function firstCapture($matches) {
        return isset($matches[1][0]) ? $matches[1][0] : '';
    }

    /**
     * Download the content behind a URL.
     *
     * @param string $url
     * @return string|false page content, or false when the fetch fails
     */
    private function getContent($url) {
        $re = file_get_contents($url);
        return $re;
    }

    /**
     * Normalise a raw captured value.
     *
     * @param string $str  raw value
     * @param string $type 'default' trims only; 'qm' (used for the QQ and MSN
     *                     fields) returns the text between single quotes;
     *                     'a' (used for the website field) runs the value
     *                     through a replacement pattern; any other type
     *                     yields an empty string
     * @return string
     */
    private function formatString($str, $type='default') {
        $str = trim($str);
        switch ($type) {
            case 'default':
                return empty($str) ? '' : $str;
            case 'qm':
                if (empty($str)) {
                    return '';
                }
                preg_match_all('/\'(.+)\'/si', $str, $arr);
                // No quoted segment means there is nothing to extract.
                return isset($arr[1][0]) ? trim($arr[1][0]) : '';
            case 'a':
                // NOTE(review): as written this pattern replaces each line
                // with itself; presumably the original stripped an enclosing
                // tag whose markup was lost from the listing — confirm.
                $str = preg_replace('/(.+)/', '\1', $str);
                return $str;
            default:
                return '';
        }
    }

    /**
     * Build the address of a supplier page.
     *
     * @param int $shopId
     * @return string
     */
    private function getUrl($shopId) {
        return "http://www.qic.com.cn/specialstore/tsh_{$shopId}.html";
    }
}
/*
* 表结构
CREATE TABLE `qic` (
`id` mediumint(8) unsigned NOT NULL auto_increment,
`company` varchar(500) NOT NULL,
`person` varchar(500) NOT NULL,
`phone` varchar(300) NOT NULL,
`mobile` varchar(300) NOT NULL,
`fax` varchar(300) NOT NULL,
`qq` varchar(300) NOT NULL,
`msn` varchar(500) NOT NULL,
`email` varchar(500) NOT NULL,
`address` varchar(500) NOT NULL,
`website` varchar(500) NOT NULL,
`shopUrl` varchar(200) NOT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8
* */
// Script entry point: build the scraper and run the whole crawl.
$scraper = new qic();
$scraper->go();
?>
摘自 Lee.的专栏
preg_match_all('/
preg_match_all('/
preg_match_all('/
preg_match_all('/
preg_match_all('/
preg_match_all('/
preg_match_all('/
preg_match_all('/公司地址:(.*)/Usi', $re, $addressArr);
preg_match_all('/公司网址:(.*)/Usi', $re, $websiteArr);
$infoArr = array(
'company'=>$this->formatString($companyArr[1][0]),
'person'=>$this->formatString($personArr[1][0]),
'phone'=>$this->formatString($phoneArr[1][0]),
'mobile'=>$this->formatString($mobileArr[1][0]),
'fax'=>$this->formatString($faxArr[1][0]),
'qq'=>$this->formatString($qqArr[1][0], 'qm'),
'msn'=>$this->formatString($msnArr[1][0], 'qm'),
'email'=>$this->formatString($emailArr[1][0]),
'address'=>$this->formatString($addressArr[1][0]),
'website'=>$this->formatString($websiteArr[1][0], 'a'),
'shopUrl'=>$url
);
return $infoArr;
}
/**
* 获取页面内容
* @param string $url
* @return string
*/
private function getContent($url) {
    // Thin wrapper: hand back exactly what file_get_contents() produced.
    return file_get_contents($url);
}
/**
* 格式化字符串
* @param string $str
* @param string $type 类型
* @return string
*/
private function formatString($str, $type='default') {
    // Every branch works on the trimmed value.
    $str = trim($str);
    switch ($type) {
        case 'default':
            return empty($str) ? '' : $str;
        case 'qm':
            // Return the text found between single quotes.
            if (empty($str)) {
                return '';
            }
            preg_match_all('/\'(.+)\'/si', $str, $arr);
            // No quoted segment means there is nothing to extract.
            return isset($arr[1][0]) ? trim($arr[1][0]) : '';
        case 'a':
            // This statement was cut off in this copy of the listing; it is
            // restored from the intact copy earlier in the file.
            // NOTE(review): as written the pattern replaces each line with
            // itself; presumably tag markup was lost from it — confirm.
            $str = preg_replace('/(.+)/', '\1', $str);
            return $str;
        default:
            return '';
    }
}
/**
* 获取供应商页面地址 www.2cto.com
* @param int $shopId
* @return string
*/
private function getUrl($shopId) {
    // Shop pages live under /specialstore/ and are named after the shop id.
    return sprintf('http://www.qic.com.cn/specialstore/tsh_%s.html', $shopId);
}
}
/*
* 表结构
CREATE TABLE `qic` (
`id` mediumint(8) unsigned NOT NULL auto_increment,
`company` varchar(500) NOT NULL,
`person` varchar(500) NOT NULL,
`phone` varchar(300) NOT NULL,
`mobile` varchar(300) NOT NULL,
`fax` varchar(300) NOT NULL,
`qq` varchar(300) NOT NULL,
`msn` varchar(500) NOT NULL,
`email` varchar(500) NOT NULL,
`address` varchar(500) NOT NULL,
`website` varchar(500) NOT NULL,
`shopUrl` varchar(200) NOT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8
* */
// Script entry point: build the scraper and run the whole crawl.
$scraper = new qic();
$scraper->go();
?>
摘自 Lee.的专栏