采集数据
程序员文章站
2022-07-02 09:48:25
...
<?php
// Script setup for the Baidu game-ranking scraper: builds the DB connection
// settings, the logger, the DB handle, today's date stamp and the list of
// ranking pages to gather.

// Database connection settings (the LOCAL_DB_* constants are defined elsewhere).
$config = array(
    'url' => LOCAL_DB_HOST . ':3306',
    'user' => LOCAL_DB_USER,
    'password' => LOCAL_DB_PWD,
    'db' => LOCAL_DB_NAME,
);

// NOTE(review): not read anywhere in this script section; presumably consumed
// by included code -- confirm before removing.
$is_debug = true;

// The first command-line argument switches logging on/off (passed as log_level).
// Guarded with isset() so running the script without arguments does not raise
// an undefined-offset notice; the logger receives null in that case.
$log = new CommonLog(array(
    'log_level' => isset($_SERVER['argv'][1]) ? $_SERVER['argv'][1] : null,
));

// Database access instance.
$db = new DBConfig($config);

// Date stamp (Ymd) written to every row gathered by this run.
$today = date('Ymd');

// Ranking type => URL of the ranking page to scrape.
$type_list = array(
    // 'game' => "http://top.baidu.com/buzz/game.html",  (disabled)
    'webgame' => "http://top.baidu.com/buzz/mmogame.html",
    'rpg' => "http://top.baidu.com/buzz/magic_rpg.html",
);
// Gather each ranking page once per day: skip a type whose newest stored
// data_date is already today, otherwise download the page, parse the ranking
// table and insert one row per entry into web_baidu_gametop50.
foreach ($type_list as $type => $url) {
    // Newest date already stored for this ranking type (falsy when none).
    // $type comes from the hard-coded $type_list keys, not from user input.
    $max_date = $db->query_single("select max(data_date)
from web_baidu_gametop50
where type = '$type'");

    // Already gathered today: nothing to do for this type.
    if ($max_date && $max_date >= $today) {
        $log->debug("page: (" . $url . ")has gathered before");
        continue;
    }

    $log->debug("start at page: " . $url);

    // Download the page, retrying on failure up to 9 attempts in total.
    $page = false;
    for ($attempt = 0; $attempt < 9 && $page === false; $attempt++) {
        $page = @file_get_contents($url);
    }

    // Every attempt failed: do not feed `false` into the parser, just report
    // it and move on to the next ranking type.
    if ($page === false) {
        $log->debug("failed to fetch page: " . $url);
        continue;
    }

    // phpQuery cannot handle special characters while the document declares
    // gb2312, so rewrite the declared charset to utf-8 ...
    $page = preg_replace('/gb2312/i', 'utf-8', $page);

    // ... then convert the actual bytes from gb2312 to utf-8 and parse.
    $doc = phpQuery::newDocumentHTML(
        mb_convert_encoding($page, 'utf-8', 'gb2312')
    );

    // Walk every table row of the ranking list except the header row (.th).
    foreach ($doc->find("div.list > table > tbody > tr")->not(".th") as $tr) {
        $tr = pq($tr);

        $data = array();
        $data['index_id'] = $tr->find('> th:nth-child(1)')->text();
        $data['key_name'] = $tr->find('> td:nth-child(2)')->text();
        // Searches today.
        $data['search_num'] = $tr->find('> td:nth-child(5)')->text();
        // Searches over the last seven days.
        $data['count_num'] = $tr->find('> td:nth-child(6)')->text();
        // online_day and avg_num are not scraped; they are stored as 0.
        $data['online_day'] = 0;
        $data['avg_num'] = 0;
        $data['data_date'] = $today;
        $data['type'] = $type;

        $db->insert_array("web_baidu_gametop50", $data);
    }
}
?>
dd
上一篇: stm32的低功耗模式
下一篇: python截取视频某一帧