欢迎您访问程序员文章站,本站旨在为大家提供和分享程序员计算机编程知识!
您现在的位置是: 首页

采集数据

程序员文章站 2022-07-02 09:48:25
...
<?php
// Database connection settings. The LOCAL_DB_* constants are not defined in
// this file and must be defined before this script runs.
$config = array(
    'url'       => LOCAL_DB_HOST.':3306',
    'user'      => LOCAL_DB_USER,
    'password'  => LOCAL_DB_PWD,
    'db'        => LOCAL_DB_NAME,
);
$is_debug = true;

// The first command-line argument is the logging switch. It is optional:
// when the script is started without arguments the log level is passed as
// null instead of reading an undefined array index.
$log = new CommonLog(array(
    'log_level' => isset($_SERVER["argv"][1]) ? $_SERVER["argv"][1] : null,
));

// Database handle used for all queries and inserts below.
$db = new DBConfig($config);

// Today's date as YYYYMMDD; compared against and stored in `data_date`.
$today = date('Ymd');

// Ranking pages to gather, keyed by the value written to the `type` column.
$type_list = array(

    // 'game'      => "http://top.baidu.com/buzz/game.html",

    'webgame'   => "http://top.baidu.com/buzz/mmogame.html",
    'rpg'       => "http://top.baidu.com/buzz/magic_rpg.html",
);

// Gather each ranking page at most once per day and store its rows in
// web_baidu_gametop50.
foreach ($type_list as $type => $url)
{
    // Most recent date already stored for this ranking type. $type comes from
    // the hard-coded keys of $type_list, not from external input.
    $max_date = $db->query_single("select max(data_date)
                                    from web_baidu_gametop50
                                    where type = '$type'");

    // NOTE(review): this comparison assumes data_date is stored in the same
    // YYYYMMDD form as $today -- confirm against the table schema.
    if (! $max_date || $max_date < $today)
    {
        $log->debug("start at page: " . $url);

        // Fetch the page, trying up to 9 times before giving up. The "@"
        // suppresses the warning of each failed attempt; failure is detected
        // through the FALSE return value instead.
        $page = false;
        for ($attempt = 0; $attempt < 9; $attempt++)
        {
            $page = @file_get_contents($url);
            if ($page !== false)
            {
                break;
            }
        }

        if ($page === false)
        {
            // Every attempt failed: skip this ranking instead of handing a
            // failed read to the parser and silently storing nothing.
            $log->debug("failed to fetch page: " . $url);
            continue;
        }

        // Rewrite the declared charset: phpQuery cannot handle special
        // characters while the document still declares gb2312.
        $page = preg_replace('/gb2312/i', 'utf-8', $page);

        // Convert the bytes from gb2312 to utf-8, then parse the document.
        $doc = phpQuery::newDocumentHTML(
                mb_convert_encoding($page, 'utf-8', 'gb2312')
               );

        // One table row per ranked keyword; rows with class "th" are excluded.
        foreach ($doc->find("div.list > table > tbody > tr")->not(".th") as $tr)
        {
            $tr                 = pq($tr);
            $data               = array();
            $data['index_id']   = $tr->find('> th:nth-child(1)')->text();
            $data['key_name']   = $tr->find('> td:nth-child(2)')->text();

            // Searches today
            $data['search_num'] = $tr->find('> td:nth-child(5)')->text();

            // Last seven days
            $data['count_num']  = $tr->find('> td:nth-child(6)')->text();

            // Previously these two columns were read from the page; they are
            // now stored as 0.
            // $data['online_day'] = $tr->find('> td:nth-child(5)')->text();
            // $data['avg_num']    = $tr->find('> td:nth-child(6)')->text();

            $data['online_day'] = 0;
            $data['avg_num']    = 0;
            $data['data_date']  = $today;
            $data['type']       = $type;
            $db->insert_array("web_baidu_gametop50", $data);
        }
    } else
    {
        $log->debug("page: (" . $url . ")has gathered before");
    }
}
?>

 dd