欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页  >  IT编程

PHP 抓取函数curl 实践

程序员文章站 2022-12-28 10:54:46
最近在学习 cURL 的抓取实践,在里面也学到了一些东西。有一些网站需要 cookie 才可以抓取成功,这个时候我们就可以通过 Fiddler 4 抓包,然后构建相应的请求头部信息,绕过网站端的验证。以下是实现代码:

最近在学习curl的抓取实践, 在里面也学到了一些东西。

有一些网站需要 cookie 才可以抓取成功。这个时候我们就可以通过 Fiddler 4 抓包,然后构建相应的请求头部信息,绕过网站端的验证。

以下是实现代码:

 

<?php
// Fetches a keyword search page with cURL while sending browser-like request
// headers and a previously captured cookie, extracts the link text of every
// "container-tag" entry from the returned HTML, prints the results, and
// finally reports how long the whole run took.

$stime = microtime(true); // start of the timed section

// Search keywords, URL-encoded so they can be placed in the query string.
$ckw = urlencode("圆形折叠麻将机全自动餐桌两用带椅子机麻家用欧式实木电动麻将桌php");

$ch = curl_init();

// cURL option constants are case-sensitive and must be written in upper case.
// The URL must not contain raw spaces, otherwise the request is malformed.
curl_setopt($ch, CURLOPT_URL, "http://www.meegoe.com/sou_y3sl/?keywords=" . $ckw);

// The request is plain HTTP, so the TLS verification switches are not set
// here; leaving them at their defaults keeps certificate checks enabled should
// the URL ever be changed to https.
curl_setopt($ch, CURLOPT_TIMEOUT, 10); // give up after 10 seconds
curl_setopt($ch, CURLOPT_HEADER, 0); // do not include the response headers in the result
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); // return the body instead of echoing it

// Let cURL send the Accept-Encoding header itself and transparently decode the
// response. A hand-written accept-encoding header is intentionally NOT added
// below: it would override this one and could advertise encodings (e.g. br)
// that this cURL build cannot decode.
curl_setopt($ch, CURLOPT_ENCODING, 'gzip,deflate');

curl_setopt($ch, CURLOPT_HTTPHEADER, [
    'connection: keep-alive',
    'upgrade-insecure-requests: 1',
    'user-agent: mozilla/5.0 (windows nt 10.0; win64; x64) applewebkit/537.36 (khtml, like gecko) chrome/71.0.3578.98 safari/537.36',
    'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'referer: http://www.meegoe.com/',
    'accept-language: zh-cn,zh;q=0.9',
]);

// NOTE(review): this is a captured browser session cookie committed in source.
// It contains login identifiers and tokens; it should be rotated and loaded
// from configuration or the environment rather than kept in the script.
curl_setopt($ch, CURLOPT_COOKIE, 'cna=3kdtfid2qkgcababnczxpp/x; hng=cn%7czh-cn%7ccny%7c156; um_distinctid=1694cacf99e377-0508f14d59d804-b781636-100200-1694cacf99f503; lid=haosweet; ali_ab=139.207.69.183.1552121532339.7; ali_apache_id=11.23.78.17.1552121946782.441191.7; h_keys="%u714e%u997c#%u4ebf%u5065%u8702%u80f6#arrl#%u8702%u871c"; ad_prefer="2019/03/10 16:12:09"; alicnweb=touch_tb_at%3d1552399125776%7clastlogonid%3dlweizwer1%7cshow_inter_tips%3dfalse%7chomeidtts%3d00964423784623518415084922041883626898%7chomeidttsaction%3dtrue; l=bbevjusvvczbe9oxbocivqhfho_t7irxmusjcrvmi_5znsy1vuqol_0deuv6vj5rslyb4z6vznp9-etlw; xsrf-token=2f40c7b9-d618-4a87-929c-6ec68e187b2c; cnzzdata1261998348=973403025-1552118420-%7c1552486140; _m_h5_tk=4727e399320af3a5455fcd4219c72fab_1552498204613; _m_h5_tk_enc=12f885140e5402698a7e5018ae67b361; cookie2=58efdb753105b686ecef55419f454629; t=dbb53d6a3987478e0c4062af14864a21; _tb_token_=5376f111de15e; __wapcsf__=1; cnzzdata1000231236=1647319684-1552120971-https%253a%252f%252fm.1688.com%252f%7c1552488361; cookie1=ac0hycjxlfdq3rn7w3vkzbxzzacsgna0aclxtmnl0ly%3d; cookie17=uuwz%2fisa2fe%3d; sg=t49; csg=1d0e7cb7; unb=24919014; cn_tmp=z28mc+gqtz3c/avrs/ymfpz2bd/61p+3mihrxh1ln3drzyi6ty99cop9r4fxqg5hefcbmbgfcom0uj5s4hkyp8d/dcaxlrtbfg1h7fjyamce010i4n7w+jg7vhr/kazj; cn_m_s=f45iamzwweuupgx4wq4km4etjyv3kx2cqwhnxycgwpvdxe/jmovbc3dcocy+zuvvq8hof3fxkpa=; ali_apache_track="c_mid=b2b-24919014|c_lid=haosweet"; tbsnid=l/wtnzrvkmphmdhmdlmplegyswk1oiye25mgnn62tsi6solepjkl9g==; __cn_logon__=true; __cn_logon_id__=haosweet; ali-ss=bg9naw5jzd1oyw9zd2vldczsb2dpbk1lc3nhz2vfcnjvcj0mbg9naw5tdgf0dxnszxrnc2c9jnvzzxjjzd0yndkxotaxnczsb2dpbkvycm9yvxnlck5hbwu9jmnozwnry29kzt0mbwvtymvyswq9yjjilti0ote5mde0jnnpzd01ogvmzgi3ntmxmdvinjg2zwnlzju1nde5zjq1ndyyoszly29kzt1pzvntsw==; isg=bgxsuluhhlbc1gh-fazc2peupupejrhmgizbbmatwze60q3b7jeyxvjz9dlm2ugn');

$output = curl_exec($ch);
if ($output === false) {
    // Read the error before the handle is released, then stop: there is no
    // body to parse.
    $curlError = curl_error($ch);
    curl_close($ch);
    echo "cURL request failed: {$curlError}";
    exit(1);
}
curl_close($ch);

// Remove all line breaks, tabs and spaces so the pattern below can match the
// markup without caring about formatting. The character class lists only
// whitespace; a "|" inside [...] is a literal pipe and must not be stripped.
$contents = preg_replace('/[\r\n\t ]+/', '', $output);
if ($contents === null) {
    $contents = $output; // stripping failed; fall back to the raw body
}

// Match while the text is still UTF-8: the "u" modifier rejects any subject
// that is not valid UTF-8, so the GBK conversion must happen afterwards.
// Lazy quantifiers keep each match inside a single container-tag entry.
// Group 2 is the href value, group 3 the link text.
$matchrelated = [];
$matchCount = preg_match_all(
    "/(<divclass=\"container-tag\"><ahref=\"(.*?)\">)(.*?)(<\/a><\/div>)/ius",
    $contents,
    $matchrelated
);
if ($matchCount === false) {
    echo 'Pattern matching failed, PCRE error code: ' . preg_last_error();
}

// Output is emitted as GBK, converted from UTF-8, for both the page dump and
// the extracted link texts.
print_r(mb_convert_encoding($contents, 'gbk', 'utf-8'));

$relatedTitles = [];
if (isset($matchrelated[3])) {
    foreach ($matchrelated[3] as $title) {
        $relatedTitles[] = mb_convert_encoding($title, 'gbk', 'utf-8');
    }
}
print_r($relatedTitles);

$etime = microtime(true); // end of the timed section
$total = $etime - $stime; // elapsed seconds

echo "<br />当前页面执行时间为:{$total} 秒";
?>