PHP Cookbook读书笔记 – 第13章Web自动化
程序员文章站
2022-04-21 11:53:46
...
通过GET获得一个指定url的页面内容 有3种方式来获取一个URL的内容: PHP提供的文件函数file_get_contents() cURL扩展 PEAR中的HTTP_Request类 //方式1$page = file_get_contents('http://www.example.com/robots.txt');//方式2$c = curl_init('http://www.ex
通过GET获得一个指定url的页面内容
有3种方式来获取一个URL的内容:
- PHP提供的文件函数file_get_contents()
- cURL扩展
- PEAR中的HTTP_Request类
// Three independent ways to fetch the body of a URL with a GET request.
// Each variant leaves the response body in $page; use whichever one fits.

// Method 1: PHP's built-in file function.
$page = file_get_contents('http://www.example.com/robots.txt');

// Method 2: the cURL extension.
$c = curl_init('http://www.example.com/robots.txt');
// Make curl_exec() return the response instead of printing it.
curl_setopt($c, CURLOPT_RETURNTRANSFER, true);
$page = curl_exec($c);
curl_close($c);

// Method 3: the PEAR HTTP_Request class.
require_once 'HTTP/Request.php';
$r = new HTTP_Request('http://www.example.com/robots.txt');
$r->sendRequest();
$page = $r->getResponseBody();
可以通过这些方式来获取XML文档;结合http_build_query()可以建立查询字符串;在URL中加入username:password@的形式(例如 http://user:pass@www.example.com/)可以访问受保护的页面;通过cURL和PEAR的HTTP_Client类可以跟踪重定向。
通过POST获得一个URL
让PHP模拟发送一个POST请求并获得服务器的反馈内容
// Three independent ways to send a POST request and read the server's reply.

// Variant 1: file_get_contents() with an HTTP stream context.
$url = 'http://www.example.com/submit.php';
$body = 'monkey=uncle&rhino=aunt';
$options = array(
    'method'  => 'POST',
    // State the body encoding explicitly; without it PHP emits a notice
    // and assumes application/x-www-form-urlencoded.
    'header'  => 'Content-Type: application/x-www-form-urlencoded',
    'content' => $body,
);
$context = stream_context_create(array('http' => $options));
print file_get_contents($url, false, $context);

// Variant 2: the cURL extension.
$url = 'http://www.example.com/submit.php';
$body = 'monkey=uncle&rhino=aunt';
$c = curl_init($url);
curl_setopt($c, CURLOPT_POST, true);
curl_setopt($c, CURLOPT_POSTFIELDS, $body);
// Make curl_exec() return the response instead of printing it.
curl_setopt($c, CURLOPT_RETURNTRANSFER, true);
$page = curl_exec($c);
curl_close($c);

// Variant 3: the PEAR HTTP_Request class.
require_once 'HTTP/Request.php';
$url = 'http://www.example.com/submit.php';
$r = new HTTP_Request($url);
$r->setMethod(HTTP_REQUEST_METHOD_POST);
// Form fields are added one by one rather than as a pre-built string.
$r->addPostData('monkey', 'uncle');
$r->addPostData('rhino', 'aunt');
$r->sendRequest();
$page = $r->getResponseBody();
通过Cookie获得一个URL
// Ways to send cookies along with a request. The numbering follows the
// original snippet, which starts at variant 2.

// Variant 2: the cURL extension.
$c = curl_init('http://www.example.com/needs-cookies.php');
// Cookies are passed as a single "name=value; name=value" string.
curl_setopt($c, CURLOPT_COOKIE, 'user=ellen; activity=swimming');
// Make curl_exec() return the response instead of printing it.
curl_setopt($c, CURLOPT_RETURNTRANSFER, true);
$page = curl_exec($c);
curl_close($c);

// Variant 3: the PEAR HTTP_Request class.
require_once 'HTTP/Request.php';
$r = new HTTP_Request('http://www.example.com/needs-cookies.php');
// The cookies are sent as a plain Cookie request header.
$r->addHeader('Cookie', 'user=ellen; activity=swimming');
$r->sendRequest();
$page = $r->getResponseBody();
通过Header获得一个URL
通过修改header中的信息可以来伪造 Referer 或 User-Agent 后请求目标URL,不少防盗链网站经常会采用判断Referer中的信息来源决定是否允许下载或访问资源。需要具备一些HTTP的HEADER背景知识。
标记网页
其实这个代码经过简单修改还可以应用到替换网页中的敏感关键字,这在天朝是很有用的一个功能
// Wrap every occurrence of the chosen words in a <span>, but only in the
// text of the page: anything inside an HTML tag (element names, attribute
// values) is left untouched.

// Sample page. It deliberately contains the words inside tags
// (href="pickle.php", <herring>) to show that those are skipped.
$body = '
<p>I like pickles and herring.</p>
<a href="pickle.php"><img src="pickle.jpg"/>A pickle picture</a>
I have a herringbone-patterned toaster cozy.
<herring>Herring is not a real HTML element!</herring>
';
$words = array('pickle', 'herring');
$patterns = array();
$replacements = array();
foreach ($words as $i => $word) {
    // Quote the word for use between "/" delimiters; match case-insensitively.
    $patterns[] = '/' . preg_quote($word, '/') . '/i';
    // \0 is the matched text, so the original capitalisation is preserved.
    // Each word gets its own class (word-0, word-1, ...) based on its index.
    $replacements[] = "<span class='word-$i'>\\0</span>";
}
// Split up the page into chunks delimited by a
// reasonable approximation of what an HTML element
// looks like: "<", then any mix of double-quoted strings, single-quoted
// strings and other non-quote, non-">" characters, then ">".
$parts = preg_split(
    "{(<(?:\"[^\"]*\"|'[^']*'|[^'\">])*>)}",
    $body,
    -1, // Unlimited number of chunks
    PREG_SPLIT_DELIM_CAPTURE // Keep the tags themselves in the result
);
foreach ($parts as $i => $part) {
    // Skip if this part is an HTML element
    if (isset($part[0]) && ($part[0] === '<')) {
        continue;
    }
    // Wrap the words with <span>s
    $parts[$i] = preg_replace($patterns, $replacements, $part);
}
// Reconstruct the body
$body = implode('', $parts);
print $body;