我自己开发的工具,打印出百度贴吧某用户发表过的所有帖子 nodejs百度贴吧工具程序员
程序员文章站
2022-04-11 09:05:06
...
<html>
<meta charset="UTF-8"/>
<style>
a {
color: green;
font-family: arial;
font-weight: bold
}
</style>
<body>
<div id="container"></div>
</body>
<script src="jquery1.7.1.js">
/* Jerry 2017-02-06 14:58PM update
To run this page, start Chrome with web security disabled:
C:\MyApp\Chrome\Application\chrome.exe --user-data-dir="C:/yaas" --disable-web-security
and then log on to Baidu in that browser FIRST, before opening this page!
*/
</script>
<script>
/* Jerry 2017-02-05 5:54PM
这个警告的意思是说:请求的资源可能会被(扩展/或其他什么机制)屏蔽掉。
之所以会出现这个警告,是因为去获取该资源的请求其实并(还)没有真的发生,所以 Header 里显示的是伪信息,直到服务器真的有响应返回,这里的 Header 信息才会被更新为真实的。不过这一切也可能不会发生,因为该请求可能会被屏蔽。比如说 AdBlock 什么的,当然了不全是浏览器扩展,具体情况具体分析了。
对了,别忘了用 chrome://net-internals 来帮助你查找被屏蔽的请求以及可能的原因。
*/
// Origin that parseDetail() prepends to the relative thread links it scrapes.
var PREFIX = "http://tieba.baidu.com";

// First listing page; main() appends "?&pn=<page>" to reach the following pages.
var START = PREFIX + "/i/i/my_tie";
// Alternative start URL kept from the author (purpose not stated in the source):
// var START = "http://www.baidu.com";

// Bar name -> array of post detail objects, filled by insertPost().
var POST = {};

// Post count as last computed by shouldEnd(); shown by generate().
var TOTAL = 0;

// { name, size } entries, one per bar, built and ordered by sort().
var SORTED = [];
/**
 * Counts the posts stored across every bar of a collection.
 *
 * @param {Object} collection - Object whose own properties each hold an
 *   array-like list of posts (anything with a numeric `length`).
 * @returns {number} Sum of the `length` of every own property value.
 */
function getTotalCount(collection) {
  var count = 0;
  // Declare the loop key locally so it does not leak as an implicit global
  // (which is also a ReferenceError in strict mode).
  var bar;
  for (bar in collection) {
    // Borrow hasOwnProperty from Object.prototype so the check also works for
    // objects without a prototype or with a shadowed hasOwnProperty.
    if (!Object.prototype.hasOwnProperty.call(collection, bar)) {
      continue;
    }
    count += collection[bar].length;
  }
  return count;
}
/**
 * Refreshes the global TOTAL from POST and reports whether the crawl stalled.
 *
 * @param {number} previousCount - Post count measured before the latest page
 *   was processed.
 * @returns {boolean} true when the refreshed TOTAL equals previousCount, i.e.
 *   the latest page contributed no posts.
 */
function shouldEnd(previousCount) {
  var refreshed = getTotalCount(POST);
  TOTAL = refreshed;
  console.log("pre: " + previousCount + " total: " + TOTAL);
  var unchanged = (previousCount == TOTAL);
  return unchanged;
}
/**
 * Entry point: downloads the first listing page, then keeps requesting
 * "?&pn=2", "?&pn=3", ... until a page adds no posts to POST, and finally
 * sorts the bars and renders the report.
 */
function main() {
  // Page 1 is the bare START URL.
  handleLiChildren(getPostByAJAX(START));

  var page = 2;
  var finished;
  do {
    // Snapshot the count so shouldEnd() can tell whether this page added anything.
    var countBefore = getTotalCount(POST);
    var pageUrl = START + "?&pn=" + page;
    handleLiChildren(getPostByAJAX(pageUrl));
    page += 1;
    finished = shouldEnd(countBefore);
  } while (!finished);

  sort();
  generate();
}
/**
 * Extracts every post entry from one downloaded listing page and stores it
 * through insertPost().
 *
 * @param {string} resultString - HTML text of a listing page.
 */
function handleLiChildren(resultString) {
  var htmlDom = $(resultString);
  var liChildren = $("li", htmlDom);
  $.each(liChildren, function (i, value) {
    // In $.each, returning anything other than false moves on to the next item.

    // Any <li> carrying a class is skipped (presumably navigation items; an
    // earlier version of this check looked for "nav_item" specifically).
    if (value.className) {
      return true;
    }
    // Skip the "replies" / "featured" tab entries by their exact label.
    if (value.innerText === "我回复的" || value.innerText === "我的精品") {
      return true;
    }
    // parseDetail() dereferences the first <cite> and the first link of the
    // first two <td> cells. Skip items lacking any of them (e.g. an empty
    // <li></li>) instead of letting a TypeError abort the whole crawl.
    var cells = $("td", value);
    var looksLikePost = $("cite", value).length > 0 &&
      cells.length >= 2 &&
      $("a", cells[0]).length > 0 &&
      $("a", cells[1]).length > 0;
    if (!looksLikePost) {
      return true;
    }
    insertPost(parseDetail(value));
  });
}
/*
<ul>
<li>
<cite>2016</cite>
<a href="/f?kw=%E5%A4%A7%E9%82%91" >尿素氮</a>
</li>
<li>
<cite>2015</cite>
<a href="/f?kw=%E5%A4%A7%E9%82%91" >尿素氮2</a>
</li>
</ul>
*/
/**
 * Renders one post as an HTML list item: its date in a <cite> element
 * followed by a link to the thread.
 *
 * @param {{date: string, url: string, postTitle: string}} post - Post detail;
 *   `date` and `postTitle` are inserted as markup, `url` becomes the href.
 * @returns {string} Markup of the form
 *   <li><cite>DATE</cite><a href="URL">TITLE</a></li>
 */
function getpostSource(post) {
  var source = "<li><cite>";
  // Close the element with "</cite>"; the previous "/<cite>" printed a stray
  // slash and opened a second, never-closed <cite>.
  source += post.date + "</cite>";
  source += '<a href="' + post.url + '">' + post.postTitle + "</a></li>";
  return source;
}
/**
 * Renders one bar's section: an <h1> with the bar name and post count,
 * followed by a <ul> containing one item per post.
 *
 * @param {string} barName - Name shown in the heading.
 * @param {Array} posts - Posts of that bar, rendered with getpostSource().
 * @returns {string} HTML markup for the section.
 */
function getBarPostsSource(barName, posts) {
  var heading = '<h1>' + barName + ': ' + posts.length + '个</h1>';
  var pieces = [heading, "<ul>"];
  for (var idx = 0; idx < posts.length; idx += 1) {
    pieces.push(getpostSource(posts[idx]));
  }
  pieces.push("</ul>");
  return pieces.join("");
}
/**
 * Comparator that orders entries by their `size` property, largest first.
 *
 * @param {{size: number}} a - First entry.
 * @param {{size: number}} b - Second entry.
 * @returns {number} Positive when b is larger than a, negative when smaller,
 *   zero when both sizes are equal.
 */
function sortNumber(a, b) {
  var rightSize = b.size;
  var leftSize = a.size;
  return rightSize - leftSize;
}
/**
 * Rebuilds the global SORTED list from POST: one { name, size } entry per bar,
 * ordered by post count, largest first.
 */
function sort() {
  // Declared locally so the loop key does not leak as an implicit global.
  var barName;
  // Start from an empty list (keeping the same array object) so calling
  // sort() again does not append duplicate entries.
  SORTED.length = 0;
  for (barName in POST) {
    if (!Object.prototype.hasOwnProperty.call(POST, barName)) {
      continue;
    }
    SORTED.push({
      name: barName,
      size: POST[barName].length
    });
  }
  SORTED.sort(sortNumber);
}
/**
 * Renders the report into the #container element: a total line followed by
 * one section per bar, in the order stored in SORTED.
 */
function generate() {
  var target = document.getElementById("container");
  var pieces = ["总共帖子: " + TOTAL + "个"];
  for (var k = 0; k < SORTED.length; k += 1) {
    var bar = SORTED[k].name;
    pieces.push(getBarPostsSource(bar, POST[bar]));
  }
  target.innerHTML = pieces.join("");
}
// Start the crawl once the DOM is ready.
$(document).ready(function () {
  main();
});
/**
 * Downloads a page with a synchronous jQuery AJAX request.
 *
 * The request is deliberately blocking (`async: false`): main() uses the
 * returned text immediately to decide whether to fetch the next page.
 *
 * @param {string} requestURL - URL to fetch.
 * @returns {string} The `responseText` of the underlying request object.
 */
function getPostByAJAX(requestURL) {
  var request = $.ajax({
    url: requestURL,
    xhrFields: {
      // Send cookies with the cross-origin request. For this to be honoured
      // the server must answer with 'Access-Control-Allow-Credentials: true'.
      withCredentials: true
    },
    async: false
  });
  // The leftover `debugger;` statement was removed: it paused every single
  // page request whenever the developer tools were open.
  return request.responseText;
}
/*
function getPostByAJAX(requestURL){
var settings = {
type: "GET",
crossOrigin: true,
url:requestURL,
error: function(XHR,textStatus,errorThrown) {
alert ("XHR="+XHR+"\ntextStatus="+textStatus+"\nerrorThrown=" + errorThrown);
},
success: function(data,textStatus) {
debugger;
},
headers: {
"Access-Control-Allow-Origin":"http://tieba.baidu.com",
"Access-Control-Allow-Headers":"X-Requested-With"
}
};
$.ajax(settings);
}
*/
/*
function getPostByAJAX(requestURL){
var html = $.ajax({
url: requestURL,
dataType:"jsonp",
xhrFields: {
// The 'xhrFields' property sets additional fields on the XMLHttpRequest.
// This can be used to set the 'withCredentials' property.
// Set the value to 'true' if you'd like to pass cookies to the server.
// If this is enabled, your server must respond with the header
// 'Access-Control-Allow-Credentials: true'.
withCredentials: true
},
async: false}).responseText;
return html;
}
*/
/**
 * Appends a post to its bar's list in the global POST map, creating the
 * list the first time a bar name is seen.
 *
 * @param {{barName: string}} postDetail - Post detail as produced by
 *   parseDetail(); `barName` selects the list.
 */
function insertPost(postDetail) {
  var key = postDetail.barName;
  var bucket = POST[key];
  if (!bucket) {
    bucket = [];
    POST[key] = bucket;
  }
  bucket.push(postDetail);
}
/**
 * Reads the details of one post out of a listing <li> element.
 *
 * Expects the item to contain a <cite> plus at least two <td> cells that each
 * hold a link (the shape shown by the sample in getTestData()); it throws a
 * TypeError when the <cite> or either link is missing.
 *
 * @param {Element} liNode - The <li> element to read.
 * @returns {{date: string, barName: string, postTitle: string, url: string}}
 *   date: markup of the first <cite>; barName: markup of the first link in
 *   the first cell; postTitle: markup of the first link in the second cell;
 *   url: PREFIX followed by that second link's href.
 */
function parseDetail(liNode) {
  var dateText = $("cite", liNode).get(0).innerHTML;
  var cells = $("td", liNode);
  var barLink = $("a", cells[0]);
  var barLabel = barLink.get(0).innerHTML;
  var threadLink = $("a", cells[1]);
  var title = threadLink.get(0).innerHTML;
  var detail = {
    date: dateText,
    barName: barLabel,
    postTitle: title,
    url: PREFIX + threadLink.attr("href")
  };
  return detail;
}
/**
 * Returns a hard-coded sample listing page: one complete post entry followed
 * by two empty <li> elements.
 *
 * @returns {string} HTML text of the sample page.
 */
function getTestData() {
  // The fragments are joined without a separator; the leading and trailing
  // spaces inside them are part of the markup and must be kept.
  var fragments = [
    '<!DOCTYPE html><html><body><div class="wrap1"><div class="wrap2"><div ',
    ' id="main_wrapper" class="main_wrapper"><div id="main_back_img"><div ',
    ' id="main_back_bottom"><div id="container" class="ibody clearfix"><div><div ',
    ' id="content"><div class="simple_block_container"><ul><li><cite>2-16</cite>',
    '<div class="wrap_container"><table><tr><td class="nowrap">在<a style="" ',
    ' href="/f?kw=%E5%A4%A7%E9%82%91" target="_blank">ANDROID吧</a> 发贴</td><td class="wrap">',
    '<a href="/p/4356641476?pid=84106363194&cid=0#841063631" class="thread_title" target="_blank">硬盘</a></td>',
    '</tr></table></div><div class="clear"></div></li>',
    '<li></li><li></li></ul></div></div></div></div></div></div></div></div></body></html>'
  ];
  return fragments.join("");
}
</script>
</html>
上一篇: NoSQL 非关系型数据库
下一篇: 工具汇总