java多线程抓取铃声多多官网的铃声数据

程序员文章站 2024-03-11 18:34:13

一直想练习下java多线程抓取数据。有天被我发现，铃声多多的官网（http://www.shoujiduoduo.com/main/）有大量的数据。通过观察他们前端...

一直想练习下java多线程抓取数据。

有天被我发现，铃声多多的官网（http://www.shoujiduoduo.com/main/）有大量的数据。

通过观察他们前端获取铃声数据的 Ajax 请求，可以看到请求地址是下面这种形式：

java多线程抓取铃声多多官网的铃声数据

http://www.shoujiduoduo.com/ringweb/ringweb.php?type=getlist&listid={类别id}&page={分页页码}

很容易就能发现，通过改变 listid 和 page 就能从服务器获取铃声的 JSON 数据。

解析返回的 JSON 数据，可以看到其中都带有 {"hasmore":1,"curpage":1} 这样的指示字段；通过判断 hasmore 的值，就可以决定是否继续抓取下一页。

但是通过上面这个链接返回的json中不带有铃声的下载地址

很快就可以发现，点击页面上的“下载”时，会看到页面又发出了一个请求。

通过下面的请求，就可以获取铃声的下载地址了：

http://www.shoujiduoduo.com/ringweb/ringweb.php?type=geturl&act=down&rid={铃声id}

java多线程抓取铃声多多官网的铃声数据

所以，他们的数据是很容易被偷的。于是我就开始...

源码已经发在github上。如果感兴趣的童鞋可以查看

github：https://github.com/yongbo000/duoduoaudiorobot

上代码：

package me.yongbo.duoduoringrobot;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.google.gson.Gson;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonParser;
/* * @author yongbo_ * @created 2013/4/16 * * */
public class duoduoringrobotclient implements runnable {
public static string get_ringinfo_url = "http://www.shoujiduoduo.com/ringweb/ringweb.php?type=getlist&listid=%1$d&page=%2$d";
public static string get_down_url = "http://www.shoujiduoduo.com/ringweb/ringweb.php?type=geturl&act=down&rid=%1$d";
public static string error_msg = "listid为 %1$d 的robot发生错误，已自动停止。当前page为 %2$d";public static string status_msg = "开始抓取数据，当前listid： %1$d，当前page： %2$d";
public static string file_dir = "e:/ringdata/";public static string file_name = "listid=%1$d.txt";private boolean errorflag = false;private int listid;private int page;
private int endpage = -1;private int hasmore = 1;
private dbhelper dbhelper;
/** * 构造函数 * @param listid 菜单id * @param page 开始页码 * @param endpage 结束页码 * */
public duoduoringrobotclient(int listid, int beginpage, int endpage)
 {this.listid = listid;this.page = beginpage;this.endpage = endpage;this.dbhelper = new dbhelper();}
/** * 构造函数 * @param listid 菜单id * @param page 开始页码 * */
public duoduoringrobotclient(int listid, int page) {this(listid, page, -1);}
/** * 获取铃声 * */public void getrings() {string url = string.format(get_ringinfo_url, listid, page);string responsestr = httpget(url);hasmore = gethasmore(responsestr);
page = getnextpage(responsestr);
ringparse(responsestr.replaceall("\\{\"hasmore\":[0-9]*,\"curpage\":[0-9]*\\},", "").replaceall(",]", "]"));}/** * 发起http请求 * @param weburl 请求连接地址 * */public string httpget(string weburl){url url;urlconnection conn;stringbuilder sb = new stringbuilder();string resultstr = "";try {url = new url(weburl);conn = url.openconnection();conn.connect();inputstream is = conn.getinputstream();inputstreamreader isr = new inputstreamreader(is);bufferedreader bufreader = new bufferedreader(isr);string linetext;while ((linetext = bufreader.readline()) != null) {sb.append(linetext);}resultstr = sb.tostring();} catch (exception e) {errorflag = true;//将错误写入txtwritetofile(string.format(error_msg, listid, page));}return resultstr;}/** * 将json字符串转化成ring对象，并存入txt中 * @param json json字符串 * */public void ringparse(string json) {ring ring = null;jsonelement element = new jsonparser().parse(json);jsonarray array = element.getasjsonarray();// 遍历数组iterator<jsonelement> it = array.iterator();
gson gson = new gson();while (it.hasnext() && !errorflag) {jsonelement e = it.next();// jsonelement转换为javabean对象ring = gson.fromjson(e, ring.class);ring.setdownurl(getringdownurl(ring.getid()));if(isavailablering(ring)) {system.out.println(ring.tostring());
//可选择写入数据库还是写入文本//writetofile(ring.tostring());writetodatabase(ring);}}}
/** * 写入txt * @param data 字符串 * */public void writetofile(string data)
 {string path = file_dir + string.format(file_name, listid);file dir = new file(file_dir);file file = new file(path);filewriter fw = null;if(!dir.exists()){dir.mkdirs();
}try {if(!file.exists()){file.createnewfile();}fw = new filewriter(file, true);
fw.write(data);fw.write("\r\n");fw.flush();} catch (ioexception e) {
// todo auto-generated catch blocke.printstacktrace();
}finally {try {if(fw != null){fw.close();}} catch (ioexception e) {
// todo auto-generated catch blocke.printstacktrace();}}}/** * 写入数据库 * @param ring 一个ring的实例 * */
public void writetodatabase(ring ring) {dbhelper.execute("addring", ring);}
@overridepublic void run() {while(hasmore == 1 && !errorflag){if(endpage != -1){if(page > endpage) { break; }}system.out.println(string.format(status_msg, listid, page));
getrings();system.out.println(string.format("该页数据写入完成"));}system.out.println("ending...");}
private int gethasmore(string resultstr){pattern p = pattern.compile("\"hasmore\":([0-9]*),\"curpage\":([0-9]*)"); 
 matcher match = p.matcher(resultstr);  
 if (match.find()) {  return integer.parseint(match.group(1));
  }  return 0;
}
private int getnextpage(string resultstr){pattern p = pattern.compile("\"hasmore\":([0-9]*),\"curpage\":([0-9]*)");matcher match = p.matcher(resultstr);if (match.find()) {return integer.parseint(match.group(2));}return 0;}
/** * 判断当前ring是否满足条件。当ring的name大于50个字符或是duration为小数则不符合条件，将被剔除。 * @param ring 当前ring对象实例 * */private boolean isavailablering(ring ring){pattern p = pattern.compile("^[1-9][0-9]*$");
matcher match = p.matcher(ring.getduration());
if(!match.find()){return false;}if(ring.getname().length() > 50 || ring.getartist().length() > 50 || ring.getdownurl().length() == 0){return false;}return true;}
/** * 获取铃声的下载地址 * @param rid 铃声的id * */
public string getringdownurl(string rid){string url = string.format(get_down_url, rid);
string responsestr = httpget(url);return responsestr;}}

上一篇： mysql存储过程《2》续博客分类： MySQL mysql存储过程mysql游标mysql循环mysql使用whilemysql使用repeat

下一篇： PHP实现表单提交数据的验证处理功能【防SQL注入和XSS攻击等】