Java爬虫-爬取四级词汇网站音频
程序员文章站
2022-05-04 16:39:55
...
背景:
作为一个久不过四级的程序员,实在有点不好意思,所以网购了冲刺卷,准备认真过四级。签收后发现附送的词汇书音频网站竟然没有"一键下载全部音频"的功能,只能自己写个伪爬虫了。
知识点:
- Java网络连接
- 字节流
- 文件输入输出
开工:
分析页面:
四级词汇乱序版网站:
http://download.dogwood.com.cn/online/4jlxbx/index.html
发现都是极具规律性的格式就像这样
http://download.dogwood.com.cn/online/4jlxbx/01.mp3
所以直接在基础 URL 字符串后面拼接 MP3 编号即可。出于复用的考虑,把下载逻辑设计成一个下载工具类,方便以后写爬虫时复用。
代码
package top.chen.dogwood;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
/**
 * Downloads a list of HTTP resources into a directory.
 *
 * <p>Files are named after the 1-based position of their URL in the list,
 * zero-padded to two digits, followed by a dot and the configured target type
 * (for example {@code 01.mp3}, {@code 02.mp3}, ...).
 */
public class DownloadUtils {
    /** Size in bytes of the buffer used when copying a response body to disk. */
    private static final int BUFFER_SIZE = 8192;

    // Target URLs, downloaded in array order.
    private String[] urlString;
    // File extension (without the leading dot) appended to every downloaded file name.
    private String targetType;
    // Directory the downloaded files are written into.
    private File rootDir;

    public String[] getUrlString() {
        return urlString;
    }

    public void setUrlString(String[] urlString) {
        this.urlString = urlString;
    }

    public String getTargetType() {
        return targetType;
    }

    public void setTargetType(String targetType) {
        this.targetType = targetType;
    }

    public File getRootDir() {
        return rootDir;
    }

    public void setRootDir(File rootDir) {
        this.rootDir = rootDir;
    }

    /**
     * Creates a downloader.
     *
     * @param urlString  URLs to download, in order
     * @param targetType file extension (without the dot) for the saved files
     * @param rootDir    existing directory to save the files into
     */
    public DownloadUtils(String[] urlString, String targetType, File rootDir) {
        super();
        this.urlString = urlString;
        this.targetType = targetType;
        this.rootDir = rootDir;
    }

    /**
     * Creates a downloader, taking the target directory as a path string.
     *
     * @param urlString  URLs to download, in order
     * @param targetType file extension (without the dot) for the saved files
     * @param rootDir    path of the existing directory to save the files into
     */
    public DownloadUtils(String[] urlString, String targetType, String rootDir) {
        super();
        this.urlString = urlString;
        this.targetType = targetType;
        this.rootDir = new File(rootDir);
    }

    /** Creates an unconfigured downloader; use the setters before calling {@link #httpDownload()}. */
    public DownloadUtils() {
        super();
    }

    /**
     * Downloads every configured URL sequentially into the target directory.
     *
     * <p>An existing file with the same name is overwritten, so re-running after
     * a partial or failed run does not append onto stale data.
     *
     * @throws Exception if the configuration is invalid (no URLs, or the target
     *                   directory is missing or not a directory), or if
     *                   connecting, reading or writing fails
     */
    public void httpDownload() throws Exception {
        validate();
        final String[] urls = urlString;
        for (int i = 0; i < urls.length; i++) {
            File target = new File(rootDir,
                    String.format("%02d", i + 1) + "." + targetType);
            download(urls[i], target);
        }
    }

    /** Streams the body of a single HTTP resource into {@code target}, replacing any existing content. */
    private void download(String url, File target) throws Exception {
        HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
        try {
            connection.connect();
            // try-with-resources closes both streams even when the copy fails midway.
            try (InputStream in = connection.getInputStream();
                    FileOutputStream out = new FileOutputStream(target)) {
                byte[] buffer = new byte[BUFFER_SIZE];
                int read;
                while ((read = in.read(buffer)) != -1) {
                    out.write(buffer, 0, read);
                }
            }
        } catch (Exception e) {
            // Only disconnect on failure: on success, closing the stream is enough
            // and leaves the connection available for keep-alive reuse.
            connection.disconnect();
            throw e;
        }
    }

    /** Checks that at least one URL is configured and that the target directory exists. */
    private void validate() throws Exception {
        if (null == urlString || urlString.length == 0) {
            throw new IllegalStateException("下载路径不能为空!");
        }
        if (null == rootDir || !rootDir.exists() || !rootDir.isDirectory()) {
            throw new IllegalStateException("目标文件夹不存在!");
        }
    }
}
主程序类
package top.chen.dogwood;
import java.io.IOException;
import java.net.MalformedURLException;
/**
 * Downloads a numbered series of MP3 files from a fixed base URL and stores
 * them in a local directory.
 *
 * <p>The target directory may be passed as the first command-line argument;
 * when it is omitted the built-in default directory is used.
 *
 * @author Geek
 */
public class Application {
    /** Base URL; each track is requested as {@code <base>NN.mp3} with a two-digit number. */
    private static final String BASE_URL = "http://download.dogwood.com.cn/online/4jlxbx/";
    /** Number of tracks to request (01 through 35). */
    private static final int TRACK_COUNT = 35;
    /** Directory used when no command-line argument is given. */
    private static final String DEFAULT_TARGET_DIR = "E:\\360Downloads\\TempFile";

    /**
     * Builds the track URLs and downloads them.
     *
     * @param args optional; {@code args[0]} overrides the target directory
     */
    public static void main(String[] args) throws MalformedURLException,
            IOException {
        String targetDir = (args != null && args.length > 0) ? args[0] : DEFAULT_TARGET_DIR;

        String[] urls = new String[TRACK_COUNT];
        for (int i = 0; i < TRACK_COUNT; i++) {
            urls[i] = BASE_URL + String.format("%02d", i + 1) + ".mp3";
        }

        DownloadUtils downloadUtils = new DownloadUtils(urls, "mp3", targetDir);
        try {
            downloadUtils.httpDownload();
        } catch (Exception e) {
            // Report the failure; the program then ends normally.
            e.printStackTrace();
        }
    }
}
不过单线程下载速度有点慢 ,以后有空考虑下改成多线程下载
希望我的思路能对大家有所帮助! 不妥之处,欢迎指正