Java爬虫-爬取四级词汇网站音频

程序员文章站 2022-05-04 16:39:55

...

背景:

作为一个久不过四级的程序员有点不好意思啊,所以网购了冲刺卷认真过四级,签收后发现附送的词汇书音频网站竟然没有一键下载全部.只能自己写个伪爬虫了.

知识点:

Java网络连接
字节流
文件输入输出

开工:

分析页面:

四级词汇乱序版网站:

http://download.dogwood.com.cn/online/4jlxbx/index.html

发现都是极具规律性的格式就像这样

http://download.dogwood.com.cn/online/4jlxbx/01.mp3

所以直接在基础string 后面添加 MP3编号即可,出于对复用考虑设计成一个下载工具类,方便以后爬虫时复用

代码

package top.chen.dogwood;

import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;

/**
 * Downloads a list of HTTP resources, in order, into a single directory.
 *
 * <p>The i-th URL (zero-based) is saved as a file named with the two-digit,
 * one-based index followed by {@code "." + targetType}, e.g. {@code 01.mp3}.
 * Downloads run sequentially on the calling thread.
 */
public class DownloadUtils {
	/** Size in bytes of the buffer used to copy a response body to disk. */
	private static final int BUFFER_SIZE = 8192;

	// Target URLs, downloaded in array order
	private String[] urlString;
	// File extension (without the leading dot) appended to every saved file name
	private String targetType;
	// Directory the downloaded files are written into
	private File rootDir;

	public String[] getUrlString() {
		return urlString;
	}

	public void setUrlString(String[] urlString) {
		this.urlString = urlString;
	}

	public String getTargetType() {
		return targetType;
	}

	public void setTargetType(String targetType) {
		this.targetType = targetType;
	}

	public File getRootDir() {
		return rootDir;
	}

	public void setRootDir(File rootDir) {
		this.rootDir = rootDir;
	}

	/**
	 * @param urlString  URLs to download, in order
	 * @param targetType file extension (without the dot) for the saved files
	 * @param rootDir    existing directory to save the files into
	 */
	public DownloadUtils(String[] urlString, String targetType, File rootDir) {
		super();
		this.urlString = urlString;
		this.targetType = targetType;
		this.rootDir = rootDir;
	}

	/**
	 * @param urlString  URLs to download, in order
	 * @param targetType file extension (without the dot) for the saved files
	 * @param rootDir    path of an existing directory to save the files into
	 */
	public DownloadUtils(String[] urlString, String targetType, String rootDir) {
		super();
		this.urlString = urlString;
		this.targetType = targetType;
		this.rootDir = new File(rootDir);
	}

	/**
	 * Creates an unconfigured instance; the URLs, target type and root
	 * directory must be supplied through the setters before downloading.
	 */
	public DownloadUtils() {
		super();
	}

	/**
	 * Downloads every configured URL into the root directory.
	 *
	 * <p>An existing file with the same name is overwritten, so a re-run after
	 * a failed or partial download does not append to stale data. The first
	 * failure aborts the remaining downloads.
	 *
	 * @throws Exception if the configuration is invalid (no URLs, or the root
	 *                   directory is missing or not a directory), if a URL is
	 *                   malformed, or if an I/O error occurs while downloading
	 */
	public void httpDownload() throws Exception {
		validate();
		final String[] urls = urlString;
		for (int i = 0; i < urls.length; i++) {
			HttpURLConnection urlConnection = (HttpURLConnection) new URL(urls[i])
					.openConnection();
			File target = new File(rootDir,
					String.format("%02d", i + 1) + "." + targetType);
			// getInputStream() opens the connection implicitly. The response
			// stream is opened before the file so that a failed request does
			// not leave an empty file behind. try-with-resources closes both
			// streams even when the copy fails part-way.
			try (InputStream inputStream = urlConnection.getInputStream();
					FileOutputStream fileOutputStream = new FileOutputStream(target)) {
				byte[] buffer = new byte[BUFFER_SIZE];
				int read;
				while (-1 != (read = inputStream.read(buffer))) {
					fileOutputStream.write(buffer, 0, read);
				}
			}
		}
	}

	/**
	 * Checks that there is at least one URL and that the root directory exists.
	 *
	 * @throws Exception if the URL array is null or empty, or if the root
	 *                   directory is null, missing, or not a directory
	 */
	private void validate() throws Exception {
		if (null == urlString || urlString.length <= 0) {
			throw new Exception("下载路径不能为空!");
		}
		if (null == rootDir || !rootDir.exists() || !rootDir.isDirectory()) {
			throw new Exception("目标文件夹不存在!");
		}
	}

}

主程序类

package top.chen.dogwood;

import java.io.IOException;
import java.net.MalformedURLException;

/**
 * Entry point: downloads a numbered series of MP3 files from a fixed base
 * URL and stores them in a fixed local directory.
 *
 * @author Geek
 */
public class Application {
	public static void main(String[] args) throws MalformedURLException,
			IOException {
		final String prefix = "http://download.dogwood.com.cn/online/4jlxbx/";
		final int total = 35;
		String[] links = new String[total];
		// The remote files are named with two-digit, one-based numbers: 01.mp3 .. 35.mp3
		for (int idx = 0; idx < total; idx++) {
			links[idx] = prefix + String.format("%02d", idx + 1) + ".mp3";
		}
		DownloadUtils downloader = new DownloadUtils(links, "mp3",
				"E:\\360Downloads\\TempFile");
		try {
			downloader.httpDownload();
		} catch (Exception e) {
			// Any validation or download failure is only printed to stderr
			e.printStackTrace();
		}
	}
}

不过单线程下载速度有点慢,以后有空考虑改成多线程下载。

希望我的思路能对大家有所帮助! 不妥之处,欢迎指正

上一篇：腾讯云linux系统yum安装软件——以nginx为例

下一篇：基于python的留一法+朴素贝叶斯分类2021-05-13

Java爬虫-爬取四级词汇网站音频

背景:

知识点:

开工:

分析页面:

代码

java爬虫webmagic 案例爬取动态（ajax+js) 网站京东售价格

Java爬虫-爬取四级词汇网站音频