爬虫程序
程序员文章站
2022-03-02 21:02:25
...
package com.jw;
import com.jw.excel.ExcelUtil;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.net.URL;
import java.net.URLEncoder;
import java.util.List;
import java.util.Random;
import java.util.zip.GZIPInputStream;
/**
 * Command-line scraper for an English vocabulary spreadsheet.
 *
 * <p>The class contains two workflows that operate on the rows ({@code Word}) read from
 * {@code <BASE_DIR><LEVEL>.xlsx}:
 * <ul>
 *   <li>{@code getWord} fetches a dictionary page per word and fills in the UK/US phonetic
 *       transcription and up to two meanings;</li>
 *   <li>{@code getAuto} downloads the UK and US pronunciation MP3 of every word that already
 *       has the corresponding transcription.</li>
 * </ul>
 * {@code main} currently runs only the audio download.
 *
 * @author lijiwang6407001878
 * @date 2019/12/28 10:15
 */
public class ParseWord {

    /** Pool of browser User-Agent strings; one is picked at random for every request. */
    private static final String[] USER_AGENTS = {
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36 OPR/37.0.2178.32",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.3 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36 Core/1.47.277.400 QQBrowser/9.4.7658.400",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 UBrowser/5.6.12150.8 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36 TheWorld 7",
        // NOTE(review): this entry was truncated with an ellipsis character in the source;
        // restored to the standard Firefox 60 / Windows 7 string - confirm.
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:60.0) Gecko/20100101 Firefox/60.0"
    };

    /** Word-list level: names the workbook and audio folder and is stamped on scraped words. */
    private static final String LEVEL = "B2";

    /** Directory holding the {@code <level>.xlsx} workbook and the per-level audio folders. */
    private static final String BASE_DIR = "D:\\development\\English\\";

    // NOTE(review): the endpoint, referer and audio-proxy values below look like they were
    // redacted before this file was published. They are kept exactly as found; real values
    // must be supplied before either workflow can reach a server.

    /** Base URL of the pronunciation-audio endpoint ({@code ?audio=..&type=..} is appended). */
    private static final String AUDIO_ENDPOINT = "";

    /** Base URL of the dictionary page ({@code /<word>} is appended). */
    private static final String WORD_PAGE_BASE = "";

    /** Referer header sent with dictionary page requests. */
    private static final String WORD_PAGE_REFERER = "xxxx";

    /** HTTP proxy host used for audio downloads. */
    private static final String AUDIO_PROXY_HOST = "xxxxx";

    /** HTTP proxy host used for dictionary page requests. */
    private static final String WORD_PAGE_PROXY_HOST = "proxysz.zte.com.cn";

    /** Port shared by both proxies. */
    private static final int PROXY_PORT = 80;

    /** Connect/read timeout for every HTTP request, in milliseconds. */
    private static final int TIMEOUT_MS = 8000;

    /** Value of the {@code type} query parameter used for files saved under {@code UK}. */
    private static final int TYPE_UK = 1;

    /** Value of the {@code type} query parameter used for files saved under {@code US}. */
    private static final int TYPE_US = 2;

    /** Minimum pause between two requests, in milliseconds. */
    private static final int BASE_DELAY_MS = 300;

    /** Single shared random source for User-Agent selection and request jitter. */
    private static final Random RANDOM = new Random();

    /**
     * Reads the workbook for {@link #LEVEL} and downloads the pronunciation audio of its words.
     *
     * @param args ignored
     * @throws Exception if the workbook cannot be opened or parsed
     */
    public static void main(String[] args) throws Exception {
        String path = BASE_DIR + LEVEL + ".xlsx";
        List<Word> list;
        // Close the workbook stream as soon as the rows have been read.
        try (FileInputStream in = new FileInputStream(path)) {
            list = ExcelUtil.readExcel2007(in, Word.class, false);
        }

        // Alternative workflow (disabled): scrape transcriptions/meanings and write them back.
        // getWord(list);
        //
        // Workbook workbook = ExcelUtil.createExcel2007(list, null, null, false);
        // FileOutputStream fos = new FileOutputStream(path);
        // workbook.write(fos);
        //
        // workbook.close();
        // fos.close();

        getAuto(list, LEVEL);
    }

    /**
     * Downloads the UK and/or US pronunciation MP3 for every word that has the matching
     * phonetic transcription, into {@code <BASE_DIR><t>\UK\} and {@code <BASE_DIR><t>\US\}.
     *
     * <p>Failures for a single word are reported on stdout and do not stop the run. If the
     * thread is interrupted, the interrupt flag is restored and the method returns early.
     *
     * @param list words to process
     * @param t    level name, used as the sub-directory of {@link #BASE_DIR}
     */
    private static void getAuto(List<Word> list, String t) {
        String path = BASE_DIR + t;
        for (Word w : list) {
            String word = w.getWord();
            if (word == null || word.trim().isEmpty()) {
                // Nothing to request and no sensible file name; skip empty rows.
                continue;
            }
            try {
                if (w.getPronUk() != null) {
                    downloadVariant(path + "\\UK\\", word, TYPE_UK);
                }
                if (w.getPronUs() != null) {
                    downloadVariant(path + "\\US\\", word, TYPE_US);
                }
            } catch (InterruptedException e) {
                // Do not swallow the interrupt: restore the flag and stop downloading.
                Thread.currentThread().interrupt();
                return;
            } catch (Exception e) {
                // Best effort: report the failing word and carry on with the next one.
                System.out.println(e + " " + word);
            }
        }
    }

    /**
     * Downloads one pronunciation variant of {@code word} into {@code dir}, creating the
     * directory when necessary, then pauses briefly before the next request.
     */
    private static void downloadVariant(String dir, String word, int type)
            throws IOException, InterruptedException {
        File file = new File(dir + word + ".mp3");
        File parent = file.getParentFile();
        if (parent != null && !parent.isDirectory() && !parent.mkdirs()) {
            throw new IOException("cannot create directory " + parent);
        }
        saveMp3(file, word, type);
        pause(100);
    }

    /**
     * Requests the pronunciation audio for {@code word} and writes the response body to
     * {@code mp3}. A non-200 response is reported on stdout and nothing is written.
     *
     * @param mp3  destination file; its parent directory must exist
     * @param word word to request; URL-encoded before being placed in the query string
     * @param type value of the {@code type} query parameter
     * @throws IOException if the URL is malformed, the request fails or the file cannot be
     *                     written
     */
    private static void saveMp3(File mp3, String word, int type) throws IOException {
        // Encode the word so that phrases and apostrophes still form a valid query string.
        String url = AUDIO_ENDPOINT + "?audio=" + URLEncoder.encode(word, "UTF-8") + "&type=" + type;
        Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(AUDIO_PROXY_HOST, PROXY_PORT));
        HttpURLConnection con = (HttpURLConnection) new URL(url).openConnection(proxy);

        // The referer must be forged, otherwise the server answers with its home page;
        // analysis showed this is unrelated to cookies.
        con.setRequestProperty("User-Agent", randomUserAgent());
        con.setRequestProperty("Accept-Encoding", "gzip");
        con.setRequestProperty("referer", url);
        con.setDoInput(true);
        con.setConnectTimeout(TIMEOUT_MS);
        con.setReadTimeout(TIMEOUT_MS);

        int status = con.getResponseCode();
        if (status != HttpURLConnection.HTTP_OK) {
            System.out.println("服务器返回:" + status + " " + word);
            return;
        }

        // try-with-resources closes both streams even when the copy fails half-way.
        try (InputStream in = openBody(con);
             OutputStream out = new FileOutputStream(mp3)) {
            byte[] buffer = new byte[1024 * 5];
            int length;
            while ((length = in.read(buffer)) != -1) {
                out.write(buffer, 0, length);
            }
        }
    }

    /**
     * Opens the response body, transparently decompressing it when the server honoured the
     * {@code Accept-Encoding: gzip} request header (HttpURLConnection does not do this itself
     * when the header is set by the caller).
     */
    private static InputStream openBody(HttpURLConnection con) throws IOException {
        InputStream raw = con.getInputStream();
        if (!"gzip".equalsIgnoreCase(con.getContentEncoding())) {
            return raw;
        }
        try {
            return new GZIPInputStream(raw);
        } catch (IOException e) {
            raw.close();
            throw e;
        }
    }

    /**
     * Fills in level, UK/US phonetic transcription and up to two meanings for every word that
     * has neither transcription yet, by scraping the word's dictionary page.
     *
     * <p>Words whose page cannot be fetched are reported on stdout and skipped.
     *
     * @param list words to enrich in place
     * @throws InterruptedException if the thread is interrupted while pausing between requests
     */
    private static void getWord(List<Word> list) throws InterruptedException {
        for (Word word : list) {
            if (word.getPronUk() != null || word.getPronUs() != null) {
                continue;
            }
            String text = word.getWord();
            if (text == null || text.trim().isEmpty()) {
                // Empty row: there is no page to look up.
                continue;
            }
            Document doc = fetchWordPage(text);
            if (doc == null) {
                continue;
            }
            applyPronunciation(word, doc.getElementById("yd-word-pron"));
            applyMeanings(word, doc.getElementById("yd-word-meaning"));
            pause(200);
        }
    }

    /** Fetches the dictionary page for {@code word}; returns {@code null} when that fails. */
    private static Document fetchWordPage(String word) {
        String url = WORD_PAGE_BASE + "/" + word.trim();
        try {
            return Jsoup.connect(url)
                    .userAgent(randomUserAgent())
                    .proxy(WORD_PAGE_PROXY_HOST, PROXY_PORT)
                    .header("referer", WORD_PAGE_REFERER)
                    .timeout(TIMEOUT_MS)
                    .ignoreContentType(true)
                    .ignoreHttpErrors(true)
                    .get();
        } catch (Exception e) {
            System.out.println(e + " " + word);
            return null;
        }
    }

    /**
     * Splits the pronunciation element's text at the "美" marker: the part before it (with the
     * "英" marker removed) becomes the UK transcription, the part after it the US one.
     */
    private static void applyPronunciation(Word word, Element pron) {
        if (pron == null) {
            return;
        }
        String[] prons = pron.text().split("美");
        if (prons.length == 0) {
            return;
        }
        word.setLevel(LEVEL);
        word.setPronUk(prons[0].replace("英", "").trim());
        if (prons.length > 1) {
            word.setPronUs(prons[1].trim());
        }
    }

    /** Copies the text of the first two {@code li} items of the meaning element onto the word. */
    private static void applyMeanings(Word word, Element meaning) {
        if (meaning == null) {
            return;
        }
        Elements items = meaning.select("li");
        if (items.size() > 0) {
            word.setMeaning1(items.get(0).text());
        }
        if (items.size() > 1) {
            word.setMeaning2(items.get(1).text());
        }
    }

    /** Picks a random entry from {@link #USER_AGENTS}, covering the whole array. */
    private static String randomUserAgent() {
        return USER_AGENTS[RANDOM.nextInt(USER_AGENTS.length)];
    }

    /**
     * Sleeps for {@link #BASE_DELAY_MS} plus a random jitter in {@code [0, jitterBound)}
     * milliseconds and prints the jitter, to space out consecutive requests.
     */
    private static void pause(int jitterBound) throws InterruptedException {
        int ra = RANDOM.nextInt(jitterBound);
        Thread.sleep(BASE_DELAY_MS + ra);
        System.out.println("ra=" + ra);
    }
}