欢迎您访问程序员文章站!本站旨在为大家提供并分享程序员与计算机编程相关的知识。
您现在的位置是: 首页

学习了Java正则表达式,写了个帖子邮箱获取工具

程序员文章站 2022-04-09 10:50:45
...

 

看完了马士兵老师的正则表达式视频,想到了贴吧各种留邮箱,于是写了这个小工具。
输入网址(要加http://,可以单击“网址”粘贴剪贴板中的内容),程序会提取该网页中匹配到的邮箱,包括用全角符号(@、.)书写的邮箱。
截图:

学习了Java正则表达式,写了个帖子邮箱获取工具
            
    
    博客分类: JAVA java
 
代码:
//import java.awt.*;
import java.awt.datatransfer.*;
import java.awt.event.*;
import java.io.*;
import java.net.*;
import java.util.*;
import java.util.regex.*;
import javax.swing.*;

/**
 * Small Swing tool that downloads a web page and lists the e-mail addresses found in it.
 *
 * <p>If the entered URL is a Baidu Tieba post ({@code http(s)://tieba.baidu.com/p/<id>}),
 * every page of the post is crawled, starting from page 1; the page count is read from
 * the {@code all_page_num:<n>} marker in the first page. Any other URL is fetched once.
 *
 * <p>Addresses written with the full-width at sign (U+FF20) or full-width full stop
 * (U+FF0E) are matched too and normalised to their ASCII forms. Duplicates are dropped
 * while first-seen order is kept.
 *
 * <p>Threading: downloads run on a background thread started by {@link #startAnalyse()};
 * all Swing component updates are posted to the event dispatch thread. Only one
 * extraction started through {@link #startAnalyse()} runs at a time.
 */
public class EmailSpider extends MouseAdapter implements ActionListener, KeyListener {

	private static final String INIT_URL = "http://tieba.baidu.com/p/2286659953?pn=4";
	/** Host and path shared by all Tieba post URLs; the scheme is prepended at runtime. */
	private static final String TIEBA_POST_PATH = "://tieba.baidu.com/p/";
	private static final String WORDS = "\n\n    说明:\n    百度贴吧帖子获取所有页面的邮箱,其他的获取当前页。\n    匹配不全肯定是有的,只做了粗略匹配。\n    也有可能只匹配到一段,有些人邮箱输入方式有点奇葩。\n\n    单击“网址”可以粘贴剪贴板的网址。\n    右键菜单没有写,反正可以Ctrl+C Ctrl+V\n    提取时请不要进行操作。";

	/** Rough e-mail matcher that also accepts the full-width forms of '@' and '.'. */
	private static final Pattern EMAIL_PATTERN =
			Pattern.compile("[\\w.-[\uff0e]]+[@[\uff20]][\\w-]+[\\.[\uff0e]]?[\\w.-[\uff0e]]*\\w");
	/** Tieba post URL: group 1 is the scheme, group 2 the numeric post id. */
	private static final Pattern TIEBA_POST_PATTERN =
			Pattern.compile("(https?)://tieba\\.baidu\\.com/p/([0-9]{1,10})");
	/** Page-count marker embedded in the source of a Tieba post page. */
	private static final Pattern PAGE_COUNT_PATTERN = Pattern.compile("all_page_num:([0-9]{1,6})");

	/** Addresses collected so far, de-duplicated, in first-seen order. */
	private final Set<String> emails = new LinkedHashSet<>();

	private final JFrame frame;
	private final JTextArea textArea;
	private final JTextField textField;
	private final JButton button1, button2;
	private final JCheckBox checkBox;
	private final JLabel label1, label2;
	private final Clipboard clipboard;

	/** True while a worker started by {@link #startAnalyse()} is running. */
	private boolean running;

	/** Builds and shows the main window. */
	public EmailSpider() {

		frame = new JFrame("网页Email提取v1.0  by kyda");
		frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
		// Absolute positioning: every component below gets explicit bounds.
		frame.setLayout(null);
		clipboard = frame.getToolkit().getSystemClipboard();

		label1 = new JLabel("<html><font size=4 color=green>网址:</font></html>");
		label1.setBounds(20, 10, 50, 30);
		label1.addMouseListener(this); // clicking the label pastes the clipboard into the URL field
		textField = new JTextField(INIT_URL);
		textField.setBounds(70, 12, 310, 26);
		textField.addKeyListener(this); // Enter starts the extraction
		button1 = new JButton("<html><font size=4 color=blue>提取</font></html>");
		button1.setBounds(400, 10, 60, 30);
		button1.addActionListener(this);

		textArea = new JTextArea(WORDS);
		textArea.setLineWrap(true);
		JScrollPane scrollPane = new JScrollPane(textArea);
		scrollPane.setBounds(10, 50, 390, 290);
		scrollPane.setBorder(BorderFactory.createLoweredSoftBevelBorder());

		checkBox = new JCheckBox("换行");
		checkBox.setFont(checkBox.getFont().deriveFont(16f));
		checkBox.setSelected(true);
		checkBox.setBounds(410, 100, 90, 30);
		checkBox.addActionListener(this);

		button2 = new JButton("<html><font size=3>复制到<br>剪贴板</font></html>");
		button2.setBounds(410, 160, 70, 40);
		button2.addActionListener(this);
		label2 = new JLabel("<html><font size=4 color=green>请选择操作:</font></html>");
		label2.setBounds(10, 340, 480, 30);

		frame.add(label1);
		frame.add(textField);
		frame.add(button1);
		frame.add(scrollPane);
		frame.add(checkBox);
		frame.add(button2);
		frame.add(label2);

		frame.setSize(500, 400);
		frame.setLocationRelativeTo(null); // centre on screen
		frame.setResizable(false);
		frame.setVisible(true);
	}

	/** Handles the "extract" button and the "copy to clipboard" button. */
	@Override
	public void actionPerformed(ActionEvent e) {
		Object source = e.getSource();
		if (source == button1) {
			startAnalyse();
		} else if (source == button2) {
			String text = textArea.getText();
			if (text.length() > 0) {
				StringSelection selection = new StringSelection(text);
				clipboard.setContents(selection, selection);
				label2.setText("<html><font color=red>已复制到剪贴板。</font></html>");
			}
		}
	}

	/** Left-clicking the URL label pastes the clipboard text into the URL field. */
	@Override
	public void mouseClicked(MouseEvent e) {
		if (e.getButton() != MouseEvent.BUTTON1) {
			return;
		}
		try {
			Transferable contents = clipboard.getContents(frame);
			// getContents may return null when the clipboard is empty.
			String text = contents == null
					? ""
					: contents.getTransferData(DataFlavor.stringFlavor).toString();
			if (text.length() == 0) {
				label2.setText("剪贴板无内容!");
			} else {
				textField.setText(text);
				label2.setText("网址粘贴成功!");
			}
		} catch (UnsupportedFlavorException ex) {
			label2.setText("剪贴板内容不是字符串!");
		} catch (IOException ex) {
			ex.printStackTrace();
		}
	}

	/** Pressing Enter in the URL field starts the extraction. */
	@Override
	public void keyPressed(KeyEvent e) {
		if (e.getKeyCode() == KeyEvent.VK_ENTER) {
			startAnalyse();
		}
	}

	@Override
	public void keyReleased(KeyEvent arg0) {
	}

	@Override
	public void keyTyped(KeyEvent arg0) {
	}

	/**
	 * Starts the extraction on a background thread so the status label can update
	 * while pages download. Does nothing if an extraction is already in progress.
	 */
	public void startAnalyse() {
		if (!beginRun()) {
			return;
		}
		final String address = textField.getText();
		new Thread(new Runnable() {
			@Override
			public void run() {
				try {
					analyse(address);
				} finally {
					endRun();
				}
			}
		}).start();
	}

	/**
	 * Extracts e-mail addresses from the URL currently in the text field and shows
	 * them in the text area. Blocks until all pages have been downloaded.
	 */
	public void analyse() {
		analyse(textField.getText());
	}

	/**
	 * Adds every e-mail address found in {@code str} to the collected results,
	 * normalising full-width '@' and '.' and skipping addresses already collected.
	 *
	 * @param str text to scan (typically one line of page source); {@code null} is ignored
	 */
	public void getEmailAddr(String str) {
		emails.addAll(extractEmails(str));
	}

	/**
	 * Returns the e-mail addresses found in {@code text}, in order of appearance,
	 * with full-width '@' (U+FF20) and '.' (U+FF0E) replaced by their ASCII forms.
	 *
	 * @param text text to scan; may be {@code null}
	 * @return the matches (possibly with repeats); empty if none or {@code text} is null
	 */
	static List<String> extractEmails(String text) {
		List<String> found = new ArrayList<>();
		if (text == null) {
			return found;
		}
		Matcher m = EMAIL_PATTERN.matcher(text);
		while (m.find()) {
			found.add(m.group().replace('\uff20', '@').replace('\uff0e', '.'));
		}
		return found;
	}

	public static void main(String[] args) {

		// Ask for the Windows look and feel; on failure Swing keeps its default one.
		try {
			UIManager.setLookAndFeel("com.sun.java.swing.plaf.windows.WindowsLookAndFeel");
		} catch (Exception e) {
			e.printStackTrace();
		}

		// Swing components must be created on the event dispatch thread.
		SwingUtilities.invokeLater(new Runnable() {
			@Override
			public void run() {
				new EmailSpider();
			}
		});
	}

	/** Runs one extraction for {@code inputUrl} and reports the outcome in the status label. */
	private void analyse(String inputUrl) {
		// Drop anything left over from an earlier run that ended in an error.
		emails.clear();
		setStatus("邮箱地址提取中……");

		String address = inputUrl == null ? "" : inputUrl.trim();
		if (address.isEmpty()) {
			setStatus("请输入正确的网址!");
			return;
		}

		try {
			Matcher post = TIEBA_POST_PATTERN.matcher(address);
			boolean completed;
			if (post.find()) {
				completed = crawlTiebaPost(post.group(1), post.group(2));
			} else {
				setStatus("正在提取页面:" + 1 + "  共" + 1 + "页   地址:" + address);
				collect(fetchLines(address));
				completed = true;
			}
			if (completed) {
				publishResults();
			}
		} catch (MalformedURLException | IllegalArgumentException | UnknownHostException e) {
			setStatus("请输入正确的网址!");
		} catch (IOException e) {
			e.printStackTrace();
			setStatus("读取网页失败:" + e.getMessage());
		}
	}

	/**
	 * Crawls every page of a Tieba post, starting at page 1.
	 *
	 * @return false if the page count could not be found in the first page
	 */
	private boolean crawlTiebaPost(String scheme, String id) throws IOException {
		int totalPages = 1; // replaced by the real count once page 1 has been read
		for (int page = 1; page <= totalPages; ++page) {
			String pageUrl = scheme + TIEBA_POST_PATH + id + "?pn=" + page;
			List<String> lines = fetchLines(pageUrl);
			if (page == 1) {
				totalPages = findTotalPages(lines);
				if (totalPages < 0) {
					setStatus("貌似出错了~你真确定有这个帖子?");
					return false;
				}
			}
			setStatus("正在提取页面:" + page + "  共" + totalPages + "页   地址:" + pageUrl);
			collect(lines);
		}
		return true;
	}

	/** Returns the value of the first {@code all_page_num:<n>} marker, or -1 if absent. */
	private static int findTotalPages(List<String> lines) {
		for (String row : lines) {
			Matcher m = PAGE_COUNT_PATTERN.matcher(row);
			if (m.find()) {
				return Integer.parseInt(m.group(1));
			}
		}
		return -1;
	}

	/** Downloads {@code address} once and returns its content split into lines. */
	private static List<String> fetchLines(String address) throws IOException {
		List<String> lines = new ArrayList<>();
		// NOTE(review): decodes with the platform default charset, as before; full-width
		// characters are only recognised if that matches the page encoding.
		try (BufferedReader reader = new BufferedReader(
				new InputStreamReader(new URL(address).openStream()))) {
			String row;
			while ((row = reader.readLine()) != null) {
				lines.add(row);
			}
		}
		return lines;
	}

	/** Scans each line for e-mail addresses and adds them to the results. */
	private void collect(List<String> lines) {
		for (String row : lines) {
			getEmailAddr(row);
		}
	}

	/** Shows the collected addresses in the text area and the total in the status label. */
	private void publishResults() {
		final List<String> snapshot = new ArrayList<>(emails);
		emails.clear();
		SwingUtilities.invokeLater(new Runnable() {
			@Override
			public void run() {
				boolean oneWrapPerLine = checkBox.isSelected();
				StringBuilder out = new StringBuilder();
				for (String email : snapshot) {
					out.append(email).append(';');
					if (oneWrapPerLine) {
						out.append('\n');
					}
				}
				textArea.setText(out.toString());
				label2.setText("提取完成!  邮箱总计:" + snapshot.size());
			}
		});
	}

	/** Sets the status label text, hopping to the event dispatch thread if necessary. */
	private void setStatus(final String text) {
		if (SwingUtilities.isEventDispatchThread()) {
			label2.setText(text);
		} else {
			SwingUtilities.invokeLater(new Runnable() {
				@Override
				public void run() {
					label2.setText(text);
				}
			});
		}
	}

	/** Marks an extraction as started; returns false if one is already running. */
	private synchronized boolean beginRun() {
		if (running) {
			return false;
		}
		running = true;
		return true;
	}

	/** Marks the current extraction as finished. */
	private synchronized void endRun() {
		running = false;
	}
}
 
 
  • 学习了Java正则表达式,写了个帖子邮箱获取工具
            
    
    博客分类: JAVA java
  • 大小: 49.5 KB
相关标签: java