看完了马士兵老师的正则表达式视频,想到了贴吧各种留邮箱,于是写了这个小工具。
输入网址(需以 http:// 开头,也可以单击“网址”标签直接粘贴剪贴板中的内容),程序会提取该页面中所有匹配到的邮箱地址,包括用全角“@”和全角“.”书写的邮箱。
学习了Java正则表达式,写了个帖子邮箱获取工具
程序员文章站
2022-04-09 10:50:45
...
截图:
代码:
// NOTE: java.awt.* is intentionally not imported as a wildcard (the original kept that import
// commented out) - presumably because, combined with java.util.*, it would make the simple
// name List ambiguous (java.awt.List vs java.util.List).
import java.awt.datatransfer.*;
import java.awt.event.*;
import java.io.*;
import java.net.*;
import java.util.*;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.regex.*;
import javax.swing.*;

/**
 * Small Swing tool that downloads a web page and lists every e-mail address found in it.
 *
 * <p>For a Baidu Tieba thread URL ({@code http://tieba.baidu.com/p/<id>}) every page of the
 * thread is scanned, starting from page 1; for any other URL only that single page is scanned.
 * Addresses written with the full-width at sign (U+FF20) or the full-width full stop (U+FF0E)
 * are matched as well and normalised to their ASCII forms.
 *
 * <p>Threading: downloads run on a worker thread started by {@link #startAnalyse()}; every
 * update of a Swing component is posted to the event dispatch thread. Only one extraction runs
 * at a time - a request made while another one is in progress is ignored.
 */
public class EmailSpider extends MouseAdapter implements ActionListener, KeyListener {

    private static final String INIT_URL = "http://tieba.baidu.com/p/2286659953?pn=4";
    /** URL prefix of a Tieba thread; the thread id and {@code ?pn=<page>} are appended to it. */
    private static final String URL_PREFIX = "http://tieba.baidu.com/p/";
    private static final String WORDS =
            "\n\n 说明:\n 百度贴吧帖子获取所有页面的邮箱,其他的获取当前页。"
            + "\n 匹配不全肯定是有的,只做了粗略匹配。"
            + "\n 也有可能只匹配到一段,有些人邮箱输入方式有点奇葩。"
            + "\n\n 单击“网址”可以粘贴剪贴板的网址。"
            + "\n 右键菜单没有写,反正可以Ctrl+C Ctrl+V"
            + "\n 提取时请不要进行操作。";

    /** Rough e-mail matcher; also accepts the full-width at sign and full-width full stop. */
    private static final Pattern EMAIL_PATTERN =
            Pattern.compile("[\\w.-[\uff0e]]+[@[\uff20]][\\w-]+[\\.[\uff0e]]?[\\w.-[\uff0e]]*\\w");
    /** Recognises a Tieba thread URL; group 1 is the thread id. */
    private static final Pattern TIEBA_THREAD_PATTERN =
            Pattern.compile("http://tieba\\.baidu\\.com/p/([0-9]{1,10})");
    /** Page-count marker looked for in the page source of a thread; group 1 is the count. */
    private static final Pattern TOTAL_PAGES_PATTERN = Pattern.compile("all_page_num:([0-9]{1,6})");

    /** Normalised addresses found so far in the current run, in first-seen order, no duplicates. */
    private final Set<String> emails = new LinkedHashSet<String>();
    /** True while an extraction is in progress; prevents overlapping runs. */
    private final AtomicBoolean running = new AtomicBoolean(false);

    private final JFrame frame;
    private final JTextArea textArea;
    private final JTextField textField;
    private final JButton button1;
    private final JButton button2;
    private final JCheckBox checkBox;
    private final JLabel label2;
    private final Clipboard clipboard;

    /** Builds the window with absolute positioning and shows it. */
    public EmailSpider() {
        frame = new JFrame("网页Email提取v1.0 by kyda");
        frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
        frame.setLayout(null);
        clipboard = frame.getToolkit().getSystemClipboard();

        // Clicking the "URL" caption pastes the clipboard into the text field (see mouseClicked).
        JLabel label1 = new JLabel("<html><font size=4 color=green>网址:</font></html>");
        label1.setBounds(20, 10, 50, 30);
        label1.addMouseListener(this);

        textField = new JTextField(INIT_URL);
        textField.setBounds(70, 12, 310, 26);
        textField.addKeyListener(this);

        button1 = new JButton("<html><font size=4 color=blue>提取</font></html>");
        button1.setBounds(400, 10, 60, 30);
        button1.addActionListener(this);

        textArea = new JTextArea(WORDS);
        textArea.setLineWrap(true);
        JScrollPane scrollPane = new JScrollPane(textArea);
        scrollPane.setBounds(10, 50, 390, 290);
        scrollPane.setBorder(BorderFactory.createLoweredSoftBevelBorder());

        // Plain text with an enlarged font: per the original author's note, an HTML caption
        // rendered slightly offset on this check box.
        checkBox = new JCheckBox("换行");
        checkBox.setFont(checkBox.getFont().deriveFont(16f));
        checkBox.setSelected(true);
        checkBox.setBounds(410, 100, 90, 30);

        button2 = new JButton("<html><font size=3>复制到<br>剪贴板</font></html>");
        button2.setBounds(410, 160, 70, 40);
        button2.addActionListener(this);

        label2 = new JLabel("<html><font size=4 color=green>请选择操作:</font></html>");
        label2.setBounds(10, 340, 480, 30);

        frame.add(label1);
        frame.add(textField);
        frame.add(button1);
        frame.add(scrollPane);
        frame.add(checkBox);
        frame.add(button2);
        frame.add(label2);

        frame.setSize(500, 400);
        frame.setLocationRelativeTo(null); // centre on screen
        frame.setResizable(false);
        frame.setVisible(true);
    }

    /** Starts an extraction (extract button) or copies the result text (copy button). */
    @Override
    public void actionPerformed(ActionEvent e) {
        Object source = e.getSource();
        if (source == button1) {
            startAnalyse();
        } else if (source == button2) {
            String text = textArea.getText();
            if (text.length() > 0) {
                StringSelection selection = new StringSelection(text);
                clipboard.setContents(selection, selection);
                label2.setText("<html><font color=red>已复制到剪贴板。</font></html>");
            }
        }
    }

    /** Left-clicking the "URL" caption pastes the clipboard text into the URL field. */
    @Override
    public void mouseClicked(MouseEvent e) {
        if (e.getButton() != MouseEvent.BUTTON1) {
            return;
        }
        try {
            // getContents may return null when the clipboard holds nothing.
            Transferable contents = clipboard.getContents(frame);
            String pasted = contents == null
                    ? ""
                    : contents.getTransferData(DataFlavor.stringFlavor).toString().trim();
            if (pasted.length() == 0) {
                label2.setText("剪贴板无内容!");
            } else {
                textField.setText(pasted);
                label2.setText("网址粘贴成功!");
            }
        } catch (UnsupportedFlavorException ex) {
            label2.setText("剪贴板内容不是字符串!");
        } catch (IOException | IllegalStateException ex) {
            // The clipboard data could not be read or the clipboard is currently unavailable.
            ex.printStackTrace();
            label2.setText("无法读取剪贴板!");
        }
    }

    /** Pressing Enter in the URL field starts an extraction. */
    @Override
    public void keyPressed(KeyEvent e) {
        if (e.getKeyCode() == KeyEvent.VK_ENTER) {
            startAnalyse();
        }
    }

    @Override
    public void keyReleased(KeyEvent arg0) {
    }

    @Override
    public void keyTyped(KeyEvent arg0) {
    }

    /**
     * Runs the extraction on a new worker thread so the window keeps repainting while pages
     * are downloaded. The URL is read from the text field on the calling thread.
     */
    public void startAnalyse() {
        final String address = textField.getText();
        new Thread(new Runnable() {
            @Override
            public void run() {
                analyse(address);
            }
        }, "email-spider").start();
    }

    /**
     * Extracts the addresses for the URL currently in the text field, blocking the calling
     * thread until finished. Does nothing if another extraction is already running.
     */
    public void analyse() {
        analyse(textField.getText());
    }

    /** Guards against overlapping runs and always discards the per-run state afterwards. */
    private void analyse(String address) {
        if (!running.compareAndSet(false, true)) {
            return;
        }
        try {
            extract(address == null ? "" : address.trim());
        } finally {
            // Also cleared after a failure, so partial results never leak into the next run.
            emails.clear();
            running.set(false);
        }
    }

    /** Downloads the page(s) behind {@code address}, then publishes the result or an error. */
    private void extract(String address) {
        setStatus("邮箱地址提取中……");
        if (address.isEmpty()) {
            setStatus("请输入正确的网址!");
            return;
        }
        try {
            Matcher thread = TIEBA_THREAD_PATTERN.matcher(address);
            if (thread.find()) {
                if (!collectFromThread(thread.group(1))) {
                    return;
                }
            } else {
                reportProgress(1, 1, address);
                collectFromLines(readLines(new URL(address)));
            }
            // Snapshot: the set is cleared by the caller while the UI update may still be queued.
            showResult(new ArrayList<String>(emails));
        } catch (MalformedURLException | IllegalArgumentException | UnknownHostException e) {
            setStatus("请输入正确的网址!");
        } catch (IOException e) {
            e.printStackTrace();
            setStatus("网页读取失败:" + e.getMessage());
        }
    }

    /**
     * Scans every page of the Tieba thread with the given id.
     *
     * @return {@code false} if page 1 carries no page-count marker (an error status is shown)
     */
    private boolean collectFromThread(String id) throws IOException {
        String firstUrl = URL_PREFIX + id + "?pn=1";
        // Page 1 is downloaded once and used both for the page count and for its addresses.
        List<String> firstPage = readLines(new URL(firstUrl));
        int totalPages = findTotalPages(firstPage);
        if (totalPages < 0) {
            setStatus("貌似出错了~你真确定有这个帖子?");
            return false;
        }
        reportProgress(1, totalPages, firstUrl);
        collectFromLines(firstPage);
        for (int page = 2; page <= totalPages; ++page) {
            String pageUrl = URL_PREFIX + id + "?pn=" + page;
            reportProgress(page, totalPages, pageUrl);
            collectFromLines(readLines(new URL(pageUrl)));
        }
        return true;
    }

    /** Returns the page count announced in the page source, or -1 if no marker is present. */
    private static int findTotalPages(List<String> lines) {
        for (String pageLine : lines) {
            Matcher marker = TOTAL_PAGES_PATTERN.matcher(pageLine);
            if (marker.find()) {
                return Integer.parseInt(marker.group(1));
            }
        }
        return -1;
    }

    /**
     * Downloads {@code url} and returns its content split into lines; the stream is always
     * closed. NOTE(review): the platform default charset is used, as in the original code;
     * pages in another encoding may decode the full-width characters incorrectly.
     */
    private static List<String> readLines(URL url) throws IOException {
        List<String> lines = new ArrayList<String>();
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream()))) {
            String current;
            while ((current = reader.readLine()) != null) {
                lines.add(current);
            }
        }
        return lines;
    }

    /** Feeds every line of a downloaded page to {@link #getEmailAddr(String)}. */
    private void collectFromLines(List<String> lines) {
        for (String pageLine : lines) {
            getEmailAddr(pageLine);
        }
    }

    /**
     * Adds every e-mail address found in {@code str} to the result of the current run;
     * addresses already collected are skipped.
     *
     * @param str one line of page source; {@code null} or empty adds nothing
     */
    public void getEmailAddr(String str) {
        emails.addAll(extractEmails(str));
    }

    /**
     * Finds all e-mail addresses in {@code text}, in order of appearance.
     *
     * @param text the text to scan; may be {@code null}
     * @return the matches with the full-width at sign and full stop replaced by their ASCII
     *         forms; duplicates are kept; never {@code null}
     */
    static List<String> extractEmails(String text) {
        List<String> found = new ArrayList<String>();
        if (text == null || text.isEmpty()) {
            return found;
        }
        Matcher match = EMAIL_PATTERN.matcher(text);
        while (match.find()) {
            found.add(match.group().replace('\uff20', '@').replace('\uff0e', '.'));
        }
        return found;
    }

    /** Shows the "extracting page x of y" status line. */
    private void reportProgress(int page, int totalPages, String pageUrl) {
        setStatus("正在提取页面:" + page + " 共" + totalPages + "页 地址:" + pageUrl);
    }

    /** Writes the addresses ("addr;" each, one per line if the check box is ticked) and the total. */
    private void showResult(final List<String> found) {
        runOnEdt(new Runnable() {
            @Override
            public void run() {
                boolean onePerLine = checkBox.isSelected();
                StringBuilder text = new StringBuilder();
                for (String address : found) {
                    text.append(address).append(';');
                    if (onePerLine) {
                        text.append('\n');
                    }
                }
                textArea.setText(text.toString());
                label2.setText("提取完成! 邮箱总计:" + found.size());
            }
        });
    }

    /** Sets the status line; safe to call from any thread. */
    private void setStatus(final String text) {
        runOnEdt(new Runnable() {
            @Override
            public void run() {
                label2.setText(text);
            }
        });
    }

    /** Runs {@code task} on the event dispatch thread (immediately if already on it). */
    private static void runOnEdt(Runnable task) {
        if (SwingUtilities.isEventDispatchThread()) {
            task.run();
        } else {
            SwingUtilities.invokeLater(task);
        }
    }

    /** Applies the Windows look and feel when it can be loaded, then opens the window. */
    public static void main(String[] args) {
        try {
            UIManager.setLookAndFeel("com.sun.java.swing.plaf.windows.WindowsLookAndFeel");
        } catch (ClassNotFoundException | InstantiationException | IllegalAccessException
                | UnsupportedLookAndFeelException e) {
            // The Windows look and feel could not be loaded; keep the default one.
            System.err.println("Windows look and feel unavailable: " + e);
        }
        SwingUtilities.invokeLater(new Runnable() {
            @Override
            public void run() {
                new EmailSpider();
            }
        });
    }
}
上一篇: JAVA学习,写的一个2048小游戏