// C# web crawler (C# 爬虫)
// NOTE: requires a reference to HtmlAgilityPack.dll, which can be downloaded separately.
/// <summary>
/// Main form of the crawler. It reads a list of URLs from a text file, downloads each page,
/// extracts the text of every table row that has data cells but no header cells, and hands the
/// result to <c>txthelper.savaprocess</c>. After the first click a 40-second timer repeats the crawl.
/// </summary>
/// <remarks>
/// Names declared by this project (class, methods, fields, controls) are kept exactly as they
/// appear in the source so the designer half of this partial class and the helper classes still
/// bind to them; only .NET / HtmlAgilityPack identifiers use their real, case-sensitive spelling.
/// </remarks>
public partial class grabinterface : Form
{
    /// <summary>Run counter; it is incremented on every crawl and appended to the output folder name.</summary>
    public int number = 1;

    // Compiled once instead of on every table cell.
    private static readonly Regex WhitespacePattern = new Regex(@"\s", RegexOptions.Compiled);
    private static readonly Regex NbspEntityPattern =
        new Regex(@"&(nbsp|#160);", RegexOptions.Compiled | RegexOptions.IgnoreCase);

    // One shared generator: creating a new Random per call can repeat the same value when calls
    // happen within the same clock tick. Random is not thread-safe, so access is locked.
    private static readonly Random ProxyPicker = new Random();

    public grabinterface()
    {
        InitializeComponent();
        this.Load += grabinterface_load;
    }

    // Timer that re-triggers the crawl; created on the first button click.
    System.Timers.Timer mytimer;

    /// <summary>Delegate used to marshal the status updates below onto the UI thread.</summary>
    public delegate void action<in t>(t obj);

    /// <summary>Shows the "reading" status and disables the start button. <paramref name="t"/> is not used.</summary>
    public void actionread(int t)
    {
        this.lbl_ok.Text = "正在读取中,请稍后...";
        this.btn_sure.Enabled = false;
    }

    /// <summary>Shows the "finished" status. <paramref name="t"/> is not used.</summary>
    public void actionfinall(int t)
    {
        this.lbl_ok.Text = "读取完毕";
        // NOTE(review): the button is left disabled after a run, exactly as in the original;
        // presumably because the timer keeps crawling - confirm this is intended.
        this.btn_sure.Enabled = false;
    }

    /// <summary>True until the timer has been created, so repeated invocations do not create a second timer.</summary>
    public static bool result = true;

    /// <summary>
    /// Runs <paramref name="work"/> on the UI thread. This handler chain is also entered from the
    /// timer's thread-pool thread, where touching controls directly is not allowed.
    /// </summary>
    private void RunOnUi(MethodInvoker work)
    {
        if (this.InvokeRequired)
        {
            this.Invoke(work);
        }
        else
        {
            work();
        }
    }

    /// <summary>
    /// Start-button handler, also subscribed to the timer's Elapsed event. Reads the URL list,
    /// crawls every URL and saves the extracted table text.
    /// </summary>
    private void btn_sure_click(object sender, EventArgs e)
    {
        string path = null;
        RunOnUi(delegate { path = this.txt_url.Text; });
        if (string.IsNullOrEmpty(path))
        {
            RunOnUi(delegate { this.lbl_ok.Text = "请选择需要读取的文件"; });
            return;
        }

        // Reading the file used to happen outside any try block, so a missing or locked file
        // surfaced as an unhandled exception. Report it and stop before the timer is created.
        List<string> list_read;
        try
        {
            list_read = txthelper.read(path);
        }
        catch (Exception ex)
        {
            Console.WriteLine("错误" + ex.Message + "");
            string reason = ex.Message;
            RunOnUi(delegate { this.lbl_message.Text = "提示:" + reason; });
            return;
        }

        if (result)
        {
            mytimer = new System.Timers.Timer(1000 * 40); // period: 40 seconds
            mytimer.Elapsed += btn_sure_click;
            mytimer.AutoReset = true;                      // keep firing
            mytimer.Enabled = true;
            result = false;
        }

        // NOTE(review): a crawl that takes longer than the timer period overlaps with the next
        // tick, and 'number' is then incremented from several threads without synchronization.
        number++;
        try
        {
            Invoke(new action<int>(actionread), number);

            if (list_read != null && list_read.Count > 0)
            {
                foreach (string url in list_read)
                {
                    HtmlAgilityPack.HtmlDocument doc = geturltohtml(url);
                    if (doc == null)
                    {
                        // Fetch failed (already reported on the form). Skip this URL instead of
                        // letting a NullReferenceException abort all remaining URLs.
                        continue;
                    }

                    StringBuilder content = new StringBuilder();
                    foreach (string row in ExtractTableRows(doc))
                    {
                        content.Append(row).Append("\n");
                    }

                    // "MM" is the month and "HH" the 24-hour clock; the lower-case "mm" that was
                    // here before is the minutes specifier.
                    string sort = DateTime.Now.ToString("yyyyMMddHH") + number.ToString();
                    txthelper.savaprocess(content.ToString(), sort);
                }

                Invoke(new action<int>(actionfinall), number);
            }
        }
        catch (Exception ex)
        {
            Console.WriteLine("错误" + ex.Message + "");
        }
    }

    /// <summary>
    /// Returns one string per table row that has td cells and no th cells. Each cell's text has
    /// all whitespace removed and nbsp entities turned into spaces, and is followed by a tab.
    /// </summary>
    private static List<string> ExtractTableRows(HtmlAgilityPack.HtmlDocument doc)
    {
        List<string> rows = new List<string>();

        // SelectNodes returns null (not an empty collection) when nothing matches.
        HtmlNodeCollection tables = doc.DocumentNode.SelectNodes("//table");
        if (tables == null)
        {
            return rows;
        }

        foreach (HtmlNode table in tables)
        {
            // ".//tr" searches below the current table; "//tr" would search the whole document.
            HtmlNodeCollection tableRows = table.SelectNodes(".//tr");
            if (tableRows == null)
            {
                continue;
            }

            foreach (HtmlNode row in tableRows)
            {
                if (row.SelectNodes("th") != null)
                {
                    continue; // header row
                }

                HtmlNodeCollection cells = row.SelectNodes("td");
                if (cells == null)
                {
                    continue;
                }

                StringBuilder line = new StringBuilder();
                foreach (HtmlNode cell in cells)
                {
                    string text = WhitespacePattern.Replace(cell.InnerText.Trim(), "");
                    line.Append(NbspEntityPattern.Replace(text, " ")).Append("\t");
                }
                rows.Add(line.ToString());
            }
        }

        return rows;
    }

    private void timer1_tick(object sender, EventArgs e)
    {
    }

    /// <summary>Lets the user pick the text file that contains the URLs.</summary>
    private void btn_choose_click(object sender, EventArgs e)
    {
        using (OpenFileDialog dialog = new OpenFileDialog())
        {
            dialog.Multiselect = false; // single file only
            dialog.Title = "请选择电子文档excel";
            dialog.Filter = "所有文件(*.txt)|*.txt";
            if (dialog.ShowDialog() == System.Windows.Forms.DialogResult.OK)
            {
                txt_url.Text = dialog.FileName;
            }
        }
    }

    public Thread t1;
    public Thread t2;
    // Not referenced anywhere in this class; kept because it is public.
    public static ManualResetEvent eventab = new ManualResetEvent(false);

    /// <summary>Starts the proxy-list scraper and the proxy validator when the form loads.</summary>
    private void grabinterface_load(object sender, EventArgs e)
    {
        t1 = new Thread(new ThreadStart(redirhelper.getproxylist)); // scrape proxy addresses
        t2 = new Thread(new ThreadStart(redirhelper.yanzhengip));   // validate scraped proxies

        // Both workers loop forever. As foreground threads they would keep the process alive
        // after Application.Exit(), so mark them as background threads.
        t1.IsBackground = true;
        t2.IsBackground = true;

        t1.Start();
        Thread.Sleep(1000); // head start for the scraper, as in the original
        t2.Start();
    }

    /// <summary>Stops the timer and exits the application.</summary>
    private void btn_stop_click(object sender, EventArgs e)
    {
        if (mytimer != null)
        {
            mytimer.Close();
            mytimer.Dispose();
        }
        Application.Exit();
    }

    /// <summary>
    /// Downloads <paramref name="url"/> and returns the parsed document, or null when no
    /// validated proxy is available or the request fails.
    /// </summary>
    /// <remarks>
    /// One attempt is made per call. The first request goes through a randomly chosen validated
    /// proxy with a 5-second timeout and its body is discarded.
    /// NOTE(review): the document itself is then loaded by HtmlWeb.Load(url), which is not given
    /// the proxy, so the page is fetched twice and the content comes from a direct request.
    /// This is unchanged from the original - confirm whether the proxy was meant to carry it.
    /// </remarks>
    public HtmlAgilityPack.HtmlDocument geturltohtml(string url)
    {
        // Work on the reference read here so Count and the indexer see the same list object.
        List<redirhelper.proxy> list_proxy = redirhelper.list_proxy;
        if (list_proxy == null || list_proxy.Count == 0)
        {
            // Previously an empty pool was only reported through an index-out-of-range message,
            // and the intended message sat in unreachable code after the return statement.
            RunOnUi(delegate
            {
                lbl_ok.Text = "";
                lbl_message.Text = "提示:未获取到有效的代理ip";
            });
            return null;
        }

        try
        {
            int num;
            lock (ProxyPicker)
            {
                num = ProxyPicker.Next(0, list_proxy.Count);
            }
            redirhelper.proxy chosen = list_proxy[num];

            System.Net.WebRequest wreq = System.Net.WebRequest.Create(url);
            wreq.Proxy = new WebProxy(chosen.ip, chosen.port);
            wreq.Timeout = 5000;
            using (System.Net.WebResponse wresp = wreq.GetResponse())
            {
                // Getting a response is all that is needed here; disposing it releases the
                // connection, which the original never did.
            }

            HtmlWeb webclient = new HtmlWeb();
            HtmlAgilityPack.HtmlDocument doc = webclient.Load(url);

            RunOnUi(delegate
            {
                if (!string.IsNullOrEmpty(lbl_message.Text))
                {
                    lbl_ok.Text = "正在读取中,请稍后...";
                    lbl_message.Text = "";
                }
            });
            return doc;
        }
        catch (Exception ex)
        {
            string reason = ex.Message;
            RunOnUi(delegate
            {
                lbl_ok.Text = "";
                lbl_message.Text = "提示:" + reason + "正在尝试重新连接";
            });
            return null;
        }
    }
}
/// <summary>
/// Scrapes proxy addresses from a public listing and checks which of them actually work.
/// Both entry points loop forever and are meant to run on their own threads.
/// </summary>
/// <remarks>
/// The two public lists are read by other threads without locks. The methods here therefore
/// never modify a published list in place: they build a new list and assign the reference.
/// </remarks>
public static class redirhelper
{
    /// <summary>Every distinct proxy scraped so far.</summary>
    public static List<proxy> masterporxylist = new List<proxy>();

    /// <summary>Proxies that answered a test request.</summary>
    public static List<proxy> list_proxy = new List<proxy>();

    // Proxy listing page; {0} is the page number.
    private const string address_ip3366 = "http://www.ip3366.net/?stype=1&page={0}";

    // Wait while there is nothing to validate, instead of spinning.
    private const int IdlePauseMs = 1000;

    // Pause between two scrapes of the listing, so the site is not requested in a tight loop.
    private const int RefreshPauseMs = 30000;

    /// <summary>A proxy endpoint.</summary>
    public class proxy
    {
        public string ip { get; set; }
        public int port { get; set; }
        public string speed { get; set; }

        public proxy(string pip, int pport, string speed)
        {
            this.ip = pip;
            this.port = pport;
            this.speed = speed;
        }
    }

    /// <summary>
    /// Endlessly walks <see cref="masterporxylist"/>, sends a request to http://www.baidu.com
    /// through each proxy (3-second timeout) and publishes the ones that respond in
    /// <see cref="list_proxy"/>.
    /// </summary>
    /// <remarks>
    /// Fixes over the previous loop:
    /// - While the master list was still empty, the index kept increasing. Once the list arrived
    ///   the index was already past its end, and because the wrap test used "==", it never reset,
    ///   so no proxy was ever validated.
    /// - The valid list was cleared in place as soon as a pass finished. Now the result of the
    ///   finished pass replaces the published list, so readers are not handed an emptied list
    ///   while working proxies exist.
    /// </remarks>
    public static void yanzhengip()
    {
        int index = 0;
        List<proxy> passResult = new List<proxy>(); // proxies that worked in the current pass

        while (true)
        {
            List<proxy> candidates = masterporxylist;
            if (candidates == null || candidates.Count == 0)
            {
                Thread.Sleep(IdlePauseMs);
                continue;
            }

            if (index >= candidates.Count)
            {
                // Pass finished: publish exactly what worked, dropping proxies that stopped responding.
                list_proxy = passResult;
                passResult = new List<proxy>();
                index = 0;
            }

            proxy candidate = candidates[index];
            index++;

            try
            {
                HttpWebRequest httpwreq = (HttpWebRequest)WebRequest.Create("http://www.baidu.com");
                httpwreq.Proxy = new WebProxy(candidate.ip, candidate.port);
                httpwreq.Timeout = 3000;

                using (HttpWebResponse httpwresp = (HttpWebResponse)httpwreq.GetResponse())
                {
                    // A response means the proxy is usable. Keep one entry per ip.
                    if (!passResult.Any(p => p.ip == candidate.ip))
                    {
                        passResult.Add(new proxy(candidate.ip, candidate.port, ""));
                    }

                    // Make it available immediately as well, so the first pass fills the
                    // published list as it goes.
                    List<proxy> published = list_proxy;
                    if (published == null || !published.Any(p => p.ip == candidate.ip))
                    {
                        List<proxy> grown = published == null
                            ? new List<proxy>()
                            : new List<proxy>(published);
                        grown.Add(new proxy(candidate.ip, candidate.port, ""));
                        list_proxy = grown;
                    }
                }
            }
            catch (Exception ex)
            {
                // A proxy that fails or times out is simply not added.
                Console.WriteLine(ex.Message);
            }
        }
    }

    /// <summary>
    /// Endlessly scrapes pages 1 and 2 of the proxy listing and publishes every distinct ip seen
    /// so far in <see cref="masterporxylist"/>.
    /// </summary>
    /// <remarks>
    /// The working list previously grew by a full page set on every pass and was de-duplicated
    /// with a quadratic scan each time; duplicates are now rejected on insert. Rows with fewer
    /// than two cells or a non-numeric port are skipped instead of throwing and ending the thread.
    /// </remarks>
    public static void getproxylist()
    {
        List<proxy> proxy_count = new List<proxy>();
        HashSet<string> seenIps = new HashSet<string>();

        while (true)
        {
            for (int page = 1; page <= 2; page++)
            {
                string urlcombin = string.Format(address_ip3366, page);
                string catchhtml = catchproxipmethord(urlcombin, "");
                if (string.IsNullOrEmpty(catchhtml))
                {
                    continue; // download failed; try again on the next pass
                }

                HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
                doc.LoadHtml(catchhtml);

                // SelectNodes returns null when nothing matches.
                HtmlNodeCollection tables = doc.DocumentNode.SelectNodes("//table");
                if (tables == null)
                {
                    continue;
                }

                foreach (HtmlNode table in tables)
                {
                    // ".//tr" = rows below this table; "//tr" would search the whole page.
                    HtmlNodeCollection tableRows = table.SelectNodes(".//tr");
                    if (tableRows == null)
                    {
                        continue;
                    }

                    foreach (HtmlNode row in tableRows)
                    {
                        if (row.SelectNodes("th") != null)
                        {
                            continue; // header row
                        }

                        HtmlNodeCollection cells = row.SelectNodes("td");
                        if (cells == null || cells.Count < 2)
                        {
                            continue;
                        }

                        string ip = cells[0].InnerText.Trim();
                        int port;
                        if (!int.TryParse(cells[1].InnerText.Trim(), out port))
                        {
                            continue;
                        }

                        if (seenIps.Add(ip))
                        {
                            proxy_count.Add(new proxy(ip, port, ""));
                        }
                    }
                }
            }

            // Publish a copy so later additions to the working list never touch a list that
            // another thread is reading.
            masterporxylist = new List<proxy>(proxy_count);

            Thread.Sleep(RefreshPauseMs);
        }
    }

    /// <summary>
    /// Downloads <paramref name="url"/> and returns the body as text, or an empty string when the
    /// url is empty or the download fails.
    /// </summary>
    /// <param name="url">Address to download.</param>
    /// <param name="encoding">
    /// "utf8" selects UTF-8, "default" selects the system default encoding, anything else
    /// (including an empty string) selects gb2312.
    /// </param>
    public static string catchproxipmethord(string url, string encoding)
    {
        if (string.IsNullOrEmpty(url))
        {
            return "";
        }

        try
        {
            Encoding ec;
            if (encoding == "utf8")
            {
                ec = Encoding.UTF8;
            }
            else if (encoding == "default")
            {
                ec = Encoding.Default;
            }
            else
            {
                ec = Encoding.GetEncoding("gb2312");
            }

            WebRequest request = WebRequest.Create(url);
            // using blocks release the response and streams even when reading fails.
            using (WebResponse response = request.GetResponse())
            using (Stream datastream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(datastream, ec))
            {
                return reader.ReadToEnd();
            }
        }
        catch (Exception ex)
        {
            // Still best-effort, but the failure is no longer silent.
            Console.WriteLine(ex.Message);
            return "";
        }
    }
}
/// <summary>
/// Text-file helpers: saves crawled text under "savedir" on the application's drive and reads
/// the URL list.
/// </summary>
public class txthelper
{
    /// <summary>
    /// Writes <paramref name="data"/> to a new, timestamp-named .txt file in
    /// &lt;root of the startup path&gt;\savedir\&lt;sort&gt;, creating the folders when needed.
    /// </summary>
    /// <param name="data">Text to save. Null or empty means nothing is written.</param>
    /// <param name="sort">Name of the sub-folder below "savedir".</param>
    /// <returns>Full path of the written file, or an empty string when nothing was written.</returns>
    public static string savaprocess(string data, string sort)
    {
        if (string.IsNullOrEmpty(data))
        {
            return "";
        }

        // File name from the current time. "MM" is the month and "HH" the 24-hour clock; the
        // all-lower-case pattern used before put the minutes where the month belongs.
        string filename = System.DateTime.Now.ToString("yyyyMMddHHmmssfff") + ".txt";

        // Root of the drive the application runs from. GetPathRoot also copes with paths that
        // have no drive letter, where Substring(0, IndexOf(':')) threw.
        string root = System.IO.Path.GetPathRoot(System.Windows.Forms.Application.StartupPath);
        string curdir = System.IO.Path.Combine(
            System.IO.Path.Combine(root, "savedir"),
            sort ?? string.Empty);

        // CreateDirectory creates every missing level and does nothing when the folder exists.
        System.IO.Directory.CreateDirectory(curdir);

        string filepath = System.IO.Path.Combine(curdir, filename);

        // false = overwrite. The using block closes the file even if Write throws.
        using (System.IO.StreamWriter file = new System.IO.StreamWriter(filepath, false))
        {
            file.Write(data);
        }

        return filepath;
    }

    /// <summary>
    /// Reads a text file with the system default encoding and returns its distinct non-empty
    /// lines in first-seen order.
    /// </summary>
    /// <param name="path">Path of the file to read.</param>
    /// <returns>The distinct non-empty lines.</returns>
    public static List<string> read(string path)
    {
        List<string> lines = new List<string>();

        // The reader was never closed before, which kept the file handle open.
        using (StreamReader sr = new StreamReader(path, Encoding.Default))
        {
            string line;
            while ((line = sr.ReadLine()) != null)
            {
                if (!string.IsNullOrEmpty(line))
                {
                    lines.Add(line);
                }
            }
        }

        return lines.Distinct().ToList();
    }
}
// (Blog navigation, not part of the program) Previous article: "PHP中动态显示签名和ip原理"