欢迎您访问程序员文章站,本站旨在为大家提供、分享程序员计算机编程知识!
您现在的位置是: 首页  >  IT编程

C# 爬虫

程序员文章站 2022-03-20 12:21:57
//PS 需要引用HtmlAgilityPack.dll 文件,可自行在网上下载 public partial class GrabInterface : Form { public int number = 1; public GrabInterface() { InitializeCompone ......

//PS:需要引用 HtmlAgilityPack.dll 文件,可自行在网上下载

/// <summary>
/// Main form of the scraper. It reads a list of URLs from a text file, downloads each
/// page, extracts the cell text of every HTML table and saves it through
/// <c>txthelper</c>; once started, the scrape repeats every 40 seconds.
/// </summary>
/// <remarks>
/// .NET framework names are written with their real casing (the lower-cased spellings do
/// not exist in .NET). Identifiers declared by this project (class, members, designer
/// controls, helper classes) are kept exactly as the rest of this file spells them.
/// </remarks>
public partial class grabinterface : Form
{
    /// <summary>Scrape-pass counter; appended to the name of the output folder.</summary>
    public int number = 1;

    /// <summary>
    /// Creates the designer-generated controls and subscribes the load handler.
    /// </summary>
    public grabinterface()
    {
        // Declared in the designer half of this partial class; spelled as in this file.
        initializecomponent();
        this.Load += grabinterface_load;
    }

    // Timer that re-runs btn_sure_click every 40 seconds after the first click.
    System.Timers.Timer mytimer;

    /// <summary>
    /// Delegate passed to Control.Invoke so control updates run on the UI thread.
    /// </summary>
    public delegate void action<in t>(t obj);

    /// <summary>Shows the "reading" status and disables the start button.</summary>
    /// <param name="t">Current pass number (unused).</param>
    public void actionread(int t)
    {
        this.lbl_ok.Text = "正在读取中,请稍后...";
        this.btn_sure.Enabled = false;
    }

    /// <summary>Shows the "finished" status.</summary>
    /// <param name="t">Current pass number (unused).</param>
    public void actionfinall(int t)
    {
        this.lbl_ok.Text = "读取完毕";
        // NOTE(review): the button stays disabled here, presumably because the timer keeps
        // re-running the scrape on its own - confirm that this is intended.
        this.btn_sure.Enabled = false;
    }

    /// <summary>
    /// True until the repeat timer has been created; guards against creating a second
    /// timer when this handler is re-entered by the timer itself.
    /// </summary>
    public static bool result = true;

    /// <summary>
    /// Starts a scrape pass. Runs on the UI thread for the first click and on a
    /// timer thread for every later pass, so control access goes through Invoke.
    /// </summary>
    private void btn_sure_click(object sender, EventArgs e)
    {
        if (String.IsNullOrEmpty(this.txt_url.Text))
        {
            runonui(delegate { this.lbl_ok.Text = "请选择需要读取的文件"; });
            return;
        }

        List<string> list_read;
        try
        {
            list_read = txthelper.read(this.txt_url.Text);
        }
        catch (Exception ex)
        {
            // A missing or locked file must not take down the UI thread or start the timer.
            runonui(delegate { this.lbl_message.Text = "提示:" + ex.Message; });
            return;
        }

        if (result)
        {
            mytimer = new System.Timers.Timer(1000 * 40); // repeat period: 40 s
            mytimer.Elapsed += btn_sure_click;
            mytimer.AutoReset = true;                     // keep firing
            mytimer.Enabled = true;
            result = false;
        }

        number++;
        Invoke(new action<int>(actionread), number);

        try
        {
            if (list_read != null && list_read.Count > 0)
            {
                foreach (string url in list_read)
                {
                    HtmlAgilityPack.HtmlDocument doc = geturltohtml(url);
                    if (doc == null)
                    {
                        // Download failed; geturltohtml already reported it. Try the next URL
                        // instead of aborting the whole pass on a null dereference.
                        continue;
                    }

                    List<string> rows = extractrows(doc);
                    if (rows.Count == 0)
                    {
                        continue;
                    }

                    StringBuilder content = new StringBuilder();
                    foreach (string row in rows)
                    {
                        content.Append(row).Append("\n");
                    }

                    // MM = month, HH = 24-hour clock (lower-case mm/hh mean minutes / 12-hour).
                    string sort = DateTime.Now.ToString("yyyyMMddHH") + number.ToString();
                    txthelper.savaprocess(content.ToString(), sort);
                }
                Invoke(new action<int>(actionfinall), number);
            }
        }
        catch (Exception ex)
        {
            Console.WriteLine("错误" + ex.Message + "");
        }
    }

    /// <summary>
    /// Converts every data row (a tr that has td cells and no th cells) of every table in
    /// the document into one tab-separated line.
    /// </summary>
    /// <returns>The extracted lines; empty when the page has no table rows.</returns>
    private static List<string> extractrows(HtmlAgilityPack.HtmlDocument doc)
    {
        List<string> lines = new List<string>();
        HtmlNodeCollection tables = doc.DocumentNode.SelectNodes("//table");
        if (tables == null)
        {
            return lines;
        }

        foreach (HtmlNode table in tables)
        {
            // ".//tr" is relative to this table; "//tr" would match rows of the whole page.
            HtmlNodeCollection rows = table.SelectNodes(".//tr");
            if (rows == null)
            {
                continue; // SelectNodes returns null, not an empty collection, on no match
            }

            foreach (HtmlNode row in rows)
            {
                if (row.SelectNodes("th") != null)
                {
                    continue; // header row
                }
                HtmlNodeCollection cells = row.SelectNodes("td");
                if (cells == null)
                {
                    continue;
                }

                StringBuilder line = new StringBuilder();
                foreach (HtmlNode cell in cells)
                {
                    // Strip all whitespace, then turn &nbsp; / &#160; entities into a space.
                    string text = Regex.Replace(cell.InnerText.Trim(), @"\s", "");
                    text = Regex.Replace(text, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
                    line.Append(text).Append("\t");
                }
                lines.Add(line.ToString());
            }
        }
        return lines;
    }

    /// <summary>Runs <paramref name="update"/> on the UI thread.</summary>
    private void runonui(MethodInvoker update)
    {
        if (InvokeRequired)
        {
            Invoke(update);
        }
        else
        {
            update();
        }
    }

    // Empty handler kept because the designer half may still reference it.
    private void timer1_tick(object sender, EventArgs e)
    {
    }

    /// <summary>Lets the user pick the text file that lists the URLs to scrape.</summary>
    private void btn_choose_click(object sender, EventArgs e)
    {
        using (OpenFileDialog dialog = new OpenFileDialog())
        {
            dialog.Multiselect = false; // single file only
            dialog.Title = "请选择电子文档excel";
            dialog.Filter = "所有文件(*.txt)|*.txt";
            if (dialog.ShowDialog() == System.Windows.Forms.DialogResult.OK)
            {
                txt_url.Text = dialog.FileName;
            }
        }
    }

    /// <summary>Thread that scrapes the proxy list.</summary>
    public Thread t1;
    /// <summary>Thread that validates the scraped proxies.</summary>
    public Thread t2;

    public static ManualResetEvent eventab = new ManualResetEvent(false);

    /// <summary>Starts the proxy scraping and proxy validation threads.</summary>
    private void grabinterface_load(object sender, EventArgs e)
    {
        t1 = new Thread(new ThreadStart(redirhelper.getproxylist));
        t2 = new Thread(new ThreadStart(redirhelper.yanzhengip));
        // Both workers loop forever; as foreground threads they would keep the process
        // alive after Application.Exit().
        t1.IsBackground = true;
        t2.IsBackground = true;
        t1.Start();
        Thread.Sleep(1000); // give the scraper a head start before validation begins
        t2.Start();
    }

    /// <summary>Stops the repeat timer and exits the application.</summary>
    private void btn_stop_click(object sender, EventArgs e)
    {
        if (mytimer != null)
        {
            mytimer.Close();
            mytimer.Dispose();
        }
        Application.Exit();
    }

    /// <summary>
    /// Probes <paramref name="url"/> through a randomly chosen validated proxy and, when
    /// the probe succeeds, downloads and parses the page.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <returns>
    /// The parsed document, or null when no validated proxy is available or the
    /// request fails (the reason is shown in lbl_message).
    /// </returns>
    public HtmlAgilityPack.HtmlDocument geturltohtml(string url)
    {
        try
        {
            // Snapshot: the validation thread replaces and refills this list concurrently.
            redirhelper.proxy[] proxies = redirhelper.list_proxy.ToArray();
            if (proxies.Length == 0)
            {
                runonui(delegate
                {
                    lbl_ok.Text = "";
                    lbl_message.Text = "提示:未获取到有效的代理ip";
                });
                return null;
            }

            redirhelper.proxy chosen = proxies[new Random().Next(0, proxies.Length)];
            System.Net.WebRequest wreq = System.Net.WebRequest.Create(url);
            wreq.Proxy = new WebProxy(chosen.ip, chosen.port);
            wreq.Timeout = 5000;
            using (System.Net.WebResponse wresp = wreq.GetResponse())
            {
                // Only the success of the proxied request matters; the body is not read.
            }

            // NOTE(review): as in the original, the page itself is then fetched by HtmlWeb
            // without the proxy - confirm whether the proxied response should be parsed.
            HtmlAgilityPack.HtmlDocument doc = new HtmlWeb().Load(url);

            runonui(delegate
            {
                if (!String.IsNullOrEmpty(lbl_message.Text))
                {
                    lbl_ok.Text = "正在读取中,请稍后...";
                    lbl_message.Text = "";
                }
            });
            return doc;
        }
        catch (Exception ex)
        {
            // No retry happens inside this call; the next timer tick tries again.
            runonui(delegate
            {
                lbl_ok.Text = "";
                lbl_message.Text = "提示:" + ex.Message + "正在尝试重新连接";
            });
            return null;
        }
    }
}

 

/// <summary>
/// Scrapes proxy addresses from a public listing and keeps checking which of them work.
/// Both public loops are meant to run forever on their own threads.
/// </summary>
/// <remarks>
/// The two public lists are only ever replaced by a new list instance, never modified in
/// place, so a reader that copies the reference sees a stable snapshot.
/// </remarks>
public static class redirhelper
{
    /// <summary>All scraped proxies, unique by IP.</summary>
    public static List<proxy> masterporxylist = new List<proxy>();

    /// <summary>Proxies that passed validation in the current validation pass.</summary>
    public static List<proxy> list_proxy = new List<proxy>();

    // Proxy listing page; {0} is the 1-based page number.
    private static string address_ip3366 = "http://www.ip3366.net/?stype=1&page={0}";

    // Pause while there is nothing to validate, instead of spinning at full CPU.
    private const int idle_pause_ms = 500;

    // Pause between two scrapes of the listing, so the site is not requested in a tight loop.
    private const int refresh_pause_ms = 5000;

    /// <summary>One proxy endpoint.</summary>
    public class proxy
    {
        public string ip { get; set; }

        public int port { get; set; }

        public string speed { get; set; }

        public proxy(string pip, int pport, string speed)
        {
            this.ip = pip;
            this.port = pport;
            this.speed = speed;
        }
    }

    /// <summary>
    /// Endlessly walks <see cref="masterporxylist"/>, requests a test page through each
    /// proxy and adds the ones that answer to <see cref="list_proxy"/>. After each full
    /// pass the valid list is reset and rebuilt.
    /// </summary>
    public static void yanzhengip()
    {
        int index = 0;
        while (true)
        {
            List<proxy> candidates = masterporxylist; // snapshot of the current reference
            if (candidates == null || candidates.Count == 0)
            {
                index = 0;
                System.Threading.Thread.Sleep(idle_pause_ms);
                continue;
            }
            if (index >= candidates.Count)
            {
                // The list was replaced by a shorter one since the previous iteration.
                index = 0;
            }

            proxy candidate = candidates[index];
            try
            {
                HttpWebRequest httpwreq = (HttpWebRequest)WebRequest.Create("http://www.baidu.com");
                httpwreq.Proxy = new WebProxy(candidate.ip, candidate.port);
                httpwreq.Timeout = 3000;
                using (HttpWebResponse httpwresp = (HttpWebResponse)httpwreq.GetResponse())
                {
                    // Any response counts as valid. Publish a new list rather than mutating
                    // the one other threads may be indexing right now.
                    List<proxy> valid = list_proxy ?? new List<proxy>();
                    if (!valid.Exists(p => p.ip == candidate.ip))
                    {
                        List<proxy> updated = new List<proxy>(valid);
                        updated.Add(new proxy(candidate.ip, candidate.port, ""));
                        list_proxy = updated;
                    }
                }
            }
            catch (Exception ex)
            {
                // A dead proxy is the expected case; log it and move on.
                Console.WriteLine(ex.Message);
            }

            index++;
            // ">=" instead of "==": with "==" the index could run past the end for good
            // whenever the list was empty or became shorter.
            if (index >= candidates.Count)
            {
                index = 0;
                list_proxy = new List<proxy>(); // start the next pass with a fresh list
            }
        }
    }

    /// <summary>
    /// Endlessly scrapes pages 1 and 2 of the proxy listing and publishes every proxy seen
    /// so far (unique by IP, in order of first appearance) as
    /// <see cref="masterporxylist"/>.
    /// </summary>
    public static void getproxylist()
    {
        List<proxy> collected = new List<proxy>();
        HashSet<string> seen = new HashSet<string>(); // IPs already in "collected"
        while (true)
        {
            for (int page = 1; page <= 2; page++)
            {
                string html = catchproxipmethord(String.Format(address_ip3366, page), "");
                HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
                doc.LoadHtml(html);
                HtmlNodeCollection tables = doc.DocumentNode.SelectNodes("//table");
                if (tables == null)
                {
                    continue;
                }

                foreach (HtmlNode table in tables)
                {
                    // ".//tr" is relative to this table; "//tr" would search the whole page.
                    HtmlNodeCollection rows = table.SelectNodes(".//tr");
                    if (rows == null)
                    {
                        continue;
                    }
                    foreach (HtmlNode row in rows)
                    {
                        if (row.SelectNodes("th") != null)
                        {
                            continue; // header row
                        }
                        HtmlNodeCollection cells = row.SelectNodes("td");
                        if (cells == null || cells.Count < 2)
                        {
                            continue;
                        }

                        string ip = cells[0].InnerText.Trim();
                        int port;
                        // A malformed port cell must not kill the scraping thread.
                        if (!Int32.TryParse(cells[1].InnerText.Trim(), out port))
                        {
                            continue;
                        }
                        if (seen.Add(ip))
                        {
                            collected.Add(new proxy(ip, port, ""));
                        }
                    }
                }
            }

            // Publish a copy so readers never observe the working list being modified.
            masterporxylist = new List<proxy>(collected);
            System.Threading.Thread.Sleep(refresh_pause_ms);
        }
    }

    /// <summary>Downloads a page and returns its text.</summary>
    /// <param name="url">Address to download; null or empty yields an empty result.</param>
    /// <param name="encoding">
    /// "utf8" for UTF-8, "default" for the system default encoding; any other value
    /// (including empty) selects gb2312.
    /// </param>
    /// <returns>The page text, or an empty string when the download fails.</returns>
    public static string catchproxipmethord(string url, string encoding)
    {
        if (String.IsNullOrEmpty(url))
        {
            return "";
        }

        try
        {
            Encoding ec;
            if (encoding == "utf8")
            {
                ec = Encoding.UTF8;
            }
            else if (encoding == "default")
            {
                ec = Encoding.Default;
            }
            else
            {
                ec = Encoding.GetEncoding("gb2312");
            }

            WebRequest request = WebRequest.Create(url);
            using (WebResponse response = request.GetResponse())
            using (Stream datastream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(datastream, ec))
            {
                return reader.ReadToEnd();
            }
        }
        catch (Exception ex)
        {
            // Best effort: callers treat an empty string as "nothing scraped".
            Console.WriteLine(ex.Message);
            return "";
        }
    }
}

/// <summary>
/// Reads the URL list from a text file and saves scraped text to text files.
/// </summary>
public class txthelper
{
    /// <summary>
    /// Writes <paramref name="data"/> to a new, timestamp-named .txt file under
    /// <c>&lt;root of the application's drive&gt;\savedir\&lt;sort&gt;</c>.
    /// </summary>
    /// <param name="data">Text to save; nothing is written when it is null or empty.</param>
    /// <param name="sort">Name of the sub-folder the file is placed in.</param>
    /// <returns>Full path of the written file, or an empty string when nothing was written.</returns>
    public static string savaprocess(string data, string sort)
    {
        if (String.IsNullOrEmpty(data))
        {
            return "";
        }

        // File name from the current time, down to milliseconds.
        // MM = month, HH = 24-hour clock; the lower-case forms mean minutes / 12-hour.
        string filename = System.DateTime.Now.ToString("yyyyMMddHHmmssfff") + ".txt";

        // Root of the volume the application runs from (e.g. "C:\"). GetPathRoot also
        // handles start-up paths without a drive letter, where IndexOf(':') returns -1.
        string root = System.IO.Path.GetPathRoot(System.Windows.Forms.Application.StartupPath);
        string curdir = System.IO.Path.Combine(System.IO.Path.Combine(root, @"savedir"), sort ?? "");

        // Creates every missing level and does nothing when the folder already exists.
        System.IO.Directory.CreateDirectory(curdir);

        string filepath = System.IO.Path.Combine(curdir, filename);
        // false = overwrite; "using" closes the file even when the write throws.
        using (System.IO.StreamWriter file = new System.IO.StreamWriter(filepath, false))
        {
            file.Write(data);
        }
        return filepath;
    }

    /// <summary>
    /// Reads a text file using the system default encoding.
    /// </summary>
    /// <param name="path">Path of the file to read.</param>
    /// <returns>The distinct non-empty lines, in order of first appearance.</returns>
    public static List<string> read(string path)
    {
        List<string> list = new List<string>();
        // "using" releases the file handle, which the original never closed.
        using (StreamReader sr = new StreamReader(path, Encoding.Default))
        {
            string line;
            while ((line = sr.ReadLine()) != null)
            {
                if (!String.IsNullOrEmpty(line))
                {
                    list.Add(line);
                }
            }
        }
        return list.Distinct().ToList();
    }
}