如何打造网站克隆、仿站工具(C#版)
程序员文章站
2022-05-18 23:01:52
前两天朋友叫我模仿一个网站,刚刚开始,我一个页面一个页面查看源码并复制和保存,花了我很多时间,一个字“累”,为了减轻工作量,我写了个网站“克隆工具”,一键克隆,比起人工操作, 效率提高了200%以上,精确度也大大提高,虽然网上也很多网站克隆工具,但我觉得作为一个程序员,要有点研究精神,哈哈哈,可以根 ......
前两天朋友叫我模仿一个网站,刚刚开始,我一个页面一个页面查看源码并复制和保存,花了我很多时间,一个字“累”,为了减轻工作量,我写了个网站“克隆工具”,一键克隆,比起人工操作,
效率提高了200%以上,精确度也大大提高,虽然网上也很多网站克隆工具,但我觉得作为一个程序员,要有点研究精神,哈哈哈,可以根据自己的需要随意编写自己需要的功能。
下面我将我写的“网站克隆工具”实现方法分享给大家,源码在文末有下载链接,有需要的朋友可以下载来玩,也可以根据自己的需要做相应的修改或优化。
一睹为快,先看看界面:
简单的工作流程:
项目代码目录结构:
下面一步步实现程序功能:
1.新建主界面窗体(mainform.cs):
2.新建模型类(urlmodel.cs)
/// <summary>
/// Parsed parts of a page/resource url, used throughout the cloner.
/// </summary>
public class urlmodel
{
    // Path relative to the site root, e.g. "/news/1.html" (used as the de-dupe key).
    public string relatedpath { get; set; }
    // Full absolute url, e.g. "http://host/news/1.html".
    public string absoluteuri { get; set; }
    // Absolute url of the current directory (everything up to the last '/').
    public string currpath { get; set; }
    // Site root, e.g. "http://host" or "http://host:8080".
    public string rootpath { get; set; }
    // Host name without the port.
    public string host { get; set; }
    // Tcp port (defaulted by the parser when the url has no explicit port).
    public int port { get; set; }
    // "http" or "https".
    public string scheme { get; set; }
}
3.新建服务类(services)
urlparser:
/// <summary>
/// Parses an absolute http/https url into its component parts (<see cref="urlmodel"/>).
/// </summary>
public class urlparser
{
    /// <summary>
    /// Parses <paramref name="url"/> into a <see cref="urlmodel"/>.
    /// </summary>
    /// <param name="url">Absolute url beginning with "http:" or "https:".</param>
    /// <returns>The populated model.</returns>
    /// <exception cref="Exception">Thrown when the url is null/too short, has the wrong scheme, or does not parse.</exception>
    public static urlmodel parse(string url)
    {
        urlmodel model = new urlmodel();
        // Basic validation (null check added: the original threw NullReferenceException instead).
        if (url == null || url.Length < 8)
            throw new Exception("url参数不正确");
        string lower = url.ToLower();
        if (!lower.StartsWith("http:") && !lower.StartsWith("https:"))
            throw new Exception("url格式有误");
        // Ensure there is a path component: "http://host" -> "http://host/".
        if (url.LastIndexOf('/') < 8)
            url = url + "/";
        Regex reg = new Regex("(?<scheme>(http|https))://(?<host>.+?)/", RegexOptions.Singleline);
        // Match once and reuse (the original re-ran the regex for every group access).
        Match m = reg.Match(url);
        if (!m.Success)
            throw new Exception("url解析失败!");
        string scheme = m.Groups["scheme"].Value;
        string host = m.Groups["host"].Value;
        if (host.Contains(":"))
        {
            var parts = host.Split(':');
            if (parts.Length == 2)
            {
                model.host = parts[0];
                model.port = int.Parse(parts[1]);
            }
        }
        else
        {
            model.host = host;
            // BUGFIX: the default port depends on the scheme (was hard-coded to 80 for https too).
            model.port = scheme == "https" ? 443 : 80;
        }
        int index = url.IndexOf('/', 8);
        model.relatedpath = url.Substring(index);
        model.absoluteuri = url;
        model.scheme = scheme;
        model.currpath = url.Substring(0, url.LastIndexOf("/"));
        // Omit the port from the root when it is the scheme's default.
        bool isdefaultport = (scheme == "http" && model.port == 80)
                          || (scheme == "https" && model.port == 443);
        if (isdefaultport)
        {
            model.rootpath = string.Format("{0}://{1}", model.scheme, model.host);
        }
        else
        {
            // BUGFIX: the original format string was "{0}://{1}:{2" (unclosed brace),
            // which throws FormatException for every non-default-port site.
            model.rootpath = string.Format("{0}://{1}:{2}", model.scheme, model.host, model.port);
        }
        return model;
    }
}
webpageservice:
/// <summary>
/// 网页处理服务工具
/// </summary>
public class webpageservice
{
private static string[] excludekeys = { "http:", "https:", "//", "#", "javascript:", "?", "tel:", "mailto:" };
/// <summary>
/// 获取所有html元素的href属性值,只获取站点本地的链接,站外的不获取
/// </summary>
/// <param name="html">页面的html源码</param>
/// <returns></returns>
public static list<urlmodel> getlocalhrefs(string url,string html)
{
if (string.isnullorempty(html))
return new list<urlmodel>();
dictionary<string, urlmodel> urls = gethrefs(url,html);
list<urlmodel> newurls = new list<urlmodel>();
if (null != urls)
{
foreach (string key in urls.keys)
{
string newkey = key.tolower();
bool iscontained = false;
foreach (var exkey in excludekeys)
{
if (newkey.indexof(exkey) == 0)
{
iscontained = true;
break;
}
}
if (!iscontained) {
//只获取本地路径
newurls.add(urls[key]);
}
}
}
return newurls;
}
/// <summary>
/// 获取所有html元素的src属性值,只获取站点本地的链接,站外的不获取
/// </summary>
/// <param name="html">页面的html源码</param>
/// <returns></returns>
public static list<urlmodel> getlocalsrcs(string url,string html)
{
if (string.isnullorempty(html))
return new list<urlmodel>();
dictionary<string, urlmodel> urls = getsrc(url, html);
list<urlmodel> newurls = new list<urlmodel>();
if (null != urls)
{
foreach (string key in urls.keys)
{
string newkey = key.tolower();
bool iscontained = false;
foreach (var exkey in excludekeys)
{
if (newkey.indexof(exkey) == 0)
{
iscontained = true;
break;
}
}
if (!iscontained)
{
//只获取本地路径
newurls.add(urls[key]);
}
}
}
return newurls;
}
private static dictionary<string, urlmodel> gethrefs(string url,string html)
{
if (string.isnullorempty(html))
return null;
urlmodel currurl = urlparser.parse(url);
dictionary<string, urlmodel> urls = new dictionary<string, urlmodel>();
regex reg = new regex("href=\"(?<url>.+?)\"", regexoptions.ignorecase);
if (currurl != null)
{
addurlmodel(html, currurl, urls, reg);
}
return urls;
}
private static dictionary<string, urlmodel> getsrc(string url,string html)
{
if (string.isnullorempty(html))
return null;
urlmodel currurl = urlparser.parse(url);
dictionary<string, urlmodel> urls = new dictionary<string, urlmodel>();
regex reg = new regex("(src=\"(?<url>.+?)\"|url\\((?<url>.+?)\\))", regexoptions.ignorecase);
if (currurl != null)
{
addurlmodel(html, currurl, urls, reg);
}
return urls;
}
private static void addurlmodel(string html, urlmodel currurl, dictionary<string, urlmodel> urls, regex reg)
{
if (reg.ismatch(html))
{
matchcollection matchs = reg.matches(html);
foreach (match item in matchs)
{
try
{
string strurl = item.groups["url"].value;
urlmodel model = new urlmodel();
model.relatedpath = strurl;
model.currpath = currurl.currpath;
model.rootpath = currurl.rootpath;
model.scheme = currurl.scheme;
model.port = currurl.port;
model.host = currurl.host;
if (strurl.startswith("/"))
{
//绝对目录情况下
model.absoluteuri = string.format("{0}{1}", model.rootpath, model.relatedpath);
}
else
{
//相对目录情况下
string currpath = model.currpath;
int depth = 0;
string path = model.relatedpath;
if (path.startswith(".."))
{
try
{
while (path.startswith(".."))
{
depth++;
path = path.substring(3);
currpath = currpath.substring(0, currpath.lastindexof("/"));
}
model.absoluteuri = string.format("{0}/{1}", currpath, path);
}
catch
{
}
}
else
{
model.absoluteuri = string.format("{0}/{1}", currpath, path);
}
}
strurl = strurl.trim().tolower();
urls.add(strurl, model);
}
catch
{
}
}
}
}
}
4.网页源码扒取类
/// <summary>
/// Low-level http helpers: fetch a page's html and download a file to disk.
/// </summary>
public class httptool
{
    /// <summary>
    /// Performs an http GET and returns the response body as text.
    /// </summary>
    /// <param name="url">Address to request.</param>
    /// <param name="referer">Value for the Referer header.</param>
    /// <param name="encoding">Encoding name used to decode the response, e.g. "utf-8".</param>
    /// <param name="msg">Empty on success; exception message + stack trace on failure.</param>
    /// <returns>The trimmed response body, or an empty string on failure.</returns>
    public static string httpget(string url, string referer, string encoding, out string msg)
    {
        msg = string.Empty;
        string result = string.Empty;
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
            request.Referer = referer;
            // BUGFIX: http method names are case-sensitive on the wire; was "get".
            request.Method = "GET";
            request.UserAgent = "mozilla/5.0 (windows nt 6.1; wow64) applewebkit/537.36 (khtml, like gecko) chrome/38.0.2125.122 safari/537.36";
            request.Timeout = 60000; // one minute
            // BUGFIX: the original leaked the response and stream when ReadToEnd threw;
            // using-blocks guarantee disposal on every path.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream responsestream = response.GetResponseStream())
            {
                if (responsestream != null)
                {
                    using (StreamReader reader = new StreamReader(responsestream, System.Text.Encoding.GetEncoding(encoding)))
                    {
                        result = reader.ReadToEnd().Trim();
                    }
                }
            }
        }
        catch (Exception ex)
        {
            msg = ex.Message + ex.StackTrace;
        }
        return result;
    }

    /// <summary>
    /// Downloads the resource at <paramref name="urladdress"/> into the file
    /// <paramref name="filename"/> under <paramref name="localpath"/>.
    /// </summary>
    /// <param name="urladdress">Absolute url of the resource.</param>
    /// <param name="localpath">Existing local directory to save into.</param>
    /// <param name="filename">File name to save as.</param>
    public static void downfile(string urladdress, string localpath, string filename)
    {
        string path = Path.Combine(localpath, filename);
        using (WebClient client = new WebClient())
        using (Stream str = client.OpenRead(urladdress))
        // BUGFIX: FileMode.Create truncates an existing file; OpenOrCreate left
        // stale trailing bytes when the new content was shorter than the old.
        using (FileStream fstr = new FileStream(path, FileMode.Create, FileAccess.Write))
        {
            // BUGFIX: the original read into a fixed 1,000,000-byte buffer and
            // silently truncated anything larger; stream in chunks instead.
            // (Also drops the StreamReader it created but never read text from.)
            byte[] buffer = new byte[8192];
            int read;
            while ((read = str.Read(buffer, 0, buffer.Length)) > 0)
            {
                fstr.Write(buffer, 0, read);
            }
        }
    }
}
5.网站克隆主类
接口:
interface iwebcloneworker { void start(); void cancel(); }
实现类:
/// <summary>
/// Clones a website onto the local disk: collects page and resource urls up to a
/// configurable depth, then downloads each one via a BackgroundWorker, reporting
/// progress through events.
/// </summary>
public class webcloneworker : iwebcloneworker
{
    // Crawl depth (0 = home page only, 1 = + category pages, 2 = + detail pages).
    public static int depth = 0;

    // Address of the site to clone.
    public string url { get; set; }

    // Local directory the cloned files are saved under.
    public string savepath { get; set; }

    // Worker that runs the collect + download pipeline off the ui thread.
    private backgroundworker backgroundworker1 = null;

    public event urlchangedeventhandler urlchanged;
    public event filesavedsuccesseventhandler filesavedsuccess;
    public event filesavedfaileventhandler filesavedfail;
    public event downloadcompletedeventhandler downloadcompleted;
    public event collectingurleventhandler collectingurl;
    public event collectedurleventhandler collectedurl;
    public event progresschangedeventhandler progresschanged;

    // All collected page / resource urls, keyed by relative path (key acts as de-dupe).
    private dictionary<string, urlmodel> _hrefs = new dictionary<string, urlmodel>();

    /// <summary>
    /// All collected page / resource urls, keyed by relative path.
    /// </summary>
    public dictionary<string, urlmodel> hrefs
    {
        get { return _hrefs; }
        set { _hrefs = value; }
    }

    // Page request encoding; defaults to utf-8.
    private string _encoding = "utf-8";

    // Page request encoding; defaults to utf-8.
    public string encoding
    {
        get { return _encoding; }
        set { _encoding = value; }
    }

    public webcloneworker()
    {
    }

    /// <summary>
    /// Creates a worker for the given site and target directory and wires up the
    /// BackgroundWorker with progress reporting and cancellation enabled.
    /// </summary>
    /// <param name="url">Site address to clone.</param>
    /// <param name="path">Local directory to save into.</param>
    public webcloneworker(string url, string path)
    {
        // Store the site url and the save path.
        this.url = url;
        this.savepath = path;
        if (string.isnullorempty(this.url))
            throw new exception("请输入网址"); // "please enter a url"
        if (string.isnullorempty(this.savepath))
            throw new exception("请选择要保存的目录"); // "please choose a save directory"
        backgroundworker1 = new backgroundworker();
        // Enable progress updates and cancellation.
        backgroundworker1.workerreportsprogress = true;
        backgroundworker1.workersupportscancellation = true;
        // Main worker body.
        backgroundworker1.dowork += backgroundworker1_dowork;
        // Progress / ui-update callback (also performs the file download, see below).
        backgroundworker1.progresschanged += backgroundworker1_progresschanged;
        // Completion callback.
        backgroundworker1.runworkercompleted += backgroundworker1_runworkercompleted;
    }

    // Forwards worker completion to the downloadcompleted event (unless cancelled).
    void backgroundworker1_runworkercompleted(object sender, runworkercompletedeventargs e)
    {
        if (e.cancelled)
        {
            return;
        }
        if (this.downloadcompleted != null)
        {
            downloadcompletedeventargs eventargs = new downloadcompletedeventargs(e.result, e.error, e.cancelled);
            this.downloadcompleted(this, eventargs);
        }
    }

    // Progress callback: raises progresschanged / urlchanged, then downloads the
    // reported url into the mirrored local directory structure.
    // NOTE(review): the download runs here, i.e. on the thread that receives the
    // progress event (typically the ui thread), not on the worker — confirm intended.
    void backgroundworker1_progresschanged(object sender, progresschangedeventargs e)
    {
        // Progress callback.
        if (this.progresschanged != null)
            this.progresschanged(this, e);
        urlmodel model = (urlmodel)e.userstate;
        if (this.urlchanged != null)
        {
            // Current-url-changed callback.
            urlchangedeventargs eventargs = new urlchangedeventargs(model);
            this.urlchanged(this, eventargs);
        }
        try
        {
            string dir = this.savepath;
            string url = model.absoluteuri;
            // Path part after "scheme://host" (the '/' search starts at index 8).
            string absolutepath = url.substring(url.indexof('/', 8));
            string filename = "";
            if (url.indexof('?') > 0)
            {
                // Strip the query string before extracting the file name.
                // NOTE(review): substrings absolutepath using an index computed on
                // relatedpath — verify these always line up.
                string path = absolutepath.substring(0, model.relatedpath.indexof('?'));
                filename = system.io.path.getfilename(path);
            }
            else
            {
                filename = system.io.path.getfilename(absolutepath);
            }
            // Directory-style urls (no file name / no extension) fall back to index.html.
            if (string.isnullorempty(filename) || filename.indexof(".") < 0)
            {
                filename = "index.html";
                if (!absolutepath.endswith("/"))
                    absolutepath = absolutepath + "/";
            }
            filename = system.web.httputility.urldecode(filename);
            // Mirror the remote directory structure under the save path.
            string localpath = string.format("{0}{1}", dir, system.io.path.getdirectoryname(absolutepath));
            if (!system.io.directory.exists(localpath))
            {
                system.io.directory.createdirectory(localpath);
            }
            // Skip files that were already downloaded.
            string path2 = path.combine(localpath, filename);
            if (file.exists(path2))
            {
                return;
            }
            // Download the page / image / resource file.
            httptool.downfile(url, localpath, filename);
            // Saved-successfully callback.
            if (this.filesavedsuccess != null)
            {
                filesavedsuccesseventargs eventargs = new filesavedsuccesseventargs(model);
                this.filesavedsuccess(this, eventargs);
            }
        }
        catch (exception ex)
        {
            // Save-failed callback.
            if (this.filesavedfail != null)
            {
                filesavedfaileventargs eventargs = new filesavedfaileventargs(ex);
                this.filesavedfail(this, eventargs);
            }
        }
    }

    // Worker body: collect all urls first, then report them one by one.
    // The actual download happens in the progress handler above.
    void backgroundworker1_dowork(object sender, doworkeventargs e)
    {
        // Collect every page / resource url.
        getresource();
        int index = 1;
        if (this.hrefs.keys.count > 0)
        {
            foreach (var k in this.hrefs.keys)
            {
                // Honor a pending cancellation.
                if (backgroundworker1.cancellationpending)
                {
                    e.cancel = true;
                    return;
                }
                backgroundworker1.reportprogress(index, this.hrefs[k]);
                index++;
                // Throttle: pause 200 ms between files.
                thread.sleep(200);
            }
        }
    }

    /// <summary>Starts the clone run (no-op while the worker is already busy).</summary>
    public void start()
    {
        if (this.backgroundworker1.isbusy)
            return;
        this.backgroundworker1.runworkerasync();
    }

    /// <summary>Requests cancellation of a running clone (no-op if already pending).</summary>
    public void cancel()
    {
        if (this.backgroundworker1.cancellationpending)
            return;
        this.backgroundworker1.cancelasync();
    }

    // Fetches the start page and kicks off url collection; raises collectedurl when done.
    private void getresource()
    {
        string url = this.url;
        string referer = this.url;
        string msg = "";
        string html = httptool.httpget(url, referer, this.encoding, out msg);
        // Collect the page links.
        gethrefs(0, url, html);
        // Collection finished callback (raised with a fresh, empty model).
        if (null != collectedurl)
        {
            urlmodel urlmodel = new urlmodel();
            collectedurleventargs eventargs = new collectedurleventargs(urlmodel);
            this.collectedurl(this, eventargs);
        }
    }

    // Recursively collects page + resource urls starting from the given page,
    // descending one level per call until the configured depth is reached.
    private void gethrefs(int level, string url, string html)
    {
        #region add the current page
        urlmodel currurl = urlparser.parse(url);
        try
        {
            // Honor a pending cancellation.
            if (backgroundworker1.cancellationpending)
                return;
            // Duplicate keys throw and are swallowed below — this is the de-dupe.
            this.hrefs.add(currurl.relatedpath, currurl);
            // Collecting-url callback.
            if (null != collectingurl)
            {
                collectingurleventargs eventargs = new collectingurleventargs(currurl);
                this.collectingurl(this, eventargs);
            }
        }
        catch
        {
        }
        #endregion
        // Links on the page (href attributes).
        list<urlmodel> list1 = webpageservice.getlocalhrefs(url, html);
        // Images / files / other resources (src attributes and css url()).
        list<urlmodel> listsrcs = webpageservice.getlocalsrcs(url, html);
        #region resources of the current level
        if (listsrcs != null)
        {
            for (int i = 0; i < listsrcs.count; i++)
            {
                urlmodel urlmodel = listsrcs[i];
                try
                {
                    // Honor a pending cancellation.
                    if (backgroundworker1.cancellationpending)
                        return;
                    this.hrefs.add(urlmodel.relatedpath, urlmodel);
                    // Collecting-url callback.
                    if (null != collectingurl)
                    {
                        collectingurleventargs eventargs = new collectingurleventargs(urlmodel);
                        this.collectingurl(this, eventargs);
                    }
                }
                catch
                {
                }
            }
        }
        #endregion
        #region child pages and their resources
        // Walk the next level of pages.
        if (list1 != null)
        {
            for (int i = 0; i < list1.count; i++)
            {
                urlmodel urlmodel = list1[i];
                try
                {
                    // Honor a pending cancellation.
                    if (backgroundworker1.cancellationpending)
                        return;
                    this.hrefs.add(urlmodel.relatedpath, urlmodel);
                    // Collecting-url callback.
                    if (null != collectingurl)
                    {
                        collectingurleventargs eventargs = new collectingurleventargs(urlmodel);
                        this.collectingurl(this, eventargs);
                    }
                }
                catch
                {
                }
                string msg = "";
                // Fetch the child page (referer = its own url).
                html =
                    httptool.httpget(urlmodel.absoluteuri, urlmodel.absoluteuri, this.encoding, out msg);
                #region resource files of the child page
                // Second-level resource files.
                listsrcs = webpageservice.getlocalsrcs(urlmodel.absoluteuri, html);
                if (listsrcs != null)
                {
                    for (int j = 0; j < listsrcs.count; j++)
                    {
                        urlmodel urlmodel2 = listsrcs[j];
                        try
                        {
                            // Honor a pending cancellation.
                            if (backgroundworker1.cancellationpending)
                                return;
                            this.hrefs.add(urlmodel2.relatedpath, urlmodel2);
                            // Collecting-url callback.
                            if (null != collectingurl)
                            {
                                collectingurleventargs eventargs = new collectingurleventargs(urlmodel2);
                                this.collectingurl(this, eventargs);
                            }
                        }
                        catch
                        {
                        }
                        // Throttle: 20 ms between resources.
                        thread.sleep(20);
                    }
                }
                #endregion
                // Throttle: 20 ms between child pages.
                thread.sleep(20);
                // Stop descending once the configured depth is reached.
                // NOTE(review): this check sits inside the loop, so remaining sibling
                // pages at this level are skipped once it triggers — confirm intended.
                if (level >= depth)
                    return;
                // Recurse into the child page.
                gethrefs(level + 1, urlmodel.absoluteuri, html);
            }
        }
        #endregion
    }
}
6.一些事件、委托类:
public delegate void urlchangedeventhandler(object sender, urlchangedeventargs e); public delegate void filesavedsuccesseventhandler(object sender, filesavedsuccesseventargs e); public delegate void filesavedfaileventhandler(object sender, filesavedfaileventargs e); public delegate void downloadcompletedeventhandler(object sender, downloadcompletedeventargs e); public delegate void collectingurleventhandler(object sender, collectingurleventargs e); public delegate void collectedurleventhandler(object sender, collectedurleventargs e); public delegate void progresschangedeventhandler(object sender, progresschangedeventargs e);
public class collectedurleventargs : eventargs public class collectingurleventargs : eventargs public class downloadcompletedeventargs : eventargs public class filesavedfaileventargs : eventargs public class filesavedsuccesseventargs : eventargs public class urlchangedeventargs : eventargs
代码有点多,各位有需要的还是下载源码查看并运行吧,由于赶时间,没时间仔细测试程序的各个功能,难免有不足的地方。
百度网盘:链接:https://pan.baidu.com/s/1hja1rl9uecl0dztqvft0dg 密码:7s6r
上一篇: python forkping
下一篇: 微信小程序蓝牙连接小票打印机