.Net 爬虫封装

“爬虫”就是一段用来自动化采集网站数据的程序。

ICrawler.cs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
using Caty.Spider.Crawler.Events;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Threading.Tasks;

namespace Caty.Spider.Crawler
{
/// <summary>
/// Contract for a crawler that fetches a single URL and reports its progress
/// through start / completed / error events.
/// </summary>
public interface ICrawler
{
    /// <summary>Raised when a crawl starts.</summary>
    event EventHandler<OnStartEventArgs> OnStart;

    /// <summary>Raised when a crawl has completed.</summary>
    event EventHandler<OnCompletedEventArgs> OnCompleted;

    /// <summary>Raised when a crawl fails with an exception.</summary>
    event EventHandler<OnErrorEventArgs> OnError;

    /// <summary>
    /// Starts an asynchronous crawl of <paramref name="uri"/>.
    /// </summary>
    /// <param name="uri">Address of the page to crawl.</param>
    /// <param name="proxy">
    /// Optional proxy server address. Defaults to <c>null</c> so that callers
    /// holding an <see cref="ICrawler"/> reference can omit it, matching the
    /// default already declared on <c>SimpleCrawler.Start</c>.
    /// </param>
    /// <returns>A task that produces the page source as a string.</returns>
    Task<string> Start(Uri uri, string proxy = null);
}
}

SimpleCrawler.cs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;
using System.IO.Compression;
using Caty.Spider.Crawler.Events;

namespace Caty.Spider.Crawler
{
/// <summary>
/// Simple crawler that issues one HTTP GET per <see cref="Start"/> call and
/// returns the response body decoded as UTF-8 text. Progress is reported through
/// the <see cref="OnStart"/>, <see cref="OnCompleted"/> and <see cref="OnError"/> events.
/// </summary>
public class SimpleCrawler : ICrawler
{
    /// <summary>Raised just before the request is built and sent.</summary>
    public event EventHandler<OnStartEventArgs> OnStart;

    /// <summary>Raised after the response body has been read successfully.</summary>
    public event EventHandler<OnCompletedEventArgs> OnCompleted;

    /// <summary>Raised when any exception is thrown while crawling.</summary>
    public event EventHandler<OnErrorEventArgs> OnError;

    /// <summary>
    /// Cookie container attached to every request. Initialised by the constructor
    /// so cookies persist between calls by default; may be replaced, or set to
    /// <c>null</c> to run without a cookie container.
    /// </summary>
    public CookieContainer CookiesContainer { get; set; }

    /// <summary>Creates a crawler with an empty cookie container.</summary>
    public SimpleCrawler()
    {
        // Previously left null, which meant no cookies were ever stored unless
        // the caller remembered to assign a container first.
        this.CookiesContainer = new CookieContainer();
    }

    /// <summary>
    /// Crawls <paramref name="uri"/> on a thread-pool thread.
    /// </summary>
    /// <param name="uri">Address of the page to crawl.</param>
    /// <param name="proxy">Optional proxy server address; <c>null</c> means no explicit proxy is set.</param>
    /// <returns>
    /// The page source. If an exception occurs it is reported through
    /// <see cref="OnError"/> and an empty string is returned instead of throwing.
    /// </returns>
    public async Task<string> Start(Uri uri, string proxy = null)
    {
        // The synchronous GetResponse() is kept (inside Task.Run) because
        // HttpWebRequest.Timeout only applies to the synchronous call.
        return await Task.Run(() =>
        {
            var pageSource = string.Empty;
            try
            {
                // Copy each delegate to a local before invoking so a concurrent
                // unsubscribe cannot null it between the check and the call.
                var startHandler = this.OnStart;
                if (startHandler != null)
                {
                    startHandler(this, new OnStartEventArgs(uri));
                }

                var watch = Stopwatch.StartNew();

                var request = (HttpWebRequest)WebRequest.Create(uri);
                request.Accept = "*/*";
                request.ContentType = "application/x-www-form-urlencoded"; // fixed misspelled media type
                request.AllowAutoRedirect = false; // do not follow redirects automatically
                // Present the request as coming from Google Chrome.
                request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36";
                request.Timeout = 5000; // request timeout: 5 seconds
                request.KeepAlive = true;
                request.Method = "GET";
                if (proxy != null)
                {
                    request.Proxy = new WebProxy(proxy);
                }

                // Snapshot the container so the same instance is used for both
                // the request and the cookie bookkeeping below.
                var cookies = this.CookiesContainer;
                request.CookieContainer = cookies;
                request.ServicePoint.ConnectionLimit = int.MaxValue;

                using (var response = (HttpWebResponse)request.GetResponse())
                {
                    // Keep the response cookies so later requests share the session.
                    if (cookies != null)
                    {
                        foreach (Cookie cookie in response.Cookies)
                        {
                            cookies.Add(cookie);
                        }
                    }

                    pageSource = ReadBody(response);
                }

                request.Abort();
                watch.Stop();

                var threadId = System.Threading.Thread.CurrentThread.ManagedThreadId; // thread that ran this crawl
                var milliseconds = watch.ElapsedMilliseconds; // elapsed request time

                var completedHandler = this.OnCompleted;
                if (completedHandler != null)
                {
                    completedHandler(this, new OnCompletedEventArgs(uri, threadId, milliseconds, pageSource));
                }
            }
            catch (Exception ex)
            {
                // Best-effort by design: failures are surfaced through OnError
                // rather than propagated to the caller.
                var errorHandler = this.OnError;
                if (errorHandler != null)
                {
                    errorHandler(this, new OnErrorEventArgs(uri, ex));
                }
            }
            return pageSource;
        });
    }

    /// <summary>
    /// Reads the whole response body as UTF-8 text, decompressing it first when
    /// the Content-Encoding header mentions gzip or deflate.
    /// </summary>
    private static string ReadBody(HttpWebResponse response)
    {
        // Ordinal, case-insensitive matching: the previous ToLower() was
        // culture-sensitive and could miss "GZIP" under e.g. a Turkish culture.
        var contentEncoding = response.ContentEncoding ?? string.Empty;

        Stream body = response.GetResponseStream();
        if (contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            body = new GZipStream(body, CompressionMode.Decompress);
        }
        else if (contentEncoding.IndexOf("deflate", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            body = new DeflateStream(body, CompressionMode.Decompress);
        }

        // Disposing the reader also disposes the (possibly wrapped) stream.
        using (var reader = new StreamReader(body, Encoding.UTF8))
        {
            return reader.ReadToEnd();
        }
    }
}
}

OnCompletedEventArgs.cs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
using System;
using System.Collections.Generic;
using System.Text;

namespace Caty.Spider.Crawler.Events
{
/// <summary>
/// Event data for the crawler's completed event. Derives from
/// <see cref="EventArgs"/> so it follows the standard .NET event pattern.
/// </summary>
public class OnCompletedEventArgs : EventArgs
{
    /// <summary>Address of the page that was crawled.</summary>
    public Uri Uri { get; private set; }

    /// <summary>Managed thread id of the task that performed the crawl.</summary>
    public int ThreadId { get; private set; }

    /// <summary>Source text of the downloaded page.</summary>
    public string PageSource { get; private set; }

    /// <summary>Time the request took, in milliseconds.</summary>
    public long Milliseconds { get; private set; }

    /// <summary>Creates the event data for a finished crawl.</summary>
    /// <param name="uri">Address of the page that was crawled.</param>
    /// <param name="threadId">Managed thread id of the task that performed the crawl.</param>
    /// <param name="milliseconds">Time the request took, in milliseconds.</param>
    /// <param name="pageSource">Source text of the downloaded page.</param>
    public OnCompletedEventArgs(Uri uri, int threadId, long milliseconds, string pageSource)
    {
        this.Uri = uri;
        this.ThreadId = threadId;
        this.Milliseconds = milliseconds;
        this.PageSource = pageSource;
    }
}
}

OnErrorEventArgs.cs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
using System;
using System.Collections.Generic;
using System.Text;

namespace Caty.Spider.Crawler.Events
{
/// <summary>
/// Event data for the crawler's error event. Derives from
/// <see cref="EventArgs"/> so it follows the standard .NET event pattern.
/// </summary>
public class OnErrorEventArgs : EventArgs
{
    /// <summary>Address of the page whose crawl failed.</summary>
    public Uri Uri { get; set; }

    /// <summary>Exception that was thrown during the crawl.</summary>
    public Exception Exception { get; set; }

    /// <summary>Creates the event data for a failed crawl.</summary>
    /// <param name="uri">Address of the page whose crawl failed.</param>
    /// <param name="exception">Exception that was thrown during the crawl.</param>
    public OnErrorEventArgs(Uri uri, Exception exception)
    {
        this.Uri = uri;
        this.Exception = exception;
    }
}
}

OnStartEventArgs.cs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
using System;
using System.Collections.Generic;
using System.Text;

namespace Caty.Spider.Crawler.Events
{
/// <summary>
/// Event data for the crawler's start event. Derives from
/// <see cref="EventArgs"/> so it follows the standard .NET event pattern.
/// </summary>
public class OnStartEventArgs : EventArgs
{
    /// <summary>Address of the page about to be crawled.</summary>
    public Uri Uri { get; set; }

    /// <summary>Creates the event data for a crawl that is starting.</summary>
    /// <param name="uri">Address of the page about to be crawled.</param>
    public OnStartEventArgs(Uri uri)
    {
        this.Uri = uri;
    }
}
}

如何使用

1
2
3
4
5
6
7
8
9
10
11
12
13
14
// Usage example: subscribe to the crawler's events, then start the crawl.
var url = "http://mebook.cc/";
var kindleCrawler = new SimpleCrawler();
kindleCrawler.OnStart += (s, e) =>
{
    // Code to run when the crawl starts.
};
kindleCrawler.OnError += (s, e) =>
{
    // Code to run when the crawl fails; e.Exception holds the error.
};
kindleCrawler.OnCompleted += (s, e) =>
{
    // Code to run when the crawl completes; e.PageSource holds the page source.
};
// Start the crawl; this must be awaited from inside an async method.
var pageSource = await kindleCrawler.Start(new Uri(url));

.Net 爬虫封装
http://blog.chcaty.cn/2018/03/06/net-pa-chong-feng-zhuang/
作者
caty
发布于
2018年3月6日
许可协议