标签:跟踪 open 不同的 方便 fse 2gb argument readline 实体
[Code 2.1.1]
using System.Crawler;

// Body of the demo's Main method (the method header is not part of this listing).
// It starts two jobs — one GET, one POST — and then waits for a key press.
{
    #region Request data with GET
    var ant = new WorkerAnt
    {
        // Derive a throw-away id from the current time. The hash is reinterpreted as an
        // unsigned value instead of going through Math.Abs, because Math.Abs throws
        // OverflowException when the hash happens to be int.MinValue.
        WorkerId = unchecked((uint)DateTime.Now.ToString("HHmmssfff").GetHashCode()),
    };
    var job = new JobContext
    {
        JobName = "Mike test job 1",
        Uri = @"https://www.cnblogs.com/mikecheers/p/12090487.html",
    };
    ant.Work(job);
    #endregion

    #region Request data with POST
    // Build the C# program that is submitted as the "code" form field.
    var requestDataBuilder = new StringBuilder();
    requestDataBuilder.AppendLine("using System;");
    requestDataBuilder.AppendLine("namespace HelloWorldApplication");
    requestDataBuilder.AppendLine("{");
    requestDataBuilder.AppendLine(" class HelloWorld");
    requestDataBuilder.AppendLine(" {");
    requestDataBuilder.AppendLine(" static void Main(string[] args)");
    requestDataBuilder.AppendLine(" {");
    requestDataBuilder.AppendLine(" Console.WriteLine(\"《C# 爬虫 破境之道》\");");
    requestDataBuilder.AppendLine(" }");
    requestDataBuilder.AppendLine(" }");
    requestDataBuilder.AppendLine("}");

    // URL-encode the program and append the remaining form fields.
    var requestData = Encoding.UTF8.GetBytes(
        @"code=" + System.Web.HttpUtility.UrlEncode(requestDataBuilder.ToString())
        + @"&token=4381fe197827ec87cbac9552f14ec62a&language=10&fileext=cs");

    new WorkerAnt
    {
        // Same id scheme as above; see the note on Math.Abs there.
        WorkerId = unchecked((uint)DateTime.Now.ToString("HHmmssfff").GetHashCode())
    }.Work(new JobContext
    {
        JobName = "Mike test job 2",
        Uri = @"https://tool.runoob.com/compile.php",
        ContentType = @"application/x-www-form-urlencoded; charset=UTF-8",
        Method = WebRequestMethods.Http.Post,
        Buffer = requestData,
    });
    #endregion

    // Work() returns before the downloads finish, so this line is printed early.
    Console.WriteLine("End of Main method.");
    // Keep the process alive while the asynchronous callbacks run.
    Console.ReadLine();
}
[Code 2.1.2]
Worker 471365603 JobStatus: WaitingForActivation Worker 471365603 is starting a job named ‘Mike test job 1‘. Worker 471365603 JobStatus: WaitingToRun Worker 471365603 JobStatus: Running Worker 1110506678 JobStatus: WaitingForActivation Worker 1110506678 is starting a job named ‘Mike test job 2‘. Worker 1110506678 JobStatus: WaitingToRun Worker 1110506678 JobStatus: Running End of Main method. Totally 0 downloaded. Totally 512 downloaded. Totally 1024 downloaded. Totally 1536 downloaded. Totally 2048 downloaded. Totally 2560 downloaded. Totally 2624 downloaded. Totally 3136 downloaded. Totally 3648 downloaded. Totally 4024 downloaded. Totally 4028 downloaded. Totally 4540 downloaded. Totally 5052 downloaded. Totally 5422 downloaded. Totally 5934 downloaded. Totally 6446 downloaded. Totally 6822 downloaded. Totally 7334 downloaded. Totally 7846 downloaded. Totally 8222 downloaded. Totally 8734 downloaded. Totally 9246 downloaded. Totally 9758 downloaded. Totally 10270 downloaded. Totally 10782 downloaded. Totally 10886 downloaded. <!DOCTYPE html> <html lang="zh-cn"> <head> <meta charset="utf-8" /> <meta name="v... /* ********************** using 000.75ms / request ******************** */ Worker 471365603 JobStatus: RanToCompletion Totally 0 downloaded. Totally 81 downloaded. {"output":"\u300aC# \u722c\u866b \u7834\u5883\u4e4b\u9053\u300b\n","errors":"\n"} /* ********************** using 012.32ms / request ******************** */ Worker 1110506678 JobStatus: RanToCompletion
在[Code 2.1.1]中,主要涉及两个类:WorkerAnt(小工蚁,负责发起请求并采集数据)和 JobContext(任务上下文,保存一次采集任务的请求参数与运行状态)。
在[Code 2.1.2]中,我们可以跟踪到每只小工蚁的运行状态,采集到的数据以及耗时。同时也可以看到“End of Main method.”出现在比较靠前的位置,这说明我们的小工蚁还是有点儿小聪明的,可以采用异步的方式采集数据。
[Code 2.1.3]
namespace System.Crawler
{
    using System;
    using System.Diagnostics;
    using System.IO;
    using System.Net;
    using System.Security.Cryptography.X509Certificates;
    using System.Text;
    using System.Threading.Tasks;

    /// <summary>
    /// Carries everything that belongs to one crawl job: the request settings supplied
    /// by the caller and the runtime objects (request, response, streams, stopwatch)
    /// attached while the job is running.
    /// </summary>
    public class JobContext
    {
        /// <summary>
        /// Name of the job.
        /// </summary>
        public String JobName { get; set; }

        /// <summary>
        /// Current status of the job.
        /// </summary>
        public TaskStatus JobStatus { get; set; }

        /// <summary>
        /// Stopwatch used to time the job.
        /// </summary>
        public Stopwatch Watch { get; set; }

        /// <summary>
        /// The web request created for this job.
        /// </summary>
        public WebRequest Request { get; set; }

        /// <summary>
        /// The web response received for this job.
        /// </summary>
        public WebResponse Response { get; set; }

        /// <summary>
        /// Stream the request body is written to.
        /// </summary>
        public Stream RequestStream { get; set; }

        /// <summary>
        /// Stream the response body is read from.
        /// </summary>
        public Stream ResponseStream { get; set; }

        /// <summary>
        /// In-memory stream holding downloaded data.
        /// </summary>
        public MemoryStream Memory { get; set; }

        /// <summary>
        /// Byte buffer associated with the job.
        /// </summary>
        public byte[] Buffer { get; set; }

        /// <summary>
        /// Target URI of the request.
        /// </summary>
        public String Uri { get; set; }

        /// <summary>
        /// X.509 client certificates for the request.
        /// </summary>
        public X509CertificateCollection ClientCertificates { get; set; }

        /// <summary>
        /// HTTP headers for the request.
        /// </summary>
        public WebHeaderCollection Headers { get; set; }

        /// <summary>
        /// Proxy for the request.
        /// </summary>
        public IWebProxy Proxy { get; set; }

        /// <summary>
        /// Authentication credentials.
        /// </summary>
        public ICredentials Credentials { get; set; }

        /// <summary>
        /// HTTP version used for the request. Defaults to <see cref="HttpVersion.Version11"/>.
        /// </summary>
        public Version ProtocolVersion { get; set; } = System.Net.HttpVersion.Version11;

        /// <summary>
        /// Whether 100-Continue behaviour is used. Defaults to <c>true</c>.
        /// </summary>
        public bool Expect100Continue { get; set; } = true;

        /// <summary>
        /// HTTP method of the request. Defaults to GET.
        /// </summary>
        public String Method { get; set; } = WebRequestMethods.Http.Get;

        /// <summary>
        /// Time to wait before the request times out. Defaults to 100 seconds.
        /// </summary>
        public TimeSpan Timeout { get; set; } = TimeSpan.FromSeconds(100);

        /// <summary>
        /// Time-out for writing to or reading from a stream. Defaults to 5 minutes.
        /// </summary>
        public TimeSpan ReadWriteTimeout { get; set; } = TimeSpan.FromMinutes(5);

        /// <summary>
        /// Value of the Accept HTTP header. Defaults to <c>null</c>.
        /// </summary>
        public string Accept { get; set; }

        /// <summary>
        /// Value of the Content-Type HTTP header. Defaults to <c>null</c>.
        /// </summary>
        public string ContentType { get; set; }

        /// <summary>
        /// Value of the User-Agent HTTP header. Defaults to <c>null</c>.
        /// </summary>
        public string UserAgent { get; set; }

        /// <summary>
        /// Encoding of the returned data (typically utf-8, gbk or gb2312).
        /// Defaults to <c>null</c>.
        /// </summary>
        public Encoding Encoding { get; set; }

        /// <summary>
        /// Cookie string sent with the request.
        /// </summary>
        public string Cookie { get; set; }

        /// <summary>
        /// Cookie collection for the request.
        /// </summary>
        public CookieCollection Cookies { get; set; }

        /// <summary>
        /// Referer (source address) of the request.
        /// </summary>
        public string Referer { get; set; }

        /// <summary>
        /// Whether redirects are followed automatically. Defaults to <c>true</c>.
        /// </summary>
        public bool AllowAutoRedirect { get; set; } = true;

        /// <summary>
        /// Maximum number of connections. Defaults to 100.
        /// </summary>
        public int ConnectionLimit { get; set; } = 100;
    }
}
[Code 2.1.4]
namespace System.Crawler
{
    using System;
    using System.Diagnostics;
    using System.IO;
    using System.Net;
    using System.Net.Security;
    using System.Security.Cryptography.X509Certificates;
    using System.Text;
    using System.Threading.Tasks;

    /// <summary>
    /// The smallest unit of work of the crawler: a single "worker ant" that sends one
    /// HTTP request described by a <see cref="JobContext"/> and downloads the response
    /// using the Begin/End asynchronous pattern of <see cref="HttpWebRequest"/>.
    /// </summary>
    public class WorkerAnt
    {
        /// <summary>
        /// Size, in bytes, of the private buffer used to read the response stream.
        /// </summary>
        private const int ReadBufferSize = 512;

        /// <summary>
        /// Identifier printed in every status message of this worker.
        /// </summary>
        public UInt32 WorkerId { get; set; }

        /// <summary>
        /// Validates <paramref name="context"/>, builds the HTTP request from it and starts
        /// it asynchronously. The method returns as soon as the request has been started;
        /// progress is reported through <see cref="JobContext.JobStatus"/> and the console.
        /// </summary>
        /// <param name="context">Settings and runtime state of the job.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="context"/> or its <see cref="JobContext.Method"/> is null.
        /// </exception>
        /// <exception cref="FormatException">
        /// <see cref="JobContext.Uri"/> is null or is not a well-formed absolute URI.
        /// </exception>
        public void Work(JobContext context)
        {
            // Guard first: the status line below dereferences the context.
            if (null == context)
            {
                throw new ArgumentNullException(nameof(context), $"Worker { WorkerId } can not start a job with no context.");
            }

            Console.WriteLine($"Worker { WorkerId } JobStatus: " + (context.JobStatus = TaskStatus.WaitingForActivation).ToString());

            if (null == context.Method)
            {
                throw new ArgumentNullException(nameof(context) + "." + nameof(context.Method), $"Worker { WorkerId } can not start a job with no method.");
            }

            // WebRequest.CreateHttp needs an absolute URI, so relative ones are rejected here
            // with the descriptive message instead of failing later inside CreateHttp.
            if (null == context.Uri || !Uri.IsWellFormedUriString(context.Uri, UriKind.Absolute))
            {
                throw new FormatException($"Worker { WorkerId } can not start a job with uri ‘{context.Uri}‘ is not well formed.");
            }

            if (string.IsNullOrEmpty(context.JobName))
            {
                Trace.WriteLine($"Worker {WorkerId} is starting a job with no name.");
            }
            else
            {
                Trace.WriteLine($"Worker {WorkerId} is starting a job named ‘{context.JobName}‘.");
            }

            Console.WriteLine($"Worker { WorkerId } JobStatus: " + (context.JobStatus = TaskStatus.WaitingToRun).ToString());
            context.Watch = new Stopwatch();
            context.Watch.Start();

            var hasClientCertificates = null != context.ClientCertificates && 0 < context.ClientCertificates.Count;

            // Must be set before the connection is created.
            // NOTE(review): this callback accepts every server certificate and is installed
            // process-wide on ServicePointManager, so it affects all other requests too.
            // Kept for compatibility — confirm this is really intended.
            if (hasClientCertificates)
            {
                ServicePointManager.ServerCertificateValidationCallback = (sender, certificate, chain, errors) => true;
            }

            var request = WebRequest.CreateHttp(context.Uri);
            context.Request = request;

            if (hasClientCertificates)
            {
                foreach (X509Certificate item in context.ClientCertificates)
                {
                    request.ClientCertificates.Add(item);
                }
            }

            if (null != context.Headers && context.Headers.Count > 0)
            {
                request.Headers.Add(context.Headers);
            }

            request.Proxy = context.Proxy;

            if (null != context.ProtocolVersion)
            {
                request.ProtocolVersion = context.ProtocolVersion;
            }

            request.ServicePoint.Expect100Continue = context.Expect100Continue;
            request.Method = context.Method;

            // NOTE(review): the HttpWebRequest documentation states that Timeout is not
            // applied to the Begin* asynchronous calls used below — confirm whether an
            // explicit timer is needed.
            request.Timeout = (Int32)context.Timeout.TotalMilliseconds;
            request.ReadWriteTimeout = (Int32)context.ReadWriteTimeout.TotalMilliseconds;

            request.Accept = context.Accept;
            request.ContentType = context.ContentType;
            request.UserAgent = context.UserAgent;

            if (!string.IsNullOrEmpty(context.Cookie))
            {
                request.Headers[HttpRequestHeader.Cookie] = context.Cookie;
            }

            if (null != context.Cookies)
            {
                request.CookieContainer = new CookieContainer();
                request.CookieContainer.Add(context.Cookies);
            }

            request.Referer = context.Referer;
            request.AllowAutoRedirect = context.AllowAutoRedirect;

            if (0 < context.ConnectionLimit)
            {
                request.ServicePoint.ConnectionLimit = context.ConnectionLimit;
            }

            Console.WriteLine($"Worker { WorkerId } JobStatus: " + (context.JobStatus = TaskStatus.Running).ToString());

            if (null != context.Buffer && 0 < context.Buffer.Length)
            {
                // A non-empty buffer is the request body: upload it, then fetch the response.
                request.ContentLength = context.Buffer.Length;
                request.BeginGetRequestStream(OnRequestStreamReady, context);
            }
            else
            {
                GetResponse(context);
            }
        }

        /// <summary>
        /// Callback of BeginGetRequestStream: obtains the request stream and starts
        /// writing the request body to it.
        /// </summary>
        private void OnRequestStreamReady(IAsyncResult asyncResult)
        {
            var context = (JobContext)asyncResult.AsyncState;
            try
            {
                context.RequestStream = context.Request.EndGetRequestStream(asyncResult);
                context.RequestStream.BeginWrite(context.Buffer, 0, context.Buffer.Length, OnRequestBodyWritten, context);
            }
            catch (WebException error)
            {
                Fail(context, error);
            }
            catch (IOException error)
            {
                Fail(context, error);
            }
        }

        /// <summary>
        /// Callback of BeginWrite: completes the upload, closes the request stream and
        /// asks for the response.
        /// </summary>
        private void OnRequestBodyWritten(IAsyncResult asyncResult)
        {
            var context = (JobContext)asyncResult.AsyncState;
            try
            {
                context.RequestStream.EndWrite(asyncResult);
                context.RequestStream.Close();
                GetResponse(context);
            }
            catch (WebException error)
            {
                Fail(context, error);
            }
            catch (IOException error)
            {
                Fail(context, error);
            }
        }

        /// <summary>
        /// Starts the asynchronous retrieval of the response for the request stored in
        /// <paramref name="context"/>.
        /// </summary>
        private void GetResponse(JobContext context)
        {
            context.Request.BeginGetResponse(OnResponseReady, context);
        }

        /// <summary>
        /// Callback of BeginGetResponse: downloads the whole response body into
        /// <see cref="JobContext.Memory"/>, prints a preview and the timing, and marks
        /// the job as completed.
        /// </summary>
        private void OnResponseReady(IAsyncResult asyncResult)
        {
            var context = (JobContext)asyncResult.AsyncState;
            try
            {
                // The response, its stream and the memory stream are disposed when this
                // block ends; the context keeps references to the disposed objects.
                using (context.Response = context.Request.EndGetResponse(asyncResult))
                using (context.ResponseStream = context.Response.GetResponseStream())
                using (context.Memory = new MemoryStream())
                {
                    // Read into a private buffer so the caller's request payload in
                    // context.Buffer is not overwritten, and so an empty payload array
                    // cannot turn every read into a zero-byte read.
                    var chunk = new byte[ReadBufferSize];
                    int readCount;
                    do
                    {
                        Console.WriteLine($"Totally {context.Memory.Length} downloaded.");
                        readCount = context.ResponseStream.Read(chunk, 0, chunk.Length);
                        if (0 < readCount)
                        {
                            context.Memory.Write(chunk, 0, readCount);
                        }
                    } while (0 < readCount);

                    context.Request.Abort();
                    context.Response.Close();
                    // Drop the reference to the request payload once the job is done.
                    context.Buffer = null;

                    // Honour the encoding requested on the context; fall back to UTF-8 without BOM.
                    var encoding = context.Encoding ?? new UTF8Encoding(false);
                    var content = encoding.GetString(context.Memory.ToArray());
                    Console.WriteLine(content.Length > 100 ? content.Substring(0, 90) + "..." : content);

                    context.Watch.Stop();
                    // NOTE(review): the elapsed time is divided by 100 although only one request
                    // was made; this looks like a leftover from a 100-request benchmark — confirm.
                    Console.WriteLine("/* ********************** using {0}ms / request ******************** */"
                        + Environment.NewLine + Environment.NewLine, (context.Watch.Elapsed.TotalMilliseconds / 100).ToString("000.00"));
                    Console.WriteLine($"Worker { WorkerId } JobStatus: " + (context.JobStatus = TaskStatus.RanToCompletion).ToString());
                }
            }
            catch (WebException error)
            {
                Fail(context, error);
            }
            catch (IOException error)
            {
                Fail(context, error);
            }
        }

        /// <summary>
        /// Records a failed job: aborts the request, stops the stopwatch, sets the status
        /// to <see cref="TaskStatus.Faulted"/> and writes the error to the trace output.
        /// Without this, an exception inside an asynchronous callback would go unhandled.
        /// </summary>
        private void Fail(JobContext context, Exception error)
        {
            var webError = error as WebException;
            if (null != webError && null != webError.Response)
            {
                // A protocol error (for example 404) carries a response that must be released.
                webError.Response.Close();
            }

            if (null != context.Request)
            {
                context.Request.Abort();
            }

            if (null != context.Watch)
            {
                context.Watch.Stop();
            }

            Console.WriteLine($"Worker { WorkerId } JobStatus: " + (context.JobStatus = TaskStatus.Faulted).ToString());
            Trace.WriteLine($"Worker {WorkerId} job failed: {error}");
        }
    }
}
Work(JobContext context)方法:
GetResponse(JobContext context)方法:
另外就是附上说好的100 Continue报文:(通过Wireshark捕获)
[Code 2.1.5]
>>> 1. 发送HTTP报头,包含Expect: 100-continue ---------------------------------------------- POST /xxxxxxxxxxxxxx.ashx HTTP/1.1 Content-Type: application/x-www-form-urlencoded encode_key: Ha1P29PAhyzRRmBiBkTJ6Q== Host: Content-Length: 480 Expect: 100-continue Connection: Keep-Alive <<< 2. 收到100Continue ---------------------------------------------- HTTP/1.1 100 Continue Content-Length: 0 Date: Sat, 21 Sep 2019 01:27:18 GMT Server: WebSphere Application Server/8.0 >>> 3. 发送请求的数据实体 ---------------------------------------------- LloRmU0xleMjr8VibuqgDUvL9++cFpBDwtRt89fbWw2UsHjS1+cCPVmn0t9y4NysUZXAYIAlS5odowFdI/h5HAsSNk7jjaVsEK9dFseNfN+TaIIlwagFwvEEZ6tjZ0pF90hmq90iiHzH5ylDjuSfC3OJUpPrDEfAogcq/nRe8TwVRtVVSZ20RH5o0hDc/ibMSOBI/qVW+c1Ala2xfknQHi5RRGXSd3NauL9Bd0Oxk4lDIbGcWxVByoU9oZCeB8in4KdbjQtiHebigTRNiyS6lglZXY482ArxRq2Gourld/9F/gFhSCExiiBGkfwy6nzmdB66/JxBk4GYiO9fEfjamQAt3hPs8cE7zEDnPN25dVvpwhP66e3c81aUigOi6+P6634CyoSjMqyivy5p9SJsdFLeZueqH7QhZUAkR4+o4lyHVcdfs2FXlZnl23AWyBEMlcrwwzuGEYzLJqzkoxWVJ9KJP5qRbjQM <<< 4. 收到 Response ---------------------------------------------- HTTP/1.1 200 OK Content-Language: zh-CN Set-Cookie: JSESSIONID=0000Hc6wSQjAvFXM1m2GbqKaRSE:-1; Path=/; HttpOnly Transfer-Encoding: chunked Date: Sat, 21 Sep 2019 01:27:18 GMT Server: WebSphere Application Server/8.0 Expires: Thu, 01 Dec 1994 16:00:00 GMT Cache-Control: no-cache="set-cookie, set-cookie2" 168 1279tozwn5CTZnLt1r2pAYHxJ8HES8K0Sc0yhi5O2Tsk+/uZLPRraJlU9mqe/m6NwKRaWYraQmdz oWGsKdAgWFge5tGXlr1mvQCZO4/fWXFxM117snEnBm5bfwB9Zq+NOiF2E3L1WmT2Ooet40WAvMoR 3ZznhdI5Fm6gS0H7nLaYeujOlzc/lZWIl29HQHdHbnIWqqxIXbvdb9wXIycgHwAecFNtAWT7iS9H BQcqajo5he2h1ehDn/kJns9YMwCWVDQ7iQW/tqqlRzxFhpaAaHQXT+fZK/nhFbomFwdAekz32M6t 4qnHFXBsU6ABX50+bCj+QZ/e4t1M6On/nJXyQoytQwfKFJWt 0
喜欢本系列丛书的朋友,可以点击链接加入QQ交流群(994761602)【C# 破境之道】
《C# 爬虫 破境之道》:第二境 爬虫应用 — 第一节:HTTP协议数据采集
标签:跟踪 open 不同的 方便 fse 2gb argument readline 实体