码迷,mamicode.com
首页 > Web开发 > 详细

如何遍历一个网站的所有页面

时间:2015-05-12 16:02:58      阅读:332      评论:0      收藏:0      [点我收藏+]

标签:

using System;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Text.RegularExpressions;

namespace WebSiteIterate
{
    /// <summary>
    /// Walks the pages of a single web site breadth-first, starting from one
    /// URL and following the plain-text anchors found on each fetched page.
    /// Pages are fetched through the proxies supplied by the remote
    /// <c>Proxy.Proxy</c> service; the whole crawl runs from the constructor.
    /// </summary>
    class WebSiteIterate
    {
        // An href containing any of these fragments is not followed
        // (scripts, in-page anchors, downloadable documents, mail links, templates).
        private static readonly string[] SkippedLinkFragments =
        {
            "script", "#", ".rar", ".doc", ".pdf", "mailto", ".xls", "{", "\""
        };

        // The patterns rely on line breaks having been stripped beforehand,
        // because '.' does not match a newline by default.
        private static readonly Regex HeadBlock = new Regex(@"<head[^>]*?>.*?</head>");
        private static readonly Regex ScriptBlock = new Regex(@"<script[^>]*?>.*?</script>");

        // Only captures anchors whose content is plain text; an anchor that
        // wraps an image (or any other element) is not matched.
        private static readonly Regex TextAnchor = new Regex("<a\\s?[^>]*?\\s?href=\"([^\"]+)\"[^>]*>([^<]+)</a>");

        private readonly string _startUrl;
        private readonly string _host;

        // Every URL that has been queued or crawled so far; used to avoid revisiting.
        private readonly HashSet<string> _seen = new HashSet<string>();

        /// <summary>
        /// Records the start URL and its host, then immediately crawls the site.
        /// </summary>
        /// <param name="Url">Absolute URL of the page the crawl starts from.</param>
        /// <exception cref="ArgumentNullException"><paramref name="Url"/> is null.</exception>
        /// <exception cref="UriFormatException"><paramref name="Url"/> is not a valid absolute URI.</exception>
        public WebSiteIterate(string Url)
        {
            _startUrl = Url;
            _host = new Uri(Url).Host;
            Handle();
        }

        /// <summary>
        /// Runs the breadth-first crawl: pages are taken from the queue in
        /// discovery order and every new same-site link found on them is queued.
        /// </summary>
        private void Handle()
        {
            Proxy.Proxy proxy = (Proxy.Proxy)Activator.GetObject(typeof(Proxy.Proxy), "ipc://Proxy/Proxy");
            proxy.ProxyListGet();
            List<string> proxies = proxy.ProxyList();

            Queue<string> pending = new Queue<string>();
            pending.Enqueue(_startUrl);
            _seen.Add(_startUrl);

            while (pending.Count > 0)
            {
                string current = pending.Dequeue();
                foreach (Uri target in FetchLinks(current, proxies))
                {
                    if (!IsSameSite(target))
                    {
                        continue;
                    }

                    string key = target.ToString();
                    if (_seen.Add(key))   // Add returns false when the URL is already known
                    {
                        pending.Enqueue(key);
                    }
                }
            }
        }

        /// <summary>
        /// Fetches <paramref name="pageUrl"/> through each proxy in turn until
        /// one attempt yields at least one link or the proxy list is exhausted.
        /// With an empty proxy list no fetch is attempted.
        /// </summary>
        /// <returns>The absolute link targets found; empty when every attempt failed or found none.</returns>
        private static List<Uri> FetchLinks(string pageUrl, List<string> proxies)
        {
            List<Uri> links = new List<Uri>();

            Uri baseUri;
            if (!Uri.TryCreate(pageUrl, UriKind.Absolute, out baseUri))
            {
                Trace.TraceWarning("Skipping '{0}': not an absolute URI.", pageUrl);
                return links;
            }

            for (int attempt = 0; links.Count == 0 && attempt < proxies.Count; attempt++)
            {
                try
                {
                    Extraction.Http.HttpResponseMgr response = new Extraction.Http.HttpResponseMgr(pageUrl, proxies[attempt]);
                    ExtractLinks(response.GetResult, baseUri, links);
                }
                catch (Exception ex)
                {
                    // Best effort: the exception types thrown by the HTTP helper are
                    // not visible here, so any failure just moves on to the next
                    // proxy, but it is reported instead of being silently dropped.
                    Trace.TraceWarning("Fetching '{0}' via proxy '{1}' failed: {2}", pageUrl, proxies[attempt], ex.Message);
                }
            }

            return links;
        }

        /// <summary>
        /// Pulls the href targets of plain-text anchors out of <paramref name="html"/>,
        /// resolves them against <paramref name="baseUri"/> and appends them to
        /// <paramref name="links"/>. Hrefs that cannot be resolved are skipped
        /// individually so one malformed link does not discard the rest of the page.
        /// </summary>
        private static void ExtractLinks(string html, Uri baseUri, List<Uri> links)
        {
            if (string.IsNullOrEmpty(html))
            {
                return;
            }

            // NOTE(review): the fourth replacement targets the typographic left single
            // quote; presumably it was meant to normalise single-quoted attributes - confirm.
            string text = html
                .Replace("\r\n", "")
                .Replace("\r", "")
                .Replace("\n", "")
                .Replace("‘", "\"")
                .Replace("&nbsp;", "")
                .Replace(" ", "");
            text = HeadBlock.Replace(text, "");
            text = ScriptBlock.Replace(text, "");

            foreach (Match anchor in TextAnchor.Matches(text))
            {
                string href = anchor.Groups[1].Value;
                if (Array.Exists(SkippedLinkFragments, fragment => href.Contains(fragment)))
                {
                    continue;
                }

                Uri absolute;
                if (Uri.TryCreate(baseUri, href, out absolute))
                {
                    links.Add(absolute);
                }
            }
        }

        /// <summary>
        /// True when <paramref name="candidate"/> is hosted on the start URL's host
        /// or one of its sub-domains. The host is compared rather than the whole
        /// URL so that an external link merely mentioning the domain in its path
        /// or query string is not followed.
        /// </summary>
        private bool IsSameSite(Uri candidate)
        {
            string host = candidate.Host;
            return host.Equals(_host, StringComparison.OrdinalIgnoreCase)
                || host.EndsWith("." + _host, StringComparison.OrdinalIgnoreCase);
        }
    }
}

如何遍历一个网站的所有页面

标签:

原文地址:http://my.oschina.net/u/855028/blog/413750

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!