码迷,mamicode.com
首页 > 其他好文 > 详细

C# 访问一个页面,并根据该页面上的a标签递归访问每个页面

时间:2014-05-08 20:30:34      阅读:389      评论:0      收藏:0      [点我收藏+]

标签:des   style   blog   class   code   java   

bubuko.com,布布扣
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.IO;
using System.Net;
using HtmlAgilityPack;

namespace SimpleWebRobot
{
    /// <summary>
    /// Console crawler: reads start URLs from list.txt, loads each page with HtmlAgilityPack,
    /// and recursively follows every same-host &lt;a href&gt; link found on the pages it loads.
    /// Progress and errors are reported through <c>Output.WriteLine</c> together with the
    /// log.txt writer (Output is declared elsewhere in the project; presumably it echoes to
    /// both the console and the writer - confirm against its definition).
    /// </summary>
    class Program
    {
        /// <summary>Number of distinct URLs for which a visit was attempted.</summary>
        public static int LINKCOUNT = 0;

        /// <summary>Number of attempted visits that ended in an exception.</summary>
        public static int ERRCOUNT = 0;

        /// <summary>
        /// Entry point: opens log.txt, loads the start URLs from list.txt (one per line),
        /// crawls each of them and finally prints the success/error totals.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            Output.WriteLine("Createing log ...");
            LINKCOUNT = 0;
            StreamWriter fs = new StreamWriter("log.txt");
            try
            {
                Output.WriteLine("Start at :" + DateTime.Now.ToString(), fs);

                // Load the list of start links.
                Output.WriteLine("Loading start links list ...", fs);
                IList<string> linkList = new List<string>();
                // 'using' guarantees the reader is released even if reading or logging throws.
                using (StreamReader sr = new StreamReader(File.OpenRead("list.txt")))
                {
                    string line;
                    while ((line = sr.ReadLine()) != null)
                    {
                        line = line.Trim();
                        if (line.Length == 0)
                        {
                            // A blank line is not a URL; queuing it would only produce a
                            // spurious "invalid URI" error later on.
                            continue;
                        }
                        linkList.Add(line);
                        Output.WriteLine("URL added:\"" + line + "\"", fs);
                    }
                }
                Output.WriteLine("Loading link complate.", fs);

                // Crawl every start link; the visited list is shared so that a page
                // reachable from several start links is only loaded once.
                IList<string> visitedList = new List<string>();
                foreach (string link in linkList)
                {
                    DoVisit(link, ref visitedList, fs);
                }

                Output.WriteLine("Finished...", fs);
                Output.WriteLine("Visited " + LINKCOUNT + " links in all. " + (LINKCOUNT - ERRCOUNT) + " SUCCESS AND " + ERRCOUNT + " ERRORS", fs);
            }
            catch (Exception ex)
            {
                // Best effort: report the failure and fall through to the shutdown prompt.
                LogError(ex, fs);
            }
            finally
            {
                fs.Close();
                Output.WriteLine("PRESS ANY KEY TO CLOSE...");
                Console.Read();
            }
        }

        /// <summary>
        /// Visits <paramref name="link"/> unless it was visited before, then recurses into
        /// every same-host link found in the page's &lt;a href&gt; elements.
        /// Any exception raised while loading or parsing the page is logged and counted in
        /// <see cref="ERRCOUNT"/>; it never propagates to the caller.
        /// </summary>
        /// <param name="link">Absolute URL of the page to load.</param>
        /// <param name="visitedList">Normalized keys of the URLs already visited; this call adds to it.</param>
        /// <param name="fs">Log writer handed to <c>Output.WriteLine</c>.</param>
        private static void DoVisit(string link, ref IList<string> visitedList, StreamWriter fs)
        {
            // Compute the de-duplication key once instead of once per lookup and once per insert.
            string visitKey = AddToVistitedList(link);
            if (visitedList.Contains(visitKey))
            {
                return;
            }

            visitedList.Add(visitKey);
            Output.WriteLine("Visiting :" + link, fs);
            LINKCOUNT++;
            try
            {
                Uri pageUri = new Uri(link);
                HtmlWeb hw = new HtmlWeb();
                HtmlDocument doc = hw.Load(link);
                if (doc == null || doc.DocumentNode == null)
                {
                    throw new InvalidOperationException("Can not visit this url!");
                }
                if (hw.StatusCode != HttpStatusCode.OK)
                {
                    throw new InvalidOperationException("Can not visit this url! StatusCode:" + hw.StatusCode.ToString());
                }

                // SelectNodes yields null (not an empty collection) when nothing matches,
                // so a page without anchors must not be enumerated.
                HtmlNodeCollection anchors = doc.DocumentNode.SelectNodes("//a[@href]");
                if (anchors == null)
                {
                    return;
                }

                foreach (HtmlNode a in anchors)
                {
                    string innerLink = ResolveInnerLink(pageUri, a.GetAttributeValue("href", null));
                    if (innerLink != null)
                    {
                        DoVisit(innerLink, ref visitedList, fs);
                    }
                }
            }
            catch (Exception ex)
            {
                LogError(ex, fs);
                ERRCOUNT++;
            }
        }

        /// <summary>
        /// Turns the raw <c>href</c> of an anchor into the absolute URL to crawl next.
        /// </summary>
        /// <param name="pageUri">URL of the page the anchor was found on; used as the base for relative links.</param>
        /// <param name="href">Raw attribute value; may be null, empty, relative or absolute.</param>
        /// <returns>
        /// The absolute URL with every ".html" removed, or null when the link must be skipped:
        /// empty, fragment-only, unparsable, not http/https (javascript:, mailto:, ...),
        /// or pointing at a different host than <paramref name="pageUri"/>.
        /// </returns>
        private static string ResolveInnerLink(Uri pageUri, string href)
        {
            if (string.IsNullOrEmpty(href))
            {
                return null;
            }

            href = href.Trim();
            if (href.Length == 0 || href.StartsWith("#", StringComparison.Ordinal))
            {
                // In-page anchors lead back to the page that is already being processed.
                return null;
            }

            // Let System.Uri resolve "/x", "../x" (any depth), "x", "./x" and "//host/x"
            // against the page URL; this keeps the page's scheme and port. A malformed href
            // is skipped instead of aborting the remaining links of the page.
            Uri resolved;
            if (!Uri.TryCreate(pageUri, href, out resolved))
            {
                return null;
            }

            // Only web pages can be loaded; this also filters javascript: and mailto: links.
            if (resolved.Scheme != Uri.UriSchemeHttp && resolved.Scheme != Uri.UriSchemeHttps)
            {
                return null;
            }

            // Stay on the site the crawl started from.
            if (!string.Equals(resolved.Host, pageUri.Host, StringComparison.OrdinalIgnoreCase))
            {
                return null;
            }

            // Business rule carried over from the original code: drop ".html" from the URL.
            // NOTE(review): the original comment described this as converting *.html pages to
            // *.aspx pages, but the code only strips the extension - confirm which is intended.
            return resolved.AbsoluteUri.Replace(".html", "");
        }

        /// <summary>
        /// Builds the key under which a URL is remembered in the visited list: the text
        /// "http://" is removed, everything from the first '#' on is cut off (unless the
        /// '#' is the very first character), and the result is lower-cased.
        /// Despite its name this method does not add anything to a list.
        /// </summary>
        /// <param name="url">URL to normalize.</param>
        /// <returns>The normalized, lower-case key.</returns>
        private static string AddToVistitedList(string url)
        {
            url = url.Replace("http://", "");
            int fragmentStart = url.IndexOf('#');
            if (fragmentStart > 0)
            {
                url = url.Substring(0, fragmentStart);
            }
            // Invariant casing keeps the key independent of the machine's culture.
            return url.ToLowerInvariant();
        }

        /// <summary>
        /// Reports an exception message in red and then restores the console's colors.
        /// </summary>
        /// <param name="ex">Exception whose message is reported.</param>
        /// <param name="fs">Log writer handed to <c>Output.WriteLine</c>.</param>
        private static void LogError(Exception ex, StreamWriter fs)
        {
            Console.ForegroundColor = ConsoleColor.Red;
            Output.WriteLine("Exception!! :" + ex.Message, fs);
            // ResetColor restores the user's default rather than forcing white.
            Console.ResetColor();
        }
    }
}
bubuko.com,布布扣

list.txt、log.txt 放在 Debug 文件夹下,list.txt 文件内容如下:

http://dev.static.com/en-gb/home
http://dev.static.com/de-de/home
http://dev.static.com/it-it/home
http://dev.static.com/en-gb/404
http://dev.static.com/de-de/404
http://dev.static.com/it-it/404

 

C# 访问一个页面,并根据该页面上的a标签递归访问每个页面,布布扣,bubuko.com

C# 访问一个页面,并根据该页面上的a标签递归访问每个页面

标签:des   style   blog   class   code   java   

原文地址:http://www.cnblogs.com/1017283242zhu/p/3709499.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!