码迷,mamicode.com
首页 > Windows程序 > 详细

简简单单C#爬虫小计

时间:2015-09-04 12:38:31      阅读:194      评论:0      收藏:0      [点我收藏+]

标签:

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;

namespace 正则
{
    /// <summary>
    /// Minimal console crawler: downloads a listing page, follows every link whose
    /// href contains "article", extracts the page title and the first
    /// <c>&lt;div class="content"&gt;</c> block, and saves them as numbered text
    /// files under a "data" folder in the current working directory.
    /// </summary>
    class Program
    {
        /// <summary>Listing page whose article links are crawled.</summary>
        private const string ListingUrl = "http://www.admin5.com/browse/177/";

        // The patterns never change, so build them once instead of once per article.
        private static readonly Regex HrefPattern = new Regex("(?<=href=\").*?(?=\")");
        private static readonly Regex TitlePattern = new Regex("(?<=title>).*?(?=</title>)");
        private static readonly Regex ContentPattern = new Regex("<div class=\"content\">[\\s\\S]*?</div>");

        /// <summary>
        /// Entry point. Crawls <see cref="ListingUrl"/> and writes one file per article.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            Uri listingUri = new Uri(ListingUrl);
            string html = GetHtml(ListingUrl, Encoding.UTF8);
            string dataDirectory = Path.Combine(Directory.GetCurrentDirectory(), "data");
            int fileIndex = 1;

            foreach (Match link in HrefPattern.Matches(html))
            {
                if (!link.Value.Contains("article"))
                {
                    continue;
                }

                // Resolve the href against the listing page so relative, root-relative
                // and absolute links all yield a fetchable URL, and so the URL printed
                // is the one actually requested.
                Uri articleUri;
                if (!Uri.TryCreate(listingUri, link.Value, out articleUri))
                {
                    continue;
                }

                // Skip hrefs such as "javascript:" or "mailto:" that WebRequest cannot fetch.
                if (articleUri.Scheme != Uri.UriSchemeHttp && articleUri.Scheme != Uri.UriSchemeHttps)
                {
                    continue;
                }

                Console.WriteLine(articleUri.AbsoluteUri);
                Console.WriteLine("抓取内容");
                string content = GetHtml(articleUri.AbsoluteUri, Encoding.UTF8);

                Match titleMatch = TitlePattern.Match(content);
                Match bodyMatch = ContentPattern.Match(content);
                if (!titleMatch.Success || !bodyMatch.Success)
                {
                    // The page does not have the expected layout; skip it rather
                    // than crash on a missing match.
                    Console.WriteLine("跳过:未找到标题或正文");
                    continue;
                }

                Console.WriteLine("保存数据");
                // CreateDirectory is a no-op when the folder already exists.
                Directory.CreateDirectory(dataDirectory);
                File.WriteAllText(
                    Path.Combine(dataDirectory, fileIndex + ".txt"),
                    titleMatch.Value + "\r\n" + bodyMatch.Value);
                fileIndex++;
                Console.WriteLine("保存成功");
            }

            Console.WriteLine("ok");
            Console.ReadKey();
        }

        /// <summary>
        /// Downloads the resource at <paramref name="url"/> and returns its body as text.
        /// </summary>
        /// <param name="url">Absolute URL to request.</param>
        /// <param name="encoding">
        /// Encoding used to decode the response body; UTF-8 is used when this is null.
        /// </param>
        /// <returns>The full response body decoded as a string.</returns>
        /// <exception cref="WebException">The request failed.</exception>
        /// <exception cref="UriFormatException"><paramref name="url"/> is not a valid URI.</exception>
        private static string GetHtml(string url, Encoding encoding)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

            // Dispose the response, stream and reader deterministically so the
            // underlying connection is released after every request.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream body = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(body, encoding ?? Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }
    }
}

  技术分享

 

技术分享

简简单单C#爬虫小计

标签:

原文地址:http://www.cnblogs.com/hexd1230/p/4781526.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!