标签:
HtmlAgilityPack是一个开源的解析HTML元素的类库,最大的特点是可以通过XPath来解析HMTL,如果您以前用C#操作过XML,那么使用起HtmlAgilityPack也会得心应手。目前最新版本为1.4.6。
程序示例如下:
代码如下:
using HtmlAgilityPack; using System; using System.Collections.Generic; using System.ComponentModel; using System.Data; using System.Drawing; using System.IO; using System.Linq; using System.Net; using System.Text; using System.Text.RegularExpressions; using System.Threading; using System.Threading.Tasks; using System.Windows.Forms; namespace WinformHtmlAgilityPack { public partial class Form1 : Form { public Form1() { InitializeComponent(); } private void Form1_Load(object sender, EventArgs e) { Thread thread = new Thread(DownloadData); thread.Start(); } private void DownloadData() { GZipWebClient c = new GZipWebClient(); List<HouseModel> houseList = new List<HouseModel>(); int pageCount = 50; int index = 1; for (int m = 1; m <= pageCount; m++) { byte[] data = c.DownloadData(new Uri("http://newhouse.fang.com/house/s/b9" + m + "/")); string strx = Encoding.Default.GetString(data); HtmlWeb htmlWeb = new HtmlWeb(); HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument(); htmlDoc.LoadHtml(strx); //HtmlNode node = htmlDoc.GetElementbyId("sjina_C01_37"); //HtmlNode node = htmlDoc.DocumentNode.SelectSingleNode("//div[@class=‘nl_con clearfix‘]"); //HtmlNode list = node.SelectSingleNode("//ul"); //根据xpath ul下的li li[2]代表索引 得到 名称 //HtmlNode nodeli_name = htmlDoc.DocumentNode.SelectSingleNode("//*[@id=‘bx1‘]/div/div[1]/div[1]/div/div/ul/li[1]/div/div[2]/div[1]/div[1]/a"); //根据xpath ul下的li li[2]代表索引 得到 价格 //HtmlNode nodeli_price = htmlDoc.DocumentNode.SelectSingleNode("//*[@id=‘bx1‘]/div/div[1]/div[1]/div/div/ul/li[3]/div/div[2]/div[4]/span"); for (int i = 1; i <= 20; i++) { //通过Xpath选中html的指定元素 HouseModel houseModel = new HouseModel(); HtmlNode nodeli_name = htmlDoc.DocumentNode.SelectSingleNode("//*[@id=‘bx1‘]/div/div[1]/div[1]/div/div/ul/li[" + i + "]/div/div[2]/div[1]/div[1]/a"); if (nodeli_name != null) { houseModel.楼盘名称 = NoHTML(nodeli_name.InnerHtml); } HtmlNode nodeli_price = htmlDoc.DocumentNode.SelectSingleNode("//*[@id=‘bx1‘]/div/div[1]/div[1]/div/div/ul/li[" + i + "]/div/div[2]/div[4]/span"); if (nodeli_price != null) { houseModel.价格 = NoHTML(nodeli_price.InnerHtml); } HtmlNode nodeli_unit = htmlDoc.DocumentNode.SelectSingleNode("//*[@id=‘bx1‘]/div/div[1]/div[1]/div/div/ul/li[" + i + "]/div/div[2]/div[4]/em"); if (nodeli_unit != null) { houseModel.单位 = NoHTML(nodeli_unit.InnerHtml); } HtmlNode nodeli_address = htmlDoc.DocumentNode.SelectSingleNode("//*[@id=‘bx1‘]/div/div[1]/div[1]/div/div/ul/li[" + i + "]/div/div[2]/div[2]/div[1]/a"); if (nodeli_address != null) { string district = nodeli_address.InnerHtml.Split(‘]‘)[0]; houseModel.所在区域 = NoHTML(district).Substring(1); houseModel.地址 = NoHTML(nodeli_address.InnerHtml); } houseModel.序号 = index; houseList.Add(houseModel); this.label1.Invoke(new MethodInvoker(delegate { this.label1.Text = "已经抓取:"+houseList.Count.ToString()+" 条....."; })); index++; this.dataGridView1.Invoke(new MethodInvoker(delegate { this.dataGridView1.DataSource = null; this.dataGridView1.DataSource = houseList; this.dataGridView1.Columns[5].Width = 500; })); Thread.Sleep(100); } } this.label1.Invoke(new MethodInvoker(delegate { this.label1.Text = "已经抓取完毕。"; })); } public class GZipWebClient : WebClient { protected override WebRequest GetWebRequest(Uri address) { HttpWebRequest webrequest = (HttpWebRequest)base.GetWebRequest(address); webrequest.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate; return webrequest; } } public class HouseModel { public int 序号 { get; set; } public string 楼盘名称 { get; set; } public string 所在区域 { get; set; } public string 价格 { get; set; } public string 单位 { get; set; } public string 地址 { get; set; } } private void btn_exportexcel_Click(object sender, EventArgs e) { DataGridViewToExcel(this.dataGridView1); } #region private void DataGridViewToExcel(DataGridView dgv) { SaveFileDialog dlg = new SaveFileDialog(); dlg.Filter = "Execl files (*.xls)|*.xls"; dlg.FilterIndex = 0; dlg.RestoreDirectory = true; dlg.CreatePrompt = true; dlg.Title = "保存为Excel文件"; if (dlg.ShowDialog() == DialogResult.OK) { Stream myStream; myStream = dlg.OpenFile(); StreamWriter sw = new StreamWriter(myStream, System.Text.Encoding.GetEncoding(-0)); string columnTitle = ""; try { //写入列标题 for (int i = 0; i < dgv.ColumnCount; i++) { if (i > 0) { columnTitle += "\t"; } columnTitle += dgv.Columns[i].HeaderText; } sw.WriteLine(columnTitle); //写入列内容 for (int j = 0; j < dgv.Rows.Count; j++) { string columnValue = ""; for (int k = 0; k < dgv.Columns.Count; k++) { if (k > 0) { columnValue += "\t"; } if (dgv.Rows[j].Cells[k].Value == null) columnValue += ""; else columnValue += dgv.Rows[j].Cells[k].Value.ToString().Trim(); } sw.WriteLine(columnValue); } sw.Close(); myStream.Close(); } catch (Exception e) { MessageBox.Show(e.ToString()); } finally { sw.Close(); myStream.Close(); } } } #endregion public static string NoHTML(string Htmlstring) { //删除脚本 Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase); //删除HTML Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\xa2", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\xa3", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase); Htmlstring.Replace("<", ""); Htmlstring.Replace(">", ""); Htmlstring.Replace("\r\n", ""); return Htmlstring; } } }
标签:
原文地址:http://www.cnblogs.com/sunyj/p/5025240.html