标签:style class blog c code java
http://www.catalogueoflife.org/col/browse/classification
这是一个国外的生物信息网站
今天的代码可以抓取指定分类的信息(id,学名)
没有把多线程写进去,略失败...
运用:WebClient、Regex、文件 IO(System.IO)
项目在>>>开源中国
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using System.Net;
using System.Text.RegularExpressions;
using System.Threading;
using System.IO;

namespace cateoflife
{
    /// <summary>
    /// Form that downloads a numbered range of pages, extracts every match of a
    /// user-supplied regular expression from each page, and appends one
    /// "id TAB name" line per match to a text file named after the start page.
    /// </summary>
    public partial class Form1 : Form
    {
        // Pulls the run of digits that follows a '/' in a matched fragment.
        private static readonly Regex IdPattern = new Regex("(?<=/)\\d+");

        // Pulls the word(s) that follow a '>' in a matched fragment.
        private static readonly Regex NamePattern = new Regex("(?<=>)\\w+\\s*\\w+");

        WebClient wc = new WebClient();
        int start;
        int end;
        string url;
        string reg;
        string msg;
        int now = 1;

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Reads the start page (textBox2), end page (textBox3, where 0 means
        /// 99999), base URL (textBox1) and match pattern (textBox4), then
        /// downloads url + i for every i in [start, end] and appends the
        /// extracted records to "&lt;start&gt;.txt".
        /// </summary>
        /// <remarks>
        /// Runs synchronously on the calling thread. A page whose download fails
        /// is retried once; a second failure propagates to the caller, but the
        /// output file is still closed and already-written records are kept.
        /// </remarks>
        private void button1_Click(object sender, EventArgs e)
        {
            // Validate all input before touching the file system so that bad
            // input can never leave an open file handle behind.
            int first;
            int last;
            if (!int.TryParse(textBox2.Text, out first) || !int.TryParse(textBox3.Text, out last))
            {
                MessageBox.Show("Start and end page must be whole numbers.");
                return;
            }

            start = first;
            end = (last == 0) ? 99999 : last;
            url = textBox1.Text;
            reg = textBox4.Text;
            wc.Encoding = Encoding.UTF8;

            // Parse the user's pattern once, up front, instead of once per page.
            Regex pattern;
            try
            {
                pattern = new Regex(reg);
            }
            catch (ArgumentException ex)
            {
                MessageBox.Show("Invalid regular expression: " + ex.Message);
                return;
            }

            // append: true keeps existing content, like the original seek-to-end.
            using (StreamWriter w = new StreamWriter(start + ".txt", true))
            {
                for (int i = start; i <= end; i++)
                {
                    string htm = DownloadWithRetry(url + i);
                    foreach (Match m in pattern.Matches(htm))
                    {
                        gettxt(m.ToString());
                        w.Write(msg);
                    }

                    // Flush per page so finished pages survive a later failure.
                    w.Flush();
                }
            }
        }

        /// <summary>
        /// Downloads <paramref name="address"/> as a string, retrying exactly
        /// once when the first attempt raises a <see cref="WebException"/>.
        /// </summary>
        /// <exception cref="WebException">The retry failed as well.</exception>
        private string DownloadWithRetry(string address)
        {
            try
            {
                return wc.DownloadString(address);
            }
            catch (WebException)
            {
                // Only the download is repeated, so a retry can never write
                // the same records twice.
                return wc.DownloadString(address);
            }
        }

        /// <summary>
        /// Stores the formatted record for <paramref name="html"/> in the
        /// <c>msg</c> field.
        /// </summary>
        void gettxt(string html)
        {
            msg = FormatRecord(html);
        }

        /// <summary>
        /// Builds one output line: the digits found after a '/', a tab, the
        /// word(s) found after a '>', and a CRLF. A part that does not match
        /// contributes an empty string.
        /// </summary>
        private static string FormatRecord(string html)
        {
            string id = IdPattern.Match(html).ToString();
            string name = NamePattern.Match(html).ToString();
            return id + "\t" + name + "\r\n";
        }
    }
}
标签:style class blog c code java
原文地址:http://www.cnblogs.com/Fadinglemon/p/3737058.html