最先想试试 Python 的爬虫功能，于是用 easy_install 安装了 BeautifulSoup。下面是我写的 demo 代码，可以简单看看。
#coding=utf-8 import urllib2 from BeautifulSoup import BeautifulSoup as bs url_addr = 'http://car.autohome.com.cn/baoyang/detail_74_442_0_0_0_57.html' def save_to_file(content, fileName): with open(fileName, 'w+') as f: if not f: return False f.write(content) f.close() def parse_content(content): soup = bs(content) tds = soup.findAll('td') for i in tds: print i.text def is_unicode(c): if isinstance(c, unicode): return True return False def is_asii(c): if isinstance(c, str): return True return False if __name__ == '__main__': content = urllib2.urlopen(url_addr).read() #对于任何格式的str先转换为中间编码unicode #调用decode转换为unicode编码【根据自身的编码】 #然后再调用encode将unicode转换为指定的编码 #将content【编码为gb2312】转换为unicode编码 c = content.decode('gb2312') print('c is unicode:', is_unicode(c)) #print(c) save_to_file(c.encode('gb2312'), 'd:/content.html') parse_content(content)
原文地址:http://blog.csdn.net/davidsu33/article/details/43764961