标签:CSV
1、urllib.request.urlretrieve可以根据文件的URL下载文件:
# -*- coding: utf-8 -*-
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Fetch the site's front page and parse it with the lxml parser.
html = urlopen("http://www.pythonscraping.com/")
bsObj = BeautifulSoup(html, "lxml")
# The logo image URL: the src of the <img> nested inside <a id="logo">.
imageLocation = bsObj.find("a", {"id":"logo"}).find("img")["src"]
#print(imageLocation)
urlretrieve(imageLocation, "logo.jpg")
这段程序从 http://pythonscraping.com 下载logo图片，然后在程序运行的文件夹里保存为logo.jpg文件。
下面的程序会把 http://pythonscraping.com 主页上所有src属性且图片后缀为.jpg的文件都下载下来:
# -*- coding: utf-8 -*-
import os
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Local folder that will mirror the site's path structure.
downloadDirectory = "downloaded"
# Note: deliberately without "www." — getAbsoluteURL strips "www." so
# downloaded URLs match this base when filtered.
baseUrl = "http://pythonscraping.com"
def getAbsoluteURL(baseUrl, source):
    """Resolve a (possibly relative) src attribute against baseUrl.

    Returns an absolute http:// URL with any leading "www." stripped so it
    matches baseUrl's host form, or None when the resolved URL does not
    live under baseUrl (i.e. the file is hosted on another domain).
    """
    if source.startswith("http://www."):
        # Strip the "www." (11 chars of "http://www.") and re-add the scheme.
        url = "http://" + source[11:]
    elif source.startswith("http://"):
        url = source
    elif source.startswith("www."):
        # BUG FIX: the original assigned source[4:] and then immediately
        # overwrote it with "http://" + source, leaving "www." in the URL,
        # which made the baseUrl containment check below reject it.
        url = "http://" + source[4:]
    else:
        # Anything else is treated as a path relative to the site root.
        url = baseUrl + "/" + source
    if baseUrl not in url:
        # External resource — skip it.
        return None
    return url
def getDownloadPath(baseUrl, absoluteUrl, downloadDirectory):
    """Map an absolute URL to a local file path under downloadDirectory.

    Strips "www." and the baseUrl prefix, then prepends the download
    directory; for .jpg targets the intermediate directories are created.
    """
    path = absoluteUrl.replace("www.", "")
    path = path.replace(baseUrl, "")
    path = downloadDirectory + path
    if path.endswith(".jpg"):
        directory = os.path.dirname(path)
        # exist_ok=True removes the race between an exists() check and
        # makedirs() (the original's two-step check could still raise).
        os.makedirs(directory, exist_ok=True)
    #print(path)
    return path
# Fetch the home page and collect every tag that carries a src attribute
# (images, scripts, ...), not only .jpg files.
html = urlopen("http://www.pythonscraping.com")
bsObj = BeautifulSoup(html, "lxml")
downloadList = bsObj.findAll(src=True)
for download in downloadList:
#print(download["src"])
# Resolve each src to an absolute on-site URL; None means external/skipped.
fileUrl = getAbsoluteURL(baseUrl, download["src"])
if fileUrl is not None:
print(fileUrl)
urlretrieve(fileUrl, getDownloadPath(baseUrl, fileUrl, downloadDirectory))
2、
# -*- coding: utf-8 -*-
import csv
# NOTE(review): csv.writer expects the file opened with newline='';
# without it this produces an extra blank line between rows on Windows —
# confirm against the csv module docs before relying on this snippet.
csvFile = open("test.csv", 'w+')
try:
writer = csv.writer(csvFile)
# Header row, then ten data rows (i, i+2, i*2).
writer.writerow(('number', 'number plus 2', 'number times 2'))
for i in range(10):
writer.writerow( (i, i+2, i*2))
finally:
csvFile.close()
运行以上代码后，你会看到一个CSV文件：
| number | number plus 2 | number times 2 |
| 0 | 2 | 0 |
| 1 | 3 | 2 |
| 2 | 4 | 4 |
| 3 | 5 | 6 |
| 4 | 6 | 8 |
| 5 | 7 | 10 |
| 6 | 8 | 12 |
| 7 | 9 | 14 |
| 8 | 10 | 16 |
| 9 | 11 | 18 |
获取维基百科词条中的HTML表格并写入CSV文件。
# -*- coding: utf-8 -*-
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Fetch the Wikipedia comparison page and parse it with lxml.
html = urlopen("http://en.wikipedia.org/wiki/Comparison_of_text_editors")
bsObj = BeautifulSoup(html, "lxml")
# The main comparison table is the first "wikitable" on the page.
table = bsObj.findAll("table",{"class":"wikitable"})[0]
rows = table.findAll("tr")
# newline="" is required by csv.writer; utf-8 handles non-ASCII cells.
csvFile = open("editors.csv", 'wt', newline="", encoding='utf-8')
writer = csv.writer(csvFile)
try:
for row in rows:
csvRow = []
# Collect both header (th) and data (td) cells in document order.
for cell in row.findAll(['td', 'th']):
csvRow.append(cell.get_text())
writer.writerow(csvRow)
finally:
csvFile.close()
标签:CSV
原文地址:http://blog.51cto.com/9473774/2109036