标签:
读取过多 URL 时会出现问题
# coding=utf-8
"""Report the domains listed in each site's ``/crossdomain.xml`` policy file.

Site URLs are read one per line from ``xmlsource.txt``; the report is printed
to stdout and also written to ``reslut.txt``.

Each site is downloaded exactly once, with a timeout, and a site that cannot
be fetched is reported and skipped instead of aborting the whole run.
"""
import io
import re
import time

try:  # Python 3
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib2 import urlopen

# Captures everything from ``domain=`` up to (not including) the closing
# ``/>`` of the same tag; ``.`` does not cross line breaks.
DOMAIN_PATTERN = re.compile(r'(?=domain=)(.*?)(?=/>)')


def extract_domains(xml_text):
    """Return the raw ``domain=...`` fragments found in *xml_text*."""
    return DOMAIN_PATTERN.findall(xml_text)


def clean_domain(fragment):
    """Strip quotes, the ``domain=`` prefix and outer whitespace."""
    return fragment.replace('"', '').replace('domain=', '').strip()


def getxml(url, timeout=10):
    """Download ``<url>/crossdomain.xml`` and return its raw domain fragments.

    Args:
        url: Base URL of the site; trailing slashes are ignored so the
            request path never contains ``//crossdomain.xml``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of strings such as ``'domain="*.example.com" '`` (see
        ``clean_domain`` for turning them into bare domain names).

    Raises:
        EnvironmentError: The site is unreachable, times out or answers with
            an HTTP error status.
        ValueError: *url* is not a URL the opener understands.
    """
    response = urlopen(url.rstrip('/') + '/crossdomain.xml', timeout=timeout)
    try:
        payload = response.read()
    finally:
        response.close()
    # The payload is bytes; undecodable bytes must not abort the whole scan.
    return extract_domains(payload.decode('utf-8', 'replace'))


def _emit(report, text):
    """Print *text* and append it, newline-terminated, to *report*."""
    print(text)
    report.write(text + u'\n')


def main(source_path='xmlsource.txt', result_path='reslut.txt', delay=1):
    """Scan every site listed in *source_path* and write the report.

    Args:
        source_path: Text file with one site URL per line; blank lines are
            skipped.
        result_path: File that receives a copy of everything printed.
        delay: Seconds to pause after each site.
    """
    with io.open(source_path, 'r', encoding='utf-8-sig') as source:
        sites = [line.strip() for line in source]

    with io.open(result_path, 'w', encoding='utf-8') as report:
        for site in sites:
            if not site:
                continue
            try:
                # Fetch once and reuse the result for the count and the list.
                fragments = getxml(site)
            except (EnvironmentError, ValueError) as err:
                _emit(report,
                      u'website:{0} fetch failed: {1}'.format(site, err))
            else:
                _emit(report, u'website:{0} have {1} domain:'.format(
                    site, len(fragments)))
                for fragment in fragments:
                    _emit(report, clean_domain(fragment))
            _emit(report, u'\n')
            time.sleep(delay)
        _emit(report, u'Over')


if __name__ == '__main__':
    main()
xmlsource.txt(每行一个 URL):
http://www.sina.com.cn/
http://www.discuz.net/
http://www.rising.com.cn/
http://www.ifeng.com/
http://www.sdo.com/
http://www.sogou.com/
http://www.163.com/
标签:
原文地址:http://www.cnblogs.com/crac/p/5451639.html