标签:
CCCF
《中国计算机学会通讯》月刊(Communications of the CCF, 简称CCCF)由中国计算机学会主办,高等教育出版社出版,面向计算机专业人士及信息领域的相关人士。杂志利用学会的学术优势,组织信息技术各个领域最有影响的专家撰稿,全面、宏观介绍计算机科学技术发展的最新发展状况,预测未来技术发展趋势,可以帮助读者更加开阔视野,了解IT最前沿的动态,把握IT发展方向,具有权威性和指导性,适合与计算机相关的科研、教学,以及产业和管理等各方面的人士阅读。
地址:http://www.ccf.org.cn/sites/ccf/zgjsjxhtx.jsp
下载问题
先来看一下下载过程:进入CCCF的页面之后,如果我们想下载某一期通讯下边的文章,就要依次点击期刊 → 点击标题 → 点击下载,然后就会得到一个类似于“0.pdf”,“1.pdf”的文件。
假设一期通讯中有15篇,要下载全部文章的话,要点击几十次鼠标,下载之后的文件名也都是数字,如果要逐个修改的话,也要花费一定时间。
解决方案(2个程序):
(1)提供期刊id,自动下载该期刊下的全部文章;
GitHub:https://github.com/cheesezhe/ccf_crawler (里边有帮助文档)
(2)自动下载全部期刊的全部文章,直接运行源代码就行;
源代码:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Download every article of every CCCF issue from www.ccf.org.cn.

Progress is persisted in three plain-text files in the working directory so
that an interrupted run can be resumed:

* ``dl_list.txt``   -- ids of papers already downloaded (one per line)
* ``dl_list_j.txt`` -- ids of journals whose papers are all done
* ``err_list.txt``  -- "journal_id paper_id" pairs whose download failed
* ``err_log.txt``   -- source url and exception text of failed reads
"""
__author__ = 'ZhangHe'

import os
import re

try:  # Python 2 (what the script was originally written for)
    from urllib2 import urlopen
    from urllib import quote
    from httplib import IncompleteRead
except ImportError:  # Python 3 locations of the same names
    from urllib.request import urlopen
    from urllib.parse import quote
    from http.client import IncompleteRead


JOURNAL_PAGE_URL = 'http://www.ccf.org.cn/sites/ccf/jsjtbbd.jsp?contentId='
PAPER_PAGE_URL = 'http://www.ccf.org.cn/sites/ccf/freexiazai.jsp?contentId='
DOWNLOAD_URL = 'http://www.ccf.org.cn/sites/ccf/download.jsp?file='

# Index pages that together list every issue.
JOURNAL_INDEX_URLS = [
    'http://www.ccf.org.cn/sites/ccf/zgjsjxhtx.jsp?jportal=SFXxdDjYKXLl06cz1fxjkzihsqP9JcoP',  # issues 89-118
    'http://www.ccf.org.cn/sites/ccf/zgjsjxhtx.jsp?jportal=SFXxdDjYKXLl06cz1fxjk%2FySA9FzIG2g',  # issues 59-88
    'http://www.ccf.org.cn/sites/ccf/zgjsjxhtx.jsp?jportal=SFXxdDjYKXLl06cz1fxjk7R3hW0kV5Np',  # issues 29-58
    'http://www.ccf.org.cn/sites/ccf/zgjsjxhtx.jsp?jportal=SFXxdDjYKXLl06cz1fxjk%2BP28%2Bg%2BBW1u',  # issues 01-28
]

# Patterns are compiled once here instead of on every call.
_PAPER_ID_RE = re.compile('<a target=.*?title=.*?href=.*?contentId=(.*?)">', re.S)
_PAPER_TITLE_RE = re.compile('<span id=.*?class="cfqwz">(.*?)</span>', re.S)
_JOURNAL_NAME_RE = re.compile('<title>(.*?)-.*?</title>', re.S)
_DOWNLOAD_LINK_RE = re.compile('class=""><a href="(.*?)">.*?</a></span>', re.S)
_JOURNAL_ID_RE = re.compile('<li id="(.*?)">.*?<a target=', re.S)


def _fetch_page(url):
    """Return the body of *url* decoded as UTF-8, closing the connection."""
    response = urlopen(url)
    try:
        return response.read().decode('utf-8')
    finally:
        response.close()


def _read_id_list(path):
    """Return the non-empty stripped lines of *path*; [] if it does not exist."""
    # On the very first run the bookkeeping files have not been created yet.
    if not os.path.exists(path):
        return []
    with open(path, 'r') as handle:
        return [line.strip() for line in handle if line.strip()]


def download_by_paper_url(src_url, dest_file):
    """Download the file at *src_url* and save it as *dest_file*.

    :param src_url: direct download link of the paper
    :param dest_file: path the downloaded bytes are written to
    :return: -1 if the body could not be read completely, otherwise None
    """
    response = urlopen(src_url)
    try:
        data = response.read()
    except IncompleteRead as e:
        with open('err_log.txt', 'a+') as err:  # error log
            # Log the exception itself; the original formatted the file
            # handle ``err`` here, which recorded nothing useful.
            err.write("%s %s\n" % (src_url, e))
        print('Error')
        return -1
    finally:
        response.close()
    with open(dest_file, "wb") as code:
        code.write(data)


def parse_data_from_journal_url(src_url):
    """Scrape paper ids, paper titles and the journal name from an issue page.

    :param src_url: url of the issue page
    :return: [paper_ids, paper_titles, journal_name]
    :raises IndexError: if the page has no ``<title>`` matching the pattern
    """
    content = _fetch_page(src_url)

    print('parsing paper IDs...')
    ids = _PAPER_ID_RE.findall(content)
    titles = _PAPER_TITLE_RE.findall(content)
    name = _JOURNAL_NAME_RE.findall(content)

    return [ids, titles, name[0].strip()]


def get_url_by_paper_id(id):
    """Resolve the download link of a paper from its id.

    :param id: paper id (the ``contentId`` query value); the name shadows the
        builtin but is kept because it is part of the existing signature
    :return: the download url, or -1 if the paper page contains no link
    """
    content = _fetch_page(PAPER_PAGE_URL + str(id))
    urls = _DOWNLOAD_LINK_RE.findall(content)
    # If there is no url, return -1
    if not urls:
        return -1
    # The file name (last path segment) may contain Chinese characters and
    # has to be percent-encoded before it can be used in a url.
    parts = urls[0].split('/')
    parts[-1] = quote(parts[-1].encode('utf-8'))
    # Every segment is prefixed with '/', exactly as the original loop did.
    return DOWNLOAD_URL + '/' + '/'.join(parts)


def get_all_journals_ids():
    """Collect the ids of all issues listed on the index pages.

    :return: list of journal ids, in page order
    """
    res = []
    for src_url in JOURNAL_INDEX_URLS:
        print('processing\t' + src_url)
        res.extend(_JOURNAL_ID_RE.findall(_fetch_page(src_url)))
    return res


def get_all_done_papers_ids():
    """Return the ids of all papers already downloaded (from dl_list.txt).

    :return: list of paper ids; empty if the file does not exist yet
    """
    return _read_id_list('dl_list.txt')


def get_all_done_journals_ids():
    """Return the ids of all fully downloaded journals (from dl_list_j.txt).

    :return: list of journal ids; empty if the file does not exist yet
    """
    return _read_id_list('dl_list_j.txt')


def create_new_directory(dir_name):
    """Create the directory *dir_name*; an already existing one is fine.

    :param dir_name: path of the directory to create
    :raises OSError: if the directory could not be created and is not there
    """
    try:
        os.mkdir(dir_name)
    except OSError:
        # WindowsError only exists on Windows (and is an OSError subclass
        # there), so catch OSError and ignore just the "already exists" case.
        if not os.path.isdir(dir_name):
            raise


def get_paper_title(origin_title):
    """Turn a scraped paper title into something usable as a file name.

    :param origin_title: title as scraped from the page
    :return ret: stripped title with '/' -> '-', '?' removed, '*' -> '_x_'
    """
    ret = origin_title.strip()
    ret = ret.replace('/', '-')
    ret = ret.replace('?', '')
    ret = ret.replace('*', '_x_')
    return ret


def main():
    """Download every paper of every journal that is not marked as done."""
    # Step 1: journal ids plus the ids already recorded as finished.
    all_journals_ids = get_all_journals_ids()
    # Sets give O(1) membership tests inside the loops below.
    all_done_journals_ids = set(get_all_done_journals_ids())
    all_done_papers_ids = set(get_all_done_papers_ids())

    # Step 2: process the journals one by one.
    for journal_id in all_journals_ids:
        # Skip journals that were completely downloaded earlier.
        if journal_id in all_done_journals_ids:
            print('%s has been downloaded.' % (journal_id))
            continue
        paper_ids, paper_titles, journal_name = parse_data_from_journal_url(
            JOURNAL_PAGE_URL + journal_id)
        print('Start Download %s\t %s' % (journal_id, journal_name))
        # Papers of one journal go into a directory named after the journal.
        create_new_directory(journal_name)
        finished = 0

        # Step 3: process the papers of this journal one by one.
        for idx, paper_id in enumerate(paper_ids):
            if paper_id in all_done_papers_ids:
                print('Paper %s has been downloaded.' % paper_id)
                finished += 1
                continue
            # Fall back to the id when fewer titles than ids were scraped.
            raw_title = paper_titles[idx] if idx < len(paper_titles) else paper_id
            title = get_paper_title(raw_title)
            print('Downloading (%s/%s) ID:%s Title:%s'
                  % (str(idx + 1), str(len(paper_ids)), paper_id, title))
            target_url = get_url_by_paper_id(paper_id)
            # -1 means the paper page offers no download link (rare); such a
            # paper counts as finished so the journal can still be completed.
            if target_url == -1:
                print('There is no url for paper %s' % paper_id)
                finished += 1
                continue

            # Step 4: download the file.
            dest_file = os.path.join(journal_name, title + '.pdf')
            dl_result = download_by_paper_url(target_url, dest_file)
            if dl_result != -1:
                finished += 1
                with open('dl_list.txt', 'a+') as dl:  # downloaded paper ids
                    dl.write(paper_id + '\n')
            else:
                with open('err_list.txt', 'a+') as err:  # failed journal/paper ids
                    err.write(journal_id + ' ' + paper_id + '\n')

        if finished == len(paper_ids):
            with open('dl_list_j.txt', 'a+') as dl:  # downloaded journal ids
                dl.write(journal_id + '\n')
    print('All finished.')


if __name__ == '__main__':
    main()
标签:
原文地址:http://www.cnblogs.com/CheeseZH/p/5103548.html