Continuing from the previous post, we now turn to the development of the response module.
This module's job is as follows: when the local word database does not contain the word the user wants to look up, go to the web, find the matching entry, return it as the result, and save it into the local database.
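In other words, lookups are local-first with a network fallback. Below is a minimal sketch of that flow, assuming the GetResponse class defined later in this post and using a plain dict to stand in for the local database (the project has its own database module, so the dict is purely illustrative):

# local-first lookup with a network fallback; `cache` is a plain dict
# standing in for the local word database (illustrative only)
def lookup(word, cache, engine):
    if word in cache:
        return cache[word]
    results = engine.get_dict_data(word)             # GetResponse, defined below
    hits = [item for item in results if item != -1]  # -1 means iciba had no entry
    if hits:
        cache[word] = hits                           # store so the next lookup is local
    return hits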
The online source I chose is iciba, for a simple reason: it requires no complicated cookie management, and the content for a queried word is essentially embedded in the returned HTML source.
Note that iciba will ban you if your requests come too frequently, so if you want to use this code to crawl iciba's dictionary, add a sleep between requests yourself. My code actually already has one (the time.sleep(3) in the test loop at the bottom); just adjust it as needed.
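For reference, here is one way to throttle the fetch itself rather than the test loop. This is a sketch of my own, not part of the module below, and the delay and retry cap are arbitrary illustrative values:

import time
import urllib2

def fetch_with_delay(url, headers, delay=3, max_retries=5):
    # sleep between attempts so iciba does not ban the client for hammering it
    request = urllib2.Request(url, headers=headers)
    for _ in range(max_retries):
        try:
            return urllib2.urlopen(request).read()
        except urllib2.URLError:
            time.sleep(delay)
    return None  # caller must handle a failed fetch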
The module logic is:
0. Expose an interface for other modules to call; the input is the word to look up.
1. Build the URL request and fetch the returned data.
2. Parse the returned data according to its format and extract the content of the matching entry.
3. Return the entry's content to the calling module in the agreed format (see the usage sketch after this list).
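Concretely, the interface is GetResponse.get_dict_data in the code below. It returns a list whose items are either -1 (no entry found) or a dict with the keys word, times, explain, net_explain and sentence. A caller in another module would use it roughly like this (the file name response.py is my assumption):

# -*- coding: utf-8 -*-
from response import GetResponse   # module file name is an assumption

engine = GetResponse()
for entry in engine.get_dict_data('hello'):
    if entry == -1:
        print 'not found on iciba'
    else:
        print entry['word'], entry['explain']  # also: net_explain, sentence, times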
For the concrete approach, see the source code:
# -*- coding: utf-8 -*-

__author__ = 'wmydx'

import re
import time
import urllib
import urllib2


class GetResponse:

    def __init__(self):
        self.url = 'http://www.iciba.com/'
        # matches input made up of English letters and spaces only
        self.isEng = re.compile(r'(([a-zA-Z]*)(\s*))*$')
        # the part-of-speech block, web-definition block and
        # example-sentence block in iciba's result page
        self.group_pos = re.compile(r'<div class="group_pos">(.*?)</div>', re.DOTALL)
        self.net_paraphrase = re.compile(r'<div class="net_paraphrase">(.*?)</div>', re.DOTALL)
        self.sentence = re.compile(r'<dl class="vDef_list">(.*?)</dl>', re.DOTALL)

    def process_input(self, word):
        # iciba uses underscores instead of spaces in its entry URLs
        word = word.strip()
        word = word.replace(' ', '_')
        return word

    def get_data_from_web(self, word):
        headers = {'Referer': 'http://www.iciba.com/',
                   'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36'
                                 ' (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36'}
        request = urllib2.Request(self.url + word, headers=headers)
        while True:
            try:
                data = urllib2.urlopen(request).read()
                break
            except:
                time.sleep(1)  # back off a little before retrying a failed request
        return data

    def get_eng_from_chinese(self, word):
        # query iciba with a Chinese word and collect the English
        # candidates listed in the label_list span
        word = self.process_input(word)
        word = urllib.quote(word)
        data = self.get_data_from_web(word)
        label_lst = re.compile(r'<span class="label_list">(.*?)</span>', re.DOTALL)
        label_itm = re.compile(r'<label>(?P<item>.*?)</a>(.*?)</label>', re.DOTALL)
        res = []
        first = label_lst.search(data)
        if not first:  # the page carries no translation list
            return res
        data = data[first.start():first.end()]
        start_itm = 0
        while True:
            second = label_itm.search(data, start_itm)
            if not second:
                break
            word = self.get_sentence_from_dt(data[second.start('item'):second.end('item')])
            res.append(word)
            start_itm = second.end()
        return res

    def get_dict_data(self, word):
        """Entry point for other modules: returns a list whose items are
        entry dicts, or -1 where iciba has no matching entry."""
        englst = []
        res = []
        match = self.isEng.match(word)
        if not match:
            # Chinese input: map it to English candidates first
            englst = self.get_eng_from_chinese(word)
        else:
            englst.append(word)
        for item in englst:
            word = self.process_input(item)
            data = self.get_data_from_web(word)
            if data.find('对不起,没有找到') != -1:  # iciba's "sorry, not found" message
                res.append(-1)
            else:
                tmp_dict = self.analysis_eng_data(data)
                tmp_dict['word'] = word
                tmp_dict['times'] = 1  # initial lookup count for the local DB
                res.append(tmp_dict)
        return res

    def analysis_eng_data(self, data):
        # pull the three sections out of the page: part-of-speech
        # definitions, web definitions and example sentences
        res = {}
        explain = self.group_pos.search(data)
        if explain:
            explain = data[explain.start():explain.end()]
            res['explain'] = self.generate_explain(explain)
        else:
            res['explain'] = -1
        net_explain = self.net_paraphrase.search(data)
        if net_explain:
            net_explain = data[net_explain.start():net_explain.end()]
            res['net_explain'] = self.generate_net_explain(net_explain)
        else:
            res['net_explain'] = -1
        sentence_start = 0
        sentence_end = len(data)
        sentence_lst = []
        while sentence_start < sentence_end:
            sentence = self.sentence.search(data, sentence_start)
            if not sentence:
                break
            sentence_str = data[sentence.start():sentence.end()]
            sentence_lst.append(self.generate_sentence(sentence_str))
            sentence_start = sentence.end()
        res['sentence'] = "\n\n".join(sentence_lst)
        return res

    def generate_explain(self, target):
        # each <strong class="fl"> holds a part of speech; the label_list
        # span that follows it holds the corresponding definitions
        start_word = 0
        end_word = len(target)
        meta_word = re.compile(r'<strong class="fl">(?P<meta_word>.*?)</strong>', re.DOTALL)
        label_lst = re.compile(r'<span class="label_list">(.*?)</span>', re.DOTALL)
        label_itm = re.compile(r'<label>(?P<item>.*?)</label>', re.DOTALL)
        res = ''
        while start_word < end_word:
            first = meta_word.search(target, start_word)
            if not first:
                break
            word_type = target[first.start('meta_word'):first.end('meta_word')]
            res += word_type + ' '
            second = label_lst.search(target, first.end('meta_word'))
            if not second:
                break
            start_label = second.start()
            end_label = second.end()
            while start_label < end_label:
                third = label_itm.search(target, start_label)
                if not third:
                    break
                res += target[third.start('item'):third.end('item')]
                start_label = third.end()
            res += '\n'
            start_word = end_label
        return res

    def generate_net_explain(self, target):
        # concatenate every <li> item inside the net_paraphrase block;
        # the '网络释义' prefix means "web definitions"
        li_item = re.compile(r'<li>(?P<item>.*?)</li>', re.DOTALL)
        res = '网络释义: '
        start_itm = 0
        while True:
            first = li_item.search(target, start_itm)
            if not first:
                break
            res += target[first.start('item'):first.end('item')]
            start_itm = first.end()
        return res

    def generate_sentence(self, target):
        # a vDef_list block holds one English sentence (<dt>) and its
        # Chinese translation (<dd>)
        english = re.compile(r'<dt>(?P<eng>.*?)</dt>', re.DOTALL)
        chinese = re.compile(r'<dd>(?P<chn>.*?)</dd>', re.DOTALL)
        first = english.search(target)
        second = chinese.search(target)
        res = self.get_sentence_from_dt(target[first.start('eng'):first.end('eng')]) + '\n'
        res += target[second.start('chn'):second.end('chn')]
        return res

    def get_sentence_from_dt(self, target):
        # strip HTML tags from a fragment, keeping only the visible text
        res = ''
        length = len(target)
        index = 0
        while index < length:
            if target[index] == '<':
                while target[index] != '>':
                    index += 1
            else:
                res += target[index]
            index += 1
        return res


if __name__ == '__main__':
    p = GetResponse()
    test = ['hello', 'computer', 'nothing', 'bad guy', 'someday']
    for item in test:
        res = p.get_dict_data(item)
        for entry in res:
            if entry == -1:  # skip words iciba does not know
                continue
            for (k, v) in entry.items():
                print "dict[%s]=" % k, v
            print
        time.sleep(3)  # pause between queries so iciba does not ban us
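A portability note: the code above targets Python 2 (urllib2, urllib.quote and print statements). Under Python 3 the same calls would go through urllib.request and urllib.parse.quote, and print becomes a function.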
[Original] shadowebdict development diary: a simple English-Chinese dictionary for Linux (Part 3)
Original post: http://www.cnblogs.com/shadowmydx/p/4335901.html