标签:info custom 根据 weight roc self arc pes meta
elasticsearch搜索提示(补全)接口需要新增suggest字段并设type为:completion,结合到scrapy,修改es_types.py文件:
from datetime import datetime

from elasticsearch_dsl import (
    DocType, Date, Nested, Boolean, analyzer, InnerObjectWrapper,
    Completion, Keyword, Text, Integer,
)
from elasticsearch_dsl.connections import connections

# Register the default connection used by the document class below.
connections.create_connection(hosts=['localhost'])


class ArticleType(DocType):
    """Elasticsearch document mapping for an article."""

    # NOTE: per the original author, passing the analyzer by name to
    # Completion raises an error here because of an issue in the
    # elasticsearch_dsl source. This is the failing variant; it is kept
    # as-is to show the problem.
    suggest = Completion(analyzer="ik_max_word")
    title = Text(analyzer="ik_max_word")
    create_date = Date()
    praise_nums = Integer()
    fav_nums = Integer()
    comment_nums = Integer()
    tags = Text(analyzer="ik_max_word")
    front_image_url = Keyword()
    url_object_id = Keyword()
    front_image_path = Keyword()
    url = Keyword()
    content = Text(analyzer="ik_max_word")

    class Meta:
        index = 'jobbole'
        doc_type = 'article'


if __name__ == '__main__':
    # Create the index/mapping in Elasticsearch when run as a script.
    ArticleType.init()
解决办法:自定义CustomAnalyzer类,继承自elasticsearch_dsl.analysis下的CustomAnalyzer类,并重写get_analysis_definition方法使其返回空字典:
from datetime import datetime

from elasticsearch_dsl import (
    DocType,
    Date,
    Nested,
    Boolean,
    analyzer,
    InnerObjectWrapper,
    Completion,
    Keyword,
    Text,
    Integer,
)
from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer
from elasticsearch_dsl.connections import connections

# Register the default connection used by the document class below.
connections.create_connection(hosts=["localhost"])


class CustomAnalyzer(_CustomAnalyzer):
    """CustomAnalyzer variant that reports an empty analysis definition."""

    def get_analysis_definition(self):
        # Workaround for the Completion(analyzer=...) error: presumably this
        # stops elasticsearch_dsl from emitting index settings for the
        # analyzer -- confirm against the installed library version.
        return {}


# The lowercase filter is there so searches ignore letter case.
ik_analyzer = CustomAnalyzer("ik_max_word", filter=["lowercase"])


class ArticleType(DocType):
    """Elasticsearch document mapping for a Jobbole article."""

    # Completion field that backs the search-suggestion feature.
    suggest = Completion(analyzer=ik_analyzer)
    title = Text(analyzer="ik_max_word")
    create_date = Date()
    url = Keyword()
    url_object_id = Keyword()
    front_image_url = Keyword()
    front_image_path = Keyword()
    praise_nums = Integer()
    comment_nums = Integer()
    fav_nums = Integer()
    tags = Text(analyzer="ik_max_word")
    content = Text(analyzer="ik_max_word")

    class Meta:
        index = "jobbole"
        doc_type = "article"


if __name__ == "__main__":
    # Create the index/mapping in Elasticsearch when run as a script.
    ArticleType.init()
在item中生成搜索建议词:
from spider.models.es_types import ArticleType
from elasticsearch_dsl.connections import connections

# NOTE: scrapy, MapCompose, Join, remove_tags and the converter helpers used
# below (date_convert, number_convert, remove_comment_tags, returnValue,
# get_md5) are not imported in this excerpt; they are assumed to be in scope
# from the rest of items.py.

# Low-level Elasticsearch client, used to call the analyze API.
es = connections.create_connection(ArticleType._doc_type.using)


def gen_suggests(index, info_tuple):
    """Build the completion-suggester entries for a set of weighted strings.

    Args:
        index: Name of the index whose analyzers are used for tokenizing.
        info_tuple: Iterable of ``(text, weight)`` pairs. Pairs should be
            ordered from highest to lowest weight: a word is only emitted
            for the first text it appears in.

    Returns:
        A list of ``{'input': [words], 'weight': weight}`` dicts, one per
        text that contributed at least one new word.
    """
    used_words = set()  # words already emitted for an earlier text
    suggests = []
    for text, weight in info_tuple:
        if not text:
            continue
        # Let Elasticsearch tokenize and lowercase the text.
        words = es.indices.analyze(
            index=index,
            analyzer="ik_max_word",
            params={'filter': ["lowercase"]},
            body=text,
        )
        # Single-character tokens are dropped.
        analyzed_words = {
            r["token"] for r in words["tokens"] if len(r["token"]) > 1
        }
        new_words = analyzed_words - used_words
        if new_words:
            # Remember what was emitted so later texts do not repeat it.
            used_words.update(new_words)
            suggests.append({'input': list(new_words), 'weight': weight})
    return suggests


class JobboleArticleItem(scrapy.Item):
    """Scrapy item for a Jobbole article; can save itself to Elasticsearch."""

    title = scrapy.Field()
    create_date = scrapy.Field(input_processor=MapCompose(date_convert))
    praise_nums = scrapy.Field(input_processor=MapCompose(number_convert))
    fav_nums = scrapy.Field(input_processor=MapCompose(number_convert))
    comment_nums = scrapy.Field(input_processor=MapCompose(number_convert))
    tags = scrapy.Field(
        input_processor=MapCompose(remove_comment_tags),
        output_processor=Join(','),
    )
    front_image_url = scrapy.Field(output_processor=MapCompose(returnValue))
    url_object_id = scrapy.Field(input_processor=MapCompose(get_md5))
    front_image_path = scrapy.Field()
    url = scrapy.Field()
    content = scrapy.Field()

    def save_to_elasticsearch(self):
        """Copy this item into an ArticleType document and save it."""
        article = ArticleType()
        article.title = self['title']
        article.create_date = self['create_date']
        # remove_tags() strips the HTML tags from the content.
        article.content = remove_tags(self['content'])
        article.front_image_url = self['front_image_url']
        # front_image_path is only copied when the item has it.
        if 'front_image_path' in self:
            article.front_image_path = self['front_image_path']
        article.praise_nums = self['praise_nums']
        article.fav_nums = self['fav_nums']
        article.comment_nums = self['comment_nums']
        article.url = self['url']
        article.tags = self['tags']
        # url_object_id is used as the document id.
        article.meta.id = self['url_object_id']

        # Suggestion words: title words get weight 10, tag words weight 7.
        article.suggest = gen_suggests(
            ArticleType._doc_type.index,
            ((article.title, 10), (article.tags, 7)),
        )

        article.save()
标签:info custom 根据 weight roc self arc pes meta
原文地址:http://www.cnblogs.com/jp-mao/p/6937260.html