
Counting and comparing the top 50 keywords for China, the USA, and the world



1. Get all keywords for China

import pymysql
import json

conn = pymysql.connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='',
        db='python',
        )
cursor = conn.cursor()

sql = "SELECT union_kwd_str,pmc_id FROM alzheimer where authorinfor like '%china%' AND union_kwd_str != ''"
a = cursor.execute(sql)   # number of rows matched by the query
print a
b = cursor.fetchmany(a)   # b holds all fetched (union_kwd_str, pmc_id) rows (7887 in the original run)

abstract_list = []
pmc_id_dict = {}

for j in range(a):
    abstract_list.append(b[j][0])
    pmc_id_dict[j] = b[j][1]


def output_to_json(data, filename):
    with open(filename, 'w') as file:
        file.write(json.dumps(data))
    return json.dumps(data)

output_data = {
        'country': "china",
        'count': a,
        'keyword': abstract_list
    }
output_to_json(output_data, '1203_china_kwd.json')
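
As a side note, the affiliation filter does not have to be spliced into the SQL string; pymysql can substitute it as a bound query parameter. A minimal sketch, reusing the conn opened above and the same table and column names:

# Sketch: pass the LIKE pattern as a bound parameter instead of editing the SQL text.
cursor = conn.cursor()
sql = ("SELECT union_kwd_str, pmc_id FROM alzheimer "
       "WHERE authorinfor LIKE %s AND union_kwd_str != ''")
row_count = cursor.execute(sql, ('%china%',))   # pymysql escapes the pattern for us
rows = cursor.fetchall()                        # same rows as fetchmany(row_count)
print(row_count)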

Select the top 50 keywords

import re
import collections
import json

def input_from_json(filename):
    with open(filename, 'r') as file:
        data = json.loads(file.read())
        return data

def count_word(path):
    # Count how often each keyword appears across all comma-separated keyword strings.
    result = {}
    keyword_list = input_from_json(path)['keyword']
    for all_the_text in keyword_list:
        for word in all_the_text.split(','):
            if word not in result:
                result[word] = 0
            result[word] += 1
    return result


def sort_by_count(d):
    # Order the counts from most to least frequent.
    d = collections.OrderedDict(sorted(d.items(), key=lambda t: -t[1]))
    return d


if __name__ == '__main__':
    file_name = "1203_china_kwd.json"
    fobj1 = open('1204_top50_china_kwd_list.json', 'w')
    fobj2 = open('1203_top15_china_kwd.json', 'w')

    dword = count_word(file_name)
    dword = sort_by_count(dword)

    num = 0
    top_china_kwd_list = []
    for key, value in dword.items():
        num += 1
        key = re.sub("_", " ", key)
        data = {
            'name': key,
            'value': value
        }
        json_data = json.dumps(data)

        if num < 50:
            top_china_kwd_list.append(key)
            fobj2.write(json_data)
            fobj2.write(',')
        elif num == 50:
            top_china_kwd_list.append(key)
            fobj2.write(json_data)
            break

    data = {
        'china_kwd': top_china_kwd_list
    }
    json_data = json.dumps(data)
    fobj1.write(json_data)
    fobj1.close()
    fobj2.close()
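
One caveat about the loop above: fobj2 receives comma-separated objects with no enclosing brackets, so that file is not valid JSON by itself. A small alternative sketch, reusing count_word and sort_by_count from this script (the output file name here is only illustrative):

# Sketch: write the top-50 (name, value) pairs as one valid JSON array.
ranked = sort_by_count(count_word("1203_china_kwd.json"))
top50 = [{'name': re.sub("_", " ", k), 'value': v}
         for k, v in list(ranked.items())[:50]]
with open('1204_top50_china_kwd_array.json', 'w') as out:   # illustrative name
    json.dump(top50, out)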

2. Get all keywords for the USA and count them. The code is almost identical to the China version; merging the two into a single script is the next step (a rough sketch of what that might look like follows).
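
A possible shape for that merged script, as a rough sketch only; the helper name export_keywords and the country-to-pattern mapping are illustrative, while the connection settings, table, and columns are assumed to match the scripts in this post:

# Sketch of a single, country-parameterised extraction step (names are illustrative).
import pymysql
import json

def export_keywords(country, like_pattern, out_file):
    conn = pymysql.connect(host='localhost', port=3306,
                           user='root', passwd='', db='python')
    cursor = conn.cursor()
    if like_pattern is None:
        # "world": no affiliation filter, keep every row that has keywords
        sql = ("SELECT union_kwd_str, pmc_id FROM alzheimer "
               "WHERE union_kwd_str != ''")
        count = cursor.execute(sql)
    else:
        sql = ("SELECT union_kwd_str, pmc_id FROM alzheimer "
               "WHERE authorinfor LIKE %s AND union_kwd_str != ''")
        count = cursor.execute(sql, (like_pattern,))
    rows = cursor.fetchall()
    conn.close()
    data = {'country': country,
            'count': count,
            'keyword': [r[0] for r in rows]}
    with open(out_file, 'w') as f:
        json.dump(data, f)

# Mirrors the three separate scripts in this post.
export_keywords('china', '%china%', '1203_china_kwd.json')
export_keywords('USA', '%USA%', '1204_USA_kwd.json')
export_keywords('world', None, '1203_world_kwd.json')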

import pymysql
import json

conn = pymysql.connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='',
        db='python',
        )
cursor = conn.cursor()

sql = "SELECT union_kwd_str,pmc_id FROM alzheimer where authorinfor like '%USA%' AND union_kwd_str != ''"
a = cursor.execute(sql)   # number of rows matched by the query
print a
b = cursor.fetchmany(a)   # b holds all fetched (union_kwd_str, pmc_id) rows

abstract_list = []
pmc_id_dict = {}

for j in range(a):
    abstract_list.append(b[j][0])
    pmc_id_dict[j] = b[j][1]


def output_to_json(data, filename):
    with open(filename, 'w') as file:
        file.write(json.dumps(data))
    return json.dumps(data)

output_data = {
        'country': "USA",
        'count': a,
        'keyword': abstract_list
    }
output_to_json(output_data, '1204_USA_kwd.json')

Top 50 keywords for the USA

import re
import collections
import json

def input_from_json(filename):
    with open(filename, 'r') as file:
        data = json.loads(file.read())
        return data

def count_word(path):
    result = {}
    keyword_list = input_from_json(path)['keyword']
    for all_the_text in keyword_list:
        for word in all_the_text.split(','):
            if word not in result:
                result[word] = 0
            result[word] += 1
    return result


def sort_by_count(d):
    d = collections.OrderedDict(sorted(d.items(), key=lambda t: -t[1]))
    return d


if __name__ == '__main__':
    file_name = "1204_USA_kwd.json"
    fobj1 = open('1204_top50_USA_kwd_list.json', 'w')
    fobj2 = open('1204_top50_USA_kwd.json', 'w')

    dword = count_word(file_name)
    dword = sort_by_count(dword)

    num = 0
    top_USA_kwd_list = []
    for key, value in dword.items():
        num += 1
        key = re.sub("_", " ", key)
        data = {
            'name': key,
            'value': value
        }
        json_data = json.dumps(data)

        if num < 50:
            top_USA_kwd_list.append(key)
            fobj2.write(json_data)
            fobj2.write(',')
        elif num == 50:
            top_USA_kwd_list.append(key)
            fobj2.write(json_data)
            break

    data = {
        'USA_kwd': top_USA_kwd_list
    }
    json_data = json.dumps(data)
    fobj1.write(json_data)
    fobj1.close()
    fobj2.close()

3. Top 50 keywords worldwide

import pymysql
import json

conn = pymysql.connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='',
        db='python',
        )
cursor = conn.cursor()

sql = "SELECT union_kwd_str,pmc_id FROM alzheimer where union_kwd_str != ''"
a = cursor.execute(sql)   # number of rows matched by the query
print a
b = cursor.fetchmany(a)   # b holds all fetched (union_kwd_str, pmc_id) rows

abstract_list = []
pmc_id_dict = {}

for j in range(a):
    abstract_list.append(b[j][0])
    pmc_id_dict[j] = b[j][1]


def output_to_json(data, filename):
    with open(filename, 'w') as file:
        file.write(json.dumps(data))
    return json.dumps(data)

output_data = {
        'country': "world",
        'count': a,
        'keyword': abstract_list
    }
output_to_json(output_data, '1203_world_kwd.json')

Select the top 50 worldwide keywords

import re
import collections
import json

def input_from_json(filename):
    with open(filename, 'r') as file:
        data = json.loads(file.read())
        return data

def count_word(path):
    result = {}
    keyword_list = input_from_json(path)['keyword']
    for all_the_text in keyword_list:
        for word in all_the_text.split(','):
            if word not in result:
                result[word] = 0
            result[word] += 1
    return result


def sort_by_count(d):
    d = collections.OrderedDict(sorted(d.items(), key=lambda t: -t[1]))
    return d


if __name__ == '__main__':
    file_name = "1203_world_kwd.json"
    fobj1 = open('1204_top50_world_kwd_list.json', 'w')
    fobj2 = open('1203_top15_world_kwd.json', 'w')

    dword = count_word(file_name)
    dword = sort_by_count(dword)

    num = 0
    top_world_kwd_list = []
    for key, value in dword.items():
        num += 1
        key = re.sub("_", " ", key)
        data = {
            'name': key,
            'value': value
        }
        json_data = json.dumps(data)

        if num < 50:
            top_world_kwd_list.append(key)
            fobj2.write(json_data)
            fobj2.write(',')
        elif num == 50:
            top_world_kwd_list.append(key)
            fobj2.write(json_data)
            break

    data = {
        'world_kwd': top_world_kwd_list
    }
    json_data = json.dumps(data)
    fobj1.write(json_data)
    fobj1.close()
    fobj2.close()
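
Since the count-then-sort logic above is repeated verbatim for all three datasets, collections.Counter could cover both steps in one place. A brief sketch, assuming the input JSON files keep the same 'keyword' field used throughout this post:

# Sketch: Counter.most_common replaces count_word + sort_by_count.
import json
from collections import Counter

def top_keywords(path, n=50):
    with open(path) as f:
        keyword_strings = json.load(f)['keyword']
    counter = Counter()
    for text in keyword_strings:
        counter.update(text.split(','))   # count each comma-separated keyword
    return counter.most_common(n)         # (keyword, count) pairs, most frequent first

print(top_keywords("1203_world_kwd.json")[:5])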

4. Compare which keywords China shares with the USA, and which research hotspots China shares with the rest of the world.

import json


def input_from_json(filename):
    with open(filename, 'r') as file:
        data = json.loads(file.read())
        return data

china_path = '1204_top50_china_kwd_list.json'
world_path = '1204_top50_world_kwd_list.json'
USA_path = '1204_top50_USA_kwd_list.json'
china_kwd_list = input_from_json(china_path)['china_kwd']
world_kwd_list = input_from_json(world_path)['world_kwd']
USA_kwd_list = input_from_json(USA_path)['USA_kwd']

a = set(china_kwd_list)
b = set(world_kwd_list)
c = set(USA_kwd_list)

# Keywords shared by the Chinese and worldwide top-50 lists.
china_world_same_kwd = list(a & b)
for kwd in china_world_same_kwd:
    kwd = kwd.encode('utf-8')
    print kwd

print len(china_world_same_kwd)

print '\n'

# Keywords shared by the Chinese and US top-50 lists.
china_USA_same_kwd = list(a & c)
for kwd in china_USA_same_kwd:
    kwd = kwd.encode('utf-8')
    print kwd

print len(china_USA_same_kwd)
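
Beyond the raw intersections, the same sets make it easy to see what is unique to each list and how large the overlap is relative to the top-50 size; a small sketch building on a, b, and c above:

# Sketch: keywords unique to the Chinese top-50 list, plus simple overlap ratios.
china_only_vs_USA = a - c      # in China's top 50 but not in the USA's
china_only_vs_world = a - b    # in China's top 50 but not in the worldwide list

print(len(china_only_vs_USA))
print(len(china_only_vs_world))

# Fraction of China's top-50 keywords shared with each reference list.
print(len(a & c) / float(len(a)))
print(len(a & b) / float(len(a)))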

 

Original post: http://www.cnblogs.com/lovely7/p/6178853.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!