CODE:
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Created on 2014-8-19
@author: guaguastd
@name: company_suffix_normalize.py
'''
# import json
import os
import csv
from collections import Counter
from operator import itemgetter
from prettytable import PrettyTable
# Path of the CSV file holding the exported LinkedIn connections.
CSV_FILE = os.path.join(r"E:", "\\", "eclipse", "LinkedIn", "dfile", "my_connections.csv")

# (suffix, replacement) pairs: a company name ending with the first item has
# that ending swapped for the second item.  Pairs are tried in order, so the
# comma variants must stay ahead of the bare ones (', Inc' before ' Inc');
# otherwise 'Acme, Inc' would be reduced to 'Acme,'.
transforms = [
    (', Inc.', ''),
    (', Inc', ''),
    (', LLC', ''),
    (', LLP', ''),
    (' LLC', ''),
    (' Inc.', ''),
    (' Inc', ''),
]


def normalize_company(name, suffix_transforms=None):
    """Return *name* with the known legal-entity suffixes rewritten.

    Each (suffix, replacement) pair is tried in order and applied only when
    the current name ends with the suffix.  Matching at the end only keeps
    names such as 'Foo Incorporated' intact, which a plain ``str.replace``
    would turn into 'Fooorporated'.

    name              -- company name, already stripped of outer whitespace.
    suffix_transforms -- iterable of (suffix, replacement) pairs; defaults to
                         the module-level ``transforms``.
    """
    if suffix_transforms is None:
        suffix_transforms = transforms
    for suffix, replacement in suffix_transforms:
        # An empty suffix would match every name; skip it.
        if suffix and name.endswith(suffix):
            name = name[:-len(suffix)] + replacement
    return name


def load_companies(csv_path):
    """Return the non-empty, stripped 'Company' values found in *csv_path*.

    Rows whose 'Company' field is missing (short rows give None) or blank
    are skipped.  The file is closed before the function returns.
    """
    with open(csv_path) as csv_file:
        reader = csv.DictReader(csv_file, delimiter=',', quotechar='"')
        names = [(row.get('Company') or '').strip() for row in reader]
    return [name for name in names if name]


def count_companies(companies):
    """Return a Counter of the normalized names in *companies*."""
    return Counter(normalize_company(name) for name in companies)


def main():
    """Print a Company/Freq table for CSV_FILE, most frequent first."""
    counts = count_companies(load_companies(CSV_FILE))
    pt = PrettyTable(field_names=['Company', 'Freq'])
    pt.align = 'l'
    # most_common() sorts by count, descending, and keeps ties in the order
    # the Counter holds them -- the same as sorting items() on the count
    # with reverse=True.
    for company, freq in counts.most_common():
        pt.add_row([company, freq])
    # Parenthesized form works as a statement on Python 2 and as a call on 3.
    print(pt)


if __name__ == '__main__':
    main()

# Sample output ('?' marks characters the console could not display):
#
# +---------------------------------------+------+
# | Company                               | Freq |
# +---------------------------------------+------+
# | ??????????                            | 1    |
# | ??                                    | 1    |
# | SoftTalent Consulting ??????????????? | 1    |
# | SJTU                                  | 1    |
# | WatchGuard Technologies               | 1    |
# | Hebei Meishen Chemical Group CO.,Ltd  | 1    |
# | Bloomberg LP                          | 1    |
# | DiHao trading Co.,Ltd                 | 1    |
# | CET                                   | 1    |
# | Pica8                                 | 1    |
# | Microsoft                             | 1    |
# +---------------------------------------+------+
Python 规范化LinkedIn用户的联系人所在公司后缀 (data normalization),布布扣,bubuko.com
Python 规范化LinkedIn用户的联系人所在公司后缀 (data normalization)
原文地址:http://blog.csdn.net/guaguastd/article/details/38676547