标签:man local pen location 北京 false replace else 城市
G-->f
"""Reduce ("normalise") building / residential-compound names from POI CSV exports.

Pipeline, as run by :func:`main`:

1. load the township tags of the prefecture-level cities without districts
   (Dongguan, Zhongshan) from a text file;
2. load the target city list from the first sheet of an Excel task list;
3. read the two POI CSV files and compute a reduced name for every row;
4. merge reduced names that contain one another inside each district;
5. write one result workbook and one statistics workbook per CSV file.

The third-party packages (``xlrd``, ``openpyxl``) are imported inside the
functions that use them so that the pure string helpers can be imported and
tested without those packages installed.
"""
import csv
import time

# ---------------------------------------------------------------------------
# Constants and lookup tables
# ---------------------------------------------------------------------------

# Township tags per city; filled in place by main() from ZHITONGZI_FILE.
ZHITONGZI_FILE = '直筒子市_东莞中山.txt'
ZHITONGZI_CITY_DIC = {'东莞市': [], '中山市': []}

zh_num_list = ['零', '一', '二', '三', '四', '五', '六', '七', '八', '九']
# '0'..'9' -> Chinese numeral, and the inverse mapping.
zh_num_numk_dic = {str(idx): zh for idx, zh in enumerate(zh_num_list)}
zh_num_zhk_dic = {zh: str(idx) for idx, zh in enumerate(zh_num_list)}

# Example name handled by these tables: '中石化大厦A塔' (Tianhe district).
nswe_m_list = ['东', '西', '南', '北', '中']
roman_numerals_12_list = ['Ⅰ', 'Ⅱ', 'Ⅲ', 'Ⅳ', 'Ⅴ', 'Ⅵ', 'Ⅶ', 'Ⅷ', 'Ⅸ', 'Ⅹ', 'Ⅺ', 'Ⅻ']
arabic_numerals_10_list = [str(i) for i in range(0, 10, 1)]
postfix_list = ['座', '区', '栋', '楼', '院', '阁', '期', '单元', '号', '塔', '幢', '馆']
alphabet_list = [chr(i).upper() for i in range(97, 123)]
del_char_list = ['.', '·', '-', ' ']
del_tail_list = ['第']
# NOTE(review): both entries are identical in SOURCE; one of them was
# presumably a full-width parenthesis lost in transcription - confirm.
suspect_char_list = ['(', '(']
name_format_replace_dic = {}
name_format_replace_dic[' '] = ''
name_format_replace_dic['+'] = '加'
gd_paralleling = '|'
gd_separator = ';'
diy_join_tag = '||'

MIN_NAME_LEN = 2


def _build_name_split_list():
    """Return every '<prefix><postfix>' token at which a name is cut.

    Prefixes are compass words, roman numerals, A-Z and the digits 0-9.
    The original loop used ``range(0, 9)`` and therefore never generated the
    '9<postfix>' tokens, so e.g. '...A9座' was not reduced.
    """
    prefixes = (nswe_m_list + roman_numerals_12_list + alphabet_list
                + arabic_numerals_10_list)
    return ['%s%s' % (prefix, postfix)
            for postfix in postfix_list
            for prefix in prefixes]


name_split_list = _build_name_split_list()

# Filled in place by main() from the Excel task list.
target_city_list = []
FEXCEL = '【商场任务】28个城市_任务列表_20170727.xlsx'

city_zhixiashi_list = ['北京市', '上海市', '天津市', '重庆市']
filter_city_list = ['北京市', '上海市', '广州市', '深圳市']
file_house = '住宅小区.csv'
file_bizbuilding = '楼宇.csv'
file_gen_house = '含北上广深28城市-住宅小区-归约化'
file_gen_bizbuilding = '含北上广深28城市-商住楼宇-归约化'
file_title_str = 'province,city,district,商圈,商圈类型,归约后的名,name_original,show_addr,show_addr_num,gd_type,gd_type_1,gd_type_2,gd_type_3,locationx,locationy,gpsx,gpsy,bdx,bdy,localtime'
file_title_str_statistics = '省份,城市,源文件行数,处理后的文件行数,压缩率,参考总建筑区总楼栋数目,总建筑区名字数,参考独栋数,参考独栋数率,参考单建筑区的平均楼栋数目'


# ---------------------------------------------------------------------------
# Input loaders
# ---------------------------------------------------------------------------

def load_zhitongzi_city_dic(file_name):
    """Parse the township list file into ``{city: [township tag, ...]}``.

    Every ';'-separated segment that holds more than two '、'-separated items
    counts as one group; groups 1-2 belong to Dongguan and groups 3-4 to
    Zhongshan. Text before '(' and after ')' is dropped from an item.
    """
    city_dic = {'东莞市': [], '中山市': []}
    group_no = 0
    # 'with' closes the file; the original only evaluated `f.closed`.
    with open(file_name, 'r', encoding='utf-8') as fh:
        for line in fh:
            # strip() removes the newline that used to stay glued to the
            # last tag of each line and made that tag unmatchable.
            for segment in line.strip().replace(' ', '').split(';'):
                items = segment.split('、')
                if len(items) <= 2:
                    continue
                group_no += 1
                for item in items:
                    if '(' in item:
                        tag = item.split('(')[1]
                    elif ')' in item:
                        tag = item.split(')')[0]
                    else:
                        tag = item
                    if not tag:
                        # An empty tag would match every address.
                        continue
                    if group_no in (1, 2):
                        city_dic['东莞市'].append(tag)
                    elif group_no in (3, 4):
                        city_dic['中山市'].append(tag)
    return city_dic


def load_target_city_list(file_name):
    """Return the distinct values of column 2 of the first sheet, skipping the title row."""
    # NOTE(review): xlrd 2.x no longer reads .xlsx files; this call needs
    # xlrd < 2.0 or a port to openpyxl.load_workbook - confirm environment.
    import xlrd

    table = xlrd.open_workbook(file_name).sheets()[0]
    cities = []
    for row_idx in range(1, table.nrows):
        city = table.row_values(row_idx)[2]
        if city not in cities:
            cities.append(city)
    return cities


# ---------------------------------------------------------------------------
# Name normalisation helpers
# ---------------------------------------------------------------------------

def zh_num_format(str_):
    """Replace every Chinese numeral (零-九) in *str_* with its digit."""
    for zh_num, digit in zh_num_zhk_dic.items():
        str_ = str_.replace(zh_num, digit)
    return str_


def replace_zhnum_num(str_):
    """Replace every digit 0-9 in *str_* with its Chinese numeral."""
    for digit, zh_num in zh_num_numk_dic.items():
        str_ = str_.replace(digit, zh_num)
    return str_


def alphabet_upper_format(str_):
    """Upper-case the ASCII letters a-z in *str_*.

    The original replaced each upper-case letter with itself and was a no-op.
    """
    for upper_letter in alphabet_list:
        str_ = str_.replace(upper_letter.lower(), upper_letter)
    return str_


def del_char(str_):
    """Remove every character listed in ``del_char_list`` from *str_*."""
    for char_ in del_char_list:
        str_ = str_.replace(char_, '')
    return str_


def del_tail_filter_list(str_, filter_list):
    """Strip trailing characters contained in *filter_list*, keeping index 0.

    Examples of names this is meant for: 'ART SPACE', '虹口SOHO', '6A'.
    """
    end = len(str_)
    # The first character is never removed, so a non-empty name stays non-empty.
    while end > 1 and str_[end - 1] in filter_list:
        end -= 1
    return str_[:end]


def del_tail_filter(str_):
    """Strip trailing digits, then trailing upper-case ASCII letters."""
    res = del_tail_filter_list(str_, arabic_numerals_10_list)
    return del_tail_filter_list(res, alphabet_list)


def name_reduction_format(original_):
    """Return the canonical spelling of a raw name (e.g. '上海加华商务中心A9座').

    Applies the literal replacements, cuts the name at the first '(', '-' and
    '、', upper-cases it, removes separator characters and converts Chinese
    numerals to digits. If the result is shorter than ``MIN_NAME_LEN`` the
    (replaced and cut) input is returned instead.
    """
    for old, new in name_format_replace_dic.items():
        original_ = original_.replace(old, new)
    # NOTE(review): SOURCE tested '(' twice; the second test was presumably
    # for a full-width parenthesis lost in transcription - confirm.
    for cut_mark in ('(', '-', '、'):
        if cut_mark in original_:
            original_ = original_.split(cut_mark)[0]
    format_ = original_.upper()
    format_ = del_char(format_)
    format_ = zh_num_format(format_)
    format_ = alphabet_upper_format(format_)
    if len(format_) < MIN_NAME_LEN:
        return original_
    return format_


def name_reduction(format_):
    """Reduce a name to its compound-level stem.

    The canonical name is cut at every token of ``name_split_list`` (no early
    break, so '佳兆业可园六期2区C座湖西路' loses all of '六期', '2区', 'C座'),
    trailing digits/letters are stripped, digits are turned back into Chinese
    numerals and a trailing '第' is removed. If the stem gets shorter than
    ``MIN_NAME_LEN`` the untouched input is used instead.
    """
    reduction_ = name_reduction_format(format_)
    for token in name_split_list:
        if token in reduction_:
            reduction_ = reduction_.split(token)[0]
    reduction_ = del_tail_filter(reduction_)
    reduction_ = replace_zhnum_num(reduction_)
    if len(reduction_) < MIN_NAME_LEN:
        reduction_ = format_
    for tail in del_tail_list:
        if reduction_[-1:] == tail:
            reduction_ = reduction_[:-1]
    return reduction_


def chk_cross_name(str_, str__):
    """Return 1 if the shorter name occurs after the first '(' of the longer one, else 0.

    Example pair: 'X大厦(abc' and 'abc'.
    """
    shorter, longer = (str__, str_) if len(str_) > len(str__) else (str_, str__)
    if shorter not in longer:
        return 0
    pieces = longer.split('(')
    if len(pieces) > 1 and shorter in pieces[1]:
        return 1
    return 0


def res_list_str(dic_, dk, filter_list=(',',)):
    """Join ``dic_[dk]`` with ``diy_join_tag`` and drop *filter_list* strings and newlines."""
    str_ = diy_join_tag.join(str(item) for item in dic_[dk])
    for unwanted in filter_list:
        str_ = str_.replace(unwanted, '')
    return str_.replace('\n', '')


def chk_name_subname(str_, str__):
    """Return 0 if the longer name starts with the shorter one, else 1."""
    shorter, longer = (str__, str_) if len(str_) > len(str__) else (str_, str__)
    # startswith() also copes with an empty shorter name, where the original
    # str.split('') raised ValueError.
    return 0 if longer.startswith(shorter) else 1


def gen_show_addr(l, district):
    """Return the longest address in *l*, cut to the part after *district*.

    The original sorted with a constant key and therefore always returned the
    last element. Ties keep the last of the longest entries. An empty list
    yields ''.
    """
    if not l:
        return ''
    res_ = sorted(l, key=len)[-1]
    if district:
        pieces = res_.split(district)
        if len(pieces) > 1:
            res_ = pieces[1].strip()
    return res_


# ---------------------------------------------------------------------------
# POI type helpers
# ---------------------------------------------------------------------------

def _normalize_type_filter(filter_):
    """Map a file name / label containing '楼宇' or '住宅小区' to that keyword."""
    for keyword in ('楼宇', '住宅小区'):
        if keyword in filter_:
            return keyword
    return filter_


def _split_gd_type(gd_type):
    """Split a 'a;b;c' type string into exactly three fields (padded with '')."""
    fields = gd_type.split(gd_separator)
    return tuple((fields + ['', '', ''])[:3])


def gen_gd_type_single_str(gd_type_list, filter_):
    """Return the first type string (or '|'-separated alternative) containing *filter_*.

    NOTE(review): SOURCE lost its indentation; the first matching list entry
    is assumed to end the search - confirm against the original file.
    """
    for entry in gd_type_list:
        if filter_ not in entry:
            continue
        if gd_paralleling not in entry:
            return entry
        for alternative in entry.split(gd_paralleling):
            if filter_ in alternative:
                return alternative
        return ''
    return ''


def gen_show_gd_type_dic(gd_type_list, filter_):
    """Build the three type columns from a list of type strings.

    When nothing matches, the three columns stay '' (the original raised
    ValueError while unpacking).
    """
    dic_ = {'gd_type_list_str': diy_join_tag.join(gd_type_list)}
    single = gen_gd_type_single_str(gd_type_list, _normalize_type_filter(filter_))
    dic_['gd_type_0'], dic_['gd_type_1'], dic_['gd_type_2'] = _split_gd_type(single)
    return dic_


def gen_show_gd_type_dic_fromstr(gd_type_str, filter_):
    """Build the three type columns from one '|'-separated type string.

    The last alternative containing the filter keyword wins, as in the
    original loop. Alternatives without exactly three ';' fields are padded or
    truncated instead of raising ValueError.
    """
    dic_ = {'gd_type_0': '', 'gd_type_1': '', 'gd_type_2': ''}
    keyword = _normalize_type_filter(filter_)
    for gd_type in gd_type_str.split(gd_paralleling):
        if keyword in gd_type:
            dic_['gd_type_0'], dic_['gd_type_1'], dic_['gd_type_2'] = _split_gd_type(gd_type)
    return dic_


def compute_list(l):
    """Return the arithmetic mean of *l* after converting each item to float."""
    return sum(float(item) for item in l) / len(l)


def res_list(dic_, dk):
    """Return the mean of the values stored under ``dic_[dk]``."""
    return compute_list(dic_[dk])


# ---------------------------------------------------------------------------
# Extraction, reduction and output
# ---------------------------------------------------------------------------

def data_file_extract(file_name):
    """Read a POI CSV file and group its rows by province / city / district.

    Rows whose city is not in ``target_city_list`` are skipped. Each kept row
    gets two extra keys, 'file_line_num' and 'name_reduction'. The result has
    the shape::

        {province: {city: {'source_file_sum_city_district': int,
                           'district_dic': {district: {
                               'name_reduction_dic': {reduced name: count},
                               'dic_list': [row, ...]}}}}}
    """
    res_dic = {}
    with open(file_name, 'r', encoding='utf-8-sig') as csvfile:
        # Line 1 is the header, so data rows are numbered from 2.
        for file_line_num, row in enumerate(csv.DictReader(csvfile), start=2):
            province = row['province']
            city = row['city']
            district = row['district']
            if '[' in city:
                # A city value containing '[' is replaced by the province.
                city = province
            if city not in target_city_list:
                continue

            city_entry = res_dic.setdefault(province, {}).setdefault(
                city, {'source_file_sum_city_district': 0, 'district_dic': {}})

            if city == '东莞市':
                # Dongguan has no districts: use the last township tag found
                # in the address text after '东莞市', defaulting to '松山湖'.
                # NOTE(review): tags are also loaded for '中山市' but SOURCE
                # never applies them - confirm whether that is intended.
                addr_pieces = row['addr'].split('东莞市')
                addr_tail = addr_pieces[1] if len(addr_pieces) > 1 else ''
                district = '松山湖'
                for tag_ in ZHITONGZI_CITY_DIC['东莞市']:
                    if tag_ in addr_tail:
                        district = tag_

            district_entry = city_entry['district_dic'].setdefault(
                district, {'name_reduction_dic': {}, 'dic_list': []})

            # Three passes: one pass can expose a new cut point, e.g.
            # '水岸阳光B小区b区C幢(C-幢)'.
            name_ = row['name']
            for _ in range(3):
                name_ = name_reduction(name_)

            row['file_line_num'] = file_line_num
            row['name_reduction'] = name_
            counts = district_entry['name_reduction_dic']
            counts[name_] = counts.get(name_, 0) + 1
            city_entry['source_file_sum_city_district'] += 1
            district_entry['dic_list'].append(row)
    return res_dic


def _merge_contained_names(name_counts, rows, reverse):
    """Fold names that contain another name into that shorter name (in place).

    For every (name, row) pair whose two reduced names differ and where one
    contains the other, the shorter name's count is incremented if it is still
    a key, and the longer name is removed from *name_counts*.
    """
    for name in sorted(name_counts, reverse=reverse):
        for row in rows:
            row_name = row['name_reduction']
            if row_name == name:
                continue
            if len(row_name) < len(name):
                shorter, longer = row_name, name
            else:
                shorter, longer = name, row_name
            if shorter not in longer:
                continue
            if shorter in name_counts:
                name_counts[shorter] += 1
            if longer in name_counts:
                del name_counts[longer]


def data_self_reduction(self_dic_):
    """Merge overlapping reduced names inside every district and return *self_dic_*.

    The dictionary is modified in place. Two passes are made, over the names
    in ascending and then descending order, exactly as in the original; the
    rows keep their own 'name_reduction' value, so rows whose name was removed
    no longer match any key afterwards.
    """
    for province in self_dic_:
        for city in self_dic_[province]:
            for district_entry in self_dic_[province][city]['district_dic'].values():
                name_counts = district_entry['name_reduction_dic']
                rows = district_entry['dic_list']
                _merge_contained_names(name_counts, rows, reverse=False)
                _merge_contained_names(name_counts, rows, reverse=True)
    return self_dic_


def _ratio(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is 0."""
    return numerator / denominator if denominator else 0


def _statistics_row(province, city, row_original_num, row_res_num,
                    name_reduction_num, name_reduction_single_num):
    """Build one row of the statistics sheet (column order of the title row)."""
    return [province, city, row_original_num, row_res_num,
            _ratio(row_res_num, row_original_num), row_res_num,
            name_reduction_num, name_reduction_single_num,
            _ratio(name_reduction_single_num, row_res_num),
            _ratio(row_res_num, name_reduction_num)]


def gen_file(data_file_reduction_dic, file_name, file_title_str, file_title_str_statistics):
    """Write the reduced rows and the per-city statistics to two .xlsx files.

    Files are named '<file_name><timestamp>.xlsx' and
    '<file_name><timestamp>-统计.xlsx'. A row is written when its reduced name
    is still a key of its district's name dictionary, unless its canonical
    name is itself a key with a count above 1.
    """
    from openpyxl import Workbook

    wb = Workbook()
    worksheet = wb.active
    worksheet.append(file_title_str.replace(' ', '').split(','))
    wb_statistics = Workbook()
    worksheet_statistics = wb_statistics.active
    worksheet_statistics.append(file_title_str_statistics.replace(' ', '').split(','))

    # Defined up front so the file name exists even when no row is written;
    # it is refreshed per row below, as in the original.
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    name_reduction_all_num, row_original_all_num = 0, 0
    row_res_all_num, name_reduction_single_all_num = 0, 0

    for province in data_file_reduction_dic:
        for city in data_file_reduction_dic[province]:
            name_reduction_num, row_original_num, row_res_num = 0, 0, 0
            single_names = set()
            district_dic = data_file_reduction_dic[province][city]['district_dic']
            for district in district_dic:
                name_reduction_dic = district_dic[district]['name_reduction_dic']
                dic_list = district_dic[district]['dic_list']
                name_reduction_num += len(name_reduction_dic)
                row_original_num += len(dic_list)

                # Group rows once instead of rescanning all rows per name;
                # row order inside each group is the original file order.
                rows_by_name = {}
                for row in dic_list:
                    rows_by_name.setdefault(row['name_reduction'], []).append(row)

                for name_reduction in name_reduction_dic:
                    for i in rows_by_name.get(name_reduction, ()):
                        if name_reduction_dic[name_reduction] == 1:
                            single_names.add(name_reduction)
                        name_original = i['name']
                        name_format = name_reduction_format(name_original)
                        if name_reduction_dic.get(name_format, 0) > 1:
                            continue
                        gd_type = i['type']
                        show_gd_type_dic = gen_show_gd_type_dic_fromstr(gd_type, file_name)
                        show_addr_num = '%s%s' % (i['street'], i['number'])
                        show_addr = show_addr_num
                        if len(i['address'].strip()) > 2:
                            show_addr = i['address']
                        row_res_num += 1
                        localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
                        worksheet.append([
                            province, city, district, 'todo', 'todo',
                            name_reduction, name_original, show_addr, show_addr_num,
                            gd_type,
                            show_gd_type_dic['gd_type_0'],
                            show_gd_type_dic['gd_type_1'],
                            show_gd_type_dic['gd_type_2'],
                            i['locationx'], i['locationy'], i['gpsx'], i['gpsy'],
                            i['bdx'], i['bdy'], localtime_,
                        ])

            name_reduction_single_num = len(single_names)
            worksheet_statistics.append(_statistics_row(
                province, city, row_original_num, row_res_num,
                name_reduction_num, name_reduction_single_num))
            # The original added row_original_num twice, doubling the total.
            row_original_all_num += row_original_num
            row_res_all_num += row_res_num
            name_reduction_all_num += name_reduction_num
            name_reduction_single_all_num += name_reduction_single_num

    worksheet_statistics.append(_statistics_row(
        'ALL', 'ALL', row_original_all_num, row_res_all_num,
        name_reduction_all_num, name_reduction_single_all_num))
    wb_statistics.save('%s%s%s' % (file_name, localtime_, '-统计.xlsx'))
    wb.save('%s%s%s' % (file_name, localtime_, '.xlsx'))


def main():
    """Run the whole pipeline for the residential and the office-building CSV files."""
    for city_, tags_ in load_zhitongzi_city_dic(ZHITONGZI_FILE).items():
        ZHITONGZI_CITY_DIC[city_] = tags_
    target_city_list[:] = load_target_city_list(FEXCEL)

    data_file_extract_house = data_file_extract(file_house)
    data_file_extract_bizbuilding = data_file_extract(file_bizbuilding)
    data_self_reduction_house = data_self_reduction(data_file_extract_house)
    data_self_reduction_bizbuilding = data_self_reduction(data_file_extract_bizbuilding)
    gen_file(data_self_reduction_house, file_gen_house,
             file_title_str, file_title_str_statistics)
    gen_file(data_self_reduction_bizbuilding, file_gen_bizbuilding,
             file_title_str, file_title_str_statistics)


if __name__ == '__main__':
    main()
标签:man local pen location 北京 false replace else 城市
原文地址:http://www.cnblogs.com/yuanjiangw/p/7359661.html