G-->f

import csv
import time
import xlrd
from openpyxl import Workbook
ZHITONGZI_CITY_DIC = {}
f = open(‘直筒子市_东莞中山.txt‘, ‘r‘, encoding=‘utf-8‘)
ZHITONGZI_CITY_DIC[‘东莞市‘] = []
ZHITONGZI_CITY_DIC[‘中山市‘] = []
c = 0
for i in f:
ii = i.replace(‘ ‘, ‘‘).split(‘;‘)
for iii in ii:
iv = iii.split(‘、‘)
if len(iv) > 2:
c += 1
for v in iv:
if v.find(‘(‘) > -1:
v_ = v.split(‘(‘)[1]
elif v.find(‘)‘) > -1:
v_ = v.split(‘)‘)[0]
else:
v_ = v
if c == 1 or c == 2:
ZHITONGZI_CITY_DIC[‘东莞市‘].append(v_)
elif c == 3 or c == 4:
ZHITONGZI_CITY_DIC[‘中山市‘].append(v_)
f.closed
zh_num_list = [‘零‘, ‘一‘, ‘二‘, ‘三‘, ‘四‘, ‘五‘, ‘六‘, ‘七‘, ‘八‘, ‘九‘]
zh_num_zhk_dic = {}
zh_num_numk_dic = {}
for i in range(0, 10, 1):
zh_num = zh_num_list[i]
zh_num_numk_dic[str(i)] = zh_num
zh_num_zhk_dic[zh_num] = str(i)
# 天河区 中石化大厦A塔
nswe_m_list = [‘东‘, ‘西‘, ‘南‘, ‘北‘, ‘中‘]
roman_numerals_12_list = [‘Ⅰ‘, ‘Ⅱ‘, ‘Ⅲ‘, ‘Ⅳ‘, ‘Ⅴ‘, ‘Ⅵ‘, ‘Ⅶ‘, ‘Ⅷ‘, ‘Ⅸ‘, ‘Ⅹ‘, ‘Ⅺ‘, ‘Ⅻ‘]
arabic_numerals_10_list = [str(i) for i in range(0, 10, 1)]
postfix_list = [‘座‘, ‘区‘, ‘栋‘, ‘楼‘, ‘院‘, ‘阁‘, ‘期‘, ‘单元‘, ‘号‘, ‘塔‘, ‘幢‘, ‘馆‘]
alphabet_list = [chr(i).upper() for i in range(97, 123)]
name_split_list = []
del_char_list = [‘.‘, ‘·‘, ‘-‘, ‘ ‘]
del_tail_list = [‘第‘]
suspect_char_list = [‘(‘, ‘(‘]
name_format_replace_dic = {}
name_format_replace_dic[‘ ‘] = ‘‘
name_format_replace_dic[‘+‘] = ‘加‘
gd_paralleling = ‘|‘
gd_separator = ‘;‘
diy_join_tag = ‘||‘
for postfix in postfix_list:
for nswe_m in nswe_m_list:
str_ = ‘%s%s‘ % (nswe_m, postfix)
name_split_list.append(str_)
for numeral in roman_numerals_12_list:
str_ = ‘%s%s‘ % (numeral, postfix)
name_split_list.append(str_)
for alphabet in alphabet_list:
str_ = ‘%s%s‘ % (alphabet.upper(), postfix)
name_split_list.append(str_)
for i in range(0, 9, 1):
str_ = ‘%s%s‘ % (i, postfix)
name_split_list.append(str_)
def zh_num_format(str_):
global zh_num_zhk_dic
for i in zh_num_zhk_dic:
str_ = str_.replace(i, zh_num_zhk_dic[i])
return str_
def replace_zhnum_num(str_):
for i in zh_num_numk_dic:
if str_.find(i) > -1:
str_ = str_.replace(i, zh_num_numk_dic[i])
return str_
def alphabet_upper_format(str_):
global zh_num_zhk_dic
for i in alphabet_list:
str_ = str_.replace(i, i.upper())
return str_
def del_char(str_):
global del_char_list
for i in del_char_list:
str_ = str_.replace(i, ‘‘)
return str_
# ART SPACE
# ‘虹口SOHO‘
# 6A
def del_tail_filter_list(str_, filter_list):
len_ = len(str_)
len__ = len_ - 1
index_ = len_
for i in range(len__, 0, -1):
tail_ = str_[i]
if tail_ in filter_list:
index_ = i
else:
break
return str_[0:index_]
def del_tail_filter(str_):
global arabic_numerals_10_list, alphabet_list
res = del_tail_filter_list(str_, arabic_numerals_10_list)
res = del_tail_filter_list(res, alphabet_list)
return res
# ‘上海加华商务中心A9座‘
def name_reduction_format(original_):
for i in name_format_replace_dic:
original_ = original_.replace(i, name_format_replace_dic[i])
if original_.find(‘(‘) > -1:
original_ = original_.split(‘(‘)[0]
if original_.find(‘(‘) > -1:
original_ = original_.split(‘(‘)[0]
if original_.find(‘-‘) > -1:
original_ = original_.split(‘-‘)[0]
if original_.find(‘、‘) > -1:
original_ = original_.split(‘、‘)[0]
format_ = original_.upper()
format_ = del_char(format_)
format_ = zh_num_format(format_)
format_ = alphabet_upper_format(format_)
if len(format_) < MIN_NAME_LEN:
return original_
return format_
def name_reduction(format_):
global name_split_list
reduction_ = name_reduction_format(format_)
for i in name_split_list:
index_ = reduction_.find(i)
if index_ > -1:
reduction_ = reduction_.split(i)[0]
# break#佳兆业可园六期2区C座湖西路
# 12区
reduction_ = del_tail_filter(reduction_)
reduction_ = replace_zhnum_num(reduction_)
if len(reduction_) < MIN_NAME_LEN:
reduction_ = format_
for i in del_tail_list:
if reduction_[-1:] == i:
reduction_ = reduction_[:-1]
return reduction_
# X大厦(abc
# abc
def chk_cross_name(str_, str__):
if len(str_) > len(str__):
a = str_
str_ = str__
str__ = a
res = 0
if str__.find(str_) > -1:
l = str__.split(‘(‘)
if len(l) > 1:
if l[1].find(str_) > -1:
res = 1
return res
def res_list_str(dic_, dk, filter_list=[‘,‘]):
l = []
for i in dic_[dk]:
l.append(str(i))
str_ = diy_join_tag.join(l)
for i in filter_list:
str_ = str_.replace(i, ‘‘)
str_ = str_.replace(‘\n‘, ‘‘)
return str_
def chk_name_subname(str_, str__):
if len(str_) > len(str__):
a = str__
str__ = str_
str_ = a
if str__.split(str_)[0] == ‘‘:
return 0
return 1
def gen_show_addr(l, district):
len_ = len(l)
res_ = sorted(l, key=lambda l: len_)[len_ - 1]
ll = res_.split(district)
if len(ll) > 1:
res_ = ll[1].strip()
return res_
def gen_gd_type_single_str(gd_type_list, filter_):
gd_type_single_str = ‘‘
for i in gd_type_list:
if i.find(filter_) > -1:
if i.find(gd_paralleling) > -1:
l = i.split(gd_paralleling)
for ii in l:
if ii.find(filter_) > -1:
gd_type_single_str = ii
break
else:
gd_type_single_str = i
break
return gd_type_single_str
def gen_show_gd_type_dic(gd_type_list, filter_):
dic_ = {}
dic_[‘gd_type_list_str‘] = diy_join_tag.join(gd_type_list)
dic_[‘gd_type_0‘] = ‘‘
dic_[‘gd_type_1‘] = ‘‘
dic_[‘gd_type_2‘] = ‘‘
if filter_.find(‘楼宇‘) > -1:
filter_ = ‘楼宇‘
elif filter_.find(‘住宅小区‘) > -1:
filter_ = ‘住宅小区‘
gd_type_single_str = gen_gd_type_single_str(gd_type_list, filter_)
dic_[‘gd_type_0‘], dic_[‘gd_type_1‘], dic_[‘gd_type_2‘] = gd_type_single_str.split(gd_separator)
return dic_
def gen_show_gd_type_dic_fromstr(gd_type_str, filter_):
dic_ = {}
dic_[‘gd_type_0‘], dic_[‘gd_type_1‘], dic_[‘gd_type_2‘] = ‘‘, ‘‘, ‘‘
if filter_.find(‘楼宇‘) > -1:
filter_ = ‘楼宇‘
elif filter_.find(‘住宅小区‘) > -1:
filter_ = ‘住宅小区‘
gd_type_list_paralleling = gd_type_str.split(gd_paralleling)
for gd_type in gd_type_list_paralleling:
if gd_type.find(filter_) > -1:
dic_[‘gd_type_0‘], dic_[‘gd_type_1‘], dic_[‘gd_type_2‘] = gd_type.split(gd_separator)
return dic_
def compute_list(l):
sum_ = 0
for i in l:
i_ = float(i)
sum_ += i_
return sum_ / len(l)
def res_list(dic_, dk):
l = dic_[dk]
return compute_list(l)
target_city_list = []
FEXCEL = ‘【商场任务】28个城市_任务列表_20170727.xlsx‘
data = xlrd.open_workbook(FEXCEL)
table = data.sheets()[0]
nrows = table.nrows
ncols = table.ncols
flag_title = 0
res_dic = {}
source_file_line_num = 0
for i in range(0, nrows):
source_file_line_num += 1
l = table.row_values(i)
if flag_title == 0:
flag_title = 1
continue
city = l[2]
if city not in target_city_list:
target_city_list.append(city)
city_zhixiashi_list = [‘北京市‘, ‘上海市‘, ‘天津市‘, ‘重庆市‘]
filter_city_list = [‘北京市‘, ‘上海市‘, ‘广州市‘, ‘深圳市‘]
file_house = ‘住宅小区.csv‘
file_bizbuilding = ‘楼宇.csv‘
file_gen_house = ‘含北上广深28城市-住宅小区-归约化‘
file_gen_bizbuilding = ‘含北上广深28城市-商住楼宇-归约化‘
file_title_str = ‘province,city,district,商圈,商圈类型,归约后的名,name_original,show_addr,show_addr_num,gd_type,gd_type_1,gd_type_2,gd_type_3,locationx,locationy,gpsx,gpsy,bdx,bdy,localtime‘
file_title_str_statistics = ‘省份,城市,源文件行数,处理后的文件行数,压缩率,参考总建筑区总楼栋数目,总建筑区名字数,参考独栋数,参考独栋数率,参考单建筑区的平均楼栋数目‘
MIN_NAME_LEN = 2
def data_file_extract(file_name):
res_dic = {}
with open(file_name, ‘r‘, encoding=‘utf-8-sig‘) as csvfile:
reader = csv.DictReader(csvfile)
file_line_num = 1
for ordered_dic in reader:
file_line_num += 1
province = ordered_dic[‘province‘]
city = ordered_dic[‘city‘]
district = ordered_dic[‘district‘]
if city.find(‘[‘) > -1:
city = province
if city not in target_city_list:
continue
if province not in res_dic:
res_dic[province] = {}
if city not in res_dic[province]:
res_dic[province][city] = {}
res_dic[province][city][‘source_file_sum_city_district‘] = 0
res_dic[province][city][‘district_dic‘] = {}
if city == ‘东莞市‘:
district_ = ordered_dic[‘addr‘].split(‘东莞市‘)[1]
district = ‘松山湖‘
for tag_ in ZHITONGZI_CITY_DIC[‘东莞市‘]:
if district_.find(tag_) > -1:
district = tag_
if district not in res_dic[province][city][‘district_dic‘]:
res_dic[province][city][‘district_dic‘][district] = {}
# {name_reduction:num}
res_dic[province][city][‘district_dic‘][district][‘name_reduction_dic‘] = {}
res_dic[province][city][‘district_dic‘][district][‘dic_list‘] = []
d = ordered_dic
name_original = d[‘name‘]
name_ = name_reduction(name_original)
name_ = name_reduction(name_)
# 水岸阳光B小区b区C幢(C-幢)
name_ = name_reduction(name_)
d[‘file_line_num‘] = file_line_num
d[‘name_reduction‘] = name_
if name_ not in res_dic[province][city][‘district_dic‘][district][‘name_reduction_dic‘]:
res_dic[province][city][‘district_dic‘][district][‘name_reduction_dic‘][name_] = 0
res_dic[province][city][‘district_dic‘][district][‘name_reduction_dic‘][name_] += 1
res_dic[province][city][‘source_file_sum_city_district‘] += 1
res_dic[province][city][‘district_dic‘][district][‘dic_list‘].append(d)
return res_dic
def data_self_reduction(self_dic_):
for province in self_dic_:
for city in self_dic_[province]:
for district in self_dic_[province][city][‘district_dic‘]:
name_reduction_dic = self_dic_[province][city][‘district_dic‘][district][
‘name_reduction_dic‘]
dic_list = self_dic_[province][city][‘district_dic‘][district][‘dic_list‘]
name_reduction_list = sorted(name_reduction_dic, reverse=False)
for name_reduction in name_reduction_list:
for i in dic_list:
name_reduction_order = i[‘name_reduction‘]
if name_reduction_order == name_reduction:
continue
longer_, shorter_ = name_reduction_order, name_reduction
if len(name_reduction_order) < len(name_reduction):
shorter_, longer_ = name_reduction_order, name_reduction
if longer_.find(shorter_) > -1:
if shorter_ in self_dic_[province][city][‘district_dic‘][district][
‘name_reduction_dic‘]:
self_dic_[province][city][‘district_dic‘][district][
‘name_reduction_dic‘][shorter_] += 1
if longer_ in self_dic_[province][city][‘district_dic‘][district][
‘name_reduction_dic‘]:
del self_dic_[province][city][‘district_dic‘][district][
‘name_reduction_dic‘][longer_]
name_reduction_list = sorted(name_reduction_dic, reverse=True)
for name_reduction in name_reduction_list:
for i in dic_list:
name_reduction_order = i[‘name_reduction‘]
if name_reduction_order == name_reduction:
continue
longer_, shorter_ = name_reduction_order, name_reduction
if len(name_reduction_order) < len(name_reduction):
shorter_, longer_ = name_reduction_order, name_reduction
if longer_.find(shorter_) > -1:
if shorter_ in self_dic_[province][city][‘district_dic‘][district][
‘name_reduction_dic‘]:
self_dic_[province][city][‘district_dic‘][district][
‘name_reduction_dic‘][shorter_] += 1
if longer_ in self_dic_[province][city][‘district_dic‘][district][
‘name_reduction_dic‘]:
del self_dic_[province][city][‘district_dic‘][district][
‘name_reduction_dic‘][longer_]
return self_dic_
def gen_file(data_file_reduction_dic, file_name, file_title_str, file_title_str_statistics):
wb = Workbook()
worksheet = wb.active
file_title_str = file_title_str.replace(‘ ‘, ‘‘)
worksheet.append(file_title_str.split(‘,‘))
name_reduction_all_num, row_original_all_num, row_res_all_num, name_reduction_single_all_num = 0, 0, 0, 0
wb_statistics = Workbook()
worksheet_statistics = wb_statistics.active
worksheet_statistics.append(file_title_str_statistics.replace(‘ ‘, ‘‘).split(‘,‘))
for province in data_file_reduction_dic:
for city in data_file_reduction_dic[province]:
name_reduction_num, row_original_num, row_res_num = 0, 0, 0
name_reduction_single_num_l = []
for district in data_file_reduction_dic[province][city][‘district_dic‘]:
name_reduction_dic = data_file_reduction_dic[province][city][‘district_dic‘][district][
‘name_reduction_dic‘]
dic_list = data_file_reduction_dic[province][city][‘district_dic‘][district][‘dic_list‘]
name_reduction_num += len(name_reduction_dic)
row_original_num += len(dic_list)
for name_reduction in name_reduction_dic:
for i in dic_list:
name_reduction_order = i[‘name_reduction‘]
if name_reduction_order != name_reduction:
continue
if data_file_reduction_dic[province][city][‘district_dic‘][district][
‘name_reduction_dic‘][name_reduction] == 1:
if name_reduction not in name_reduction_single_num_l:
name_reduction_single_num_l.append(name_reduction)
name_original = i[‘name‘]
name_format = name_reduction_format(name_original)
if name_format in name_reduction_dic and name_reduction_dic[name_format] > 1:
continue
gd_type, locationx, locationy, gpsx, gpsy, bdx, bdy = i[‘type‘], i[‘locationx‘], i[‘locationy‘], i[‘gpsx‘], i[‘gpsy‘], i[‘bdx‘], i[‘bdy‘]
show_gd_type_dic = gen_show_gd_type_dic_fromstr(gd_type, file_name)
show_addr_num = ‘%s%s‘ % (i[‘street‘], i[‘number‘])
show_addr = show_addr_num
if len(i[‘address‘].strip()) > 2:
show_addr = i[‘address‘]
row_res_num += 1
localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
xlsx_list = [province, city, district, ‘todo‘, ‘todo‘, name_reduction, name_original, show_addr,
show_addr_num, gd_type, show_gd_type_dic[‘gd_type_0‘],
show_gd_type_dic[‘gd_type_1‘], show_gd_type_dic[
‘gd_type_2‘], locationx, locationy, gpsx, gpsy, bdx, bdy,
localtime_]
worksheet.append(xlsx_list)
name_reduction_single_num = len(name_reduction_single_num_l)
# file_title_str1 = ‘省份,城市,源文件行数,处理后的文件行数,压缩率,参考总建筑区总楼栋数目,总建筑区名字数,参考独栋数,参考独栋数率,参考单建筑区的平均楼栋数目‘
xlsx_list_statistics = [province, city, row_original_num, row_res_num, row_res_num / row_original_num,
row_res_num,
name_reduction_num, name_reduction_single_num,
name_reduction_single_num / row_res_num,
row_res_num / name_reduction_num]
worksheet_statistics.append(xlsx_list_statistics)
row_original_all_num += row_original_num
row_res_all_num += row_res_num
name_reduction_all_num += name_reduction_num
row_original_all_num += row_original_num
name_reduction_single_all_num += name_reduction_single_num
xlsx_list_statistics = [‘ALL‘, ‘ALL‘, row_original_all_num, row_res_all_num, row_res_all_num / row_original_all_num,
row_res_all_num,
name_reduction_all_num, name_reduction_single_all_num,
name_reduction_single_all_num / row_res_all_num,
row_res_all_num / name_reduction_all_num]
worksheet_statistics.append(xlsx_list_statistics)
file_name_save = ‘%s%s%s‘ % (file_name, localtime_, ‘-统计.xlsx‘)
wb_statistics.save(file_name_save)
file_name_save = ‘%s%s%s‘ % (file_name, localtime_, ‘.xlsx‘)
wb.save(file_name_save)
data_file_extract_house = data_file_extract(file_house)
data_file_extract_bizbuilding = data_file_extract(file_bizbuilding)
data_self_reduction_house = data_self_reduction(data_file_extract_house)
data_self_reduction_bizbuilding = data_self_reduction(data_file_extract_bizbuilding)
gen_file(data_self_reduction_house, file_gen_house, file_title_str, file_title_str_statistics)
gen_file(data_self_reduction_bizbuilding, file_gen_bizbuilding, file_title_str, file_title_str_statistics)
原文:http://www.cnblogs.com/yuanjiangw/p/7359661.html