标签:types cli port cos DPoS strong first 读取 文件中
def read_file(file_path, use_col=None, converters=None):
    """Load a tabular file into a DataFrame, dispatching on the file extension.

    Supported extensions are ``.xlsx``, ``.csv`` and ``.pkl``.  The extension
    is searched for anywhere in ``file_path`` (same matching as before).

    For CSV files the read is retried with different settings:

    * ``OSError`` -> retry with the pure-Python parser engine
      (presumably for paths the C parser cannot open; confirm).
    * ``UnicodeDecodeError`` -> retry decoding the file as GBK.

    Any other error propagates unchanged so callers keep the original
    exception type and traceback.

    :param file_path: path of the file to read (str)
    :param use_col: columns to keep, forwarded as ``usecols``
    :param converters: per-column converters, forwarded to pandas
    :return: the loaded ``pandas.DataFrame``
    :raises ValueError: if the extension is not csv / xlsx / pkl
    """
    if re.search(r'\.xlsx', file_path):
        return pd.read_excel(file_path, usecols=use_col, converters=converters)

    if re.search(r'\.csv', file_path):
        try:
            return pd.read_csv(file_path, usecols=use_col,
                               converters=converters)
        except OSError:
            return pd.read_csv(file_path, engine='python', usecols=use_col,
                               converters=converters)
        except UnicodeDecodeError:
            # Let pandas open (and close) the file itself instead of handing
            # it an open() handle that is never closed.
            return pd.read_csv(file_path, encoding='gbk', usecols=use_col,
                               converters=converters)

    if re.search(r'\.pkl', file_path):
        # NOTE(review): unpickling executes arbitrary code; only load pickle
        # files that come from a trusted source.
        return pd.read_pickle(file_path)

    raise ValueError('文件类型错误,仅支持csv、xlsx、pickle格式')


def pivot_table(data, index, values, **kwargs):
    """Thin wrapper around ``pandas.pivot_table`` that normalises dtypes first.

    The ``values`` column(s) are cast to float and the ``index`` column(s) to
    str before pivoting.  The casts are applied to a copy, so the caller's
    DataFrame is left untouched.

    :param data: source DataFrame
    :param index: column name or list of names used as the pivot index
    :param values: column name or list of names to aggregate
    :param kwargs: forwarded to ``pandas.pivot_table`` (e.g. ``columns``,
        ``aggfunc``)
    :return: the pivoted DataFrame
    """
    frame = data.copy()
    frame[values] = frame[values].astype('float')
    frame[index] = frame[index].astype('str')
    return pd.pivot_table(frame, index=index, values=values, **kwargs)
在pivot之后,列索引常常变成二维的 MultiIndex,每个列名是一个 (上层, 下层) 元组,例如:
# Sample of the two-level column labels produced by a pivot: each entry is a
# (top_level, sub_level) tuple; un-pivoted columns have an empty sub level.
cols = [
    ('ftime', ''),
    ('uid', ''),
    ('agent_id', ''),
    ('industry_name_level1', ''),
    ('product_type', ''),
    ('adpos', ''),
    ('trace_cnt', '0'),
    ('trace_cnt', 'PRODUCTTYPE_APPLE_APP_STORE'),
    ('trace_cnt', 'PRODUCTTYPE_JD_URL'),
    ('trace_cnt', 'PRODUCTTYPE_LEAD_AD'),
    ('trace_cnt', 'PRODUCTTYPE_OPEN_PLATFORM_APP_MOB'),
    ('trace_cnt', 'PRODUCTTYPE_WECHAT'),
]
如何处理呢?
def col_rename(list_MultiIndex):
    """Flatten two-level column labels by concatenating sub level + top level.

    :param list_MultiIndex: iterable of (top_level, sub_level) labels
    :return: list of strings, each ``str(sub_level) + str(top_level)``
    """
    return [str(colname[1]) + str(colname[0]) for colname in list_MultiIndex]
def combine_pivot_col_name(single_col):
    """Collapse one two-level column label returned by a pivot into one name.

    :param single_col: a (top_level, sub_level) label
    :return: the top-level name when the sub level is ``''`` or ``'0'``,
        otherwise the sub-level name
    """
    if single_col[1] in ('', '0'):
        return single_col[0]
    return single_col[1]
>> ['ftime', 'uid', 'agent_id', 'industry_name_level1', 'product_type', 'adpos', 'trace_cnt', 'PRODUCTTYPE_APPLE_APP_STORE', 'PRODUCTTYPE_JD_URL', 'PRODUCTTYPE_LEAD_AD', 'PRODUCTTYPE_OPEN_PLATFORM_APP_MOB', 'PRODUCTTYPE_WECHAT']
# Inline way to flatten the (top_level, sub_level) tuples in `cols`.
# The sub level is kept unless it is '' or '0'; the previous
# `x[len(x[0]) < len(x[1])]` trick picked whichever label was longer and so
# silently dropped any sub-level name shorter than its top-level name.
b = [c[0] if c[1] in ('', '0') else c[1] for c in cols]
# >> ['ftime', 'uid', 'agent_id', 'industry_name_level1', 'product_type',
#     'adpos', 'trace_cnt', 'PRODUCTTYPE_APPLE_APP_STORE',
#     'PRODUCTTYPE_JD_URL', 'PRODUCTTYPE_LEAD_AD',
#     'PRODUCTTYPE_OPEN_PLATFORM_APP_MOB', 'PRODUCTTYPE_WECHAT']
创建一个对象,是一个完整的个体,有属性和方法等接口
# Flat report script: load the revenue-day CSV and the mapping tables,
# normalise key columns to str, filter by industry / date selectors, then
# reduce to one row per (fuid, agent_id) enriched with the official-account
# nickname.
# Expects `industry`, `Y`, `quarter`, `month` and `yweek` to be defined by the
# surrounding script; a falsy selector (None or empty) means "no filter".
# Bare `<frame>.<column>.unique()` expression statements from the original
# were dropped: their results were never used.
revenueday_select_col = [
    'ftime', 'fuid', 'adv_name', 'agent_id', 'agent_name', 'appid_id',
    'KPI_first_ind', 'new_first_industry', 'new_second_industry',
    'show', 'click', 'real_cost', 'is_smb', 'adv_sale_tag_name', 'sign',
    'f_year', 'f_month', 'f_M', 'f_Q', 'f_sweek', 'f_yweek', 'f_YW',
    'first_year', 'first_month', 'first_sweek',
    'product', 'flow_name_level2', 'product2', 'R_Group', 'R_emp',
    'big_area', 'province_manual', 'city',
    'Q_new_old', 'is_tc', 'lingyu', 'is_industry_one', 'is_industry_two',
    'first_cost_date', 'bidtype', 'CZ_amount', 'year_new_old',
]

revenueday = pd.read_csv(
    r'E:/行业效果数据和OCPA数据/数据源/revenueday/revenue_new_day.csv',
    encoding='gbk',
    usecols=revenueday_select_col,
)
revenueday = revenueday.rename(columns={
    'appid_id': 'wechatappid',
    'new_first_industry': 'three_ind1',
    'new_second_industry': 'three_ind2',
    'is_smb': 'is_smb_original',
    'show': 'exposure_cnt',
    'click': 'click_cnt',
    'product': 'type',
    'province_manual': 'province_area_manual',
})

product_type_yinshe = pd.read_excel(r'E:/行业效果数据和OCPA数据/映射表/产品类型映射表.xlsx')
track_yinshe = pd.read_excel(r'E:/行业效果数据和OCPA数据/映射表/效果映射表.xlsx')
flow_yinshe = pd.read_excel(r'E:/行业效果数据和OCPA数据/映射表/站点映射表.xlsx')
gzh_yinshe = pd.read_csv(r'E:/行业效果数据和OCPA数据/映射表/公众号名称.csv',
                         encoding='gbk', usecols=['fuid', '昵称'])
desttype_yinshe = pd.read_excel("E:/行业效果数据和OCPA数据/映射表/落地页映射表.xlsx")
creative_size_yinshe = pd.read_excel("E:/行业效果数据和OCPA数据/映射表/素材尺寸映射表.xlsx")

# ---- Mapping tables: print a summary and cast each key column to str ----
product_type_yinshe.info()
product_type_yinshe['product_type'] = product_type_yinshe['product_type'].astype(str)

track_yinshe.info()
track_yinshe['acttion_track_type'] = track_yinshe['acttion_track_type'].astype(str)

gzh_yinshe.info()
# Go through int64 first so the id has no trailing ".0" when it becomes text.
gzh_yinshe['fuid'] = gzh_yinshe['fuid'].astype('int64').astype(str)
gzh_yinshe = gzh_yinshe.drop_duplicates(subset=['fuid'], keep='last')

desttype_yinshe.info()
desttype_yinshe['desttype'] = desttype_yinshe['desttype'].astype(str)

creative_size_yinshe.info()
creative_size_yinshe['creative_size'] = creative_size_yinshe['creative_size'].astype(str)

flow_yinshe.info()

# ---- revenueday: clean, filter, aggregate ----
revenueday.info()
# Treat missing / placeholder values as 0.
revenueday = revenueday.replace([np.nan, np.inf, 'NA', '-', 'nan'], [0, 0, 0, 0, 0])
if revenueday['fuid'].dtypes != 'object':
    # Was `revenueday[:, 'fuid']`, which is not valid DataFrame indexing.
    revenueday['fuid'] = revenueday['fuid'].astype(str)
for date_col in ('f_year', 'f_yweek', 'f_month'):
    revenueday[date_col] = revenueday[date_col].astype('int64').astype(str)

is_ind = revenueday['KPI_first_ind'].isin(industry)
is_adv_sale_tag_name = revenueday['adv_sale_tag_name'].isin(['地方站'])
selector = is_ind & ~is_adv_sale_tag_name
# Apply each date filter only when the selector is non-empty; the previous
# `is not None` check let an empty list through and then failed on `&=`.
for date_col, wanted in (('f_year', Y), ('f_Q', quarter),
                         ('f_month', month), ('f_yweek', yweek)):
    if wanted:
        selector &= revenueday[date_col].isin(wanted)
# Copy so the column assignments below do not target a slice of the original.
revenueday = revenueday.loc[selector, :].copy()

revenueday['agent_id'] = revenueday['agent_id'].astype(str)
revenueday['fuid'] = revenueday['fuid'].astype(str)

revnueday_groupby_scol = [
    'fuid', 'wechatappid', 'adv_name', 'agent_id', 'agent_name',
    'KPI_first_ind', 'three_ind1', 'three_ind2', 'is_smb_original',
    'big_area', 'province_area_manual',
]
# The aggregation is only used to collapse rows to one per key combination;
# the summed cost itself is discarded right after.
revenueday = (
    revenueday.groupby(revnueday_groupby_scol)
    .agg({'real_cost': 'sum'})
    .reset_index()
    .fillna('0')
    .drop(columns=['real_cost'])
)
revenueday = revenueday.loc[~revenueday['fuid'].isin(['0', 0]), :]
revenueday.info()
gzh_yinshe.info()

# Make every column text so the merge key dtypes match.
revenueday = revenueday.astype(str)
revenueday = pd.merge(revenueday, gzh_yinshe, on=['fuid'], how='left')
revenueday = revenueday.rename(columns={'昵称': '公众号名称'})
# Assign the result back: fillna(inplace=True) on a `.loc` slice is not
# guaranteed to write through to the frame.
revenueday['公众号名称'] = revenueday['公众号名称'].fillna('0')
revenueday = revenueday.drop_duplicates(subset=['fuid', 'agent_id'], keep='last')
revenueday.info()
revenueday = revenueday.astype(str)

# ---- GDT data ----
# ---- greenspan ----
revenueday_id = revenueday.fuid.unique().tolist()
class RevenueDay:
    """Load and filter the weekly-report revenue data source.

    The pickle file is read once, when the class body is executed, and stored
    in the class attribute ``REVENUE_DATA``; every instance filters that
    shared frame.

    :param quarter: quarter selector (str, int or list), matched against
        the ``f_Q`` column
    :param year: year selector (str, int or list), matched against ``f_year``
    :param kwargs: optional selectors ``industry``, ``month`` and ``week``
        (str, int or list each)
    """

    use_col = [
        'ftime', 'fuid', 'adv_name', 'agent_id', 'agent_name', 'appid_id',
        'KPI_first_ind', 'new_first_industry', 'new_second_industry',
        'show', 'click', 'real_cost', 'is_smb2', 'adv_sale_tag_name', 'sign',
        'f_year', 'f_month', 'f_M', 'f_Q', 'f_sweek', 'f_yweek', 'f_YW',
        'first_year', 'first_month', 'first_sweek',
        'product', 'flow_name_level2', 'product2', 'R_Group', 'R_emp',
        'big_area', 'province_manual', 'city',
        'Q_new_old', 'is_tc', 'first_cost_date', 'CZ_amount',
    ]
    # Currently excluded columns:
    # 'lingyu', 'is_industry_one', 'is_industry_two', 'bidtype', 'year_new_old'

    # Grouping key shared by both public properties.
    _GROUP_BY_COLS = [
        'fuid', 'wechatappid', 'adv_name', 'agent_id', 'agent_name',
        'KPI_first_ind', 'three_ind1', 'three_ind2', 'is_smb_original',
        'big_area', 'province_area_manual',
    ]

    REVENUE_DATA = read_file(
        os.path.join(DATA_CENTER['REVENUE_DATA'], 'revenue_day.pkl'))[use_col]
    REVENUE_DATA = REVENUE_DATA.rename(columns={
        'appid_id': 'wechatappid',
        'new_first_industry': 'three_ind1',
        'new_second_industry': 'three_ind2',
        'is_smb2': 'is_smb_original',
        'show': 'exposure_cnt',
        'click': 'click_cnt',
        'product': 'type',
        'province_manual': 'province_area_manual',
    })
    # Treat missing / placeholder values as 0.
    REVENUE_DATA = REVENUE_DATA.replace(
        [np.nan, np.inf, 'NA', '-', 'nan'], [0, 0, 0, 0, 0])
    # Go through int64 first so ids and dates have no trailing ".0" as text.
    REVENUE_DATA[['fuid', 'f_year', 'f_yweek', 'f_month']] = REVENUE_DATA[
        ['fuid', 'f_year', 'f_yweek', 'f_month']].astype('int64').astype(str)
    print('读入revenueday')

    def __init__(self, quarter, year, **kwargs):
        self.quarter = quarter
        self.year = year
        self.revenueday = RevenueDay.REVENUE_DATA
        self.kwargs = kwargs

    def charge_type(self, key):
        """Normalise a selector to a list.

        :param key: a str, an int or a list
        :return: ``[key]`` for a str, ``[str(key)]`` for an int, the list
            itself for a list
        :raises KeyError: for any other type
        """
        if isinstance(key, str):
            return [key]
        if isinstance(key, int):
            return [str(key)]
        if isinstance(key, list):
            return key
        raise KeyError('无法识别键值!')

    def check_variable_type(self):
        """Normalise ``quarter``, ``year`` and every keyword selector to lists."""
        for key, value in self.kwargs.items():
            self.kwargs[key] = self.charge_type(value)
        self.quarter = self.charge_type(self.quarter)
        self.year = self.charge_type(self.year)

    def get_select_array(self):
        """Build the boolean row mask for the configured selectors.

        Starts from an all-True mask and narrows it with every non-empty
        selector, so an empty ``year`` no longer ends up as the mask itself.

        :return: boolean Series aligned with ``self.revenueday``
        """
        sel_col_mapping = {
            'f_year': self.year,
            'KPI_first_ind': self.kwargs.get('industry'),
            'f_Q': self.quarter,
            'f_month': self.kwargs.get('month'),
            # Was 'f_week', which is not among the loaded columns; 'f_yweek'
            # is the week column that gets normalised to str above.
            'f_yweek': self.kwargs.get('week'),
        }
        mask = pd.Series(True, index=self.revenueday.index)
        for sel_col, variable in sel_col_mapping.items():
            if variable:
                mask &= self.revenueday[sel_col].isin(variable)
        return mask

    def get_revenue_data(self, group_by_cols):
        """Filter, group and enrich the revenue data.

        Rows are filtered with :meth:`get_select_array`, collapsed to one row
        per ``group_by_cols`` combination (ordered by ``fuid`` then summed
        ``real_cost``; the cost column is then dropped), rows whose ``fuid``
        is 0 are removed, and the official-account name is merged in as
        ``app_name``.

        :param group_by_cols: list of columns to group by
        :return: DataFrame whose columns are all str
        """
        self.check_variable_type()
        revenue = self.revenueday.loc[self.get_select_array(), :]
        revenue = (
            revenue.groupby(group_by_cols)
            .agg({'real_cost': 'sum'})
            .reset_index()
            .fillna('0')
            .sort_values(by=['fuid', 'real_cost'])
            .drop(columns=['real_cost'])
        )
        revenue = revenue.loc[~revenue['fuid'].isin(['0', 0]), :]
        revenue = pd.merge(revenue, OfficialAccount().official_account_mapping,
                           on=['fuid'], how='left')
        # Assign back: fillna(inplace=True) on a `.loc` slice is not
        # guaranteed to write through to the frame.
        revenue['公众号名称'] = revenue['公众号名称'].fillna('0')
        revenue = revenue.rename(columns={'公众号名称': 'app_name'})
        return revenue.astype(str)

    @property
    def revenue_data(self):
        """Revenue rows de-duplicated on (fuid, agent_id), keeping the last."""
        revenue = self.get_revenue_data(self._GROUP_BY_COLS)
        return revenue.drop_duplicates(subset=['fuid', 'agent_id'], keep='last')

    @property
    def revenue_data_unique_fuid(self):
        """Revenue rows de-duplicated on fuid only, keeping the last."""
        revenue = self.get_revenue_data(self._GROUP_BY_COLS)
        return revenue.drop_duplicates(subset=['fuid'], keep='last')

    # @property
    # def revenue_id(self):
    #     return self.revenue_data['fuid'].unique().tolist()
# Each mapping table is read from a hard-coded absolute path.
product_type_yinshe = pd.read_excel(r'E:/行业效果数据和OCPA数据/映射表/产品类型映射表.xlsx')
track_yinshe = pd.read_excel(r'E:/行业效果数据和OCPA数据/映射表/效果映射表.xlsx')
flow_yinshe = pd.read_excel(r'E:/行业效果数据和OCPA数据/映射表/站点映射表.xlsx')
这种硬编码路径在一个文件中出现 23 次,5 个文件合计 23 × 5 = 115 次
# Single root for all report data; every other location is derived from it.
PATH = 'E:/code_piggy/kiki_env/my_env/TsToolkit/report/行业效果数据和OCPA数据'
PATH_DATA_SOURCE = os.path.join(PATH, '数据源')
PATH_MAPPING = os.path.join(PATH, '映射表')
PATH_RESULT = os.path.join(PATH, '结果表')

# Build file paths with os.path.join as well, matching the directory
# constants above, instead of concatenating '/...' by hand.
product_type_yinshe = pd.read_excel(
    os.path.join(PATH_MAPPING, '产品类型映射表.xlsx'), dtype=str)
track_yinshe = pd.read_excel(
    os.path.join(PATH_MAPPING, '效果映射表.xlsx'), dtype=str)
from settings import DATA_CENTER, PATH_MATCHED, PATH_SOURCE
# Distinct fuid values of the filtered revenue frame.
revenueday_id = revenueday.fuid.unique().tolist()
# Directory listing of the greenspan data folder.
filehome = greenspan_file_name
fileall = os.listdir(filehome)
=, -, +=, ==, in, is not, and ...
# PascalCase (CapWords): PEP 8 reserves this style for class names.
RevenueData = pd.DataFrame()
# camelCase (mixedCase): not the PEP 8 convention for variables.
revenueData = pd.DataFrame()
# snake_case: the PEP 8 convention for variables and functions.
revenue_data = pd.DataFrame()
标签:types cli port cos DPoS strong first 读取 文件中
原文地址:https://www.cnblogs.com/pikiki/p/13092409.html