标签:上网 记录 coding 类型转换 终端 pre frame 数据清洗 lan
"""Clean a network-traffic CSV export and summarise traffic per user/terminal/service/app.

Steps performed when run as a script:
1. Drop rows that are entirely empty.
2. Drop rows whose ``user`` value is missing.
3. Sum upstream and downstream traffic per (user, tm_type, serv, app, record_time).
4. Count the records in each of those groups.
"""
import csv
import datetime
import sys

import numpy as np
import pandas as pd
import pymysql
from pandas import DataFrame
from sqlalchemy import create_engine
from sqlalchemy.orm import scoped_session, sessionmaker

# NOTE(review): the database credentials are hard-coded in this URL; consider
# reading them from the environment or a config file instead.
engine = create_engine('mysql+pymysql://root:123456@localhost/languid?charset=utf8')
db = scoped_session(sessionmaker(bind=engine))

# Names given to the selected CSV fields, in order: account, terminal type,
# service, app, record time, upstream traffic, downstream traffic.
col_list = ['user', 'tm_type', 'serv', 'app', 'record_time', 'up_flux', 'down_flux']
filepath = 'C://百度网盘//20181007_flux_40.csv'

# Columns that identify one aggregation group.
GROUP_KEYS = ['user', 'tm_type', 'serv', 'app', 'record_time']


def load_flux(path, nrows=None):
    """Read the traffic CSV at *path* and return the cleaned frame.

    Only the seven fields at positions 3, 10, 11, 12, 15, 16 and 17 are read
    and they are named after ``col_list``. Malformed lines are skipped. Rows
    that are entirely empty, or that have no ``user``, are dropped, and
    ``record_time`` is overwritten with the fixed date ``'2019-5-28'``.

    *nrows* limits how many rows are read; ``None`` reads the whole file.
    """
    raw = pd.read_csv(
        path,
        sep=',',
        # `error_bad_lines=False` was removed in pandas 2.0; this is its
        # replacement (available since pandas 1.3).
        on_bad_lines='skip',
        usecols=[3, 10, 11, 12, 15, 16, 17],
        names=col_list,
        engine='python',
        encoding='utf-8',
        nrows=nrows,
    )
    cleaned = raw.dropna(how='all').dropna(subset=['user'])
    # assign() returns a new frame, so no view of `raw` is written to.
    return cleaned.assign(record_time='2019-5-28')


def summarize_flux(df):
    """Aggregate *df* per ``GROUP_KEYS``.

    The returned frame is indexed by the group keys and has three columns:
    ``counts`` (number of non-null ``up_flux`` entries in the group),
    ``up_flux_sum`` and ``down_flux_sum``.
    """
    return df.groupby(GROUP_KEYS).agg(
        counts=('up_flux', 'count'),
        up_flux_sum=('up_flux', 'sum'),
        down_flux_sum=('down_flux', 'sum'),
    )


if __name__ == '__main__':
    df_flux = load_flux(filepath, nrows=22222)
    result = summarize_flux(df_flux)
    print(result)
1. 删除数据中的全空行。 2. 删除 user 列为空值的行。 3. 统计当天每人、每种终端、每项服务、每个 app 的上行流量与下行流量总量。 4. 统计当天每人、每种终端、每项服务、每个 app 的记录次数。
"""Clean a network-traffic CSV export, normalise terminal types and join with student data.

Steps performed when run as a script:
1. Load the traffic CSV, drop empty rows and rows without a ``user``.
2. Sum upstream/downstream traffic and count records per
   (user, tm_type, serv, app, record_time).
3. Remove rows whose ``user`` matches the ``10\\.`` pattern (IP-like accounts),
   convert ``down_flux_sum`` to float and normalise ``tm_type`` labels.
4. Inner-join the result with the student table and group it by ``class_code``.
"""
import csv
import datetime
import sys

import numpy as np
import pandas as pd
import pymysql
from pandas import DataFrame
from sqlalchemy import create_engine
from sqlalchemy.orm import scoped_session, sessionmaker

# NOTE(review): the database credentials are hard-coded in this URL; consider
# reading them from the environment or a config file instead.
engine = create_engine('mysql+pymysql://root:123456@localhost/languid?charset=utf8')
db = scoped_session(sessionmaker(bind=engine))

# Names given to the selected CSV fields, in order: account, terminal type,
# service, app, record time, upstream traffic, downstream traffic.
col_list = ['user', 'tm_type', 'serv', 'app', 'record_time', 'up_flux', 'down_flux']
student_list = ['user', 'age', 'low', 'high', 'time']
filepath = 'C://百度网盘//20181007_flux_40.csv'
filepath2 = 'C://百度网盘//v_student_net.csv'

# Columns that identify one aggregation group.
GROUP_KEYS = ['user', 'tm_type', 'serv', 'app', 'record_time']


def load_flux(path):
    """Read the traffic CSV at *path* and return the cleaned frame.

    Only the seven fields at positions 3, 10, 11, 12, 15, 16 and 17 are read
    and they are named after ``col_list``. Malformed lines are skipped. Rows
    that are entirely empty, or that have no ``user``, are dropped, and
    ``record_time`` is overwritten with the fixed date ``'2019-5-28'``.
    """
    raw = pd.read_csv(
        path,
        sep=',',
        # `error_bad_lines=False` was removed in pandas 2.0; this is its
        # replacement (available since pandas 1.3).
        on_bad_lines='skip',
        usecols=[3, 10, 11, 12, 15, 16, 17],
        names=col_list,
        engine='python',
        encoding='utf-8',
    )
    cleaned = raw.dropna(how='all').dropna(subset=['user'])
    # assign() returns a new frame, so no view of `raw` is written to.
    return cleaned.assign(record_time='2019-5-28')


def summarize_flux(df):
    """Aggregate *df* per ``GROUP_KEYS`` and return a flat frame.

    The group keys stay ordinary columns, followed by ``up_flux_sum``,
    ``down_flux_sum`` and ``counts`` (number of non-null ``up_flux`` entries
    in the group).
    """
    return df.groupby(GROUP_KEYS, as_index=False).agg(
        up_flux_sum=('up_flux', 'sum'),
        down_flux_sum=('down_flux', 'sum'),
        counts=('up_flux', 'count'),
    )


def clean_flux(summary):
    """Return a cleaned copy of the aggregated frame *summary*.

    * Rows whose ``user`` matches the regex ``10\\.`` are removed.
    * ``down_flux_sum`` is converted to float.
    * ``tm_type`` is normalised, in this order: the mobile-terminal path is
      replaced by ``'mobile'``; any value containing ``'多终端'`` becomes
      ``'多终端'``; any value containing ``'未知类型'`` becomes ``'Unknown'``;
      ``'/PC/MAC PC'`` is replaced by ``'PC'``.
    """
    is_ip_user = summary['user'].str.contains(r'10\.', na=False)
    # Explicit copy: the frame is modified below and must not be a view of
    # `summary`.
    cleaned = summary[~is_ip_user].copy()
    cleaned['down_flux_sum'] = cleaned['down_flux_sum'].astype(float)

    # Assign the result back rather than calling replace(inplace=True) on a
    # column selection, which is not guaranteed to update the parent frame.
    cleaned['tm_type'] = cleaned['tm_type'].replace(
        r'\/移动终端\/\w*系统移动终端', 'mobile', regex=True
    )
    cleaned.loc[cleaned['tm_type'].str.contains('多终端', na=False), 'tm_type'] = '多终端'
    cleaned.loc[cleaned['tm_type'].str.contains('未知类型', na=False), 'tm_type'] = 'Unknown'
    cleaned['tm_type'] = cleaned['tm_type'].replace(r'\/PC\/MAC PC', 'PC', regex=True)
    return cleaned


def load_students(path):
    """Read the student CSV at *path*; the first row is the header and the first column the index."""
    return pd.read_csv(
        path,
        sep=',',
        on_bad_lines='skip',
        engine='python',
        encoding='utf-8',
        header=0,
        index_col=[0],
    )


if __name__ == '__main__':
    df_flux = load_flux(filepath)
    result = summarize_flux(df_flux)
    # The cleaned frame is built from the sums only, without `counts`.
    result_1 = clean_flux(result.drop(columns=['counts']))

    v_student = load_students(filepath2)
    unique_value = v_student['username'].nunique()
    v_student = v_student.rename(columns={'username': 'user'})
    # No `on=` is given, so the join uses every column name the two frames
    # share — presumably just `user`; verify against the student CSV.
    student_merge = pd.merge(v_student, result_1, how='inner')

    # NOTE(review): this is a groupby selection with no aggregation applied;
    # the intended aggregation (e.g. a sum) is not shown here.
    student_group = student_merge.groupby(['class_code'], as_index=False)['down_flux_sum']
    student_group_2 = student_merge.groupby(['class_code'], as_index=False)['up_flux_sum'].count()
    student_group_3 = student_group_2.rename(columns={'up_flux_sum': 'counts'})
1.用正则表达式以及 loc 清洗 tm_type 列的数据,做以下更改:
系统移动终端 → mobile
/PC/MAC PC → PC
多终端 → 多终端
未知类型 → Unknown
2.IP 数据过滤:过滤掉 user 列中值为 IP 地址的数据行。
3.类型转换:将下行流量总量(down_flux_sum)转换为 float 类型。
标签:上网 记录 coding 类型转换 终端 pre frame 数据清洗 lan
原文地址:https://www.cnblogs.com/languid/p/10960559.html