码迷,mamicode.com
首页 > 其他好文 > 详细

pandas数据清洗(二)

时间:2019-06-01 19:40:33      阅读:97      评论:0      收藏:0      [点我收藏+]

标签:上网   记录   coding   类型转换   终端   pre   frame   数据清洗   lan   

import pandas as pd
import numpy as np
from pandas import DataFrame
import datetime
import sys
import pymysql
import csv
from sqlalchemy import create_engine
from sqlalchemy.orm import scoped_session, sessionmaker


# db = pymysql.connect(localhost, root, 123456, languid)
engine = create_engine(mysql+pymysql://root:123456@localhost/languid?charset=utf8)
db = scoped_session(sessionmaker(bind=engine))


col_list = [user, tm_type, serv, app, record_time, up_flux, down_flux]#上网账号#终端类型#服务#app#记录时间#上行流量#下行流量

filepath=C://百度网盘//20181007_flux_40.csv
# def data_deal(filepath):
if __name__ == __main__:
    df_flux = pd.read_csv(filepath, sep=,, error_bad_lines=False, usecols=[3, 10, 11, 12, 15, 16, 17], names=col_list,engine=python,encoding = "utf-8",nrows=22222)
    df_flux.dropna(how=all,inplace=True)
    df_flux.dropna(subset=[user],inplace=True,axis=0)
    df_flux[record_time]=2019-5-28
    df_flux2 = df_flux.groupby(by=[user,tm_type,serv,app,record_time])[up_flux,down_flux].sum()
    df_flux3 = df_flux.groupby(by=[user, tm_type, serv, app, record_time]).count()
    df_flux4 = df_flux3.drop([down_flux], axis=1)
    df_flux5 = df_flux4.rename(columns={up_flux: counts}, inplace=False)
  
    df_flux2=DataFrame(df_flux2)
    df_flux2 = df_flux2.rename(columns={up_flux: up_flux_sum,down_flux:down_flux_sum})
   
    result = pd.concat([df_flux5, df_flux2], axis=1)
    
    print(result)

1.清洗数据中的全空行 2.清洗user列中的空值的行 3.统计上行流量列以及下行流量列的当天每人每终端服务app的总量。 4.统计每人每天终端服务app的次数。

 

import pandas as pd
import numpy as np
from pandas import DataFrame
import datetime
import sys
import pymysql
import csv
from sqlalchemy import create_engine
from sqlalchemy.orm import scoped_session, sessionmaker


# db = pymysql.connect(localhost, root, 123456, languid)
engine = create_engine(mysql+pymysql://root:123456@localhost/languid?charset=utf8)
db = scoped_session(sessionmaker(bind=engine))


col_list = [user, tm_type, serv, app, record_time, up_flux, down_flux]#上网账号#终端类型#服务#app#记录时间#上行流量#下行流量
student_list=[user,age,low,high,time]

filepath=C://百度网盘//20181007_flux_40.csv
filepath2=C://百度网盘//v_student_net.csv
# def data_deal(filepath):
if __name__ == __main__:
    df_flux = pd.read_csv(filepath, sep=,, error_bad_lines=False, usecols=[3, 10, 11, 12, 15, 16, 17], names=col_list,engine=python,encoding = "utf-8")
    df_flux.dropna(how=all,inplace=True)
    df_flux.dropna(subset=[user],inplace=True,axis=0)
    df_flux[record_time]=2019-5-28
    df_flux2 = df_flux.groupby([user, tm_type, serv, app, record_time], as_index=False)[up_flux, down_flux].sum()
    df_flux3 = df_flux.groupby(by=[user, tm_type, serv, app, record_time],as_index=False).count()
    df_flux4 = df_flux3.drop([down_flux], axis=1)
    df_flux5 = df_flux4.rename(columns={up_flux: counts}, inplace=False)
    df_flux2=DataFrame(df_flux2)
    df_flux2 = df_flux2.rename(columns={up_flux: up_flux_sum,down_flux:down_flux_sum})
    result = pd.concat([df_flux2, df_flux5[counts]], axis=1)
    result_1 = df_flux2[~df_flux2[user].str.contains(10\.)]
    result_1[down_flux_sum] = result_1[down_flux_sum].astype(float)
    # result_1[user] = result_1[user].astype(float)
    # qqq = result_1[result_1[user]]
    result_1[tm_type].replace(\/移动终端\/\w*系统移动终端,mobile,regex=True,inplace=True)
    result_1.loc[result_1[tm_type].str.contains(多终端),tm_type]=多终端
    result_1.loc[result_1[tm_type].str.contains(未知类型), tm_type] = Unknown
    result_1[tm_type].replace(\/PC\/MAC PC,PC,regex=True,inplace=True)




    v_student = pd.read_csv(filepath2,sep=,,error_bad_lines=False,engine=python,encoding=utf-8,header=0,index_col=[0])
    unique_value = v_student[username].nunique()
    v_student = v_student.rename(columns={username: user}, inplace=False)
    student_merge=pd.merge(v_student,result_1,how=inner)
    student_group = student_merge.groupby([class_code],as_index=False)[down_flux_sum]
    student_group_2 =student_merge.groupby([class_code],as_index=False)[up_flux_sum].count()
    student_group_3 = student_group_2.rename(columns={up_flux_sum: counts}, inplace=False)

1.用正则表达以及loc清洗tm_type列的数据,做以下更改

系统移动终端=mobile()

pc=pc()

多终端=多终端()

未知=unknown()

2.ip数据过滤() 将user列中的为ip的数据行过滤

3.类型转换=上行流量转化成其他类型()

pandas数据清洗(二)

标签:上网   记录   coding   类型转换   终端   pre   frame   数据清洗   lan   

原文地址:https://www.cnblogs.com/languid/p/10960559.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!