标签:auto 复制 ali pre end ram 异常 info name
# 复制原数据 df3 = df.copy()
df3.info()
<class ‘pandas.core.frame.DataFrame‘> RangeIndex: 3004 entries, 0 to 3003 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 3004 non-null int64 1 name 3004 non-null object 2 gender 2868 non-null object 3 age 2904 non-null float64 4 edu 1073 non-null object 5 custom_amt 3004 non-null object 6 order_date 3004 non-null object dtypes: float64(1), int64(1), object(5) memory usage: 164.4+ KB
# 数值型转字符串类型 df3["id"] = df3["id"].astype("str") # 字符串类型转浮点型 df3["custom_amt"] = df3["custom_amt"].str.strip("¥").astype("float") # 去掉部分字符 # 字符串类型转日期型 df3["order_date"] = pd.to_datetime(df3["order_date"],format="%Y年%m月%d日")
df3.info()
<class ‘pandas.core.frame.DataFrame‘> RangeIndex: 3004 entries, 0 to 3003 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 3004 non-null object 1 name 3004 non-null object 2 gender 2868 non-null object 3 age 2904 non-null float64 4 edu 1073 non-null object 5 custom_amt 3002 non-null float64 6 order_date 3004 non-null datetime64[ns] dtypes: datetime64[ns](1), float64(2), object(4) memory usage: 164.4+ KB
df3.dtypes # 方法是针对特定对象的函数
id object name object gender object age float64 edu object custom_amt float64 order_date datetime64[ns] dtype: object
df3.duplicated().sum()
# 删除重复值,重排索引 df3.drop_duplicates(inplace=True ,ignore_index=True)
df3
3000 rows × 7 columns
id | name | gender | age | edu | custom_amt | order_date | |
---|---|---|---|---|---|---|---|
0 | 890 | 李小胆李l | female | 43.0 | NaN | 2177.94 | 2018-12-25 |
1 | 2391 | 881xt | male | 52.0 | NaN | 2442.18 | 2017-05-24 |
2 | 2785 | haoah | male | 39.0 | NaN | 849.79 | 2018-05-15 |
3 | 1361 | snaen | female | 26.0 | NaN | 2482.22 | 2018-05-16 |
4 | 888 | sue女少 | female | 61.0 | 本科 | 2027.90 | 2018-01-21 |
... | ... | ... | ... | ... | ... | ... | ... |
2995 | 2926 | 宝哒哇是我 | female | NaN | NaN | 542.02 | 2018-09-01 |
2996 | 1616 | 和花花豆豆 | female | 64.0 | 本科 | 2593.38 | 2018-11-03 |
2997 | 1129 | 小清新桉蓝 | female | 61.0 | NaN | 139.68 | 2018-11-07 |
2998 | 674 | xyyx毓 | female | 34.0 | 本科 | 670.89 | 2018-06-08 |
2999 | 229 | 大侦探野荒 | male | 64.0 | NaN | 118.37 | 2018-07-13 |
# 异常值的处理 df3.describe([0.25,0.5,0.75,0.99])
age | custom_amt | |
---|---|---|
count | 2900.000000 | 2998.000000 |
mean | 41.605862 | 1535.732572 |
std | 14.162957 | 842.658685 |
min | 19.000000 | 68.100000 |
25% | 30.000000 | 828.232500 |
50% | 40.000000 | 1510.275000 |
75% | 55.000000 | 2276.527500 |
99% | 65.000000 | 2963.879000 |
max | 230.000000 | 2999.950000 |
df3.loc[df3["age"]>200,:]
id | name | gender | age | edu | custom_amt | order_date | |
---|---|---|---|---|---|---|---|
118 | 2113 | 089bt | female | 230.0 | NaN | 1193.06 | 2018-04-03 |
df3.drop(index=118)
2999 rows × 7 columns
id | name | gender | age | edu | custom_amt | order_date | |
---|---|---|---|---|---|---|---|
0 | 890 | 李小胆李l | female | 43.0 | NaN | 2177.94 | 2018-12-25 |
1 | 2391 | 881xt | male | 52.0 | NaN | 2442.18 | 2017-05-24 |
2 | 2785 | haoah | male | 39.0 | NaN | 849.79 | 2018-05-15 |
3 | 1361 | snaen | female | 26.0 | NaN | 2482.22 | 2018-05-16 |
4 | 888 | sue女少 | female | 61.0 | 本科 | 2027.90 | 2018-01-21 |
... | ... | ... | ... | ... | ... | ... | ... |
2995 | 2926 | 宝哒哇是我 | female | NaN | NaN | 542.02 | 2018-09-01 |
2996 | 1616 | 和花花豆豆 | female | 64.0 | 本科 | 2593.38 | 2018-11-03 |
2997 | 1129 | 小清新桉蓝 | female | 61.0 | NaN | 139.68 | 2018-11-07 |
2998 | 674 | xyyx毓 | female | 34.0 | 本科 | 670.89 | 2018-06-08 |
2999 | 229 | 大侦探野荒 | male | 64.0 | NaN | 118.37 | 2018-07-13 |
# 删除异常值,~取反 df3 = df3.loc[~(df3["age"]>200),:]
# 查看缺失值 df3.isnull().sum() # 缺失值的比例 df3.isnull().sum()/df3.shape[0]
id 0.000000 name 0.000000 gender 0.045348 age 0.033344 edu 0.642214 custom_amt 0.000667 order_date 0.000000 dtype: float64
df3.isnull().mean()
id 0.000000 name 0.000000 gender 0.045348 age 0.033344 edu 0.642214 custom_amt 0.000667 order_date 0.000000 dtype: float64
# 删除某一列 df3.drop(columns="edu")
2999 rows × 6 columns
id | name | gender | age | custom_amt | order_date | |
---|---|---|---|---|---|---|
0 | 890 | 李小胆李l | female | 43.0 | 2177.94 | 2018-12-25 |
1 | 2391 | 881xt | male | 52.0 | 2442.18 | 2017-05-24 |
2 | 2785 | haoah | male | 39.0 | 849.79 | 2018-05-15 |
3 | 1361 | snaen | female | 26.0 | 2482.22 | 2018-05-16 |
4 | 888 | sue女少 | female | 61.0 | 2027.90 | 2018-01-21 |
... | ... | ... | ... | ... | ... | ... |
2995 | 2926 | 宝哒哇是我 | female | NaN | 542.02 | 2018-09-01 |
2996 | 1616 | 和花花豆豆 | female | 64.0 | 2593.38 | 2018-11-03 |
2997 | 1129 | 小清新桉蓝 | female | 61.0 | 139.68 | 2018-11-07 |
2998 | 674 | xyyx毓 | female | 34.0 | 670.89 | 2018-06-08 |
2999 | 229 | 大侦探野荒 | male | 64.0 | 118.37 | 2018-07-13 |
df3.loc[:,df3.isnull().mean()<=0.5]
2999 rows × 6 columns
id | name | gender | age | custom_amt | order_date | |
---|---|---|---|---|---|---|
0 | 890 | 李小胆李l | female | 43.0 | 2177.94 | 2018-12-25 |
1 | 2391 | 881xt | male | 52.0 | 2442.18 | 2017-05-24 |
2 | 2785 | haoah | male | 39.0 | 849.79 | 2018-05-15 |
3 | 1361 | snaen | female | 26.0 | 2482.22 | 2018-05-16 |
4 | 888 | sue女少 | female | 61.0 | 2027.90 | 2018-01-21 |
... | ... | ... | ... | ... | ... | ... |
2995 | 2926 | 宝哒哇是我 | female | NaN | 542.02 | 2018-09-01 |
2996 | 1616 | 和花花豆豆 | female | 64.0 | 2593.38 | 2018-11-03 |
2997 | 1129 | 小清新桉蓝 | female | 61.0 | 139.68 | 2018-11-07 |
2998 | 674 | xyyx毓 | female | 34.0 | 670.89 | 2018-06-08 |
2999 | 229 | 大侦探野荒 | male | 64.0 | 118.37 | 2018-07-13 |
df_dropna = df3.loc[:,df3.isnull().mean()<=0.5].dropna() # 剔除缺失值比例大于一半的列,删除有缺失值的行
df_dropna.isnull().sum()
id 0 name 0 gender 0 age 0 custom_amt 0 order_date 0 dtype: int64
# 填补缺失值 df3["age"] = df3["age"].fillna(df3["age"].mean())
df3["age"].fillna(method="bfill")# method{‘backfill’, ‘bfill’, ‘pad’, ‘ffill’, None}, default None
0 43.000000 1 52.000000 2 39.000000 3 26.000000 4 61.000000 ... 2995 41.540876 2996 64.000000 2997 61.000000 2998 34.000000 2999 64.000000 Name: age, Length: 2999, dtype: float64
# 文本数据的处理 df4 = df_dropna.copy()
df4.isnull().sum()
id 0 name 0 gender 0 age 0 custom_amt 0 order_date 0 dtype: int64
df4.head()
id | name | gender | age | custom_amt | order_date | |
---|---|---|---|---|---|---|
0 | 890 | 李小胆李l | female | 43.0 | 2177.94 | 2018-12-25 |
1 | 2391 | 881xt | male | 52.0 | 2442.18 | 2017-05-24 |
2 | 2785 | haoah | male | 39.0 | 849.79 | 2018-05-15 |
3 | 1361 | snaen | female | 26.0 | 2482.22 | 2018-05-16 |
4 | 888 | sue女少 | female | 61.0 | 2027.90 | 2018-01-21 |
df4["gender"] = df4["gender"].apply(lambda x:x.title()) # 对每一个自变量都执行首字母大写 map
df4["gender"] = df4["gender"].map({"Female":0,"Male":1}) # 分类变量
# 去掉数据集中一些特殊的符号 df["custom_amt"].str.replace("¥","")
0 2177.94 1 2442.18 2 849.79 3 2482.22 4 2027.9 ... 2999 542.02 3000 2593.38 3001 139.68 3002 670.89 3003 118.37 Name: custom_amt, Length: 3004, dtype: object
标签:auto 复制 ali pre end ram 异常 info name
原文地址:https://www.cnblogs.com/Cookie-Jing/p/14838595.html