标签:pyplot 方法 to_date nan obj 尺寸 taf outer 数字
import xgboost as xgb
from sklearn.metrics import accuracy_score
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
pima = pd.read_csv("D:\\xgbtest\\pima-indians-diabetes.csv")
#取某列
pima['Outcome']
0 1
1 0
2 1
3 0
4 1
..
763 0
764 0
765 0
766 1
767 0
Name: Outcome, Length: 768, dtype: int64
#取某行
pima.iloc[0]
Pregnancies 6.000
Glucose 148.000
BloodPressure 72.000
SkinThickness 35.000
Insulin 0.000
BMI 33.600
DiabetesPedigreeFunction 0.627
Age 50.000
Outcome 1.000
Name: 0, dtype: float64
#x选出多个列
pima.loc[:,['Pregnancies','Glucose','BMI']]
|
Pregnancies |
Glucose |
BMI |
0 |
6 |
148 |
33.6 |
1 |
1 |
85 |
26.6 |
2 |
8 |
183 |
23.3 |
3 |
1 |
89 |
28.1 |
4 |
0 |
137 |
43.1 |
... |
... |
... |
... |
763 |
10 |
101 |
32.9 |
764 |
2 |
122 |
36.8 |
765 |
5 |
121 |
26.2 |
766 |
1 |
126 |
30.1 |
767 |
1 |
93 |
30.4 |
768 rows × 3 columns
#某个位置的值,纯整数索引
pima.iloc[[0],[0]]
fig=plt.figure()
ax=fig.add_subplot(111)
x=pima['BloodPressure']
y=pima['Outcome']
ax.plot(x,y,color='lightblue',linewidth=1)
plt.show()
#查看部分data
pima.head()
|
Pregnancies |
Glucose |
BloodPressure |
SkinThickness |
Insulin |
BMI |
DiabetesPedigreeFunction |
Age |
Outcome |
0 |
6 |
148 |
72 |
35 |
0 |
33.6 |
0.627 |
50 |
1 |
1 |
1 |
85 |
66 |
29 |
0 |
26.6 |
0.351 |
31 |
0 |
2 |
8 |
183 |
64 |
0 |
0 |
23.3 |
0.672 |
32 |
1 |
3 |
1 |
89 |
66 |
23 |
94 |
28.1 |
0.167 |
21 |
0 |
4 |
0 |
137 |
40 |
35 |
168 |
43.1 |
2.288 |
33 |
1 |
# 对于非数字类型的数据(字符型 数据),可以使用pima.['这里填带统计的标签'].value_counts()统计分类数目
# 可以得到统计数目,得到平均数、方差等特征(当然是针对数字类型的数据)
pima.describe()
|
Pregnancies |
Glucose |
BloodPressure |
SkinThickness |
Insulin |
BMI |
DiabetesPedigreeFunction |
Age |
Outcome |
count |
768.000000 |
768.000000 |
768.000000 |
768.000000 |
768.000000 |
768.000000 |
768.000000 |
768.000000 |
768.000000 |
mean |
3.845052 |
120.894531 |
69.105469 |
20.536458 |
79.799479 |
31.992578 |
0.471876 |
33.240885 |
0.348958 |
std |
3.369578 |
31.972618 |
19.355807 |
15.952218 |
115.244002 |
7.884160 |
0.331329 |
11.760232 |
0.476951 |
min |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.078000 |
21.000000 |
0.000000 |
25% |
1.000000 |
99.000000 |
62.000000 |
0.000000 |
0.000000 |
27.300000 |
0.243750 |
24.000000 |
0.000000 |
50% |
3.000000 |
117.000000 |
72.000000 |
23.000000 |
30.500000 |
32.000000 |
0.372500 |
29.000000 |
0.000000 |
75% |
6.000000 |
140.250000 |
80.000000 |
32.000000 |
127.250000 |
36.600000 |
0.626250 |
41.000000 |
1.000000 |
max |
17.000000 |
199.000000 |
122.000000 |
99.000000 |
846.000000 |
67.100000 |
2.420000 |
81.000000 |
1.000000 |
# 可以显示该标签下的数据分布,50表示y轴的间隔,以直方图显示,横轴表示数值范围,y轴表示数量
# bins指bin(箱子)的个数,即每张图柱子的个数
# figsize指每张图的尺寸大小
pima.hist(bins=50,figsize=(20,15))
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x0000010F597CC988>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000010F59690348>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000010F5987B7C8>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x0000010F59053208>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000010F59080BC8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000010F590BA5C8>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x0000010F590E8E48>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000010F59113F88>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000010F5911EB88>]],
dtype=object)
# 可以显示该标签下的数值分布,观察分布是否均衡 箱形图或盒图
pima.boxplot(column='Pregnancies')
<matplotlib.axes._subplots.AxesSubplot at 0x10f5a32b888>
# 可以将标签1下的数据再按照标签2进行数值分布绘制
pima.boxplot(column='BloodPressure', by = 'Outcome')
<matplotlib.axes._subplots.AxesSubplot at 0x10f5a9660c8>
#查看两列相关性
pima['BloodPressure'].corr(pima['Age'])
0.23952794642136363
#查看两列相关性
pima.corr()
|
Pregnancies |
Glucose |
BloodPressure |
SkinThickness |
Insulin |
BMI |
DiabetesPedigreeFunction |
Age |
Outcome |
Pregnancies |
1.000000 |
0.129459 |
0.141282 |
-0.081672 |
-0.073535 |
0.017683 |
-0.033523 |
0.544341 |
0.221898 |
Glucose |
0.129459 |
1.000000 |
0.152590 |
0.057328 |
0.331357 |
0.221071 |
0.137337 |
0.263514 |
0.466581 |
BloodPressure |
0.141282 |
0.152590 |
1.000000 |
0.207371 |
0.088933 |
0.281805 |
0.041265 |
0.239528 |
0.065068 |
SkinThickness |
-0.081672 |
0.057328 |
0.207371 |
1.000000 |
0.436783 |
0.392573 |
0.183928 |
-0.113970 |
0.074752 |
Insulin |
-0.073535 |
0.331357 |
0.088933 |
0.436783 |
1.000000 |
0.197859 |
0.185071 |
-0.042163 |
0.130548 |
BMI |
0.017683 |
0.221071 |
0.281805 |
0.392573 |
0.197859 |
1.000000 |
0.140647 |
0.036242 |
0.292695 |
DiabetesPedigreeFunction |
-0.033523 |
0.137337 |
0.041265 |
0.183928 |
0.185071 |
0.140647 |
1.000000 |
0.033561 |
0.173844 |
Age |
0.544341 |
0.263514 |
0.239528 |
-0.113970 |
-0.042163 |
0.036242 |
0.033561 |
1.000000 |
0.238356 |
Outcome |
0.221898 |
0.466581 |
0.065068 |
0.074752 |
0.130548 |
0.292695 |
0.173844 |
0.238356 |
1.000000 |
#生成排名
pima['Pregnancies'].rank()
0 574.5
1 179.0
2 663.5
3 179.0
4 56.0
...
763 722.5
764 298.0
765 521.0
766 179.0
767 179.0
Name: Pregnancies, Length: 768, dtype: float64
# rolling()遍历的是移动窗口的数据 由于窗口大小为3(window),前两个元素有空值,第三个元素的值将是n,n-1和n-2元素的平均值。 下例中 取得均值
pima['Pregnancies'].rolling(window=3).mean()
0 NaN
1 NaN
2 5.000000
3 3.333333
4 3.000000
...
763 9.333333
764 7.000000
765 5.666667
766 2.666667
767 2.333333
Name: Pregnancies, Length: 768, dtype: float64
# 数据补全
df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f','h'],columns=['one', 'two', 'three'])
df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
print (df)
print (df['one'].fillna(df['one'].median())) #用中位数填充缺失值
one two three
a 0.018628 -0.684334 1.280671
b NaN NaN NaN
c 1.043673 -1.065631 -0.107962
d NaN NaN NaN
e 0.321906 1.229281 -0.599733
f 0.094914 -2.588472 -0.032368
g NaN NaN NaN
h 1.349480 -0.996243 1.670260
a 0.018628
b 0.321906
c 1.043673
d 0.321906
e 0.321906
f 0.094914
g 0.321906
h 1.349480
Name: one, dtype: float64
df = pd.DataFrame({'one':[10,20,30,40,50,2000],'two':[1000,0,30,40,50,60]})
#替换 用10 换1000;60换2000.可以同时替换多个值
print (df.replace({1000:10,2000:60}))
one two
0 10 10
1 20 0
2 30 30
3 40 40
4 50 50
5 60 60
ipl_data = {'Team': ['Riders', 'Riders', 'Devils', 'Devils', 'Kings','kings', 'Kings', 'Kings', 'Riders', 'Royals', 'Royals', 'Riders'],
'Rank': [1, 2, 2, 3, 3,4 ,1 ,1,2 , 4,1,2],
'Year': [2014,2015,2014,2015,2014,2015,2016,2017,2016,2014,2015,2017],
'Points':[876,789,863,673,741,812,756,788,694,701,804,690]}
df = pd.DataFrame(ipl_data)
#按'Team'分组,并打印结果
print (df.groupby('Team').groups)
{'Devils': Int64Index([2, 3], dtype='int64'), 'Kings': Int64Index([4, 6, 7], dtype='int64'), 'Riders': Int64Index([0, 1, 8, 11], dtype='int64'), 'Royals': Int64Index([9, 10], dtype='int64'), 'kings': Int64Index([5], dtype='int64')}
#按'Year'分组,并迭代输出
grouped = df.groupby('Year')
for name,group in grouped:
print (name)
print (group)
2014
Team Rank Year Points
0 Riders 1 2014 876
2 Devils 2 2014 863
4 Kings 3 2014 741
9 Royals 4 2014 701
2015
Team Rank Year Points
1 Riders 2 2015 789
3 Devils 3 2015 673
5 kings 4 2015 812
10 Royals 1 2015 804
2016
Team Rank Year Points
6 Kings 1 2016 756
8 Riders 2 2016 694
2017
Team Rank Year Points
7 Kings 1 2017 788
11 Riders 2 2017 690
# 使用get_group()方法,可以选择一个组。
print (grouped.get_group(2014))
Team Rank Year Points
0 Riders 1 2014 876
2 Devils 2 2014 863
4 Kings 3 2014 741
9 Royals 4 2014 701
# 聚合函数为每个组返回单个聚合值
grouped = df.groupby('Year')
print (grouped['Points'].agg(np.mean))
Year
2014 795.25
2015 769.50
2016 725.00
2017 739.00
Name: Points, dtype: float64
# 查看每个分组的大小的方法
grouped = df.groupby('Team')
print (grouped.agg(np.size))
Rank Year Points
Team
Devils 2 2 2
Kings 3 3 3
Riders 4 4 4
Royals 2 2 2
kings 1 1 1
# 通过分组系列,还可以传递函数的列表或字典来进行聚合
grouped = df.groupby('Team')
agg = grouped['Points'].agg([np.sum, np.mean, np.std])
print (agg)
sum mean std
Team
Devils 1536 768.000000 134.350288
Kings 2285 761.666667 24.006943
Riders 3049 762.250000 88.567771
Royals 1505 752.500000 72.831998
kings 812 812.000000 NaN
# 过滤根据定义的标准过滤数据并返回数据的子集
filter = df.groupby('Team').filter(lambda x: len(x) >= 3)
print (filter)
Team Rank Year Points
0 Riders 1 2014 876
1 Riders 2 2015 789
4 Kings 3 2014 741
6 Kings 1 2016 756
7 Kings 1 2017 788
8 Riders 2 2016 694
11 Riders 2 2017 690
# 类似SQL的两个表连接
left = pd.DataFrame({
'id':[1,2,3,4,5],
'Name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
'subject_id':['sub1','sub2','sub4','sub6','sub5']})
right = pd.DataFrame(
{'id':[1,2,3,4,5],
'Name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
'subject_id':['sub2','sub4','sub3','sub6','sub5']})
rs = pd.merge(left,right,on='id') #等值连接
print(rs)
id Name_x subject_id_x Name_y subject_id_y
0 1 Alex sub1 Billy sub2
1 2 Amy sub2 Brian sub4
2 3 Allen sub4 Bran sub3
3 4 Alice sub6 Bryce sub6
4 5 Ayoung sub5 Betty sub5
# 合并多个键上的两个数据框
rs = pd.merge(left,right,on=['id','subject_id'])
print(rs)
id Name_x subject_id Name_y
0 4 Alice sub6 Bryce
1 5 Ayoung sub5 Betty
#left 相当于 LEFT OUTER JOIN
# right 相当于 RIGHT OUTER JOIN
# outer 相当于 FULL OUTER JOIN
# inner 相当于INNER JOIN
rs = pd.merge(left, right, on='subject_id', how='left')
print (rs)
id_x Name_x subject_id id_y Name_y
0 1 Alex sub1 NaN NaN
1 2 Amy sub2 1.0 Billy
2 3 Allen sub4 2.0 Brian
3 4 Alice sub6 4.0 Bryce
4 5 Ayoung sub5 5.0 Betty
#生成时间
#取当前时间
print(pd.datetime.now())
2020-02-29 11:15:30.372039
# 创建一个时间戳
time = pd.Timestamp('2018-11-01')
print(time)
2018-11-01 00:00:00
# 转换成整数或浮动时期。这些的默认单位是纳秒(因为这些是如何存储时间戳的)。
time = pd.Timestamp(1588686880,unit='s')
print(time)
2020-05-05 13:54:40
# 创建一个时间范围
time = pd.date_range("12:00", "23:59", freq="30min").time
print(time)
[datetime.time(12, 0) datetime.time(12, 30) datetime.time(13, 0)
datetime.time(13, 30) datetime.time(14, 0) datetime.time(14, 30)
datetime.time(15, 0) datetime.time(15, 30) datetime.time(16, 0)
datetime.time(16, 30) datetime.time(17, 0) datetime.time(17, 30)
datetime.time(18, 0) datetime.time(18, 30) datetime.time(19, 0)
datetime.time(19, 30) datetime.time(20, 0) datetime.time(20, 30)
datetime.time(21, 0) datetime.time(21, 30) datetime.time(22, 0)
datetime.time(22, 30) datetime.time(23, 0) datetime.time(23, 30)]
# 改变时间的频率
time = pd.date_range("12:00", "23:59", freq="H").time
print(time)
[datetime.time(12, 0) datetime.time(13, 0) datetime.time(14, 0)
datetime.time(15, 0) datetime.time(16, 0) datetime.time(17, 0)
datetime.time(18, 0) datetime.time(19, 0) datetime.time(20, 0)
datetime.time(21, 0) datetime.time(22, 0) datetime.time(23, 0)]
# 要转换类似日期的对象
time = pd.to_datetime(pd.Series(['Jul 31, 2009','2019-10-10','2019/1/2','20190101', None]))
print(time)
0 2009-07-31
1 2019-10-10
2 2019-01-02
3 2019-01-01
4 NaT
dtype: datetime64[ns]
# 创建一个日期范围通过指定周期和频率,使用date.range()函数就可以创建日期序列。 默认情况下,范围的频率是天。
datelist = pd.date_range('2020/11/21', periods=5)
print(datelist)
DatetimeIndex(['2020-11-21', '2020-11-22', '2020-11-23', '2020-11-24',
'2020-11-25'],
dtype='datetime64[ns]', freq='D')
# 更改日期频率
datelist = pd.date_range('2020/11/21', periods=5,freq='M')
print(datelist)
DatetimeIndex(['2020-11-30', '2020-12-31', '2021-01-31', '2021-02-28',
'2021-03-31'],
dtype='datetime64[ns]', freq='M')
# bdate_range()用来表示商业日期范围,不同于date_range(),它不包括星期六和星期天。观察到11月3日以后,日期跳至11月6日,不包括4日和5日(因为它们是周六和周日)。
datelist = pd.date_range('2011/11/03', periods=5)
print(datelist)
DatetimeIndex(['2011-11-03', '2011-11-04', '2011-11-05', '2011-11-06',
'2011-11-07'],
dtype='datetime64[ns]', freq='D')
#时间范围
start = pd.datetime(2017, 11, 1)
end = pd.datetime(2017, 11, 5)
dates = pd.date_range(start, end)
print(dates)
DatetimeIndex(['2017-11-01', '2017-11-02', '2017-11-03', '2017-11-04',
'2017-11-05'],
dtype='datetime64[ns]', freq='D')
# 时间差(Timedelta)
timediff = pd.Timedelta('2 days 2 hours 15 minutes 30 seconds')
print(timediff)
2 days 02:15:30
timediff = pd.Timedelta(6,unit='h')
print(timediff)
0 days 06:00:00
timediff = pd.Timedelta(days=2)
print(timediff)
2 days 00:00:00
#构建两组数据
s = pd.Series(pd.date_range('2012-1-1', periods=3, freq='D'))
td = pd.Series([ pd.Timedelta(days=i) for i in range(3) ])
df = pd.DataFrame(dict(A = s, B = td))
print(df)
A B
0 2012-01-01 0 days
1 2012-01-02 1 days
2 2012-01-03 2 days
# 相加操作
df['C']=df['A']+df['B']
print(df)
A B C
0 2012-01-01 0 days 2012-01-01
1 2012-01-02 1 days 2012-01-03
2 2012-01-03 2 days 2012-01-05
# 相减操作
df['D']=df['C']-df['B']
print(df)
A B C D
0 2012-01-01 0 days 2012-01-01 2012-01-01
1 2012-01-02 1 days 2012-01-03 2012-01-02
2 2012-01-03 2 days 2012-01-05 2012-01-03
#基本绘图
df = pd.DataFrame(np.random.rand(10,4),columns=['a','b','c','d'])
df.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x10f5dbe95c8>
# 条形图
df = pd.DataFrame(np.random.rand(10,4),columns=['a','b','c','d'])
df.plot.bar()
<matplotlib.axes._subplots.AxesSubplot at 0x10f5dc624c8>
# 堆积条形图
df = pd.DataFrame(np.random.rand(10,4),columns=['a','b','c','d'])
df.plot.bar(stacked=True)
<matplotlib.axes._subplots.AxesSubplot at 0x10f5de41e48>
# 水平条形图
df = pd.DataFrame(np.random.rand(10,4),columns=['a','b','c','d'])
df.plot.barh(stacked=True)
<matplotlib.axes._subplots.AxesSubplot at 0x10f5df29d88>
# 直方图
df = pd.DataFrame({'a':np.random.randn(1000)+1,'b':np.random.randn(1000),'c':np.random.randn(1000) - 1}, columns=['a', 'b', 'c'])
df.plot.hist(bins=50, figsize=(5,5))
# bins指bin(箱子)的个数,即每张图柱子的个数
# figsize指每张图的尺寸大小
<matplotlib.axes._subplots.AxesSubplot at 0x10f5f644248>
# 要为每列绘制不同的直方图
# df=pd.DataFrame({'a':np.random.randn(1000)+1,'b':np.random.randn(1000),'c':np.random.randn(1000) - 1}, columns=['a', 'b', 'c'])
df.hist(bins=20)
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x0000010F5FACE7C8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000010F5FACE5C8>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x0000010F5FB3C108>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000010F5FB72AC8>]],
dtype=object)
# 箱形图
# Boxplot可以绘制调用Series.box.plot()和DataFrame.box.plot()或DataFrame.boxplot()来可视化每列中值的分布。
df = pd.DataFrame(np.random.rand(10, 5), columns=['A', 'B', 'C', 'D', 'E'])
df.plot.box()
<matplotlib.axes._subplots.AxesSubplot at 0x10f5fbd2788>
# 区域块图形 可以使用Series.plot.area()或DataFrame.plot.area()方法创建区域图形
df = pd.DataFrame(np.random.rand(10, 4), columns=['a', 'b', 'c', 'd'])
df.plot.area()
<matplotlib.axes._subplots.AxesSubplot at 0x10f5fd7f708>
# 散点图形
df = pd.DataFrame(np.random.rand(20, 4), columns=['a', 'b', 'c', 'd'])
df.plot.scatter(x='a', y='b')
<matplotlib.axes._subplots.AxesSubplot at 0x10f611e0e08>
#折线图
df = pd.DataFrame({'a':np.random.rand(10)})
df.plot(kind='line')
<matplotlib.axes._subplots.AxesSubplot at 0x10f6145f588>
# 饼状图
df = pd.DataFrame(3 * np.random.rand(4), index=['a', 'b', 'c', 'd'], columns=['x'])
df.plot.pie(subplots=True)
array([<matplotlib.axes._subplots.AxesSubplot object at 0x0000010F6157B588>],
dtype=object)
pandas快速手册
标签:pyplot 方法 to_date nan obj 尺寸 taf outer 数字
原文地址:https://www.cnblogs.com/duoba/p/12382788.html