码迷,mamicode.com
首页 > 其他好文 > 详细

01.pandas

时间:2019-02-15 17:55:20      阅读:93      评论:0      收藏:0      [点我收藏+]

标签:csv   frame   col   column   pandas   val   2-2   label   ase   

 

01.Series

  1 # -*- coding: utf-8 -*-
  2 """
  3 Series ?? ?? 
  4  - pandas ?? 1?? ???? 
  5  - DataFrame ?? ???? 
  6  - ??/?? ?? ?? ?? 
  7  - ?? ??, ?? ?? 
  8  - indexing/slicing(list ??)
  9  - ??? ??? ?? 
 10 """
 11 
 12 import pandas as pd # pd.Series()
 13 from pandas import Series # Series()
 14 
 15 # 1. Series ?? 
 16 
 17 # 1) list ?? 
 18 lst = [4000, 3000, 2000, 3500]
 19 print(lst*2)
 20 price = Series([4000, 3000, 2000, 3500])
 21 print(price*2)
 22 
 23 print(price.index) # index
 24 print(price.values) # data
 25 
 26 print(lst[0], price[0]) # 4000 4000
 27 
 28 # 2) dict ?? : key=index : value=values
 29 person = pd.Series({name:???, age:35, addr :???})
 30 print(person)
 31 ‘‘‘
 32 addr    ???
 33 age      35
 34 name    ???
 35 ‘‘‘
 36 print(person[age]) # 35
 37 
 38 # 2. indexing(list? ??)
 39 ser_data = pd.Series([4, 4.5, 6, 8, 10.5])
 40 print(len(ser_data)) # 5
 41 
 42 print(ser_data[0]) # 4.0
 43 print(ser_data[:3]) # 3? 
 44 print(ser_data[3:]) # 2?
 45 print(ser_data[:]) # ?? 
 46 #print(ser_data[-1])
 47 
 48 # boolean ??? 
 49 print(ser_data[ser_data >= 5])
 50 ‘‘‘
 51 2     6.0
 52 3     8.0
 53 4    10.5
 54 ‘‘‘
 55 
 56 
 57 # 3. Series ??, NA ?? 
 58 data1 = Series([4000, None, 3500, 2000],
 59                index=[a, m, o, k])
 60 data2 = Series([4000, 3000, 3500, 2000],
 61                index=[a, o, k, m])
 62 # join : index ?? 
 63 result = data1 + data2 # ?? ?? 
 64 print(result)
 65 print(type(result)) # Series‘
 66 ‘‘‘
 67 a    8000.0
 68 k    5500.0
 69 m       NaN -> ??? 
 70 o    6500.0
 71 ‘‘‘
 72 
 73 # NA ?? : 0, ?? ??, ??  
 74 
 75 result2 = result.fillna(0) # 0 ?? 
 76 result3 = result.fillna(result.mean()) # ?? ?? 
 77 print(0 ?? :, result2)
 78 print(?? ?? :, result3)
 79 ‘‘‘
 80 0 ?? : a    8000.0
 81 k    5500.0
 82 m       0.0
 83 o    6500.0
 84 dtype: float64
 85 ?? ?? : a    8000.000000
 86 k    5500.000000
 87 m    6666.666667
 88 o    6500.000000
 89 ‘‘‘
 90 
 91 print(pd.notnull(result))
 92 ‘‘‘
 93 a     True
 94 k     True
 95 m    False
 96 o     True
 97 ‘‘‘
 98 # ???? ??? subset ?? 
 99 subset = result[pd.notnull(result)]
100 print(subset)
101 ‘‘‘
102 a    8000.0
103 k    5500.0
104 o    6500.0
105 ‘‘‘
106 
107 # 4. Series ?? 
108 print(ser_data)
109 
110 
111 # 1) ???? 
112 ser_data[1:4] = 50 
113 print(ser_data)
114 
115 # 2) ??/?? ?? 
116 print(ser_data.sum())
117 print(ser_data.mean())
118 print(ser_data.max())
119 print(ser_data.min())
120 
121 # 3) broacast ?? 
122 print(ser_data * 0.5) # vector(1) * scala(0)
123 ‘‘‘
124 0     2.00
125 1    25.00
126 2    25.00
127 3    25.00
128 4     5.25
129 ‘‘‘

 

 

 

02.DataFrame

  1 # -*- coding: utf-8 -*-
  2 """
  3 DataFrame ?? ?? 
  4  - pandas ?? 2?? ????(table ?? ??)
  5  - ?? ?? ??? ??? ?? 
  6  - DataFrame ???? 
  7    -> Series : 1??(vector) 
  8    -> Numpy : 1??(vector)
  9 """
 10 
 11 import pandas as pd # pd.DataFrame()
 12 from pandas import DataFrame # DataFrame()
 13 
 14 # 1. DataFrame ?? 
 15 
 16 name = [???, ???, ???, ???]
 17 age = [35,45,55,25]
 18 pay = [350,450,550,250]
 19 emp = DataFrame({name:name, age:age, pay:pay},
 20                    columns=[name, age, pay])
 21 print(emp)
 22 ‘‘‘
 23   name  age  pay
 24 0  ???   35  350
 25 1  ???   45  450
 26 2  ???   55  550
 27 3  ???   25  250
 28 ‘‘‘
 29 
 30 # 1) Series ?? ?? : column ?? 
 31 gender = pd.Series([M,M,M, F])
 32 emp[gender] = gender
 33 print(emp)
 34 
 35 # 2) Numpy ?? ??
 36 import numpy as np
 37 frame = pd.DataFrame(np.arange(12).reshape(3,4),
 38                      columns=[a,b,c,d])
 39 print(frame)
 40 ‘‘‘
 41    a  b   c   d
 42 0  0  1   2   3
 43 1  4  5   6   7
 44 2  8  9  10  11
 45 ‘‘‘
 46 
 47 # ?/? ?? ??? 
 48 print(frame.mean()) # ? ?? ??
 49 print(frame.mean(axis = 0)) # ? ?? ?? 
 50 print(frame.mean(axis = 1)) # ? ?? ?? 
 51 
 52 # 2. index ?? 
 53 print(frame.index) # RangeIndex(start=0, stop=3, step=1)
 54 print(frame.values)
 55 ‘‘‘
 56 [[ 0  1  2  3]
 57  [ 4  5  6  7]
 58  [ 8  9 10 11]]
 59 ‘‘‘
 60 print(frame.columns)
 61 # Index([‘a‘, ‘b‘, ‘c‘, ‘d‘], dtype=‘object‘)
 62 
 63 # 1) ?? ??(a) ?? index ?? 
 64 setIdx = frame.set_index(a)
 65 print(setIdx)
 66 
 67 # 2) index ??? 
 68 resetIdx = setIdx.reset_index()
 69 print(resetIdx)
 70 
 71 
 72 # 3. DF ?? ?? 
 73 
 74 # 1) ?? ?? ?? 
 75 a_col1 = frame.a # DF.column
 76 a_col2 = frame[a] # DF[‘column‘]
 77 print(a_col1)
 78 print(a_col2)
 79 print(frame[a][2]) # 8 DF[‘column‘][index]
 80 
 81 # 2) ?? ?? ?? 
 82 print(frame[[a, c]]) # [[‘a‘:‘c‘]](x)
 83 cols = [a, d] # list
 84 frame[cols]
 85 
 86 
 87 # 4. subset ??? 
 88 
 89 # 1) ?? ?? ?? 
 90 print(subset1)
 91 subset_df = frame[[a,c,d]] 
 92 print(subset_df)
 93 
 94 # 2) ?? ? ?? 
 95 print(drop)
 96 print(frame.drop(0)) # 1? ?? 
 97 print(frame.drop(1)) # 2? ?? 
 98 ‘‘‘
 99 ?? ??? ??? ? new object ??
100 ?? object? ??? 
101 ‘‘‘
102 
103 a_col = frame[a] # DF(2) -> vector(1)
104 print(type(a_col)) # Series
105 
106 # a?? ???? ? ?? 
107 subset_df2 = frame # df ??
108 print(subset_df2)
109 
110 for i, c in enumerate(a_col) :
111     print(i=, i, c=, c)
112     if c < 5 :
113         subset_df2 = subset_df2.drop(i)
114     
115 ‘‘‘
116 i= 0 c= 0
117 i= 1 c= 4
118 i= 2 c= 8
119 ‘‘‘    
120 print(subset_df2)
121 
122 
# 3) selecting columns of a DataFrame loaded from a CSV file
iris = pd.read_csv('../data/iris.csv')
iris.info()  # info() prints the summary itself (and returns None)
# Output (excerpt):
# RangeIndex: 150 entries, 0 to 149
# Data columns (total 5 columns):
print(type(iris))  # DataFrame
print(iris.columns)
cols = list(iris.columns)  # column names as a list
print(cols)
# Output:
# ['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width', 'Species']

print(iris[cols[0]])  # first column
print(iris[cols[-1]])  # last column
# columns 1~3
print(iris[['Sepal.Length', 'Sepal.Width', 'Petal.Length']])
print(iris[cols[:3]])  # same selection via the name list

print(iris.head())


# columns 1~4 : x, column 5 : y
iris_x = iris[cols[:4]]
iris_y = iris[cols[-1]]

print(iris_x.shape)  # (150, 4) - 2-D
print(iris_y.shape)  # (150,) - 1-D


# 5. Row/column selection in one step: [row, col1:col3]
# The original listing used DF.ix[row, col], which accepted either a
# position or a label.  .ix was removed from pandas, so the explicit
# accessors are used instead:
#  - DF.loc[row label, column label]       (label-based, slices inclusive)
#  - DF.iloc[row position, column position] (position-based)
# NOTE(review): 'frame' below is presumably a printed label whose quotes
# were lost; the original listing shows two identical print calls.
print('frame')
print(frame)
# Output:
#    a  b   c   d
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11

print(frame.loc[1])  # a single key selects a row
print(frame.iloc[1, 2])  # 2nd row, 3rd column -> 6
print(frame.loc[:, 'd'])  # column d
print(frame.loc[:, 'b':'c'])  # columns b through c


print(len(iris))  # number of rows - 150

# 70% -> 105 rows, 30% -> 45 rows

import numpy as np
idx = np.random.choice(10, 5, replace=False)  # 5 distinct values from 0~9
print(idx)  # e.g. [4 1 3 6 8]


idx = np.random.choice(len(iris), int(len(iris) * 0.7),
                       replace=False)
print(idx, len(idx))  # 105

# idx holds row positions, so select by position
train_set = iris.iloc[idx, :]
print(train_set.shape)  # (105, 5)

 

 

 

03.Descriptive

 1 # -*- coding: utf-8 -*-
 2 """
 3 1. DataFrame ????? 
 4 2. ?? ?? ??? ?? 
 5 """
 6 
 7 import pandas as pd
 8 
 9 
10 product = pd.read_csv(../data/product.csv)
11 print(product.info())
12 
13 # ????? ??? 
14 summary = product.describe()
15 print(summary)
16 
17 # ?/? ??? ???  : axis=0 or 1
18 print(product.sum(axis = 0)) # ? ?? 
19 ‘‘‘
20 a    773
21 b    827
22 c    817
23 ‘‘‘
24 print(product.sum(axis = 1)) # ? ?? 
25 
26 
27 # ??? 
28 print(product.var()) # ??
29 print(product.std()) # ???? 
30 
31 # ??? 
32 a_cnt = product[a].value_counts()
33 print(a_cnt)
34 ‘‘‘
35 3    126
36 4     64
37 2     37
38 1     30
39 5      7
40 ‘‘‘
41 
42 # ?? ?? 
43 b_uni = product[b].unique()
44 print(b_uni) # [4 3 2 5 1]
45 
46 # ?? ?? ????( -1 < r < 1)
47 p_corr = product.corr()
48 print(p_corr)
49 ‘‘‘
50           a         b         c
51 a  1.000000  0.499209  0.467145
52 b  0.499209  1.000000  0.766853
53 c  0.467145  0.766853  1.000000
54 ‘‘‘
55 
56 ac_corr = product[a].corr(product[c])
57 print(ac_corr) # 0.4671449836008965
58 
59 #?) iris 1 ~ 4 ?? -> ????(r)
60 cols = list(iris.columns)
61 print(cols) # 5? ?? list
62 iris_sub = iris[cols[:4]]
63 
64 print(iris_sub.corr())

 

 

 

04.merge

 1 # -*- coding: utf-8 -*-
 2 """
 3 DataFrame marge
 4 """
 5 
 6 import pandas as pd
 7 
 8 wdbc = pd.read_csv("../data/wdbc_data.csv")
 9 print(wdbc.info())
10 ‘‘‘
11 RangeIndex: 569 entries, 0 to 568
12 Data columns (total 32 columns):
13 ‘‘‘
14 
15 cols = list(wdbc.columns)
16 print(cols)
17 
18 df1 = wdbc[cols[:16]] # 1~16
19 sid = wdbc[id] # id ?? 
20 df2 = wdbc[cols[16:]] # 17~32
21 
22 df2[id] = sid
23 
24 print(df1.shape) # (569, 16)
25 print(df2.shape) # (569, 17)
26 
27 
28 # 1. id ???? DF ?? 
29 df_merge = pd.merge(df1, df2) # id ??, how=‘inner‘
30 print(df_merge.info())
31 ‘‘‘
32 <class ‘pandas.core.frame.DataFrame‘>
33 Int64Index: 569 entries, 0 to 568
34 Data columns (total 32 columns):
35 ‘‘‘
36 
37 # 2. ?? ?? df ??? 
38 df1 = wdbc[cols[:16]] # 1~16
39 df2 = wdbc[cols[16:]] # 17~32
40 
41 df_merge2 = pd.concat([df1, df2], axis=1) # ? ?? ?? 
42 print(df_merge2.info())
43 ‘‘‘
44 <class ‘pandas.core.frame.DataFrame‘>
45 RangeIndex: 569 entries, 0 to 568
46 Data columns (total 32 columns):
47 ‘‘‘

 

 

 

05.timeSeries

 1 # -*- coding: utf-8 -*-
 2 """
 3 ??? ??? ??? 
 4  1. ???? ??(??? -> ???)
 5  2. ??? ??? 
 6  3. ???? ?? 
 7 """
 8 
 9 import pandas as pd
10 from datetime import datetime # ???? ?? 
11 
12 cospi = pd.read_csv("../data/cospi.csv")
13 print(cospi.info())
14 ‘‘‘
15 RangeIndex: 247 entries, 0 to 246
16 Data columns (total 6 columns):
17 Date      247 non-null object
18 Open      247 non-null int64
19 High      247 non-null int64
20 Low       247 non-null int64
21 Close     247 non-null int64
22 Volume    247 non-null int64
23 ‘‘‘
24 
25 print(cospi.head())
26 # 0  26-Feb-16  1180000  1187000  1172000  1172000  176906
27 # 26-Feb-16 -> 2016-2-26
28 
29 # 1. ???? ??(??? -> ???)
30 Date = cospi[Date] # cospi.Date
31 kDate = [] # ?list
32 
33 for d in Date :
34     kDate.append(datetime.strptime(d, "%d-%b-%y"))
35     
36 print(kDate[:10])
37 
38 cospi[Date] = kDate # (??? -> ???)
39 print(cospi.head())
40     
41 
42 # 2. ??? ???
43 import matplotlib.pyplot as plt
44 
45 # 1? ?? ????? 
46 cospi[High].plot(title = "Trend line of High column")
47 plt.show()
48 
49 # 2? ?? ????? 
50 cospi[[High, Low]].plot(title = "Trend line of High vs Low")
51 plt.show()
52 
53 # 2. index ?? 
54 print(cospi.index)
55 # RangeIndex(start=0, stop=247, step=1)
56 
57 # index ?? -> Date ?? 
58 new_cospi = cospi.set_index(Date)
59 print(new_cospi.head())
60 
61 # ??? ?? 
62 print(new_cospi[2016])
63 print(new_cospi[2015])
64 
65 # ?? ?? 
66 print(new_cospi[2016-02])
67 # ?? ?? 
68 print(new_cospi[2016-02:2016-01])
69 
70 new_cospi_HL = new_cospi[[High, Low]]
71 new_cospi_HL[2016].plot(title="title")
72 plt.show()
73 
74 new_cospi_HL[2016-02].plot(title="title")
75 plt.show()
76 
77 
78 # 3. ???? ?? 
79 
80 # 5?, 10?, 20? 
81 roll_mean5 = pd.Series.rolling(new_cospi.High,
82                   window=5, center=False).mean()
83 print(roll_mean5)
84 
85 roll_mean10 = pd.Series.rolling(new_cospi.High,
86                   window=10, center=False).mean()
87 
88 roll_mean20 = pd.Series.rolling(new_cospi.High,
89                   window=20, center=False).mean()
90 
91 # roll mean ??? 
92 new_cospi.High.plot(color=orange, label=High column)
93 roll_mean5.plot(color=red, label=5day rolling mean)
94 roll_mean10.plot(color=green, label=10day rolling mean)
95 roll_mean20.plot(color=blue, label=20day rolling mean)
96 plt.legend(loc=best)
97 plt.show()

 

01.pandas

标签:csv   frame   col   column   pandas   val   2-2   label   ase   

原文地址:https://www.cnblogs.com/kingboy100/p/10384528.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!