Chapter 5 pandas: Reading and Writing Data
5.1 I/O API Tools
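Sections 5.1 and 5.2 are only listed by title in these notes. As a quick reminder, here is a minimal sketch of my own (not taken from the post) of how the pandas I/O API pairs each top-level reader function with a DataFrame writer method; the file name demo.csv is arbitrary:

import pandas as pd

# Each top-level reader (pd.read_*) has a matching DataFrame writer (to_*):
# read_csv/to_csv, read_excel/to_excel, read_json/to_json, read_html/to_html,
# read_hdf/to_hdf, read_sql/to_sql, read_pickle/to_pickle.
df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
df.to_csv('demo.csv', index=False)   # writer: a DataFrame method
print(pd.read_csv('demo.csv'))       # reader: a top-level pandas function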
5.2 CSV and Text Files
5.3 Reading Data from CSV or Text Files
myCSV_01.csv
white,red,blue,green,animal
1,5,2,3,cat
2,7,8,5,dog
3,3,6,7,horse
2,2,8,3,duck
4,4,2,1,mouse
# Imports used throughout this chapter's examples.
import pandas as pd
from pandas import read_csv, read_table, Series, DataFrame

csvframe = read_csv('myCSV_01.csv')
print(csvframe)
white red blue green animal
0 1 5 2 3 cat
1 2 7 8 5 dog
2 3 3 6 7 horse
3 2 2 8 3 duck
4 4 4 2 1 mouse
ch05_01.csv
white,red,blue,green,animal
1,5,2,3,cat
2,7,8,5,dog
3,3,6,7,horse
2,2,8,3,duck
4,4,2,1,mouse
temp = read_table('ch05_01.csv', sep=',')
print(temp)
white red blue green animal
0 1 5 2 3 cat
1 2 7 8 5 dog
2 3 3 6 7 horse
3 2 2 8 3 duck
4 4 4 2 1 mouse
ch05_02.csv
1,5,2,3,cat
2,7,8,5,dog
3,3,6,7,horse
2,2,8,3,duck
4,4,2,1,mouse
temp = read_csv('ch05_02.csv')
print(temp)
1 5 2 3 cat
0 2 7 8 5 dog
1 3 3 6 7 horse
2 2 2 8 3 duck
3 4 4 2 1 mouse
temp = read_csv('ch05_02.csv', header=None)
print(temp)
0 1 2 3 4
0 1 5 2 3 cat
1 2 7 8 5 dog
2 3 3 6 7 horse
3 2 2 8 3 duck
4 4 4 2 1 mouse
temp = read_csv('ch05_02.csv', names=['white','red','blue','green','animal'])
print(temp)
white red blue green animal
0 1 5 2 3 cat
1 2 7 8 5 dog
2 3 3 6 7 horse
3 2 2 8 3 duck
4 4 4 2 1 mouse
ch05_03.csv
color,status,item1,item2,item3
black,up,3,4,6
black,down,2,6,7
white,up,5,5,5
white,down,3,3,2
white,left,1,2,1
red,up,2,2,2
red,down,1,1,4
temp = read_csv('ch05_03.csv', index_col=['color','status'])
print(temp)
item1 item2 item3
color status
black up 3 4 6
down 2 6 7
white up 5 5 5
down 3 3 2
left 1 2 1
red up 2 2 2
down 1 1 4
5.3.1 Parsing TXT Files with Regular Expressions
ch05_04.txt
white red blue green
1 5 2 3
2 7 8 5
3 3 6 7
temp = read_table('ch05_04.txt', sep='\s+')
print(temp)
white red blue green
0 1 5 2 3
1 2 7 8 5
2 3 3 6 7
ch05_05.txt
000END123AAA122
001END124BBB321
002END125CCC333
temp = read_table('ch05_05.txt', sep='\D+', header=None)
print(temp)
0 1 2
0 0 123 122
1 1 124 321
2 2 125 333
D:\ProgramData\Anaconda3_32\lib\site-packages\ipykernel_launcher.py:1: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  """Entry point for launching an IPython kernel.
Table 5-1. Metacharacters
.       single character, except newline
\d      digit
\D      non-digit character
\s      whitespace character
\S      non-whitespace character
\n      new line character
\t      tab character
\uxxxx  unicode character specified by the hexadecimal number xxxx

ch05_06.txt
##### LOG FILE
This file has been generated by automatic system
white,red,blue,green,animal
12-Feb-2015: Counting of animals inside the house
1,5,2,3,cat
2,7,8,5,dog
13-Feb-2015: Counting of animals outside the house
3,3,6,7,horse
2,2,8,3,duck
4,4,2,1,mouse
temp = read_table('ch05_06.txt', sep=',', skiprows=[0,1,3,6])
print(temp)
white red blue green animal
0 1 5 2 3 cat
1 2 7 8 5 dog
2 3 3 6 7 horse
3 2 2 8 3 duck
4 4 4 2 1 mouse
5.3.2 Reading Part of a TXT File
ch05_02.csv
1,5,2,3,cat
2,7,8,5,dog
3,3,6,7,horse
2,2,8,3,duck
4,4,2,1,mouse
temp = read_csv('ch05_02.csv', skiprows=[2], nrows=3, header=None)
print(temp)
0 1 2 3 4
0 1 5 2 3 cat
1 2 7 8 5 dog
2 2 2 8 3 duck
ch05_01.csv
white,red,blue,green,animal
1,5,2,3,cat
2,7,8,5,dog
3,3,6,7,horse
2,2,8,3,duck
4,4,2,1,mouse
out = Series()
i = 0
pieces = read_csv('ch05_01.csv', chunksize=3)
for piece in pieces:
    print(piece)
    # Series.set_value is deprecated (and removed in recent pandas);
    # out[i] = piece['white'].sum() gives the same result.
    out.set_value(i, piece['white'].sum())
    i = i + 1
print(out)
white red blue green animal
0 1 5 2 3 cat
1 2 7 8 5 dog
2 3 3 6 7 horse
white red blue green animal
3 2 2 8 3 duck
4 4 4 2 1 mouse
0 6
1 6
dtype: int64
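Since Series.set_value no longer exists in current pandas, here is a sketch of the same chunked sum using plain label assignment (same ch05_01.csv, same result):

pieces = read_csv('ch05_01.csv', chunksize=3)
out = Series(dtype='int64')
for i, piece in enumerate(pieces):
    out[i] = piece['white'].sum()   # assign by label instead of set_value
print(out)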
5.3.3 Writing Data to a CSV File
import numpy as np

frame2 = DataFrame(np.arange(16).reshape((4,4)),
                   columns=['ball','pen','pencil','paper'])
print(frame2)
ball pen pencil paper
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
3 12 13 14 15
frame2.to_csv('ch05_07.csv')
frame2.to_csv('ch05_07b.csv', index=False, header=False)
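The two files are not shown in the post; given frame2 above, they should contain roughly the following (ch05_07.csv keeps the index and header, ch05_07b.csv drops both):

ch05_07.csv
,ball,pen,pencil,paper
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15

ch05_07b.csv
0,1,2,3
4,5,6,7
8,9,10,11
12,13,14,15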
frame3 = read_csv('ch05_08.csv')
print(frame3)
Unnamed: 0 ball mug paper pen pencil
0 blue 6.0 NaN NaN 6.0 NaN
1 green NaN NaN NaN NaN NaN
2 red NaN NaN NaN NaN NaN
3 white 20.0 NaN NaN 20.0 NaN
4 yellow 19.0 NaN NaN 19.0 NaN
frame3.to_csv('ch05_08.csv')
frame3.to_csv('ch05_09.csv', na_rep='NaN')
ch05_08.csv
,ball,mug,paper,pen,pencil
blue,6.0,,,6.0,
green,,,,,
red,,,,,
white,20.0,,,20.0,
yellow,19.0,,,19.0,

ch05_09.csv
,ball,mug,paper,pen,pencil
blue,6.0,NaN,NaN,6.0,NaN
green,NaN,NaN,NaN,NaN,NaN
red,NaN,NaN,NaN,NaN,NaN
white,20.0,NaN,NaN,20.0,NaN
yellow,19.0,NaN,NaN,19.0,NaN
5.4 Reading and Writing HTML Files
5.4.1 Writing Data to an HTML File
import pandas as pd

frame = pd.DataFrame(np.arange(4).reshape(2,2))
# The HTML below is the string produced by converting the DataFrame with to_html():
print(frame.to_html())
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>0</th>
<th>1</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>0</td>
<td>1</td>
</tr>
<tr>
<th>1</th>
<td>2</td>
<td>3</td>
</tr>
</tbody>
</table>
frame = pd.DataFrame(np.random.random((4,4)),
                     index=['white','black','red','blue'],
                     columns=['up','down','right','left'])
print(frame)
up down right left
white 0.821537 0.070376 0.131624 0.718632
black 0.723164 0.409424 0.554343 0.361086
red 0.671485 0.993762 0.316291 0.999724
blue 0.834104 0.565786 0.922212 0.166467
s = ['<HTML>']
s.append('<HEAD><TITLE>My DataFrame</TITLE></HEAD>')
s.append('<BODY>')
s.append(frame.to_html())
s.append('</BODY></HTML>')
html = ''.join(s)
html_file = open('myFrame.html','w')
html_file.write(html)
html_file.close()
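As a side note, recent pandas versions also accept a file path as the buf argument of to_html(), so the table fragment could presumably be written in one call (without the surrounding <HTML>/<BODY> wrapper built above); the file name here is hypothetical:

frame.to_html('myFrame_table_only.html')   # bare <table> fragment only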
Open myFrame.html in a browser to view the resulting table.
5.4.2 Reading Data from an HTML File
web_frames = pd.read_html('myFrame.html')
print(web_frames[0])
Unnamed: 0 up down right left
0 white 0.821537 0.070376 0.131624 0.718632
1 black 0.723164 0.409424 0.554343 0.361086
2 red 0.671485 0.993762 0.316291 0.999724
3 blue 0.834104 0.565786 0.922212 0.166467
ranking = pd.read_html('http://www.meccanismocomplesso.org/en/meccanismo-complesso-sito-2/classifica-punteggio/')
print(ranking[0])
# Nome Exp Livelli
0 1 Fabio Nelli 17521 NaN
1 2 admin 9029 NaN
2 3 BrunoOrsini 2124 NaN
3 4 Berserker 700 NaN
4 5 Dnocioni 543 NaN
5 6 albertosallusti 409 NaN
6 7 Jon 231 NaN
7 8 Mr.Y 180 NaN
8 9 michele sisinni 157 NaN
9 10 Selina 136 NaN
10 11 Massimo 127 NaN
11 12 Beniamino Feula 122 NaN
12 13 stefano gustin 121 NaN
13 14 Maurizio Andreoli 111 NaN
14 15 Pietro Baima 108 NaN
15 16 Cecilia Lalatta Costerbosa 108 NaN
16 17 Leonardo Zampi 108 NaN
17 18 Davide Aloisi 106 NaN
18 19 gildalombardi 105 NaN
19 20 Telerobotlabs 104 NaN
20 21 Marco Contigiani 101 NaN
21 22 berillio 58 NaN
22 23 ron 55 NaN
23 24 Titanic4wd 43 NaN
24 25 deg 40 NaN
25 26 al45 40 NaN
26 27 il_mix 38 NaN
27 28 AndreaC 35 NaN
28 29 Sergio fly 32 NaN
29 30 bigazzi 32 NaN
.. ... ... ... ...
220 221 pozi 3 NaN
221 222 mattia 3 NaN
222 223 mauro.menegazzi 3 NaN
223 224 cico89 3 NaN
224 225 eta38 3 NaN
225 226 Chinje Chang 3 NaN
226 227 fraschettin 2 NaN
227 228 Rocco 2 NaN
228 229 Dimitri 2 NaN
229 230 Arturo 2 NaN
230 231 Paolo Indennidate 2 NaN
231 232 fabioroberto 2 NaN
232 233 ycomyca 2 NaN
233 234 bdb 2 NaN
234 235 paolotirispetta 2 NaN
235 236 Roberto72 2 NaN
236 237 Christian76 2 NaN
237 238 paolos46 2 NaN
238 239 Giolat90 2 NaN
239 240 giampyypmaig 1 NaN
240 241 Marco Corbetta 1 NaN
241 242 softeng 1 NaN
242 243 strechum 1 NaN
243 244 an6991 1 NaN
244 245 plato 1 NaN
245 246 CarloAlberto98 1 NaN
246 247 cris 1 NaN
247 248 emilibassi 1 NaN
248 249 mehrbano 1 NaN
249 250 NIKITA PANCHAL 1 NaN
[250 rows x 4 columns]
5.5 Reading Data from XML
books.xml
<?xml version="1.0"?>
<Catalog>
   <Book>
      <Author>272103_1_EnRoss, Mark</Author>
      <Title>XML Cookbook</Title>
      <Genre>Computer</Genre>
      <Price>23.56</Price>
      <PublishDate>2014-22-01</PublishDate>
   </Book>
   <Book>
      <Author>272103_1_EnBracket, Barbara</Author>
      <Title>XML for Dummies</Title>
      <Genre>Computer</Genre>
      <Price>35.95</Price>
      <PublishDate>2014-12-16</PublishDate>
   </Book>
</Catalog>
from lxml import objectify
xml = objectify.parse('books.xml')
xml
<lxml.etree._ElementTree at 0xc5d74b8>
root = xml.getroot()
print(root.Book.Author)
print(root.Book.PublishDate)
272103_1_EnRoss, Mark
2014-22-01

root.getchildren()

[<Element Book at 0xc761378>, <Element Book at 0xc761d78>]
[child.tag for child in root.Book.getchildren()]
['Author', 'Title', 'Genre', 'Price', 'PublishDate']
[child.text for child in root.Book.getchildren()]
['272103_1_EnRoss, Mark', 'XML Cookbook', 'Computer', '23.56', '2014-22-01']
def etree2df(root):
    # Column names come from the tags of the first Book element's children.
    column_names = []
    for i in range(0, len(root.getchildren()[0].getchildren())):
        column_names.append(root.getchildren()[0].getchildren()[i].tag)
    xml_frame = pd.DataFrame(columns=column_names)
    # One row per Book element, filled with the child elements' text.
    for j in range(0, len(root.getchildren())):
        obj = root.getchildren()[j].getchildren()
        texts = []
        for k in range(0, len(column_names)):
            texts.append(obj[k].text)
        row = dict(zip(column_names, texts))
        row_s = pd.Series(row)
        row_s.name = j
        # DataFrame.append is deprecated in newer pandas; pd.concat is the replacement.
        xml_frame = xml_frame.append(row_s)
    return xml_frame
temp = etree2df(root)
print(temp)
Author Title Genre Price PublishDate
0 272103_1_EnRoss, Mark XML Cookbook Computer 23.56 2014-22-01
1 272103_1_EnBracket, Barbara XML for Dummies Computer 35.95 2014-12-16
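In newer pandas (1.3 and later) the same table can presumably be obtained without the helper function, using the built-in read_xml (which also relies on lxml):

xml_frame = pd.read_xml('books.xml')   # pandas >= 1.3
print(xml_frame)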
5.6 Reading and Writing Microsoft Excel Files
temp = pd.read_excel('data.xls')
print(temp)
white red green black
a 12 23 17 18
b 22 16 19 18
c 14 23 22 21
temp = pd.read_excel('data.xls', 'Sheet2')
print(temp)
yellow purple blue orange
A 11 16 44 22
B 20 22 23 44
C 30 31 37 32
temp = pd.read_excel('data.xls', 1)
print(temp)
yellow purple blue orange
A 11 16 44 22
B 20 22 23 44
C 30 31 37 32
frame = pd.DataFrame(np.random.random((4,4)),
                     index=['exp1','exp2','exp3','exp4'],
                     columns=['Jan2015','Fab2015','Mar2015','Apr2005'])
print(frame)
Jan2015 Fab2015 Mar2015 Apr2005
exp1 0.610508 0.434578 0.019900 0.099366
exp2 0.306480 0.961355 0.073820 0.742758
exp3 0.232366 0.197025 0.312307 0.697120
exp4 0.374647 0.123761 0.521675 0.641097
frame.to_excel('data2.xlsx')
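If more than one sheet is needed in the same workbook, pandas provides ExcelWriter; a minimal sketch (the sheet names here are arbitrary):

with pd.ExcelWriter('data2.xlsx') as writer:
    frame.to_excel(writer, sheet_name='experiments')
    frame.T.to_excel(writer, sheet_name='experiments_transposed')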
5.7 JSON Data
frame = pd.DataFrame(np.arange(16).reshape(4,4),
                     index=['white','black','red','blue'],
                     columns=['up','down','right','left'])
print(frame)
frame.to_json('frame.json')
up down right left
white 0 1 2 3
black 4 5 6 7
red 8 9 10 11
blue 12 13 14 15
{"up":{"white":0,"black":4,"red":8,"blue":12},"down":{"white":1,"black":5,"red":9,"blue":13},"right":{"white":2,"black":6,"red":10,"blue":14},"left":{"white":3,"black":7,"red":11,"blue":15}}
temp = pd.read_json('frame.json')
print(temp)
down left right up
black 5 7 6 4
blue 13 15 14 12
red 9 11 10 8
white 1 3 2 0
# The original run apparently used the deprecated pandas.json for loads()
# (see the FutureWarning below); the standard-library json module works the same way here.
import json
from pandas.io.json import json_normalize   # in pandas >= 1.0, pd.json_normalize is preferred

file = open('books.json','r')
text = file.read()
text = json.loads(text)

temp = json_normalize(text, 'books')
print(temp)
print()
temp = json_normalize(text, 'books', ['writer','nationality'])
print(temp)
price title
0 23.56 XML Cookbook
1 50.70 Python Fundamentals
2 12.30 The NumPy library
3 28.60 Java Enterprise
4 31.35 HTML5
5 28.00 Python for Dummies
price title writer nationality
0 23.56 XML Cookbook Mark Ross USA
1 50.70 Python Fundamentals Mark Ross USA
2 12.30 The NumPy library Mark Ross USA
3 28.60 Java Enterprise Barbara Bracket UK
4 31.35 HTML5 Barbara Bracket UK
5 28.00 Python for Dummies Barbara Bracket UK
D:\ProgramData\Anaconda3_32\lib\site-packages\ipykernel_launcher.py:4: FutureWarning: pandas.json is deprecated and will be removed in a future version.
You can access loads as pandas.io.json.loads
after removing the cwd from sys.path.
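The books.json file itself is not reproduced in the post; judging from the json_normalize output above, its structure is presumably along these lines:

[{"writer": "Mark Ross",
  "nationality": "USA",
  "books": [{"title": "XML Cookbook", "price": 23.56},
            {"title": "Python Fundamentals", "price": 50.70},
            {"title": "The NumPy library", "price": 12.30}]},
 {"writer": "Barbara Bracket",
  "nationality": "UK",
  "books": [{"title": "Java Enterprise", "price": 28.60},
            {"title": "HTML5", "price": 31.35},
            {"title": "Python for Dummies", "price": 28.00}]}]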
5.8 The HDF5 Format
from pandas.io.pytables import HDFStore

frame = pd.DataFrame(np.arange(16).reshape(4,4),
                     index=['white','black','red','blue'],
                     columns=['up','down','right','left'])
store = HDFStore('mydata.h5')
store['obj1'] = frame
store
<class 'pandas.io.pytables.HDFStore'>
File path: mydata.h5
/obj1 frame (shape->[4,4])
temp = store['obj1']
print(temp)
up down right left
white 0 1 2 3
black 4 5 6 7
red 8 9 10 11
blue 12 13 14 15
5.9 pickle: Python Object Serialization
5.9.1 Serializing Python Objects with pickle
import pickle

data = {'color': ['white','red'], 'value': [5, 7]}
pickled_data = pickle.dumps(data)
print(pickled_data)
b'\x80\x03}q\x00(X\x05\x00\x00\x00colorq\x01]q\x02(X\x05\x00\x00\x00whiteq\x03X\x03\x00\x00\x00redq\x04eX\x05\x00\x00\x00valueq\x05]q\x06(K\x05K\x07eu.'
nframe = pickle.loads(pickled_data)
nframe
{'color': ['white', 'red'], 'value': [5, 7]}
5.9.2 Object Serialization with pandas
frame = pd.DataFrame(np.arange(16).reshape(4,4),
                     index=['up','down','left','right'])
frame.to_pickle('frame.pkl')

temp = pd.read_pickle('frame.pkl')
print(temp)
0 1 2 3
up 0 1 2 3
down 4 5 6 7
left 8 9 10 11
right 12 13 14 15
5.10 Interacting with Databases
from sqlalchemy import create_engine
If the code above raises an error, check the traceback and pip-install whichever driver package is reported as missing.
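For reference, SQLAlchemy connection strings for common database engines look like the following sketch; user names, passwords, hosts, and database names are placeholders:

# PostgreSQL (driver: psycopg2)
engine = create_engine('postgresql://scott:tiger@localhost:5432/mydatabase')
# MySQL (driver: mysqlclient / MySQLdb)
engine = create_engine('mysql+mysqldb://scott:tiger@localhost/foo')
# Oracle (driver: cx_Oracle)
engine = create_engine('oracle://scott:tiger@127.0.0.1:1521/sidname')
# Microsoft SQL Server through an ODBC DSN (driver: pyodbc)
engine = create_engine('mssql+pyodbc://mydsn')
# SQLite (file-based, no separate server needed)
engine = create_engine('sqlite:///foo.db')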
5.10.1 Reading and Writing Data with SQLite3
frame = pd.DataFrame(np.arange(20).reshape(4,5),
                     columns=['white','red','blue','black','green'])
print(frame)
white red blue black green
0 0 1 2 3 4
1 5 6 7 8 9
2 10 11 12 13 14
3 15 16 17 18 19
engine = create_engine('sqlite:///foo.db')
frame.to_sql('colors', engine)

temp = pd.read_sql('colors', engine)
print(temp)
index white red blue black green
0 0 0 1 2 3 4
1 1 5 6 7 8 9
2 2 10 11 12 13 14
3 3 15 16 17 18 19
import sqlite3

query = """
CREATE TABLE test
(a VARCHAR(20), b VARCHAR(20),
 c REAL, d INTEGER
);"""
con = sqlite3.connect(':memory:')
con.execute(query)
con.commit()

data = [('white','up',1,3),
        ('black','down',2,8),
        ('green','up',4,4),
        ('red','down',5,5)]
stmt = "INSERT INTO test VALUES(?,?,?,?)"
con.executemany(stmt, data)
con.commit()
cursor = con.execute('select * from test')
print(cursor)
rows = cursor.fetchall()
rows
<sqlite3.Cursor object at 0x0D591820>
[('white', 'up', 1.0, 3),
 ('black', 'down', 2.0, 8),
 ('green', 'up', 4.0, 4),
 ('red', 'down', 5.0, 5)]
cursor.description

(('a', None, None, None, None, None, None),
 ('b', None, None, None, None, None, None),
 ('c', None, None, None, None, None, None),
 ('d', None, None, None, None, None, None))
temp = pd.DataFrame(rows, columns=['a','b','c','d'])
print(temp)
a b c d
0 white up 1.0 3
1 black down 2.0 8
2 green up 4.0 4
3 red down 5.0 5
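The same result can be obtained in one step with read_sql_query, which also accepts the raw sqlite3 connection opened above (a minimal sketch):

temp = pd.read_sql_query('SELECT * FROM test', con)
print(temp)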
5.10.2 Reading and Writing Data with PostgreSQL
engine = create_engine('postgresql://scott:tiger@localhost:5432/mydatabase')
import pandas as pd
import numpy as np

frame = pd.DataFrame(np.random.random((4,4)),
                     index=['exp1','exp2','exp3','exp4'],
                     columns=['feb','mar','apr','may'])
frame.to_sql('dataframe', engine)
The new table can also be checked directly from the psql client:

postgres=# SELECT * FROM DATAFRAME;
temp = pd.read_sql_table('dataframe', engine)
print(temp)
index feb mar apr may
0 exp1 0.406820 0.964683 0.181662 0.660217
1 exp2 0.573869 0.940819 0.426104 0.484574
2 exp3 0.649881 0.059990 0.616504 0.681356
3 exp4 0.061554 0.733131 0.998748 0.127283
temp = pd.read_sql_query('SELECT index,apr,may FROM DATAFRAME WHERE apr > 0.5', engine)
print(temp)
index apr may
0 exp3 0.616504 0.681356
1 exp4 0.998748 0.127283
5.11 Reading and Writing Data with the NoSQL Database MongoDB
import pymongo

client = pymongo.MongoClient('localhost', 27017)
db = client.mydatabase
db
Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'mydatabase')
collection = db.mycollection
db['mycollection']
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'mydatabase'), 'mycollection')
frame = pd.DataFrame(np.arange(20).reshape(4,5),
                     columns=['white','red','blue','black','green'])
print(frame)
white red blue black green
0 0 1 2 3 4
1 5 6 7 8 9
2 10 11 12 13 14
3 15 16 17 18 19
import json

record = json.loads(frame.T.to_json()).values()
record
dict_values([{'white': 0, 'red': 1, 'blue': 2, 'black': 3, 'green': 4}, {'white': 5, 'red': 6, 'blue': 7, 'black': 8, 'green': 9}, {'white': 10, 'red': 11, 'blue': 12, 'black': 13, 'green': 14}, {'white': 15, 'red': 16, 'blue': 17, 'black': 18, 'green': 19}])
collection.mydocument.insert(record)
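Collection.insert() is deprecated in pymongo 3; insert_many() is the current equivalent, a sketch using the same record list as above:

collection.mydocument.insert_many(list(record))   # list() because record is a dict_values view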
cursor = collection['mydocument'].find()
print(type(cursor))
dataframe = (list(cursor))
data = pd.DataFrame(dataframe)
del data['_id']
print(data)
<class 'pymongo.cursor.Cursor'>
black blue green red white
0 3 2 4 1 0
1 8 7 9 6 5
2 13 12 14 11 10
3 18 17 19 16 15
5.12 Summary
Original post: https://www.cnblogs.com/LearnFromNow/p/9349928.html