Tags: machine learning, python, data processing, visualization
http://blog.csdn.net/pipisorry/article/details/44833603
%matplotlib inline
import requests
print(requests.get("http://example.com").text)
response = requests.get("https://www.googleapis.com/books/v1/volumes", params={"q":"machine learning"})
raw_data = response.json()
titles = [item['volumeInfo']['title'] for item in raw_data['items']]
titles
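# A minimal follow-up sketch (not from the original post) of the same request with
# basic error handling; the timeout value and the "authors" fallback are arbitrary choices.
response = requests.get("https://www.googleapis.com/books/v1/volumes",
                        params={"q": "machine learning"}, timeout=10)
response.raise_for_status()  # raise an exception on HTTP errors
books = response.json().get("items", [])
# Some volumes may lack an "authors" field, so fall back to an empty list
authors = [item["volumeInfo"].get("authors", []) for item in books]
authors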
import lxml.html
page = lxml.html.parse("http://www.blocket.se/stockholm?q=apple")
# ^ This is probably illegal. Blocket, please don't sue me!
items_data = []
for el in page.getroot().find_class("item_row"):
    links = el.find_class("item_link")
    images = el.find_class("item_image")
    prices = el.find_class("list_price")
    if links and images and prices and prices[0].text:
        items_data.append({"name": links[0].text,
                           "image": images[0].attrib['src'],
                           "price": int(prices[0].text.split(":")[0].replace(" ", ""))})
items_data
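# A quick sketch (not in the original post): load the scraped dicts into pandas for
# inspection; it assumes items_data has the "name"/"image"/"price" keys built above.
import pandas
scraped = pandas.DataFrame(items_data)
scraped.sort_values("price").head()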
import pandas
df = pandas.read_csv('sample.csv')
# Display the DataFrame
df
# DataFrame's columns
df.columns
# Values of a given column
df.Model
# Any missing values?
df['Price']
df['Description']
# Fill missing descriptions with a placeholder and interpolate missing prices linearly
df['Description'] = df['Description'].fillna("No description is available.")
df['Price'] = df['Price'].interpolate()
df
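# As a sanity check (a small sketch, not from the original post), count the remaining
# missing values per column; after the fillna/interpolate step it should be zero everywhere.
# df.dropna() would instead discard the incomplete rows entirely.
df.isnull().sum()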
import matplotlib.pyplot as plt
df = pandas.read_csv('sample2.csv')
df
# This table has 3 columns: Office, Year, Sales
print(df.columns)
# It's really easy to query data with Pandas:
print(df[(df['Office'] == 'Stockholm') & (df['Sales'] > 260)])
# It's also easy to do aggregations...
aggregated_sales = df.groupby('Year').sum(numeric_only=True)
print(aggregated_sales)
# ... and generate plots
%matplotlib inline
aggregated_sales.plot(kind='bar')
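# A related sketch (not in the original post): pivot the same table so each office gets
# its own bar series; the column names assume the Office/Year/Sales layout shown above.
per_office = df.pivot_table(index='Year', columns='Office', values='Sales', aggfunc='sum')
per_office.plot(kind='bar')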
from sklearn import feature_extraction
corpus = ['All the cats really are great.',
          'I like the cats but I still prefer the dogs.',
          'Dogs are the best.',
          'I like all the trains']
tfidf = feature_extraction.text.TfidfVectorizer()
print(tfidf.fit_transform(corpus).toarray())
print(tfidf.get_feature_names_out())
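# A small extension (not in the original post): cosine similarities between the
# TF-IDF vectors show which sentences of the corpus are most alike.
from sklearn.metrics.pairwise import cosine_similarity
doc_vectors = tfidf.fit_transform(corpus)
print(cosine_similarity(doc_vectors))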
import json
data = [json.loads("""{"weight": 194.0, "sex": "female", "student": true}"""),
        {"weight": 60., "sex": 'female', "student": True},
        {"weight": 80.1, "sex": 'male', "student": False},
        {"weight": 65.3, "sex": 'male', "student": True},
        {"weight": 58.5, "sex": 'female', "student": False}]
vectorizer = feature_extraction.DictVectorizer(sparse=False)
vectors = vectorizer.fit_transform(data)
print(vectors)
print(vectorizer.get_feature_names_out())
class A:
    def __init__(self, x):
        self.x = x
        self.blabla = 'test'
a = A(20)
a.__dict__
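# Tying the two ideas together (a sketch, not from the original post): an object's
# __dict__ can be fed straight to DictVectorizer; the extra instance A(35) below is
# made up for illustration.
from sklearn.feature_extraction import DictVectorizer
objects = [A(20), A(35)]
dv = DictVectorizer(sparse=False)
print(dv.fit_transform([obj.__dict__ for obj in objects]))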
from sklearn import preprocessing
data = [[10., 2345., 0., 2.],
        [3., -3490., 0.1, 1.99],
        [13., 3903., -0.2, 2.11]]
print(preprocessing.normalize(data))
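# Note that normalize() rescales each *row* to unit norm. A sketch (not in the original
# post) of the more usual column-wise standardization, reusing `data` from above:
scaler = preprocessing.StandardScaler()
print(scaler.fit_transform(data))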
from sklearn import decomposition
data = [[0.3, 0.2, 0.4, 0.32],
        [0.3, 0.5, 1.0, 0.19],
        [0.3, -0.4, -0.8, 0.22]]
pca = decomposition.PCA()
print(pca.fit_transform(data))
print(pca.explained_variance_ratio_)
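# A follow-up sketch (not in the original post): keep only the two strongest
# components, which is the usual way PCA is applied for dimensionality reduction.
pca_2d = decomposition.PCA(n_components=2)
print(pca_2d.fit_transform(data))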
from sklearn import datasets
from sklearn import svm
iris = datasets.load_iris()
X = iris.data[:, :2]
y = iris.target
# Training the model
clf = svm.SVC(kernel='rbf')
clf.fit(X, y)
# Doing predictions
new_data = [[4.85, 3.1], [5.61, 3.02]]
print(clf.predict(new_data))
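# To see how well the classifier generalizes, a small sketch (not from the original
# post) that holds out part of the data; test_size and random_state are arbitrary choices.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
held_out_clf = svm.SVC(kernel='rbf').fit(X_train, y_train)
print(held_out_clf.score(X_test, y_test))  # mean accuracy on unseen data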
import numpy as np
from sklearn import linear_model
import matplotlib.pyplot as plt
def f(x):
    return x + np.random.random() * 3.
X = np.arange(0, 5, 0.5)
X = X.reshape((len(X), 1))
y = list(map(f, X))
clf = linear_model.LinearRegression()
clf.fit(X, y)
new_X = np.arange(0.2, 5.2, 0.3)
new_X = new_X.reshape((len(new_X), 1))
new_y = clf.predict(new_X)
plt.scatter(X, y, color='g', label='Training data')
plt.plot(new_X, new_y, '.-', label='Predicted')
plt.legend()
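# The fitted line can also be inspected directly (a short sketch, not in the original
# post); since f(x) adds only uniform noise to x, the slope should come out close to 1.
print(clf.coef_, clf.intercept_)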
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
# Generate sample data
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=200, centers=centers, cluster_std=0.4,
                            random_state=0)
X = StandardScaler().fit_transform(X)
# Compute DBSCAN
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
db.labels_
import matplotlib.pyplot as plt
plt.scatter(X[:, 0], X[:, 1], c=db.labels_)
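# A short summary sketch (not in the original post): DBSCAN marks noise with the
# label -1, so the remaining labels give the number of clusters found.
labels = db.labels_
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_noise = list(labels).count(-1)
print(n_clusters, n_noise)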
from sklearn import svm, datasets
from sklearn.model_selection import cross_val_score
iris = datasets.load_iris()
X, y = iris.data, iris.target
model = svm.SVC()
print(cross_val_score(model, X, y, scoring='precision_macro'))
print(cross_val_score(model, X, y, scoring='neg_mean_squared_error'))
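# Going one step further (a sketch, not from the original post): the same
# cross-validation machinery also drives hyperparameter search; the parameter
# grid below is an arbitrary example.
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(svm.SVC(), {'C': [0.1, 1, 10], 'gamma': ['scale', 0.1]}, cv=5)
grid.fit(X, y)
print(grid.best_params_, grid.best_score_)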
ref: Data-processing and machine learning with Python
Data processing and machine learning in Python: fetching data online and locally, then parsing, preprocessing, training, predicting, cross-validating, and visualizing it.