# Tags: apply, ati, pandas, ack, density, normal, lib, nal, port
# LOF (Local Outlier Factor) anomaly detection: scores every sample and reports the detected outliers.
from scipy.spatial.distance import cdist
import numpy as np
class LOF:
    """Local Outlier Factor (LOF) detector built on a dense distance matrix.

    Each sample receives a score equal to the mean local reachability
    density of its ``k`` nearest neighbours divided by its own density.
    Samples whose score is strictly greater than ``epsilon`` are reported
    as outliers by :meth:`run`.

    Attributes:
        data: The samples as a 2-D float array, one row per sample.
        k: Number of nearest neighbours used for every sample.
        epsilon: Score threshold above which a sample is flagged.
        N: Number of samples (rows of ``data``).
    """

    def __init__(self, data, k, epsilon=1.0):
        """Store the samples and the detector parameters.

        Args:
            data: Array-like of shape (n_samples, n_features).
            k: Neighbourhood size; must satisfy ``1 <= k < n_samples`` so
                that every sample has ``k`` neighbours besides itself.
            epsilon: Outlier threshold applied to the LOF score.

        Raises:
            ValueError: If ``data`` is not 2-D or ``k`` is out of range.
        """
        self.data = np.asarray(data, dtype=float)
        if self.data.ndim != 2:
            raise ValueError("data must be 2-D: (n_samples, n_features)")
        self.N = self.data.shape[0]
        if not 1 <= k < self.N:
            raise ValueError(
                "k must satisfy 1 <= k < n_samples, got k=%r with %d samples"
                % (k, self.N)
            )
        self.k = k
        self.epsilon = epsilon

    def get_dist(self):
        """Return the (N, N) matrix of pairwise Euclidean distances."""
        return cdist(self.data, self.data)

    def _kdist(self, arr):
        """Return the neighbour indices and k-distance for one distance row.

        Args:
            arr: 1-D array of distances from one sample to every sample.

        Returns:
            ``(neighbor_ind, kdist)``: the indices at sorted positions
            ``1..k`` (position 0 is skipped as the sample itself) and the
            distance to the last of them.
        """
        order = np.argsort(arr)
        neighbor_ind = order[1:self.k + 1]
        return neighbor_ind, arr[neighbor_ind[-1]]

    def get_rdist(self):
        """Compute the neighbour table and the reachability-distance matrix.

        Returns:
            ``(nei_inds, rdist)`` where ``nei_inds`` is an (N, k) integer
            array whose row ``i`` lists the neighbours of sample ``i``, and
            ``rdist[o, p] = max(k_distance(o), d(o, p))``.
        """
        dist = self.get_dist()
        nei_inds = np.empty((self.N, self.k), dtype=np.intp)
        kdist = np.empty(self.N)
        # Fill preallocated arrays row by row.  Feeding the ragged
        # (indices, scalar) tuple from ``_kdist`` to np.apply_along_axis
        # fails on recent NumPy releases, which refuse to build ragged
        # arrays implicitly.
        for i, row in enumerate(dist):
            nei_inds[i], kdist[i] = self._kdist(row)
        # Distances below a row's k-distance are raised to that k-distance.
        # Done in place so no second N x N matrix is allocated.
        np.maximum(dist, kdist[:, np.newaxis], out=dist)
        return nei_inds, dist

    def get_lrd(self, nei_inds, rdist):
        """Compute the local reachability density of every sample.

        Args:
            nei_inds: Per-sample neighbour indices, as returned by
                :meth:`get_rdist` (any sequence of equal-length index rows).
            rdist: Reachability-distance matrix from :meth:`get_rdist`.

        Returns:
            1-D array with ``lrd[i] = k / sum(rdist[j, i] for j in
            nei_inds[i])``.  Heavily duplicated samples can make the sum
            zero, in which case the density is ``inf``.
        """
        nei_inds = np.asarray(nei_inds)
        cols = np.arange(nei_inds.shape[0])[:, np.newaxis]
        # Element [i, m] picks rdist[nei_inds[i, m], i].
        reach_sum = rdist[nei_inds, cols].sum(axis=1)
        return self.k / reach_sum

    def run(self):
        """Compute the local outlier factor of every sample.

        Returns:
            ``(score, out_ind)``: the LOF score per sample and the indices
            of the samples whose score is strictly greater than ``epsilon``.
        """
        nei_inds, rdist = self.get_rdist()
        lrd = self.get_lrd(nei_inds, rdist)
        # Mean density of the neighbours relative to the sample's own density.
        score = lrd[nei_inds].sum(axis=1) / self.k / lrd
        return score, np.where(score > self.epsilon)[0]
if __name__ == "__main__":
    # Demo: flag the samples of a small, shifted Gaussian cluster that sit
    # next to a large one, then plot inliers (blue) and outliers (red).
    import matplotlib.pyplot as plt
    import pandas as pd

    np.random.seed(42)

    # The spreadsheet is only previewed; the detector below runs on the
    # synthetic samples, so a missing file must not abort the demo.
    try:
        frame = pd.read_excel("finaldata.xlsx")
    except FileNotFoundError:
        print("finaldata.xlsx not found; skipping the spreadsheet preview")
    else:
        print(frame.loc[:, ["p1", "p2", "p3"]])

    x = np.random.normal(2, 1, size=(10000, 2))
    print(x)
    y = np.random.normal(5, 1, size=(20, 2))
    print(y)
    z = np.vstack((x, y))
    print(z)

    data = np.array(z)
    # NOTE: LOF builds a dense N x N distance matrix, so 10020 samples need
    # on the order of 800 MB for that matrix alone.
    lof = LOF(data, 5, epsilon=3)
    score, out_ind = lof.run()
    outliers = data[out_ind]

    plt.scatter(data[:, 0], data[:, 1], color="b")
    plt.scatter(outliers[:, 0], outliers[:, 1], color="r")
    plt.show()

    print(data)
    print(outliers)
    print(out_ind)
# Tags: apply, ati, pandas, ack, density, normal, lib, nal, port
# Original article: https://www.cnblogs.com/Yanjy-OnlyOne/p/13367505.html