import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pyod.utils.data import generate_data

# Settings for the synthetic data set.
contamination = 0.05  # proportion of outliers
n_train = 500         # number of training points
n_test = 500          # number of testing points
n_features = 6        # number of features

# Generate the train/test data with a fixed random_state.
X_train, X_test, y_train, y_test = generate_data(
    n_train=n_train,
    n_test=n_test,
    n_features=n_features,
    contamination=contamination,
    random_state=123,
)

# Wrap the 2-D NumPy training array in a pandas DataFrame for easier
# manipulation.
X_train_pd = pd.DataFrame(X_train)
from pyod.models.knn import KNN

# Fit a k-NN outlier detector on the training data.
knn = KNN(contamination=0.05)
knn.fit(X_train)

# Outlier scores for the training and the test data.
y_train_scores = knn.decision_function(X_train)
y_test_scores = knn.decision_function(X_test)

# Outlier labels (0 or 1) for the training and the test data.
y_train_pred = knn.predict(X_train)
y_test_pred = knn.predict(X_test)
def count_stat(vector):
    """Count how often each distinct value occurs in ``vector``.

    Written for 0/1 outlier labels, but any array-like accepted by
    ``np.unique`` works.

    Args:
        vector: Array-like of values to tally.

    Returns:
        dict mapping each distinct value to its number of occurrences, in the
        sorted order produced by ``np.unique``. An empty input gives ``{}``.
    """
    unique, counts = np.unique(vector, return_counts=True)
    # .tolist() turns NumPy scalars into plain Python numbers, so the dict
    # prints as {0: 475, 1: 25} instead of {np.int64(0): np.int64(475), ...}.
    return dict(zip(unique.tolist(), counts.tolist()))
# Report how many points were labelled normal (0) and outlier (1).
print("The training data:", count_stat(y_train_pred))
print("The test data:", count_stat(y_test_pred))
# Threshold for the defined contamination rate
print("The threshold for the defined contamination rate:", knn.threshold_)

# Recorded output of a previous run:
#   The training data: {0: 475, 1: 25}
#   The test data: {0: 475, 1: 25}
#   The threshold for the defined contamination rate: 0.7566127656515499
import matplotlib.pyplot as plt

# Distribution of the training outlier scores; the bins argument is passed on
# to np.histogram.
plt.hist(y_train_scores, bins='auto')
plt.xlabel('KNN outlier score')
plt.title("Histogram with 'auto' bins")
plt.show()

# Use the fitted model's threshold, or pick another value from the histogram
# above.
threshold = knn.threshold_
def descriptive_stat_threshold(df, pred_score, threshold):
    """Summarise the normal and outlier groups defined by a score threshold.

    A row is labelled ``'Normal'`` when its anomaly score is strictly below
    ``threshold`` and ``'Outlier'`` otherwise.

    Args:
        df: Feature data; anything ``pd.DataFrame`` accepts (for example a
            2-D NumPy array or an existing DataFrame). It is not modified.
        pred_score: One anomaly score per row of ``df``.
        threshold: Score cut-off separating the two groups.

    Returns:
        DataFrame with one row per non-empty group and the columns ``Group``,
        ``Count``, ``Count %``, followed by the per-group mean (rounded to two
        decimals) of every numeric column, including ``Anomaly_Score``.
    """
    # Copy first: pd.DataFrame(df) may share storage with a DataFrame
    # argument, and the helper columns below must not leak into the caller.
    df = pd.DataFrame(df).copy()
    df['Anomaly_Score'] = pred_score
    df['Group'] = np.where(df['Anomaly_Score'] < threshold, 'Normal', 'Outlier')

    grouped = df.groupby('Group')

    # Count and count % per group.
    cnt = (
        grouped['Anomaly_Score']
        .count()
        .reset_index()
        .rename(columns={'Anomaly_Score': 'Count'})
    )
    cnt['Count %'] = (cnt['Count'] / cnt['Count'].sum()) * 100

    # Per-group averages; numeric_only avoids a TypeError on non-numeric
    # columns in pandas 2.x.
    stat = grouped.mean(numeric_only=True).round(2).reset_index()

    # Put the counts and the averages together.
    return cnt.merge(stat, on='Group')
from pyod.models.combination import aom, moa, average, maximization
from pyod.utils.utility import standardizer

# Standardize the features.
X_train_norm, X_test_norm = standardizer(X_train, X_test)

# Neighbourhood sizes 10, 20, ..., 200: one k-NN model per value, 20 in total.
k_list = list(range(10, 201, 10))
n_clf = len(k_list)

# Score matrices: one row per data point, one column per model.
train_scores = np.zeros((X_train.shape[0], n_clf))
test_scores = np.zeros((X_test.shape[0], n_clf))

# Fit one model per k and store its scores in column i.
for i, k in enumerate(k_list):
    clf = KNN(n_neighbors=k, method='largest')
    clf.fit(X_train_norm)
    train_scores[:, i] = clf.decision_scores_
    test_scores[:, i] = clf.decision_function(X_test_norm)

# Decision scores have to be normalized before combination.
train_scores_norm, test_scores_norm = standardizer(train_scores, test_scores)
import matplotlib.pyplot as plt

# Combination by average.
# The normalized score matrices hold one row per data point and one column per
# k-NN model; "average" reduces those columns to a single combined score per
# row.
y_train_by_average = average(train_scores_norm)
y_test_by_average = average(test_scores_norm)

# Distribution of the combined training scores; the bins argument is passed on
# to np.histogram.
plt.hist(y_train_by_average, bins='auto')
plt.title("Combination by average")
plt.show()