我将使用 PyOD 的 generate_data() 函数生成包含 5% 离群值的模拟数据。数据生成过程 (DGP) 将模拟六个变量。尽管模拟数据集带有目标变量 Y,但无监督模型只使用 X 变量,Y 变量仅用于验证。离群值的比例被设置为 5%,即 "contamination=0.05"。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pyod.utils.data import generate_data

# Settings for the simulated data set.
contamination = 0.05  # fraction of outliers (5%)
n_train = 500  # number of training points
n_test = 500  # number of testing points
n_features = 6  # number of features

# Simulate the train/test splits. The unsupervised model only consumes the
# X matrices; the y labels are kept for validation only.
X_train, X_test, y_train, y_test = generate_data(
    n_train=n_train,
    n_test=n_test,
    n_features=n_features,
    contamination=contamination,
    random_state=123,
)
from pyod.models.gmm import GMM

# Fit a 4-component GMM detector; contamination matches the simulated 5%.
gmm = GMM(n_components=4, contamination=0.05)
gmm.fit(X_train)

# Training data
y_train_scores = gmm.decision_function(X_train)
y_train_pred = gmm.predict(X_train)

# Test data
y_test_scores = gmm.decision_function(X_test)
y_test_pred = gmm.predict(X_test)  # outlier labels (0 or 1)
def count_stat(vector):
    """Count how many times each distinct value occurs in ``vector``.

    Intended for 0/1 outlier labels, but works for any array-like that
    ``np.unique`` accepts.

    Args:
        vector: Array-like of labels.

    Returns:
        dict mapping each distinct value to its number of occurrences.
        Keys and counts are plain Python scalars.
    """
    unique, counts = np.unique(vector, return_counts=True)
    # .tolist() converts NumPy scalars to Python ones so the printed dict
    # reads "{0: 475, 1: 25}" regardless of NumPy's scalar repr.
    return dict(zip(unique.tolist(), counts.tolist()))
# Count the predicted labels (0 or 1) on each split.
print("The training data:", count_stat(y_train_pred))
# This line reports the test-set predictions, so it is labelled as such.
print("The test data:", count_stat(y_test_pred))
# Threshold for the defined contamination rate
print("The threshold for the defined comtanimation rate:", gmm.threshold_)
The training data: {0: 475, 1: 25}
The test data: {0: 466, 1: 34}
The threshold for the defined comtanimation rate: 4.321327580839012
import matplotlib.pyplot as plt

# Plot the distribution of the training outlier scores to help pick a cut-off.
plt.hist(y_train_scores, bins='auto')  # arguments are passed to np.histogram
plt.title("Histogram with 'auto' bins")
plt.xlabel('GMM outlier score')
plt.show()

threshold = gmm.threshold_  # Or other value from the above histogram
def descriptive_stat_threshold(df, pred_score, threshold):
    """Summarize the features by 'Normal' / 'Outlier' group.

    Rows whose anomaly score is strictly below ``threshold`` are labelled
    'Normal'; all others (including a score equal to the threshold) are
    labelled 'Outlier'.

    Args:
        df: Feature data; anything ``pd.DataFrame`` accepts.
        pred_score: One anomaly score per row of ``df``.
        threshold: Score cut-off separating the two groups.

    Returns:
        DataFrame with one row per group: 'Group', 'Count', 'Count %',
        followed by the per-group mean (rounded to 2 decimals) of every
        feature column and of 'Anomaly_Score'.
    """
    # Work on a copy so the helper columns never leak into the caller's data.
    df = pd.DataFrame(df).copy()
    df['Anomaly_Score'] = pred_score
    df['Group'] = np.where(
        df['Anomaly_Score'] < threshold, 'Normal', 'Outlier'
    )

    # The count and count % of each group.
    cnt = (
        df.groupby('Group')['Anomaly_Score']
        .count()
        .reset_index()
        .rename(columns={'Anomaly_Score': 'Count'})
    )
    cnt['Count %'] = (cnt['Count'] / cnt['Count'].sum()) * 100

    # The per-group averages.
    stat = df.groupby('Group').mean().round(2).reset_index()

    # Put the count and the averages together.
    stat = cnt.merge(stat, left_on='Group', right_on='Group')
    return stat
from pyod.models.combination import aom, moa, average, maximization
from pyod.utils.utility import standardizer
from pyod.models.gmm import GMM
import matplotlib.pyplot as plt

# Standardize data
X_train_norm, X_test_norm = standardizer(X_train, X_test)

# Test a range of clusters from 2 to 8. There will be 7 models.
k_list = [2, 3, 4, 5, 6, 7, 8]
n_clf = len(k_list)

# Just prepare arrays so we can store the model results, one column per model
train_scores = np.zeros([X_train.shape[0], n_clf])
test_scores = np.zeros([X_test.shape[0], n_clf])

# Modeling
for i, k in enumerate(k_list):
    gmm = GMM(n_components=k)
    gmm.fit(X_train_norm)

    # Store the results in each column. The training scores must be filled
    # too; otherwise the training combination below is built from zeros.
    train_scores[:, i] = gmm.decision_function(X_train_norm)
    test_scores[:, i] = gmm.decision_function(X_test_norm)

# Decision scores have to be normalized before combination
train_scores_norm, test_scores_norm = standardizer(train_scores, test_scores)

# Combination by average
# The test_scores_norm is 500 x 7. The "average" function will take the
# average of the 7 columns. The result "y_by_average" is a single column:
y_train_by_average = average(train_scores_norm)
y_test_by_average = average(test_scores_norm)

plt.hist(y_train_by_average, bins='auto')  # arguments are passed to np.histogram
plt.title("Combination by average")
plt.show()