我创建了一个模拟数据集，训练集和测试集各包含 500 个观测值和 6 个变量，其中异常值的比例设定为 5%。同时，我还生成了目标变量 Y 作为真实标签（ground truth）；无监督模型只使用 X 变量，Y 变量仅用于验证。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pyod.utils.data import generate_data

# Simulation settings for the synthetic data set.
contamination = 0.05  # fraction of outliers (5%)
n_train = 500  # number of training points
n_test = 500  # number of testing points
n_features = 6  # number of features

# Build the train/test feature matrices together with their labels.
# The y_* labels are kept only for validation; the unsupervised model
# is fitted on the X_* features alone.
X_train, X_test, y_train, y_test = generate_data(
    n_train=n_train,
    n_test=n_test,
    n_features=n_features,
    contamination=contamination,
    random_state=123,  # fixed seed so the simulated data is reproducible
)
from pyod.models.ecod import ECOD

# Fit the ECOD detector on the training features only (no labels are passed).
ecod = ECOD(contamination=0.05)
ecod.fit(X_train)

# Training data
y_train_scores = ecod.decision_function(X_train)
y_train_pred = ecod.predict(X_train)

# Test data
y_test_scores = ecod.decision_function(X_test)
y_test_pred = ecod.predict(X_test)  # outlier labels (0 or 1)
def count_stat(vector):
    """Count how often each distinct value occurs in ``vector``.

    Args:
        vector: Array-like of values, e.g. the 0/1 labels returned by a
            detector's ``predict``.

    Returns:
        A dict mapping each unique value to its number of occurrences.
        An empty input yields an empty dict.
    """
    # Because it is '0' and '1', we can run a count statistic.
    unique, counts = np.unique(vector, return_counts=True)
    return dict(zip(unique, counts))
# Report how many points were labelled normal (0) and outlier (1) in each set.
print("The training data:", count_stat(y_train_pred))
# This line summarises the test-set predictions, so it is labelled as test data.
print("The test data:", count_stat(y_test_pred))
# Threshold for the defined contamination rate
print("The threshold for the defined comtanimation rate:", ecod.threshold_)
The training data: {0: 450, 1: 50} The training data: {0: 444, 1: 56} The threshold for the defined comtanimation rate: 12.75035460032711
# Start from the threshold stored on the fitted ECOD model.
# NOTE(review): presumably this is the cut-off later used to split scores into
# normal/outlier groups - confirm against the caller.
threshold = ecod.threshold_ # Or other value from the above histogram
def descriptive_stat_threshold(df, pred_score, threshold):
    """Summarise the features and anomaly scores of normal vs. outlier rows.

    Each row is assigned to the 'Normal' or 'Outlier' group by comparing its
    anomaly score with ``threshold``; the result reports the size of each
    group and the per-group mean of every numeric column.

    Args:
        df: Feature data; anything accepted by ``pd.DataFrame`` (array,
            list of rows, DataFrame). The caller's object is not modified.
        pred_score: Anomaly score for each row of ``df``.
        threshold: Cut-off score. Rows scoring strictly above it are
            labelled 'Outlier'; all others are labelled 'Normal'.

    Returns:
        A DataFrame with one row per group and the columns 'Group', 'Count',
        'Count %', followed by the per-group means (rounded to 2 decimals)
        of the feature columns and of 'Anomaly_Score'.
    """
    # Work on a copy so the two helper columns never leak into the caller's data.
    df = pd.DataFrame(df).copy()
    df['Anomaly_Score'] = pred_score
    df['Group'] = np.where(df['Anomaly_Score'] <= threshold, 'Normal', 'Outlier')

    # Now let's show the summary statistics:
    # The count and count %
    cnt = (
        df.groupby('Group')['Anomaly_Score']
        .count()
        .reset_index()
        .rename(columns={'Anomaly_Score': 'Count'})
    )
    cnt['Count %'] = (cnt['Count'] / cnt['Count'].sum()) * 100

    # The avg. (numeric columns only, so stray text columns do not raise)
    stat = df.groupby('Group').mean(numeric_only=True).round(2).reset_index()

    # Put the count and the avg. together
    stat = cnt.merge(stat, left_on='Group', right_on='Group')
    return stat
# Put the actual labels, the HBOS predictions and the ECOD predictions together.
# NOTE(review): y_test_hbos_pred and y_test_ecod_pred are not defined in the
# visible part of this file - confirm they are created before this point.
Actual_pred = pd.DataFrame({
    'Actual': y_test,
    'HBOS_pred': y_test_hbos_pred,
    'ECOD_pred': y_test_ecod_pred,
})
Actual_pred.head()

# Cross-tabulate the two models' predicted labels against each other.
pd.crosstab(Actual_pred['HBOS_pred'], Actual_pred['ECOD_pred'])