Extensible path setup
import os

# Walk the project directory and pick up the dataset files by filename
root_path = "/lockard_ai/works/malware_detection/"
for root, dirs, files in os.walk(root_path):
    for f in files:
        if "preprocessed_data" in f:
            origin_data_path = os.path.join(root, f)
            print(origin_data_path)
        if "KISA_total.csv" in f:
            KISA_dataset_path = os.path.join(root, f)
            print(KISA_dataset_path)
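The paths found above can then be loaded with pandas. A minimal sketch, assuming the CSVs load directly into the `df` and `dataset` frames used below (with the features plus a 'label' column); whether the two names refer to the same frame is not spelled out in this section:

import pandas as pd

# Assumption: plain CSV files; adjust read options (encoding, index column) as needed
df = pd.read_csv(origin_data_path)
dataset = pd.read_csv(KISA_dataset_path)
print(df.shape, dataset.shape)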
Counts of normal vs. malware samples
import seaborn as sns

sns.histplot(data=df, x='label', hue='label', bins=2)
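For exact numbers alongside the plot, a quick check (assuming the binary label column is named 'label' as above):

# Number of samples per label (normal vs. malware)
print(df['label'].value_counts())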

import numpy as np
import sklearn.ensemble as ek
from sklearn import tree
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression

def extratree_process(X, y):
    # Feature selection: keep features whose ExtraTrees importance exceeds the threshold
    extratrees = ek.ExtraTreesClassifier().fit(X, y)
    model = SelectFromModel(extratrees, prefit=True, threshold=1.8e-2)
    X_new = model.transform(X)
    nbfeatures = X_new.shape[1]
    X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2)

    # List the selected features in descending order of importance
    # NOTE: dataset is the feature DataFrame defined outside this function; its columns must match X
    features = list()
    index = np.argsort(extratrees.feature_importances_)[::-1][:nbfeatures]
    print("index : ", index)
    for f in range(nbfeatures):
        print("%d. feature %s (%f)" % (f + 1, dataset.columns[index[f]], extratrees.feature_importances_[index[f]]))
        features.append(dataset.columns[index[f]])

    model = {"DecisionTree": tree.DecisionTreeClassifier(max_depth=10),
             "RandomForest": ek.RandomForestClassifier(n_estimators=50),
             "Adaboost": ek.AdaBoostClassifier(n_estimators=50),
             "GradientBoosting": ek.GradientBoostingClassifier(n_estimators=50),
             "GNB": GaussianNB(),
             "LinearRegression": LinearRegression()
             }

    print("\nModel performance comparison")
    # Compare model performance (note: LinearRegression.score returns R^2, not accuracy)
    results = {}
    for algo in model:
        clf = model[algo]
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        print("%s : %s " % (algo, score))
        results[algo] = score
    return model['RandomForest'], features, results
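A usage sketch for the binary case, assuming the label column is named 'label' and the remaining columns of `dataset` are the numeric features:

# Hypothetical call: split features and binary labels, select features, compare models
X = dataset.drop(columns=['label'])
y = dataset['label']
best_model, selected_features, results = extratree_process(X, y)
print(sorted(results.items(), key=lambda kv: kv[1], reverse=True))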
For multi-class classification, it is enough to pass the multi-class values as y and call fit as before.
Label encoding is applied to each class first, as in the sketch below.
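A minimal sketch, assuming a hypothetical `malware_type` column in `dataset` holds the per-family class names; the encoded targets are then fed to the same extratree_process function:

from sklearn.preprocessing import LabelEncoder

# Assumption: 'malware_type' is a hypothetical column with the malware family names
le = LabelEncoder()
y_multi = le.fit_transform(dataset['malware_type'])
print(dict(zip(le.classes_, le.transform(le.classes_))))

# The same pipeline works unchanged: fit simply receives multi-class targets
X = dataset.drop(columns=['malware_type', 'label'], errors='ignore')
best_model, selected_features, results = extratree_process(X, y_multi)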
Number of samples per malware class
# More than two classes now, so drop the bins=2 setting used for the binary plot
sns.histplot(data=df, x='label', hue='label')