前言
在机器学习中,支持向量机 (台湾称支援向量機,英语:support vector machine,常简称为SVM,又名支持向量网络)是在分类与回归分析中分析数据的监督式学习模型与相关的学习算法。给定一组训练实例,每个训练实例被标记为属于两个类别中的一个或另一个,SVM训练算法建立一个将新的实例分配给两个类别之一的模型,使其成为非概率二元线性分类器。SVM模型是将实例表示为空间中的点,这样映射就使得单独类别的实例被尽可能宽的明显的间隔分开。然后,将新的实例映射到同一空间,并基于它们落在间隔的哪一侧来预测所属类别。
支持向量机在高维或无限维空间中构造超平面或超平面集合,其可以用于分类、回归或其他任务。直观来说,分类边界距离最近的训练资料点越远越好,因为这样可以缩小分类器的泛化误差。
数据文件及代码
数据文件下载:https://file.sfnote.com/d/data/SVM.csv
# 导入必要模块
import time
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn import svm
from sklearn.decomposition import FactorAnalysis, PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Start the wall-clock timer used for the run-time report at the end of the script
start_time = time.time()

# Mean imputer: fitted below on the training features and used to fill any
# gaps in the rows we want predictions for
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

# Load the data set
df = pd.read_csv("./SVM.csv")
# print(df.isnull().sum())

# Target column
y_col = ["type"]
# Columns that are not used as features
drop = ["manufact", "model", "sales", "resale", "lnsales", "partition"]

# The last two rows are held back as new inputs to predict (features only)
new_data = df[-2:].drop(drop, axis=1).drop(y_col, axis=1)

# Remaining rows: remove the unused columns, then discard rows with missing values
df = df[:-2].drop(drop, axis=1)
df = df.dropna()

# Class balance check (counts recorded when this was last run)
# print((df['type'] == 0).sum())  # 112
# print((df['type'] == 1).sum())  # 40

# Final feature matrix and target vector
X = df.drop(y_col, axis=1)
y = df['type']

# The rows to predict cannot simply be dropped when they contain NaN, and the
# downstream transformers reject NaN input, so fill gaps with the per-column
# mean of the training features. Rows without NaN are left unchanged.
imp.fit(X)
new_data = pd.DataFrame(
    imp.transform(new_data),
    columns=new_data.columns,
    index=new_data.index,
)

# Train/test split. The classes are imbalanced and the test set is small, so
# stratify on the target to keep the class ratio the same in both parts.
test_size = 0.1
random_state = None  # set to an int for reproducible splits and resampling
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state, stratify=y
)

# Oversample the minority class on the training part only, so the test set
# keeps its original distribution
smote = SMOTE(random_state=random_state)
ros = RandomOverSampler(random_state=random_state)  # alternative: random oversampling
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Standardize the features; the scaler is fitted on the training data only
# and then applied to the test data
scaler = StandardScaler()
X_train_resampled = scaler.fit_transform(X_train_resampled)
X_test = scaler.transform(X_test)
# Dimensionality reduction with Factor Analysis (FA)
print("using FCA:")
fca = FactorAnalysis(n_components=5)  # number of latent factors to keep
X_train_resampled_fca = fca.fit_transform(X_train_resampled)
X_test_fca = fca.transform(X_test)

# Train a linear SVM on the reduced training data and report test performance
module = svm.SVC(kernel='linear', random_state=random_state)
module.fit(X_train_resampled_fca, y_train_resampled)
y_pred = module.predict(X_test_fca)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Final model: refit on all labeled data and predict the new rows.
# The same steps as in the evaluation above are applied (oversample ->
# standardize -> FA -> SVM) so the reported metrics describe this pipeline;
# the new rows go through the scaler and FA fitted here.
X_resampled, y_resampled = smote.fit_resample(X, y)
scaler_fca = StandardScaler()
X_resampled_scaled = scaler_fca.fit_transform(X_resampled)
X_resampled_fca = fca.fit_transform(X_resampled_scaled)
X_new_fca = fca.transform(scaler_fca.transform(new_data))
module.fit(X_resampled_fca, y_resampled)
y_pred_new = module.predict(X_new_fca)
print("results:", y_pred_new)
# Dimensionality reduction with Linear Discriminant Analysis (LDA)
print("using LDA:")
lda = LinearDiscriminantAnalysis(n_components=1)
X_train_resampled_lda = lda.fit_transform(X_train_resampled, y_train_resampled)
X_test_lda = lda.transform(X_test)

# Train a linear SVM on the reduced training data and report test performance
module = svm.SVC(kernel='linear', random_state=random_state)
module.fit(X_train_resampled_lda, y_train_resampled)
y_pred = module.predict(X_test_lda)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Final model: refit on all labeled data and predict the new rows.
# The same steps as in the evaluation above are applied (oversample ->
# standardize -> LDA -> SVM) so the reported metrics describe this pipeline;
# the new rows go through the scaler and LDA fitted here.
X_resampled, y_resampled = smote.fit_resample(X, y)
scaler_lda = StandardScaler()
X_resampled_scaled = scaler_lda.fit_transform(X_resampled)
X_resampled_lda = lda.fit_transform(X_resampled_scaled, y_resampled)
X_new_lda = lda.transform(scaler_lda.transform(new_data))
module.fit(X_resampled_lda, y_resampled)
y_pred_new = module.predict(X_new_lda)
print("results:", y_pred_new)
# Dimensionality reduction with Principal Component Analysis (PCA)
print("using PCA:")
pca = PCA(n_components=5)  # number of principal components to keep
X_train_resampled_pca = pca.fit_transform(X_train_resampled)
X_test_pca = pca.transform(X_test)

# Train a linear SVM on the reduced training data and report test performance
module = svm.SVC(kernel='linear', random_state=random_state)
module.fit(X_train_resampled_pca, y_train_resampled)
y_pred = module.predict(X_test_pca)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Final model: refit on all labeled data and predict the new rows.
# The same steps and the same kernel as in the evaluation above are used
# (oversample -> standardize -> PCA -> linear SVM) so the reported metrics
# describe this pipeline. PCA is driven by feature variance, so skipping
# standardization here would let large-scale features dominate the components.
module = svm.SVC(kernel='linear', random_state=random_state)
X_resampled, y_resampled = smote.fit_resample(X, y)
scaler_pca = StandardScaler()
X_resampled_scaled = scaler_pca.fit_transform(X_resampled)
X_resampled_pca = pca.fit_transform(X_resampled_scaled)
X_new_pca = pca.transform(scaler_pca.transform(new_data))
module.fit(X_resampled_pca, y_resampled)
y_pred_new = module.predict(X_new_pca)
print("results:", y_pred_new)
# Report the total wall-clock run time of the script, in seconds
elapsed = time.time() - start_time
print("time used:", elapsed)
以上只是简单示例;实际应用中,需要根据具体数据和任务自行选择合适的模型和参数。