SKlearn LinearSVC

#coding

记录一些用得上的。

数据初始化

使用pandas对数据进行处理,将分类转换为数字。

数据表样式如下(实际文件以制表符分隔,第一列为样本名):

Sample Class Feature1 Feature2 Feature3 Feature4
S1 Pathogenic 0 2 8 1
S2 Benign 6 2 5 0
S3 Other 9 3 7 1

计算log2CPM值。为了避免对0取对数,先给CPM加1再取log2;但这种加1(伪计数)的做法可能会导致倍数变化不够准确。

import pandas as pd
from math import log

def normalizationDF(table, clsList):
	"""Load a sample table, encode the class labels and log2CPM-normalise the features.

	The file is read as tab-separated text with a header row; the first column
	is used as the row index. It must contain a "Class" column; every other
	column is treated as a numeric feature.

	Args:
		table: Path (or buffer) accepted by ``pandas.read_table``.
		clsList: Class labels to keep. A label is encoded as its position in
			this list; rows whose label is not in the list are dropped.

	Returns:
		A DataFrame with the same columns as the input, where "Class" holds
		integer codes and each feature holds ``log2(value / row_sum * 1e6 + 1)``,
		``row_sum`` being the sum of all feature values of that row.
		A row whose features sum to zero yields NaN for its features.
	"""
	df = pd.read_table(table, sep="\t", header=0, index_col=0)

	# Encode each label as its index in clsList. Labels that are not listed
	# map to NaN, which is what marks the row for removal.
	labelToCode = {label: code for code, label in enumerate(clsList)}
	codes = df["Class"].map(labelToCode)
	keep = codes.notna()
	df = df.loc[keep].copy()
	# .values avoids index alignment, so duplicated sample names are harmless.
	df["Class"] = codes[keep].astype(int).values

	featureCols = [col for col in df.columns if col != "Class"]
	counts = df[featureCols].astype(float)
	# Counts per million, scaled by each row's own total.
	cpm = counts.div(counts.sum(axis=1), axis=0) * 1000000
	for col in featureCols:
		# +1 keeps zero counts finite under the logarithm.
		df[col] = [log(value + 1, 2) for value in cpm[col]]
	return df
	
# Build the data matrices; samples labelled "Other" are not analysed.
classList = ["Benign", "Pathogenic"]
train, test = (normalizationDF(path, classList) for path in ("train.txt", "test.txt"))

模型建立

使用SKlearn的LinearSVC模型。

from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline

def LinearSVCTrain(xtrain, ytrain):
	"""Fit a StandardScaler + LinearSVC pipeline on the training data.

	Args:
		xtrain: Feature matrix passed to the pipeline's ``fit``.
		ytrain: Target labels passed to the pipeline's ``fit``.

	Returns:
		Whatever the pipeline's ``fit`` returns (the fitted pipeline).
	"""
	scaler = StandardScaler()
	# Fixed random_state keeps repeated runs reproducible.
	svc = LinearSVC(C=1.0, tol=1e-4, max_iter=1000, random_state=0)
	pipeline = make_pipeline(scaler, svc)
	return pipeline.fit(xtrain, ytrain)
	
# Train the model: the label is the "Class" column, the features are every
# column after the first one.
ytrain = train.loc[:, "Class"]
xtrain = train.iloc[:, 1:]
clf = LinearSVCTrain(xtrain=xtrain, ytrain=ytrain)

交叉验证

有时候需要交叉验证。

from sklearn.model_selection import cross_val_score

def crossValScorePipe(x, y, cv=10):
	"""Cross-validate a StandardScaler + LinearSVC pipeline.

	Args:
		x: Feature matrix.
		y: Target labels.
		cv: Cross-validation setting forwarded to ``cross_val_score``
			(defaults to 10).

	Returns:
		The scores returned by ``cross_val_score``.
	"""
	model = make_pipeline(StandardScaler(), LinearSVC(random_state=0, tol=1e-4, max_iter=1000, C=1.0))
	# cv has to be passed by keyword: the fourth positional slot of
	# cross_val_score is not cv (it is `groups` in older scikit-learn releases
	# and keyword-only in newer ones), so a positional cv never took effect.
	scores = cross_val_score(model, x, y, cv=cv)
	return scores

# Print the cross-validation scores
print(crossValScorePipe(x=xtrain, y=ytrain))

模型保存及读取

将模型保存下来,方便下次对其他测试数据使用。

import joblib

def model_save(clf, model):
	"""Serialise a fitted model to disk with joblib.

	Args:
		clf: The object to persist.
		model: Destination file name passed to ``joblib.dump``.
	"""
	joblib.dump(value=clf, filename=model)
	
def model_load(model):
	"""Load a model previously written with joblib.

	Args:
		model: File name passed to ``joblib.load``.

	Returns:
		The object stored in the file.
	"""
	# NOTE(review): joblib files are pickle-based; only load trusted files.
	return joblib.load(model)
	
# Save the trained model
model_save(clf=clf, model="LinearSVC.model")
# Load it back
clf = model_load(model="LinearSVC.model")

贡献度统计

统计模型中,每个feature的贡献度,并画出前20个feature的柱状图。

import matplotlib.pyplot as plt
import seaborn as sns

def feature_importance(x, model, clsName="linearsvc"):
	"""Rank features by the absolute value of their model coefficient.

	Args:
		x: DataFrame whose column names are used as feature names.
		model: Object exposing ``named_steps``; the step named ``clsName``
			must provide a ``coef_`` array.
		clsName: Key of the step to read the coefficients from.

	Returns:
		A DataFrame with columns "Feature", "Value" (the coefficient),
		"Abs_value" and "Colors" ("green" for a positive coefficient, "red"
		otherwise), sorted by "Abs_value" in descending order.
	"""
	names = x.columns.values.tolist()
	weights = model.named_steps[clsName].coef_.flatten()
	# Names and flattened coefficients are paired positionally.
	ranking = pd.DataFrame(zip(names, weights), columns=["Feature", "Value"])
	ranking["Abs_value"] = ranking["Value"].map(abs)
	ranking["Colors"] = ranking["Value"].map(lambda coef: "green" if coef > 0 else "red")
	return ranking.sort_values("Abs_value", ascending=False)

def feature_plot(df, picName):
	"""Draw a bar chart of the first 20 rows of a feature ranking and save it.

	Args:
		df: DataFrame with "Feature", "Value" and "Colors" columns; only its
			first 20 rows are plotted.
		picName: Output path passed to ``plt.savefig``.
	"""
	top = df.head(20)
	fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 12))
	# Each bar is coloured by the matching entry of the "Colors" column.
	sns.barplot(data=top, x="Feature", y="Value", palette=top["Colors"], ax=ax)
	# Rotate the feature names so they do not overlap.
	ax.set_xticklabels(ax.get_xticklabels(), rotation=90, fontsize=20)
	ax.set_title("Top 20 Features", fontsize=25)
	ax.set_xlabel("Feature Name", fontsize=20)
	ax.set_ylabel("Coef", fontsize=22)
	plt.savefig(picName)

# Rank the features, save the ranking table and plot the top 20
f_imp = feature_importance(x=xtrain, model=clf)
f_imp.to_csv(path_or_buf="LinearSVC.FeaturesRanking.txt", sep="\t", index=False)
feature_plot(df=f_imp, picName="LinearSVC.Top20Features.png")

模型测试

使用模型对测试数据进行预测。

def modelPredict(xtest, clf, clsList):
	"""Predict the class of every test sample and pair it with the sample name.

	Args:
		xtest: Feature table; its index supplies the sample names.
		clf: Object whose ``predict(xtest)`` returns one class code per row.
		clsList: Class labels; each predicted code is used as an index into
			this list.

	Returns:
		A list of ``(sample_name, class_label)`` tuples in row order.
	"""
	# Translate the numeric predictions back into their class labels.
	labels = [clsList[code] for code in clf.predict(xtest)]
	return list(zip(xtest.index, labels))

# Predict the test set and write one "sample<TAB>predicted class" line per sample
xtest = test.iloc[:, 1:]
ytest = test["Class"]
zTest = modelPredict(xtest, clf, classList)
# The context manager closes the file even if a write raises.
with open("LinearSVC.results.txt", "w", encoding="utf-8") as results:
	for z in zTest:
		results.write(z[0] + "\t" + z[1] + "\n")