记录一下几种统计学方法的python写法
fisher精确检验
odds ratio:两个事件的相关度,大于1为正相关,小于1为负相关,等于1为不相关;
p-value:置信度
置信区间:odds ratio在此区间内置信度为p-value
import pandas as pd
from scipy.stats import fisher_exact
from scipy.stats import norm
import math
def fisher_own(dataframe, groupAname, groupBname, yates=True):
groupA_muts = dataframe.loc["突变", groupAname]
groupA_unmuts = dataframe.loc["未突变", groupAname]
groupB_muts = dataframe.loc["突变", groupBname]
groupB_unmuts = dataframe.loc["未突变", groupBname]
# 构建二联表
observed = [[groupA_muts, groupB_muts], [groupA_unmuts, groupB_unmuts]]
# 如果列联表中包含0值,需要Yates修正,将所有值增加一个平均数
if yates:
while 0 in [groupA_muts, groupA_unmuts, groupB_muts, groupB_unmuts]:
ave = float(sum([groupA_muts, groupA_unmuts, groupB_muts, groupB_unmuts]) / 4)
groupA_muts = groupA_muts + ave
groupA_unmuts = groupA_unmuts + ave
groupB_muts = groupB_muts + ave
groupB_unmuts = groupB_unmuts + ave
observed = [[groupA_muts, groupB_muts], [groupA_unmuts, groupB_unmuts]]
odds_ratio, p_value = fisher_exact(observed)
# 计算0.95置信水平下的置信区间,存在0值时,理论上fisher精确检验无法计算置信区间
# 改用wilson方法进行计算
alpha = 0.05
if 0 in [groupA_muts, groupA_unmuts, groupB_muts, groupB_unmuts]:
n = groupA_muts + groupA_unmuts + groupB_muts + groupB_unmuts
p_hat = (groupA_muts + 0.5) / (groupA_muts + groupB_muts + 1)
z_alpha_half = norm.ppf(1 - alpha / 2)
denominator = 1 + (z_alpha_half ** 2) / n
centre_adjusted_probability = p_hat + (z_alpha_half ** 2) / (2 * n)
adjusted_standard_error = math.sqrt((p_hat * (1 - p_hat) + (z_alpha_half ** 2) / (4 * n)) / n)
lower_bound = centre_adjusted_probability - adjusted_standard_error * z_alpha_half / denominator
upper_bound = centre_adjusted_probability + adjusted_standard_error * z_alpha_half / denominator
else:
try:
log_odds_ratio = math.log(odds_ratio)
se_log = math.sqrt((1 / groupA_muts) + (1 / groupA_unmuts) + (1 / groupB_muts) + (1 / groupB_unmuts))
z_alpha_half = abs(math.erf(alpha / math.sqrt(2)))
lower_bound = math.exp(log_odds_ratio - z_alpha_half * se_log)
upper_bound = math.exp(log_odds_ratio + z_alpha_half * se_log)
except:
lower_bound = upper_bound = ""
return odds_ratio, p_value, lower_bound, upper_bound
data = {"组A": {"突变": 34, "未突变": 0}, "组B": {"突变": 10, "未突变": 20}}
df = pd.DataFrame(data)
output = fisher_own(df, "组A", "组B")
Bonferroni校正
Bonferroni校正是一种用于多重比较中控制错误率的方法,当我们进行多个统计假设检验时,以防止因执行大量假设检验而增加,在对多个位点逐个进行了fisher精确检验等统计后,将p值与新的显著性水平进行比较。
def bonferroni_fix(p_value_dict, alpha=0.05):
bonfierroni_pass = {}
n = len(p_value_dict)
new_alpha = alpha / n
for i in p_value_dict:
if p_value_dict[i] < new_alpha:
bonfierroni_pass[i] = True
else:
bonfierroni_pass[i] = False
return bonfierroni_pass
卡方检验
期望值 = (行总和 x 列总和)/ 总样本数
卡方值:计算观察与期望值的差异,越大越显著,0值时不显著
自由度:可用独立变化的数据个数,一般是行数减去1再乘以列数减去1
P值:观察的数据与原假设的矛盾程度,如果小于显著性水平(一般是0.05)则拒绝原假设,并认为观察到的结果是显著的
贡献度:某些情况下不同因素对结果的贡献程度,越大则贡献度越大,负值表示减少了整体差异
import pandas as pd
import numpy as np
from scipy.stats import chi2
def chi2_own(dataframe):
observed = dataframe.T.values
# 计算期望频次
row_totals = observed.sum(axis=1)
col_totals = observed.sum(axis=0)
total = observed.sum()
expected = np.outer(row_totals, col_totals) / total
# 计算卡方值
chi_squared = np.sum((observed - expected) ** 2 / expected)
# 计算自由度和显著性水平
degrees_of_freedom = (observed.shape[0] - 1) * (observed.shape[1] - 1)
p_value = 1 - chi2.cdf(chi_squared, degrees_of_freedom)
# 计算贡献度
with np.errstate(divide="ignore", invalid="ignore"):
contribution = (observed / total) * (np.log(observed / expected))
# nan值改为0
contribution[np.isnan(contribution)] = 0
df_contri = pd.DataFrame(contribution).T
df_contri.index = dataframe.index
df_contri.columns = dataframe.columns
return chi_squared, p_value, degrees_of_freedom, df_contri
data = {"组A": {"位点1": 34, "位点2": 0, "位点3": 29, "位点4": 10}, "组B": {"位点1": 4, "位点2": 10, "位点3": 20, "位点4": 10}}
df = pd.DataFrame(data)
output = chi2_own(df)
蒙特卡洛模拟
个别位点仅在单个样本中突变,计算所得卡方值比较大,因此使用Monte Carlo模拟数据,并计算一个卡方值的阈值,认为95分位数时,显著,实际操作中卡方值以此为上限。这里使用了上面的chi2_own函数。
import random
import numpy as np
def monte_carlo_sim(act_data_list):
# observed_data = np.array(act_data_list)
num_simulations = 10000
chi2_values = []
for n in range(num_simulations):
# 模拟所得的虚拟样本
simulated_data = random.choice(act_data_list)
chi_squared, p_value, _, _ = chi2_own(simulated_data)
if not str(chi_squared) == "nan":
chi2_values.append(chi_squared)
# 模拟过程中观察到的卡方值分布中的 95 分位数
percentile_95 = np.percentile(chi2_values, 95)
print("观察卡方值分布中的 95 分位数:", percentile_95)
ANOVA多因素方差分析
当进行多组比较时,显然不适用单位点的假设检验,因此这里需要直接构建一个多位点的矩阵
import pandas as pd
import numpy as np
from scipy.stats import f_oneway
from scipy.stats import kruskal
from statsmodels.stats.multicomp import pairwise_tukeyhsd
def anova_own(dataframe):
df = dataframe.T
anova_results = f_oneway(*[df[group] for group in df.columns])
anova_statistic, anova_pvalue = anova_results
# 如果方差分析的p值低于显著性水平,进行多重比较校正
alpha = 0.05
if anova_pvalue < alpha:
tukey_results = pairwise_tukeyhsd(df.values.flatten(), np.repeat(df.columns, df.shape[0]))
# 计算每个位点对于分组差异的贡献度
mean_per_group = df.mean()
total_mean = df.values.mean()
contribution = (mean_per_group - total_mean).abs() / total_mean
# 输出位点的p值
p_values = {}
for column in df.columns:
group_data = [df[column][group] for group in df.T.columns]
_, p_value = kruskal(*group_data)
p_values[column] = p_value
return anova_results, contribution, p_values
data = {
"组A": {"位点1": 34, "位点2": 10, "位点3": 0, "位点4": 24},
"组B": {"位点1": 0, "位点2": 20, "位点3": 13, "位点4": 77},
"组C": {"位点1": 6, "位点2": 15, "位点3": 1, "位点4": 5},
"组D": {"位点1": 1, "位点2": 0, "位点3": 2, "位点4": 1}
}
df = pd.DataFrame(data)
Shapiro-Wilk test
正态分布检验 p≥0.05正态分布,p<0.05非正态分布,对每个组的数据进行Shapiro-Wilk检验。
from scipy.stats import shapiro
def shapiro_wilk_own(dataframe):
groupDict = {}
for group in dataframe.columns:
groupDict[group] = shapiro(dataframe[group].values)
return groupDict
Levene’s test
方差齐性检验 p≥0.05方差齐,p<0.05方差齐。
from scipy.stats import levene
def levene_own(dataframe):
dataGroupList = []
for group in dataframe.columns:
dataGroupList.append(dataframe[group].values)
lev_results = levene(*dataGroupList)
return lev_results