富集分析气泡图可视化
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# 加载数据
data = pd.read_csv('enrichment.chart.txt', delimiter='\t')
# 确保PValue不为0,以便取对数
data['PValue'] = data['PValue'].replace(0, np.nan)
data.dropna(subset=['PValue'], inplace=True)
# 对PValue取负对数,以便更好地在图上表示
data['-log10(PValue)'] = -np.log10(data['PValue'])
# 过滤掉基因数目小于5的项,且只保留GO term相关的条目
data = data[(data['Count'] >= 5) & data['Term'].str.contains("GO:")]
# 将Term中“~”之后的内容作为新的Term
data['Term'] = data['Term'].apply(lambda x: x.split('~')[-1])
# 按PValue排序
data.sort_values('PValue', inplace=True)
# 设置图的大小
plt.figure(figsize=(24, 6))
# 绘制气泡图,交换x和y轴的数据
scatter = plt.scatter(data['Term'], data['Fold Enrichment'],
s=data['Count']*30, # 气泡大小,Count字段决定,可以按需调整系数
c=data['-log10(PValue)'], # 颜色由PValue决定
cmap='viridis', alpha=1, edgecolors='w', linewidth=0.5)
# 添加色标
plt.colorbar(scatter, label='-log10(PValue)')
# 添加标题和轴标签
#plt.xlabel('GO Terms')
plt.ylabel('Fold Enrichment')
#plt.title('Gene Set Enrichment Analysis (GSEA) Bubble Plot')
# 创建用于图例的假标签
for count in [5, 10, 15, 20]: # 示例基因数目,可以根据需要调整
plt.scatter([], [], s=count*10, c='black', alpha=1, label=str(count))
# 添加图例,说明基因数目
plt.legend(scatterpoints=1, frameon=False, labelspacing=1, title='Gene Count', bbox_to_anchor=(1.05, 1))
# 旋转x轴标签90度
plt.xticks(rotation=45)
# 展示图形
plt.savefig('enrich.pdf')