99.Enrichment

富集分析气泡图可视化

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# 加载数据
data = pd.read_csv('enrichment.chart.txt', delimiter='\t')

# 确保PValue不为0,以便取对数
data['PValue'] = data['PValue'].replace(0, np.nan)
data.dropna(subset=['PValue'], inplace=True)

# 对PValue取负对数,以便更好地在图上表示
data['-log10(PValue)'] = -np.log10(data['PValue'])

# 过滤掉基因数目小于5的项,且只保留GO term相关的条目
data = data[(data['Count'] >= 5) & data['Term'].str.contains("GO:")]

# 将Term中“~”之后的内容作为新的Term
data['Term'] = data['Term'].apply(lambda x: x.split('~')[-1])

# 按PValue排序
data.sort_values('PValue', inplace=True)

# 设置图的大小
plt.figure(figsize=(24, 6))

# 绘制气泡图,交换x和y轴的数据
scatter = plt.scatter(data['Term'], data['Fold Enrichment'], 
                      s=data['Count']*30,  # 气泡大小,Count字段决定,可以按需调整系数
                      c=data['-log10(PValue)'],  # 颜色由PValue决定
                      cmap='viridis', alpha=1, edgecolors='w', linewidth=0.5)

# 添加色标
plt.colorbar(scatter, label='-log10(PValue)')

# 添加标题和轴标签
#plt.xlabel('GO Terms')
plt.ylabel('Fold Enrichment')
#plt.title('Gene Set Enrichment Analysis (GSEA) Bubble Plot')

# 创建用于图例的假标签
for count in [5, 10, 15, 20]:  # 示例基因数目,可以根据需要调整
    plt.scatter([], [], s=count*10, c='black', alpha=1, label=str(count))

# 添加图例,说明基因数目
plt.legend(scatterpoints=1, frameon=False, labelspacing=1, title='Gene Count', bbox_to_anchor=(1.05, 1))

# 旋转x轴标签90度
plt.xticks(rotation=45)

# 展示图形
plt.savefig('enrich.pdf')