Python으로 바이오 데이터 시각화하기

이 토픽을 마치면

matplotlib으로 바이오 연구에서 자주 쓰는 세 가지 그래프(생존곡선, volcano plot, heatmap)를 그릴 수 있습니다.

matplotlib 기초: 첫 번째 그래프

matplotlib은 Python의 대표적인 시각화 라이브러리입니다. plt.plot() 한 줄로도 그래프를 그릴 수 있지만, 이 글에서는 축과 라벨을 세밀하게 조정하기 쉬운 객체 지향 방식(fig, ax = plt.subplots())을 사용합니다.

python

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy as np

# Cell growth curve (exponential growth): paired (hour, cell count) readings.
measurements = [
    (0, 1000),
    (2, 1200),
    (4, 1800),
    (6, 3200),
    (8, 6000),
    (10, 11000),
    (12, 22000),
    (24, 500000),
]
hours, cell_count = (np.array(column) for column in zip(*measurements))

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(
    hours,
    cell_count,
    marker="o",
    linestyle="-",
    color="#2E86AB",
    linewidth=2,
    markersize=6,
)
# The counts span several orders of magnitude, so the y-axis is log-scaled.
ax.set_yscale("log")
ax.set_title("Cell Growth Curve", fontsize=14, fontweight="bold")
ax.set_xlabel("Time (hours)", fontsize=12)
ax.set_ylabel("Cell Count", fontsize=12)
ax.grid(True, alpha=0.3)

# Write the figure to disk and release it; nothing is shown on screen.
fig.tight_layout()
fig.savefig("growth_curve.png", dpi=150)
plt.close(fig)

# Sanity checks on the input data.
assert hours.size == 8
assert cell_count[-1] == 500000
print("growth_curve.png 저장 완료")

바 차트: 여러 유전자 GC Content 비교

python

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy as np

def calculate_gc(seq: str) -> float:
    """Return the GC content of a nucleotide sequence as a percentage.

    G and C are counted case-insensitively and divided by the total
    sequence length, so any other character (A, T, N, ...) counts toward
    the denominator only.

    Args:
        seq: Nucleotide sequence.

    Returns:
        GC content in the range 0-100. An empty sequence yields 0.0
        instead of raising ZeroDivisionError.
    """
    if not seq:
        return 0.0
    seq = seq.upper()
    return (seq.count("G") + seq.count("C")) / len(seq) * 100

# Example sequences keyed by gene symbol.
genes = {
    "BRCA1": "ATGGATTTATCTGCTCTTCGCGTTGAAGAAGTACAAAATGTC",
    "TP53": "ATGGAGGAGCCGCAGTCAGATCCTAGCGTGAGTTTGCTGTGA",
    "EGFR": "ATGCGACCCTCCGGGACGGCCGGGGCAGCGCTCCTGGCGCTG",
    "MYC": "ATGCCCCTCAACGTTAGCTTCACCAACAGGAACTATGACCTCG",
    "KRAS": "ATGACTGAATATAAACTTGTGGTAGTTGGAGCTGGTGGCGTAG",
}

names = list(genes)
gc_values = list(map(calculate_gc, genes.values()))

# Bars above the 60% line are drawn in red, the rest in blue.
colors = []
for gc_percent in gc_values:
    if gc_percent > 60:
        colors.append("#E74C3C")
    else:
        colors.append("#2E86AB")

fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(names, gc_values, color=colors, edgecolor="white", linewidth=0.5)
# Dashed reference line marking the same 60% cut-off used for the colors.
ax.axhline(
    y=60,
    color="#E74C3C",
    linestyle="--",
    alpha=0.5,
    label="GC > 60% threshold",
)
ax.set_title("GC Content by Gene", fontsize=14, fontweight="bold")
ax.set_ylabel("GC Content (%)", fontsize=12)
ax.set_ylim(0, 100)
ax.legend()

fig.tight_layout()
fig.savefig("gc_bar_chart.png", dpi=150)
plt.close(fig)

# Sanity checks: one value per gene, each a valid percentage.
assert len(gc_values) == 5
assert not any(gc < 0 or gc > 100 for gc in gc_values)
formatted_values = [f"{gc:.1f}%" for gc in gc_values]
print(f"GC values: {formatted_values}")

Kaplan-Meier 생존곡선

Kaplan-Meier 생존곡선은 임상시험 결과를 보여줄 때 가장 널리 쓰이는 시각화 중 하나입니다. 두 그룹(치료군 vs 대조군)의 시간에 따른 생존율을 비교합니다.

python

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy as np

# Fix the RNG seed so the synthetic cohort is reproducible.
np.random.seed(42)

# Synthetic clinical data: exponentially distributed times for each arm
# (scale 24 for treatment, 16 for control), sorted in ascending order.
n_patients = 50
treatment_times = np.random.exponential(scale=24, size=n_patients)
treatment_times.sort()
control_times = np.random.exponential(scale=16, size=n_patients)
control_times.sort()

def kaplan_meier(times, max_time=48):
    """Estimate a Kaplan-Meier survival curve from event times.

    Every entry of ``times`` is treated as an observed event. Subjects
    whose time exceeds ``max_time`` are treated as censored at
    ``max_time``: they stay in the risk set for every earlier event but
    contribute no event themselves. Discarding them (as a plain filter
    would) shrinks the denominator and forces the curve down to 0 at the
    last observed event.

    Args:
        times: Event times, one per subject (array or sequence).
        max_time: End of follow-up; later events are not observed.

    Returns:
        Tuple ``(km_times, km_survival)`` of NumPy arrays. Both start with
        the point (0, 1.0) and have one further entry per distinct event
        time that is ``<= max_time``.
    """
    times = np.asarray(times)
    # Distinct observed event times (ascending) and how many events each has.
    event_times, event_counts = np.unique(
        times[times <= max_time], return_counts=True
    )

    km_times = [0]
    km_survival = [1.0]
    # The risk set starts with every subject, including those censored later.
    at_risk = times.size
    for t, events in zip(event_times, event_counts):
        km_survival.append(km_survival[-1] * (1 - events / at_risk))
        km_times.append(t)
        at_risk -= events
    return np.array(km_times), np.array(km_survival)

t_times, t_surv = kaplan_meier(treatment_times)
c_times, c_surv = kaplan_meier(control_times)

# One entry per study arm: (times, survival, color, legend label).
curves = [
    (t_times, t_surv, "#2E86AB", "Treatment (n=50)"),
    (c_times, c_surv, "#E74C3C", "Control (n=50)"),
]

fig, ax = plt.subplots(figsize=(8, 5))
# Step lines first, then the translucent fills, in the same arm order.
for arm_times, arm_surv, arm_color, arm_label in curves:
    ax.step(
        arm_times,
        arm_surv,
        where="post",
        color=arm_color,
        linewidth=2,
        label=arm_label,
    )
for arm_times, arm_surv, arm_color, _ in curves:
    ax.fill_between(arm_times, arm_surv, step="post", alpha=0.1, color=arm_color)

ax.set_title("Kaplan-Meier Survival Curve", fontsize=14, fontweight="bold")
ax.set_xlabel("Time (months)", fontsize=12)
ax.set_ylabel("Survival Probability", fontsize=12)
ax.set_xlim(0, 48)
ax.set_ylim(0, 1.05)
ax.grid(True, alpha=0.3)
ax.legend(fontsize=11, loc="lower left")

fig.tight_layout()
fig.savefig("kaplan_meier.png", dpi=150)
plt.close(fig)

# Both curves must start at 100% survival and contain at least one event.
assert t_surv[0] == 1.0
assert c_surv[0] == 1.0
assert len(t_times) > 1
print("kaplan_meier.png 저장 완료")

Volcano Plot: 차등 발현 유전자 시각화

Volcano plot은 RNA-seq 데이터에서 유의미하게 발현이 변한 유전자를 한눈에 보여줍니다. X축은 변화량(log2 fold change), Y축은 통계적 유의성(-log10 p-value)입니다.

python

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy as np

# Fix the RNG seed so the simulated genes are reproducible.
np.random.seed(42)
n_genes = 500

# Simulated differential expression: normally distributed log2 fold changes,
# with p-values that shrink as the absolute fold change grows.
log2fc = np.random.normal(0, 1.5, n_genes)
exponent = -np.abs(log2fc) * np.random.uniform(0.5, 3, n_genes)
pvalues = 10 ** exponent

fc_threshold = 1.0
p_threshold = 0.05

# A gene is up/down-regulated only if it passes both the p-value cut-off and
# the fold-change cut-off in the corresponding direction.
is_significant = pvalues < p_threshold
is_up = is_significant & (log2fc > fc_threshold)
is_down = is_significant & (log2fc < -fc_threshold)
colors = [
    "#E74C3C" if up else "#2E86AB" if down else "#CCCCCC"  # up / down / n.s.
    for up, down in zip(is_up, is_down)
]

neg_log_p = -np.log10(pvalues)

# Count the genes in each regulated class from their assigned colors.
n_up = colors.count("#E74C3C")
n_down = colors.count("#2E86AB")

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(log2fc, neg_log_p, c=colors, s=10, alpha=0.7, edgecolors="none")

# Dashed guides: the p-value cut-off (horizontal) and both fold-change
# cut-offs (vertical).
guide_style = dict(color="gray", linestyle="--", alpha=0.5)
ax.axhline(-np.log10(p_threshold), **guide_style)
for boundary in (fc_threshold, -fc_threshold):
    ax.axvline(boundary, **guide_style)

ax.set_title(
    f"Volcano Plot (↑{n_up} ↓{n_down} DEGs)",
    fontsize=14,
    fontweight="bold",
)
ax.set_xlabel("log₂ Fold Change", fontsize=12)
ax.set_ylabel("-log₁₀ p-value", fontsize=12)
ax.grid(True, alpha=0.2)

fig.tight_layout()
fig.savefig("volcano_plot.png", dpi=150)
plt.close(fig)

# The simulated data should contain genes in both directions.
assert n_up > 0
assert n_down > 0
print(f"상향: {n_up}개, 하향: {n_down}개")

Heatmap: 유전자 발현 패턴

Heatmap은 여러 샘플에서 유전자 발현량을 색상으로 비교합니다.

python

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy as np

# Fix the RNG seed so the simulated matrix is reproducible.
np.random.seed(42)

gene_names = ["BRCA1", "TP53", "EGFR", "MYC", "KRAS", "PTEN", "RB1", "APC"]
sample_names = ["Normal_1", "Normal_2", "Tumor_1", "Tumor_2", "Tumor_3"]

# Rows are genes, columns are samples; start from standard-normal noise.
n_rows, n_cols = len(gene_names), len(sample_names)
expression = np.random.randn(n_rows, n_cols)

# Raise the last three columns (the Tumor_* samples) by a random offset
# drawn uniformly from [0.5, 2.0).
tumor_offset = np.random.uniform(0.5, 2.0, (n_rows, 3))
expression[:, 2:] = expression[:, 2:] + tumor_offset

fig, ax = plt.subplots(figsize=(8, 6))
# Diverging colormap with a symmetric color range of [-3, 3].
im = ax.imshow(expression, cmap="RdBu_r", aspect="auto", vmin=-3, vmax=3)

# One tick per sample (x) and per gene (y), labelled by name.
ax.set_yticks(range(len(gene_names)))
ax.set_yticklabels(gene_names, fontsize=10)
ax.set_xticks(range(len(sample_names)))
ax.set_xticklabels(sample_names, rotation=45, ha="right", fontsize=10)

ax.set_title("Gene Expression Heatmap", fontsize=14, fontweight="bold")
fig.colorbar(im, ax=ax, label="Z-score", shrink=0.8)

fig.tight_layout()
fig.savefig("heatmap.png", dpi=150)
plt.close(fig)

# Sanity check: 8 genes by 5 samples.
assert expression.shape == (8, 5)
n_gene_rows, n_sample_cols = expression.shape
print(f"Heatmap: {n_gene_rows} genes × {n_sample_cols} samples")

직접 해보기 (Faded Example)

아래 빈칸을 채워 scatter plot을 완성하세요.

빈칸 채우기

python

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
x = [1, 2, 3, 4, 5]
y = [2.1, 3.9, 6.2, 7.8, 10.1]
fig, ax = plt.subplots()
ax.____(x, y, color="blue")
ax.set_xlabel("Concentration")
ax.set_ylabel("____")
fig.savefig("scatter.png")
plt.close(fig)

흔한 에러 & 해결법

Q: 그래프가 안 뜨고 "UserWarning: Matplotlib is currently using agg, which is a non-GUI backend" 경고가 나옵니다

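Agg는 화면에 창을 띄우지 않는(non-GUI) 백엔드라서 plt.show()로는 그래프가 표시되지 않고 이 경고가 나옵니다. 디스플레이가 없는 서버 환경(SSH 등)에서는 matplotlib.use("Agg")를 plt import 전에 호출하고, plt.show() 대신 fig.savefig()로 파일 저장합니다. Colab이나 Jupyter에서는 matplotlib.use("Agg") 줄을 빼면 그래프가 셀 아래에 바로 표시됩니다.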

Q: 한글 제목이 깨집니다

plt.rcParams['font.family'] = 'NanumGothic'을 추가하세요. Colab에서는 !apt-get install -y fonts-nanum 후 런타임을 재시작해야 합니다.

Q: 그래프가 잘려서 라벨이 안 보입니다

fig.tight_layout()을 savefig() 전에 호출하세요. 대부분의 잘림 문제가 해결됩니다.

다음 글에서는 머신러닝으로 유전자 발현 데이터를 분류하는 법을 배웁니다.