爬取oncokb

爬虫务必设置一个大的时间间隔。

确认爬取路线:

1,从网站中获取所有基因的列表;

2,爬取基因页面;

3,解析基因页面并获得位点列表;

4,爬取位点页面;

5,解析并整理所有内容。

基因列表

oncokb 在官网的 Cancer Gene List 页面提供了基因列表文件(cancerGeneList.tsv)的下载,下面的脚本直接读取这个文件。

爬取基因页面

oncokb的页面URL结构非常整洁,照旧使用playwright框架来爬取,由于每个页面下可能有多个标签,标签的内容需要点击标签才会加载出来,因此这里每个标签都尝试点击一次并保存一次。

from playwright.sync_api import Playwright, sync_playwright, expect
import time
import random

def run(playwright: Playwright) -> None:
    """Crawl OncoKB gene pages with Playwright and save each tab's HTML.

    Reads the gene list from cancerGeneList.tsv, skips genes already
    recorded in finished.txt, and for every remaining gene saves the
    rendered HTML of each clickable tab plus a final snapshot under
    html/gene/. Each completed gene is appended to finished.txt so an
    interrupted crawl can be resumed.
    """
    # Genes already crawled in a previous run (one Hugo symbol per line).
    finished = set()
    with open("finished.txt", "r", encoding="utf-8") as f:
        for line in f:
            finished.add(line.strip())

    # First column of the OncoKB cancer gene list TSV is the Hugo symbol.
    genelist = []
    with open("cancerGeneList.tsv", "r", encoding="utf-8") as g:
        for line in g:
            if not line.startswith("Hugo Symbol"):
                fields = line.strip().split("\t")
                # BUG FIX: the original tested `lineList in finished`, i.e.
                # a *list* against a collection of strings — never true, so
                # already-finished genes were crawled again. Compare the
                # Hugo symbol itself.
                if fields[0] not in finished:
                    genelist.append(fields[0])
    # Shuffle so the access pattern looks less bot-like.
    random.shuffle(genelist)

    browser = playwright.chromium.launch(headless=True)
    context = browser.new_context()

    # Inject the stealth (anti-bot-detection) script into every page.
    page = context.new_page()
    page.add_init_script(path="stealth.min.js")

    # (tab label on the page, suffix used for the saved HTML file)
    tabs = [
        ("Annotated Alterations", "alterations"),
        ("Therapeutic", "therapeutic"),
        ("Diagnostic", "diagnostic"),
        ("Prognostic", "prognostic"),
        ("FDA-Recognized Content", "fda"),
    ]

    for gene in genelist:
        page.wait_for_load_state("domcontentloaded")
        page.goto("https://www.oncokb.org/gene/{gene}".format(gene=gene), timeout=0)
        time.sleep(random.randint(12, 20))

        for tab_name, suffix in tabs:
            # A tab may be absent for this gene; in that case skip it
            # quietly (same best-effort behaviour as the original code).
            try:
                page.get_by_role("tab", name=tab_name).click()
                time.sleep(2)
                html = page.content()
                with open("html/gene/" + gene + "." + suffix + ".html", "w", encoding="utf-8") as f:
                    f.write(html)
            except Exception:
                pass

        # Whatever tab ended up active, keep one final snapshot as well.
        html = page.content()
        with open("html/gene/" + gene + ".empty.html", "w", encoding="utf-8") as f:
            f.write(html)

        time.sleep(0.3)
        print("【已完成】", gene)

        # Record progress immediately so a crash loses at most one gene.
        with open("finished.txt", "a", encoding="utf-8") as f:
            f.write(gene + "\n")

        # 加大爬取间隔,防止被禁 (long delay between genes to avoid a ban).
        time.sleep(random.randint(20, 30))

    page.close()
    context.close()
    browser.close()

# Script entry point: start Playwright and run the whole crawl.
with sync_playwright() as playwright:
    run(playwright)

解析基因页面获得位点列表

使用BeautifulSoup进行解析

import os
from bs4 import BeautifulSoup

# 基因基本信息
# 基因基本信息 (basic gene info)
def gene_base_info(gene):
    """Parse html/gene/<gene>.empty.html into one summary row.

    Returns a 7-element list:
    [gene, NCBI id, Ensembl gene, GRCh37 location, GRCh38 location,
     Ensembl transcript, RefSeq], using "-" for missing fields, or a
    placeholder row when OncoKB reports no record for the gene.
    """
    with open("html/gene/{gene}.empty.html".format(gene=gene), "r", encoding="utf-8") as htmlFile:
        soup = BeautifulSoup(htmlFile.read(), "html.parser")

    # OncoKB shows an alert box when it has no record for this gene.
    for d in soup.find_all("div", role="alert"):
        if "We do not have any information for this gene" in d.text:
            return [gene, "无此基因记录", "", "", "", "", ""]

    # The first table alternates label/value cells; fold pairs into a dict.
    geneInfoTable = soup.find_all("table")[0]
    tds = geneInfoTable.find_all("td")
    tdDict = {label.text: value.text for label, value in zip(tds[::2], tds[1::2])}

    ncbi = tdDict["NCBI Gene"]
    embl = tdDict["Ensembl Gene"].split(" (")[0] if "Ensembl Gene" in tdDict else "-"
    grch37 = grch38 = "-"
    if "Location" in tdDict:
        location = tdDict["Location"]
        if "GRch37" in location:
            grch37 = location.split(" (GRch37)")[0].replace("Chr", "chr")
        if "GRch38" in location:
            # BUG FIX: the original used rstrip(" (GRch38)"), which strips a
            # *character set* and could truncate coordinates ending in '3',
            # '8', etc.; split on the literal suffix instead.
            if "GRch37" in location:
                grch38 = location.split(" (GRch37)")[1].split(" (GRch38)")[0].strip().replace("Chr", "chr")
            else:
                grch38 = location.split(" (GRch38)")[0].replace("Chr", "chr")
    embl_t = tdDict["Ensembl Transcript"].split(" (")[0] if "Ensembl Transcript" in tdDict else "-"
    refseq = tdDict["RefSeq"].split(" (")[0] if "RefSeq" in tdDict else "-"
    return [gene, ncbi, embl, grch37, grch38, embl_t, refseq]

# alterations
def alterations_info(gene):
    htmlFile = open("html/gene/{gene}.alterations.html".format(gene=gene), "r", encoding="utf-8")
    htmlHandle = htmlFile.read()
    soup = BeautifulSoup(htmlHandle, "html.parser")
    rows = soup.find_all("div", role="row")
    outputList = []
    for i in rows[1:]:
        divs = i.find_all("div")
        outputList.append([gene, divs[0].text, divs[1].text, divs[2].text])
    htmlFile.close()
    return outputList

# diagnostic
def diagnostic_info(gene):
    htmlFile = open("html/gene/{gene}.diagnostic.html".format(gene=gene), "r", encoding="utf-8")
    htmlHandle = htmlFile.read()
    soup = BeautifulSoup(htmlHandle, "html.parser")
    rows = soup.find_all("div", role="row")
    outputList = []
    for i in rows[1:]:
        divs = i.find_all("div")
        level = divs[0].span.i.get("class")[-1].lstrip("level-")
        output = [gene, level, divs[2].text, divs[3].text]
        outputList.append(output)
    htmlFile.close()
    return outputList

# therapeutic
def therapeutic_info(gene):
    htmlFile = open("html/gene/{gene}.therapeutic.html".format(gene=gene), "r", encoding="utf-8")
    htmlHandle = htmlFile.read()
    soup = BeautifulSoup(htmlHandle, "html.parser")
    rows = soup.find_all("div", role="row")
    outputList = []
    for i in rows[1:]:
        divs = i.find_all("div")
        level = divs[0].span.i.get("class")[-1].lstrip("level-")
        output = [gene, level, divs[2].text, divs[3].text, divs[4].text]
        outputList.append(output)
    htmlFile.close()
    return outputList

# prognostic
def prognostic_info(gene):
    htmlFile = open("html/gene/{gene}.prognostic.html".format(gene=gene), "r", encoding="utf-8")
    htmlHandle = htmlFile.read()
    soup = BeautifulSoup(htmlHandle, "html.parser")
    rows = soup.find_all("div", role="row")
    outputList = []
    for i in rows[1:]:
        divs = i.find_all("div")
        level = divs[0].span.i.get("class")[-1].lstrip("level-")
        output = [gene, level, divs[2].text, divs[3].text]
        outputList.append(output)
    htmlFile.close()
    return outputList

# fda
def fda_info(gene):
    htmlFile = open("html/gene/{gene}.fda.html".format(gene=gene), "r", encoding="utf-8")
    htmlHandle = htmlFile.read()
    soup = BeautifulSoup(htmlHandle, "html.parser")
    rows = soup.find_all("div", role="row")
    outputList = []
    for i in rows[1:]:
        divs = i.find_all("div")
        output = [gene, divs[1].text, divs[2].text, divs[3].text]
        outputList.append(output)
    return outputList

####################
# Driver: run every parser over every crawled gene and write six TSVs.
geneList = []
with open("oncokb.all.txt", "r", encoding="utf-8") as g:
    for line in g:
        geneList.append(line.strip())

# One output handle per saved-HTML suffix.
outputs = {
    "empty": open("output_all/output.summary.txt", "w", encoding="utf-8"),
    "alterations": open("output_all/output.anno.txt", "w", encoding="utf-8"),
    "therapeutic": open("output_all/output.thera.txt", "w", encoding="utf-8"),
    "diagnostic": open("output_all/output.diagn.txt", "w", encoding="utf-8"),
    "prognostic": open("output_all/output.progn.txt", "w", encoding="utf-8"),
    "fda": open("output_all/output.fda.txt", "w", encoding="utf-8"),
}
# Parser per suffix; each returns a list of rows (gene_base_info returns
# a single row, so it is wrapped to match the others).
parsers = {
    "empty": lambda g: [gene_base_info(g)],
    "alterations": alterations_info,
    "therapeutic": therapeutic_info,
    "diagnostic": diagnostic_info,
    "prognostic": prognostic_info,
    "fda": fda_info,
}
# As in the original, prognostic/fda parsing is best-effort: a parse
# failure for one gene skips that gene's section instead of aborting.
best_effort = {"prognostic", "fda"}

# Set for O(1) membership tests instead of scanning a list per lookup.
htmlList = set(os.listdir("html/gene"))
try:
    for gene in geneList:
        print(gene)
        for section in ["empty", "alterations", "diagnostic", "therapeutic", "prognostic", "fda"]:
            if (gene + "." + section + ".html") not in htmlList:
                continue
            try:
                rows = parsers[section](gene)
            except Exception:
                if section in best_effort:
                    continue
                raise
            for row in rows:
                outputs[section].write("\t".join(row) + "\n")
finally:
    # Close all six handles even if a parser raises.
    for handle in outputs.values():
        handle.close()

爬取位点

与爬取基因页面基本相同:把上面的脚本稍作修改即可。注意位点名中的个别特殊字符需要先做替换(例如文件名中不宜使用 *,脚本里用 Ter 代替,输出时再换回来)。

解析位点页面

其实感觉基因页面的信息也够用了,但是实际上位点页面中,评级的信息会比基因页面的信息多,另外位点页面中会包含位点的解析。因此这里就只把需要的突变信息和评级抓出来了。

from bs4 import BeautifulSoup

# 基因基本信息
def variant_base_info(gene, variant):
    # MET	981_1028splice
    # gene = "MET"
    # variant = "981_1028splice"
    oncoFixResult = variantDescription = geneDescription = "-"
    htmlFile = open("html/variant/{gene}_{variant}.empty.html".format(gene=gene, variant=variant), "r", encoding="utf-8")
    htmlHandle = htmlFile.read()
    soup = BeautifulSoup(htmlHandle, "html.parser")
    try:
        geneDescription = soup.find_all("div", attrs={"class": "mb-3"})[0].text.strip()
    except:
        pass
    variantDescriptionList = soup.find_all("span")
    try:
        n = 0
        for vars in variantDescriptionList:
            variant = variant.replace("Fusion", "fusion").replace("Amplification", "Amplification of ").replace("Ter", "*")
            if "PMID" in vars.text:
                n += 1
                if n == 1:
                    variantDescription = vars.text
    except:
        pass
    oncoList = soup.find_all("h5")
    try:
        for onco in oncoList[0].find_all("span"):
            if not "style" in str(onco):
                if onco.text in ["Oncogenic", "Resistance", "Likely Oncogenic", "Likely Neutral", "Inconclusive"]:
                    oncoFixResult = onco.text
    except:
        pass


    return [geneDescription.replace("\n", "\\x0a"), variantDescription.replace("\n", "\\x0a"), oncoFixResult]



####################
# Driver: extract descriptions for every (gene, variant) pair. The saved
# filenames use "Ter" in place of "*" (presumably because "*" is not
# filename-safe), so swap it in for the lookup and back out for the output.
# Note: `input`/`output` were renamed — the originals shadowed builtins —
# and both files are now closed via context managers.
with open("oncokb.variant.txt", "r", encoding="utf-8") as infile, \
     open("output_all/oncokb.variant.des.txt", "w", encoding="utf-8") as outfile:
    for line in infile:
        fields = line.rstrip().split("\t")
        gene = fields[0]
        variant = fields[1].replace("*", "Ter")
        print(gene, variant)
        geneDes, variantDes, oncoFixResult = variant_base_info(gene, variant)
        outfile.write("\t".join([gene, variant.replace("Ter", "*"), geneDes, variantDes, oncoFixResult]) + "\n")