爬取CHPO数据库
#coding
CHPO即china HPO,是在中文人类表型标准用语联盟倡导下建立的一个公共网站,希望提供一个共享的平台有助于研究人员和医学专家共同翻译编辑Human Phenotype Ontology,以形成一个中文版的HPO。
通过查看网页源代码,发现想要的中文翻译等信息不在源码中,因此用常规的requests可能不行,因此使用selenium通过模拟浏览器进行。由于我用的是新版edge浏览器,因此需要先到微软找找文档。微软自己有一篇介绍selenium如何调用edge的文章,需要安装指定版本的python包。
pip install msedge-selenium-tools selenium==3.141
同时下载与自己目前使用的edge版本相同的webdriver。
到chpo网站看了看,发现HPO编号从HP:0000002到HP:3000079都有,如果按照5s爬一个网页算,也得大半年才能爬完(单线程)。后来发现,并不是每个HP编号都有相关信息的,但是在chpo网站又不容易获得HP编号,因此就到HPO网站去找了。
HPO网站里能很容易获得这个genes_to_phenotype.txt文件,通过这个文件可以获得HP编号。
cut genes_to_phenotype.txt -f3 | sort | uniq > hpolist.txt
最终其实只有8000左右的HP编号有用。下面是爬取代码,后期可以使用Thread和queue等库来增加线程数,获得更快速度。但是爬网站还是不能太过分,设置一个长一点的间隔,只用单线程。爬完再解析。
from msedge.selenium_tools import Edge
from msedge.selenium_tools import EdgeOptions
from bs4 import BeautifulSoup
import os
import time
import random
设置一个UA池
def randomUA():
MY_USER_AGENT = [
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
"Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
"Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.53 Safari/537.36"
]
ua = random.choice(MY_USER_AGENT)
return ua
再设置一个IP池。如果没有比较多的IP,建议放弃。
def randomIP():
ips = open("ip.txt", "r")
ip = []
for line in ips:
l = line.replace("\n", "")
ip.append(l)
ip_random = random.choice(ip)
return ip_random
然后随机导入。基本形成随机IP和UA的访问。
# 爬取
def chinahpo(hpo):
s = random.randint(5, 10)
print("等待 " + str(s) + "秒")
time.sleep(s)
ip = randomIP()
print("使用IP " + ip)
options = EdgeOptions()
options.use_chromium = True
# options.add_argument("headless")
# options.add_argument("disable-gpu")
options.add_argument("--proxy-server={ip}".format(ip=ip))
options.add_argument("--disable-blink-features")
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option("useAutomationExtension", False)
msedge = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedgedriver.exe"
driver = Edge(options=options, executable_path=msedge)
script = "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
driver.execute_script(script)
UA = randomUA()
driver.execute_cdp_cmd("Network.setUserAgentOverride", {"userAgent": UA})
print(driver.execute_script("return navigator.userAgent;"))
hpid = hpo.split(":")[1]
url = "http://www.chinahpo.org/#/searchList?trigger=1&tabType=1&searchContent=HP%3A{hpid}".format(hpid=hpid)
try:
driver.get(url)
strtemp = url
print("网址:", strtemp)
except Exception:
print("get page error", hpo)
time.sleep(2)
with open("html/hp_" + hpid + ".html", "a+", encoding="utf-8") as f:
f.write(str(driver.page_source))
driver.close()
fin = open("finish.txt", "a")
fin.write(hpo + "\n")
fin.close()
# 解析
def analysis(hpo):
hpid = hpo.split(":")[1]
file = open("html/hp_" + hpid + ".html", "rb")
html = file.read()
soup = BeautifulSoup(html, "html.parser")
m = soup.select("main")
c = m[0].find_all("div", {"class": "row_list"})
url = "http://www.chinahpo.org/#/searchList?trigger=1&tabType=1&searchContent=HP%3A{hpid}".format(hpid=hpid)
output = [hpo]
try:
p = c[0].select("p")
output.append(p[0].string.split(":")[1])
output.append(p[1].string.split(":")[1])
output.append(p[2].string.split(":")[1])
output.append(p[3].string.split(":")[1])
output.append(url)
file.close()
except:
output.append("未找到信息")
output.append("-")
output.append("-")
output.append("-")
output.append(url)
# os.remove("html/hp_" + hpid + ".html")
filtered = open("filtered.txt", "a")
filtered.write(hpo + "\n")
filtered.close()
return "\t".join(output)
# 爬取
hpoFile = open("hpolist.txt", "r")
for line in hpoFile:
hpo = line.replace("\n", "")
chinahpo(hpo)
hpoFile.close()
# 解析
hpoFileFinish = open("finish.txt", "r")
r = open("chinahpo.txt", "w", encoding="utf-8")
r.write("HPOID\tEN\tCN\tEN_des\tCN_des\tURL\n")
for line in hpoFileFinish:
hpo = line.replace("\n", "")
try:
hpoString = analysis(hpo)
r.write(hpoString + "\n")
except:
pass
r.close()
hpoFileFinish.close()