biorxiv_v1.4.py
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
import csv
### Outfile ###
csv_file = open('bioRxivData.csv', 'w', encoding='utf-8', newline='')
writer = csv.writer(csv_file)
header = ["first_author", "last_author", "subject", "title", "year", "month",
          "pdfDownloads", "fullText", "abstractViews", "tweets"]
writer.writerow(header)
### Driver settings ###
# Run Chrome headless so no browser window opens while scraping
options = Options()
options.add_argument("--headless")
service = Service(r'C:/Users/chris/OneDrive/OneDrive/DataAnalysis/Projects/selenium_project/1.4/chromedriver.exe')
driver = webdriver.Chrome(service=service, options=options)
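# Implicit wait: every find_element call retries for up to 1 second before raising NoSuchElementException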
driver.implicitly_wait(1.0)
### Web scraping settings ###
topics = ["animal-behavior-and-cognition","biochemistry", "bioengineering", "bioinformatics", "biophysics", "cancer-biology",
"cell-biology", "clinical-trials", "developmental-biology", "ecology", "epidemiology","evolutionary-biology","genetics",
"genomics","immunology","microbiology","molecular-biology","neuroscience","paleontology","pathology","pharmacology-and-toxicology",
"physiology","plant-biology","scientific-communication-and-education","synthetic-biology","systems-biology","zoology"]
for topic in topics:
topic_link = 'https://www.biorxiv.org/collection/' + topic
    # Load the first collection page to read the total number of result pages
driver.get(topic_link)
    lastPageNumber = int(driver.find_element(By.XPATH, '//*[@class="pager-last last odd"]').text)
total_links = 0
print("##### Topic: ", topic)
print("##### Total Pages of Links: ", lastPageNumber)
    for i in range(lastPageNumber):
links = []
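        # Collection pages are zero-indexed, so ?page=0 is the first page of results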
newlink = "https://www.biorxiv.org/collection/" + topic + "?page=" + str(i)
        # Load one page of results and collect its manuscript links
        driver.get(newlink)
        rawLinks = driver.find_elements(By.XPATH, './/*[@class="highwire-cite-title"]//a')
for a in rawLinks:
try:
links.append(a.get_attribute('href'))
except StaleElementReferenceException:
continue
        print("### Papers recorded: ", total_links)
        # Visit each paper on this page of links, record its data, then move to the next page
for link in links:
out_data = {}
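            # Appending ".article-metrics" opens the paper's metrics tab (usage stats and tweet counts)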
fullLink = link + ".article-metrics"
driver.get(fullLink)
            # Try to get views; the totals row ends with three numbers:
            # abstract views, full-text views, and PDF downloads
            try:
                allViewsRaw = driver.find_element(By.XPATH, '//*[@class="cshl_total"]').text
                tsp = allViewsRaw.split(" ")
                abstract = tsp[-3]
                fullText = tsp[-2]
                PDF = tsp[-1]
            except NoSuchElementException:
                # No metrics table yet; record zeros
                abstract = "0"
                fullText = "0"
                PDF = "0"
            # Try to get tweets; the Twitter box has no class or id, so it is matched by its inline style
            try:
                pretweet = driver.find_element(By.XPATH, '//*[@style="padding-left: 10px; line-height:18px; border-left: 16px solid #74CFED;"]').text
                tweets = pretweet.split(" ")[-1]  # the count is the last token
            except NoSuchElementException:
                tweets = "0"
            try:
                # Core metadata; papers missing any of these fields are skipped below
                title = driver.find_element(By.XPATH, '//*[@class="highwire-cite-title"]').text
                surnames2 = driver.find_element(By.XPATH, '//*[@class="highwire-cite-authors"]').text
                authorlist = []
                # Rebuild each author name, dropping the "View ORCID Profile" tokens
                for name in surnames2.split(","):
                    author = ""
                    for part in name.split():
                        if part not in ["ORCID", "View", "Profile"]:
                            author = author + " " + part
                    # Keep the name only if something survived the filtering
                    if author.replace(" ", "") != "":
                        authorlist.append(author[1:])
firstAuthor = authorlist[0]
lastAuthor = authorlist[-1]
                subject = driver.find_element(By.XPATH, '//*[@class="highwire-article-collection-term"]').text
                # Collect the posted date from the right-hand sidebar; the relevant line looks like
                # "Posted January 2, 2020." -> ["Posted", "January", "2,", "2020."]
                posted = (driver.find_element(By.XPATH, '//*[@class="panel-panel panel-region-sidebar-right"]').text).split("\n")[2].split(" ")
                year = posted[3][:-1]   # strip the trailing "."
                month = posted[1]
                day = posted[2][:-1]    # strip the trailing ","
# Record data
out_data['firstAuthor'] = firstAuthor
out_data['lastAuthor'] = lastAuthor
out_data['subject'] = subject
out_data['title'] = title
out_data['year'] = year
out_data['month'] = month
out_data['pdfDownloads'] = PDF
out_data['fullText'] = fullText
out_data['abstractViews'] = abstract
out_data['tweets'] = tweets
total_links += 1
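                # dicts preserve insertion order (Python 3.7+), so values() lines up with the CSV header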
writer.writerow(out_data.values())
            # Some papers lack an author or another required field; log the error and move on
            except NoSuchElementException as exception:
                print("Error at link: ", fullLink)
                print(exception)
csv_file.close()
driver.quit()  # quit() also shuts down the chromedriver process