Skip to content

Commit

Permalink
fix rhub _scrape_category() bug
Browse files Browse the repository at this point in the history
  • Loading branch information
spodgorny9 committed May 13, 2024
1 parent 1b11e8a commit ea8db86
Showing 1 changed file with 12 additions and 6 deletions.
18 changes: 12 additions & 6 deletions elm/web/rhub.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,14 @@ def _scrape_category(self, soup_inst):
category (str)
"""

category = soup_inst.find('span',
{'class': 'type_classification'}).text
try:
category = soup_inst.find('span',
{'class':
'type_classification'}).text
except AttributeError:
category = soup_inst.find('span',
{'class':
'type_classification_parent'}).text

return category

Expand Down Expand Up @@ -179,7 +185,7 @@ def build_meta(self):
'authors', 'year',
'url', 'doi',
'pdf_url', 'category'))
for link in self.all_links[:20]: # quantity control here #
for link in self.all_links[:50]: # quantity control here #
with urlopen(link) as page:
html = page.read().decode("utf-8")
meta_soup = BeautifulSoup(html, "html.parser")
Expand Down Expand Up @@ -285,7 +291,7 @@ def scrape_publications(self, pdf_dir, txt_dir):

os.makedirs(pdf_dir, exist_ok=True)
os.makedirs(txt_dir, exist_ok=True)
url_list = self.all_links[:20] # quantity control here #
url_list = self.all_links[:50] # quantity control here #

for pub in url_list:
with urlopen(pub) as page:
Expand Down Expand Up @@ -351,7 +357,7 @@ def build_meta(self):
'email', 'url', 'fn',
'category'
))
for link in url_list[:20]: # quantity control here #
for link in url_list[:50]: # quantity control here #
with urlopen(link) as page:
html = page.read().decode("utf-8")
meta_soup = BeautifulSoup(html, "html.parser")
Expand Down Expand Up @@ -604,7 +610,7 @@ def scrape_profiles(self, out_dir):
Text file containing information from the profile.
"""
os.makedirs(out_dir, exist_ok=True)
url_list = self.profile_links[:20] # quantity control here #
url_list = self.profile_links[:50] # quantity control here #

for i, prof in enumerate(url_list):
f = os.path.basename(prof) + '.txt'
Expand Down

0 comments on commit ea8db86

Please sign in to comment.