Skip to content

Commit

Permalink
Removed the working implementation with the SERP api from google as …
Browse files Browse the repository at this point in the history
…well as the commented-out code for PubMed citation extraction
  • Loading branch information
nahmed12 committed Feb 13, 2024
1 parent aae1e6e commit 501a020
Show file tree
Hide file tree
Showing 2 changed files with 1 addition and 242 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -280,100 +280,6 @@
"## Citations per year for {{researcher_name.raw_value}}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from serpapi import GoogleSearch\n",
"from serpapi import GoogleScholarSearch\n",
"import os\n",
"from parsel import Selector\n",
"import requests, json, re\n",
"from urllib.parse import urlsplit, parse_qsl\n",
"\n",
"params = {\n",
" # https://docs.python.org/3/library/os.html\n",
" 'api_key': os.environ.get('SERP_API_KEY'), # SerpApi API key\n",
" 'engine': 'google_scholar_profiles', # profile results search engine\n",
" 'mauthors': name_of_researcher_first_and_last # search query\n",
"}\n",
"citation_dict = {}\n",
"try:\n",
" search = GoogleSearch(params)\n",
"except:\n",
" print(\"Error with initial Google Search API call for profiles. May have run out of credits\")\n",
" # return citation_dict\n",
"\n",
"profile_is_present = True\n",
"try:\n",
" while profile_is_present:\n",
" profile_results = search.get_dict()\n",
" for profile in profile_results['profiles']:\n",
" print(f'Currently extracting {profile[\"name\"]} with {profile[\"author_id\"]} ID.')\n",
" thumbnail = profile['thumbnail']\n",
" name = profile['name']\n",
" link = profile['link']\n",
" author_id = profile['author_id']\n",
" affiliations = profile['affiliations']\n",
" email = profile.get('email')\n",
" cited_by = profile.get('cited_by')\n",
" interests = profile.get('interests')\n",
" if name_of_researcher_first_and_last.split()[0].lower() in name.lower() and name_of_researcher_first_and_last.split()[-1].lower() in name.lower():\n",
" params = {\n",
" # https://docs.python.org/3/library/os.html\n",
" 'api_key': os.environ.get('SERP_API_KEY'),\n",
" 'engine': 'google_scholar_author',\n",
" 'author_id': author_id,\n",
" 'hl': 'en'\n",
" }\n",
" try:\n",
" search = GoogleScholarSearch(params)\n",
" results = search.get_dict()\n",
" except:\n",
" \"Error with fetching researcher information using Google API call\"\n",
" # return citation_dict\n",
"\n",
" data = {\n",
" 'cited_by': [],\n",
" 'public_access': {},\n",
" 'graph': []\n",
" }\n",
"\n",
" data['cited_by'] = results['cited_by']['table']\n",
" data['public_access']['link'] = results['public_access']['link']\n",
" data['public_access']['articles_available'] = results['public_access']['available']\n",
" data['public_access']['articles_not_available'] = results['public_access']['not_available']\n",
"\n",
" data['graph'] = results['cited_by']['graph']\n",
" for year_dict in results['cited_by']['graph']:\n",
" citation_dict[year_dict['year']] = year_dict['citations']\n",
" year_keys = list(citation_dict.keys())\n",
" year_keys.sort()\n",
" citation_dict = {year:citation_dict[year] for year in year_keys}\n",
" fig = make_bar_plot(citation_dict,'Year', \"Citations\", f\"Citations per Year\", \"Sourced from Google Scholar\")\n",
" fig_line = make_line_plot(citation_dict, 'Year', \"Citations\", f\"Cumulative Citations\", \"Sourced from Google Scholar\")\n",
" fig.show()\n",
" fig.write_image(output_folder+'citations_bar_google_scholar.png')\n",
" figure_counter = display_figure_labels(output_folder, figure_counter, \"The number of times cited per year for {}.\".format(name_of_researcher_first_and_last), title = 'citations_bar_google_scholar')\n",
"\n",
" fig_line.show()\n",
" fig_line.write_image(output_folder+'citations_line_graph_google_scholar.png')\n",
" figure_counter = display_figure_labels(output_folder, figure_counter, \"The cumulative number of citations per year for {}\".format(name_of_researcher_first_and_last), title = 'citations_line_graph_google_scholar')\n",
" # return citation_dict\n",
"\n",
" # # check if the next page is present in 'serpapi_pagination' dict key\n",
" if 'pagination' in profile_results and 'next' in profile_results['pagination']:\n",
" # split URL in parts as a dict() and update search 'params' variable to a new page\n",
" search.params_dict.update(dict(parse_qsl(urlsplit(profile_results['pagination']['next']).query)))\n",
" else:\n",
" profile_is_present = False\n",
"except:\n",
" print(\"Error with fetching the data\")\n",
"# return citation_dict"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down
149 changes: 1 addition & 148 deletions appyters/Researcher_Summary_Report_Appyter/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -562,7 +562,7 @@ def query_pubmed_citations(pubmed_name, name_of_researcher_first_and_last):
print("This name does not have any publications to search for citations returned from PubMed.")
return pmid_citation_dict
else:
print("This may take a minute or two.")
print("This may take over a minute or two.")
display(MyMarkdown("### Link to [PubMed Query](https://pubmed.ncbi.nlm.nih.gov/?{}) for {}".format(urlencode(params), name_of_researcher_first_and_last)))
# Use the Entrez module efetch for the publication records for the PMIDs with text information included for each.
records = Entrez.efetch(db="pubmed", id=identifiers, rettype="medline", retmode="text")
Expand Down Expand Up @@ -590,150 +590,3 @@ def query_pubmed_citations(pubmed_name, name_of_researcher_first_and_last):
if len(pmid_citation_dict) == 0:
print("This name does not have any publications to search for citations returned from PubMed.")
return pmid_citation_dict


# from serpapi import GoogleSearch
# from serpapi import GoogleScholarSearch
# import os
# from parsel import Selector
# import requests, json, re
# from urllib.parse import urlsplit, parse_qsl
# def serpapi_scrape_all_authors(name_of_researcher):
# params = {
# # https://docs.python.org/3/library/os.html
# 'api_key': '5d728fd1a3c62a85ebe7d31f50b6eb13d1a2cd8fa30381df01d30fd6c578de49', # SerpApi API key
# 'engine': 'google_scholar_profiles', # profile results search engine
# 'mauthors': name_of_researcher_first_and_last # search query
# }
# search = GoogleSearch(params)
# citation_dict = {}
# profiles_is_present = True
# # try:
# while profiles_is_present:

# profile_results = search.get_dict()
# print(profile_results.keys())
# print(profile_results)

# for profile in profile_results['profiles']:

# print(f'Currently extracting {profile["name"]} with {profile["author_id"]} ID.')

# thumbnail = profile['thumbnail']
# name = profile['name']
# link = profile['link']
# author_id = profile['author_id']
# affiliations = profile['affiliations']
# email = profile.get('email')
# cited_by = profile.get('cited_by')
# interests = profile.get('interests')
# if name_of_researcher.split()[0] in name and name_of_researcher.split()[-1] in name:
# params = {
# # https://docs.python.org/3/library/os.html
# 'api_key': '5d728fd1a3c62a85ebe7d31f50b6eb13d1a2cd8fa30381df01d30fd6c578de49',
# 'engine': 'google_scholar_author',
# 'author_id': author_id,
# 'hl': 'en'
# }

# search = GoogleScholarSearch(params)
# results = search.get_dict()

# data = {
# 'cited_by': [],
# 'public_access': {},
# 'graph': []
# }

# data['cited_by'] = results['cited_by']['table']
# data['public_access']['link'] = results['public_access']['link']
# data['public_access']['articles_available'] = results['public_access']['available']
# data['public_access']['articles_not_available'] = results['public_access']['not_available']

# data['graph'] = results['cited_by']['graph']

# for year_dict in results['cited_by']['graph']:
# citation_dict[year_dict['year']] = year_dict['citations']
# year_keys = list(citation_dict.keys())
# year_keys.sort()
# citation_dict = {year:citation_dict[year] for year in year_keys}
# fig = make_bar_plot(citation_dict,'Year', "Citations", f"Citations per Year", "Sourced from Google Scholar")
# fig_line = make_line_plot(citation_dict, 'Year', "Citations", f"Cumulative Citations", "Sourced from Google Scholar")
# fig.show()
# fig.write_image(output_folder+'citations_bar_google_scholar.png')
# # figure_counter = display_figure_labels(output_folder, figure_counter, "Citations that are connected to the publications each year for {}.".format(name_of_researcher_first_and_last), title = 'citations_bar_google_scholar')

# fig_line.show()
# fig_line.write_image(output_folder+'citations_line_graph_google_scholar.png')
# # figure_counter = display_figure_labels(output_folder, figure_counter, "The cumulative citations that are connected to the publications each year for {}".format(name_of_researcher_first_and_last), title = 'citations_line_graph_google_scholar')
# return citation_dict

# # # check if the next page is present in 'serpapi_pagination' dict key
# if 'pagination' in profile_results and 'next' in profile_results['pagination']:
# # split URL in parts as a dict() and update search 'params' variable to a new page
# search.params_dict.update(dict(parse_qsl(urlsplit(profile_results['pagination']['next']).query)))
# else:
# profiles_is_present = False
# # except:
# # print("Error in api calls")

# print("no researcher matched")

# return citation_dict





# PUBMED CITATION EXTRACTION

# params = {
# 'term': "{}".format(pubmed_name)
# }
# pmid_citation_dict = defaultdict(int)
# #Get the pubmed publications for the researcher with the pmids
# info = Entrez.esearch(db="PubMed", term= pubmed_name, retmax = "5000")
# info = Entrez.read(info)
# identifiers = info['IdList'] # Get list of identifiers which are pmids
# if len(identifiers) == 0:
# print("This name does not have any publications to search for citations returned from PubMed.")
# else:
# print("Searching for Citation Information from Pubmed for {}".format(name_of_researcher_first_and_last))
# print("This may take a minute or two.")
# display(MyMarkdown("### Link to [PubMed Query](https://pubmed.ncbi.nlm.nih.gov/?{}) for {}".format(urlencode(params), name_of_researcher_first_and_last)))
# # Use the Entrez module efetch for the publication records for the PMIDs with text information included for each.
# records = Entrez.efetch(db="pubmed", id=identifiers, rettype="medline", retmode="text")
# publications = records.read().split("\n\n")
# for pub in publications:
# try:
# year_published = int(pub.split("DP - ")[1].split('\n')[0].split()[0].strip()[:4])
# pmid = pub.split("PMID-")[1].split('\n')[0].strip()
# handle = Entrez.elink(dbfrom="pubmed", id=pmid, linkname="pubmed_pmc_refs")
# record = Entrez.read(handle)
# if len(record[0]["LinkSetDb"]) != 0:
# # print(record[0]["LinkSetDb"][0]["Link"])
# list_of_ids = []
# for id_dict in record[0]["LinkSetDb"][0]["Link"]:
# list_of_ids.append(id_dict['Id'])
# handle = Entrez.esummary(db="pmc", id=','.join(list_of_ids), retmode="xml")
# pub_records = Entrez.parse(handle)
# for record in pub_records:
# if 'PubDate' in record:
# if record['PubDate'][:4].isdigit():
# year_article_published = int(record['PubDate'][:4])
# pmid_citation_dict[year_article_published] += 1
# except:
# continue

# year_keys = list(pmid_citation_dict.keys())
# year_keys.sort()
# pmid_citation_dict = {year:pmid_citation_dict[year] for year in year_keys}
# fig = make_bar_plot(pmid_citation_dict,'Year', "Citations", f"Citations per Year", "Sourced from Pubmed")
# fig_line = make_line_plot(pmid_citation_dict, 'Year', "Citations", f"Cumulative Citations", "Sourced from Pubmed")
# fig.show()
# fig.write_image(output_folder+'citations_bar_pubmed.png')
# figure_counter = display_figure_labels(output_folder, figure_counter, "Citations that are connected to the publications each year for {}.".format(name_of_researcher_first_and_last), title = 'citations_bar_pubmed')

# fig_line.show()
# fig_line.write_image(output_folder+'citations_line_graph_pubmed.png')
# figure_counter = display_figure_labels(output_folder, figure_counter, "The cumulative citations that are connected to the publications each year for {}".format(name_of_researcher_first_and_last), title = 'citations_line_graph_pubmed')

0 comments on commit 501a020

Please sign in to comment.