diff --git a/appyters/Researcher_Summary_Report_Appyter/researcher_summary_appyter.ipynb b/appyters/Researcher_Summary_Report_Appyter/researcher_summary_appyter.ipynb index dd901202..a09f7108 100644 --- a/appyters/Researcher_Summary_Report_Appyter/researcher_summary_appyter.ipynb +++ b/appyters/Researcher_Summary_Report_Appyter/researcher_summary_appyter.ipynb @@ -280,100 +280,6 @@ "## Citations per year for {{researcher_name.raw_value}}" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from serpapi import GoogleSearch\n", - "from serpapi import GoogleScholarSearch\n", - "import os\n", - "from parsel import Selector\n", - "import requests, json, re\n", - "from urllib.parse import urlsplit, parse_qsl\n", - "\n", - "params = {\n", - " # https://docs.python.org/3/library/os.html\n", - " 'api_key': os.environ.get('SERP_API_KEY'), # SerpApi API key\n", - " 'engine': 'google_scholar_profiles', # profile results search engine\n", - " 'mauthors': name_of_researcher_first_and_last # search query\n", - "}\n", - "citation_dict = {}\n", - "try:\n", - " search = GoogleSearch(params)\n", - "except:\n", - " print(\"Error with initial Google Search API call for profiles. May have run out of credits\")\n", - " # return citation_dict\n", - "\n", - "profile_is_present = True\n", - "try:\n", - " while profile_is_present:\n", - " profile_results = search.get_dict()\n", - " for profile in profile_results['profiles']:\n", - " print(f'Currently extracting {profile[\"name\"]} with {profile[\"author_id\"]} ID.')\n", - " thumbnail = profile['thumbnail']\n", - " name = profile['name']\n", - " link = profile['link']\n", - " author_id = profile['author_id']\n", - " affiliations = profile['affiliations']\n", - " email = profile.get('email')\n", - " cited_by = profile.get('cited_by')\n", - " interests = profile.get('interests')\n", - " if name_of_researcher_first_and_last.split()[0].lower() in name.lower() and name_of_researcher_first_and_last.split()[-1].lower() in name.lower():\n", - " params = {\n", - " # https://docs.python.org/3/library/os.html\n", - " 'api_key': os.environ.get('SERP_API_KEY'),\n", - " 'engine': 'google_scholar_author',\n", - " 'author_id': author_id,\n", - " 'hl': 'en'\n", - " }\n", - " try:\n", - " search = GoogleScholarSearch(params)\n", - " results = search.get_dict()\n", - " except:\n", - " \"Error with fetching researcher information using Google API call\"\n", - " # return citation_dict\n", - "\n", - " data = {\n", - " 'cited_by': [],\n", - " 'public_access': {},\n", - " 'graph': []\n", - " }\n", - "\n", - " data['cited_by'] = results['cited_by']['table']\n", - " data['public_access']['link'] = results['public_access']['link']\n", - " data['public_access']['articles_available'] = results['public_access']['available']\n", - " data['public_access']['articles_not_available'] = results['public_access']['not_available']\n", - "\n", - " data['graph'] = results['cited_by']['graph']\n", - " for year_dict in results['cited_by']['graph']:\n", - " citation_dict[year_dict['year']] = year_dict['citations']\n", - " year_keys = list(citation_dict.keys())\n", - " year_keys.sort()\n", - " citation_dict = {year:citation_dict[year] for year in year_keys}\n", - " fig = make_bar_plot(citation_dict,'Year', \"Citations\", f\"Citations per Year\", \"Sourced from Google Scholar\")\n", - " fig_line = make_line_plot(citation_dict, 'Year', \"Citations\", f\"Cumulative Citations\", \"Sourced from Google Scholar\")\n", - " fig.show()\n", - " fig.write_image(output_folder+'citations_bar_google_scholar.png')\n", - " figure_counter = display_figure_labels(output_folder, figure_counter, \"The number of times cited per year for {}.\".format(name_of_researcher_first_and_last), title = 'citations_bar_google_scholar')\n", - "\n", - " fig_line.show()\n", - " fig_line.write_image(output_folder+'citations_line_graph_google_scholar.png')\n", - " figure_counter = display_figure_labels(output_folder, figure_counter, \"The cumulative number of citations per year for {}\".format(name_of_researcher_first_and_last), title = 'citations_line_graph_google_scholar')\n", - " # return citation_dict\n", - "\n", - " # # check if the next page is present in 'serpapi_pagination' dict key\n", - " if 'pagination' in profile_results and 'next' in profile_results['pagination']:\n", - " # split URL in parts as a dict() and update search 'params' variable to a new page\n", - " search.params_dict.update(dict(parse_qsl(urlsplit(profile_results['pagination']['next']).query)))\n", - " else:\n", - " profile_is_present = False\n", - "except:\n", - " print(\"Error with fetching the data\")\n", - "# return citation_dict" - ] - }, { "cell_type": "markdown", "metadata": {}, diff --git a/appyters/Researcher_Summary_Report_Appyter/utils.py b/appyters/Researcher_Summary_Report_Appyter/utils.py index dd706394..4cedf230 100644 --- a/appyters/Researcher_Summary_Report_Appyter/utils.py +++ b/appyters/Researcher_Summary_Report_Appyter/utils.py @@ -562,7 +562,7 @@ def query_pubmed_citations(pubmed_name, name_of_researcher_first_and_last): print("This name does not have any publications to search for citations returned from PubMed.") return pmid_citation_dict else: - print("This may take a minute or two.") + print("This may take over a minute or two.") display(MyMarkdown("### Link to [PubMed Query](https://pubmed.ncbi.nlm.nih.gov/?{}) for {}".format(urlencode(params), name_of_researcher_first_and_last))) # Use the Entrez module efetch for the publication records for the PMIDs with text information included for each. records = Entrez.efetch(db="pubmed", id=identifiers, rettype="medline", retmode="text") @@ -590,150 +590,3 @@ def query_pubmed_citations(pubmed_name, name_of_researcher_first_and_last): if len(pmid_citation_dict) == 0: print("This name does not have any publications to search for citations returned from PubMed.") return pmid_citation_dict - - -# from serpapi import GoogleSearch -# from serpapi import GoogleScholarSearch -# import os -# from parsel import Selector -# import requests, json, re -# from urllib.parse import urlsplit, parse_qsl -# def serpapi_scrape_all_authors(name_of_researcher): -# params = { -# # https://docs.python.org/3/library/os.html -# 'api_key': '5d728fd1a3c62a85ebe7d31f50b6eb13d1a2cd8fa30381df01d30fd6c578de49', # SerpApi API key -# 'engine': 'google_scholar_profiles', # profile results search engine -# 'mauthors': name_of_researcher_first_and_last # search query -# } -# search = GoogleSearch(params) -# citation_dict = {} -# profiles_is_present = True -# # try: -# while profiles_is_present: - -# profile_results = search.get_dict() -# print(profile_results.keys()) -# print(profile_results) - -# for profile in profile_results['profiles']: - -# print(f'Currently extracting {profile["name"]} with {profile["author_id"]} ID.') - -# thumbnail = profile['thumbnail'] -# name = profile['name'] -# link = profile['link'] -# author_id = profile['author_id'] -# affiliations = profile['affiliations'] -# email = profile.get('email') -# cited_by = profile.get('cited_by') -# interests = profile.get('interests') -# if name_of_researcher.split()[0] in name and name_of_researcher.split()[-1] in name: -# params = { -# # https://docs.python.org/3/library/os.html -# 'api_key': '5d728fd1a3c62a85ebe7d31f50b6eb13d1a2cd8fa30381df01d30fd6c578de49', -# 'engine': 'google_scholar_author', -# 'author_id': author_id, -# 'hl': 'en' -# } - -# search = GoogleScholarSearch(params) -# results = search.get_dict() - -# data = { -# 'cited_by': [], -# 'public_access': {}, -# 'graph': [] -# } - -# data['cited_by'] = results['cited_by']['table'] -# data['public_access']['link'] = results['public_access']['link'] -# data['public_access']['articles_available'] = results['public_access']['available'] -# data['public_access']['articles_not_available'] = results['public_access']['not_available'] - -# data['graph'] = results['cited_by']['graph'] - -# for year_dict in results['cited_by']['graph']: -# citation_dict[year_dict['year']] = year_dict['citations'] -# year_keys = list(citation_dict.keys()) -# year_keys.sort() -# citation_dict = {year:citation_dict[year] for year in year_keys} -# fig = make_bar_plot(citation_dict,'Year', "Citations", f"Citations per Year", "Sourced from Google Scholar") -# fig_line = make_line_plot(citation_dict, 'Year', "Citations", f"Cumulative Citations", "Sourced from Google Scholar") -# fig.show() -# fig.write_image(output_folder+'citations_bar_google_scholar.png') -# # figure_counter = display_figure_labels(output_folder, figure_counter, "Citations that are connected to the publications each year for {}.".format(name_of_researcher_first_and_last), title = 'citations_bar_google_scholar') - -# fig_line.show() -# fig_line.write_image(output_folder+'citations_line_graph_google_scholar.png') -# # figure_counter = display_figure_labels(output_folder, figure_counter, "The cumulative citations that are connected to the publications each year for {}".format(name_of_researcher_first_and_last), title = 'citations_line_graph_google_scholar') -# return citation_dict - -# # # check if the next page is present in 'serpapi_pagination' dict key -# if 'pagination' in profile_results and 'next' in profile_results['pagination']: -# # split URL in parts as a dict() and update search 'params' variable to a new page -# search.params_dict.update(dict(parse_qsl(urlsplit(profile_results['pagination']['next']).query))) -# else: -# profiles_is_present = False -# # except: -# # print("Error in api calls") - -# print("no researcher matched") - -# return citation_dict - - - - - -# PUBMED CITATION EXTRACTION - -# params = { -# 'term': "{}".format(pubmed_name) -# } -# pmid_citation_dict = defaultdict(int) -# #Get the pubmed publications for the researcher with the pmids -# info = Entrez.esearch(db="PubMed", term= pubmed_name, retmax = "5000") -# info = Entrez.read(info) -# identifiers = info['IdList'] # Get list of identifiers which are pmids -# if len(identifiers) == 0: -# print("This name does not have any publications to search for citations returned from PubMed.") -# else: -# print("Searching for Citation Information from Pubmed for {}".format(name_of_researcher_first_and_last)) -# print("This may take a minute or two.") -# display(MyMarkdown("### Link to [PubMed Query](https://pubmed.ncbi.nlm.nih.gov/?{}) for {}".format(urlencode(params), name_of_researcher_first_and_last))) -# # Use the Entrez module efetch for the publication records for the PMIDs with text information included for each. -# records = Entrez.efetch(db="pubmed", id=identifiers, rettype="medline", retmode="text") -# publications = records.read().split("\n\n") -# for pub in publications: -# try: -# year_published = int(pub.split("DP - ")[1].split('\n')[0].split()[0].strip()[:4]) -# pmid = pub.split("PMID-")[1].split('\n')[0].strip() -# handle = Entrez.elink(dbfrom="pubmed", id=pmid, linkname="pubmed_pmc_refs") -# record = Entrez.read(handle) -# if len(record[0]["LinkSetDb"]) != 0: -# # print(record[0]["LinkSetDb"][0]["Link"]) -# list_of_ids = [] -# for id_dict in record[0]["LinkSetDb"][0]["Link"]: -# list_of_ids.append(id_dict['Id']) -# handle = Entrez.esummary(db="pmc", id=','.join(list_of_ids), retmode="xml") -# pub_records = Entrez.parse(handle) -# for record in pub_records: -# if 'PubDate' in record: -# if record['PubDate'][:4].isdigit(): -# year_article_published = int(record['PubDate'][:4]) -# pmid_citation_dict[year_article_published] += 1 -# except: -# continue - -# year_keys = list(pmid_citation_dict.keys()) -# year_keys.sort() -# pmid_citation_dict = {year:pmid_citation_dict[year] for year in year_keys} -# fig = make_bar_plot(pmid_citation_dict,'Year', "Citations", f"Citations per Year", "Sourced from Pubmed") -# fig_line = make_line_plot(pmid_citation_dict, 'Year', "Citations", f"Cumulative Citations", "Sourced from Pubmed") -# fig.show() -# fig.write_image(output_folder+'citations_bar_pubmed.png') -# figure_counter = display_figure_labels(output_folder, figure_counter, "Citations that are connected to the publications each year for {}.".format(name_of_researcher_first_and_last), title = 'citations_bar_pubmed') - -# fig_line.show() -# fig_line.write_image(output_folder+'citations_line_graph_pubmed.png') -# figure_counter = display_figure_labels(output_folder, figure_counter, "The cumulative citations that are connected to the publications each year for {}".format(name_of_researcher_first_and_last), title = 'citations_line_graph_pubmed')