Skip to content

Commit

Permalink
Added Google scholar scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
Deepashri committed May 18, 2024
1 parent d366b11 commit 1fe2999
Showing 1 changed file with 54 additions and 0 deletions.
54 changes: 54 additions & 0 deletions src/scrape_up/googlescholar/gscholar.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import requests
from bs4 import BeautifulSoup

def get_scholar_results(query, num_results=10):
    """Search Google Scholar and return the parsed first page of results.

    Args:
        query: Search terms, sent as the ``q`` request parameter.
        num_results: Maximum number of results to request (``num``
            parameter) and to return.

    Returns:
        A list of dicts, one per result, with the keys ``title``, ``link``,
        ``snippet`` and ``publication_info``. ``link`` is ``None`` when the
        title is not a hyperlink or the anchor carries no ``href``. A blank
        query or a ``num_results`` below 1 yields an empty list without
        sending any request.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status (via ``raise_for_status``).
    """
    # Nothing meaningful to ask for: skip the network round-trip entirely.
    if not query or not query.strip() or num_results < 1:
        return []

    base_url = "https://scholar.google.com/scholar"
    params = {
        'q': query,
        'hl': 'en',
        'num': num_results,
    }

    # requests waits forever by default; bound the wait so a stalled
    # connection cannot hang the caller.
    response = requests.get(base_url, params=params, timeout=10)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    # NOTE: a multi-word class_ string matches the class attribute exactly,
    # so only divs whose class is precisely "gs_r gs_or gs_scl" are picked up.
    results = soup.find_all('div', class_='gs_r gs_or gs_scl')

    # Slice so the caller never receives more entries than requested, even if
    # the page contains more result blocks.
    return [_parse_scholar_result(result) for result in results[:num_results]]


def _div_text_or_default(parent, css_class, default):
    """Return the text of the first ``div`` with *css_class* in *parent*, else *default*."""
    node = parent.find('div', class_=css_class)
    return node.text if node is not None else default


def _parse_scholar_result(result):
    """Convert one result ``div`` into a title/link/snippet/publication_info dict."""
    title_tag = result.find('h3', class_='gs_rt')
    anchor = title_tag.a if title_tag is not None else None

    if anchor is not None:
        title = anchor.text
        # .get() avoids a KeyError for an anchor without an href attribute.
        link = anchor.get('href')
    else:
        # Heading without a link: keep its text; no heading at all gets the
        # 'No title' placeholder.
        title = title_tag.text if title_tag is not None else 'No title'
        link = None

    return {
        'title': title,
        'link': link,
        'snippet': _div_text_or_default(result, 'gs_rs', 'No snippet'),
        'publication_info': _div_text_or_default(
            result, 'gs_a', 'No publication info'
        ),
    }

# Script entry point: prompt for a query and a result count, then print each
# parsed result as a numbered block followed by a blank line.
if __name__ == "__main__":
    search_terms = input("Enter your query: ")
    wanted = int(input("Enter the number of results you want: "))

    # Number the results from 1 directly instead of adding 1 to a 0-based index.
    for position, entry in enumerate(
        get_scholar_results(search_terms, wanted), start=1
    ):
        print(f"Result {position}:")
        print(f"Title: {entry['title']}")
        print(f"Link: {entry['link']}")
        print(f"Snippet: {entry['snippet']}")
        print(f"Publication Info: {entry['publication_info']}")
        print()

0 comments on commit 1fe2999

Please sign in to comment.