V4.1.2 Updates #54

Merged · 7 commits · Dec 31, 2022
49 changes: 49 additions & 0 deletions README.md
@@ -67,6 +67,12 @@
<li><a href="#topicArgument">Function Arguments</a></li>
<li><a href="#profileOutput">Keys of the output:</a></li>
</ul>
<li><a href='#to-scrape-user-tweets-with-api'>Scraping user's tweets using API</a></li>
<ul>
<li><a href='#to-scrape-user-tweets-with-api'>In JSON format - Example</a></li>
<li><a href='#users_api_parameter'>Function Arguments</a></li>
<li><a href='#scrape_user_with_api_args_keys'>Keys of the output</a></li>
</ul>
<li><a href="#proxy">Using scraper with proxy</a>
<ul>
<li><a href="#unauthenticatedProxy">Unauthenticated Proxy</a></li>
@@ -163,6 +169,12 @@ Usage</h2>
<td>Browser Automation & HTTP Request</td>
<td>Fast</td>
</tr>
<tr>
<td><code>scrape_profile_with_api()</code></td>
<td>Scrapes tweets from a Twitter profile. It expects the username of the profile</td>
<td>Browser Automation & HTTP Request</td>
<td>Fast</td>
</tr>
</table>
<p>
Note: the HTTP Request method sends the request directly to Twitter's API to scrape data, while Browser Automation visits the page and scrolls while collecting the data.</p>
@@ -1039,6 +1051,43 @@ data = scrape_topic(filename="steamdeck", url='https://twitter.com/i/topics/1415
| directory | str | Directory to save the output file. Default: current working directory. |
| browser_profile | str | Path to the browser profile where cookies are stored and can be used for scraping data in an authenticated way. |

<br>
<hr>
<div id="to-scrape-user-tweets-with-api">

<p>To scrape a profile's tweets with API:</p>

```python
from twitter_scraper_selenium import scrape_profile_with_api

scrape_profile_with_api('elonmusk', output_filename='musk', tweets_count=100)
```
</div>
<br>
<div id="users_api_parameter">
<p><code>scrape_profile_with_api()</code> arguments:</p>
Same as <a href='#scrape_topic_with_api_args'>scrape_topic_with_api</a>
</div>
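
For reference, here is a fuller call that spells the shared arguments out. The parameter names come from the `scrape_profile_with_api()` signature added in `profile_api.py` below; all the values are placeholders:

```python
from twitter_scraper_selenium import scrape_profile_with_api

# Save 50 tweets from the profile to ./data/musk.json, routing traffic
# through an authenticated proxy (placeholder credentials shown).
scrape_profile_with_api(
    username='elonmusk',
    proxy='username:password@host:port',
    tweets_count=50,
    output_filename='musk',
    output_dir='./data',
    browser='firefox',
    headless=True,
)
```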
<br>
<div id="scrape_user_with_api_args_keys"> <p>Output:<p>

```js
{
  "1608939190548598784": {
    "tweet_url": "https://twitter.com/elonmusk/status/1608939190548598784",
    "tweet_details": {
      ...
    },
    "user_details": {
      ...
    }
  },
  ...
}
```

</div>
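
When `output_filename` is omitted, the function returns the scraped tweets instead of writing a file (see the `Returns` note in `profile_api.py` below). A minimal sketch of consuming that return value, assuming the output shape shown above:

```python
from twitter_scraper_selenium import scrape_profile_with_api

# No output_filename, so the dict of tweets is returned to the caller.
tweets = scrape_profile_with_api('elonmusk', tweets_count=20)

# Keys are tweet rest_ids; each value carries tweet_url, tweet_details
# and user_details as in the example output above.
for rest_id, tweet in tweets.items():
    print(rest_id, tweet['tweet_url'])
```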
<br>
<hr>
</div>

<h3 id="proxy"> Using scraper with proxy (http proxy) </h3>
2 changes: 1 addition & 1 deletion setup.py
@@ -6,7 +6,7 @@

setuptools.setup(
name="twitter_scraper_selenium",
version="4.0.2",
version="4.1.2",
author="Sajid Shaikh",
author_email="[email protected]",
description="Python package to scrap twitter's front-end easily with selenium",
3 changes: 2 additions & 1 deletion twitter_scraper_selenium/__init__.py
@@ -8,8 +8,9 @@
from .keyword_api import scrape_keyword_with_api
from .profile_details import get_profile_details
from .topic_api import scrape_topic_with_api
from .profile_api import scrape_profile_with_api
# __all__ = ["Initializer",
# "Utilities", "Finder",
# "Scraping_utilities","scrap_profile","scrap_keyword"]
__all__ = ["scrape_profile", "scrape_keyword",
"scrape_topic", "scrape_keyword_with_api", "get_profile_details", "scrape_topic_with_api"]
"scrape_topic", "scrape_keyword_with_api", "get_profile_details", "scrape_topic_with_api", "scrape_profile_with_api"]
8 changes: 4 additions & 4 deletions twitter_scraper_selenium/keyword_api.py
@@ -99,7 +99,7 @@ def scrape_keyword_with_api(query: str, proxy: Union[str, None] = None,
"""
keyword_scraper = Keywords_api(query, proxy, tweets_count)
data = keyword_scraper.scrap()
if output_filename:
if output_filename and len(data) > 0:
path = os.path.join(output_dir, "{}.json".format(output_filename))
mode = 'a'
if os.path.exists(path):
@@ -114,8 +114,8 @@
content = {}
file.close()
data.update(content)
with open(path, 'w', encoding='utf-8') as file_in_write_mode:
json.dump(data, file_in_write_mode)
logger.info('Data was saved to {}'.format(path))
with open(path, 'w', encoding='utf-8') as file_in_write_mode:
json.dump(data, file_in_write_mode)
logger.info('Data was saved to {}'.format(path))
else:
return data
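
The save path here (and in the other modules touched by this PR) follows a read-merge-write pattern: previously saved tweets are read back, merged into the fresh data, and the file is rewritten. A standalone sketch of that pattern with hypothetical `path` and `data` values, purely to illustrate the control flow:

```python
import json
import os

path = 'musk.json'                                     # hypothetical output path
data = {'1608939190548598784': {'tweet_url': '...'}}   # freshly scraped tweets

# If the file already exists, load its contents so old tweets survive.
content = {}
if os.path.exists(path):
    with open(path, 'r', encoding='utf-8') as file:
        try:
            content = json.loads(file.read().strip())
        except json.decoder.JSONDecodeError:
            content = {}                               # stored data was invalid JSON

# Merge the previously saved tweets into the new data, then rewrite the file.
data.update(content)
with open(path, 'w', encoding='utf-8') as file_in_write_mode:
    json.dump(data, file_in_write_mode)
```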
12 changes: 6 additions & 6 deletions twitter_scraper_selenium/profile.py
@@ -199,7 +199,7 @@ def scrape_profile(twitter_username: str, browser: str = "firefox", proxy: Union
if filename == '':
# if filename was not provided then print the JSON to console
return json.dumps(data)
elif filename != '':
elif filename != '' and len(data) > 0:
# if filename was provided, save it to that file
mode = 'w'
json_file_location = os.path.join(directory, filename+".json")
@@ -215,11 +215,11 @@
content = {}
file.close()
data.update(content)
with open(json_file_location, 'w', encoding='utf-8') as file_in_write_mode:
json.dump(data, file_in_write_mode)
logger.setLevel(logging.INFO)
logger.info(
'Data Successfully Saved to {}'.format(json_file_location))
with open(json_file_location, 'w', encoding='utf-8') as file_in_write_mode:
json.dump(data, file_in_write_mode)
logger.setLevel(logging.INFO)
logger.info(
'Data Successfully Saved to {}'.format(json_file_location))
elif output_format.lower() == "csv":
if filename == "":
filename = twitter_username
196 changes: 196 additions & 0 deletions twitter_scraper_selenium/profile_api.py
@@ -0,0 +1,196 @@
#!/usr/bin/env python3

from .driver_initialization import Initializer
import logging
from .scraping_utilities import Scraping_utilities
import json
from typing import Union
import os
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from seleniumwire.utils import decode
from selenium.common.exceptions import TimeoutException

logger = logging.getLogger(__name__)
format = logging.Formatter(
    "%(asctime)s - %(levelname)s - %(message)s")
ch = logging.StreamHandler()
ch.setFormatter(format)
logger.addHandler(ch)


class Profile_api:
    def __init__(self, username, browser, proxy, headless, tweets_count):
        self.username = username
        self.browser = browser
        self.proxy = proxy
        self.headless = headless
        self.driver = ''
        self.data_dict = {}
        self.tweets_count = tweets_count

    def start_driver(self):
        """changes the class member driver value to driver on call"""
        self.driver = Initializer(
            self.browser, self.headless, self.proxy).init()

    def close_driver(self):
        self.driver.close()
        self.driver.quit()

    def find_entries(self, response):
        try:
            tweets = response["data"]["user"]["result"]["timeline_v2"]["timeline"][
                "instructions"
            ][0]["entries"]
            return tweets
        except KeyError:
            tweets = response["data"]["user"]["result"]["timeline_v2"]["timeline"][
                "instructions"
            ][1]["entries"]
            return tweets

    def augment_data(self, tweets):
        try:
            for tweet in tweets:
                try:
                    tweet_results = tweet["content"]["itemContent"]["tweet_results"]['result']
                    rest_id = tweet_results['rest_id']
                    user_details = tweet_results['core']['user_results']['result']
                    del tweet_results['core']
                    self.data_dict[rest_id] = {
                        "tweet_url": f'https://twitter.com/{self.username}/status/{rest_id}',
                        'tweet_details': tweet_results,
                        "user_details": user_details
                    }
                except KeyError:
                    continue
        except Exception as ex:
            logger.exception('Error at augment_data : {}'.format(ex))

    def find_cursor(self, tweets):
        try:
            cursor = None
            for tweet in tweets:
                try:
                    if tweet["content"]["cursorType"] == "Bottom":
                        cursor = tweet["content"]["value"]
                        break
                except KeyError:
                    continue
            return cursor
        except Exception as ex:
            logger.exception('Error at find_cursor : {}'.format(ex))

    def get_headers_and_uid(self, url, retry=5):
        header = {}
        uid = None
        user_id = None
        try:
            self.start_driver()
            self.driver.get(url)
            WebDriverWait(self.driver, 30).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '[data-testid="tweet"]')
                )
            )
            for request in self.driver.requests:
                if "UserTweets" in request.url:
                    header = request.headers
                    uid = request.url.split("/")[-2]
                elif "UserByScreenName" in request.url:
                    body = decode(
                        request.response.body,
                        request.response.headers.get(
                            "Content-Encoding", "identity"),
                    )
                    data = json.loads(body.decode())
                    user_id = data["data"]["user"]["result"]["rest_id"]
            self.close_driver()
            return [dict(header), uid, user_id]
        except TimeoutException:
            self.close_driver()
            if retry <= 0:
                logger.error('Timed out waiting for tweets to load!')
                return [dict(header), uid, user_id]
            return self.get_headers_and_uid(url, retry - 1)
        except Exception as ex:
            logger.exception(ex)
            self.close_driver()
            return [dict(header), uid, user_id]

    def scrape(self):
        try:
            header, uid, user_id = self.get_headers_and_uid(
                f"https://twitter.com/{self.username}")
            if not uid:
                logger.error(f'Failed to find UID for {self.username}')
            cursor = None
            params = None
            while len(self.data_dict) < self.tweets_count:
                params = Scraping_utilities.build_params_for_profile(
                    user_id, cursor)
                response = Scraping_utilities.make_http_request_with_params(
                    f"https://twitter.com/i/api/graphql/{uid}/UserTweets", params, header, self.proxy)
                if response:
                    entries = self.find_entries(response)
                    self.augment_data(entries)
                    cursor = self.find_cursor(entries)
                    logger.setLevel(logging.INFO)
                    logger.info('Number of tweets scraped: {}'.format(
                        len(self.data_dict)))
                else:
                    logger.warning('Failed to make request!')
                    break

            data = dict(list(self.data_dict.items())
                        [0:int(self.tweets_count)])
            return data
        except Exception as ex:
            logger.warning('Error at scrape : {}'.format(ex))


def scrape_profile_with_api(username: str, proxy: Union[str, None] = None,
                            tweets_count: int = 10,
                            output_filename: Union[str, None] = None,
                            output_dir: Union[str, None] = os.getcwd(),
                            browser: str = 'firefox',
                            headless: bool = True):
    """Function to scrape a Twitter profile's tweets using Twitter's API.

    Args:
        username (str): username of the twitter account.
        proxy (Union[str, None], optional): Optional parameter, if the user wants to use a proxy for scraping. If the proxy is an authenticated proxy then the proxy format is username:password@host:port. Defaults to None.
        tweets_count (int, optional): Number of Tweets to scrape. Defaults to 10.
        output_filename (Union[str, None], optional): Name of the output JSON file. Defaults to None.
        output_dir (Union[str, None], optional): Directory where to save the file. Defaults to os.getcwd().
        browser (str, optional): Which browser to use for extracting the graphql key. Defaults to 'firefox'.
        headless (bool, optional): Whether to run the browser in headless mode. Defaults to True.
    Returns:
        (dict | None): None if the data was saved to a file, else a dict of the scraped tweets.
    """
    profile_api_scraper = Profile_api(
        username=username, browser=browser, tweets_count=tweets_count, proxy=proxy, headless=headless)
    data = profile_api_scraper.scrape()
    if output_filename and data and len(data) > 0:
        path = os.path.join(output_dir, "{}.json".format(output_filename))
        mode = 'a'
        if os.path.exists(path):
            mode = 'r'
        with open(path, mode, encoding='utf-8') as file:
            if mode == 'r':  # if mode is read mode it means the file already exists and may contain data
                try:
                    file_content = file.read()  # read the file
                    content = json.loads(file_content.strip())  # load the data
                except json.decoder.JSONDecodeError:
                    # if the stored data is invalid JSON
                    logger.warning('Invalid JSON Detected!')
                    content = {}
                data.update(content)
        with open(path, 'w', encoding='utf-8') as file_in_write_mode:
            # open the file in write mode to erase the old data and dump the merged data
            json.dump(data, file_in_write_mode)
        logger.setLevel(logging.INFO)
        logger.info('Data was saved to {}'.format(path))
    else:
        return data
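
The module-level function above is the intended entry point, but the class can also be driven directly, which can be handy when debugging header capture. A sketch using the constructor arguments shown above:

```python
from twitter_scraper_selenium.profile_api import Profile_api

scraper = Profile_api(username='elonmusk', browser='firefox',
                      proxy=None, headless=True, tweets_count=5)
tweets = scraper.scrape()  # dict keyed by tweet rest_id, or None on failure
```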
14 changes: 8 additions & 6 deletions twitter_scraper_selenium/profile_details.py
@@ -55,7 +55,7 @@ def get_profile_details(twitter_username: str, proxy: Union[str, None] = None,
if filename == '':
# if filename was not provided then print the JSON to console
return json.dumps(data)
elif filename != '':
elif filename != '' and len(data) > 0:
# if filename was provided, save it to that file
mode = 'w'
json_file_location = os.path.join(directory, filename+".json")
@@ -71,8 +71,10 @@
content = {}
file.close()
data.update(content)
with open(json_file_location, 'w', encoding='utf-8') as file_in_write_mode:
json.dump(data, file_in_write_mode)
logger.setLevel(logging.INFO)
logger.info(
'Data Successfully Saved to {}'.format(json_file_location))
with open(json_file_location, 'w', encoding='utf-8') as file_in_write_mode:
json.dump(data, file_in_write_mode)
logger.setLevel(logging.INFO)
logger.info(
'Data Successfully Saved to {}'.format(json_file_location))
else:
return json.dumps(data)
6 changes: 6 additions & 0 deletions twitter_scraper_selenium/requirements.txt
@@ -0,0 +1,6 @@
fake_headers==1.0.2
python_dateutil==2.8.2
requests==2.27.1
selenium==4.5.0
selenium_wire==4.6.4
webdriver_manager==3.2.2
23 changes: 23 additions & 0 deletions twitter_scraper_selenium/scraping_utilities.py
@@ -244,3 +244,26 @@ def build_topic_params(rest_id, cursor)
            'features': '{"responsive_web_twitter_blue_verified_badge_is_enabled":true,"verified_phone_label_enabled":true,"responsive_web_graphql_timeline_navigation_enabled":true,"unified_cards_ad_metadata_container_dynamic_card_content_query_enabled":true,"tweetypie_unmention_optimization_enabled":true,"responsive_web_uc_gql_enabled":true,"vibe_api_enabled":true,"responsive_web_edit_tweet_api_enabled":true,"graphql_is_translatable_rweb_tweet_is_translatable_enabled":true,"standardized_nudges_misinfo":true,"tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled":false,"interactive_text_enabled":true,"responsive_web_text_conversations_enabled":false,"responsive_web_enhance_cards_enabled":true}',
        }
        return params

    @staticmethod
    def build_params_for_profile(user_id, cursor=None):
        variables = {
            "userId": str(user_id),
            "count": 40,
            "includePromotedContent": True,
            "withQuickPromoteEligibilityTweetFields": True,
            "withSuperFollowsUserFields": True,
            "withDownvotePerspective": False,
            "withReactionsMetadata": False,
            "withReactionsPerspective": False,
            "withSuperFollowsTweetFields": True,
            "withVoice": True,
            "withV2Timeline": True,
        }
        if cursor:
            variables["cursor"] = cursor
        params = {
            "variables": json.dumps(variables),
            'features': '{"responsive_web_twitter_blue_verified_badge_is_enabled":true,"verified_phone_label_enabled":true,"responsive_web_graphql_timeline_navigation_enabled":true,"view_counts_public_visibility_enabled":true,"view_counts_everywhere_api_enabled":true,"tweetypie_unmention_optimization_enabled":true,"responsive_web_uc_gql_enabled":true,"vibe_api_enabled":true,"responsive_web_edit_tweet_api_enabled":true,"graphql_is_translatable_rweb_tweet_is_translatable_enabled":true,"standardized_nudges_misinfo":true,"tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled":false,"interactive_text_enabled":true,"responsive_web_text_conversations_enabled":false,"responsive_web_enhance_cards_enabled":true}',
        }
        return params
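
These params end up as the query string of the `UserTweets` GraphQL call issued from `Profile_api.scrape()`. The real request goes through `Scraping_utilities.make_http_request_with_params()`, whose body is not part of this diff; the sketch below uses `requests` only to illustrate how the built params are consumed, with placeholder values for the query id, headers, and user id:

```python
import requests

from twitter_scraper_selenium.scraping_utilities import Scraping_utilities

uid = 'GraphQL-query-id'                   # placeholder; captured by Profile_api.get_headers_and_uid()
headers = {'authorization': 'Bearer ...'}  # placeholder; also captured from the browser session
user_id = '123456789'                      # placeholder numeric rest_id of the profile

params = Scraping_utilities.build_params_for_profile(user_id, cursor=None)
response = requests.get(
    'https://twitter.com/i/api/graphql/{}/UserTweets'.format(uid),
    params=params, headers=headers, timeout=30)
timeline = response.json()  # this JSON is what Profile_api.find_entries() walks
```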