V4.1.2 Updates #54

Merged · 7 commits · Dec 31, 2022
49 changes: 49 additions & 0 deletions README.md
@@ -67,6 +67,12 @@
<li><a href="#topicArgument">Function Arguments</a></li>
<li><a href="#profileOutput">Keys of the output:</a></li>
</ul>
<li><a href='#to-scrape-user-tweets-with-api'>Scraping user's tweets using API</a></li>
<ul>
<li><a href='#to-scrape-user-tweets-with-api'>In JSON format - Example</a></li>
<li><a href='#users_api_parameter'>Function Arguments</a></li>
<li><a href='#scrape_user_with_api_args_keys'>Keys of the output</a></li>
</ul>
<li><a href="#proxy">Using scraper with proxy</a>
<ul>
<li><a href="#unauthenticatedProxy">Unauthenticated Proxy</a></li>
@@ -163,6 +169,12 @@ Usage</h2>
<td>Browser Automation & HTTP Request</td>
<td>Fast</td>
</tr>
<tr>
<td><code>scrape_profile_with_api()</code></td>
<td>Scrapes tweets from a Twitter profile. It expects the username of the profile</td>
<td>Browser Automation & HTTP Request</td>
<td>Fast</td>
</tr>
</table>
<p>
Note: the HTTP Request method sends the request directly to Twitter's API to scrape data, while Browser Automation visits the page and scrolls while collecting the data.</p>
@@ -1039,6 +1051,43 @@ data = scrape_topic(filename="steamdeck", url='https://twitter.com/i/topics/1415
| directory | str | Directory to save the output file. Default: current working directory. |
| browser_profile | str | Path to the browser profile where cookies are stored and can be used for scraping data in an authenticated way. |

<br>
<hr>
<div id="to-scrape-user-tweets-with-api">

<p>To scrape a profile's tweets with API:</p>

```python
from twitter_scraper_selenium import scrape_profile_with_api

scrape_profile_with_api('elonmusk', output_filename='musk', tweets_count=100)
```
</div>
<br>
<div id="users_api_parameter">
<p><code>scrape_profile_with_api()</code> arguments:</p>
Same as <a href='#scrape_topic_with_api_args'>scrape_topic_with_api</a>
</div>
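
For reference, here is a fuller call that spells the shared arguments out. The parameter names come from the `scrape_profile_with_api()` signature added in `profile_api.py` below; all the values are placeholders:

```python
from twitter_scraper_selenium import scrape_profile_with_api

# Save 50 tweets from the profile to ./data/musk.json, routing traffic
# through an authenticated proxy (placeholder credentials shown).
scrape_profile_with_api(
    username='elonmusk',
    proxy='username:password@host:port',
    tweets_count=50,
    output_filename='musk',
    output_dir='./data',
    browser='firefox',
    headless=True,
)
```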
<br>
<div id="scrape_user_with_api_args_keys"> <p>Output:<p>

```js
{
  "1608939190548598784": {
    "tweet_url": "https://twitter.com/elonmusk/status/1608939190548598784",
    "tweet_details": {
      ...
    },
    "user_details": {
      ...
    }
  },
  ...
}
```

</div>
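
When `output_filename` is omitted, the function returns the scraped tweets instead of writing a file (see the `Returns` note in `profile_api.py` below). A minimal sketch of consuming that return value, assuming the output shape shown above:

```python
from twitter_scraper_selenium import scrape_profile_with_api

# No output_filename, so the dict of tweets is returned to the caller.
tweets = scrape_profile_with_api('elonmusk', tweets_count=20)

# Keys are tweet rest_ids; each value carries tweet_url, tweet_details
# and user_details as in the example output above.
for rest_id, tweet in tweets.items():
    print(rest_id, tweet['tweet_url'])
```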
<br>
<hr>
</div>

<h3 id="proxy"> Using scraper with proxy (http proxy) </h3>
2 changes: 1 addition & 1 deletion setup.py
@@ -6,7 +6,7 @@

setuptools.setup(
name="twitter_scraper_selenium",
version="4.0.2",
version="4.1.2",
author="Sajid Shaikh",
author_email="[email protected]",
description="Python package to scrap twitter's front-end easily with selenium",
3 changes: 2 additions & 1 deletion twitter_scraper_selenium/__init__.py
@@ -8,8 +8,9 @@
from .keyword_api import scrape_keyword_with_api
from .profile_details import get_profile_details
from .topic_api import scrape_topic_with_api
from .profile_api import scrape_profile_with_api
# __all__ = ["Initializer",
# "Utilities", "Finder",
# "Scraping_utilities","scrap_profile","scrap_keyword"]
__all__ = ["scrape_profile", "scrape_keyword",
"scrape_topic", "scrape_keyword_with_api", "get_profile_details", "scrape_topic_with_api"]
"scrape_topic", "scrape_keyword_with_api", "get_profile_details", "scrape_topic_with_api", "scrape_profile_with_api"]
8 changes: 4 additions & 4 deletions twitter_scraper_selenium/keyword_api.py
@@ -99,7 +99,7 @@ def scrape_keyword_with_api(query: str, proxy: Union[str, None] = None,
"""
keyword_scraper = Keywords_api(query, proxy, tweets_count)
data = keyword_scraper.scrap()
if output_filename:
if output_filename and len(data) > 0:
path = os.path.join(output_dir, "{}.json".format(output_filename))
mode = 'a'
if os.path.exists(path):
@@ -114,8 +114,8 @@
content = {}
file.close()
data.update(content)
with open(path, 'w', encoding='utf-8') as file_in_write_mode:
json.dump(data, file_in_write_mode)
logger.info('Data was saved to {}'.format(path))
with open(path, 'w', encoding='utf-8') as file_in_write_mode:
json.dump(data, file_in_write_mode)
logger.info('Data was saved to {}'.format(path))
else:
return data
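
The save path here (and in the other modules touched by this PR) follows a read-merge-write pattern: previously saved tweets are read back, merged into the fresh data, and the file is rewritten. A standalone sketch of that pattern with hypothetical `path` and `data` values, purely to illustrate the control flow:

```python
import json
import os

path = 'musk.json'                                     # hypothetical output path
data = {'1608939190548598784': {'tweet_url': '...'}}   # freshly scraped tweets

# If the file already exists, load its contents so old tweets survive.
content = {}
if os.path.exists(path):
    with open(path, 'r', encoding='utf-8') as file:
        try:
            content = json.loads(file.read().strip())
        except json.decoder.JSONDecodeError:
            content = {}                               # stored data was invalid JSON

# Merge the previously saved tweets into the new data, then rewrite the file.
data.update(content)
with open(path, 'w', encoding='utf-8') as file_in_write_mode:
    json.dump(data, file_in_write_mode)
```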
12 changes: 6 additions & 6 deletions twitter_scraper_selenium/profile.py
@@ -199,7 +199,7 @@ def scrape_profile(twitter_username: str, browser: str = "firefox", proxy: Union
if filename == '':
# if filename was not provided then print the JSON to console
return json.dumps(data)
elif filename != '':
elif filename != '' and len(data) > 0:
# if filename was provided, save it to that file
mode = 'w'
json_file_location = os.path.join(directory, filename+".json")
@@ -215,11 +215,11 @@
content = {}
file.close()
data.update(content)
with open(json_file_location, 'w', encoding='utf-8') as file_in_write_mode:
json.dump(data, file_in_write_mode)
logger.setLevel(logging.INFO)
logger.info(
'Data Successfully Saved to {}'.format(json_file_location))
with open(json_file_location, 'w', encoding='utf-8') as file_in_write_mode:
json.dump(data, file_in_write_mode)
logger.setLevel(logging.INFO)
logger.info(
'Data Successfully Saved to {}'.format(json_file_location))
elif output_format.lower() == "csv":
if filename == "":
filename = twitter_username
196 changes: 196 additions & 0 deletions twitter_scraper_selenium/profile_api.py
@@ -0,0 +1,196 @@
#!/usr/bin/env python3

from .driver_initialization import Initializer
import logging
from .scraping_utilities import Scraping_utilities
import json
from typing import Union
import os
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from seleniumwire.utils import decode
from selenium.common.exceptions import TimeoutException

logger = logging.getLogger(__name__)
format = logging.Formatter(
    "%(asctime)s - %(levelname)s - %(message)s")
ch = logging.StreamHandler()
ch.setFormatter(format)
logger.addHandler(ch)


class Profile_api:
    def __init__(self, username, browser, proxy, headless, tweets_count):
        self.username = username
        self.browser = browser
        self.proxy = proxy
        self.headless = headless
        self.driver = ''
        self.data_dict = {}
        self.tweets_count = tweets_count

    def start_driver(self):
        """changes the class member driver value to driver on call"""
        self.driver = Initializer(
            self.browser, self.headless, self.proxy).init()

    def close_driver(self):
        self.driver.close()
        self.driver.quit()

    def find_entries(self, response):
        try:
            tweets = response["data"]["user"]["result"]["timeline_v2"]["timeline"][
                "instructions"
            ][0]["entries"]
            return tweets
        except KeyError:
            tweets = response["data"]["user"]["result"]["timeline_v2"]["timeline"][
                "instructions"
            ][1]["entries"]
            return tweets

    def augment_data(self, tweets):
        try:
            for tweet in tweets:
                try:
                    tweet_results = tweet["content"]["itemContent"]["tweet_results"]['result']
                    rest_id = tweet_results['rest_id']
                    user_details = tweet_results['core']['user_results']['result']
                    del tweet_results['core']
                    self.data_dict[rest_id] = {
                        "tweet_url": f'https://twitter.com/{self.username}/status/{rest_id}',
                        'tweet_details': tweet_results,
                        "user_details": user_details
                    }
                except KeyError:
                    continue
        except Exception as ex:
            logger.exception('Error at augment_data : {}'.format(ex))

    def find_cursor(self, tweets):
        try:
            cursor = None
            for tweet in tweets:
                try:
                    if tweet["content"]["cursorType"] == "Bottom":
                        cursor = tweet["content"]["value"]
                        break
                except KeyError:
                    continue
            return cursor
        except Exception as ex:
            logger.exception('Error at find_cursor : {}'.format(ex))

    def get_headers_and_uid(self, url, retry=5):
        header = {}
        uid = None
        user_id = None
        try:
            self.start_driver()
            self.driver.get(url)
            WebDriverWait(self.driver, 30).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '[data-testid="tweet"]')
                )
            )
            for request in self.driver.requests:
                if "UserTweets" in request.url:
                    header = request.headers
                    uid = request.url.split("/")[-2]
                elif "UserByScreenName" in request.url:
                    body = decode(
                        request.response.body,
                        request.response.headers.get(
                            "Content-Encoding", "identity"),
                    )
                    data = json.loads(body.decode())
                    user_id = data["data"]["user"]["result"]["rest_id"]
            self.close_driver()
            return [dict(header), uid, user_id]
        except TimeoutException:
            self.close_driver()
            if retry <= 0:
                logger.error('Timed out waiting for tweets to load!')
                return [dict(header), uid, user_id]
            return self.get_headers_and_uid(url, retry - 1)
        except Exception as ex:
            logger.exception(ex)
            self.close_driver()
            return [dict(header), uid, user_id]

    def scrape(self):
        try:
            header, uid, user_id = self.get_headers_and_uid(
                f"https://twitter.com/{self.username}")
            if not uid:
                logger.error(f'Failed to find UID for {self.username}')
            cursor = None
            params = None
            while len(self.data_dict) < self.tweets_count:
                params = Scraping_utilities.build_params_for_profile(
                    user_id, cursor)
                response = Scraping_utilities.make_http_request_with_params(
                    f"https://twitter.com/i/api/graphql/{uid}/UserTweets", params, header, self.proxy)
                if response:
                    entries = self.find_entries(response)
                    self.augment_data(entries)
                    cursor = self.find_cursor(entries)
                    logger.setLevel(logging.INFO)
                    logger.info('Number of tweets scraped: {}'.format(
                        len(self.data_dict)))
                else:
                    logger.warning('Failed to make request!')
                    break

            data = dict(list(self.data_dict.items())
                        [0:int(self.tweets_count)])
            return data
        except Exception as ex:
            logger.warning('Error at scrape : {}'.format(ex))


def scrape_profile_with_api(username: str, proxy: Union[str, None] = None,
                            tweets_count: int = 10,
                            output_filename: Union[str, None] = None,
                            output_dir: Union[str, None] = os.getcwd(),
                            browser: str = 'firefox',
                            headless: bool = True):
    """Function to scrape a Twitter profile's tweets using Twitter's API.

    Args:
        username (str): username of the twitter account.
        proxy (Union[str, None], optional): Optional parameter, if the user wants to use a proxy for scraping. If the proxy is an authenticated proxy then the proxy format is username:password@host:port. Defaults to None.
        tweets_count (int, optional): Number of Tweets to scrape. Defaults to 10.
        output_filename (Union[str, None], optional): Name of the output JSON file. Defaults to None.
        output_dir (Union[str, None], optional): Directory where to save the file. Defaults to os.getcwd().
        browser (str, optional): Which browser to use for extracting the graphql key. Defaults to 'firefox'.
        headless (bool, optional): Whether to run the browser in headless mode. Defaults to True.
    Returns:
        (dict | None): None if the data was saved to a file, else a dict of the scraped tweets.
    """
    profile_api_scraper = Profile_api(
        username=username, browser=browser, tweets_count=tweets_count, proxy=proxy, headless=headless)
    data = profile_api_scraper.scrape()
    if output_filename and data and len(data) > 0:
        path = os.path.join(output_dir, "{}.json".format(output_filename))
        mode = 'a'
        if os.path.exists(path):
            mode = 'r'
        with open(path, mode, encoding='utf-8') as file:
            if mode == 'r':  # if mode is read mode it means the file already exists and may contain data
                try:
                    file_content = file.read()  # read the file
                    content = json.loads(file_content.strip())  # load the data
                except json.decoder.JSONDecodeError:
                    # if the stored data is invalid JSON
                    logger.warning('Invalid JSON Detected!')
                    content = {}
                data.update(content)
        with open(path, 'w', encoding='utf-8') as file_in_write_mode:
            # open the file in write mode to erase the old data and dump the merged data
            json.dump(data, file_in_write_mode)
        logger.setLevel(logging.INFO)
        logger.info('Data was saved to {}'.format(path))
    else:
        return data
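
The module-level function above is the intended entry point, but the class can also be driven directly, which can be handy when debugging header capture. A sketch using the constructor arguments shown above:

```python
from twitter_scraper_selenium.profile_api import Profile_api

scraper = Profile_api(username='elonmusk', browser='firefox',
                      proxy=None, headless=True, tweets_count=5)
tweets = scraper.scrape()  # dict keyed by tweet rest_id, or None on failure
```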
14 changes: 8 additions & 6 deletions twitter_scraper_selenium/profile_details.py
@@ -55,7 +55,7 @@ def get_profile_details(twitter_username: str, proxy: Union[str, None] = None,
if filename == '':
# if filename was not provided then print the JSON to console
return json.dumps(data)
elif filename != '':
elif filename != '' and len(data) > 0:
# if filename was provided, save it to that file
mode = 'w'
json_file_location = os.path.join(directory, filename+".json")
@@ -71,8 +71,10 @@
content = {}
file.close()
data.update(content)
with open(json_file_location, 'w', encoding='utf-8') as file_in_write_mode:
json.dump(data, file_in_write_mode)
logger.setLevel(logging.INFO)
logger.info(
'Data Successfully Saved to {}'.format(json_file_location))
with open(json_file_location, 'w', encoding='utf-8') as file_in_write_mode:
json.dump(data, file_in_write_mode)
logger.setLevel(logging.INFO)
logger.info(
'Data Successfully Saved to {}'.format(json_file_location))
else:
return json.dumps(data)
6 changes: 6 additions & 0 deletions twitter_scraper_selenium/requirements.txt
@@ -0,0 +1,6 @@
fake_headers==1.0.2
python_dateutil==2.8.2
requests==2.27.1
selenium==4.5.0
selenium_wire==4.6.4
webdriver_manager==3.2.2
23 changes: 23 additions & 0 deletions twitter_scraper_selenium/scraping_utilities.py
@@ -244,3 +244,26 @@ def build_topic_params(rest_id, cursor)
            'features': '{"responsive_web_twitter_blue_verified_badge_is_enabled":true,"verified_phone_label_enabled":true,"responsive_web_graphql_timeline_navigation_enabled":true,"unified_cards_ad_metadata_container_dynamic_card_content_query_enabled":true,"tweetypie_unmention_optimization_enabled":true,"responsive_web_uc_gql_enabled":true,"vibe_api_enabled":true,"responsive_web_edit_tweet_api_enabled":true,"graphql_is_translatable_rweb_tweet_is_translatable_enabled":true,"standardized_nudges_misinfo":true,"tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled":false,"interactive_text_enabled":true,"responsive_web_text_conversations_enabled":false,"responsive_web_enhance_cards_enabled":true}',
        }
        return params

    @staticmethod
    def build_params_for_profile(user_id, cursor=None):
        variables = {
            "userId": str(user_id),
            "count": 40,
            "includePromotedContent": True,
            "withQuickPromoteEligibilityTweetFields": True,
            "withSuperFollowsUserFields": True,
            "withDownvotePerspective": False,
            "withReactionsMetadata": False,
            "withReactionsPerspective": False,
            "withSuperFollowsTweetFields": True,
            "withVoice": True,
            "withV2Timeline": True,
        }
        if cursor:
            variables["cursor"] = cursor
        params = {
            "variables": json.dumps(variables),
            'features': '{"responsive_web_twitter_blue_verified_badge_is_enabled":true,"verified_phone_label_enabled":true,"responsive_web_graphql_timeline_navigation_enabled":true,"view_counts_public_visibility_enabled":true,"view_counts_everywhere_api_enabled":true,"tweetypie_unmention_optimization_enabled":true,"responsive_web_uc_gql_enabled":true,"vibe_api_enabled":true,"responsive_web_edit_tweet_api_enabled":true,"graphql_is_translatable_rweb_tweet_is_translatable_enabled":true,"standardized_nudges_misinfo":true,"tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled":false,"interactive_text_enabled":true,"responsive_web_text_conversations_enabled":false,"responsive_web_enhance_cards_enabled":true}',
        }
        return params
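
These params end up as the query string of the `UserTweets` GraphQL call issued from `Profile_api.scrape()`. The real request goes through `Scraping_utilities.make_http_request_with_params()`, whose body is not part of this diff; the sketch below uses `requests` only to illustrate how the built params are consumed, with placeholder values for the query id, headers, and user id:

```python
import requests

from twitter_scraper_selenium.scraping_utilities import Scraping_utilities

uid = 'GraphQL-query-id'                   # placeholder; captured by Profile_api.get_headers_and_uid()
headers = {'authorization': 'Bearer ...'}  # placeholder; also captured from the browser session
user_id = '123456789'                      # placeholder numeric rest_id of the profile

params = Scraping_utilities.build_params_for_profile(user_id, cursor=None)
response = requests.get(
    'https://twitter.com/i/api/graphql/{}/UserTweets'.format(uid),
    params=params, headers=headers, timeout=30)
timeline = response.json()  # this JSON is what Profile_api.find_entries() walks
```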