diff --git a/twscrape/api.py b/twscrape/api.py
index 14d304e..1fc37cf 100644
--- a/twscrape/api.py
+++ b/twscrape/api.py
@@ -3,7 +3,7 @@
 from .accounts_pool import AccountsPool
 from .constants import *
 from .logger import set_log_level
-from .models import Tweet, User
+from .models import Tweet, User, get_tweets, get_users
 from .queue_client import QueueClient
 from .utils import encode_params, find_obj, get_by_path, to_old_obj, to_old_rep
 
@@ -93,14 +93,9 @@ async def search_raw(self, q: str, limit=-1, kv=None):
             yield x
 
     async def search(self, q: str, limit=-1, kv=None):
-        twids = set()
         async for rep in self.search_raw(q, limit=limit, kv=kv):
-            obj = to_old_rep(rep.json())
-            for x in obj["tweets"].values():
-                tmp = Tweet.parse(x, obj)
-                if tmp.id not in twids:
-                    twids.add(tmp.id)
-                    yield tmp
+            for x in get_tweets(rep.json(), limit):
+                yield x
 
     # user_by_id
 
@@ -181,9 +176,8 @@ async def followers_raw(self, uid: int, limit=-1, kv=None):
 
     async def followers(self, uid: int, limit=-1, kv=None):
         async for rep in self.followers_raw(uid, limit=limit, kv=kv):
-            obj = to_old_rep(rep.json())
-            for _, v in obj["users"].items():
-                yield User.parse(v)
+            for x in get_users(rep.json(), limit):
+                yield x
 
     # following
 
@@ -195,9 +189,8 @@ async def following_raw(self, uid: int, limit=-1, kv=None):
 
     async def following(self, uid: int, limit=-1, kv=None):
         async for rep in self.following_raw(uid, limit=limit, kv=kv):
-            obj = to_old_rep(rep.json())
-            for _, v in obj["users"].items():
-                yield User.parse(v)
+            for x in get_users(rep.json(), limit):
+                yield x
 
     # retweeters
 
@@ -209,9 +202,8 @@ async def retweeters_raw(self, twid: int, limit=-1, kv=None):
 
     async def retweeters(self, twid: int, limit=-1, kv=None):
         async for rep in self.retweeters_raw(twid, limit=limit, kv=kv):
-            obj = to_old_rep(rep.json())
-            for _, v in obj["users"].items():
-                yield User.parse(v)
+            for x in get_users(rep.json(), limit):
+                yield x
 
     # favoriters
 
@@ -223,9 +215,8 @@ async def favoriters_raw(self, twid: int, limit=-1, kv=None):
 
     async def favoriters(self, twid: int, limit=-1, kv=None):
         async for rep in self.favoriters_raw(twid, limit=limit, kv=kv):
-            obj = to_old_rep(rep.json())
-            for _, v in obj["users"].items():
-                yield User.parse(v)
+            for x in get_users(rep.json(), limit):
+                yield x
 
     # user_tweets
 
@@ -245,9 +236,8 @@ async def user_tweets_raw(self, uid: int, limit=-1, kv=None):
 
     async def user_tweets(self, uid: int, limit=-1, kv=None):
         async for rep in self.user_tweets_raw(uid, limit=limit, kv=kv):
-            obj = to_old_rep(rep.json())
-            for _, v in obj["tweets"].items():
-                yield Tweet.parse(v, obj)
+            for x in get_tweets(rep.json(), limit):
+                yield x
 
     # user_tweets_and_replies
 
@@ -267,9 +257,8 @@ async def user_tweets_and_replies_raw(self, uid: int, limit=-1, kv=None):
 
     async def user_tweets_and_replies(self, uid: int, limit=-1, kv=None):
         async for rep in self.user_tweets_and_replies_raw(uid, limit=limit, kv=kv):
-            obj = to_old_rep(rep.json())
-            for _, v in obj["tweets"].items():
-                yield Tweet.parse(v, obj)
+            for x in get_tweets(rep.json(), limit):
+                yield x
 
     # list timeline
 
@@ -285,6 +274,5 @@ async def list_timeline_raw(self, list_id: int, limit=-1, kv=None):
 
     async def list_timeline(self, list_id: int, limit=-1, kv=None):
         async for rep in self.list_timeline_raw(list_id, limit=limit, kv=kv):
-            obj = to_old_rep(rep.json())
-            for x in obj["tweets"].values():
-                yield Tweet.parse(x, obj)
+            for x in get_tweets(rep.json(), limit):
+                yield x
diff --git a/twscrape/models.py b/twscrape/models.py
index 4b6183a..1ff7048 100644
--- a/twscrape/models.py
+++ b/twscrape/models.py
@@ -3,10 +3,12 @@
 import re
 from dataclasses import asdict, dataclass, field
 from datetime import datetime
-from typing import Optional
+from typing import Generator, Optional, Union
+
+import httpx
 
 from .logger import logger
-from .utils import find_item, get_or, int_or_none
+from .utils import find_item, get_or, int_or_none, to_old_rep
 
 
 @dataclass
@@ -115,7 +117,8 @@ class User(JSONTrait):
     # label: typing.Optional["UserLabel"] = None
 
     @staticmethod
-    def parse(obj: dict):
+    def parse(obj: dict, res=None):
+        # `res` is unused: accepted so get_items can call User.parse like Tweet.parse
         return User(
             id=int(obj["id_str"]),
             id_str=obj["id_str"],
@@ -373,3 +376,46 @@ def _get_views(obj: dict, rt_obj: dict):
             if k is not None:
                 return k
     return None
+
+
+# reply parsing
+
+
+def get_items(rep: Union[httpx.Response, dict], kind: str, limit: int = -1):
+    """Yield the unique parsed items of one kind found in a single API reply.
+
+    rep: an `httpx.Response` or its already decoded JSON body (a dict).
+    kind: "user" or "tweet"; any other value raises `ValueError` once iterated.
+    limit: maximum number of items yielded for this reply, -1 for no cap.
+    """
+    if kind == "user":
+        Cls = User
+        key = "users"
+    elif kind == "tweet":
+        Cls = Tweet
+        key = "tweets"
+    else:
+        raise ValueError(f"Invalid kind: {kind}")
+
+    # accept both forms; a Response is not a container, so it cannot be probed with `in`
+    obj = to_old_rep(rep if isinstance(rep, dict) else rep.json())
+
+    ids = set()
+    for x in obj[key].values():
+        if limit != -1 and len(ids) >= limit:
+            break
+
+        tmp = Cls.parse(x, obj)
+        if tmp.id not in ids:
+            ids.add(tmp.id)
+            yield tmp
+
+
+def get_tweets(rep: Union[httpx.Response, dict], limit: int = -1) -> Generator[Tweet, None, None]:
+    """Yield the unique `Tweet` objects parsed from one API reply."""
+    return get_items(rep, "tweet", limit)  # type: ignore
+
+
+def get_users(rep: Union[httpx.Response, dict], limit: int = -1) -> Generator[User, None, None]:
+    """Yield the unique `User` objects parsed from one API reply."""
+    return get_items(rep, "user", limit)  # type: ignore
diff --git a/twscrape/queue_client.py b/twscrape/queue_client.py
index 222c1bf..00e8d47 100644
--- a/twscrape/queue_client.py
+++ b/twscrape/queue_client.py
@@ -155,6 +155,11 @@ async def _check_rep(self, rep: httpx.Response):
             await self._close_ctx(-1, banned=True, msg=msg)
             raise BannedError(msg)
 
+        # possible banned by old api flow
+        if rep.status_code in (401, 403):
+            await self._close_ctx(utc_ts() + 60 * 60 * 12)  # lock for 12 hours
+            raise RateLimitError(msg)
+
         # content not found
         if rep.status_code == 200 and "_Missing: No status found with that ID." in msg:
             return  # ignore this error