2021-11-26 17:32:11 -08:00
import requests
import re
2021-12-05 19:02:16 -08:00
import csv
import sys
2021-11-26 17:32:11 -08:00
import json
import argparse
2022-07-06 13:40:30 -07:00
from time import sleep
2021-11-26 17:32:11 -08:00
# Endpoint that is POSTed to in order to obtain a guest token.
GUEST_TOKEN_ENDPOINT = "https://api.twitter.com/1.1/guest/activate.json"
# Base URL for GraphQL operations; callers append "<queryId>/<OperationName>".
STATUS_ENDPOINT = "https://twitter.com/i/api/graphql/"

# The patterns below are applied to raw response text. They are raw strings so
# that regex syntax is never interpreted as a Python string escape.
# Captures the value of the timeline cursor that is tagged "Bottom".
CURSOR_PATTERN = re.compile(r'TimelineCursor","value":"([^"]+)"[^}]+Bottom"')
# Captures the value of a "rest_id" field.
ID_PATTERN = re.compile(r'"rest_id":"([^"]+)"')
# Captures the digits of a "statuses_count" field.
COUNT_PATTERN = re.compile(r'"statuses_count":([0-9]+)')
2021-11-26 17:32:11 -08:00
# Default "variables" payload for the timeline request. It is passed to
# get_tweets(), and send_request() JSON-encodes it into the query string.
variables = {
    "count": 200,
    "withTweetQuoteCount": True,
    "includePromotedContent": True,
    "withQuickPromoteEligibilityTweetFields": False,
    "withSuperFollowsUserFields": True,
    "withUserResults": True,
    "withBirdwatchPivots": False,
    "withDownvotePerspective": False,
    "withReactionsMetadata": False,
    "withReactionsPerspective": False,
    "withSuperFollowsTweetFields": True,
    "withVoice": True,
    "withV2Timeline": False,
}
2022-07-06 13:40:30 -07:00
# Feature flags that send_request() JSON-encodes into the "features" query
# parameter whenever it is called with request variables.
features = {
    "standardized_nudges_misinfo": True,
    "dont_mention_me_view_api_enabled": True,
    "responsive_web_edit_tweet_api_enabled": True,
    "interactive_text_enabled": True,
    "responsive_web_enhance_cards_enabled": True,
    "responsive_web_uc_gql_enabled": True,
    "vibe_tweet_context_enabled": True,
}
2021-12-05 19:02:16 -08:00
2021-11-26 17:32:11 -08:00
def send_request(url, session_method, headers, params=None):
    """Perform one HTTP request and return the response body as one string.

    Args:
        url: Address to request.
        session_method: Callable that performs the request, e.g. the ``get``
            or ``post`` method of a session. It is called with ``url`` and the
            keyword arguments ``headers``, ``stream`` and, optionally,
            ``params``.
        headers: Headers forwarded unchanged to ``session_method``.
        params: Optional request variables. When truthy they are JSON-encoded
            and sent as the ``variables`` query parameter, together with the
            module-level ``features`` sent as ``features``.

    Returns:
        The response lines, decoded as UTF-8 and joined without a separator.

    Raises:
        AssertionError: If the response status code is not 200.
    """
    if params:
        response = session_method(
            url,
            headers=headers,
            stream=True,
            params={
                "variables": json.dumps(params),
                "features": json.dumps(features),
            },
        )
    else:
        response = session_method(url, headers=headers, stream=True)

    try:
        if response.status_code != 200:
            print(response.request.url)
            print(response.status_code)
            # Raised explicitly instead of through ``assert`` so the check is
            # not stripped when Python runs with -O. The exception type is the
            # one callers have always seen.
            raise AssertionError(
                f"Failed request to {url}. {response.status_code}. "
                "Please submit an issue including this information. "
                f"{response.text}"
            )
        return "".join(line.decode("utf-8") for line in response.iter_lines())
    finally:
        # The request was made with stream=True, so the response has to be
        # closed explicitly once the body has been consumed.
        response.close()
def search_json(j, target_key, result):
    """Collect every value stored under ``target_key`` anywhere inside ``j``.

    The structure is walked depth-first through dicts and lists. Values are
    appended to ``result`` in the order they are encountered.

    Args:
        j: Decoded JSON data (dicts, lists and scalars, arbitrarily nested).
        target_key: Dictionary key to look for.
        result: List that receives the matches; it is modified in place.

    Returns:
        The same ``result`` list that was passed in.
    """
    # isinstance (rather than an exact type comparison) so that dict and list
    # subclasses are traversed as well.
    if isinstance(j, dict):
        for key, value in j.items():
            if key == target_key:
                result.append(value)
            # Keep descending after a match: the matched value may itself
            # contain further occurrences of the key.
            search_json(value, target_key, result)
    elif isinstance(j, list):
        for item in j:
            search_json(item, target_key, result)
    return result
2021-12-05 19:02:16 -08:00
2021-11-26 17:32:11 -08:00
def tweet_subset(d):
    """Reduce a tweet dictionary to the fields that are exported.

    Args:
        d: Tweet dictionary. It must contain ``id_str``, ``full_text``,
            ``created_at``, ``retweet_count``, ``favorite_count``,
            ``reply_count``, ``quote_count``, ``retweeted`` and
            ``is_quote_status``; ``possibly_sensitive`` is optional.

    Returns:
        A new dict with the selected fields, where ``id_str`` is exposed as
        ``id`` and ``full_text`` as ``text``.

    Raises:
        KeyError: If one of the mandatory fields is missing.
    """
    return {
        "id": d["id_str"],
        "text": d["full_text"],
        "created_at": d["created_at"],
        "retweet_count": d["retweet_count"],
        "favorite_count": d["favorite_count"],
        "reply_count": d["reply_count"],
        "quote_count": d["quote_count"],
        "retweeted": d["retweeted"],
        "is_quote_status": d["is_quote_status"],
        # Not every tweet carries this flag, so fall back to a placeholder.
        "possibly_sensitive": d.get("possibly_sensitive", "No data"),
    }
2021-12-05 19:02:16 -08:00
2021-11-26 17:32:11 -08:00
def get_tweets(query_id, session, headers, variables, expected_total):
    """Page through the UserTweetsAndReplies operation and collect tweets.

    Pages are requested until a response contains no bottom cursor or a page
    contributes no previously unseen ``id_str``.

    Args:
        query_id: Query id placed in the request URL.
        session: Session whose ``get`` method performs the requests.
        headers: Headers sent with every request.
        variables: Request variables; must contain ``userId``. The mapping is
            not modified.
        expected_total: Total shown in the progress line (display only).

    Returns:
        The collected ``legacy`` dictionaries that contain ``full_text`` and
        whose ``user_id_str`` equals ``variables["userId"]``.
    """
    url = f"{STATUS_ENDPOINT}{query_id}/UserTweetsAndReplies"

    # Paginate on a private copy. Writing the cursor into the caller's dict
    # would make the next user's first request start from a stale cursor.
    request_vars = dict(variables)

    # ``args`` only exists as a module global when the file runs as a script;
    # when the module is imported, progress output is simply skipped.
    cli_args = globals().get("args")
    show_progress = bool(getattr(cli_args, "output", None))

    resp = send_request(url, session.get, headers, request_vars)
    all_tweets = [
        tweet
        for tweet in search_json(json.loads(resp), "legacy", [])
        if "id_str" in tweet
    ]
    ids = {tweet["id_str"] for tweet in all_tweets}

    while True:
        cursors = CURSOR_PATTERN.findall(resp)
        if not cursors:
            # No further page advertised: stop instead of failing on [0].
            break
        request_vars["cursor"] = cursors[0]

        resp = send_request(url, session.get, headers, request_vars)
        next_tweets = [
            tweet
            for tweet in search_json(json.loads(resp), "legacy", [])
            if "id_str" in tweet
        ]

        known_before = len(ids)
        ids.update(tweet["id_str"] for tweet in next_tweets)
        if len(ids) == known_before:
            # The page brought nothing new, so the timeline is exhausted.
            break

        all_tweets.extend(next_tweets)
        if show_progress:
            print(f"{len(all_tweets)} / {expected_total}", end="\r")

    # "legacy" objects are collected from anywhere in the response, so keep
    # only those that carry tweet text and belong to the requested user.
    return [
        tweet
        for tweet in all_tweets
        if "full_text" in tweet
        and tweet.get("user_id_str", "") == variables["userId"]
    ]
2021-12-05 19:02:16 -08:00
2021-11-26 17:32:11 -08:00
def get_id_and_tweet_count(session, headers, query_id, username):
    """Look up a user's id and tweet count via the UserByScreenName operation.

    Args:
        session: Session whose ``get`` method performs the request.
        headers: Headers sent with the request.
        query_id: Query id placed in the request URL.
        username: Screen name to look up.

    Returns:
        A ``(user_id, tweet_count)`` tuple, where ``user_id`` is a string and
        ``tweet_count`` is an int.

    Raises:
        AssertionError: If the response does not contain exactly one
            ``rest_id`` or exactly one ``statuses_count``.
    """
    resp = send_request(
        f"{STATUS_ENDPOINT}{query_id}/UserByScreenName",
        session.get,
        headers,
        params={
            "screen_name": username,
            "withSafetyModeUserFields": True,
            "withSuperFollowsUserFields": True,
        },
    )

    # The checks raise explicitly instead of using ``assert`` so they still
    # run under ``python -O``; the exception type is unchanged for callers.
    ids = ID_PATTERN.findall(resp)
    if len(ids) != 1:
        raise AssertionError(
            f"Failed to find user id for {username}. "
            "Please open this as an issue including this message."
        )

    counts = COUNT_PATTERN.findall(resp)
    if len(counts) != 1:
        raise AssertionError(
            f"Failed to find tweet count for {username}. "
            "Please open this as an issue including this message."
        )

    return ids[0], int(counts[0])
2022-07-06 13:40:30 -07:00
def _find_query_id(js_source, operation):
    """Return the queryId declared for ``operation`` in ``js_source``.

    ``None`` is returned unless exactly one declaration of the operation is
    found in the text.
    """
    declarations = re.findall(
        r'\{queryId:"[a-zA-Z0-9_]+[^}]+' + re.escape(operation) + '"',
        js_source,
    )
    if len(declarations) != 1:
        return None
    return re.findall(r'queryId:"([^"]+)"', declarations[0])[0]


def user_tweets(username):
    """Fetch the tweets of ``username`` as a list of flat dictionaries.

    The user's page is downloaded, the JavaScript files it references are
    searched for a bearer token and for the query ids of the
    ``UserTweetsAndReplies`` and ``UserByScreenName`` operations, a guest
    token is requested, and the timeline is then paged through.

    Args:
        username: Screen name of the account.

    Returns:
        A list of dicts as produced by ``tweet_subset``, each extended with
        ``url`` and ``username``.

    Raises:
        AssertionError: If the bearer token, either query id or the guest
            token cannot be found, or if a request fails.
    """
    print(f"Getting Tweets for {username}")
    headers = {}

    # The session stays open for every request, including the timeline
    # paging, and is closed once when the block exits.
    with requests.Session() as session:
        # One of the js files from original url holds the bearer token and query id.
        container = send_request(
            f"https://twitter.com/{username}", session.get, headers
        )
        js_files = re.findall(r'''src=['"]([^'"()]*js)['"]''', container)

        bearer_token = None
        query_id = None
        user_query_id = None
        # Search the javascript files for a bearer token and UserTweets queryId
        for js_url in js_files:
            file_content = send_request(js_url, session.get, headers)

            token_match = re.search(
                r'''["'](AAA[a-zA-Z0-9%-]+%[a-zA-Z0-9%-]+)["']''',
                file_content,
            )
            if token_match:
                bearer_token = token_match.group(1)

            # A later file only replaces an earlier hit when it has one itself.
            query_id = (
                _find_query_id(file_content, "UserTweetsAndReplies") or query_id
            )
            user_query_id = (
                _find_query_id(file_content, "UserByScreenName") or user_query_id
            )

        # Explicit raises instead of ``assert`` so the checks survive -O.
        if not bearer_token:
            raise AssertionError(
                "Did not find bearer token. Are you sure you used the right "
                f"username? {username}"
            )
        if not query_id:
            raise AssertionError(
                "Did not find query id. Are you sure you used the right "
                f"twitter username? {username}"
            )
        if not user_query_id:
            raise AssertionError(
                "Did not find user query id. Are you sure you used the right "
                f"twitter username? {username}"
            )

        headers["authorization"] = f"Bearer {bearer_token}"

        guest_token_resp = send_request(
            GUEST_TOKEN_ENDPOINT, session.post, headers
        )
        # .get() so that a missing key yields the message below, not KeyError.
        guest_token = json.loads(guest_token_resp).get("guest_token")
        if not guest_token:
            raise AssertionError(
                "Did not find guest token. Probably means the script is "
                "broken. Please submit an issue. Include this message in "
                f"your issue: {username}"
            )
        headers["x-guest-token"] = guest_token

        user_id, total_count = get_id_and_tweet_count(
            session, headers, user_query_id, username
        )

        # Use a per-call copy of the shared defaults so that no state (user
        # id, pagination cursor) carries over to the next user.
        request_vars = {**variables, "userId": user_id}
        resp = get_tweets(query_id, session, headers, request_vars, total_count)

    all_tweets = [tweet_subset(tweet) for tweet in resp]
    for tweet in all_tweets:
        tweet["url"] = f"https://twitter.com/{username}/status/{tweet['id']}"
        tweet["username"] = username
    return all_tweets
if __name__ == "__main__":
    # Command-line entry point: collect tweets for one user or for every user
    # listed in a file, then write them as CSV to --output or to stdout.
    parser = argparse.ArgumentParser(description="Get tweets for a user.")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        "--username",
        help="The username of the user to get tweets for.",
        required=False,
    )
    group.add_argument(
        "--usersFile",
        help="A file containing a list of usernames to get tweets for.",
        required=False,
    )
    parser.add_argument(
        "--output",
        help="The output file to write to. If not specified, prints to stdout.",
    )
    # ``args`` must stay a module-level name: get_tweets() reads it to decide
    # whether to print progress.
    args = parser.parse_args()

    usernames = []
    if args.username:
        usernames.append(args.username)
    if args.usersFile:
        with open(args.usersFile) as f:
            # Skip blank lines; an empty username would otherwise be requested.
            usernames.extend(
                line.strip() for line in f.read().splitlines() if line.strip()
            )

    all_tweets = []
    for username in usernames:
        try:
            all_tweets.extend(user_tweets(username))
            print("Sleeping 10s to avoid rate limit.")
            sleep(10)
        except Exception as e:
            # Best effort: report the failure and carry on with the next user.
            print(f"Failed to get tweets for {username}")
            print(e)

    if not all_tweets:
        # Without any row there is no header to derive; exit with a clear
        # message instead of an IndexError.
        sys.exit("No tweets were collected; nothing to write.")

    fieldnames = list(all_tweets[0].keys())
    # newline="" is what the csv module expects for files it writes to.
    out_file = open(args.output, "w", newline="") if args.output else sys.stdout
    try:
        writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_tweets)
    finally:
        # Close only a file that was opened here; never close stdout.
        if out_file is not sys.stdout:
            out_file.close()