parent a47864943c
commit 2f8a4aa3e3

get_tweets.py | 301
@@ -1,254 +1,77 @@
-import requests
 import re
-import csv
-import sys
-import json
-import argparse
-from time import sleep
-
-GUEST_TOKEN_ENDPOINT = "https://api.twitter.com/1.1/guest/activate.json"
-STATUS_ENDPOINT = "https://twitter.com/i/api/graphql/"
-
-CURSOR_PATTERN = re.compile('TimelineCursor","value":"([^\"]+)"[^\}]+Bottom"')
-ID_PATTERN = re.compile('"rest_id":"([^"]+)"')
-COUNT_PATTERN = re.compile('"statuses_count":([0-9]+)')
-
-variables = {
-    "count": 200,
-    "withTweetQuoteCount": True,
-    "includePromotedContent": True,
-    "withQuickPromoteEligibilityTweetFields": False,
-    "withSuperFollowsUserFields": True,
-    "withUserResults": True,
-    "withBirdwatchPivots": False,
-    "withDownvotePerspective": False,
-    "withReactionsMetadata": False,
-    "withReactionsPerspective": False,
-    "withSuperFollowsTweetFields": True,
-    "withVoice": True,
-    "withV2Timeline": False,
-}
-
-features = {
-    "standardized_nudges_misinfo": True, "dont_mention_me_view_api_enabled": True, "responsive_web_edit_tweet_api_enabled": True, "interactive_text_enabled": True, "responsive_web_enhance_cards_enabled": True, "responsive_web_uc_gql_enabled": True, "vibe_tweet_context_enabled": True,
-}
-
-
-def send_request(url, session_method, headers, params=None):
-    if params:
-        response = session_method(url, headers=headers, stream=True, params={
-            "variables": json.dumps(params),
-            "features": json.dumps(features)
-        })
-    else:
-        response = session_method(url, headers=headers, stream=True)
-
-    if response.status_code != 200:
-        print(response.request.url)
-        print(response.status_code)
-
-    assert response.status_code == 200, f"Failed request to {url}. {response.status_code}. Please submit an issue including this information. {response.text}"
-    result = [line.decode("utf-8") for line in response.iter_lines()]
-    return "".join(result)
-
-
-def search_json(j, target_key, result):
-    if type(j) == dict:
-        for key in j:
-            if key == target_key:
-                result.append(j[key])
-
-            search_json(j[key], target_key, result)
-
-        return result
-
-    if type(j) == list:
-        for item in j:
-            search_json(item, target_key, result)
-
-        return result
-
-    return result
-
-
-def tweet_subset(d):
-    return {
-        "id": d["id_str"],
-        "text": d["full_text"],
-        "created_at": d["created_at"],
-        "retweet_count": d["retweet_count"],
-        "favorite_count": d["favorite_count"],
-        "reply_count": d["reply_count"],
-        "quote_count": d["quote_count"],
-        "retweeted": d["retweeted"],
-        "is_quote_status": d["is_quote_status"],
-        "possibly_sensitive": d.get("possibly_sensitive", "No data"),
-    }
-
-
-def get_tweets(query_id, session, headers, variables, expected_total):
-    resp = send_request(
-        f"{STATUS_ENDPOINT}{query_id}/UserTweetsAndReplies", session.get, headers, variables)
-    j = json.loads(resp)
-    all_tweets = search_json(j, "legacy", [])
-
-    all_tweets = [tweet for tweet in all_tweets if "id_str" in tweet]
-    ids = {tweet["id_str"] for tweet in all_tweets}
-
-    while True:
-        cursor = CURSOR_PATTERN.findall(resp)[0]
-        variables["cursor"] = cursor
-        resp = send_request(
-            f"{STATUS_ENDPOINT}{query_id}/UserTweetsAndReplies", session.get, headers, variables)
-        j = json.loads(resp)
-
-        next_tweets = search_json(j, "legacy", [])
-        next_tweets = [tweet for tweet in next_tweets if "id_str" in tweet]
-        next_ids = {tweet["id_str"] for tweet in next_tweets}
-
-        old_id_size = len(ids)
-        ids.update(next_ids)
-        if old_id_size == len(ids):
-            break
-
-        all_tweets.extend(next_tweets)
-        if args.output:
-            print(f"{len(all_tweets)} / {expected_total}", end="\r")
-
-    all_tweets = [tweet for tweet in all_tweets if "full_text" in tweet and tweet.get(
-        "user_id_str", "") == variables["userId"]]
-    return all_tweets
-
-
-def get_id_and_tweet_count(session, headers, query_id, username):
-    resp = send_request(
-        f"{STATUS_ENDPOINT}{query_id}/UserByScreenName",
-        session.get,
-        headers,
-        params={
-            "screen_name": username,
-            "withSafetyModeUserFields": True,
-            "withSuperFollowsUserFields": True
-        }
-    )
-
-    ids = ID_PATTERN.findall(resp)
-    assert len(
-        ids) == 1, f"Failed to find user id for {username}. Please open this as an issue including this message."
-
-    counts = COUNT_PATTERN.findall(resp)
-    assert len(
-        counts) == 1, f"Failed to find tweet count for {username}. Please open this as an issue including this message."
-
-    return ids[0], int(counts[0])
-
-
-def user_tweets(username):
-    print(f"Getting Tweets for {username}")
-    session = requests.Session()
-    headers = {}
-
-    # One of the js files from original url holds the bearer token and query id.
-    container = send_request(
-        f"https://twitter.com/{username}", session.get, headers
-    )
-    js_files = re.findall("src=['\"]([^'\"()]*js)['\"]", container)
-
-    bearer_token = None
-    query_id = None
-    user_query_id = None
-
-    # Search the javascript files for a bearer token and UserTweets queryId
-    for f in js_files:
-        file_content = send_request(f, session.get, headers)
-        bt = re.search(
-            '["\'](AAA[a-zA-Z0-9%-]+%[a-zA-Z0-9%-]+)["\']', file_content)
-
-        ops = re.findall(
-            '\{queryId:"[a-zA-Z0-9_]+[^\}]+UserTweetsAndReplies"', file_content)
-        query_op = [op for op in ops if "UserTweetsAndReplies" in op]
-
-        if len(query_op) == 1:
-            query_id = re.findall('queryId:"([^"]+)"', query_op[0])[0]
-
-        if bt:
-            bearer_token = bt.group(1)
-
-        ops = re.findall(
-            '\{queryId:"[a-zA-Z0-9_]+[^\}]+UserByScreenName"', file_content)
-        user_query_op = [op for op in ops if "UserByScreenName" in op]
-
-        if len(user_query_op) == 1:
-            user_query_id = re.findall(
-                'queryId:"([^"]+)"', user_query_op[0])[0]
-
-    assert bearer_token, f"Did not find bearer token. Are you sure you used the right username? {username}"
-    assert query_id, f"Did not find query id. Are you sure you used the right twitter username? {username}"
-    assert user_query_id, f"Did not find user query id. Are you sure you used the right twitter username? {username}"
-
-    headers['authorization'] = f"Bearer {bearer_token}"
-
-    guest_token_resp = send_request(
-        GUEST_TOKEN_ENDPOINT, session.post, headers)
-    guest_token = json.loads(guest_token_resp)['guest_token']
-    assert guest_token, f"Did not find guest token. Probably means the script is broken. Please submit an issue. Include this message in your issue: {username}"
-    headers['x-guest-token'] = guest_token
-
-    user_id, total_count = get_id_and_tweet_count(
-        session, headers, user_query_id, username
-    )
-
-    session.close()
-
-    variables["userId"] = user_id
-
-    resp = get_tweets(query_id, session, headers, variables, total_count)
-    all_tweets = [tweet_subset(tweet) for tweet in resp]
-
-    for tweet in all_tweets:
-        tweet["url"] = f"https://twitter.com/{username}/status/{tweet['id']}"
-        tweet["username"] = username
-
-    return all_tweets
-
-
-if __name__ == "__main__":
-
-    parser = argparse.ArgumentParser(description="Get tweets for a user.")
-    group = parser.add_mutually_exclusive_group(required=True)
-    group.add_argument(
-        "--username",
-        help="The username of the user to get tweets for.",
-        required=False
-    )
-    group.add_argument(
-        "--usersFile",
-        help="A file containing a list of usernames to get tweets for.",
-        required=False
-    )
-    parser.add_argument(
-        "--output", help="The output file to write to. If not specified, prints to stdout."
-    )
-
-    args = parser.parse_args()
-
-    usernames = []
-
-    if args.username:
-        usernames.append(args.username)
-
-    if args.usersFile:
-        with open(args.usersFile) as f:
-            usernames.extend(f.read().splitlines())
-
-    all_tweets = []
-    for username in usernames:
-        try:
-            all_tweets.extend(user_tweets(username))
-            print("Sleeping 10s to avoid rate limit.")
-            sleep(10)
-        except Exception as e:
-            print(f"Failed to get tweets for {username}")
-            print(e)
-
-    headers = all_tweets[0].keys()
-    writer = csv.DictWriter(open(args.output, "w")
-                            if args.output else sys.stdout, fieldnames=headers)
-    writer.writeheader()
-    writer.writerows(all_tweets)
+
+
+def fix_line(lines, line_number):
+    """
+    Fix a CSV row so that quotes and newlines inside the text field are handled correctly.
+
+    Args:
+        lines (list): the input lines; one logical row may span several of them
+        line_number (int): the current line number (used for debugging)
+
+    Returns:
+        str: the repaired, single-line CSV string
+    """
+    full_line = ''.join(lines)
+
+    # Match a complete row with a regex
+    match = re.search(r'^([^,]+),"(.+?)","([A-Z][a-z]{2} \d{1,2}, \d{1,2}:\d{2}:\d{2} (AM|PM) E[SD]T)"$', full_line,
+                      re.DOTALL)
+    if match:
+        id_part = match.group(1)
+        text_content = match.group(2)
+        created_at = match.group(3)
+
+        # Escape quotes inside the text field
+        fixed_text = text_content.replace('"', '""')
+
+        # Debug output around line 4375
+        if 4370 <= line_number <= 4380:
+            print(f"Line {line_number}:")
+            print(f" Original text: {repr(text_content)}")
+            print(f" Fixed text: {repr(fixed_text)}")
+            print(f" Contains newline: {chr(10) in text_content}")
+
+        # Rebuild the CSV row
+        fixed_line = f'{id_part},"{fixed_text}","{created_at}"'
+        return fixed_line
+    else:
+        # If the format does not match, return the original content and warn
+        print(f"Line {line_number} format error: {repr(full_line)}")
+        return full_line
+
+
+def process_file(input_file, output_file):
+    """
+    Process the whole CSV file, fixing every row.
+    """
+    with open(input_file, 'r', encoding='utf-8') as f_in, open(output_file, 'w', encoding='utf-8') as f_out:
+        f_out.write("id,text,created_at\n")
+        buffer = []
+        line_number = 0
+
+        for line in f_in:
+            line = line.rstrip('\n')  # only strip the trailing newline; keep newlines inside fields
+            if line.startswith('id,text,created_at'):
+                continue
+
+            line_number += 1
+            buffer.append(line)
+
+            # Check whether this completes a full row
+            if line.endswith('"') and re.search(r'"[A-Z][a-z]{2} \d{1,2}, \d{1,2}:\d{2}:\d{2} (AM|PM) E[SD]T"$', line):
+                fixed_line = fix_line(buffer, line_number)
+                f_out.write(fixed_line + '\n')
+                buffer = []
+
+        # Handle any remaining, incomplete row
+        if buffer:
+            fixed_line = fix_line(buffer, line_number)
+            f_out.write(fixed_line + '\n')
+
+    print(f"File repaired, saved as {output_file}")
+
+
+# Process the file
+input_file = 'original.csv'
+output_file = 'fixed.csv'
+process_file(input_file, output_file)
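Note (not part of the commit): a minimal smoke test for the new script, as a sketch. The sample row below is invented for illustration, and running the committed file via subprocess with the "python" executable name is an assumption; get_tweets.py at this commit hardcodes original.csv as its input and fixed.csv as its output, and only needs the standard library.

# Sketch: write a tiny original.csv whose text field contains a stray quote and an
# embedded newline, run the committed script, and print the repaired output.
import subprocess

sample = (
    'id,text,created_at\n'
    '1234567890,"first part of the tweet with a stray " quote\n'
    'second part of the same tweet","Mar 3, 9:15:42 PM EST"\n'
)

with open("original.csv", "w", encoding="utf-8") as f:
    f.write(sample)

subprocess.run(["python", "get_tweets.py"], check=True)

with open("fixed.csv", encoding="utf-8") as f:
    print(f.read())

# Expected shape of the repaired row: the buffered pieces are joined with no separator
# and interior quotes are doubled, e.g.
# 1234567890,"first part of the tweet with a stray "" quotesecond part of the same tweet","Mar 3, 9:15:42 PM EST"

Because process_file strips each physical line's trailing newline before buffering, the pieces of a multi-line text field are concatenated directly, so the repaired file has exactly one physical line per row, matching the "single-line CSV string" described in the fix_line docstring.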
original.csv | 19177 (normal file)
File diff suppressed because it is too large