parent a47864943c
commit 2f8a4aa3e3

get_tweets.py | 301
@@ -1,254 +1,77 @@
-import requests
 import re
-import csv
-import sys
-import json
-import argparse
-from time import sleep
-
-GUEST_TOKEN_ENDPOINT = "https://api.twitter.com/1.1/guest/activate.json"
-STATUS_ENDPOINT = "https://twitter.com/i/api/graphql/"
-
-CURSOR_PATTERN = re.compile('TimelineCursor","value":"([^\"]+)"[^\}]+Bottom"')
-ID_PATTERN = re.compile('"rest_id":"([^"]+)"')
-COUNT_PATTERN = re.compile('"statuses_count":([0-9]+)')
-
-variables = {
-    "count": 200,
-    "withTweetQuoteCount": True,
-    "includePromotedContent": True,
-    "withQuickPromoteEligibilityTweetFields": False,
-    "withSuperFollowsUserFields": True,
-    "withUserResults": True,
-    "withBirdwatchPivots": False,
-    "withDownvotePerspective": False,
-    "withReactionsMetadata": False,
-    "withReactionsPerspective": False,
-    "withSuperFollowsTweetFields": True,
-    "withVoice": True,
-    "withV2Timeline": False,
-}
-
-features = {
-    "standardized_nudges_misinfo": True, "dont_mention_me_view_api_enabled": True, "responsive_web_edit_tweet_api_enabled": True, "interactive_text_enabled": True, "responsive_web_enhance_cards_enabled": True, "responsive_web_uc_gql_enabled": True, "vibe_tweet_context_enabled": True,
-}
-
-
-def send_request(url, session_method, headers, params=None):
-    if params:
-        response = session_method(url, headers=headers, stream=True, params={
-            "variables": json.dumps(params),
-            "features": json.dumps(features)
-        })
-    else:
-        response = session_method(url, headers=headers, stream=True)
-
-    if response.status_code != 200:
-        print(response.request.url)
-        print(response.status_code)
-
-    assert response.status_code == 200, f"Failed request to {url}. {response.status_code}. Please submit an issue including this information. {response.text}"
-    result = [line.decode("utf-8") for line in response.iter_lines()]
-    return "".join(result)
-
-
-def search_json(j, target_key, result):
-    if type(j) == dict:
-        for key in j:
-            if key == target_key:
-                result.append(j[key])
-
-            search_json(j[key], target_key, result)
-
-        return result
-
-    if type(j) == list:
-        for item in j:
-            search_json(item, target_key, result)
-
-        return result
-
-    return result
-
-
-def tweet_subset(d):
-    return {
-        "id": d["id_str"],
-        "text": d["full_text"],
-        "created_at": d["created_at"],
-        "retweet_count": d["retweet_count"],
-        "favorite_count": d["favorite_count"],
-        "reply_count": d["reply_count"],
-        "quote_count": d["quote_count"],
-        "retweeted": d["retweeted"],
-        "is_quote_status": d["is_quote_status"],
-        "possibly_sensitive": d.get("possibly_sensitive", "No data"),
-    }
-
-
-def get_tweets(query_id, session, headers, variables, expected_total):
-    resp = send_request(
-        f"{STATUS_ENDPOINT}{query_id}/UserTweetsAndReplies", session.get, headers, variables)
-    j = json.loads(resp)
-    all_tweets = search_json(j, "legacy", [])
-
-    all_tweets = [tweet for tweet in all_tweets if "id_str" in tweet]
-    ids = {tweet["id_str"] for tweet in all_tweets}
-
-    while True:
-        cursor = CURSOR_PATTERN.findall(resp)[0]
-        variables["cursor"] = cursor
-        resp = send_request(
-            f"{STATUS_ENDPOINT}{query_id}/UserTweetsAndReplies", session.get, headers, variables)
-        j = json.loads(resp)
-
-        next_tweets = search_json(j, "legacy", [])
-        next_tweets = [tweet for tweet in next_tweets if "id_str" in tweet]
-        next_ids = {tweet["id_str"] for tweet in next_tweets}
-
-        old_id_size = len(ids)
-        ids.update(next_ids)
-        if old_id_size == len(ids):
-            break
-
-        all_tweets.extend(next_tweets)
-        if args.output:
-            print(f"{len(all_tweets)} / {expected_total}", end="\r")
-
-    all_tweets = [tweet for tweet in all_tweets if "full_text" in tweet and tweet.get(
-        "user_id_str", "") == variables["userId"]]
-    return all_tweets
-
-
-def get_id_and_tweet_count(session, headers, query_id, username):
-    resp = send_request(
-        f"{STATUS_ENDPOINT}{query_id}/UserByScreenName",
-        session.get,
-        headers,
-        params={
-            "screen_name": username,
-            "withSafetyModeUserFields": True,
-            "withSuperFollowsUserFields": True
-        }
-    )
-
-    ids = ID_PATTERN.findall(resp)
-    assert len(
-        ids) == 1, f"Failed to find user id for {username}. Please open this as an issue including this message."
-
-    counts = COUNT_PATTERN.findall(resp)
-    assert len(
-        counts) == 1, f"Failed to find tweet count for {username}. Please open this as an issue including this message."
-
-    return ids[0], int(counts[0])
-
-
-def user_tweets(username):
-    print(f"Getting Tweets for {username}")
-    session = requests.Session()
-    headers = {}
-
-    # One of the js files from original url holds the bearer token and query id.
-    container = send_request(
-        f"https://twitter.com/{username}", session.get, headers
-    )
-    js_files = re.findall("src=['\"]([^'\"()]*js)['\"]", container)
-
-    bearer_token = None
-    query_id = None
-    user_query_id = None
-
-    # Search the javascript files for a bearer token and UserTweets queryId
-    for f in js_files:
-        file_content = send_request(f, session.get, headers)
-        bt = re.search(
-            '["\'](AAA[a-zA-Z0-9%-]+%[a-zA-Z0-9%-]+)["\']', file_content)
-
-        ops = re.findall(
-            '\{queryId:"[a-zA-Z0-9_]+[^\}]+UserTweetsAndReplies"', file_content)
-        query_op = [op for op in ops if "UserTweetsAndReplies" in op]
-
-        if len(query_op) == 1:
-            query_id = re.findall('queryId:"([^"]+)"', query_op[0])[0]
-
-        if bt:
-            bearer_token = bt.group(1)
-
-        ops = re.findall(
-            '\{queryId:"[a-zA-Z0-9_]+[^\}]+UserByScreenName"', file_content)
-        user_query_op = [op for op in ops if "UserByScreenName" in op]
-
-        if len(user_query_op) == 1:
-            user_query_id = re.findall(
-                'queryId:"([^"]+)"', user_query_op[0])[0]
-
-    assert bearer_token, f"Did not find bearer token. Are you sure you used the right username? {username}"
-    assert query_id, f"Did not find query id. Are you sure you used the right twitter username? {username}"
-    assert user_query_id, f"Did not find user query id. Are you sure you used the right twitter username? {username}"
-
-    headers['authorization'] = f"Bearer {bearer_token}"
-
-    guest_token_resp = send_request(
-        GUEST_TOKEN_ENDPOINT, session.post, headers)
-    guest_token = json.loads(guest_token_resp)['guest_token']
-    assert guest_token, f"Did not find guest token. Probably means the script is broken. Please submit an issue. Include this message in your issue: {username}"
-    headers['x-guest-token'] = guest_token
-
-    user_id, total_count = get_id_and_tweet_count(
-        session, headers, user_query_id, username
-    )
-
-    session.close()
-
-    variables["userId"] = user_id
-
-    resp = get_tweets(query_id, session, headers, variables, total_count)
-    all_tweets = [tweet_subset(tweet) for tweet in resp]
-
-    for tweet in all_tweets:
-        tweet["url"] = f"https://twitter.com/{username}/status/{tweet['id']}"
-        tweet["username"] = username
-
-    return all_tweets
-
-
-if __name__ == "__main__":
-
-    parser = argparse.ArgumentParser(description="Get tweets for a user.")
-    group = parser.add_mutually_exclusive_group(required=True)
-    group.add_argument(
-        "--username",
-        help="The username of the user to get tweets for.",
-        required=False
-    )
-    group.add_argument(
-        "--usersFile",
-        help="A file containing a list of usernames to get tweets for.",
-        required=False
-    )
-    parser.add_argument(
-        "--output", help="The output file to write to. If not specified, prints to stdout."
-    )
-
-    args = parser.parse_args()
-
-    usernames = []
-
-    if args.username:
-        usernames.append(args.username)
-
-    if args.usersFile:
-        with open(args.usersFile) as f:
-            usernames.extend(f.read().splitlines())
-
-    all_tweets = []
-    for username in usernames:
-        try:
-            all_tweets.extend(user_tweets(username))
-            print("Sleeping 10s to avoid rate limit.")
-            sleep(10)
-        except Exception as e:
-            print(f"Failed to get tweets for {username}")
-            print(e)
-
-    headers = all_tweets[0].keys()
-    writer = csv.DictWriter(open(args.output, "w")
-                            if args.output else sys.stdout, fieldnames=headers)
-    writer.writeheader()
-    writer.writerows(all_tweets)
+
+
+def fix_line(lines, line_number):
+    """
+    Fix a CSV row so that quotes and newlines inside the text field are handled correctly.
+
+    Args:
+        lines (list): the input lines; one logical row may span several of them
+        line_number (int): the current line number (used for debugging)
+
+    Returns:
+        str: the repaired, single-line CSV string
+    """
+    full_line = ''.join(lines)
+
+    # Match a complete row with a regex
+    match = re.search(r'^([^,]+),"(.+?)","([A-Z][a-z]{2} \d{1,2}, \d{1,2}:\d{2}:\d{2} (AM|PM) E[SD]T)"$', full_line,
+                      re.DOTALL)
+    if match:
+        id_part = match.group(1)
+        text_content = match.group(2)
+        created_at = match.group(3)
+
+        # Escape quotes inside the text field
+        fixed_text = text_content.replace('"', '""')
+
+        # Debug output around line 4375
+        if 4370 <= line_number <= 4380:
+            print(f"Line {line_number}:")
+            print(f" Original text: {repr(text_content)}")
+            print(f" Fixed text: {repr(fixed_text)}")
+            print(f" Contains newline: {chr(10) in text_content}")
+
+        # Rebuild the CSV row
+        fixed_line = f'{id_part},"{fixed_text}","{created_at}"'
+        return fixed_line
+    else:
+        # If the format does not match, return the original content and warn
+        print(f"Line {line_number} format error: {repr(full_line)}")
+        return full_line
+
+
+def process_file(input_file, output_file):
+    """
+    Process the whole CSV file, fixing every row.
+    """
+    with open(input_file, 'r', encoding='utf-8') as f_in, open(output_file, 'w', encoding='utf-8') as f_out:
+        f_out.write("id,text,created_at\n")
+        buffer = []
+        line_number = 0
+
+        for line in f_in:
+            line = line.rstrip('\n')  # only strip the trailing newline; keep newlines inside fields
+            if line.startswith('id,text,created_at'):
+                continue
+
+            line_number += 1
+            buffer.append(line)
+
+            # Check whether this completes a full row
+            if line.endswith('"') and re.search(r'"[A-Z][a-z]{2} \d{1,2}, \d{1,2}:\d{2}:\d{2} (AM|PM) E[SD]T"$', line):
+                fixed_line = fix_line(buffer, line_number)
+                f_out.write(fixed_line + '\n')
+                buffer = []
+
+        # Handle any remaining, incomplete row
+        if buffer:
+            fixed_line = fix_line(buffer, line_number)
+            f_out.write(fixed_line + '\n')
+
+    print(f"File repaired, saved as {output_file}")
+
+
+# Process the file
+input_file = 'original.csv'
+output_file = 'fixed.csv'
+process_file(input_file, output_file)
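Note (not part of the commit): a minimal smoke test for the new script, as a sketch. The sample row below is invented for illustration, and running the committed file via subprocess with the "python" executable name is an assumption; get_tweets.py at this commit hardcodes original.csv as its input and fixed.csv as its output, and only needs the standard library.

# Sketch: write a tiny original.csv whose text field contains a stray quote and an
# embedded newline, run the committed script, and print the repaired output.
import subprocess

sample = (
    'id,text,created_at\n'
    '1234567890,"first part of the tweet with a stray " quote\n'
    'second part of the same tweet","Mar 3, 9:15:42 PM EST"\n'
)

with open("original.csv", "w", encoding="utf-8") as f:
    f.write(sample)

subprocess.run(["python", "get_tweets.py"], check=True)

with open("fixed.csv", encoding="utf-8") as f:
    print(f.read())

# Expected shape of the repaired row: the buffered pieces are joined with no separator
# and interior quotes are doubled, e.g.
# 1234567890,"first part of the tweet with a stray "" quotesecond part of the same tweet","Mar 3, 9:15:42 PM EST"

Because process_file strips each physical line's trailing newline before buffering, the pieces of a multi-line text field are concatenated directly, so the repaired file has exactly one physical line per row, matching the "single-line CSV string" described in the fix_line docstring.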
original.csv | 19177 (normal file)
File diff suppressed because it is too large