2025-02-25 11:12:56 +08:00
|
|
|
|
import csv
|
2021-11-26 17:32:11 -08:00
|
|
|
|
import re
|
2025-02-25 11:12:56 +08:00
|
|
|
|
import mysql.connector
|
|
|
|
|
from datetime import datetime
|
2025-02-25 17:56:23 +08:00
|
|
|
|
import requests
|
2025-02-25 11:12:56 +08:00
|
|
|
|
|
|
|
|
|
# File paths: raw download target and the repaired output CSV.
INPUT_FILE = '../elonmusk.csv'
OUTPUT_FILE = '../fixed.csv'

# Database connection settings.
# NOTE(review): credentials are hard-coded in source — move to environment
# variables or a config file before sharing/committing this script.
DB_CONFIG = {
    'host': '8.155.23.172',
    'port': 3306,
    'user': 'root2',
    'password': 'tG0f6PVYh18le41BCb',
    'database': 'elonX'
}
# Destination table for the imported tweets.
TABLE_NAME = 'elon_tweets'
|
|
|
|
|
|
2025-02-25 17:56:23 +08:00
|
|
|
|
|
|
|
|
|
def download_file(file_path):
    """Download the raw tweet CSV from xtracker.io and save it to *file_path*.

    The response body is written as raw bytes (no text decoding), matching
    whatever encoding the server sends. All errors (network, filesystem) are
    caught and printed; the function never raises.

    Args:
        file_path: destination path for the downloaded file.
    """
    url = 'https://www.xtracker.io/api/download'
    # Browser-like headers copied from a real session; the endpoint appears to
    # expect them (Origin/Referer in particular).
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6,zh-TW;q=0.5',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/json',
        'Origin': 'https://www.xtracker.io',
        'Pragma': 'no-cache',
        'Referer': 'https://www.xtracker.io/',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0',
        'sec-ch-ua': '"Not(A:Brand";v="99", "Microsoft Edge";v="133", "Chromium";v="133"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"'
    }
    # Pre-serialized JSON body (Content-Type is already set above).
    data = '{"handle":"elonmusk","platform":"X"}'

    try:
        # FIX: added timeout — without one, requests.post can block forever on
        # a stalled connection. A timeout now surfaces as requests.exceptions.
        # Timeout, which the except below reports like any other failure.
        response = requests.post(url, headers=headers, data=data, timeout=30)
        if response.status_code == 200:
            # Save the raw bytes without decoding.
            with open(file_path, 'wb') as f:
                f.write(response.content)
            print(f"文件已成功下载到 {file_path}(原始字节数据)")
        else:
            print(f"下载失败,状态码:{response.status_code}")
            print(f"响应内容:{response.text}")
    except Exception as e:
        print(f"下载文件时出错:{e}")
|
|
|
|
|
|
2025-02-25 11:12:56 +08:00
|
|
|
|
# Step 1: repair the CSV file and prepend a rank_id column.
def fix_line(lines, line_number, rank_id):
    """Collapse one buffered (possibly multi-line) CSV record into a fixed row.

    The record is expected to look like: id,"text","Mon DD, H:MM:SS AM/PM EST".
    Inner double quotes in the text are escaped CSV-style (doubled) and the
    given rank_id is prepended. If the record does not match the expected
    shape, an error is reported and the raw record is returned with only the
    rank_id prepended.

    Args:
        lines: buffered physical lines making up one logical record.
        line_number: line number used only in the error report.
        rank_id: sequential id to prepend.

    Returns:
        The repaired CSV row as a single string (no trailing newline).
    """
    record = ''.join(lines)
    pattern = r'^([^,]+),"(.+?)","([A-Z][a-z]{2} \d{1,2}, \d{1,2}:\d{2}:\d{2} (AM|PM) E[SD]T)"$'
    # DOTALL lets the non-greedy text group span embedded (joined) lines.
    m = re.search(pattern, record, re.DOTALL)
    if m is None:
        print(f"Line {line_number} format error: {repr(record)}")
        return f'{rank_id},{record}'
    tweet_id, body, stamp = m.group(1), m.group(2), m.group(3)
    escaped_body = body.replace('"', '""')
    return f'{rank_id},{tweet_id},"{escaped_body}","{stamp}"'
|
2025-02-24 17:27:20 +08:00
|
|
|
|
|
2025-02-25 17:56:23 +08:00
|
|
|
|
|
2025-02-24 17:27:20 +08:00
|
|
|
|
def process_file(input_file, output_file):
    """Rebuild the raw CSV: merge wrapped records, escape quotes, add rank_id.

    Physical lines are accumulated until one ends with the quoted timestamp
    that terminates every record; the buffered lines are then repaired via
    fix_line() and written out with a sequential rank_id. The original header
    line is dropped and replaced with a four-column header.

    Args:
        input_file: path to the raw downloaded CSV.
        output_file: path for the repaired CSV.
    """
    record_end = r'"[A-Z][a-z]{2} \d{1,2}, \d{1,2}:\d{2}:\d{2} (AM|PM) E[SD]T"$'
    with open(input_file, 'r', encoding='utf-8') as f_in, open(output_file, 'w', encoding='utf-8') as f_out:
        f_out.write("rank_id,id,text,created_at\n")
        pending = []
        line_no = 0
        next_rank = 1
        for raw in f_in:
            raw = raw.rstrip('\n')
            # Drop the source header; we wrote our own above.
            if raw.startswith('id,text,created_at'):
                continue
            line_no += 1
            pending.append(raw)
            # A record is complete once a line ends in the quoted timestamp.
            if raw.endswith('"') and re.search(record_end, raw):
                f_out.write(fix_line(pending, line_no, next_rank) + '\n')
                pending = []
                next_rank += 1
        # Flush any trailing record that never hit the terminator.
        if pending:
            f_out.write(fix_line(pending, line_no, next_rank) + '\n')
    print(f"CSV 文件已修复并添加 rank_id,保存为 {output_file}")
|
|
|
|
|
|
2025-02-25 17:56:23 +08:00
|
|
|
|
|
2025-02-25 11:12:56 +08:00
|
|
|
|
# Step 2: database operations.
def get_max_rank_id(cursor):
    """Return the largest rank_id already stored in the table.

    Args:
        cursor: an open MySQL cursor.

    Returns:
        The maximum rank_id, or 0 when the table is empty or the query fails.
    """
    try:
        cursor.execute(f"SELECT MAX(rank_id) FROM {TABLE_NAME}")
        top = cursor.fetchone()[0]
        # MAX() over an empty table yields NULL -> None.
        return 0 if top is None else top
    except mysql.connector.Error as e:
        print(f"获取最大 rank_id 出错: {e}")
        return 0
|
|
|
|
|
|
2025-02-25 17:56:23 +08:00
|
|
|
|
|
2025-02-25 11:12:56 +08:00
|
|
|
|
def import_to_database(input_file):
    """Import new rows from the repaired CSV into MySQL, then backfill timestamps.

    Only rows whose rank_id exceeds the table's current maximum are inserted
    (incremental import). After inserting, a bulk UPDATE converts each new
    row's textual created_at (e.g. "Feb 25, 11:12:56 AM EST") plus the stored
    year into a unix timestamp, shifted by +8 hours.

    All database errors are caught and printed; the connection is closed in
    the finally block. The function never raises.

    Args:
        input_file: path to the repaired CSV (rank_id,id,text,created_at).
    """
    try:
        conn = mysql.connector.connect(**DB_CONFIG)
        cursor = conn.cursor()
        print("成功连接到数据库")

        # Current year — the CSV's created_at carries no year component,
        # so the import year is assumed. NOTE(review): rows from a previous
        # year imported after New Year would get the wrong year — confirm.
        current_year = str(datetime.now().year)

        # Highest rank_id already present; only rows beyond it are inserted.
        max_rank_id = get_max_rank_id(cursor)
        print(f"数据库中最大 rank_id: {max_rank_id}")

        with open(input_file, 'r', encoding='utf-8') as f:
            reader = csv.reader(f)
            next(reader)  # skip the header row
            total_rows = 0
            inserted = 0

            for row in reader:
                if len(row) != 4:
                    print(f"跳过无效行: {row}")
                    continue
                rank_id, id_, text, created_at = row
                rank_id = int(rank_id)
                # NOTE(review): float has ~15-16 significant digits; 19-digit
                # tweet ids may lose precision here — confirm the column type
                # and whether int would be correct instead.
                tweet_id = float(id_)

                # Only import records newer than what the table already holds.
                if rank_id <= max_rank_id:
                    continue

                total_rows += 1
                insert_query = f"""
                INSERT INTO {TABLE_NAME} (rank_id, id, text, year, created_at, timestamp)
                VALUES (%s, %s, %s, %s, %s, %s)
                """
                # timestamp is inserted as 0 and backfilled by the UPDATE below.
                cursor.execute(insert_query, (rank_id, tweet_id, text, current_year, created_at, 0))
                inserted += 1
                print(f"文本: 【{text}】,这是第 {inserted} 行")

        conn.commit()
        print(f"数据库导入完成:总计处理 {total_rows} 行,插入 {inserted} 行")

        # Backfill timestamp for the newly inserted rows: parse
        # "<year> Mon DD, H:MM:SS AM/PM", convert ET -> UTC, then add 8h
        # (apparently targeting UTC+8 local display — confirm intent).
        # NOTE(review): max_rank_id is interpolated into the SQL rather than
        # bound as a parameter; it is an int produced by our own code, so no
        # injection risk, but binding it would be cleaner.
        update_query = f"""
        UPDATE {TABLE_NAME}
        SET timestamp = UNIX_TIMESTAMP(
            CONVERT_TZ(
                STR_TO_DATE(
                    CONCAT(year, ' ', SUBSTRING_INDEX(created_at, ' ', 4)),
                    '%Y %b %d, %l:%i:%s %p'
                ),
                CASE
                    WHEN RIGHT(created_at, 3) = 'EDT' THEN 'America/New_York'
                    WHEN RIGHT(created_at, 3) = 'EST' THEN 'America/New_York'
                    ELSE 'UTC'
                END,
                'UTC'
            )
        ) + 8*60*60
        WHERE rank_id > {max_rank_id}
        """
        cursor.execute(update_query)
        conn.commit()
        print(f"已更新 rank_id > {max_rank_id} 的记录的时间戳")

    except mysql.connector.Error as e:
        print(f"数据库错误: {e}")
    except Exception as e:
        print(f"其他错误: {e}")
    finally:
        # Guard with locals(): connect() itself may have failed before
        # cursor/conn were ever bound.
        if 'cursor' in locals():
            cursor.close()
        if 'conn' in locals() and conn.is_connected():
            conn.close()
            print("数据库连接已关闭")
|
|
|
|
|
|
2025-02-25 17:56:23 +08:00
|
|
|
|
|
2025-02-25 11:12:56 +08:00
|
|
|
|
# Main pipeline
def main():
    """Run the full pipeline: download, repair, import.

    Steps are order-dependent: the raw CSV must exist before it can be
    repaired, and the repaired CSV before it can be imported.
    """
    download_file(INPUT_FILE)  # download the raw file first
    process_file(INPUT_FILE, OUTPUT_FILE)  # repair records, add rank_id
    import_to_database(OUTPUT_FILE)  # incremental load into MySQL
|
|
|
|
|
|
2025-02-25 17:56:23 +08:00
|
|
|
|
|
2025-02-25 11:12:56 +08:00
|
|
|
|
# Run the pipeline only when executed as a script (not on import).
if __name__ == "__main__":
    main()
|