elon_py/get_tweets.py
2025-02-24 17:27:20 +08:00

77 lines
2.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
# One complete record: <id>,"<text>","<timestamp>" where the timestamp looks
# like "Feb 24, 5:27:20 PM EST".  DOTALL lets the text field span several
# physical lines; "(.*?)" (rather than "(.+?)") also accepts an empty text.
_RECORD_RE = re.compile(
    r'^([^,]+),"(.*?)","([A-Z][a-z]{2} \d{1,2}, \d{1,2}:\d{2}:\d{2} (AM|PM) E[SD]T)"$',
    re.DOTALL,
)


def fix_line(lines, line_number):
    """Repair one logical CSV record whose text field contains raw quotes.

    The record is expected to have the shape ``id,"text","created_at"``.
    Every double quote inside the text field is doubled (CSV escaping) and
    line breaks inside the text field are kept.

    Note: the text is assumed to be unescaped on input; quotes that are
    already doubled will be doubled again.

    Args:
        lines (list): Physical lines that make up the record, each WITHOUT
            its trailing newline.
        line_number (int): Number of the last physical line of the record;
            used only in diagnostic output.

    Returns:
        str: The repaired record (it may contain embedded newlines inside
        the quoted text field).  If the record does not match the expected
        shape, a warning is printed and the joined input is returned
        unchanged.
    """
    # The physical lines arrive with their newlines stripped, so the line
    # breaks that belong to the text field have to be restored here;
    # joining with '' would glue the end of one line to the start of the
    # next.  Blank lines before or after the record are not part of any
    # field, so they are trimmed to keep them out of the id column.
    full_line = '\n'.join(lines).strip('\n')

    match = _RECORD_RE.search(full_line)
    if match is None:
        # Unknown shape: warn and pass the content through untouched.
        print(f"Line {line_number} format error: {repr(full_line)}")
        return full_line

    id_part = match.group(1)
    text_content = match.group(2)
    created_at = match.group(3)

    # CSV escapes a double quote inside a quoted field by doubling it.
    fixed_text = text_content.replace('"', '""')

    # Extra diagnostics for the records around physical line 4375.
    if 4370 <= line_number <= 4380:
        # Evaluated outside the f-string: a backslash inside a replacement
        # field is a SyntaxError before Python 3.12.
        has_newline = '\n' in text_content
        print(f"Line {line_number}:")
        print(f" Original text: {repr(text_content)}")
        print(f" Fixed text: {repr(fixed_text)}")
        print(f" Contains newline: {has_newline}")

    # Rebuild the record with the escaped text.
    return f'{id_part},"{fixed_text}","{created_at}"'
def process_file(input_file, output_file):
    """Repair every record of a CSV file and write the result to a new file.

    The input is read line by line.  Physical lines are collected until one
    ends with a quoted timestamp (e.g. ``"Feb 24, 5:27:20 PM EST"``), which
    marks the end of a logical record; the collected lines are then handed
    to ``fix_line`` and its result is written out.  A fresh
    ``id,text,created_at`` header is always written first, and any input
    line starting with that header is skipped.

    Args:
        input_file: Path of the CSV file to read (UTF-8).
        output_file: Path of the repaired CSV file to write (UTF-8).
    """
    # A physical line closes a record when it ends with the quoted timestamp.
    record_end = re.compile(
        r'"[A-Z][a-z]{2} \d{1,2}, \d{1,2}:\d{2}:\d{2} (AM|PM) E[SD]T"$'
    )

    with open(input_file, 'r', encoding='utf-8') as src, \
            open(output_file, 'w', encoding='utf-8') as dst:
        dst.write("id,text,created_at\n")

        pending = []      # physical lines of the record being assembled
        line_number = 0   # counts physical lines, header lines excluded

        for raw in src:
            # Drop only the line terminator; everything else is kept as is.
            physical = raw.rstrip('\n')
            if physical.startswith('id,text,created_at'):
                continue

            line_number += 1
            pending.append(physical)

            # Keep collecting until the record's closing timestamp shows up.
            if not (physical.endswith('"') and record_end.search(physical)):
                continue

            dst.write(fix_line(pending, line_number) + '\n')
            pending = []

        # Flush whatever is left when the input ends mid-record.
        if pending:
            dst.write(fix_line(pending, line_number) + '\n')

    print(f"文件已修复,保存为 {output_file}")
# Run the repair: read the raw export and write the cleaned-up CSV.
input_file, output_file = 'original.csv', 'fixed.csv'
process_file(input_file, output_file)