elon_py/get_tweets.py

77 lines
2.5 KiB
Python
Raw Normal View History

2021-11-26 17:32:11 -08:00
import re
2025-02-24 17:27:20 +08:00
def fix_line(lines, line_number):
    """
    Repair one CSV record so that quotes inside the text field are escaped.

    The fragments in ``lines`` are concatenated with no separator, so a record
    that was spread over several physical lines comes back as one line. The
    joined record is expected to look like ``id,"text","created_at"``, where
    ``created_at`` is a timestamp such as ``Nov 26, 5:32:11 PM EST``.

    Args:
        lines (list): Physical lines that make up one record (may be several).
        line_number (int): Current line number, used only for diagnostics.

    Returns:
        str: The rebuilt record with every ``"`` in the text field doubled, or
        the joined input unchanged when it does not match the expected layout
        (a warning is printed in that case).
    """
    full_line = ''.join(lines)
    # The text field is whatever sits between the first `,"` and the closing
    # `","<timestamp>"`. `.*?` (rather than `.+?`) lets a record with an empty
    # text field match instead of being reported as a format error.
    match = re.search(
        r'^([^,]+),"(.*?)","([A-Z][a-z]{2} \d{1,2}, \d{1,2}:\d{2}:\d{2} (AM|PM) E[SD]T)"$',
        full_line,
        re.DOTALL,
    )
    if not match:
        # Layout not recognised: warn and hand the content back untouched.
        print(f"Line {line_number} format error: {repr(full_line)}")
        return full_line

    id_part = match.group(1)
    text_content = match.group(2)
    created_at = match.group(3)

    # CSV escapes a quote inside a quoted field by doubling it.
    fixed_text = text_content.replace('"', '""')

    # Debug output for the records around line 4375.
    if 4370 <= line_number <= 4380:
        # Evaluated outside the f-string: a backslash inside an f-string
        # expression is a SyntaxError before Python 3.12.
        has_newline = '\n' in text_content
        print(f"Line {line_number}:")
        print(f" Original text: {repr(text_content)}")
        print(f" Fixed text: {repr(fixed_text)}")
        print(f" Contains newline: {has_newline}")

    # Rebuild the CSV record.
    return f'{id_part},"{fixed_text}","{created_at}"'
def process_file(input_file, output_file):
    """
    Repair every record of a CSV file and write the result to a new file.

    The output always starts with the header ``id,text,created_at``; any input
    line that begins with that header is skipped. Physical input lines are
    accumulated until one ends with the quoted timestamp column, at which
    point the accumulated lines are passed to ``fix_line`` and written out as
    one record.

    Args:
        input_file: Path of the CSV file to read (UTF-8).
        output_file: Path of the repaired CSV file to write (UTF-8).
    """
    # A record is complete once a line ends with the quoted timestamp column.
    record_end = re.compile(r'"[A-Z][a-z]{2} \d{1,2}, \d{1,2}:\d{2}:\d{2} (AM|PM) E[SD]T"$')

    with open(input_file, 'r', encoding='utf-8') as src, open(output_file, 'w', encoding='utf-8') as dst:
        dst.write("id,text,created_at\n")
        pending = []
        line_number = 0
        for raw in src:
            # Strip only the line terminator; all other characters are kept.
            current = raw.rstrip('\n')
            if current.startswith('id,text,created_at'):
                continue
            line_number += 1
            pending.append(current)
            # Keep accumulating until the record's closing timestamp shows up.
            if not current.endswith('"'):
                continue
            if record_end.search(current) is None:
                continue
            dst.write(fix_line(pending, line_number) + '\n')
            pending = []
        # Flush whatever is left if the input ended in the middle of a record.
        if pending:
            dst.write(fix_line(pending, line_number) + '\n')
    print(f"文件已修复,保存为 {output_file}")
# Default input/output paths used when this file is run as a script.
input_file = 'original.csv'
output_file = 'fixed.csv'

# Only process the file when executed directly, so that importing this module
# (e.g. to reuse fix_line) does not read or overwrite any files.
if __name__ == "__main__":
    process_file(input_file, output_file)