2021-11-26 17:32:11 -08:00
|
|
|
|
import re
|
|
|
|
|
|
|
|
|
|
|
2025-02-24 17:27:20 +08:00
|
|
|
|
# One complete record: id,"text","Mon D, H:MM:SS AM|PM EST|EDT".
# DOTALL lets the text field span embedded newlines. The text group uses
# `.*?` so that an empty text field is still recognised as a valid record.
_RECORD_RE = re.compile(
    r'^([^,]+),"(.*?)","([A-Z][a-z]{2} \d{1,2}, \d{1,2}:\d{2}:\d{2} (AM|PM) E[SD]T)"$',
    re.DOTALL,
)


def fix_line(lines, line_number: int) -> str:
    """Repair one CSV record so that quotes inside its text field are escaped.

    The elements of ``lines`` are concatenated without any separator and the
    result is matched against the ``id,"text","timestamp"`` record layout.
    Every double quote inside the text field is doubled (CSV escaping) and the
    record is rebuilt. Newlines already present in the elements are kept
    as-is, because the pattern is matched in DOTALL mode.

    Args:
        lines: Sequence of string fragments that together form one record.
        line_number: Line number of the record; only used for diagnostics.

    Returns:
        The rebuilt record without a trailing newline. If the concatenated
        input does not match the record layout, a warning is printed and the
        concatenated input is returned unchanged.
    """
    full_line = ''.join(lines)

    match = _RECORD_RE.search(full_line)
    if match is None:
        # Layout not recognised: warn and pass the content through untouched.
        print(f"Line {line_number} format error: {full_line!r}")
        return full_line

    id_part, text_content, created_at = match.group(1, 2, 3)

    # Escape quotes inside the text field by doubling them.
    fixed_text = text_content.replace('"', '""')

    # Extra diagnostics for the records around line 4375.
    if 4370 <= line_number <= 4380:
        # Computed outside the f-string: a backslash inside an f-string
        # expression is a SyntaxError before Python 3.12, and the check must
        # look for a real newline character, not a backslash followed by "n".
        has_newline = '\n' in text_content
        print(f"Line {line_number}:")
        print(f" Original text: {text_content!r}")
        print(f" Fixed text: {fixed_text!r}")
        print(f" Contains newline: {has_newline}")

    # Rebuild the record with the escaped text field.
    return f'{id_part},"{fixed_text}","{created_at}"'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def process_file(input_file, output_file):
    """Repair every record of a CSV file and write the result to a new file.

    The input is read line by line. Physical lines are buffered until one
    ends with the quoted timestamp that closes a record; the buffered lines
    are then handed to ``fix_line`` as a single record and the repaired record
    is written out. Lines starting with the ``id,text,created_at`` header are
    skipped, and a fresh header is written once at the top of the output.

    Args:
        input_file: Path of the CSV file to read (UTF-8).
        output_file: Path of the repaired CSV file to write (UTF-8).
    """
    header = 'id,text,created_at'
    # A physical line closes a record when it ends with the quoted timestamp.
    # Compiled once here instead of being looked up on every iteration.
    record_end = re.compile(
        r'"[A-Z][a-z]{2} \d{1,2}, \d{1,2}:\d{2}:\d{2} (AM|PM) E[SD]T"$'
    )

    def repair(pending, number):
        # Rejoin the physical lines with the newline that was stripped while
        # reading, so line breaks inside the text field survive. Joining them
        # with nothing in between would glue the last word of one line to the
        # first word of the next.
        return fix_line(['\n'.join(pending)], number)

    with open(input_file, 'r', encoding='utf-8') as f_in, \
            open(output_file, 'w', encoding='utf-8') as f_out:
        f_out.write(header + "\n")
        buffer = []
        line_number = 0

        for line in f_in:
            # Strip only the line terminator; it is restored by repair() for
            # lines that turn out to be in the middle of a record.
            line = line.rstrip('\n')
            if line.startswith(header):
                continue

            line_number += 1
            buffer.append(line)

            # The trailing `"$` in the pattern already requires the line to
            # end with a double quote, so no separate endswith check is needed.
            if record_end.search(line):
                f_out.write(repair(buffer, line_number) + '\n')
                buffer = []

        # Flush whatever is left when the file ends in the middle of a record.
        if buffer:
            f_out.write(repair(buffer, line_number) + '\n')

    print(f"文件已修复,保存为 {output_file}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Run the repair: read 'original.csv' and write the fixed records to 'fixed.csv'.
input_file, output_file = 'original.csv', 'fixed.csv'
process_file(input_file, output_file)
|