import re

# Shape of the timestamp that closes every record, e.g. "Jan 5, 1:02:03 PM EST".
_TIMESTAMP = r'[A-Z][a-z]{2} \d{1,2}, \d{1,2}:\d{2}:\d{2} (?:AM|PM) E[SD]T'

# A whole record: unquoted id, quoted text (may be empty or span several
# physical lines, hence DOTALL), quoted timestamp.
_RECORD_RE = re.compile(
    r'^([^,]+),"(.*?)","(' + _TIMESTAMP + r')"$', re.DOTALL
)

# A physical line that ends a record finishes with the quoted timestamp.
_RECORD_END_RE = re.compile(r'"' + _TIMESTAMP + r'"$')

# Physical line numbers for which verbose debugging output is printed.
_DEBUG_LINES = range(4370, 4381)


def fix_line(lines, line_number: int) -> str:
    """Repair one CSV record so quotes inside the text field are escaped.

    Args:
        lines: Physical lines making up the record, each WITHOUT its trailing
            newline. They are re-joined with "\\n" so that line breaks inside
            the text field survive.
        line_number: Number of the last physical line of the record; used
            only for diagnostics.

    Returns:
        The record as ``id,"text","created_at"`` with every double quote in
        the text doubled. If the record does not have the expected shape, a
        warning is printed and the joined input is returned unchanged.
    """
    full_line = '\n'.join(lines)

    match = _RECORD_RE.search(full_line)
    if match is None:
        # Unknown layout: pass the content through untouched, but report it.
        print(f"Line {line_number} format error: {repr(full_line)}")
        return full_line

    id_part, text_content, created_at = match.group(1, 2, 3)

    # Escape quotes CSV-style by doubling them.
    # NOTE(review): quotes that are already doubled in the input get doubled
    # again; confirm the input never contains pre-escaped quotes.
    fixed_text = text_content.replace('"', '""')

    if line_number in _DEBUG_LINES:
        # Evaluated outside the f-string: a backslash inside an f-string
        # expression is a syntax error before Python 3.12.
        has_newline = '\n' in text_content
        print(f"Line {line_number}:")
        print(f" Original text: {repr(text_content)}")
        print(f" Fixed text: {repr(fixed_text)}")
        print(f" Contains newline: {has_newline}")

    return f'{id_part},"{fixed_text}","{created_at}"'


def process_file(input_file, output_file) -> None:
    """Repair every record of a CSV file and write the result.

    Physical lines are accumulated until one ends with a quoted timestamp,
    which marks the end of a record; the accumulated lines are then repaired
    by ``fix_line`` and written out. A fresh ``id,text,created_at`` header is
    always written first, and any input line starting with that header text
    is skipped.

    Args:
        input_file: Path of the CSV file to read (UTF-8).
        output_file: Path of the repaired CSV file to write (UTF-8).
    """
    with open(input_file, 'r', encoding='utf-8') as f_in, \
            open(output_file, 'w', encoding='utf-8') as f_out:
        f_out.write("id,text,created_at\n")

        buffer = []
        line_number = 0
        for line in f_in:
            # Drop only the line terminator; fix_line restores the newline
            # between buffered lines when it joins them.
            line = line.rstrip('\n')
            if line.startswith('id,text,created_at'):
                continue
            line_number += 1
            buffer.append(line)

            # A quoted timestamp at the end of the line closes the record.
            if _RECORD_END_RE.search(line):
                f_out.write(fix_line(buffer, line_number) + '\n')
                buffer = []

        # Flush a trailing record that never reached its timestamp.
        if buffer:
            f_out.write(fix_line(buffer, line_number) + '\n')

    print(f"文件已修复,保存为 {output_file}")


# Default paths used when the file is run as a script.
input_file = 'original.csv'
output_file = 'fixed.csv'

if __name__ == "__main__":
    process_file(input_file, output_file)