77 lines
2.5 KiB
Python
77 lines
2.5 KiB
Python
import re
|
||
|
||
|
||
# One complete record: an unquoted id, a quoted text field and a quoted
# timestamp such as "Jan 5, 1:02:03 PM EST".  DOTALL lets the text field span
# several physical lines; `.*?` (rather than `.+?`) also accepts an empty text.
_RECORD_RE = re.compile(
    r'^([^,]+),"(.*?)","([A-Z][a-z]{2} \d{1,2}, \d{1,2}:\d{2}:\d{2} (?:AM|PM) E[SD]T)"$',
    re.DOTALL,
)


def fix_line(lines, line_number):
    """Repair one CSV record so that quotes inside its text field are escaped.

    The record is expected to look like ``id,"text","Mon D, H:MM:SS AM EST"``.
    Every double quote found inside the text field is doubled (``"`` -> ``""``).
    Note that this is not idempotent: text that already contains doubled
    quotes will have them doubled again.

    Args:
        lines (list): Physical lines making up the record, each without its
            trailing newline. A record spans several lines when its text
            field contains line breaks.
        line_number (int): Number of the last physical line of the record;
            used only for diagnostics.

    Returns:
        str: The repaired record, without a trailing newline. Line breaks
        inside the text field are kept. If the record does not match the
        expected layout, a warning is printed and the joined input is
        returned unchanged.
    """
    # The caller stripped the newline from every physical line, so joining
    # with '\n' restores the line breaks that belong to the text field.
    # Blank lines in front of the record are not part of it (the id comes
    # first and is never empty), so they are dropped.
    full_line = '\n'.join(lines).lstrip('\n')

    match = _RECORD_RE.search(full_line)
    if match is None:
        # Layout not recognised: warn and hand the content back untouched.
        print(f"Line {line_number} format error: {repr(full_line)}")
        return full_line

    id_part, text_content, created_at = match.groups()

    # Escape quotes inside the text field.
    fixed_text = text_content.replace('"', '""')

    # Diagnostics for the records around physical line 4375.
    if 4370 <= line_number <= 4380:
        # Evaluated outside the f-string: a backslash inside an f-string
        # expression is a SyntaxError before Python 3.12.
        has_newline = '\n' in text_content
        print(f"Line {line_number}:")
        print(f" Original text: {repr(text_content)}")
        print(f" Fixed text: {repr(fixed_text)}")
        print(f" Contains newline: {has_newline}")

    # Rebuild the record.
    return f'{id_part},"{fixed_text}","{created_at}"'
|
||
|
||
|
||
def process_file(input_file, output_file):
    """Repair every record of a CSV file and write the result to a new file.

    The input is read line by line. Physical lines are accumulated until one
    ends with the quoted timestamp that closes a record; the accumulated
    lines are then passed to ``fix_line`` and the result is written out.
    Header lines (``id,text,created_at``) found in the input are skipped and
    a single header is written at the top of the output.

    Args:
        input_file: Path of the CSV file to read (UTF-8).
        output_file: Path of the repaired CSV file to write (UTF-8);
            overwritten if it exists.
    """
    # A physical line closes a record when it ends with the quoted timestamp.
    # Compiled once instead of being looked up on every iteration.
    record_end = re.compile(
        r'"[A-Z][a-z]{2} \d{1,2}, \d{1,2}:\d{2}:\d{2} (AM|PM) E[SD]T"$'
    )

    with open(input_file, 'r', encoding='utf-8') as f_in, \
            open(output_file, 'w', encoding='utf-8') as f_out:
        f_out.write("id,text,created_at\n")
        buffer = []
        line_number = 0

        for line in f_in:
            # Strip only the line terminator; the physical lines of a record
            # are handed to fix_line as separate list items.
            line = line.rstrip('\n')
            if line.startswith('id,text,created_at'):
                continue

            line_number += 1

            # A blank line outside a record is not data. Buffering it would
            # produce a bogus "record" (e.g. for a blank line at end of file).
            if not buffer and not line.strip():
                continue

            buffer.append(line)

            # Flush once the closing timestamp has been reached.
            if line.endswith('"') and record_end.search(line):
                f_out.write(fix_line(buffer, line_number) + '\n')
                buffer = []

        # Whatever is left never reached a closing timestamp; fix_line
        # decides how to report and return it.
        if buffer:
            f_out.write(fix_line(buffer, line_number) + '\n')

    print(f"文件已修复,保存为 {output_file}")
|
||
|
||
|
||
# Default input and output paths used when the module is run as a script.
input_file = 'original.csv'
output_file = 'fixed.csv'

# Guarded so that importing this module does not read or overwrite any file.
if __name__ == "__main__":
    process_file(input_file, output_file)