import os
import json
import re


# Cleaning function
def clean_text(text):
    """Return *text* with its control markers removed.

    The substitutions run in this order:

    1. Every ``#`` or ``&`` is replaced by a newline character.
    2. A leading run of control codes is removed. A control code is a
       backslash followed by exactly two ASCII letters/digits, optionally
       followed by whitespace. Only a run at the very start of the string
       is removed (the pattern is anchored with ``^`` and is not multiline).
    3. Every ``/`` + zero or more ``%`` + ``$`` sequence is removed,
       wherever it occurs.
    4. Every ``^1`` marker is removed.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    # '#' and '&' both stand for a line break in the raw text.
    text = re.sub(r"[#&]", "\n", text)
    # Leading control codes such as "\A1 \b2" (prefix only).
    text = re.sub(r"^(\\[A-Za-z0-9]{2}\s*)+", "", text)
    # Control sequence "/", any number of "%", then a literal "$".
    text = re.sub(r"/%*\$", "", text)
    # Pause marker "^1".
    text = re.sub(r"\^1", "", text)
    return text


# Restore function
def restore_text(key, text, source_text):
    # NOTE(review): this function is cut off in the visible source (see the
    # end of the body). The statements below are reproduced unchanged; the
    # block nesting is reconstructed from syntax because the original
    # indentation was lost -- confirm against the original file.

    # 1. \n -> # or &
    # NOTE(review): both patterns below match a backslash followed by a
    # newline character, whereas clean_text emits a bare newline -- confirm
    # that this is the intended input format.
    if key in source_text:
        original = source_text[key]

        def replace_newline(m):
            # Use '#' when the source entry contains one, otherwise '&'.
            return "#" if "#" in original else "&"

        text = re.sub(r"\\\n", replace_newline, text)
    else:
        text = text.replace("\\\n", "&")

    # 2. Restore the prefix control codes (z* or tags)
    match_prefix = re.match(r"^(\\[A-Za-z0-9]{2}\s*)+", source_text.get(key, ""))
    if match_prefix:
        text = match_prefix.group(0) + text

    # 3. Restore the suffix control code /%*$
    match_suffix = re.search(r"/%*\$$", source_text.get(key, ""))
    if match_suffix:
        text = text + match_suffix.group(0)

    # 4. Add the pause marker ^1 before punctuation
    # NOTE(review): the source is truncated in the middle of the next
    # statement, which begins:
    #     text = re.sub(r"(?
    # The rest of step 4 and the function's return statement must be
    # recovered from the original file; until then this function falls
    # through without returning a value.