import os
import json
import re


def clean_text(text: str) -> str:
    """Return *text* with its control markup removed.

    The substitutions run in this fixed order:

    1. Every ``#`` or ``&`` is turned into a newline.
    2. A leading run of prefix codes is dropped. A prefix code is a
       backslash followed by exactly two ASCII letters/digits, optionally
       followed by whitespace.
    3. A trailing ``/`` followed by any number of ``%`` is dropped.
    4. Every ``^1`` pause marker is dropped.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    # Line-break markers become real newlines first.
    text = re.sub(r"[#&]", "\n", text)
    # Because the newline conversion has already happened, the optional
    # whitespace after a prefix code also swallows a newline that came from
    # a '#' or '&' placed directly after the prefix.
    text = re.sub(r"^(\\[A-Za-z0-9]{2}\s*)+", "", text)
    # Only a terminator at the very end is removed; a '/' in the middle of
    # the text is left untouched.
    text = re.sub(r"/%*$", "", text)
    # Pause markers can appear anywhere, so all occurrences are removed.
    text = re.sub(r"\^1", "", text)
    return text