"""Clean control markers out of JSON text tables and restore them afterwards.

For every ``*.json`` file under an input directory tree the script writes a
cleaned copy (control markers removed) and then a restored copy (markers put
back, using the original text as the reference).
"""

import json
import os
import re

# Line-break markers used in the raw text: "#" or "&".
_LINE_MARKER_RE = re.compile(r"[#&]")
# One or more leading control codes: a backslash plus two alphanumerics,
# each optionally followed by whitespace.
_PREFIX_RE = re.compile(r"^(\\[A-Za-z0-9]{2}\s*)+")
# Trailing control code: "/" followed by any number of "%".
_SUFFIX_RE = re.compile(r"/%*$")
# The pause marker "^1".
_PAUSE_RE = re.compile(r"\^1")
# A run of punctuation that is not already preceded by "^<digit>" and that is
# followed (after optional whitespace) by a word character or "&".
_PUNCT_RE = re.compile(r"(?<!\^[0-9])([。!?\?!,\.,:]+)(?=\s*[\w&])")


def clean_text(text: str) -> str:
    """Return *text* with control markers removed.

    Steps, in order:

    1. every ``#`` or ``&`` becomes a newline;
    2. the leading run of backslash control codes is dropped;
    3. a trailing ``/`` + ``%``... code is dropped;
    4. every ``^1`` pause marker is dropped.

    NOTE(review): because step 1 runs before step 2, a ``#``/``&`` that
    directly follows the prefix has already become whitespace and is consumed
    together with the prefix; ``restore_text`` cannot bring it back.
    """
    text = _LINE_MARKER_RE.sub("\n", text)
    text = _PREFIX_RE.sub("", text)
    text = _SUFFIX_RE.sub("", text)
    return _PAUSE_RE.sub("", text)


def _insert_pause(match: "re.Match[str]") -> str:
    """Insert ``^1`` before the last character of a matched punctuation run."""
    run = match.group(1)
    return run[:-1] + "^1" + run[-1]


def restore_text(key: str, text: str, source_text: dict) -> str:
    """Re-apply control markers to cleaned *text*.

    Args:
        key: Entry key used to look up the original string in *source_text*.
        text: Cleaned text, as produced by ``clean_text``.
        source_text: Mapping of keys to the original, uncleaned strings.

    Returns:
        *text* with newlines turned back into a line marker, the original
        prefix and suffix control codes re-attached, and ``^1`` inserted
        inside qualifying punctuation runs.
    """
    original = source_text.get(key, "")

    # 1. Newline -> "#" when the original entry contains "#", otherwise "&"
    #    (also "&" when the key has no original entry).
    marker = "#" if key in source_text and "#" in original else "&"
    text = text.replace("\n", marker)

    # 2. Re-attach the leading control codes found in the original.
    prefix = _PREFIX_RE.match(original)
    if prefix:
        text = prefix.group(0) + text

    # 3. Insert the pause marker. This runs before the suffix is appended so
    #    the suffix can never influence the punctuation lookahead.
    text = _PUNCT_RE.sub(_insert_pause, text)

    # 4. Re-attach the trailing control code found in the original.
    suffix = _SUFFIX_RE.search(original)
    if suffix:
        text = text + suffix.group(0)

    return text


def process_all_jsons(original_root: str, processed_root: str, target_root: str) -> None:
    """Clean and then restore every ``.json`` file under *original_root*.

    The directory layout below *original_root* is mirrored under
    *processed_root* (cleaned files) and *target_root* (restored files).
    Each JSON file is read as a mapping whose values are passed to
    ``clean_text``.

    Args:
        original_root: Directory tree holding the source JSON files.
        processed_root: Directory tree that receives the cleaned files.
        target_root: Directory tree that receives the restored files.
    """
    for root, _, files in os.walk(original_root):
        for file_name in files:
            if not file_name.endswith(".json"):
                continue

            # Build the mirrored paths.
            original_path = os.path.join(root, file_name)
            rel_path = os.path.relpath(original_path, original_root)
            processed_path = os.path.join(processed_root, rel_path)
            target_path = os.path.join(target_root, rel_path)

            # Make sure both output directories exist.
            os.makedirs(os.path.dirname(processed_path), exist_ok=True)
            os.makedirs(os.path.dirname(target_path), exist_ok=True)

            # Load the original entries.
            with open(original_path, "r", encoding="utf-8") as f:
                original_data = json.load(f)

            # Clean every entry and save the processed file.
            cleaned_data = {k: clean_text(v) for k, v in original_data.items()}
            with open(processed_path, "w", encoding="utf-8") as f:
                json.dump(cleaned_data, f, ensure_ascii=False, indent=4)

            # Read the processed file back and restore from what is on disk.
            with open(processed_path, "r", encoding="utf-8") as f:
                processed_data = json.load(f)

            restored_data = {
                k: restore_text(k, v, original_data)
                for k, v in processed_data.items()
            }
            with open(target_path, "w", encoding="utf-8") as f:
                json.dump(restored_data, f, ensure_ascii=False, indent=4)

            print(f"✅ 处理完成: {rel_path}")


# Script entry point.
if __name__ == "__main__":
    process_all_jsons("text_original", "text_processed", "text_target")