2025-05-14 15:24:25 +08:00

91 lines
2.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import json
import re
# Cleaning function
def clean_text(text):
    """Strip control markup from a raw string and normalise its line breaks.

    The following are removed or rewritten, in this order:

    1. leading backslash codes (a backslash plus two alphanumerics, each
       optionally followed by whitespace), e.g. ``\\E1 ``;
    2. a trailing ``/`` followed by any number of ``%``;
    3. every ``^1`` pause marker;
    4. every ``#`` or ``&`` line-break marker, which becomes ``"\\n"``.

    Prefix and suffix are stripped from the *raw* text, before line-break
    markers are converted.  ``restore_text`` derives the prefix/suffix it
    re-attaches from the raw original as well, so both sides must agree on
    what counts as prefix/suffix.  Converting first would let the prefix's
    ``\\s*`` (or the suffix's ``$``, which also matches just before a final
    newline) swallow a converted line break that can never be restored.

    Args:
        text: Raw string, possibly containing control markup.

    Returns:
        The cleaned string.
    """
    # 1. Leading control codes such as "\E1 " or "\E1 \F0 ".
    text = re.sub(r"^(\\[A-Za-z0-9]{2}\s*)+", "", text)
    # 2. Trailing terminator: "/", "/%", "/%%", ...
    text = re.sub(r"/%*$", "", text)
    # 3. Pause markers.
    text = re.sub(r"\^1", "", text)
    # 4. Line-break markers last, so steps 1-2 only ever see raw markup.
    return re.sub(r"[#&]", "\n", text)
# Restore function
def restore_text(key, text, source_text):
    """Re-apply the control markup of the original string to cleaned text.

    Steps, in order:

    1. Each ``"\\n"`` in ``text`` becomes a ``#`` or ``&`` line-break marker.
    2. The leading backslash codes of the original (if any) are prepended.
    3. ``^1`` is inserted before the last character of every punctuation
       run that is followed by more text (optional whitespace, then a word
       character or ``&``) and is not directly preceded by a ``^<digit>``.
    4. The trailing ``/%*`` of the original (if any) is appended.

    Args:
        key: Key of the entry being restored.
        text: Cleaned text for that entry.
        source_text: Mapping from key to the original, un-cleaned string.
            A missing key is treated like an empty original.

    Returns:
        The text with markup restored.
    """
    original = source_text.get(key, "")

    # 1. "\n" -> "#" or "&".
    # If the original mixes both markers and the number of line breaks is
    # unchanged, put each marker back in its original position.  Otherwise
    # fall back to one marker for all breaks: "#" if the original contains
    # any "#", else "&".
    markers = re.findall(r"[#&]", original)
    if len(set(markers)) > 1 and text.count("\n") == len(markers):
        remaining = iter(markers)
        text = re.sub(r"\n", lambda _m: next(remaining), text)
    else:
        text = text.replace("\n", "#" if "#" in original else "&")

    # 2. Restore the leading control codes.
    prefix = re.match(r"^(\\[A-Za-z0-9]{2}\s*)+", original)
    if prefix:
        text = prefix.group(0) + text

    # 3. Insert the pause marker ^1 before the last character of a
    #    punctuation run.
    # NOTE(review): the class previously read [。!?\?!,\.] with "!" and "?"
    # listed twice; presumably the duplicates were fullwidth forms that got
    # folded to ASCII. Both forms are spelled out with escapes here so they
    # cannot be confused again - confirm against the intended character set.
    # NOTE(review): the lookahead accepts "&" but not "#", so punctuation
    # directly before a "#" line break gets no pause - confirm intended.
    pause_pattern = (
        r"(?<!\^[0-9])"
        r"([\u3002\uff01\uff1f\uff0c!?,.]+)"
        r"(?=\s*[\w&])"
    )

    def add_pause(match):
        run = match.group(1)
        return run[:-1] + "^1" + run[-1]

    text = re.sub(pause_pattern, add_pause, text)

    # 4. Restore the trailing terminator ("/", "/%", "/%%", ...).
    suffix = re.search(r"/%*$", original)
    if suffix:
        text = text + suffix.group(0)
    return text
# Directory traversal and per-file processing
def _load_json(path):
    """Parse the UTF-8 JSON file at ``path`` and return the decoded object."""
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)


def _dump_json(payload, path):
    """Write ``payload`` to ``path`` as UTF-8 JSON (4-space indent, raw non-ASCII)."""
    with open(path, "w", encoding="utf-8") as handle:
        json.dump(payload, handle, ensure_ascii=False, indent=4)


def process_all_jsons(original_root, processed_root, target_root):
    """Clean and then restore every ``.json`` file found under ``original_root``.

    For each file, at the same relative path:

    * the cleaned entries (``clean_text`` applied to every value) are written
      under ``processed_root``;
    * that cleaned file is read back, every value is passed through
      ``restore_text`` together with the original data, and the result is
      written under ``target_root``.

    Output directories are created as needed, and one status line is printed
    per processed file.

    Args:
        original_root: Directory tree holding the source JSON files.
        processed_root: Directory tree receiving the cleaned JSON files.
        target_root: Directory tree receiving the restored JSON files.
    """
    for current_dir, _, filenames in os.walk(original_root):
        for filename in filenames:
            if filename.endswith(".json"):
                # Mirror the file's position in all three trees.
                rel_path = os.path.relpath(
                    os.path.join(current_dir, filename), original_root
                )
                source_file = os.path.join(original_root, rel_path)
                cleaned_file = os.path.join(processed_root, rel_path)
                restored_file = os.path.join(target_root, rel_path)

                # Make sure both output directories exist.
                for out_file in (cleaned_file, restored_file):
                    os.makedirs(os.path.dirname(out_file), exist_ok=True)

                # Clean every entry and persist the intermediate result.
                original_data = _load_json(source_file)
                cleaned = {}
                for entry_key, entry_value in original_data.items():
                    cleaned[entry_key] = clean_text(entry_value)
                _dump_json(cleaned, cleaned_file)

                # Restore from what is on disk, not from the in-memory dict.
                restored = {}
                for entry_key, entry_value in _load_json(cleaned_file).items():
                    restored[entry_key] = restore_text(
                        entry_key, entry_value, original_data
                    )
                _dump_json(restored, restored_file)

                print(f"✅ 处理完成: {rel_path}")
# Script entry point
if __name__ == "__main__":
    # Source tree, cleaned-output tree and restored-output tree, respectively.
    process_all_jsons(
        original_root="text_original",
        processed_root="text_processed",
        target_root="text_target",
    )