# deltarune-weblate/04.py

import os
import json
import re


# Cleaning function
def clean_text(text):
    """Strip control codes from *text*, leaving only the readable string.

    The substitutions run in a fixed order, and the order matters: the
    newline conversion happens first, so the whitespace matched after a
    leading backslash code can also swallow a newline produced from ``#``
    or ``&``.

    1. Every ``#`` or ``&`` becomes a newline.
    2. A leading run of backslash codes (a backslash plus two alphanumeric
       characters, each optionally followed by whitespace) is dropped.
    3. A trailing ``/`` followed by any number of ``%`` is dropped.
    4. Every ``^1`` marker is removed.

    Returns the cleaned string.
    """
    substitutions = (
        (r"[#&]", "\n"),
        (r"^(\\[A-Za-z0-9]{2}\s*)+", ""),
        (r"/%*$", ""),
        (r"\^1", ""),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text


# Restore function
def restore_text(key, text, source_text):
    """Re-apply control codes to a cleaned string, using its source entry as a template.

    Args:
        key: Identifier used to look up the original string in *source_text*.
        text: Cleaned string that should receive its control codes back.
        source_text: Mapping from key to the original, uncleaned string.

    Returns:
        *text* with newlines encoded as ``#`` or ``&``, the original leading
        and trailing control codes re-attached, and ``^1`` pause markers
        inserted before punctuation.
    """
    # A missing key behaves like an empty original: no prefix, no suffix.
    reference = source_text.get(key, "")

    # Step 1: encode newlines with the separator the original used.
    # "#" wins when the original contains one; otherwise "&" is used.
    if key in source_text:
        line_break = "#" if "#" in reference else "&"
        text = re.sub(r"\n", line_break, text)
    else:
        text = text.replace("\n", "&")

    # Step 2: put back the leading run of backslash control codes, if any.
    leading = re.match(r"^(\\[A-Za-z0-9]{2}\s*)+", reference)
    if leading is not None:
        text = leading.group(0) + text

    # Step 3: insert the pause marker ^1 before the last character of each
    # punctuation run that is followed (after optional whitespace) by a word
    # character or "&", unless the run is directly preceded by "^<digit>".
    def insert_pause(found):
        run = found.group(1)
        return f"{run[:-1]}^1{run[-1]}"

    text = re.sub(
        r"(?<!\^[0-9])([。！？\?!,\.，：]+)(?=\s*[\w&])", insert_pause, text
    )

    # Step 4: put back the trailing "/" + "%"* control code, if any.
    # This runs after the pause insertion so the suffix itself is never
    # touched by the punctuation pass.
    trailing = re.search(r"/%*$", reference)
    if trailing is None:
        return text
    return text + trailing.group(0)


# File traversal and processing
def process_all_jsons(original_root, processed_root, target_root):
    """Clean and then restore every ``.json`` file found under *original_root*.

    For each JSON file, the same relative path is used in all three trees:

    * the file is read from *original_root*;
    * every value is passed through ``clean_text`` and the result is written
      under *processed_root*;
    * the processed file is read back, every value is passed through
      ``restore_text`` (with the original data as the template), and the
      result is written under *target_root*.

    Output directories are created as needed. A progress line is printed
    for each file.

    Args:
        original_root: Directory tree that holds the source JSON files.
        processed_root: Directory tree that receives the cleaned files.
        target_root: Directory tree that receives the restored files.
    """
    for current_dir, _subdirs, names in os.walk(original_root):
        json_names = [name for name in names if name.endswith(".json")]
        for name in json_names:
            # Mirror the file's location, relative to the source root,
            # into each of the three trees.
            rel_path = os.path.relpath(os.path.join(current_dir, name), original_root)
            original_path, processed_path, target_path = (
                os.path.join(base, rel_path)
                for base in (original_root, processed_root, target_root)
            )

            # Make sure both output directories exist before writing.
            for output_path in (processed_path, target_path):
                os.makedirs(os.path.dirname(output_path), exist_ok=True)

            # Load the original entries.
            with open(original_path, "r", encoding="utf-8") as src:
                original_data = json.load(src)

            # Clean every entry first, then persist the processed file.
            cleaned_data = {}
            for entry_key, raw_value in original_data.items():
                cleaned_data[entry_key] = clean_text(raw_value)
            with open(processed_path, "w", encoding="utf-8") as dst:
                json.dump(cleaned_data, dst, ensure_ascii=False, indent=4)

            # Read the processed file back and restore from what is on disk.
            with open(processed_path, "r", encoding="utf-8") as src:
                processed_data = json.load(src)

            restored_data = {}
            for entry_key, cleaned_value in processed_data.items():
                restored_data[entry_key] = restore_text(
                    entry_key, cleaned_value, original_data
                )
            with open(target_path, "w", encoding="utf-8") as dst:
                json.dump(restored_data, dst, ensure_ascii=False, indent=4)

            print(f"✅ 处理完成: {rel_path}")


# Script entry point: run the clean/restore pipeline on the default
# directory names, resolved relative to the current working directory.
if __name__ == "__main__":
    process_all_jsons(
        original_root="text_original",
        processed_root="text_processed",
        target_root="text_target",
    )