1234

2025-05-14 12:41:38 +08:00 · 2025-05-14 12:41:38 +08:00 · b3267425a5
commit b3267425a5
parent 3adf985a33
9 changed files with 77433 additions and 0 deletions
--- a/04.py
+++ b/04.py
@ -0,0 +1,85 @@
+import os
+import json
+import re
+
+
+# Cleaning function
+# Substitutions applied by clean_text, in this exact order.
+_CLEANUP_RULES = (
+    # '#' and '&' both become a real newline.
+    (r"[#&]", "\n"),
+    # Leading run of two-character alphanumeric codes, each optionally
+    # wrapped in square brackets (anchored to the start of the string).
+    (r"^(\[?[A-Za-z0-9]{2}\]?)+", ""),
+    # '/', any number of '%', then a literal '$'.
+    (r"/%*\$", ""),
+    # Every '^1' marker.
+    (r"\^1", ""),
+)
+
+
+def clean_text(text):
+    """Strip control markup from one string.
+
+    The substitutions run one after another in the order given by
+    ``_CLEANUP_RULES``, so the prefix rule sees the text after ``#`` and
+    ``&`` have already been turned into newlines.
+
+    Args:
+        text: The raw string to clean.
+
+    Returns:
+        The string with ``#`` / ``&`` replaced by newlines and the prefix
+        codes, ``/%*$`` sequences and ``^1`` markers removed.
+    """
+    for pattern, replacement in _CLEANUP_RULES:
+        text = re.sub(pattern, replacement, text)
+    return text
+
+
+# Restore function
+def restore_text(key, text, source_text):
+    """Re-apply the control markup of the original entry to cleaned text.
+
+    Args:
+        key: Key of the entry being restored.
+        text: The cleaned text (newlines instead of ``#`` / ``&``, no
+            prefix codes, no suffix, no ``^1`` markers).
+        source_text: Mapping of key -> original, un-cleaned string. A
+            missing key is treated like an empty original.
+
+    Returns:
+        ``text`` with newlines turned back into ``#`` / ``&``, the
+        original's prefix codes and ``/%*$`` suffix re-attached, and a
+        ``^1`` marker inserted before punctuation.
+    """
+    original = source_text.get(key, "")
+
+    # 1. \n -> # or &
+    # Reuse the original's separators in order, so an entry that mixes '#'
+    # and '&' round-trips unchanged instead of collapsing to a single kind.
+    # Newlines beyond the original's separator count fall back to '#' when
+    # the original used '#' at all, otherwise '&'.
+    separators = re.findall(r"[#&]", original)
+    fallback = "#" if "#" in separators else "&"
+    remaining = iter(separators)
+    text = re.sub(r"\n", lambda _match: next(remaining, fallback), text)
+
+    # 2. Restore the leading control codes / tags
+    match_prefix = re.match(r"^(\[?[A-Za-z0-9]{2}\]?)+", original)
+    if match_prefix:
+        text = match_prefix.group(0) + text
+
+    # 3. Restore the trailing control sequence /%*$
+    match_suffix = re.search(r"/%*\$$", original)
+    if match_suffix:
+        text = text + match_suffix.group(0)
+
+    # 4. Insert the pause marker ^1 before punctuation; the lookbehind
+    #    skips punctuation that is already preceded by ^1.
+    text = re.sub(r"(?<!\^1)([，。！？）】,!?]|\.\.\.)", r"^1\1", text)
+
+    return text
+
+
+# Walk the input tree and process every JSON file
+def process_all_jsons(original_root, processed_root, target_root):
+    """Clean and then restore every ``.json`` file under ``original_root``.
+
+    For each file, the cleaned mapping is written to the same relative
+    path under ``processed_root``; that file is then read back, restored
+    against the original data, and written under ``target_root``.
+
+    Non-string values are copied through unchanged in both stages, since
+    the regex-based helpers only accept strings.
+
+    Args:
+        original_root: Directory tree containing the source JSON files.
+        processed_root: Output directory for the cleaned files.
+        target_root: Output directory for the restored files.
+    """
+    for root, _, files in os.walk(original_root):
+        for file in files:
+            if not file.endswith(".json"):
+                continue
+
+            # Build the paths
+            rel_path = os.path.relpath(os.path.join(root, file), original_root)
+            original_path = os.path.join(original_root, rel_path)
+            processed_path = os.path.join(processed_root, rel_path)
+            target_path = os.path.join(target_root, rel_path)
+
+            # Create the output directories
+            os.makedirs(os.path.dirname(processed_path), exist_ok=True)
+            os.makedirs(os.path.dirname(target_path), exist_ok=True)
+
+            # Read the original file
+            with open(original_path, "r", encoding="utf-8") as f:
+                original_data = json.load(f)
+
+            # Clean and save the processed file
+            cleaned_data = {
+                k: clean_text(v) if isinstance(v, str) else v
+                for k, v in original_data.items()
+            }
+            with open(processed_path, "w", encoding="utf-8") as f:
+                json.dump(cleaned_data, f, ensure_ascii=False, indent=2)
+
+            # Read the processed file back and restore from it
+            with open(processed_path, "r", encoding="utf-8") as f:
+                processed_data = json.load(f)
+
+            restored_data = {
+                k: restore_text(k, v, original_data) if isinstance(v, str) else v
+                for k, v in processed_data.items()
+            }
+            with open(target_path, "w", encoding="utf-8") as f:
+                json.dump(restored_data, f, ensure_ascii=False, indent=2)
+
+            print(f"✅ 处理完成: {rel_path}")
+
+
+# Script entry point
+if __name__ == "__main__":
+    process_all_jsons(
+        original_root="text_original",
+        processed_root="text_processed",
+        target_root="text_target",
+    )
--- a/text_processed/ch1/en.json
+++ b/text_processed/ch1/en.json
--- a/text_processed/ch1/zh_CN.json
+++ b/text_processed/ch1/zh_CN.json
--- a/text_processed/ch2/en.json
+++ b/text_processed/ch2/en.json
--- a/text_processed/ch2/zh_CN.json
+++ b/text_processed/ch2/zh_CN.json
--- a/text_target/ch1/en.json
+++ b/text_target/ch1/en.json
--- a/text_target/ch1/zh_CN.json
+++ b/text_target/ch1/zh_CN.json
--- a/text_target/ch2/en.json
+++ b/text_target/ch2/en.json
--- a/text_target/ch2/zh_CN.json
+++ b/text_target/ch2/zh_CN.json