1234
This commit is contained in:
parent
3adf985a33
commit
b3267425a5
85
04.py
Normal file
85
04.py
Normal file
@ -0,0 +1,85 @@
|
||||
import os
|
||||
import json
|
||||
import re
|
||||
|
||||
|
||||
# 清洗函数
|
||||
def clean_text(text):
    """Strip control markup from *text* and return the remaining string.

    The substitutions run in this fixed order:

    1. every ``#`` or ``&`` becomes a newline;
    2. a leading run of two-character alphanumeric groups, each optionally
       wrapped in ``[`` / ``]``, is removed (prefix control codes / tags);
    3. every ``/`` followed by zero or more ``%`` and a literal ``$`` is
       removed (suffix control code);
    4. every ``^1`` pause marker is removed.

    NOTE(review): the prefix pattern in step 2 also matches ordinary leading
    letter/digit pairs (e.g. ``"Hello"`` becomes ``"o"``) -- confirm this is
    acceptable for the input data.
    """
    # (pattern, replacement) pairs; order matters because step 2 is anchored
    # at the start of the string *after* separators were turned into "\n".
    substitutions = (
        (r"[#&]", "\n"),
        (r"^(\[?[A-Za-z0-9]{2}\]?)+", ""),
        (r"/%*\$", ""),
        (r"\^1", ""),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned
|
||||
|
||||
|
||||
# 还原函数
|
||||
def restore_text(key, text, source_text):
    """Re-apply the control markup of the original string to cleaned *text*.

    Args:
        key: Dictionary key identifying the string being restored.
        text: The cleaned (possibly edited) text in which separators are
            newlines and prefix/suffix/pause markers have been removed.
        source_text: Mapping of keys to the original, un-cleaned strings.

    Returns:
        *text* with newlines turned back into ``#`` / ``&``, the original
        prefix and suffix control codes re-attached, and a ``^1`` pause
        marker inserted before punctuation.
    """
    original = source_text.get(key, "")

    # 1. \n -> # or &
    if key in source_text:
        separators = re.findall(r"[#&]", original)
        if len(separators) == text.count("\n"):
            # Same number of line breaks as in the original: put back each
            # separator in its original order so mixed "#"/"&" strings
            # survive the round trip unchanged.
            remaining = iter(separators)
            text = re.sub(r"\n", lambda _match: next(remaining), text)
        else:
            # Line count changed: fall back to a single separator, "#" when
            # the original used it anywhere, otherwise "&".
            text = text.replace("\n", "#" if "#" in original else "&")
    else:
        text = text.replace("\n", "&")

    # 2. Restore the prefix control codes / tags found at the start of the
    #    original string.
    match_prefix = re.match(r"^(\[?[A-Za-z0-9]{2}\]?)+", original)
    if match_prefix:
        text = match_prefix.group(0) + text

    # 3. Restore the suffix control code ("/", optional "%"s, literal "$")
    #    found at the very end of the original string.
    match_suffix = re.search(r"/%*\$$", original)
    if match_suffix:
        text = text + match_suffix.group(0)

    # 4. Insert the pause marker ^1 before each listed punctuation mark or
    #    "..." unless a ^1 is already directly in front of it.
    text = re.sub(r"(?<!\^1)([,。!?)】,!?]|\.\.\.)", r"^1\1", text)

    return text
|
||||
|
||||
|
||||
# 文件遍历和处理
|
||||
def process_all_jsons(original_root, processed_root, target_root):
    """Clean and then restore every ``.json`` file found under *original_root*.

    For each JSON file (searched recursively) the relative path is mirrored
    under *processed_root* and *target_root*:

    * the cleaned mapping (``clean_text`` applied to every value) is written
      to the processed tree;
    * that processed file is read back, each value is passed through
      ``restore_text`` together with the original mapping, and the result is
      written to the target tree.

    A confirmation line is printed for each file handled.
    """
    for folder, _, names in os.walk(original_root):
        for name in (n for n in names if n.endswith(".json")):
            # Build the three mirrored paths from one relative path.
            rel_path = os.path.relpath(os.path.join(folder, name), original_root)
            original_path, processed_path, target_path = (
                os.path.join(base, rel_path)
                for base in (original_root, processed_root, target_root)
            )

            # Make sure both output directories exist before any file I/O.
            for out_path in (processed_path, target_path):
                os.makedirs(os.path.dirname(out_path), exist_ok=True)

            # Load the original file.
            with open(original_path, "r", encoding="utf-8") as src:
                original_data = json.load(src)

            # Clean every value and save the processed file.
            cleaned_data = {}
            for entry_key, entry_value in original_data.items():
                cleaned_data[entry_key] = clean_text(entry_value)
            with open(processed_path, "w", encoding="utf-8") as dst:
                json.dump(cleaned_data, dst, ensure_ascii=False, indent=2)

            # Read the processed file back and restore from it.
            with open(processed_path, "r", encoding="utf-8") as src:
                processed_data = json.load(src)

            restored_data = {}
            for entry_key, entry_value in processed_data.items():
                restored_data[entry_key] = restore_text(
                    entry_key, entry_value, original_data
                )
            with open(target_path, "w", encoding="utf-8") as dst:
                json.dump(restored_data, dst, ensure_ascii=False, indent=2)

            print(f"✅ 处理完成: {rel_path}")
|
||||
|
||||
|
||||
# 运行入口
|
||||
if __name__ == "__main__":
    # Script entry point: run the clean/restore pipeline on the default
    # directory layout relative to the current working directory.
    process_all_jsons(
        original_root="text_original",
        processed_root="text_processed",
        target_root="text_target",
    )
|
6244
text_processed/ch1/en.json
Normal file
6244
text_processed/ch1/en.json
Normal file
File diff suppressed because it is too large
Load Diff
6244
text_processed/ch1/zh_CN.json
Normal file
6244
text_processed/ch1/zh_CN.json
Normal file
File diff suppressed because it is too large
Load Diff
13093
text_processed/ch2/en.json
Normal file
13093
text_processed/ch2/en.json
Normal file
File diff suppressed because it is too large
Load Diff
13093
text_processed/ch2/zh_CN.json
Normal file
13093
text_processed/ch2/zh_CN.json
Normal file
File diff suppressed because it is too large
Load Diff
6244
text_target/ch1/en.json
Normal file
6244
text_target/ch1/en.json
Normal file
File diff suppressed because it is too large
Load Diff
6244
text_target/ch1/zh_CN.json
Normal file
6244
text_target/ch1/zh_CN.json
Normal file
File diff suppressed because it is too large
Load Diff
13093
text_target/ch2/en.json
Normal file
13093
text_target/ch2/en.json
Normal file
File diff suppressed because it is too large
Load Diff
13093
text_target/ch2/zh_CN.json
Normal file
13093
text_target/ch2/zh_CN.json
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user