# Listing metadata from the file viewer: 210 lines, 6.1 KiB, Python.
import os
|
|
import json, re
|
|
from PIL import Image
|
|
import easyocr
|
|
import numpy as np
|
|
import pytesseract
|
|
|
|
from collections import OrderedDict
|
|
from paddleocr import PaddleOCR
|
|
|
|
# Root directory that main() walks looking for chapter folders.
COMIC_ROOT = "comic"
|
|
# Directory under which main() writes one <chapter>/OCR2.json per chapter.
TEXT_OUTPUT_DIR = "text2"
|
|
|
|
# easyocr reader for English with GPU enabled, created at import time.
# NOTE(review): in the visible source it is referenced only by the
# commented-out easyocr variant of ocr_text -- confirm it is still needed.
reader = easyocr.Reader(["en"], gpu=True)
|
|
|
|
|
|
def get_last_frame_gif(path):
    """Return the last frame of the animated image at ``path`` in RGB mode.

    Args:
        path: Filesystem path of the GIF to open.

    Returns:
        A new RGB ``PIL.Image.Image`` holding the final frame.
    """
    # Open through a context manager so the file handle is released as soon
    # as the frame has been copied out; ``convert`` returns an independent
    # image, so it stays valid after the source file is closed.
    with Image.open(path) as gif:
        try:
            # Step forward one frame at a time until Pillow reports that the
            # sequence is exhausted.
            while True:
                gif.seek(gif.tell() + 1)
        except EOFError:
            # Seeking past the final frame raises EOFError; the image is then
            # positioned on the last frame.
            pass
        return gif.convert("RGB")
|
|
|
|
|
|
def load_image(path):
    """Load the image at ``path`` as an RGB image.

    Files whose name ends in ``.gif`` (case-insensitive) are reduced to their
    last frame via ``get_last_frame_gif``; every other file is opened
    directly.

    Args:
        path: Filesystem path of the image.

    Returns:
        An RGB ``PIL.Image.Image``.
    """
    if path.lower().endswith(".gif"):
        return get_last_frame_gif(path)
    # ``convert`` yields an independent in-memory image, so the file handle
    # can be closed right away instead of being left to the garbage collector.
    with Image.open(path) as img:
        return img.convert("RGB")
|
|
|
|
|
|
def get_color(img, x, y):
    """Return the pixel value of ``img`` at column ``x``, row ``y``.

    Args:
        img: Object exposing ``getpixel((x, y))``.
        x: Horizontal pixel coordinate.
        y: Vertical pixel coordinate.

    Returns:
        Whatever ``img.getpixel`` returns for that coordinate.
    """
    coords = (x, y)
    return img.getpixel(coords)
|
|
|
|
|
|
def is_color(c, hex_code):
    """Tell whether pixel value ``c`` equals the colour named by ``hex_code``.

    Args:
        c: Pixel value to compare, e.g. an ``(r, g, b)`` tuple.
        hex_code: Colour written as ``"#rrggbb"``; the character at index 0
            is skipped and the next six are read as three hex byte pairs.

    Returns:
        The result of comparing ``c`` with the decoded ``(r, g, b)`` tuple.
    """
    red = int(hex_code[1:3], 16)
    green = int(hex_code[3:5], 16)
    blue = int(hex_code[5:7], 16)
    return c == (red, green, blue)
|
|
|
|
|
|
def check_diag(img, x1, y1, x2, y2, color_hex):
    """Check that two pixels of ``img`` both have the colour ``color_hex``.

    Args:
        img: Image to sample.
        x1, y1: Coordinates of the first pixel.
        x2, y2: Coordinates of the second pixel.
        color_hex: Expected colour as ``"#rrggbb"``.

    Returns:
        True only if both sampled pixels match; the second pixel is not read
        when the first one already differs.
    """
    if not is_color(get_color(img, x1, y1), color_hex):
        return False
    return is_color(get_color(img, x2, y2), color_hex)
|
|
|
|
|
|
def is_battle_type(img):
    """Decide whether ``img`` is a battle screen.

    Four fixed pixels are sampled; the image counts as a battle screen when
    every one of them is either ``#ff7f27`` or ``#ffff00``.

    Args:
        img: Image to inspect; must cover the probed coordinates.

    Returns:
        True if all four probe pixels carry one of the two marker colours.
    """
    marker_hexes = ("#ff7f27", "#ffff00")
    probe_points = ((32, 473), (185, 432), (454, 473), (609, 432))
    for px, py in probe_points:
        pixel = get_color(img, px, py)
        if not any(is_color(pixel, marker) for marker in marker_hexes):
            return False
    return True
|
|
|
|
|
|
def has_textbox(img, base_y_offset):
    """Probe ``img`` for a regular textbox whose top edge is at ``base_y_offset``.

    Three pixel pairs are checked in order: a white pair near the top-left
    corner, a white pair near the bottom-right corner, and a black pair just
    inside the top-left corner (presumably the box border and its fill).

    Args:
        img: Image to inspect.
        base_y_offset: Row of the textbox's top edge.

    Returns:
        True if all three pairs have the expected colours.
    """
    top = base_y_offset
    probes = (
        (32, top, 37, top + 5, "#ffffff"),
        (604, top + 146, 609, top + 151, "#ffffff"),
        (38, top + 6, 42, top + 10, "#000000"),
    )
    return all(check_diag(img, *probe) for probe in probes)
|
|
|
|
|
|
def has_battle_textbox(img, base_y_offset):
    """Probe ``img`` for a battle textbox whose top edge is at ``base_y_offset``.

    Same scheme as the regular textbox probe but with the battle layout's
    coordinates: two white pixel pairs near opposite corners and one black
    pair just inside the top-left corner.

    Args:
        img: Image to inspect.
        base_y_offset: Row of the textbox's top edge.

    Returns:
        True if all three pairs have the expected colours.
    """
    top = base_y_offset
    probes = (
        (32, top, 36, top + 4, "#ffffff"),
        (602, top + 135, 606, top + 139, "#ffffff"),
        (37, top + 5, 41, top + 9, "#000000"),
    )
    return all(check_diag(img, *probe) for probe in probes)
|
|
|
|
|
|
def check_avatar_absent(img, y1, y2):
    """Report whether the avatar area of a textbox is empty.

    The area is treated as empty when, on both rows ``y1`` and ``y2``, every
    pixel in columns 60..144 is pure black ``(0, 0, 0)``.

    Args:
        img: Image to inspect.
        y1: First row to scan.
        y2: Second row to scan.

    Returns:
        True if both scanned row segments are entirely black.
    """
    black = (0, 0, 0)
    columns = range(60, 145)
    for row in (y1, y2):
        for col in columns:
            if get_color(img, col, row) != black:
                return False
    return True
|
|
|
|
|
|
def crop_textbox(img, is_battle, has_avatar, base_y_offset):
    """Crop the text area out of a textbox.

    Args:
        img: Image exposing ``crop((left, top, right, bottom))``.
        is_battle: Selects the battle layout's insets instead of the regular
            ones.
        has_avatar: When true the left edge moves to column 160 so the avatar
            is excluded, regardless of layout.
        base_y_offset: Row of the textbox's top edge.

    Returns:
        The result of ``img.crop`` for the computed box.
    """
    if is_battle:
        plain_left, top_inset, bottom_inset, right = 39, 7, 132, 600
    else:
        plain_left, top_inset, bottom_inset, right = 40, 8, 143, 602
    left = 160 if has_avatar else plain_left
    box = (left, base_y_offset + top_inset, right, base_y_offset + bottom_inset)
    return img.crop(box)
|
|
|
|
|
|
# Disabled alternative: OCR through the easyocr `reader` instead of PaddleOCR.
# def ocr_text(img_region):
#     np_img = np.array(img_region)
#     result = reader.readtext(np_img, detail=0)
#     return " ".join(result).strip()
|
|
|
|
|
|
# PaddleOCR engine used by ocr_text; created once at import time, configured
# for English with the angle classifier enabled.
ocr = PaddleOCR(use_angle_cls=True, lang="en")
|
|
|
|
|
|
def ocr_text(img_region):
    """Run the module-level PaddleOCR engine on ``img_region`` and join the text.

    Args:
        img_region: Image region; it is converted with ``np.array`` before
            being handed to the engine.

    Returns:
        The recognised fragments of the first result page joined by single
        spaces and stripped, or ``""`` when the engine returns nothing.
    """
    pages = ocr.ocr(np.array(img_region))
    if not pages or not pages[0]:
        return ""
    # Each detection's [1][0] is taken as its text -- presumably the
    # (box, (text, confidence)) layout; confirm against the installed
    # PaddleOCR version.
    fragments = [detection[1][0] for detection in pages[0]]
    return " ".join(fragments).strip()
|
|
|
|
|
|
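# Disabled alternative: OCR through pytesseract (page segmentation mode 6);
# every "=" in the recognised text is replaced with "*".
# def ocr_text(img_region):
#     text = pytesseract.image_to_string(img_region, config="--psm 6")
#     return text.strip().replace("=", "*")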
|
|
|
|
|
|
def describe_result(file, pos, avatar, battle_type):
    """Build the dictionary key that labels one OCR result.

    Args:
        file: Image file name.
        pos: Textbox position label.
        avatar: Avatar presence label.
        battle_type: Screen type label.

    Returns:
        ``file``, a hyphen, then the three labels concatenated with no
        separator.
    """
    return "{}-{}{}{}".format(file, pos, avatar, battle_type)
|
|
|
|
|
|
def process_image(path):
    """Extract the textbox text from one comic image.

    The image is classified as a battle or non-battle screen, the matching
    textbox is located by pixel probes, cropped and passed to OCR.

    Args:
        path: Filesystem path of the image.

    Returns:
        A one-item dict mapping a descriptive key (see ``describe_result``)
        to the recognised text, or ``None`` when no textbox is found or OCR
        yields no text.
    """
    img = load_image(path)
    file = os.path.basename(path)
    if is_battle_type(img):
        return _battle_entry(img, file)
    return _dialog_entry(img, file)


def _battle_entry(img, file):
    """Return the OCR entry for a battle screen's textbox, or None."""
    base_y = 250
    if not has_battle_textbox(img, base_y):
        return None
    region = crop_textbox(
        img, is_battle=True, has_avatar=False, base_y_offset=base_y
    )
    text = ocr_text(region)
    if not text:
        return None
    return {describe_result(file, "战斗文本框", "无头像", "战斗"): text}


def _dialog_entry(img, file):
    """Return the OCR entry for a non-battle screen's textbox, or None."""
    # (position label, textbox top row, the two rows scanned for an avatar)
    candidates = (
        ("顶部文本框", 10, (60, 100)),
        ("底部文本框", 320, (370, 410)),
    )
    for label, base_y, (row_a, row_b) in candidates:
        if not has_textbox(img, base_y):
            continue
        avatar_absent = check_avatar_absent(img, row_a, row_b)
        region = crop_textbox(
            img,
            is_battle=False,
            has_avatar=not avatar_absent,
            base_y_offset=base_y,
        )
        text = ocr_text(region)
        if not text:
            # The first box that is present decides the outcome: an empty
            # top box does not fall through to the bottom one.
            return None
        key = describe_result(
            file,
            label,
            "无头像" if avatar_absent else "有头像",
            "非战斗",
        )
        return {key: text}
    return None
|
|
|
|
|
|
def main():
    """OCR every chapter under ``COMIC_ROOT`` and write one JSON file each.

    For each directory whose name starts with ``"Ch"``, all ``.png`` and
    ``.gif`` files are processed in sorted order. The collected entries are
    ordered by the first number in their key and written to
    ``TEXT_OUTPUT_DIR/<chapter>/OCR2.json``. Chapters whose output file
    already exists are skipped, and nothing is written when no text is found.
    """
    os.makedirs(TEXT_OUTPUT_DIR, exist_ok=True)

    def extract_number(s):
        # Sort key: first run of digits in the key; keys without digits go last.
        match = re.search(r"(\d+)", s)
        return int(match.group(1)) if match else float("inf")

    for root, _dirs, files in os.walk(COMIC_ROOT):
        chapter = os.path.basename(root)
        if not chapter.startswith("Ch"):
            continue

        chapter_dir = os.path.join(TEXT_OUTPUT_DIR, chapter)
        out_path = os.path.join(chapter_dir, "OCR2.json")
        # Skip chapters whose output file already exists.
        if os.path.exists(out_path):
            continue

        output = {}
        for file in sorted(files):
            if not file.lower().endswith((".png", ".gif")):
                continue
            entry = process_image(os.path.join(root, file))
            if entry:
                output.update(entry)

        if not output:
            continue

        sorted_output = OrderedDict(
            sorted(output.items(), key=lambda item: extract_number(item[0]))
        )
        os.makedirs(chapter_dir, exist_ok=True)
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(sorted_output, f, ensure_ascii=False, indent=2)

        # The path is formatted from a variable: reusing double quotes inside
        # an f-string replacement field is a SyntaxError before Python 3.12.
        # NOTE(review): reported only when a file was written -- confirm this
        # matches the intended placement.
        print(f"✅ 完成 {out_path}")
|
|
|
|
|
|
# Run the extraction only when this file is executed as a script.
if __name__ == "__main__":
    main()
|