import easyocr
from pypinyin import pinyin
from PIL import Image, ImageDraw, ImageFont
import os, time, logging, torch, subprocess
from helpers.translation import init_M2M, translate_M2M
import langid
import numpy as np

##### Variables to edit

text_color = "#ff0000"
font_file = "/home/James/.local/share/fonts/Arial-Unicode-Bold.ttf"
font_size = 16

# Paths for the raw screenshot and the annotated output image.
# NOTE: these names are used below but were never defined in the original script;
# the paths here are assumed placeholders -- adjust them to your setup.
image_old = "/tmp/ocr_screenshot.png"
image_new = "/tmp/ocr_translated.png"

pyin = True  # whether to add pinyin or not
max_translate = 100  # maximum number of OCR regions to translate per pass

# Language-detection filter: OCR'd text is only kept (and drawn) when langid classifies it as src_lang
src_lang = "zh"
tgt_lang = "en"
# languages supported by langid:
# af, am, an, ar, as, az, be, bg, bn, br, bs, ca, cs, cy, da, de, dz, el, en, eo, es, et, eu, fa, fi, fo, fr, ga, gl, gu, he, hi, hr, ht, hu, hy, id, is, it, ja, jv, ka, kk, km, kn, ko, ku, ky, la, lb, lo, lt, lv, mg, mk, ml, mn, mr, ms, mt, nb, ne, nl, nn, no, oc, or, pa, pl, ps, pt, qu, ro, ru, rw, se, si, sk, sl, sq, sr, sv, sw, ta, te, th, tl, tr, ug, uk, ur, vi, vo, wa, xh, zh, zu
langid.set_languages([src_lang, tgt_lang, 'en'])  # restrict langid to these candidates
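# Illustrative example of what langid.classify returns (a (language, score) tuple;
# the exact score will vary):
#   langid.classify("你好，世界")  ->  ('zh', -23.7)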

# for translator (M2M100)
from_lang = "zh"
target_lang = "en"

# languages supported by M2M100:
# Afrikaans (af), Amharic (am), Arabic (ar), Asturian (ast), Azerbaijani (az), Bashkir (ba), Belarusian (be), Bulgarian (bg), Bengali (bn), Breton (br), Bosnian (bs), Catalan; Valencian (ca), Cebuano (ceb), Czech (cs), Welsh (cy), Danish (da), German (de), Greek (el), English (en), Spanish (es), Estonian (et), Persian (fa), Fulah (ff), Finnish (fi), French (fr), Western Frisian (fy), Irish (ga), Gaelic; Scottish Gaelic (gd), Galician (gl), Gujarati (gu), Hausa (ha), Hebrew (he), Hindi (hi), Croatian (hr), Haitian; Haitian Creole (ht), Hungarian (hu), Armenian (hy), Indonesian (id), Igbo (ig), Iloko (ilo), Icelandic (is), Italian (it), Japanese (ja), Javanese (jv), Georgian (ka), Kazakh (kk), Central Khmer (km), Kannada (kn), Korean (ko), Luxembourgish; Letzeburgesch (lb), Ganda (lg), Lingala (ln), Lao (lo), Lithuanian (lt), Latvian (lv), Malagasy (mg), Macedonian (mk), Malayalam (ml), Mongolian (mn), Marathi (mr), Malay (ms), Burmese (my), Nepali (ne), Dutch; Flemish (nl), Norwegian (no), Northern Sotho (ns), Occitan (post 1500) (oc), Oriya (or), Panjabi; Punjabi (pa), Polish (pl), Pushto; Pashto (ps), Portuguese (pt), Romanian; Moldavian; Moldovan (ro), Russian (ru), Sindhi (sd), Sinhala; Sinhalese (si), Slovak (sk), Slovenian (sl), Somali (so), Albanian (sq), Serbian (sr), Swati (ss), Sundanese (su), Swedish (sv), Swahili (sw), Tamil (ta), Thai (th), Tagalog (tl), Tswana (tn), Turkish (tr), Ukrainian (uk), Urdu (ur), Uzbek (uz), Vietnamese (vi), Wolof (wo), Xhosa (xh), Yiddish (yi), Yoruba (yo), Chinese (zh), Zulu (zu)
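# Illustrative sketch of how translate_M2M is used below (helpers.translation is a
# project-local helper, so the exact return value is assumed): it takes a list of
# strings and returns the translations in the same order, e.g.
#   translate_M2M(["你好"], from_lang="zh", target_lang="en")  ->  ["Hello"]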

# for easyOCR
OCR_languages = ['ch_sim','en'] # languages to recognise
# https://www.jaided.ai/easyocr/

log_directory = '/var/log/ocr'
printsc = lambda x: subprocess.run(f"grim -t png -o DP-1 -l 0 {x}", shell=True)
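# The line above screenshots the display with grim (a screenshot tool for
# wlroots-based Wayland compositors): -t png sets the format, -o DP-1 captures only
# that output/monitor, -l 0 sets the PNG compression level, and the argument passed
# to printsc is the file to write. Adjust "DP-1" to the name of your own output.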

# Configure the logger
os.makedirs(log_directory, exist_ok=True)

logging.basicConfig(
    filename=os.path.join(log_directory, 'ocr.log'),
    level=logging.DEBUG,                 # set the logging level
    format='%(asctime)s - %(message)s',  # define the format for logging
    datefmt='%Y-%m-%d %H:%M:%S'          # define the date format
)
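# With the format above, each line in /var/log/ocr/ocr.log looks like
# (illustrative example):  2024-01-01 12:00:00 - Successfully translated image. ...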

# screenshot
printsc(image_old)
time.sleep(1)

# EasyOCR
reader = easyocr.Reader(OCR_languages) # this needs to run only once to load the model into memory
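# Note: the first Reader() call downloads the detection/recognition models if they
# are not cached yet, and it uses the GPU when one is available (pass gpu=False to
# force CPU) -- see the easyOCR documentation linked above.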

def results():
    result = reader.readtext(image_old)
    # keep only the entries whose text langid classifies as src_lang
    results_no_eng = [entry for entry in result if langid.classify(entry[1])[0] == src_lang]
    return results_no_eng

# reader.readtext() returns a list of entries with the structure:
#   ([top_left, top_right, bottom_right, bottom_left], text, confidence)
# where the first element is the bounding box given as four (x, y) corner points
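# Illustrative example of a single entry (coordinates and values are made up):
#   ([[10, 20], [120, 20], [120, 48], [10, 48]], '你好世界', 0.97)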
ocr_output = results()
curr_words = set(entry[1] for entry in ocr_output)
prev_words = set()

# translator = GoogleTranslator(source=from_language, target=target_language)

font = ImageFont.truetype(font_file, font_size)

# check whether two one-dimensional intervals overlap (used below to detect
# overlapping label boxes along one axis)
def intercepts(x, y):
    # both x and y are two-element tuples giving the endpoints of an interval on one axis
    x1, x2 = x
    y1, y2 = y
    return (x1 <= y1 <= x2) or (x1 <= y2 <= x2) or (y1 <= x1 <= y2) or (y1 <= x2 <= y2)
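# Illustrative usage of intercepts():
#   intercepts((0, 10), (5, 20))   -> True   (the intervals overlap)
#   intercepts((0, 10), (11, 20))  -> False  (they do not)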

while True:
    print('Running')
    if prev_words != curr_words:
        print('Translating')
        image = Image.open(image_old)
        draw = ImageDraw.Draw(image)
        # translate at most max_translate of the recognised text snippets in one batch
        to_translate = [entry[1] for entry in ocr_output][:max_translate]
        translation = translate_M2M(to_translate, from_lang=from_lang, target_lang=target_lang)
        # counter for limiting the number of translations drawn onto the image
        translated_number = 0
        bounding_boxes = []
        for i, (position, words, confidence) in enumerate(ocr_output):
            if translated_number >= max_translate:
                break
            word = translation[i]
            # try:
            top_left, _, _, _ = position
            # place the label block just above the recognised text
            position = (top_left[0], top_left[1] - 60)
            if pyin:
                py = ' '.join([p[0] for p in pinyin(words)])
                text_content = f"{word}\n{py}\n{words}"
            else:
                text_content = f"{word}\n{words}"
            lines = text_content.split('\n')
            x, y = position

            max_width = 0
            total_height = 0
            line_spacing = 3
            line_height = font_size

            # measure the widest line and the total height of the label block
            for line in lines:
                bbox = draw.textbbox(position, line, font=font)
                line_width = bbox[2] - bbox[0]
                max_width = max(max_width, line_width)
                total_height += line_height + line_spacing

            bounding_box = (x, y, x + max_width, y + total_height, words)
            print(f"Bounding Box of Interest: {bounding_box}")

            # clamp to the top of the image, then push the label below any
            # previously drawn label box that it overlaps with
            y = np.max([y, 0])
            if len(bounding_boxes) > 0:
                for box in bounding_boxes:
                    print(f'Investigating box: {box}')
                    if intercepts((box[0], box[2]), (bounding_box[0], bounding_box[2])) and intercepts((box[1], box[3]), (y, y + total_height)):
                        print(f'Overlapping, adjusting placement for {words}')
                        y = np.max([y, box[3]]) + line_spacing
                        print(y, box[3])
                        print(f'Changed to {(x, y, x + max_width, y + total_height, words)}')
            adjusted_bounding_box = (x, y, x + max_width, y + total_height, words)
            bounding_boxes.append(adjusted_bounding_box)
            draw.rectangle([(x, y), (x + max_width, y + total_height)], outline="black", width=1)
            position = (x, y)
            for line in lines:
                draw.text(position, line, fill=text_color, font=font)
                y += font_size + line_spacing
                position = (x, y)
            print("Adjusted_bounding_box:", adjusted_bounding_box)
            print('\n')
            # except Exception as e:
            #     logging.error(e)
            translated_number += 1
        image.save(image_new)
        logging.info(f"Saved the image to {image_new}")
        prev_words = curr_words
        logging.info(f"Successfully translated image. Prev words are:\n{prev_words}")
    else:
        logging.info("The image has remained the same.")
    torch.cuda.empty_cache()
    print('Sleeping')
    time.sleep(10)

    # take a fresh screenshot and re-run OCR so the next iteration can detect changes
    printsc(image_old)
    ocr_output = results()
    curr_words = set(entry[1] for entry in ocr_output)
    logging.info(f'Curr words are:\n{curr_words}')