198 lines
6.7 KiB
Python
198 lines
6.7 KiB
Python
import re, uroman as ur
|
|
from pypinyin import pinyin
|
|
import pyscreenshot as ImageGrab # wayland tings not sure if it will work on other machines alternatively use mss
|
|
import mss, io, os
|
|
from PIL import Image
|
|
import jaconv, MeCab, unidic, pykakasi
|
|
from sklearn.metrics.pairwise import cosine_similarity
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
import numpy as np
|
|
import subprocess
|
|
# for creating furigana

# MeCab tokenizer pointed at the bundled UniDic dictionary; add_furigana()
# relies on its tab/comma-separated output to get per-token readings.
mecab = MeCab.Tagger('-d "{}"'.format(unidic.DICDIR))

# Universal romanizer — fallback transliteration for languages that are not
# handled specially in romanize() (i.e. anything other than Chinese/Japanese).
uroman = ur.Uroman()

# for romanising japanese text. Can convert to hiragana or katakana as well but does not split the words up so harder to use for furigana
kks = pykakasi.kakasi()
|
|
|
|
|
|
# define a function for checking whether one axis of a shape intercepts with another
|
|
def intercepts(x,y):
    """Return True when two 1-D extents overlap.

    Both arguments are two-element tuples giving the low/high ends of a
    segment projected onto a single axis.  Overlap (including mere touching
    at an endpoint) counts as an interception.
    """
    x_lo, x_hi = x
    y_lo, y_hi = y
    # The segments overlap iff some endpoint of one lies inside the other.
    return (any(x_lo <= p <= x_hi for p in (y_lo, y_hi))
            or any(y_lo <= p <= y_hi for p in (x_lo, x_hi)))
|
|
|
|
def is_wayland():
    """Return True when the current session exposes a Wayland display socket."""
    return os.environ.get('WAYLAND_DISPLAY') is not None
|
|
|
|
# please install grim otherwise this is way too slow for wayland
def printsc_wayland(region: tuple, path: str):
    """Capture *region* with grim and write it as a quality-90 JPEG to *path*.

    ``region`` is (left, top, right, bottom) in screen coordinates; grim wants
    "x,y WxH", so the width/height are derived from the corner pair.
    """
    left, top, right, bottom = region
    geometry = f'{left},{top} {right - left}x{bottom - top}'
    subprocess.run(['grim', '-g', geometry, '-t', 'jpeg', '-q', '90', path])
|
|
|
|
def printsc_non_wayland(region: tuple, path: str):
    """Capture *region* with mss on X11/other backends and save it to *path*.

    mss returns raw BGRA pixels; they are wrapped into a PIL RGB image
    (ignoring the padding byte via the "BGRX" raw mode) before saving.
    """
    with mss.mss() as sct:
        shot = sct.grab(region)
        pil_img = Image.frombytes("RGB", shot.size, shot.bgra, "raw", "BGRX")
        pil_img.save(path)
|
|
|
|
|
|
def printsc(region: tuple, path: str):
    """Screenshot *region* to *path*, picking the backend for this session.

    Wayland sessions go through grim (fast); everything else uses mss.
    Failures are reported to stdout rather than raised — capture is
    best-effort.
    """
    try:
        capture = printsc_wayland if is_wayland() else printsc_non_wayland
        capture(region, path)
    except Exception as e:
        print(f'Error {e}')
|
|
|
|
def convert_image_to_bytes(img):
    """Serialise *img* (anything with a PIL-style ``save``) to PNG bytes."""
    buffer = io.BytesIO()
    try:
        img.save(buffer, format='PNG')  # Save the image to the byte stream
        return buffer.getvalue()        # Get the byte representation
    finally:
        buffer.close()
|
|
|
|
|
|
|
|
def bytes_to_image(image_bytes):
    """Rehydrate a PIL image from raw encoded bytes (inverse of
    convert_image_to_bytes)."""
    # PIL reads lazily from the in-memory stream, so just hand it over.
    return Image.open(io.BytesIO(image_bytes))
|
|
|
|
# for japanese
|
|
|
|
def add_furigana(text):
    """Annotate kanji-bearing tokens in *text* with hiragana readings.

    Tokenises with MeCab/UniDic and, for every token that contains at least
    one kanji, appends its reading in parentheses — e.g. 漢字(かんじ).
    Tokens without kanji (kana, punctuation, Latin) are passed through
    unchanged.

    Returns the annotated string.
    """
    # MeCab output ends with an 'EOS' line plus a trailing empty line — drop both.
    parsed = mecab.parse(text).split('\n')[:-2]
    furigana_string = ''
    for line in parsed:
        # Each line is "<surface>\t<comma-separated features>".
        surface = line.split('\t')[0]
        try:
            # Feature field 6 (counting the surface+first-feature chunk as 0)
            # holds the katakana reading; convert it to hiragana.
            # NOTE: double-quoted f-string — reusing the single quote inside a
            # single-quoted f-string is a SyntaxError before Python 3.12.
            reading = f"({jaconv.kata2hira(line.split(',')[6])})"
        except IndexError:
            # Symbols / unknown tokens have no reading field; annotate nothing.
            reading = ''
        furigana_string += surface + (reading if contains_kanji(surface) else '')
    return furigana_string
|
|
|
|
|
|
def contains_kanji(text):
    """True if *text* contains at least one CJK unified ideograph (kanji)."""
    return re.search(r'[\u4E00-\u9FFF]', text) is not None
|
|
|
|
def contains_hiragana(text):
    """True if *text* contains at least one hiragana character."""
    return re.search(r'[\u3040-\u309F]', text) is not None
|
|
|
|
def contains_katakana(text):
    """True if *text* contains at least one katakana character."""
    return re.search(r'[\u30A0-\u30FF]', text) is not None
|
|
|
|
|
|
# use kakasi to romanize japanese text
|
|
def romanize(text, lang):
    """Romanize *text* according to *lang*.

    Chinese ('zh'/'ch_sim'/'ch_tra') goes through pypinyin (first reading of
    each heteronym), Japanese ('ja') through kakasi Hepburn romaji, and
    everything else through the generic uroman transliterator.
    """
    if lang in ('zh', 'ch_sim', 'ch_tra'):
        readings = pinyin(text, heteronym=True)
        return ' '.join(reading[0] for reading in readings)
    if lang == 'ja':
        tokens = kks.convert(text)
        return ' '.join(token['hepburn'] for token in tokens)
    return uroman.romanize_string(text)
|
|
|
|
# check if a string contains words from a language
|
|
def contains_lang(text, lang):
    """Return True if *text* contains any character of language *lang*.

    Supported codes: 'en', 'zh', 'ja', 'ko'.  Detection is a simple Unicode
    range test; note kanji count as Japanese too, so Chinese text also
    matches 'ja'.

    Raises ValueError for any other code.
    """
    # Unicode character class per language; 'ja' covers kana plus kanji.
    ranges = {
        'zh': r'[\u4e00-\u9fff]',
        'ja': r'[\u3040-\u30ff\u4e00-\u9fff]',
        'ko': r'[\uac00-\ud7af]',
        'en': r'[a-zA-Z]',
    }
    if lang not in ranges:
        raise ValueError("Invalid language. Please use one of 'en', 'zh', 'ja', or 'ko'.")
    return bool(re.search(ranges[lang], text))
|
|
|
|
### en, ch_sim, ch_tra, ja, ko rapidocr only has chinese and en at the moment
|
|
def standardize_lang(lang):
    """Map a UI language code onto the codes each backend model expects.

    Accepted codes: 'en', 'ch_sim', 'ch_tra', 'ja', 'ko'.

    Returns a dict with keys 'easyocr_lang', 'paddleocr_lang',
    'rapidocr_lang', 'translation_model_lang' and 'id_model_lang'.
    Raises ValueError for any other code.
    """
    # (easyocr, paddleocr, rapidocr, translation model, language-id model)
    codes_by_lang = {
        'ch_sim': ('ch_sim', 'ch', 'ch', 'zh', 'zh'),
        'ch_tra': ('ch_tra', 'ch', 'ch', 'zh', 'zh'),
        'ja':     ('ja', 'japan', 'ja', 'ja', 'ja'),
        'ko':     ('ko', 'korean', 'ko', 'ko', 'ko'),
        'en':     ('en', 'en', 'en', 'en', 'en'),
    }
    if lang not in codes_by_lang:
        raise ValueError(f"Invalid language {lang}. Please use one of 'en', 'ch_sim', 'ch_tra', 'ja', or 'ko'.")
    easy, paddle, rapid, translation, ident = codes_by_lang[lang]
    return {'easyocr_lang': easy,
            'paddleocr_lang': paddle,
            'rapidocr_lang': rapid,
            'translation_model_lang': translation,
            'id_model_lang': ident}
|
|
|
|
def which_ocr_lang(model):
    """Return the standardize_lang() dict key for the given OCR engine.

    *model* is one of 'easy', 'paddle' or 'rapid'; anything else raises
    ValueError.
    """
    key_by_model = {
        'easy': 'easyocr_lang',
        'paddle': 'paddleocr_lang',
        'rapid': 'rapidocr_lang',
    }
    if model not in key_by_model:
        raise ValueError("Invalid OCR model. Please use one of 'easy', 'paddle', or 'rapid'.")
    return key_by_model[model]
|
|
|
|
def similar_tfidf(list1,list2) -> float:
    """Calculate cosine similarity using TF-IDF vectors.

    Fits a TF-IDF vectorizer over both lists of strings, averages each
    list's vectors into a centroid, and returns the cosine similarity of
    the two centroids.  Empty input on either side yields 0.0.
    """
    if not list1 or not list2:
        return 0.0

    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(list1 + list2).toarray()

    # The first len(list1) rows belong to list1, the rest to list2.
    split = len(list1)
    centroid1 = matrix[:split].mean(axis=0).reshape(1, -1)
    centroid2 = matrix[split:].mean(axis=0).reshape(1, -1)

    return cosine_similarity(centroid1, centroid2)[0, 0]
|
|
|
|
def similar_jacard(list1, list2) -> float:
    """Jaccard similarity of the unique elements of two lists.

    |A ∩ B| / |A ∪ B|, with 0.0 returned when either list is empty.
    """
    if not list1 or not list2:
        return 0.0
    a, b = set(list1), set(list2)
    return len(a & b) / len(a | b)
|
|
|
|
def check_similarity(list1, list2, threshold, method = 'tfidf'):
    """Decide whether two lists of strings are similar enough.

    method='tfidf' compares TF-IDF centroids (strictly above *threshold*);
    method='jacard' compares set overlap (at or above *threshold*).  A
    ValueError from the vectorizer (e.g. vocabulary of only stop words) is
    treated as "similar".  Any other method raises ValueError.
    """
    if method == 'tfidf':
        try:
            confidence = similar_tfidf(list1, list2)
        except ValueError:
            # Vectorizer could not build a vocabulary — err on "similar".
            return True
        return confidence > threshold
    if method == 'jacard':
        return similar_jacard(list1, list2) >= threshold
    raise ValueError("Invalid method. Please use one of 'tfidf' or 'jacard'.")
|
|
|
|
if __name__ == "__main__":
    # Smoke test: romanize a Japanese sentence via kakasi.
    print(romanize(text='世界はひろい', lang='ja'))