# onscreen-translator/helpers/utils.py

import re, uroman as ur
from pypinyin import pinyin
import pyscreenshot as ImageGrab # used for Wayland; not sure if it works on other machines, mss is the alternative backend
import mss, io, os
from PIL import Image
import jaconv, MeCab, unidic, pykakasi
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
# for creating furigana
mecab = MeCab.Tagger('-d "{}"'.format(unidic.DICDIR))
uroman = ur.Uroman()
# kakasi romanises Japanese text. It can also convert to hiragana or katakana, but it does not split words, so it is less useful for furigana.
kks = pykakasi.kakasi()
# check whether two 1-D intervals (the projections of two shapes onto one axis) overlap
def intercepts(x, y):
    # x and y are (start, end) tuples describing a line segment along one axis, with start <= end
    x1, x2 = x
    y1, y2 = y
    return (x1 <= y1 <= x2) or (x1 <= y2 <= x2) or (y1 <= x1 <= y2) or (y1 <= x2 <= y2)
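# e.g. intercepts((0, 5), (3, 8)) -> True (the ranges overlap), intercepts((0, 2), (3, 4)) -> False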
def is_wayland():
    return 'WAYLAND_DISPLAY' in os.environ
# capture `region` of the screen with pyscreenshot (Wayland); optionally save it to `path`
def printsc_wayland(region, save: bool = False, path: str = None):
    im = ImageGrab.grab(bbox=region)
    if save:
        im.save(path)
    return im
def printsc_non_wayland(region, save: bool = False, path: str = None):
    # use mss to capture the screen
    with mss.mss() as sct:
        # grab the requested region
        img = sct.grab(region)
        # convert the raw BGRA buffer to a PIL image
        image = Image.frombytes("RGB", img.size, img.bgra, "raw", "BGRX")
        # save the image if requested
        if save:
            image.save(path)
        return image
def printsc(region, save: bool = False, path: str = None):
    try:
        if is_wayland():
            return printsc_wayland(region, save, path)
        else:
            return printsc_non_wayland(region, save, path)
    except Exception as e:
        print(f'Error {e}')
def convert_image_to_bytes(img):
    with io.BytesIO() as byte_stream:
        img.save(byte_stream, format='PNG')  # Save the image to the byte stream
        return byte_stream.getvalue()  # Get the byte representation
def bytes_to_image(image_bytes):
    # Load the image from bytes
    byte_stream = io.BytesIO(image_bytes)
    # Open the image from the BytesIO stream
    image = Image.open(byte_stream)
    return image
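# round trip: bytes_to_image(convert_image_to_bytes(img)) yields an equivalent PIL image (PNG is lossless)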
# for Japanese: append furigana (hiragana readings in parentheses) after words that contain kanji
def add_furigana(text):
    parsed = mecab.parse(text).split('\n')[:-2]  # drop the trailing 'EOS' line and empty string
    furigana_string = ''
    for i in parsed:
        word = i.split('\t')[0]
        try:
            # field 6 of the comma-separated unidic features is the katakana reading; convert it to hiragana
            add = f"({jaconv.kata2hira(i.split(',')[6])})"
        except IndexError:
            add = ''
        to_add = add if contains_kanji(word) else ''
        furigana_string += word + to_add
    return furigana_string
def contains_kanji(text):
    return bool(re.search(r'[\u4E00-\u9FFF]', text))
def contains_hiragana(text):
    return bool(re.search(r'[\u3040-\u309F]', text))
def contains_katakana(text):
    return bool(re.search(r'[\u30A0-\u30FF]', text))
# romanise text: pinyin for Chinese, kakasi (Hepburn) for Japanese, uroman for everything else
def romanize(text, lang):
    if lang in ['zh', 'ch_sim', 'ch_tra']:
        # take the first candidate reading for each character
        return ' '.join([py[0] for py in pinyin(text, heteronym=True)])
    if lang == 'ja':
        return ' '.join(item['hepburn'] for item in kks.convert(text))
    return uroman.romanize_string(text)
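# e.g. romanize('你好', 'zh') returns tone-marked pinyin and romanize('こんにちは', 'ja') a Hepburn reading;
# exact output depends on the installed pypinyin/pykakasi versions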
# check if a string contains characters from a language
def contains_lang(text, lang):
    # Matches any character in the Unicode range of the language
    if lang == 'zh':
        return bool(re.search(r'[\u4e00-\u9fff]', text))
    elif lang == 'ja':
        return bool(re.search(r'[\u3040-\u30ff]', text))
    elif lang == 'ko':
        return bool(re.search(r'[\uac00-\ud7af]', text))
    elif lang == 'en':
        return bool(re.search(r'[a-zA-Z]', text))
    else:
        raise ValueError("Invalid language. Please use one of 'en', 'zh', 'ja', or 'ko'.")
### supported languages: en, ch_sim, ch_tra, ja, ko (RapidOCR only has Chinese and English models at the moment)
def standardize_lang(lang):
    if lang == 'ch_sim':
        easyocr_lang = 'ch_sim'
        paddleocr_lang = 'ch'
        rapidocr_lang = 'ch'
        translation_model_lang = 'zh'
        id_model_lang = 'zh'
    elif lang == 'ch_tra':
        easyocr_lang = 'ch_tra'
        paddleocr_lang = 'ch'
        rapidocr_lang = 'ch'
        translation_model_lang = 'zh'
        id_model_lang = 'zh'
    elif lang == 'ja':
        easyocr_lang = 'ja'
        paddleocr_lang = 'japan'
        rapidocr_lang = 'ja'
        translation_model_lang = 'ja'
        id_model_lang = 'ja'
    elif lang == 'ko':
        easyocr_lang = 'korean'
        paddleocr_lang = 'korean'
        rapidocr_lang = 'ko'
        translation_model_lang = 'ko'
        id_model_lang = 'ko'
    elif lang == 'en':
        easyocr_lang = 'en'
        paddleocr_lang = 'en'
        rapidocr_lang = 'en'
        translation_model_lang = 'en'
        id_model_lang = 'en'
    else:
        raise ValueError(f"Invalid language {lang}. Please use one of 'en', 'ch_sim', 'ch_tra', 'ja', or 'ko'.")
    return {'easyocr_lang': easyocr_lang,
            'paddleocr_lang': paddleocr_lang,
            'rapidocr_lang': rapidocr_lang,
            'translation_model_lang': translation_model_lang,
            'id_model_lang': id_model_lang}
def which_ocr_lang(model):
    if model == 'easy':
        return 'easyocr_lang'
    elif model == 'paddle':
        return 'paddleocr_lang'
    elif model == 'rapid':
        return 'rapidocr_lang'
    else:
        raise ValueError("Invalid OCR model. Please use one of 'easy', 'paddle', or 'rapid'.")
def similar_tfidf(list1, list2, threshold) -> bool:
    """Return True if the TF-IDF cosine similarity between the two lists of texts exceeds threshold."""
    if not list1 or not list2:
        return False
    vectorizer = TfidfVectorizer()
    all_texts = list1 + list2
    tfidf_matrix = vectorizer.fit_transform(all_texts)
    # Average the TF-IDF vectors of each list, then compare the averaged vectors
    vec1 = np.mean(tfidf_matrix[:len(list1)].toarray(), axis=0).reshape(1, -1)
    vec2 = np.mean(tfidf_matrix[len(list1):].toarray(), axis=0).reshape(1, -1)
    return float(cosine_similarity(vec1, vec2)[0, 0]) > threshold
if __name__ == "__main__":
    # Example usage
    japanesetext = "本が好きにちは"
    print(add_furigana(japanesetext))
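    # A few more illustrative checks (assumed sample inputs; romanisation output depends on the installed pypinyin/pykakasi versions)
    print(romanize(japanesetext, 'ja'))
    print(romanize('你好', 'zh'))
    print(standardize_lang('ja')[which_ocr_lang('paddle')])
    print(similar_tfidf(['hello world'], ['hello world again'], 0.5))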