import re
import io
import os
import subprocess

import uroman as ur
from pypinyin import pinyin
import pyscreenshot as ImageGrab  # Wayland support is uncertain on some machines; mss is used as an alternative below
import mss
from PIL import Image
import jaconv
import MeCab
import unidic
import pykakasi
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# for creating furigana
mecab = MeCab.Tagger('-d "{}"'.format(unidic.DICDIR))
# uroman romanises Japanese text. It can also convert to hiragana or katakana,
# but it does not split the words up, so it is harder to use for furigana.
uroman = ur.Uroman()
kks = pykakasi.kakasi()


def intercepts(x, y):
    """Check whether one axis of a shape intersects with another.

    Both x and y are two-element tuples giving the endpoints of a line
    segment along a single dimension.
    """
    x1, x2 = x
    y1, y2 = y
    return (x1 <= y1 <= x2) or (x1 <= y2 <= x2) or (y1 <= x1 <= y2) or (y1 <= x2 <= y2)


def is_wayland():
    return 'WAYLAND_DISPLAY' in os.environ


# Please install grim; without it, capturing the screen on Wayland is far too slow.
def printsc_wayland(region: tuple, path: str):
    subprocess.run([
        'grim',
        '-g', f'{region[0]},{region[1]} {region[2] - region[0]}x{region[3] - region[1]}',
        '-t', 'jpeg', '-q', '100',
        path,
    ])


def printsc_non_wayland(region: tuple, path: str):
    # use mss to capture the screen
    with mss.mss() as sct:
        # grab the region (left, top, right, bottom)
        img = sct.grab(region)
        # convert the raw BGRA buffer to a PIL image
        image = Image.frombytes("RGB", img.size, img.bgra, "raw", "BGRX")
        image.save(path)


def printsc(region: tuple, path: str):
    try:
        if is_wayland():
            printsc_wayland(region, path)
        else:
            printsc_non_wayland(region, path)
    except Exception as e:
        print(f'Error {e}')


def convert_image_to_bytes(img):
    with io.BytesIO() as byte_stream:
        img.save(byte_stream, format='PNG')  # save the image to the byte stream
        return byte_stream.getvalue()  # get the byte representation


def bytes_to_image(image_bytes):
    # load the image from bytes
    byte_stream = io.BytesIO(image_bytes)
    # open the image from the BytesIO stream
    image = Image.open(byte_stream)
    return image


# for Japanese
def add_furigana(text):
    # drop the trailing 'EOS' marker and empty line from MeCab's output
    parsed = mecab.parse(text).split('\n')[:-2]
    furigana_string = ''
    for line in parsed:
        surface = line.split('\t')[0]
        try:
            # field 6 of the comma-split unidic features holds the katakana reading
            reading = jaconv.kata2hira(line.split(',')[6])
            add = f'({reading})'
        except IndexError:
            add = ''
        # only append the reading for words that actually contain kanji
        furigana_string += surface + (add if contains_kanji(surface) else '')
    return furigana_string


def contains_kanji(text):
    return bool(re.search(r'[\u4E00-\u9FFF]', text))


def contains_hiragana(text):
    return bool(re.search(r'[\u3040-\u309F]', text))


def contains_katakana(text):
    return bool(re.search(r'[\u30A0-\u30FF]', text))


# use pypinyin for Chinese, kakasi for Japanese, and uroman for everything else
def romanize(text, lang):
    if lang in ['zh', 'ch_sim', 'ch_tra']:
        # take the first reading when a character has several (heteronyms)
        return ' '.join(py[0] for py in pinyin(text, heteronym=True))
    if lang == 'ja':
        return ' '.join(romaji['hepburn'] for romaji in kks.convert(text))
    return uroman.romanize_string(text)


# check if a string contains words from a language
def contains_lang(text, lang):
    # matches any character in the Unicode range of the language
    if lang == 'zh':
        return bool(re.search(r'[\u4e00-\u9fff]', text))
    elif lang == 'ja':
        return bool(re.search(r'[\u3040-\u30ff]', text)) or bool(re.search(r'[\u4e00-\u9fff]', text))
    elif lang == 'ko':
        return bool(re.search(r'[\uac00-\ud7af]', text))
    elif lang == 'en':
        return bool(re.search(r'[a-zA-Z]', text))
    else:
        raise ValueError("Invalid language. Please use one of 'en', 'zh', 'ja', or 'ko'.")
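
# Illustrative sketch, not used elsewhere in this module: `intercepts` tests a
# single axis, so a full 2-D bounding-box overlap check composes it once per
# axis. The helper name `boxes_overlap` and the (x1, y1, x2, y2) box format
# are assumptions made for this example.
def boxes_overlap(a: tuple, b: tuple) -> bool:
    # two boxes overlap iff both their x-projections and y-projections intercept
    return intercepts((a[0], a[2]), (b[0], b[2])) and intercepts((a[1], a[3]), (b[1], b[3]))
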
# Map a user-facing language code to the code each backend expects.
# Supported inputs: en, ch_sim, ch_tra, ja, ko.
# rapidocr only supports Chinese and English at the moment.
_LANG_CODES = {
    'ch_sim': {'easyocr_lang': 'ch_sim', 'paddleocr_lang': 'ch', 'rapidocr_lang': 'ch',
               'translation_model_lang': 'zh', 'id_model_lang': 'zh'},
    'ch_tra': {'easyocr_lang': 'ch_tra', 'paddleocr_lang': 'ch', 'rapidocr_lang': 'ch',
               'translation_model_lang': 'zh', 'id_model_lang': 'zh'},
    'ja':     {'easyocr_lang': 'ja', 'paddleocr_lang': 'japan', 'rapidocr_lang': 'ja',
               'translation_model_lang': 'ja', 'id_model_lang': 'ja'},
    'ko':     {'easyocr_lang': 'ko', 'paddleocr_lang': 'korean', 'rapidocr_lang': 'ko',
               'translation_model_lang': 'ko', 'id_model_lang': 'ko'},
    'en':     {'easyocr_lang': 'en', 'paddleocr_lang': 'en', 'rapidocr_lang': 'en',
               'translation_model_lang': 'en', 'id_model_lang': 'en'},
}


def standardize_lang(lang):
    if lang not in _LANG_CODES:
        raise ValueError(f"Invalid language {lang}. Please use one of 'en', 'ch_sim', 'ch_tra', 'ja', or 'ko'.")
    return dict(_LANG_CODES[lang])  # return a copy so callers can mutate it safely


def which_ocr_lang(model):
    if model == 'easy':
        return 'easyocr_lang'
    elif model == 'paddle':
        return 'paddleocr_lang'
    elif model == 'rapid':
        return 'rapidocr_lang'
    else:
        raise ValueError("Invalid OCR model. Please use one of 'easy', 'paddle', or 'rapid'.")


def similar_tfidf(list1, list2) -> float:
    """Calculate cosine similarity between two lists of texts using TF-IDF vectors."""
    if not list1 or not list2:
        return 0.0
    vectorizer = TfidfVectorizer()
    all_texts = list1 + list2
    tfidf_matrix = vectorizer.fit_transform(all_texts)
    # average the TF-IDF vectors of each list, then compare the averages
    vec1 = np.mean(tfidf_matrix[:len(list1)].toarray(), axis=0).reshape(1, -1)
    vec2 = np.mean(tfidf_matrix[len(list1):].toarray(), axis=0).reshape(1, -1)
    return cosine_similarity(vec1, vec2)[0, 0]


def similar_jacard(list1, list2) -> float:
    """Jaccard similarity: shared strings over total distinct strings."""
    if not list1 or not list2:
        return 0.0
    return len(set(list1).intersection(set(list2))) / len(set(list1).union(set(list2)))


def check_similarity(list1, list2, threshold, method='tfidf'):
    if method == 'tfidf':
        try:
            confidence = similar_tfidf(list1, list2)
        except ValueError:
            # TfidfVectorizer raises ValueError on an empty vocabulary; treat as similar
            return True
        return confidence > threshold
    elif method == 'jacard':
        return similar_jacard(list1, list2) >= threshold
    else:
        raise ValueError("Invalid method. Please use one of 'tfidf' or 'jacard'.")


if __name__ == "__main__":
    # Example usage
    print(romanize(lang='ja', text='世界はひろい'))
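    # Further illustrative usage (hedged: exact readings depend on the installed
    # unidic dictionary, so the outputs noted in comments are indicative only):
    print(add_furigana('世界は広い'))  # kanji words gain hiragana readings, e.g. 世界(せかい)
    print(contains_lang('hello 世界', 'ja'))  # True: the string contains CJK characters
    # Jaccard compares whole strings, so these one-element lists share nothing:
    print(check_similarity(['hello world'], ['hello there'], threshold=0.5, method='jacard'))  # False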