"""Helpers for screen capture, image/bytes conversion and CJK text processing."""
import io
import os
import re

import jaconv
import MeCab
import mss
import pykakasi
# Wayland capture backend; not confirmed to work on other machines, where mss
# is used instead.
import pyscreenshot as ImageGrab
import unidic
import uroman as ur
from PIL import Image
from pypinyin import pinyin

# For creating furigana.
mecab = MeCab.Tagger('-d "{}"'.format(unidic.DICDIR))
uroman = ur.Uroman()

# For romanising Japanese text. Can convert to hiragana or katakana as well,
# but does not split the words up, so it is harder to use for furigana.
kks = pykakasi.kakasi()

# Precompiled character-class patterns shared by the contains_* helpers.
_KANJI_RE = re.compile(r'[\u4E00-\u9FFF]')
_HIRAGANA_RE = re.compile(r'[\u3040-\u309F]')
_KATAKANA_RE = re.compile(r'[\u30A0-\u30FF]')

# Language code -> pattern matching at least one character of that language.
_LANG_PATTERNS = {
    'zh': re.compile(r'[\u4e00-\u9fff]'),
    'ja': re.compile(r'[\u3040-\u30ff]'),
    'ko': re.compile(r'[\uac00-\ud7af]'),
    'en': re.compile(r'[a-zA-Z]'),
}

# Input language code -> per-backend language codes.
# Languages: en, ch_sim, ch_tra, ja, ko.
# NOTE: rapidocr only has Chinese and English at the moment.
_LANG_CODES = {
    'ch_sim': {
        'easyocr_lang': 'ch_sim',
        'paddleocr_lang': 'ch',
        'rapidocr_lang': 'ch',
        'translation_model_lang': 'zh',
        'id_model_lang': 'zh',
    },
    'ch_tra': {
        'easyocr_lang': 'ch_tra',
        'paddleocr_lang': 'ch',
        'rapidocr_lang': 'ch',
        'translation_model_lang': 'zh',
        'id_model_lang': 'zh',
    },
    'ja': {
        'easyocr_lang': 'ja',
        'paddleocr_lang': 'ja',
        'rapidocr_lang': 'ja',
        'translation_model_lang': 'ja',
        'id_model_lang': 'ja',
    },
    'ko': {
        'easyocr_lang': 'korean',
        'paddleocr_lang': 'ko',
        'rapidocr_lang': 'ko',
        'translation_model_lang': 'ko',
        'id_model_lang': 'ko',
    },
    'en': {
        'easyocr_lang': 'en',
        'paddleocr_lang': 'en',
        'rapidocr_lang': 'en',
        'translation_model_lang': 'en',
        'id_model_lang': 'en',
    },
}

# OCR model name -> key of the matching entry in standardize_lang()'s result.
_OCR_LANG_KEYS = {
    'easy': 'easyocr_lang',
    'paddle': 'paddleocr_lang',
    'rapid': 'rapidocr_lang',
}


def intercepts(x, y):
    """Return whether two one-dimensional line segments touch or overlap.

    Used to check whether one axis of a shape intercepts another.

    Args:
        x: 2-tuple holding the two ends of the first segment.
        y: 2-tuple holding the two ends of the second segment.

    Returns:
        True if an end of either segment lies within the other segment
        (bounds inclusive), False otherwise.
    """
    # NOTE(review): the comparisons assume each tuple is ordered (low, high).
    x1, x2 = x
    y1, y2 = y
    return (x1 <= y1 <= x2) or (x1 <= y2 <= x2) or (y1 <= x1 <= y2) or (y1 <= x2 <= y2)


def is_wayland():
    """Return True if the WAYLAND_DISPLAY environment variable is set."""
    return 'WAYLAND_DISPLAY' in os.environ


def printsc_wayland(region, save: bool = False, path: str = None):
    """Capture ``region`` of the screen with pyscreenshot.

    Args:
        region: Bounding box forwarded to ``ImageGrab.grab(bbox=...)``.
        save: If True, write the capture to ``path`` instead of returning it.
        path: Path to save the screenshot to; only used when ``save`` is True.

    Returns:
        The captured image when ``save`` is False, otherwise None.
    """
    if save:
        im = ImageGrab.grab(bbox=region)
        im.save(path)
    else:
        return ImageGrab.grab(bbox=region)


def printsc_non_wayland(region, save: bool = False, path: str = None):
    """Capture ``region`` of the screen with mss.

    Args:
        region: Capture area forwarded to ``mss``'s ``grab``.
        save: If True, write the capture to ``path`` instead of returning it.
        path: Path to save the screenshot to; only used when ``save`` is True.

    Returns:
        The capture as a PIL image when ``save`` is False, otherwise None
        (the same contract as ``printsc_wayland``).
    """
    with mss.mss() as sct:
        img = sct.grab(region)
        # Convert the raw BGRA buffer into an RGB PIL image.
        image = Image.frombytes("RGB", img.size, img.bgra, "raw", "BGRX")
    if save:
        image.save(path)
        return None
    # Previously nothing was returned here, so callers got None on
    # non-Wayland sessions even though they asked for the image.
    return image


def printsc(region, save: bool = False, path: str = None):
    """Capture ``region`` of the screen with the backend suited to the session.

    Dispatches to ``printsc_wayland`` when ``is_wayland()`` is true and to
    ``printsc_non_wayland`` otherwise.

    Returns:
        Whatever the selected backend returns. If the backend raises, the
        error is printed and None is returned (best effort).
    """
    try:
        if is_wayland():
            return printsc_wayland(region, save, path)
        else:
            return printsc_non_wayland(region, save, path)
    except Exception as e:
        print(f'Error {e}')


def convert_image_to_bytes(img):
    """Return ``img`` encoded as PNG bytes."""
    with io.BytesIO() as byte_stream:
        img.save(byte_stream, format='PNG')  # Save the image to the byte stream
        return byte_stream.getvalue()  # Get the byte representation


def bytes_to_image(image_bytes):
    """Open and return a PIL image from encoded image bytes."""
    byte_stream = io.BytesIO(image_bytes)
    image = Image.open(byte_stream)
    return image


# for japanese
def add_furigana(text):
    """Return ``text`` with a hiragana reading in parentheses after kanji tokens.

    The text is tokenised with MeCab. Every token's surface form is emitted
    as-is; tokens containing kanji are followed by ``(<reading>)`` when the
    parsed line has a reading field.
    """
    # The last two split entries are dropped (presumably MeCab's trailing
    # 'EOS' line and the empty string after the final newline).
    parsed = mecab.parse(text).split('\n')[:-2]
    pieces = []
    for line in parsed:
        surface = line.split('\t')[0]
        pieces.append(surface)
        if not contains_kanji(surface):
            continue
        try:
            # NOTE(review): comma-separated field 6 is taken as the katakana
            # reading - confirm against the UniDic feature layout.
            reading = line.split(',')[6]
        except IndexError:
            # Lines with fewer fields get no furigana.
            continue
        pieces.append(f'({jaconv.kata2hira(reading)})')
    return ''.join(pieces)


def contains_kanji(text):
    """Return True if ``text`` has a character in U+4E00-U+9FFF."""
    return bool(_KANJI_RE.search(text))


def contains_hiragana(text):
    """Return True if ``text`` has a character in U+3040-U+309F."""
    return bool(_HIRAGANA_RE.search(text))


def contains_katakana(text):
    """Return True if ``text`` has a character in U+30A0-U+30FF."""
    return bool(_KATAKANA_RE.search(text))


def romanize(text, lang):
    """Romanize ``text`` according to ``lang``.

    Args:
        text: Text to romanize.
        lang: 'zh' uses pypinyin, 'ja' uses pykakasi (Hepburn); any other
            value falls back to uroman.

    Returns:
        The romanized string. For 'zh' and 'ja' the pieces are joined with
        single spaces.
    """
    if lang == 'zh':
        return ' '.join([py[0] for py in pinyin(text, heteronym=True)])
    if lang == 'ja':
        # kks.convert() returns one entry per segment; use all of them, not
        # just the first, so the whole input is romanized (and empty input
        # yields '' instead of raising IndexError).
        return ' '.join(item['hepburn'] for item in kks.convert(text))
    return uroman.romanize_string(text)


def contains_lang(text, lang):
    """Return True if ``text`` has at least one character of language ``lang``.

    Args:
        text: String to inspect.
        lang: One of 'en', 'zh', 'ja' or 'ko'. 'ja' matches kana only
            (U+3040-U+30FF); kanji fall under the 'zh' range.

    Raises:
        ValueError: If ``lang`` is not one of the supported codes.
    """
    try:
        pattern = _LANG_PATTERNS[lang]
    except (KeyError, TypeError):
        raise ValueError("Invalid language. Please use one of 'en', 'zh', 'ja', or 'ko'.") from None
    return bool(pattern.search(text))


def standardize_lang(lang):
    """Map a language code to the codes used by each OCR/translation backend.

    Args:
        lang: One of 'en', 'ch_sim', 'ch_tra', 'ja' or 'ko'.

    Returns:
        A new dict with the keys 'easyocr_lang', 'paddleocr_lang',
        'rapidocr_lang', 'translation_model_lang' and 'id_model_lang'.

    Raises:
        ValueError: If ``lang`` is not one of the supported codes.
    """
    try:
        codes = _LANG_CODES[lang]
    except (KeyError, TypeError):
        raise ValueError(f"Invalid language {lang}. Please use one of 'en', 'ch_sim', 'ch_tra', 'ja', or 'ko'.") from None
    # Hand out a copy so callers cannot mutate the shared table.
    return dict(codes)


def which_ocr_lang(model):
    """Return the standardize_lang() result key that belongs to OCR ``model``.

    Args:
        model: One of 'easy', 'paddle' or 'rapid'.

    Raises:
        ValueError: If ``model`` is not one of the supported names.
    """
    try:
        return _OCR_LANG_KEYS[model]
    except (KeyError, TypeError):
        raise ValueError("Invalid OCR model. Please use one of 'easy', 'paddle', or 'rapid'.") from None


if __name__ == "__main__":
    # Example usage
    japanesetext = "本が好きにちは"
    print(add_furigana(japanesetext))