diff --git a/README.md b/README.md index ad13527..45a8bb6 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,11 @@ +## What does this do? + +It provides translations from a source language to another language of a specified region on your screen while also providing necessary romanisation (including pinyin and furigana) to provide a guide to pronunciation. The main goal of this is primarily for people that have a low/basic level of understanding of a language to further develop that language by giving users a tool to immerse themselves in native content. Main uses of this include but are not limited to: playing games and watching videos with subtitles in another language (although technically it might just be better to obtain an audio transcription, translate and replace the subtitles if possible -- however this is not always feasible if watching many episodes and/or you are watching videos spontaneously). + +## Limitations + +If the `learn` mode is enabled for the app, the added translations and romanisation naturally result in texts taking up three times the space and therefore this is less suitable for texts that contain tightly packed words. You can optionally change the config to insert smaller text or change the overall font size of your screen so there is less text. A pure translation mode also exists, although if it is intended for web browsing, Google itself provides a more reliable method of translation which does not rely on the computationally heavy optical character recognition (OCR). + ## Usage (draft) 1. Clone the repository, navigate to the repository and install all required packages with `pip install -r requirements.txt` in a new Python environment (the OCR packages are very finnicky). 
The second level defines the `model` names from their corresponding API endpoints. For the third level, the rates of each model are specified. `rpmin`, `rph`, `rpd`, `rpw`, `rpmth`, `rpy` are respectively the rates per minute, hour, day, week, month, year. -4. Edit the `.env` config file. For information about all the variables to edit, check the section under "EDIT THESE ENVIRONMENTAL VARIABLES". If CUDA is not detected, it will default to using the `CPU` mode for all local LLMs and OCRs. In this case, it is recommended to set the `OCR_MODEL` variable to `rapid` which is optimised for CPUs. Currently the only support for this is with `SOURCE_LANG=ch_tra`, `ch_sim` or `en`. Refer to [notes][1] +4. Create and edit the `.env` config file. For information about all the variables to edit, check the section under "EDIT THESE ENVIRONMENTAL VARIABLES" in the `config.py` file. If CUDA is not detected, it will default to using the `CPU` mode for all local LLMs and OCRs. In this case, it is recommended to set the `OCR_MODEL` variable to `rapid` which is optimised for CPUs. Currently the only support for this is with `SOURCE_LANG=ch_tra`, `ch_sim` or `en`. Refer to [notes][1] 5. If you are using the `wayland` display protocol (only available for Linux -- check with `echo $WAYLAND_DISPLAY`), download the `grim` package onto your machine locally with any of the package managers. @@ -39,6 +47,10 @@ Screenshotting is limited in Wayland, and `grim` is one of the more lightweight 1. CUDNN Version mismatch when using PaddleOCR. Check if LD_LIBRARY_PATH is correctly set to the directory containing the cudnn.so file. If using a local installation, it could help to just remove the nvidia-cudnn-cn12 from your Python environment. 2. Segmentation fault when using PaddleOCR, EasyOCR or RapidOCR. Ensure the only cv2 library is the opencv-contrib-python library. Check out https://pypi.org/project/opencv-python-headless/ for more info. 
+## Demo + +[Demo](https://youtu.be/Tmv_I0GkOQc) of Korean to Chinese (simplified) translation with the `learn-cover` mode (mode intended for people learning the language to see the romanisation/pinyin/furigana etc with the translation above). + ## TODO: - Create an overlay window that works in Wayland. diff --git a/config.py b/config.py index 2fc4da8..7931d80 100644 --- a/config.py +++ b/config.py @@ -15,7 +15,7 @@ elif platform.system() in ['Linux', 'Darwin']: # The default values should be fine for most cases. Only ones that you need to change are the API keys, and the variables under Translation and API Translation if you choose to use an external API. # available languages: 'ch_sim', 'ch_tra', 'ja', 'ko', 'en' -INTERVAL = float(os.getenv('INTERVAL'), 1.5) # Interval in seconds between translations. If your system is slow, a lower value is probably fine with regards to API rates. +INTERVAL = float(os.getenv('INTERVAL', 1.5)) # Interval in seconds between translations. If your system is slow, a lower value is probably fine with regards to API rates. ### OCR IMAGE_CHANGE_THRESHOLD = float(os.getenv('IMAGE_CHANGE_THRESHOLD', 0.75)) # higher values mean more sensitivity to changes in the screen, too high and the screen will constantly refresh diff --git a/helpers/batching.py b/helpers/batching.py index a6bbc3a..ac17641 100644 --- a/helpers/batching.py +++ b/helpers/batching.py @@ -195,11 +195,12 @@ Expected format: ["translation1", "translation2", ...] Translation:""" - response = await self._request(prompt) + try: + response = await self._request(prompt) response_list = ast.literal_eval(response.strip()) except Exception as e: - logger.error(f"Failed to evaluate response from {self.model} from {self.site}. Error: {e}. Response: {response}") + logger.error(f"Failed to evaluate response from {self.model} from {self.site}. 
Error: {e}.") return (1, [], 99999) logger.debug(repr(self)) logger.info(f'{self.model} translated texts from: {texts_to_translate} to {response_list}.') diff --git a/helpers/ocr.py b/helpers/ocr.py index 52de95d..7a6f3d0 100644 --- a/helpers/ocr.py +++ b/helpers/ocr.py @@ -34,7 +34,8 @@ def _easy_init(easy_languages: list, use_GPU=True, **kwargs): return easyocr.Reader(easy_languages, gpu=use_GPU, **kwargs) def _easy_ocr(ocr,image) -> list: - return ocr.readtext(image) + detected_texts = ocr.readtext(image) + return detected_texts # RapidOCR mostly for mandarin and some other asian languages # default only supports chinese and english @@ -116,9 +117,6 @@ def get_confidences(ocr_output) -> list: if __name__ == '__main__': - # OCR_languages = ['ch_sim','en'] - # image_old = '/home/James/Pictures/Screenshots/DP-1.jpg' - # reader = easyocr.Reader(OCR_languages, gpu=True) # this needs to run only once to load the model into memory - # result = reader.readtext(image_old) - # print(result) - print(id_keep_source_lang(init_OCR(model='paddle', paddle_lang='zh', easy_languages=['en', 'ch_sim']), '/home/James/Pictures/Screenshots/DP-1.jpg', 'ch_sim')) \ No newline at end of file + OCR_languages = ['ch_sim','en'] + reader = easyocr.Reader(OCR_languages, gpu=True) + \ No newline at end of file diff --git a/helpers/translation.py b/helpers/translation.py index 79b7c27..42a851d 100644 --- a/helpers/translation.py +++ b/helpers/translation.py @@ -71,6 +71,7 @@ async def translate_API_LLM(texts_to_translate: List[str], result = await task logger.debug(f'Result: {result}') if result is not None: + tasks.discard(task) translation_attempts += 1 status_code, translations, translation_mismatches = result if status_code == 0: @@ -79,14 +80,14 @@ async def translate_API_LLM(texts_to_translate: List[str], t.cancel() return translations else: - logger.error(f"Model has failed to translate the text.") + logger.error(f"Model has failed to translate the text. 
Result: {result}") if translation_attempts == no_of_models: if best_translation is not None: return translations else: logger.error("All models have failed to translate the text.") raise TypeError("Models have likely all outputted garbage translations or rate limited.") - elif status_code == 1: + elif status_code == 2: if best_translation is None: best_translation = (translations, translation_mismatches) else: diff --git a/helpers/utils.py b/helpers/utils.py index 03f3c28..d872945 100644 --- a/helpers/utils.py +++ b/helpers/utils.py @@ -28,7 +28,7 @@ def is_wayland(): # please install grim otherwise this is way too slow for wayland def printsc_wayland(region: tuple, path: str): - subprocess.run(['grim','-g', f'{region[0]},{region[1]} {region[2]-region[0]}x{region[3]-region[1]}', '-t', 'jpeg', '-q','95', path]) + subprocess.run(['grim','-g', f'{region[0]},{region[1]} {region[2]-region[0]}x{region[3]-region[1]}', '-t', 'jpeg', '-q','100', path]) def printsc_non_wayland(region: tuple, path: str): # use mss to capture the screen @@ -94,7 +94,7 @@ def romanize(text, lang): if lang in ['zh','ch_sim','ch_tra']: return ' '.join([ py[0] for py in pinyin(text, heteronym=True)]) if lang == 'ja': - return kks.convert(text)[0]['hepburn'] + return ' '.join([romaji['hepburn'] for romaji in kks.convert(text)]) return uroman.romanize_string(text) # check if a string contains words from a language @@ -103,7 +103,7 @@ def contains_lang(text, lang): if lang == 'zh': return bool(re.search(r'[\u4e00-\u9fff]', text)) elif lang == 'ja': - return bool(re.search(r'[\u3040-\u30ff]', text)) + return bool(re.search(r'[\u3040-\u30ff]', text)) or bool(re.search(r'[\u4e00-\u9fff]', text)) elif lang == 'ko': return bool(re.search(r'[\uac00-\ud7af]', text)) elif lang == 'en': @@ -132,7 +132,7 @@ def standardize_lang(lang): translation_model_lang = 'ja' id_model_lang = 'ja' elif lang == 'ko': - easyocr_lang = 'korean' + easyocr_lang = 'ko' paddleocr_lang = 'korean' rapidocr_lang = 'ko' 
translation_model_lang = 'ko' @@ -161,7 +161,7 @@ def which_ocr_lang(model): else: raise ValueError("Invalid OCR model. Please use one of 'easy', 'paddle', or 'rapid'.") -def similar_tfidf(list1,list2,threshold) -> float: +def similar_tfidf(list1,list2) -> float: """Calculate cosine similarity using TF-IDF vectors.""" if not list1 or not list2: return 0.0 @@ -174,7 +174,7 @@ def similar_tfidf(list1,list2,threshold) -> float: vec1 = np.mean(tfidf_matrix[:len(list1)].toarray(), axis=0).reshape(1, -1) vec2 = np.mean(tfidf_matrix[len(list1):].toarray(), axis=0).reshape(1, -1) - return float(cosine_similarity(vec1, vec2)[0, 0]) > threshold + return cosine_similarity(vec1, vec2)[0, 0] def similar_jacard(list1, list2) -> float: if not list1 or not list2: @@ -183,7 +183,11 @@ def similar_jacard(list1, list2) -> float: def check_similarity(list1, list2, threshold, method = 'tfidf'): if method == 'tfidf': - return True if similar_tfidf(list1, list2) >= threshold else False + try: + confidence = similar_tfidf(list1, list2) + except ValueError: + return True + return True if confidence > threshold else False elif method == 'jacard': return True if similar_jacard(list1, list2) >= threshold else False else: @@ -191,5 +195,4 @@ def check_similarity(list1, list2, threshold, method = 'tfidf'): if __name__ == "__main__": # Example usage - x = printsc_wayland((0,0,1920,1080), save = False) - print(x) \ No newline at end of file + print(romanize(lang='ja', text='世界はひろい')) \ No newline at end of file diff --git a/logging_config.py b/logging_config.py index 8208a73..8d3488e 100644 --- a/logging_config.py +++ b/logging_config.py @@ -64,5 +64,5 @@ def setup_logger( print(f"Failed to setup logger: {e}") return None -logger = setup_logger('on_screen_translator', log_file='translate.log', level=logging.INFO) +logger = setup_logger('on_screen_translator', log_file='translate.log', level=logging.DEBUG) diff --git a/qt_app.py b/qt_app.py index f8c99ae..1be4f3d 100644 --- a/qt_app.py +++ 
b/qt_app.py @@ -72,7 +72,7 @@ class ImageGenerator(QThread): def __init__(self): super().__init__() self.running = True - self.OCR_LANGUAGES = [SOURCE_LANG, TARGET_LANG, 'en'] + self.OCR_LANGUAGES = [SOURCE_LANG, 'en'] self.ocr = init_OCR(model=OCR_MODEL, paddle_lang= SOURCE_LANG, easy_languages = self.OCR_LANGUAGES, use_GPU=OCR_USE_GPU) self.ocr_output = id_keep_source_lang(self.ocr, TEMP_IMG_PATH, SOURCE_LANG) self.models = init_API_LLM(SOURCE_LANG, TARGET_LANG) @@ -102,7 +102,7 @@ class ImageGenerator(QThread): self.curr_words = set(get_words(self.ocr_output)) logger.debug(f'Current words: {self.curr_words} Previous words: {self.prev_words}') ### If the OCR detects different words, translate screen -> to ensure that the screen is not refreshing constantly and to save GPU power - if self.prev_words != self.curr_words and not check_similarity(self.curr_words, self.prev_words, threshold = IMAGE_CHANGE_THRESHOLD, method="jacard"): + if self.prev_words != self.curr_words and not check_similarity(list(self.curr_words), list(self.prev_words), threshold = IMAGE_CHANGE_THRESHOLD, method="tfidf"): logger.info('Beginning Translation') to_translate = [entry[1] for entry in self.ocr_output][:MAX_TRANSLATE] @@ -110,8 +110,8 @@ class ImageGenerator(QThread): try: translation = await translate_API_LLM(to_translate, self.models, call_size = 3) except TypeError as e: - logger.error(f"Failed to translate using API models. Error: {e}. Sleeping for 30 seconds.") - time.sleep(30) + logger.error(f"Failed to translate using API models. Error: {e}. Sleeping for {2*INTERVAL} seconds.") + time.sleep(2*INTERVAL) continue logger.debug('Translation complete. 
Modifying image.') self.translated_image = modify_image(TEMP_IMG_PATH, self.ocr_output, translation) diff --git a/web_app.py b/web_app.py index 3f98974..5e47d0c 100644 --- a/web_app.py +++ b/web_app.py @@ -27,7 +27,7 @@ async def web_app_main(): create_tables() ##### Initialize the OCR ##### - OCR_LANGUAGES = [SOURCE_LANG, TARGET_LANG, 'en'] + OCR_LANGUAGES = [SOURCE_LANG, 'en'] ocr = init_OCR(model=OCR_MODEL, paddle_lang= SOURCE_LANG, easy_languages = OCR_LANGUAGES, use_GPU=OCR_USE_GPU) ##### Initialize the translation ##### @@ -57,7 +57,7 @@ async def web_app_main(): logger.debug(f'Current words: {curr_words} Previous words: {prev_words}') ### If the OCR detects different words, translate screen -> to ensure that the screen is not refreshing constantly and to save GPU power - if prev_words != curr_words and not check_similarity(curr_words,prev_words, threshold = IMAGE_CHANGE_THRESHOLD, method="jacard"): + if prev_words != curr_words and not check_similarity(list(curr_words),list(prev_words), threshold = IMAGE_CHANGE_THRESHOLD, method="tfidf"): logger.info('Beginning Translation') to_translate = [entry[1] for entry in ocr_output][:MAX_TRANSLATE] @@ -68,7 +68,7 @@ async def web_app_main(): continue translation = await translate_API_LLM(to_translate, models, call_size = 3) except TypeError as e: - logger.error(f"Failed to translate using API models. Error: {e}. Sleeping for 30 seconds.") + logger.error(f"Failed to translate using API models. Error: {e}. Sleeping for {2*INTERVAL} seconds.") time.sleep(2*INTERVAL) continue logger.debug('Translation complete. Modifying image.')