From 66bc8f205c3b19f0cc141ded2f139fd22712a54d Mon Sep 17 00:00:00 2001 From: chickenflyshigh Date: Sat, 9 Nov 2024 00:10:43 +1100 Subject: [PATCH] Bug with OPUS model --- README.md | 13 ++++++------- helpers/batching.py | 18 ++++++++++-------- helpers/ocr.py | 11 ++++++----- helpers/translation.py | 5 ++++- helpers/utils.py | 2 +- logging_config.py | 2 +- qt_app.py | 4 +++- 7 files changed, 31 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 45a8bb6..172abe9 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ ## What does this do? -It provides translations from a source language to another language of a specified region on your screen while also providing necessary romanisation (including pinyin and furigana) to provide a guide to pronounciation. The main goal of this is primarily for people that have a low/basic level of understanding of a language to further develop that language by allowing the users to have the tool to allow them to immerse themselves in native content. Main uses of this include but are not limited to: playing games and watching videos with subtitles in another language (although technically it might just be better to obtain an audio transcription, translate and replace the subtitles if possible -- however this is not always feasible if watching many episodes and/or you are watching videos spontaneously). +It continuously provides translations from a source language to another language of a specified region on your screen while also (optionally) providing romanisation (including pinyin and furigana) to provide a guide to pronunciation. The main goal of this is primarily for people that have a low/basic level of understanding of a language to further develop that language by allowing the users to have the tool to allow them to immerse themselves in native content. 
Main uses of this include but are not limited to: playing games and watching videos with subtitles in another language (although technically it might just be better to obtain an audio transcription, translate and replace the subtitles if possible -- however this is not always feasible if watching many episodes and/or you are watching videos spontaneously). ## Limitations -If the `learn` mode is enabled for the app, the added translations and romanisation naturally results in texts taking up three times the space and therefore this is less suitable for texts that contain tightly packed words. You can optionally change the config to insert smaller text or change the overall font size of your screen so there is less text. A pure translation mode also exists, although if it is intended for web browsing, Google itself provides a more reliable method of translation which does not rely on the computationally heavy optical character recognition (OCR). +If the `learn` mode is enabled for the app, the added translations and romanisation naturally results in texts taking up three times the space and therefore this is less suitable for texts that contain tightly packed words. You can optionally change the config to insert smaller text or change the overall font size of your screen so there is less text. A pure translation mode also exists, although if it is intended for web browsing, Google itself provides a more reliable method of translation which does not rely on the computationally heavy optical character recognition (OCR). ## Usage (draft) @@ -54,13 +54,12 @@ Screenshotting is limited in Wayland, and `grim` is one of the more lightweight ## TODO: - Create an overlay window that works in Wayland. -- Make use of the translation data -> maybe make a personalised game that uses +- Make use of the translation data -> maybe make a personalised game that uses the data. +- Provide the option to simplify and automate most of the install process. 
-## Terms of Use +# Terms of Use -By using this application, you agree to the following terms and conditions. - -# Data Collection and External API Use +## Data Collection and External API Use 1.1 Onscreen Data Transmission: The application is designed to send data displayed on your screen, including potentially sensitive or personal information, to an external API if local processing is not setup. diff --git a/helpers/batching.py b/helpers/batching.py index ac17641..1379299 100644 --- a/helpers/batching.py +++ b/helpers/batching.py @@ -435,11 +435,13 @@ def generate_text( return all_generated_texts if __name__ == '__main__': - GROQ_API_KEY = os.getenv('GROQ_API_KEY') - GEMINI_API_KEY = os.getenv('GEMINI_API_KEY') - groq = Groqq('gemma-7b-it', 15, GROQ_API_KEY) - groq.set_lang('zh','en') - gemini = Gemini('gemini-1.5-pro', 15, GEMINI_API_KEY) - gemini.set_lang('zh','en') - print(gemini.translate(['荷兰咯'])) - print(groq.translate(['荷兰咯'])) \ No newline at end of file + from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, M2M100Tokenizer, M2M100ForConditionalGeneration + opus_model = 'Helsinki-NLP/opus-mt-en-zh' + LOCAL_FILES_ONLY = True + tokenizer = AutoTokenizer.from_pretrained(opus_model, local_files_only=LOCAL_FILES_ONLY) + model = AutoModelForSeq2SeqLM.from_pretrained(opus_model, local_files_only=LOCAL_FILES_ONLY, torch_dtype=torch.float16).to(device) + # tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", local_files_only=LOCAL_FILES_ONLY) + # tokenizer.src_lang = "en" + # model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M", local_files_only=LOCAL_FILES_ONLY).to(device) + + print(generate_text([ i.lower().capitalize() for i in ['placeholder','Story','StoRY', 'TufoRIaL', 'CovFfG', 'LoaD DaTA', 'SAME DATa', 'ReTulN@TitIE', 'View', '@niirm', 'SysceM', 'MeNu:', 'MaND', 'CoM', 'SeLEcT', 'Frogguingang', 'Tutorias', 'Back']], model, tokenizer)) \ No newline at end of file diff --git a/helpers/ocr.py b/helpers/ocr.py 
index 7a6f3d0..1033f2e 100644 --- a/helpers/ocr.py +++ b/helpers/ocr.py @@ -87,11 +87,12 @@ def _id_filtered(ocr, image, lang) -> list: def _id_lang(ocr, image, lang) -> list: result = _identify(ocr, image) lang = standardize_lang(lang)['id_model_lang'] - try: - filtered = [entry for entry in result if contains_lang(entry[1], lang)] - except: - logger.error(f"Selected language not part of default: {default_languages}.") - raise ValueError(f"Selected language not part of default: {default_languages}.") + # try: + logger.info(f"Filtering out phrases not in {lang}.") + filtered = [entry for entry in result if contains_lang(entry[1], lang)] + # except: + # logger.error(f"Selected language not part of default: {default_languages}.") + # raise ValueError(f"Selected language not part of default: {default_languages}.") return filtered def id_keep_source_lang(ocr, image, lang) -> list: diff --git a/helpers/translation.py b/helpers/translation.py index 42a851d..830410a 100644 --- a/helpers/translation.py +++ b/helpers/translation.py @@ -143,15 +143,17 @@ def get_OPUS_model(from_lang, target_lang): def init_OPUS(from_lang = 'ch_sim', target_lang = 'en'): opus_model = get_OPUS_model(from_lang, target_lang) + logger.debug(f"OPUS model: {opus_model}") tokenizer = AutoTokenizer.from_pretrained(opus_model, local_files_only=LOCAL_FILES_ONLY) model = AutoModelForSeq2SeqLM.from_pretrained(opus_model, local_files_only=LOCAL_FILES_ONLY, torch_dtype=torch.float16).to(device) model.eval() return (model, tokenizer) def translate_OPUS(text: list[str], model, tokenizer) -> list[str]: - translated_text = generate_text(model,tokenizer, text, + translated_text = generate_text(text, model,tokenizer, batch_size=BATCH_SIZE, device=device, max_length=MAX_INPUT_TOKENS, max_new_tokens=MAX_OUTPUT_TOKENS) + logger.debug(f"Translated text: {translated_text}") return translated_text ############################### @@ -174,6 +176,7 @@ def translate_Seq_LLM(text, model, tokenizer, **kwargs): + text = 
[t.lower().capitalize() for t in text] if model_type == 'opus': return translate_OPUS(text, model, tokenizer) elif model_type == 'm2m': diff --git a/helpers/utils.py b/helpers/utils.py index d872945..a04a1e8 100644 --- a/helpers/utils.py +++ b/helpers/utils.py @@ -28,7 +28,7 @@ def is_wayland(): # please install grim otherwise this is way too slow for wayland def printsc_wayland(region: tuple, path: str): - subprocess.run(['grim','-g', f'{region[0]},{region[1]} {region[2]-region[0]}x{region[3]-region[1]}', '-t', 'jpeg', '-q','100', path]) + subprocess.run(['grim','-g', f'{region[0]},{region[1]} {region[2]-region[0]}x{region[3]-region[1]}', '-t', 'jpeg', '-q','90', path]) def printsc_non_wayland(region: tuple, path: str): # use mss to capture the screen diff --git a/logging_config.py b/logging_config.py index 8d3488e..8208a73 100644 --- a/logging_config.py +++ b/logging_config.py @@ -64,5 +64,5 @@ def setup_logger( print(f"Failed to setup logger: {e}") return None -logger = setup_logger('on_screen_translator', log_file='translate.log', level=logging.DEBUG) +logger = setup_logger('on_screen_translator', log_file='translate.log', level=logging.INFO) diff --git a/qt_app.py b/qt_app.py index 1be4f3d..ab1de15 100644 --- a/qt_app.py +++ b/qt_app.py @@ -71,11 +71,13 @@ class ImageGenerator(QThread): def __init__(self): super().__init__() + printsc(REGION, TEMP_IMG_PATH) self.running = True self.OCR_LANGUAGES = [SOURCE_LANG, 'en'] self.ocr = init_OCR(model=OCR_MODEL, paddle_lang= SOURCE_LANG, easy_languages = self.OCR_LANGUAGES, use_GPU=OCR_USE_GPU) self.ocr_output = id_keep_source_lang(self.ocr, TEMP_IMG_PATH, SOURCE_LANG) self.models = init_API_LLM(SOURCE_LANG, TARGET_LANG) + # self.model, self.tokenizer = init_Seq_LLM(TRANSLATION_MODEL, from_lang =SOURCE_LANG , target_lang = TARGET_LANG) self.runs = 0 self.prev_words = set() self.curr_words = set(get_words(self.ocr_output)) @@ -106,7 +108,7 @@ class ImageGenerator(QThread): logger.info('Beginning Translation') 
to_translate = [entry[1] for entry in self.ocr_output][:MAX_TRANSLATE] - # translation = translate_Seq_LLM(to_translate, model_type = TRANSLATION_MODEL, model = model, tokenizer = tokenizer, from_lang = SOURCE_LANG, target_lang = TARGET_LANG) + # translation = translate_Seq_LLM(to_translate, model_type = TRANSLATION_MODEL, model = self.model, tokenizer = self.tokenizer, from_lang = SOURCE_LANG, target_lang = TARGET_LANG) try: translation = await translate_API_LLM(to_translate, self.models, call_size = 3) except TypeError as e: