198 lines
6.7 KiB
Python
198 lines
6.7 KiB
Python
import re, uroman as ur
|
|
from pypinyin import pinyin
|
|
import pyscreenshot as ImageGrab # wayland tings not sure if it will work on other machines alternatively use mss
|
|
import mss, io, os
|
|
from PIL import Image
|
|
import jaconv, MeCab, unidic, pykakasi
|
|
from sklearn.metrics.pairwise import cosine_similarity
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
import numpy as np
|
|
import subprocess
|
|
# for creating furigana

# MeCab tokenizer pointed at the bundled UniDic dictionary; add_furigana()
# relies on its tab/comma-separated output to get per-token readings.
mecab = MeCab.Tagger('-d "{}"'.format(unidic.DICDIR))

# Universal romanizer — fallback transliteration for languages that are not
# handled specially in romanize() (i.e. anything other than Chinese/Japanese).
uroman = ur.Uroman()

# for romanising japanese text. Can convert to hiragana or katakana as well but does not split the words up so harder to use for furigana
kks = pykakasi.kakasi()
|
|
|
|
|
|
# define a function for checking whether one axis of a shape intercepts with another
|
|
def intercepts(x,y):
    """Return True when two 1-D extents overlap.

    Both arguments are two-element tuples giving the low/high ends of a
    segment projected onto a single axis.  Overlap (including mere touching
    at an endpoint) counts as an interception.
    """
    x_lo, x_hi = x
    y_lo, y_hi = y
    # The segments overlap iff some endpoint of one lies inside the other.
    return (any(x_lo <= p <= x_hi for p in (y_lo, y_hi))
            or any(y_lo <= p <= y_hi for p in (x_lo, x_hi)))
|
|
|
|
def is_wayland():
    """Return True when the current session exposes a Wayland display socket."""
    return os.environ.get('WAYLAND_DISPLAY') is not None
|
|
|
|
# please install grim otherwise this is way too slow for wayland
def printsc_wayland(region: tuple, path: str):
    """Capture *region* with grim and write it as a quality-90 JPEG to *path*.

    ``region`` is (left, top, right, bottom) in screen coordinates; grim wants
    "x,y WxH", so the width/height are derived from the corner pair.
    """
    left, top, right, bottom = region
    geometry = f'{left},{top} {right - left}x{bottom - top}'
    subprocess.run(['grim', '-g', geometry, '-t', 'jpeg', '-q', '90', path])
|
|
|
|
def printsc_non_wayland(region: tuple, path: str):
    """Capture *region* with mss on X11/other backends and save it to *path*.

    mss returns raw BGRA pixels; they are wrapped into a PIL RGB image
    (ignoring the padding byte via the "BGRX" raw mode) before saving.
    """
    with mss.mss() as sct:
        shot = sct.grab(region)
        pil_img = Image.frombytes("RGB", shot.size, shot.bgra, "raw", "BGRX")
        pil_img.save(path)
|
|
|
|
|
|
def printsc(region: tuple, path: str):
    """Screenshot *region* to *path*, picking the backend for this session.

    Wayland sessions go through grim (fast); everything else uses mss.
    Failures are reported to stdout rather than raised — capture is
    best-effort.
    """
    try:
        capture = printsc_wayland if is_wayland() else printsc_non_wayland
        capture(region, path)
    except Exception as e:
        print(f'Error {e}')
|
|
|
|
def convert_image_to_bytes(img):
    """Serialise *img* (anything with a PIL-style ``save``) to PNG bytes."""
    buffer = io.BytesIO()
    try:
        img.save(buffer, format='PNG')  # Save the image to the byte stream
        return buffer.getvalue()        # Get the byte representation
    finally:
        buffer.close()
|
|
|
|
|
|
|
|
def bytes_to_image(image_bytes):
    """Rehydrate a PIL image from raw encoded bytes (inverse of
    convert_image_to_bytes)."""
    # PIL reads lazily from the in-memory stream, so just hand it over.
    return Image.open(io.BytesIO(image_bytes))
|
|
|
|
# for japanese
|
|
|
|
def add_furigana(text):
    """Annotate kanji-bearing tokens in *text* with hiragana readings.

    Tokenises with MeCab/UniDic and, for every token that contains at least
    one kanji, appends its reading in parentheses — e.g. 漢字(かんじ).
    Tokens without kanji (kana, punctuation, Latin) are passed through
    unchanged.

    Returns the annotated string.
    """
    # MeCab output ends with an 'EOS' line plus a trailing empty line — drop both.
    parsed = mecab.parse(text).split('\n')[:-2]
    furigana_string = ''
    for line in parsed:
        # Each line is "<surface>\t<comma-separated features>".
        surface = line.split('\t')[0]
        try:
            # Feature field 6 (counting the surface+first-feature chunk as 0)
            # holds the katakana reading; convert it to hiragana.
            # NOTE: double-quoted f-string — reusing the single quote inside a
            # single-quoted f-string is a SyntaxError before Python 3.12.
            reading = f"({jaconv.kata2hira(line.split(',')[6])})"
        except IndexError:
            # Symbols / unknown tokens have no reading field; annotate nothing.
            reading = ''
        furigana_string += surface + (reading if contains_kanji(surface) else '')
    return furigana_string
|
|
|
|
|
|
def contains_kanji(text):
    """True if *text* contains at least one CJK unified ideograph (kanji)."""
    return re.search(r'[\u4E00-\u9FFF]', text) is not None
|
|
|
|
def contains_hiragana(text):
    """True if *text* contains at least one hiragana character."""
    return re.search(r'[\u3040-\u309F]', text) is not None
|
|
|
|
def contains_katakana(text):
    """True if *text* contains at least one katakana character."""
    return re.search(r'[\u30A0-\u30FF]', text) is not None
|
|
|
|
|
|
# use kakasi to romanize japanese text
|
|
def romanize(text, lang):
    """Romanize *text* according to *lang*.

    Chinese ('zh'/'ch_sim'/'ch_tra') goes through pypinyin (first reading of
    each heteronym), Japanese ('ja') through kakasi Hepburn romaji, and
    everything else through the generic uroman transliterator.
    """
    if lang in ('zh', 'ch_sim', 'ch_tra'):
        readings = pinyin(text, heteronym=True)
        return ' '.join(reading[0] for reading in readings)
    if lang == 'ja':
        tokens = kks.convert(text)
        return ' '.join(token['hepburn'] for token in tokens)
    return uroman.romanize_string(text)
|
|
|
|
# check if a string contains words from a language
|
|
def contains_lang(text, lang):
    """Return True if *text* contains any character of language *lang*.

    Supported codes: 'en', 'zh', 'ja', 'ko'.  Detection is a simple Unicode
    range test; note kanji count as Japanese too, so Chinese text also
    matches 'ja'.

    Raises ValueError for any other code.
    """
    # Unicode character class per language; 'ja' covers kana plus kanji.
    ranges = {
        'zh': r'[\u4e00-\u9fff]',
        'ja': r'[\u3040-\u30ff\u4e00-\u9fff]',
        'ko': r'[\uac00-\ud7af]',
        'en': r'[a-zA-Z]',
    }
    if lang not in ranges:
        raise ValueError("Invalid language. Please use one of 'en', 'zh', 'ja', or 'ko'.")
    return bool(re.search(ranges[lang], text))
|
|
|
|
### en, ch_sim, ch_tra, ja, ko rapidocr only has chinese and en at the moment
|
|
def standardize_lang(lang):
    """Map a UI language code onto the codes each backend model expects.

    Accepted codes: 'en', 'ch_sim', 'ch_tra', 'ja', 'ko'.

    Returns a dict with keys 'easyocr_lang', 'paddleocr_lang',
    'rapidocr_lang', 'translation_model_lang' and 'id_model_lang'.
    Raises ValueError for any other code.
    """
    # (easyocr, paddleocr, rapidocr, translation model, language-id model)
    codes_by_lang = {
        'ch_sim': ('ch_sim', 'ch', 'ch', 'zh', 'zh'),
        'ch_tra': ('ch_tra', 'ch', 'ch', 'zh', 'zh'),
        'ja':     ('ja', 'japan', 'ja', 'ja', 'ja'),
        'ko':     ('ko', 'korean', 'ko', 'ko', 'ko'),
        'en':     ('en', 'en', 'en', 'en', 'en'),
    }
    if lang not in codes_by_lang:
        raise ValueError(f"Invalid language {lang}. Please use one of 'en', 'ch_sim', 'ch_tra', 'ja', or 'ko'.")
    easy, paddle, rapid, translation, ident = codes_by_lang[lang]
    return {'easyocr_lang': easy,
            'paddleocr_lang': paddle,
            'rapidocr_lang': rapid,
            'translation_model_lang': translation,
            'id_model_lang': ident}
|
|
|
|
def which_ocr_lang(model):
    """Return the standardize_lang() dict key for the given OCR engine.

    *model* is one of 'easy', 'paddle' or 'rapid'; anything else raises
    ValueError.
    """
    key_by_model = {
        'easy': 'easyocr_lang',
        'paddle': 'paddleocr_lang',
        'rapid': 'rapidocr_lang',
    }
    if model not in key_by_model:
        raise ValueError("Invalid OCR model. Please use one of 'easy', 'paddle', or 'rapid'.")
    return key_by_model[model]
|
|
|
|
def similar_tfidf(list1,list2) -> float:
    """Calculate cosine similarity using TF-IDF vectors.

    Fits a TF-IDF vectorizer over both lists of strings, averages each
    list's vectors into a centroid, and returns the cosine similarity of
    the two centroids.  Empty input on either side yields 0.0.
    """
    if not list1 or not list2:
        return 0.0

    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(list1 + list2).toarray()

    # The first len(list1) rows belong to list1, the rest to list2.
    split = len(list1)
    centroid1 = matrix[:split].mean(axis=0).reshape(1, -1)
    centroid2 = matrix[split:].mean(axis=0).reshape(1, -1)

    return cosine_similarity(centroid1, centroid2)[0, 0]
|
|
|
|
def similar_jacard(list1, list2) -> float:
    """Jaccard similarity of the unique elements of two lists.

    |A ∩ B| / |A ∪ B|, with 0.0 returned when either list is empty.
    """
    if not list1 or not list2:
        return 0.0
    a, b = set(list1), set(list2)
    return len(a & b) / len(a | b)
|
|
|
|
def check_similarity(list1, list2, threshold, method = 'tfidf'):
    """Decide whether two lists of strings are similar enough.

    method='tfidf' compares TF-IDF centroids (strictly above *threshold*);
    method='jacard' compares set overlap (at or above *threshold*).  A
    ValueError from the vectorizer (e.g. vocabulary of only stop words) is
    treated as "similar".  Any other method raises ValueError.
    """
    if method == 'tfidf':
        try:
            confidence = similar_tfidf(list1, list2)
        except ValueError:
            # Vectorizer could not build a vocabulary — err on "similar".
            return True
        return confidence > threshold
    if method == 'jacard':
        return similar_jacard(list1, list2) >= threshold
    raise ValueError("Invalid method. Please use one of 'tfidf' or 'jacard'.")
|
|
|
|
if __name__ == "__main__":
    # Smoke test: romanize a Japanese sentence via kakasi.
    print(romanize(text='世界はひろい', lang='ja'))