"""Wrappers around PaddleOCR, EasyOCR and RapidOCR plus source-language filtering.

Every OCR helper in this module is expected to return a list of entries laid
out as ``(position, text, confidence)``; the ``get_*`` accessors at the bottom
rely on that ordering.
"""
from paddleocr import PaddleOCR
import easyocr
from typing import Optional
from rapidocr_onnxruntime import RapidOCR
import langid
import sys
import os
from utils import contains_lang, standardize_lang
from concurrent.futures import ThreadPoolExecutor

# The parent of this file's directory is put on sys.path right before
# importing logging_config (presumably that is where the module lives), so the
# insert has to stay ahead of the import below.
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
from logging_config import logger  # noqa: E402

# PaddleOCR
# PaddleOCR supports Chinese, English, French, German, Korean and Japanese.
# You can set the parameter `lang` as `ch`, `en`, `fr`, `german`, `korean`, `japan`
# to switch the language model in order.
# The model only needs to be initialised once; that downloads it and loads it into memory.
default_languages = ['en', 'ch', 'ja', 'ko']

# Fallback EasyOCR language list used by init_OCR when none is supplied.
_DEFAULT_EASY_LANGUAGES = ('ch_sim', 'en')


def _paddle_init(paddle_lang, use_angle_cls=False, use_GPU=True, **kwargs):
    """Create a PaddleOCR engine.

    Args:
        paddle_lang: Language code handed to PaddleOCR as ``lang``.
        use_angle_cls: Forwarded to PaddleOCR unchanged.
        use_GPU: Whether PaddleOCR should run on the GPU.
        **kwargs: Extra options forwarded to the ``PaddleOCR`` constructor.

    Returns:
        The initialised ``PaddleOCR`` instance.
    """
    # PaddleOCR spells this option ``use_gpu``; passing ``use_GPU`` left the
    # caller's choice unused. An explicit ``use_gpu`` in kwargs still wins.
    kwargs.setdefault('use_gpu', use_GPU)
    return PaddleOCR(use_angle_cls=use_angle_cls, lang=paddle_lang, **kwargs)


def _paddle_ocr(ocr, image) -> list:
    """Run PaddleOCR on ``image``.

    Returns:
        A list of ``(bounding box, text, confidence)`` tuples, or ``[]`` when
        the first page of the result is not a list (e.g. nothing detected).
    """
    page = ocr.ocr(image, cls=False)[0]
    if not isinstance(page, list):
        return []
    return [(pos, text[0], text[1]) for pos, text in page]


# EasyOCR has support for many languages
def _easy_init(easy_languages: list, use_GPU=True, **kwargs):
    """Create an EasyOCR reader.

    Args:
        easy_languages: Language codes; each is mapped through
            ``standardize_lang(...)['easyocr_lang']`` before being used.
        use_GPU: Passed to ``easyocr.Reader`` as ``gpu``.
        **kwargs: Extra options forwarded to ``easyocr.Reader``.

    Returns:
        The initialised ``easyocr.Reader`` instance.
    """
    langs = [standardize_lang(lang)['easyocr_lang'] for lang in easy_languages]
    return easyocr.Reader(langs, gpu=use_GPU, **kwargs)


def _easy_ocr(ocr, image) -> list:
    """Run EasyOCR on ``image`` and return the output of ``readtext`` as is."""
    return ocr.readtext(image)


# RapidOCR is mostly for Mandarin and some other Asian languages
def _rapid_init(use_GPU=True, **kwargs):
    """Create a RapidOCR engine, passing ``use_GPU`` as ``use_gpu`` plus any extra options."""
    return RapidOCR(use_gpu=use_GPU, **kwargs)


def _rapid_ocr(ocr, image) -> list:
    """Run RapidOCR on ``image`` and return only the detections.

    rapidocr_onnxruntime returns a ``(result, elapse)`` pair where ``result``
    is ``None`` when no text is found. Callers in this module index a flat list
    of ``(box, text, score)`` entries, so the pair is unwrapped here and the
    empty case is normalised to ``[]``.
    """
    output = ocr(image)
    detections = output[0] if isinstance(output, tuple) else output
    if not detections:
        return []
    return list(detections)


### Initialize the OCR model
def init_OCR(model='paddle', easy_languages: Optional[list] = None, paddle_lang: Optional[str] = 'ch', use_GPU=True, **kwargs):
    """Initialise one of the supported OCR engines.

    Args:
        model: ``'paddle'``, ``'easy'`` or ``'rapid'``.
        easy_languages: Languages for EasyOCR; defaults to ``['ch_sim', 'en']``
            when omitted or ``None``. Only used when ``model == 'easy'``.
        paddle_lang: Language for PaddleOCR. Only used when ``model == 'paddle'``.
        use_GPU: Whether the engine should use the GPU.
        **kwargs: Extra options forwarded to the selected engine's constructor.

    Returns:
        The initialised engine instance.

    Raises:
        ValueError: If ``model`` is not one of the supported names.
    """
    if model == 'paddle':
        return _paddle_init(paddle_lang=paddle_lang, use_GPU=use_GPU, **kwargs)
    if model == 'easy':
        if easy_languages is None:
            easy_languages = list(_DEFAULT_EASY_LANGUAGES)
        return _easy_init(easy_languages=easy_languages, use_GPU=use_GPU, **kwargs)
    if model == 'rapid':
        return _rapid_init(use_GPU=use_GPU, **kwargs)

    # Previously an unknown name fell through and returned None, which only
    # surfaced later as an "Invalid OCR model" error inside _identify.
    message = f"Unknown OCR model {model!r}. Expected 'paddle', 'easy' or 'rapid'."
    logger.error(message)
    raise ValueError(message)


### Perform OCR on the image
def _identify(ocr, image) -> list:
    """Dispatch ``image`` to the helper matching the type of ``ocr``.

    Raises:
        ValueError: If ``ocr`` is not a PaddleOCR, EasyOCR or RapidOCR instance.
    """
    if isinstance(ocr, PaddleOCR):
        return _paddle_ocr(ocr, image)
    if isinstance(ocr, easyocr.Reader):
        return _easy_ocr(ocr, image)
    if isinstance(ocr, RapidOCR):
        return _rapid_ocr(ocr, image)
    raise ValueError("Invalid OCR model. Please initialise the OCR model first with init() and pass it as an argument to _identify().")


### Filter out the results that are not in the source language. Slower but for a wider range of languages
# Known to be not working, and not very reliable either, so it is only used as a fallback.
def _id_filtered(ocr, image, lang) -> list:
    """Keep only the OCR entries whose text ``langid`` classifies as ``lang``.

    ``lang`` is first mapped through ``standardize_lang(...)['id_model_lang']``.
    NOTE(review): langid reports ISO 639-1 codes; confirm ``id_model_lang``
    uses the same codes, otherwise nothing will ever match.
    """
    lang = standardize_lang(lang)['id_model_lang']
    result = _identify(ocr, image)

    ### Parallelise since langid is slow
    def classify_text(entry):
        return entry if langid.classify(entry[1])[0] == lang else None

    with ThreadPoolExecutor() as executor:
        # filter(None, ...) drops the entries classify_text rejected.
        matching = list(filter(None, executor.map(classify_text, result)))
    return matching


# ch_sim, ch_tra, ja, ko, en
def _id_lang(ocr, image, lang) -> list:
    """Keep only the OCR entries whose text passes ``contains_lang`` for ``lang``.

    Raises:
        ValueError: If ``contains_lang`` fails for the requested language (or
            if ``_identify`` rejects ``ocr``). The original error is chained
            as the cause.
    """
    result = _identify(ocr, image)
    lang = standardize_lang(lang)['id_model_lang']
    try:
        return [entry for entry in result if contains_lang(entry[1], lang)]
    except Exception as err:
        # Was a bare ``except:``, which also trapped KeyboardInterrupt and
        # SystemExit and discarded the underlying cause.
        message = f"Selected language not part of default: {default_languages}."
        logger.error(message)
        raise ValueError(message) from err


def id_keep_source_lang(ocr, image, lang) -> list:
    """Run OCR on ``image`` and keep only the entries in the source language.

    Tries ``_id_lang`` first; on ``ValueError`` it falls back to
    ``_id_filtered``. If the fallback fails as well, the error is printed and
    the process exits with status 1.
    """
    try:
        return _id_lang(ocr, image, lang)
    except ValueError:
        try:
            return _id_filtered(ocr, image, lang)
        except Exception as e:
            print(f'Probably an issue with the _id_filtered function. {e}')
            sys.exit(1)


def get_words(ocr_output) -> list:
    """Return the text (index 1) of every OCR entry."""
    return [entry[1] for entry in ocr_output]


def get_positions(ocr_output) -> list:
    """Return the position (index 0) of every OCR entry."""
    return [entry[0] for entry in ocr_output]


def get_confidences(ocr_output) -> list:
    """Return the confidence (index 2) of every OCR entry."""
    return [entry[2] for entry in ocr_output]


if __name__ == '__main__':
    # OCR_languages = ['ch_sim','en']
    # image_old = '/home/James/Pictures/Screenshots/DP-1.jpg'
    # reader = easyocr.Reader(OCR_languages, gpu=True) # this needs to run only once to load the model into memory
    # result = reader.readtext(image_old)
    # print(result)
    # NOTE(review): the comment near the top of this file lists `ch` as
    # PaddleOCR's code for Chinese; confirm that 'zh' is actually accepted.
    print(id_keep_source_lang(init_OCR(model='paddle', paddle_lang='zh', easy_languages=['en', 'ch_sim']), '/home/James/Pictures/Screenshots/DP-1.jpg', 'ch_sim'))