import cv2 import numpy as np import json from PIL import Image, ImageDraw, ImageFont from transformers import pipeline from huggingface_hub import from_pretrained_keras def resize_image(img_in,input_height,input_width): return cv2.resize( img_in, ( input_width,input_height) ,interpolation=cv2.INTER_NEAREST) def write_dict_to_json(dictionary, save_path, indent=4): with open(save_path, "w") as outfile: json.dump(dictionary, outfile, indent=indent) def load_json_to_dict(load_path): with open(load_path) as json_file: return json.load(json_file) class OCRD: """ Optical Character Recognition and Document processing class that provides functionalities to preprocess images, detect text lines, perform OCR, and visualize the results. The class utilizes deep learning models for various tasks such as binarization and text line segmentation. It provides comprehensive methods to handle image scaling, prediction, text extraction, and overlaying recognized text on images. Attributes: image (ndarray): The image loaded into memory from the specified path. This image is used across various methods within the class. Methods: __init__(img_path: str): Initializes the OCRD class by loading an image from the specified file path. scale_image(img: ndarray) -> ndarray: Scales an image while maintaining its aspect ratio based on predefined width thresholds. predict(model, img: ndarray) -> ndarray: Uses a specified model to make predictions on the image. This function handles image resizing and segmenting for model input. binarize_image(img: ndarray, binarize_mode: str) -> ndarray: Applies binarization to the image based on the specified mode ('detailed', 'fast', or 'no'). segment_textlines(img: ndarray) -> ndarray: Segments text lines from the binarized image using a pretrained model. extract_filter_and_deskew_textlines(img: ndarray, textline_mask: ndarray, min_pixel_sum: int, median_bounds: tuple) -> (dict, ndarray): Processes an image to extract and correct orientation of text lines based on the provided mask. ocr_on_textlines(textline_images: dict) -> dict: Performs OCR on the extracted text lines and returns the recognized text. create_text_overlay_image(textline_images: dict, textline_preds: dict, img_shape: tuple, font_size: int) -> Image: Creates an image overlay with the recognized text annotations. visualize_model_output(prediction: ndarray, img: ndarray) -> ndarray: Visualizes the model's prediction by overlaying it onto the original image with distinct colors. """ def __init__(self, img_path): self.image = np.array(Image.open(img_path)) def scale_image(self, img): """ Scales an image to have dimensions suitable for neural network inference. Scaling is based on the input width parameter. The new width and height of the image are calculated to maintain the aspect ratio of the original image. Parameters: - img (ndarray): The image to be scaled, expected to be in the form of a numpy array where img.shape[0] is the height and img.shape[1] is the width. Behavior: - If image width is less than 1100, the new width is set to 2000 pixels. The height is adjusted to maintain the aspect ratio. - If image width is between 1100 (inclusive) and 2500 (exclusive), the width remains unchanged and the height is adjusted to maintain the aspect ratio. - If image width is 2500 or more, the width is set to 2000 pixels and the height is similarly adjusted to maintain the aspect ratio. Returns: - img_new (ndarray): A new image array that has been resized according to the specified rules. The aspect ratio of the original image is preserved. Note: - This function assumes that a function `resize_image(img, height, width)` is available and is used to resize the image where `img` is the original image array, `height` is the new height, and `width` is the new width. """ width_early = img.shape[1] if width_early < 1100: img_w_new = 2000 img_h_new = int(img.shape[0] / float(img.shape[1]) * 2000) elif width_early >= 1100 and width_early < 2500: img_w_new = width_early img_h_new = int(img.shape[0] / float(img.shape[1]) * width_early) else: img_w_new = 2000 img_h_new = int(img.shape[0] / float(img.shape[1]) * 2000) img_new = resize_image(img, img_h_new, img_w_new) return img_new def predict(self, model, img): """ Processes an image to predict segmentation outputs using a given model. The function handles image resizing to match the model's input dimensions and ensures that the entire image is processed by segmenting it into patches that the model can handle. The prediction from these patches is then reassembled into a single output image. Parameters: - model (keras.Model): The neural network model used for predicting the image segmentation. The model should have predefined input dimensions (height and width). - img (ndarray): The image to be processed, represented as a numpy array. Returns: - prediction_true (ndarray): An image of the same size as the input image, containing the segmentation prediction with each pixel labeled according to the model's output. Details: - The function first scales the input image according to the model's required input dimensions. If the scaled image is smaller than the model's height or width, it is resized to match exactly. - The function processes the image in overlapping patches to ensure smooth transitions between the segments. These patches are then processed individually through the model. - Predictions from these patches are then stitched together to form a complete output image, ensuring that edge artifacts are minimized by carefully blending the overlapping areas. - This method assumes the availability of `resize_image` function for scaling and resizing operations, respectively. - The output is converted to an 8-bit image before returning, suitable for display or further processing. """ # bitmap output img_height_model=model.layers[len(model.layers)-1].output_shape[1] img_width_model=model.layers[len(model.layers)-1].output_shape[2] img = self.scale_image(img) if img.shape[0] < img_height_model: img = resize_image(img, img_height_model, img.shape[1]) if img.shape[1] < img_width_model: img = resize_image(img, img.shape[0], img_width_model) marginal_of_patch_percent = 0.1 margin = int(marginal_of_patch_percent * img_height_model) width_mid = img_width_model - 2 * margin height_mid = img_height_model - 2 * margin img = img / float(255.0) img = img.astype(np.float16) img_h = img.shape[0] img_w = img.shape[1] prediction_true = np.zeros((img_h, img_w, 3)) nxf = img_w / float(width_mid) nyf = img_h / float(height_mid) nxf = int(nxf) + 1 if nxf > int(nxf) else int(nxf) nyf = int(nyf) + 1 if nyf > int(nyf) else int(nyf) for i in range(nxf): for j in range(nyf): if i == 0: index_x_d = i * width_mid index_x_u = index_x_d + img_width_model else: index_x_d = i * width_mid index_x_u = index_x_d + img_width_model if j == 0: index_y_d = j * height_mid index_y_u = index_y_d + img_height_model else: index_y_d = j * height_mid index_y_u = index_y_d + img_height_model if index_x_u > img_w: index_x_u = img_w index_x_d = img_w - img_width_model if index_y_u > img_h: index_y_u = img_h index_y_d = img_h - img_height_model img_patch = img[index_y_d:index_y_u, index_x_d:index_x_u, :] label_p_pred = model.predict(img_patch.reshape(1, img_patch.shape[0], img_patch.shape[1], img_patch.shape[2]), verbose=0) seg = np.argmax(label_p_pred, axis=3)[0] seg_color = np.repeat(seg[:, :, np.newaxis], 3, axis=2) if i == 0 and j == 0: seg_color = seg_color[0 : seg_color.shape[0] - margin, 0 : seg_color.shape[1] - margin, :] prediction_true[index_y_d + 0 : index_y_u - margin, index_x_d + 0 : index_x_u - margin, :] = seg_color elif i == nxf - 1 and j == nyf - 1: seg_color = seg_color[margin : seg_color.shape[0] - 0, margin : seg_color.shape[1] - 0, :] prediction_true[index_y_d + margin : index_y_u - 0, index_x_d + margin : index_x_u - 0, :] = seg_color elif i == 0 and j == nyf - 1: seg_color = seg_color[margin : seg_color.shape[0] - 0, 0 : seg_color.shape[1] - margin, :] prediction_true[index_y_d + margin : index_y_u - 0, index_x_d + 0 : index_x_u - margin, :] = seg_color elif i == nxf - 1 and j == 0: seg_color = seg_color[0 : seg_color.shape[0] - margin, margin : seg_color.shape[1] - 0, :] prediction_true[index_y_d + 0 : index_y_u - margin, index_x_d + margin : index_x_u - 0, :] = seg_color elif i == 0 and j != 0 and j != nyf - 1: seg_color = seg_color[margin : seg_color.shape[0] - margin, 0 : seg_color.shape[1] - margin, :] prediction_true[index_y_d + margin : index_y_u - margin, index_x_d + 0 : index_x_u - margin, :] = seg_color elif i == nxf - 1 and j != 0 and j != nyf - 1: seg_color = seg_color[margin : seg_color.shape[0] - margin, margin : seg_color.shape[1] - 0, :] prediction_true[index_y_d + margin : index_y_u - margin, index_x_d + margin : index_x_u - 0, :] = seg_color elif i != 0 and i != nxf - 1 and j == 0: seg_color = seg_color[0 : seg_color.shape[0] - margin, margin : seg_color.shape[1] - margin, :] prediction_true[index_y_d + 0 : index_y_u - margin, index_x_d + margin : index_x_u - margin, :] = seg_color elif i != 0 and i != nxf - 1 and j == nyf - 1: seg_color = seg_color[margin : seg_color.shape[0] - 0, margin : seg_color.shape[1] - margin, :] prediction_true[index_y_d + margin : index_y_u - 0, index_x_d + margin : index_x_u - margin, :] = seg_color else: seg_color = seg_color[margin : seg_color.shape[0] - margin, margin : seg_color.shape[1] - margin, :] prediction_true[index_y_d + margin : index_y_u - margin, index_x_d + margin : index_x_u - margin, :] = seg_color prediction_true = prediction_true.astype(np.uint8) return prediction_true def binarize_image(self, img, binarize_mode='detailed'): """ Binarizes an image according to the specified mode. Parameters: - img (ndarray): The input image to be binarized. - binarize_mode (str): The mode of binarization. Can be 'detailed', 'fast', or 'no'. - 'detailed': Uses a pre-trained deep learning model for binarization. - 'fast': Uses OpenCV for a quicker, threshold-based binarization. - 'no': Returns a copy of the original image. Returns: - ndarray: The binarized image. Raises: - ValueError: If an invalid binarize_mode is provided. Description: Depending on the 'binarize_mode', the function processes the image differently: - For 'detailed' mode, it loads a specific model and performs prediction to binarize the image. - For 'fast' mode, it quickly converts the image to grayscale and applies a threshold. - For 'no' mode, it simply returns the original image unchanged. If an unsupported mode is provided, the function raises a ValueError. Note: - The 'detailed' mode requires a pre-trained model from huggingface_hub. - This function depends on OpenCV (cv2) for image processing in 'fast' mode. """ if binarize_mode == 'detailed': model_name = "SBB/eynollah-binarization" model = from_pretrained_keras(model_name) binarized = self.predict(model, img) # Convert from mask to image (letters black) binarized = binarized.astype(np.int8) binarized = -binarized + 1 binarized = (binarized * 255).astype(np.uint8) elif binarize_mode == 'fast': binarized = self.scale_image(img, self.image) binarized = cv2.cvtColor(binarized, cv2.COLOR_BGR2GRAY) _, binarized = cv2.threshold(binarized, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU) binarized = np.repeat(binarized[:, :, np.newaxis], 3, axis=2) elif binarize_mode == 'no': binarized = img.copy() else: accepted_values = ['detailed', 'fast', 'no'] raise ValueError(f"Invalid value provided: {binarize_mode}. Accepted values are: {accepted_values}") binarized = binarized.astype(np.uint8) return binarized def segment_textlines(self, img): ''' ADD DOCUMENTATION! ''' model_name = "SBB/eynollah-textline" model = from_pretrained_keras(model_name) textline_segments = self.predict(model, img) return textline_segments def extract_filter_and_deskew_textlines(self, img, textline_mask, min_pixel_sum=20, median_bounds=(.5, 20)): """ Extracts and deskews text lines from an image based on a provided textline mask. This function identifies text lines, filters out those that do not meet size criteria, calculates their minimum area rectangles, performs perspective transformations to deskew each text line, and handles potential rotations to ensure text lines are presented horizontally. Parameters: - img (numpy.ndarray): The original image from which to extract and deskew text lines. It should be a 3D array. - textline_mask (numpy.ndarray): A binary mask where text lines have been segmented. It should be a 2D array. - min_pixel_sum (int, optional): The minimum number of pixels (area) a connected component must have to be considered a valid text line. If None, no filtering is applied. - median_bounds (tuple, optional): A tuple representing the lower and upper bounds as multipliers for filtering text lines based on the median size of identified text lines. If None, no filtering is applied. Returns: - tuple: - dict: A dictionary containing lists of the extracted and deskewed text line images along with their metadata (center, left side, height, width, and rotation angle of the bounding box). - numpy.ndarray: An image visualization of the filtered text line mask for debugging or analysis. Description: The function first uses connected components to identify potential text lines from the mask. It filters these based on absolute size (min_pixel_sum) and relative size (median_bounds). For each valid text line, it computes a minimum area rectangle, extracts and deskews the bounded region. This includes rotating the text line if it is detected as vertical (taller than wide). Finally, it aggregates the results and provides an image for visualization of the text lines retained after filtering. Notes: - This function assumes the textline_mask is properly segmented and binary (0s for background, 255 for text lines). - Errors in perspective transformation due to incorrect contour extraction or bounding box calculations are handled gracefully, reporting the error but continuing with other text lines. """ num_labels, labels_im = cv2.connectedComponents(textline_mask) # Thresholds for filtering MIN_PIXEL_SUM = min_pixel_sum # absolute filtering MEDIAN_LOWER_BOUND = median_bounds[0] # relative filtering MEDIAN_UPPER_BOUND = median_bounds[1] # relative filtering # Gather masks and their sizes cc_sizes = [] masks = [] labels_im_filtered = labels_im > 0 # for visualizing filtering result for label in range(1, num_labels): # ignore background class mask = np.where(labels_im == label, True, False) if MIN_PIXEL_SUM is None: is_above_min_pixel_sum = True else: is_above_min_pixel_sum = mask.sum() > MIN_PIXEL_SUM if is_above_min_pixel_sum: # dismiss mini segmentations to avoid skewing of median cc_sizes.append(mask.sum()) masks.append(mask) # filter masks by size in relation to median; then calculate contours and min area bounding box for remaining ones rectangles = [] median = np.median(cc_sizes) for mask in masks: mask_sum = mask.sum() if MEDIAN_LOWER_BOUND is None: is_above_lower_media_bound = True else: is_above_lower_media_bound = mask_sum > median*MEDIAN_LOWER_BOUND if MEDIAN_UPPER_BOUND is None: is_below_upper_median_bound = True else: is_below_upper_median_bound = mask_sum < median*MEDIAN_UPPER_BOUND if is_above_lower_media_bound and is_below_upper_median_bound: labels_im_filtered[mask > 0] = False mask = (mask*255).astype(np.uint8) contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) rect = cv2.minAreaRect(contours[0]) if np.prod(rect[1]) > 0: # filter out if height or width = 0 rectangles.append(rect) # Transform (rotated) bounding boxes to horizontal; store together with rotation angle for downstream process re-transform if rectangles: # Filter rectangles and de-skew images textline_images = [] for rect in rectangles: width, height = rect[1] rotation_angle = rect[2] # clarify how to interpret and use rotation angle! # Convert dimensions to integer and ensure they are > 0 width = int(width) height = int(height) # get source and destination points for image transform box = cv2.boxPoints(rect) box = np.intp(box) src_pts = box.astype("float32") dst_pts = np.array([[0, height-1], [0, 0], [width-1, 0], [width-1, height-1]], dtype="float32") try: M = cv2.getPerspectiveTransform(src_pts, dst_pts) warped = cv2.warpPerspective(img, M, (width, height)) # Check and rotate if the text line is taller than wide if height > width: warped = cv2.rotate(warped, cv2.ROTATE_90_CLOCKWISE) temp = height height = width width = temp rotation_angle = 90-rotation_angle center = rect[0] left = center[0] - width//2 textline_images.append((warped, center, left, height, width, rotation_angle)) except cv2.error as e: print(f"Error with warpPerspective: {e}") # cast to dict keys = ['array', 'center', 'left', 'height', 'width', 'rotation_angle'] textline_images = {key: [tup[i] for tup in textline_images] for i, key in enumerate(keys)} num_labels_filtered = len(textline_images['array']) labels_im_filtered = np.repeat(labels_im_filtered[:, :, np.newaxis], 3, axis=2).astype(np.uint8) # 3 color channels for plotting print(f'Kept {num_labels_filtered} of {num_labels} text segments after filtering.') print(f'All segments deleted smaller than {MIN_PIXEL_SUM} pixels (absolute min size).') if MEDIAN_LOWER_BOUND is not None: print(f'All segments deleted smaller than {median*MEDIAN_LOWER_BOUND} pixels (lower median bound).') if MEDIAN_UPPER_BOUND is not None: print(f'All segments deleted bigger than {median*MEDIAN_UPPER_BOUND} pixels (upper median bound).') if MEDIAN_LOWER_BOUND is not None or MEDIAN_UPPER_BOUND is not None: print(f'Median segment size (pixel sum) used for filtering: {int(median)}.') return textline_images, labels_im_filtered def ocr_on_textlines(self, textline_images, model_name="microsoft/trocr-base-handwritten"): """ Processes a list of image arrays using a pre-trained OCR model to extract text. Parameters: - textline_images (dict): A dictionary with a key 'array' that contains a list of image arrays. Each image array represents a line of text that will be processed by the OCR model. - model_name (str): A huggingface model trained for OCR on single text lines Returns: - dict: A dictionary containing a list of extracted text under the key 'preds'. Description: The function initializes the OCR model 'microsoft/trocr-base-handwritten' using Hugging Face's `pipeline` API for image-to-text conversion. Each image in the input list is converted from an array format to a PIL Image, processed by the model, and the text prediction is collected. The progress of image processing is printed every 10 images. The final result is a dictionary with the key 'preds' that holds all text predictions as a list. Note: - This function requires the `transformers` library from Hugging Face and PIL library to run. - Ensure that the model 'microsoft/trocr-base-handwritten' is correctly loaded and the `transformers` library is updated to use the pipeline. """ pipe = pipeline("image-to-text", model=model_name) # Model inference textline_preds = [] len_array = len(textline_images['array']) for i, textline in enumerate(textline_images['array'][:]): if i % 10 == 1: print(f'Processing textline no. {i} of {len_array}') textline = Image.fromarray(textline) textline_preds.append(pipe(textline)) # Convert to dict preds = [pred[0]['generated_text'] for pred in textline_preds] textline_preds_dict = {'preds': preds} return textline_preds_dict def adjust_font_size(self, draw, text, box_width): """ Adjusts the font size to ensure the text fits within a specified width. Parameters: - draw (ImageDraw.Draw): An instance of ImageDraw.Draw used to render the text. - text (str): The text string to be rendered. - box_width (int): The maximum width in pixels that the text should occupy. Returns: - ImageFont: A font object with a size adjusted to fit the text within the specified width. """ for font_size in range(1, 200): # Adjust the range as needed font = ImageFont.load_default(font_size) text_width = draw.textlength(text, font=font) if text_width > box_width: font_size = max(5, int(font_size - 10)) # min font size of 5 return ImageFont.load_default(font_size) # Return the last fitting size return font # Return max size if none exceeded the box def create_text_overlay_image(self, textline_images, textline_preds, img_shape, font_size=-1): """ Creates an image overlay with text annotations based on provided bounding box information and predictions. Parameters: - textline_images (dict): A dictionary containing the bounding box data for each text segment. It should have keys 'left', 'center', 'width', and optionally 'height'. Each key should have a list of values corresponding to each text segment's properties. - textline_preds (dict): A dictionary containing the predicted text segments. It should have a key 'preds' which holds a list of text predictions corresponding to the bounding boxes in textline_images. - img_shape (tuple): A tuple representing the shape of the image where the text is to be drawn. The format should be (height, width). - font_size (int, optional): Specifies the font size for the text. If set to -1 (default), the font size is dynamically adjusted to fit the text within its bounding box width using the `adjust_font_size` function. If a specific integer is provided, it uses that size for all text segments. Returns: - Image: An image object with text drawn over a blank white background. Raises: - AssertionError: If the lengths of the lists in `textline_images` and `textline_preds['preds']` do not correspond, indicating a mismatch in the number of bounding boxes and text predictions. """ for key in textline_images.keys(): assert len(textline_images[key]) == len(textline_preds['preds']), f'Length of {key} and preds doesnt correspond' # Create a blank white image img_gen = Image.new('RGB', (img_shape[1], img_shape[0]), color=(255, 255, 255)) draw = ImageDraw.Draw(img_gen) # Draw each text segment within its bounding box for i in range(len(textline_preds['preds'])): left_x = textline_images['left'][i] center_y = textline_images['center'][i][1] #height = textline_images['height'][i] width = textline_images['width'][i] text = textline_preds['preds'][i] # dynamic or static text size if font_size==-1: font = self.adjust_font_size(draw, text, width) else: font = ImageFont.load_default(font_size) draw.text((left_x, center_y), text, fill=(0, 0, 0), font=font, align='left') return img_gen def visualize_model_output(self, prediction, img): """ Visualizes the output of a model prediction by overlaying predicted classes with distinct colors onto the original image. Parameters: - prediction (ndarray): A 3D array where the first channel holds the class predictions. - img (ndarray): The original image to overlay predictions onto. This should be in the same dimensions or resized accordingly. Returns: - ndarray: An image where the model's predictions are overlaid on the original image using a predefined color map. Description: The function first identifies unique classes present in the prediction's first channel. Each class is assigned a specific color from a predefined dictionary `rgb_colors`. The function then creates an output image where each pixel's color corresponds to the class predicted at that location. The function resizes the original image to match the dimensions of the prediction if necessary. It then blends the original image and the colored prediction output using OpenCV's `addWeighted` method to produce a final image that highlights the model's predictions with transparency. Note: - This function relies on `numpy` for array manipulations and `cv2` for image processing. - Ensure the `rgb_colors` dictionary contains enough colors for all classes your model can predict. - The function assumes `prediction` array's shape is compatible with `img`. """ unique_classes = np.unique(prediction[:,:,0]) rgb_colors = {'0' : [255, 255, 255], '1' : [255, 0, 0], '2' : [255, 125, 0], '3' : [255, 0, 125], '4' : [125, 125, 125], '5' : [125, 125, 0], '6' : [0, 125, 255], '7' : [0, 125, 0], '8' : [125, 125, 125], '9' : [0, 125, 255], '10' : [125, 0, 125], '11' : [0, 255, 0], '12' : [0, 0, 255], '13' : [0, 255, 255], '14' : [255, 125, 125], '15' : [255, 0, 255]} output = np.zeros(prediction.shape) for unq_class in unique_classes: rgb_class_unique = rgb_colors[str(int(unq_class))] output[:,:,0][prediction[:,:,0]==unq_class] = rgb_class_unique[0] output[:,:,1][prediction[:,:,0]==unq_class] = rgb_class_unique[1] output[:,:,2][prediction[:,:,0]==unq_class] = rgb_class_unique[2] img = resize_image(img, output.shape[0], output.shape[1]) output = output.astype(np.int32) img = img.astype(np.int32) #added_image = cv2.addWeighted(img,0.5,output,0.1,0) # orig by eynollah (gives dark image output) added_image = cv2.addWeighted(img,0.8,output,0.2,10) return added_image