欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页  >  IT编程

python ocr text detection 数据增强

程序员文章站 2022-04-22 12:11:38
def augmentation(im: np.ndarray, text_polys: np.ndarray, scales: np.ndarray, degrees: int, input_size: int) -> tuple: # the images are rescaled with ratio {0.5, 1.0, 2.0, 3.0} randomly im, text_polys = data_aug.random_scale(im, text_polys, scal....
def augmentation(im: np.ndarray, text_polys: np.ndarray, scales: np.ndarray, degrees: int, input_size: int) -> tuple:
    # the images are rescaled with ratio {0.5, 1.0, 2.0, 3.0} randomly
    im, text_polys = data_aug.random_scale(im, text_polys, scales)
    # the images are horizontally fliped and rotated in range [−10◦, 10◦] randomly
    if random.random() < 0.5:
        im, text_polys = data_aug.horizontal_flip(im, text_polys)
    if random.random() < 0.5:
        im, text_polys = data_aug.random_rotate_img_bbox(im, text_polys, degrees)
    # 640 × 640 random samples are cropped from the transformed images
    # im, text_polys = data_aug.random_crop_img_bboxes(im, text_polys)

    # im, text_polys = data_aug.resize(im, text_polys, input_size, keep_ratio=False)
    # im, text_polys = data_aug.random_crop_image_pse(im, text_polys, input_size)

    return im, text_polys
# -*- coding: utf-8 -*-
# @Time    : 2019/1/12 13:06

import cv2
import numbers
import math
import random
import numpy as np
from skimage.util import random_noise


def show_pic(img, bboxes=None, name='pic'):
    '''
    输入:
        img:图像array
        bboxes:图像的所有boudning box list, 格式为[[x_min, y_min, x_max, y_max]....]
        names:每个box对应的名称
    '''
    show_img = img.copy()
    if not isinstance(bboxes, np.ndarray):
        bboxes = np.array(bboxes)
    for point in bboxes.astype(np.int):
        cv2.line(show_img, tuple(point[0]), tuple(point[1]), (255, 0, 0), 2)
        cv2.line(show_img, tuple(point[1]), tuple(point[2]), (255, 0, 0), 2)
        cv2.line(show_img, tuple(point[2]), tuple(point[3]), (255, 0, 0), 2)
        cv2.line(show_img, tuple(point[3]), tuple(point[0]), (255, 0, 0), 2)
    # cv2.namedWindow(name, 0)  # 1表示原图
    # cv2.moveWindow(name, 0, 0)
    # cv2.resizeWindow(name, 1200, 800)  # 可视化的图片大小
    cv2.imshow(name, show_img)


# 图像均为cv2读取
class DataAugment():
    def __init__(self):
        pass

    def add_noise(self, im: np.ndarray):
        """
        对图片加噪声
        :param img: 图像array
        :return: 加噪声后的图像array,由于输出的像素是在[0,1]之间,所以得乘以255
        """
        return (random_noise(im, mode='gaussian', clip=True) * 255).astype(im.dtype)

    def random_scale(self, im: np.ndarray, text_polys: np.ndarray, scales: np.ndarray or list) -> tuple:
        """
        从scales中随机选择一个尺度,对图片和文本框进行缩放
        :param im: 原图
        :param text_polys: 文本框
        :param scales: 尺度
        :return: 经过缩放的图片和文本
        """
        tmp_text_polys = text_polys.copy()
        rd_scale = float(np.random.choice(scales))
        im = cv2.resize(im, dsize=None, fx=rd_scale, fy=rd_scale)
        tmp_text_polys *= rd_scale
        return im, tmp_text_polys

    def random_rotate_img_bbox(self, img, text_polys, degrees: numbers.Number or list or tuple or np.ndarray,
                               same_size=False):
        """
        从给定的角度中选择一个角度,对图片和文本框进行旋转
        :param img: 图片
        :param text_polys: 文本框
        :param degrees: 角度,可以是一个数值或者list
        :param same_size: 是否保持和原图一样大
        :return: 旋转后的图片和角度
        """
        if isinstance(degrees, numbers.Number):
            if degrees < 0:
                raise ValueError("If degrees is a single number, it must be positive.")
            degrees = (-degrees, degrees)
        elif isinstance(degrees, list) or isinstance(degrees, tuple) or isinstance(degrees, np.ndarray):
            if len(degrees) != 2:
                raise ValueError("If degrees is a sequence, it must be of len 2.")
            degrees = degrees
        else:
            raise Exception('degrees must in Number or list or tuple or np.ndarray')
        # ---------------------- 旋转图像 ----------------------
        w = img.shape[1]
        h = img.shape[0]
        angle = np.random.uniform(degrees[0], degrees[1])

        if same_size:
            nw = w
            nh = h
        else:
            # 角度变弧度
            rangle = np.deg2rad(angle)
            # 计算旋转之后图像的w, h
            nw = (abs(np.sin(rangle) * h) + abs(np.cos(rangle) * w))
            nh = (abs(np.cos(rangle) * h) + abs(np.sin(rangle) * w))
        # 构造仿射矩阵
        rot_mat = cv2.getRotationMatrix2D((nw * 0.5, nh * 0.5), angle, 1)
        # 计算原图中心点到新图中心点的偏移量
        rot_move = np.dot(rot_mat, np.array([(nw - w) * 0.5, (nh - h) * 0.5, 0]))
        # 更新仿射矩阵
        rot_mat[0, 2] += rot_move[0]
        rot_mat[1, 2] += rot_move[1]
        # 仿射变换
        rot_img = cv2.warpAffine(img, rot_mat, (int(math.ceil(nw)), int(math.ceil(nh))), flags=cv2.INTER_LANCZOS4)

        # ---------------------- 矫正bbox坐标 ----------------------
        # rot_mat是最终的旋转矩阵
        # 获取原始bbox的四个中点,然后将这四个点转换到旋转后的坐标系下
        rot_text_polys = list()
        for bbox in text_polys:
            point1 = np.dot(rot_mat, np.array([bbox[0, 0], bbox[0, 1], 1]))
            point2 = np.dot(rot_mat, np.array([bbox[1, 0], bbox[1, 1], 1]))
            point3 = np.dot(rot_mat, np.array([bbox[2, 0], bbox[2, 1], 1]))
            point4 = np.dot(rot_mat, np.array([bbox[3, 0], bbox[3, 1], 1]))
            rot_text_polys.append([point1, point2, point3, point4])
        return rot_img, np.array(rot_text_polys, dtype=np.float32)

    def random_crop_img_bboxes(self, im: np.ndarray, text_polys: np.ndarray, max_tries=50) -> tuple:
        """
        从图片中裁剪出 cropsize大小的图片和对应区域的文本框
        :param im: 图片
        :param text_polys: 文本框
        :param max_tries: 最大尝试次数
        :return: 裁剪后的图片和文本框
        """
        h, w, _ = im.shape
        pad_h = h // 10
        pad_w = w // 10
        h_array = np.zeros((h + pad_h * 2), dtype=np.int32)
        w_array = np.zeros((w + pad_w * 2), dtype=np.int32)
        for poly in text_polys:
            poly = np.round(poly, decimals=0).astype(np.int32)  # 四舍五入取整
            minx = np.min(poly[:, 0])
            maxx = np.max(poly[:, 0])
            w_array[minx + pad_w:maxx + pad_w] = 1  # 将文本区域的在w_array上设为1,表示x轴方向上这部分位置有文本
            miny = np.min(poly[:, 1])
            maxy = np.max(poly[:, 1])
            h_array[miny + pad_h:maxy + pad_h] = 1  # 将文本区域的在h_array上设为1,表示y轴方向上这部分位置有文本
        # 在两个轴上 拿出背景位置去进行随机的位置选择,避免选择的区域穿过文本
        h_axis = np.where(h_array == 0)[0]
        w_axis = np.where(w_array == 0)[0]
        if len(h_axis) == 0 or len(w_axis) == 0:
            # 整张图全是文本的情况下,直接返回
            return im, text_polys
        for i in range(max_tries):
            xx = np.random.choice(w_axis, size=2)
            # 对选择区域进行边界控制
            xmin = np.min(xx) - pad_w
            xmax = np.max(xx) - pad_w
            xmin = np.clip(xmin, 0, w - 1)
            xmax = np.clip(xmax, 0, w - 1)
            yy = np.random.choice(h_axis, size=2)
            ymin = np.min(yy) - pad_h
            ymax = np.max(yy) - pad_h
            ymin = np.clip(ymin, 0, h - 1)
            ymax = np.clip(ymax, 0, h - 1)
            if xmax - xmin < 0.1 * w or ymax - ymin < 0.1 * h:
                # 选择的区域过小
                # area too small
                continue
            if text_polys.shape[0] != 0:  # 这个判断不知道干啥的
                poly_axis_in_area = (text_polys[:, :, 0] >= xmin) & (text_polys[:, :, 0] <= xmax) \
                                    & (text_polys[:, :, 1] >= ymin) & (text_polys[:, :, 1] <= ymax)
                selected_polys = np.where(np.sum(poly_axis_in_area, axis=1) == 4)[0]
            else:
                selected_polys = []
            if len(selected_polys) == 0:
                # 区域内没有文本
                continue
            im = im[ymin:ymax + 1, xmin:xmax + 1, :]
            polys = text_polys[selected_polys]
            # 坐标调整到裁剪图片上
            polys[:, :, 0] -= xmin
            polys[:, :, 1] -= ymin
            return im, polys
        return im, text_polys

    def random_crop_image_pse(self, im: np.ndarray, text_polys: np.ndarray, input_size) -> tuple:
        """
        从图片中裁剪出 cropsize大小的图片和对应区域的文本框
        :param im: 图片
        :param text_polys: 文本框
        :param input_size: 输出图像大小
        :return: 裁剪后的图片和文本框
        """
        h, w, _ = im.shape
        short_edge = min(h, w)
        if short_edge < input_size:
            # 保证短边 >= inputsize
            scale = input_size / short_edge
            im = cv2.resize(im, dsize=None, fx=scale, fy=scale)
            text_polys *= scale
            h, w, _ = im.shape
        # 计算随机范围
        w_range = w - input_size
        h_range = h - input_size
        for _ in range(50):
            xmin = random.randint(0, w_range)
            ymin = random.randint(0, h_range)
            xmax = xmin + input_size
            ymax = ymin + input_size
            if text_polys.shape[0] != 0:
                selected_polys = []
                for poly in text_polys:
                    if poly[:, 0].max() < xmin or poly[:, 0].min() > xmax or \
                            poly[:, 1].max() < ymin or poly[:, 1].min() > ymax:
                        continue
                    # area_p = cv2.contourArea(poly)
                    poly[:, 0] -= xmin
                    poly[:, 1] -= ymin
                    poly[:, 0] = np.clip(poly[:, 0], 0, input_size)
                    poly[:, 1] = np.clip(poly[:, 1], 0, input_size)
                    # rect = cv2.minAreaRect(poly)
                    # area_n = cv2.contourArea(poly)
                    # h1, w1 = rect[1]
                    # if w1 < 10 or h1 < 10 or area_n / area_p < 0.5:
                    #     continue
                    selected_polys.append(poly)
            else:
                selected_polys = []
            # if len(selected_polys) == 0:
                # 区域内没有文本
                # continue
            im = im[ymin:ymax, xmin:xmax, :]
            polys = np.array(selected_polys)
            return im, polys
        return im, text_polys

    def random_crop_author(self,imgs, img_size):
        h, w = imgs[0].shape[0:2]
        th, tw = img_size
        if w == tw and h == th:
            return imgs
        ttt=random.random()
        # print(ttt)
        # label中存在文本实例,并且按照概率进行裁剪
        if np.max(imgs[1][:,:,-1]) > 0 and random.random() > 3.0 / 8.0:
            # 文本实例的top left点
            tl = np.min(np.where(imgs[1][:,:,-1] > 0), axis=1) - img_size
            tl[tl < 0] = 0
            # 文本实例的 bottom right 点
            br = np.max(np.where(imgs[1][:,:,-1] > 0), axis=1) - img_size
            br[br < 0] = 0
            # 保证选到右下角点是,有足够的距离进行crop
            br[0] = min(br[0], h - th)
            br[1] = min(br[1], w - tw)
            for _ in range(50000):
                i = random.randint(tl[0], br[0])
                j = random.randint(tl[1], br[1])
                # 保证最小的图有文本
                if imgs[1][:,:,0][i:i + th, j:j + tw].sum() <= 0:
                    continue
                else:
                    break
        else:
            i = random.randint(0, h - th)
            j = random.randint(0, w - tw)

        # return i, j, th, tw
        for idx in range(len(imgs)):
            if len(imgs[idx].shape) == 3:
                imgs[idx] = imgs[idx][i:i + th, j:j + tw, :]
            else:
                imgs[idx] = imgs[idx][i:i + th, j:j + tw]
        return imgs

    def resize(self, im: np.ndarray, text_polys: np.ndarray,
               input_size: numbers.Number or list or tuple or np.ndarray, keep_ratio: bool = False) -> tuple:
        """
        对图片和文本框进行resize
        :param im: 图片
        :param text_polys: 文本框
        :param input_size: resize尺寸,数字或者list的形式,如果为list形式,就是[w,h]
        :param keep_ratio: 是否保持长宽比
        :return: resize后的图片和文本框
        """
        if isinstance(input_size, numbers.Number):
            if input_size < 0:
                raise ValueError("If input_size is a single number, it must be positive.")
            input_size = (input_size, input_size)
        elif isinstance(input_size, list) or isinstance(input_size, tuple) or isinstance(input_size, np.ndarray):
            if len(input_size) != 2:
                raise ValueError("If input_size is a sequence, it must be of len 2.")
            input_size = (input_size[0], input_size[1])
        else:
            raise Exception('input_size must in Number or list or tuple or np.ndarray')
        if keep_ratio:
            # 将图片短边pad到和长边一样
            h, w, c = im.shape
            max_h = max(h, input_size[0])
            max_w = max(w, input_size[1])
            im_padded = np.zeros((max_h, max_w, c), dtype=np.uint8)
            im_padded[:h, :w] = im.copy()
            im = im_padded
        text_polys = text_polys.astype(np.float32)
        h, w, _ = im.shape
        im = cv2.resize(im, input_size)
        w_scale = input_size[0] / float(w)
        h_scale = input_size[1] / float(h)
        text_polys[:, :, 0] *= w_scale
        text_polys[:, :, 1] *= h_scale
        return im, text_polys

    def horizontal_flip(self, im: np.ndarray, text_polys: np.ndarray) -> tuple:
        """
        对图片和文本框进行水平翻转
        :param im: 图片
        :param text_polys: 文本框
        :return: 水平翻转之后的图片和文本框
        """
        flip_text_polys = text_polys.copy()
        flip_im = cv2.flip(im, 1)
        h, w, _ = flip_im.shape
        flip_text_polys[:, :, 0] = w - flip_text_polys[:, :, 0]
        return flip_im, flip_text_polys

    def vertical_flip(self, im: np.ndarray, text_polys: np.ndarray) -> tuple:
        """
         对图片和文本框进行竖直翻转
        :param im: 图片
        :param text_polys: 文本框
        :return: 竖直翻转之后的图片和文本框
        """
        flip_text_polys = text_polys.copy()
        flip_im = cv2.flip(im, 0)
        h, w, _ = flip_im.shape
        flip_text_polys[:, :, 1] = h - flip_text_polys[:, :, 1]
        return flip_im, flip_text_polys

    def test(self, im: np.ndarray, text_polys: np.ndarray):
        print('随机尺度缩放')
        t_im, t_text_polys = self.random_scale(im, text_polys, [0.5, 1, 2, 3])
        print(t_im.shape, t_text_polys.dtype)
        show_pic(t_im, t_text_polys, 'random_scale')

        print('随机旋转')
        t_im, t_text_polys = self.random_rotate_img_bbox(im, text_polys, 10)
        print(t_im.shape, t_text_polys.dtype)
        show_pic(t_im, t_text_polys, 'random_rotate_img_bbox')

        print('随机裁剪')
        t_im, t_text_polys = self.random_crop_img_bboxes(im, text_polys)
        print(t_im.shape, t_text_polys.dtype)
        show_pic(t_im, t_text_polys, 'random_crop_img_bboxes')

        print('水平翻转')
        t_im, t_text_polys = self.horizontal_flip(im, text_polys)
        print(t_im.shape, t_text_polys.dtype)
        show_pic(t_im, t_text_polys, 'horizontal_flip')

        print('竖直翻转')
        t_im, t_text_polys = self.vertical_flip(im, text_polys)
        print(t_im.shape, t_text_polys.dtype)
        show_pic(t_im, t_text_polys, 'vertical_flip')
        show_pic(im, text_polys, 'vertical_flip_ori')

        print('加噪声')
        t_im = self.add_noise(im)
        print(t_im.shape)
        show_pic(t_im, text_polys, 'add_noise')
        show_pic(im, text_polys, 'add_noise_ori') 

本文地址:https://blog.csdn.net/u012483097/article/details/108241481

相关标签: python 数据增强