
Recognizing Text with a Convolutional Neural Network (CNN)


It has been a long time since I last wrote a blog post, so I'm taking advantage of some recent free time to publish one.
I had a project that needed to:

  • capture a screenshot of the computer screen
  • identify the text in that screenshot

The earlier detection approach was based on:

  • contrast
  • color
  • edge detection

The problem: some images were misclassified as text.

A few days ago, with some spare time, I tried implementing this with a convolutional neural network (CNN) instead.

Generating the training, test, and validation sets

First, take a computer screen capture that contains text, use a tool to find the start and end coordinates of the text regions, and do the same for the image regions. The code below then generates the training, test, and validation sets from those regions.

from random import randint
import numpy as np
from PIL import Image
# This uses a library I wrote myself: [PyImageProcess](https://github.com/Kenneth111/PyImageProcess)
from PyImageProcess.utils_yuv import get_a_frame, save_a_patch

def save_patches(com_y, start, end, path, num):
    l1 = len(start)
    l2 = len(end)
    if l1 != l2:
        raise ValueError("the length of start does not match the length of end")
    mb_size = 16
    for i in range(num):
        # Randomly pick one of the annotated regions
        idx = randint(0, l1 - 1)
        # Randomly pick a 16x16 patch inside that region
        (startx, starty) = start[idx]
        (endx, endy) = end[idx]
        x = randint(startx, endx - mb_size)
        y = randint(starty, endy - mb_size)
        filename = path + ("%d.bmp" % i)
        img = Image.fromarray(np.uint8(com_y[y: y + mb_size, x: x + mb_size]))
        img.save(filename)

def save_text(y, start, end, training = 0):
    if training == 0:
        save_patches(y, start, end, "train\\text\\", 1000)
    elif training == 1:
        save_patches(y, start, end, "val\\text\\", 100)
    else:
        save_patches(y, start, end, "test\\text\\", 200)

def save_img(y, start, end, training = 0):
    if training == 0:
        save_patches(y, start, end, "train\\img\\", 1000)
    elif training == 1:
        save_patches(y, start, end, "val\\img\\", 100)
    else:
        save_patches(y, start, end, "test\\img\\", 200)    

def main():
    mb_size = 16
    height = 1080
    width = 1920
    # The input is a planar YUV444 frame
    filename_yuv = "screen_cursor1.yuv"
    a_frame = get_a_frame( "F:\\" + filename_yuv, height, width, 1)
    # Only the Y (luma) component is used for text detection
    com_y = a_frame[:, 0].reshape(height, width)
    start_i0 = [(1175, 357), (382, 552), (950, 0), (1440, 0)]
    end_i0 = [(1650, 1027), (1650, 1027), (1333, 462), (1804, 928)]    
    save_img(com_y, start_i0, end_i0, 0)
    save_img(com_y, start_i0, end_i0, 1)
    save_img(com_y, start_i0, end_i0, 2)
    start_t0 = [(207, 498), (1, 43), (111, 42), (221, 42), (769, 46), (857, 42), (592, 316), (670, 316), (869, 315),
        (1352, 225), (1339, 317), (1809, 225), (1817, 316), (1809, 498), (27, 681), (118, 953)]
    end_t0 = [(266, 533), (79, 77), (173, 59), (253, 59), (845, 78), (948, 59), (641, 352), (749, 351), (930, 354),
        (1401, 261), (1415, 350), (1893, 259), (1887, 351), (1895, 534), (65, 715), (164, 988)]    
    save_text(com_y, start_t0, end_t0, 0)
    save_text(com_y, start_t0, end_t0, 1)
    save_text(com_y, start_t0, end_t0, 2)

if __name__ == "__main__":
    main()
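
For readers who don't have PyImageProcess installed, here is a minimal sketch of how the Y plane could be read directly with NumPy instead of get_a_frame. It assumes the .yuv file holds a single planar, 8-bit YUV444 frame with the Y plane stored first; that layout is an assumption on my part, not something guaranteed by the library.

import numpy as np

def read_y_plane(filename, height, width):
    # Assumption: one planar YUV444 frame, 8 bits per sample, Y plane stored first.
    with open(filename, "rb") as f:
        y_bytes = f.read(height * width)
    return np.frombuffer(y_bytes, dtype=np.uint8).reshape(height, width)

# e.g. com_y = read_y_plane("F:\\screen_cursor1.yuv", 1080, 1920)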

Building the model and training it

The CNN training and testing code is shown below; it is adapted from the official Keras examples.

import keras
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from keras.callbacks import EarlyStopping, ModelCheckpoint
from PIL import Image
import numpy as np

def CNN():
    input_shape = (16, 16, 1)
    model = Sequential()
    model.add(Conv2D(32, kernel_size=(3, 3),
                 activation='relu',
                 input_shape=input_shape))
    model.add(Conv2D(64, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))    
    return model

def train():
    train_datagen = ImageDataGenerator(rescale= 1./255)
    train_generator = train_datagen.flow_from_directory('train', target_size=(16, 16), color_mode="grayscale", batch_size=64, class_mode='binary')
    val_datagen = ImageDataGenerator(rescale= 1./255)
    val_generator = val_datagen.flow_from_directory('val', target_size=(16, 16), color_mode="grayscale", batch_size=64, class_mode='binary')
    checkpointer = ModelCheckpoint(filepath='CNN_weights.hdf5', verbose=1, save_best_only=True)
    earlystopping = EarlyStopping(patience=5)
    model = CNN()
    model.compile(loss=keras.losses.binary_crossentropy,
                optimizer=keras.optimizers.Adadelta(),
                metrics=['accuracy'])
    model.fit_generator(train_generator, steps_per_epoch = 200, epochs = 50, validation_data=val_generator,
            validation_steps=100, callbacks=[checkpointer, earlystopping])

def test():
    test_datagen = ImageDataGenerator(rescale= 1./255)
    test_generator = test_datagen.flow_from_directory('test', target_size=(16, 16), color_mode="grayscale", batch_size=64, class_mode='binary')
    model = CNN()
    model.compile(loss=keras.losses.binary_crossentropy,
                optimizer=keras.optimizers.Adadelta(),
                metrics=['accuracy'])    
    # The checkpoint only stored the network weights during training, so the model has to be rebuilt and compiled here before the weights are loaded.
    model.load_weights('CNN_weights.hdf5')
    # Save the full model (architecture, weights and optimizer state) so it does not have to be rebuilt as above next time.
    # It can later be reloaded with model = keras.models.load_model('CNN_model.h5')
    model.save('CNN_model.h5')
    # Evaluate on the test set with the generator; returns [loss, accuracy]
    scores = model.evaluate_generator(test_generator)
    print(scores)
    # Also try two individual patches
    txt_img = Image.open('train/text/0.bmp')
    txt_img = np.array(txt_img).reshape(1, 16, 16, 1) * 1. /255
    pred1 = model.predict(txt_img, batch_size=1)
    img_img = Image.open('train/img/0.bmp')
    img_img = np.array(img_img).reshape(1, 16, 16, 1) * 1. / 255
    pred2 = model.predict(img_img, batch_size=1)
    # A score close to 1 means text; close to 0 means image
    print(pred1, pred2)

def main():
    # Uncomment this to train the model
    # train()
    # Run the tests
    test()

if __name__ == "__main__":
    main()
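
Once test() has saved CNN_model.h5, the model can be reused on new 16x16 grayscale patches without rebuilding and recompiling the network. A small usage sketch (the patch path below is only an example):

import keras
import numpy as np
from PIL import Image

# Load the full model (architecture + weights + optimizer state) saved by test().
model = keras.models.load_model('CNN_model.h5')

# Classify one 16x16 grayscale patch; the file path is just an example.
patch = Image.open('test/text/0.bmp')
patch = np.array(patch).reshape(1, 16, 16, 1) / 255.
score = model.predict(patch, batch_size=1)[0, 0]
# A score close to 1 means text, close to 0 means image.
print('text' if score > 0.5 else 'image', score)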

Model performance

The model recognizes almost all high-contrast text and most high-contrast icons (which is what we want). However, it struggles with low-contrast text (for example, black text on a red background in Excel is not recognized as text), which is related to the training samples. Also note that in Excel the model classifies most horizontal and vertical lines as text, and on web pages it classifies some images with black graphics on a white background as text.
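
To apply the classifier to a whole frame, one option is to slide a 16x16 window over the Y plane (non-overlapping blocks here, for simplicity) and threshold the sigmoid output to get a text/non-text map. This is only a sketch of the idea, not the exact code behind the observations above.

import numpy as np

def classify_frame(model, y_plane, mb_size=16, threshold=0.5):
    # Split the Y plane into non-overlapping 16x16 blocks and classify each block.
    h, w = y_plane.shape
    rows, cols = h // mb_size, w // mb_size
    blocks = (y_plane[:rows * mb_size, :cols * mb_size]
              .reshape(rows, mb_size, cols, mb_size)
              .transpose(0, 2, 1, 3)
              .reshape(-1, mb_size, mb_size, 1) / 255.)
    scores = model.predict(blocks, batch_size=256).reshape(rows, cols)
    # True where a block is predicted to contain text.
    return scores > threshold

# Example (with model loaded via keras.models.load_model('CNN_model.h5')):
# text_map = classify_frame(model, com_y)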
