Github - TextBoxes

Github - TextBoxes_plusplus

Paper - TextBoxes: A Fast Text Detector with a Single Deep Neural Network-AAAI2017

Paper - TextBoxes++: A Single-Shot Oriented Scene Text Detector-2018

TextBoxes 和 TextBoxes++ 都是在 SSD 的基础上改进得到的文字检测模型. 这里主要简单测试 TextBoxes 和 TextBoxes++ 的文字检测部分.

1. 网络结构

1.1. TextBoxes

image

1.2. TextBoxes++

image

2. 测试Demo

模型下载:

TextBoxes

TextBoxes++:

2.1. TextBoxes

#!/usr/bin/python3
import numpy as np
import matplotlib.pyplot as plt
import sys
from nms import nms
import cv2


# Location of the TextBoxes checkout. Its 'python' sub-directory is placed
# first on sys.path so that the `import caffe` below picks up the pycaffe
# found there.
caffe_root = '/path/to/TextBoxes/'
sys.path.insert(0, caffe_root+'python')
import caffe

# Run Caffe in GPU mode on device 0.
caffe.set_device(0)
caffe.set_mode_gpu()


class general_textboxes_detector(object):
    """Axis-aligned text detector built on a TextBoxes Caffe model.

    The detector runs the network at one or several input resolutions,
    converts the raw detections to pixel-space boxes, and merges them with
    polygon NMS. Every box is a 9-element list
    ``[x1, y1, x2, y2, x3, y3, x4, y4, score]`` describing the four corners
    of the rectangle followed by its confidence.
    """

    def __init__(self, use_multi_scale=False):
        """Load the ICDAR13 model and configure thresholds and scales.

        Args:
            use_multi_scale: When true, ``predict`` runs the network at
                several input sizes and merges the detections; otherwise a
                single 700x700 pass is used.
        """
        # Not read by any method of this class; kept as public attributes.
        self.net_height = 768
        self.net_width = 768
        prototxt   = './icdar13/deploy.prototxt'
        caffemodel = './icdar13/TextBoxes_icdar13.caffemodel'
        self.net = caffe.Net(prototxt, caffemodel, caffe.TEST)

        # Minimum confidence for a detection to be kept.
        self.det_score_threshold = 0.2
        # Overlap threshold handed to nms().
        self.overlap_threshold = 0.2

        # Each scale is (height, width) of the network input.
        if use_multi_scale:
            # NOTE: an additional (1600, 1600) scale used to be listed here
            # and is currently disabled.
            self.scales = ((300, 300), (700, 700),
                           (700, 500), (700, 300))
        else:
            self.scales = ((700, 700),)

    def get_transformer(self, net_height, net_width):
        """Build the caffe.io.Transformer for a given network input size.

        Args:
            net_height: Height of the 'data' blob.
            net_width: Width of the 'data' blob.

        Returns:
            A transformer that moves channels to the first axis, rescales the
            raw values by 255, reverses the channel order and subtracts the
            per-channel mean (104, 117, 123).
        """
        transformer = caffe.io.Transformer(
            {'data': (1, 3, net_height, net_width)})
        transformer.set_transpose('data', (2, 0, 1))
        transformer.set_mean('data', np.array([104, 117, 123]))
        transformer.set_raw_scale('data', 255)
        transformer.set_channel_swap('data', (2, 1, 0))

        return transformer

    @staticmethod
    def _clamp(value, upper):
        """Limit a pixel coordinate to the closed range [1, upper]."""
        return max(1, min(value, upper))

    def extract_detections(self, detections, img_height, img_width):
        """Convert the raw 'detection_out' blob into pixel-space boxes.

        Args:
            detections: Array indexed as ``[0, 0, k, c]``; for detection
                ``k`` column 2 is the confidence and columns 3..6 are
                xmin, ymin, xmax, ymax, which are multiplied by the image
                size here (i.e. treated as fractions of the image).
            img_height: Height of the original image in pixels.
            img_width: Width of the original image in pixels.

        Returns:
            A list of ``[xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax,
            score]`` entries for every detection whose confidence is at
            least ``self.det_score_threshold``.
        """
        rows = detections[0, 0]
        keep = np.flatnonzero(rows[:, 2] >= self.det_score_threshold)

        max_x = img_width - 1
        max_y = img_height - 1
        dt_results = []
        for row in rows[keep]:
            # Clamp every coordinate on both sides so that a box predicted
            # partly or fully outside the image cannot keep out-of-range
            # corners (previously xmin/ymin had no upper bound and xmax/ymax
            # had no lower bound).
            xmin = self._clamp(int(round(row[3] * img_width)), max_x)
            ymin = self._clamp(int(round(row[4] * img_height)), max_y)
            xmax = self._clamp(int(round(row[5] * img_width)), max_x)
            ymax = self._clamp(int(round(row[6] * img_height)), max_y)
            # Corners in the order: top-left, top-right, bottom-right,
            # bottom-left, followed by the score.
            dt_results.append(
                [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax, row[2]])

        return dt_results

    def apply_quad_nms(self, bboxes):
        """Suppress overlapping boxes and drop exact duplicates.

        Args:
            bboxes: List of 9-element boxes as returned by
                ``extract_detections``.

        Returns:
            The boxes kept by ``nms``, ordered by descending score.
        """
        dt_results = sorted(bboxes, key=lambda box: -float(box[8]))
        nms_flag = nms(dt_results, self.overlap_threshold)

        results = []
        for dt, keep in zip(dt_results, nms_flag):
            # The same box can be produced at several scales; keep one copy.
            if keep and dt not in results:
                results.append(dt)
        return results

    def predict(self, img_file):
        """Detect text in an image file.

        Args:
            img_file: Path of the image to process.

        Returns:
            The list of boxes remaining after NMS, in pixel coordinates of
            the original image.
        """
        img = caffe.io.load_image(img_file)
        img_height, img_width, _ = img.shape

        scales_results = []
        for img_resize_height, img_resize_width in self.scales:
            print("[INFO]Img resize height: {}, Img resize width: {}"
                  .format(img_resize_height, img_resize_width))

            # Resize the input blob for this scale and feed the image in.
            self.net.blobs['data'].reshape(1,
                                           3,
                                           img_resize_height,
                                           img_resize_width)
            transformer = self.get_transformer(
                img_resize_height, img_resize_width)
            transformed_image = transformer.preprocess('data', img)
            self.net.blobs['data'].data[...] = transformed_image

            # Forward pass.
            detections = self.net.forward()['detection_out']

            # Detections are mapped back to the original image size, so the
            # results of all scales share one coordinate system.
            scales_results.extend(self.extract_detections(
                detections, img_height, img_width))

        # apply non-maximum suppression
        return self.apply_quad_nms(scales_results)

    def vis_res(self, img_file, results):
        """Display the image with every detected box outlined in red.

        Args:
            img_file: Path of the image that was processed.
            results: Boxes as returned by ``predict``.
        """
        img_cv2 = cv2.imread(img_file)

        plt.figure(figsize=(10, 8))
        # Reverse the channel axis of the cv2 image before showing it.
        plt.imshow(img_cv2[:, :, ::-1])
        currentAxis = plt.gca()
        for result in results:
            # First eight values are the four (x, y) corners.
            quad = np.array(result[:8]).reshape(4, 2)
            currentAxis.add_patch(
                plt.Polygon(quad,
                            fill=False,
                            edgecolor='r',
                            linewidth=2))
        plt.axis('off')
        plt.show()


def analyze():
    """Run the TextBoxes demo on 'test.jpg' and show the detections."""
    detector = general_textboxes_detector(use_multi_scale=True)

    img_file = 'test.jpg'
    detections = detector.predict(img_file)
    detector.vis_res(img_file, detections)

    print('[INFO]Detection Done.')

    
# Run the demo only when this file is executed as a script.
if __name__ == '__main__':
    analyze()

如:

image

2.2. TextBoxes++

#!/usr/bin/python3
import cv2
import numpy as np
import matplotlib.pyplot as plt
from nms import nms

import sys
# Location of the TextBoxes_plusplus checkout. Its 'python' sub-directory is
# placed first on sys.path so that the `import caffe` below picks up the
# pycaffe found there.
caffe_root = '/path/to/TextBoxes_plusplus/'
sys.path.insert(0, caffe_root + 'python')
import caffe

# Run Caffe in GPU mode on device 0.
caffe.set_device(0)
caffe.set_mode_gpu()


class general_textboxes_detector(object):
    """Quadrilateral text detector built on a TextBoxes++ Caffe model.

    Every box is a 9-element list ``[x1, y1, x2, y2, x3, y3, x4, y4, score]``:
    the four corners of the quadrilateral followed by its confidence.
    """

    def __init__(self):
        """Load the ICDAR15 model and set up a fixed 768x768 preprocessor."""
        self.net_height = 768
        self.net_width = 768
        # Minimum confidence for a detection to be kept.
        self.det_score_threshold = 0.2
        # Overlap threshold handed to nms().
        self.overlap_threshold = 0.2

        input_shape = (1, 3, self.net_height, self.net_width)

        prototxt   = './icdar15/deploy.prototxt'
        caffemodel = './icdar15/model_icdar15.caffemodel'
        self.net = caffe.Net(prototxt, caffemodel, caffe.TEST)
        self.net.blobs['data'].reshape(*input_shape)

        # Channels first, rescale raw values by 255, reverse the channel
        # order and subtract the per-channel mean.
        preprocessor = caffe.io.Transformer({'data': input_shape})
        preprocessor.set_transpose('data', (2, 0, 1))
        preprocessor.set_mean('data', np.array([104, 117, 123]))
        preprocessor.set_raw_scale('data', 255)
        preprocessor.set_channel_swap('data', (2, 1, 0))
        self.transformer = preprocessor

    def extract_detections(self, detections, img_height, img_width):
        """Convert the raw 'detection_out' blob into pixel-space quads.

        Args:
            detections: Array indexed as ``[0, 0, k, c]``; for detection
                ``k`` column 2 is the confidence and columns 7..14 hold
                x1, y1, ..., x4, y4, which are multiplied by the image size
                here (i.e. treated as fractions of the image).
            img_height: Height of the original image in pixels.
            img_width: Width of the original image in pixels.

        Returns:
            A list of ``[x1, y1, x2, y2, x3, y3, x4, y4, score]`` entries
            for every detection whose confidence is at least
            ``self.det_score_threshold``; coordinates are clamped to
            ``[1, size - 1]``.
        """
        rows = detections[0, 0]
        # Keep detections whose confidence reaches det_score_threshold.
        keep = np.flatnonzero(rows[:, 2] >= self.det_score_threshold)

        max_x = img_width - 1
        max_y = img_height - 1
        bboxes = []
        for row in rows[keep]:
            bbox = []
            for corner in range(4):
                px = int(round(row[7 + 2 * corner] * img_width))
                py = int(round(row[8 + 2 * corner] * img_height))
                bbox.append(max(1, min(px, max_x)))
                bbox.append(max(1, min(py, max_y)))
            bbox.append(row[2])
            bboxes.append(bbox)
        return bboxes

    def apply_quad_nms(self, bboxes):
        """Suppress overlapping quads and drop exact duplicates.

        Args:
            bboxes: List of 9-element boxes as returned by
                ``extract_detections``.

        Returns:
            The boxes kept by ``nms``, ordered by descending score.
        """
        ranked = sorted(bboxes, key=lambda box: -float(box[8]))
        keep_flags = nms(ranked, self.overlap_threshold)

        kept = []
        for idx, box in enumerate(ranked):
            if not keep_flags[idx]:
                continue
            if box in kept:
                continue
            kept.append(box)
        return kept

    def predict(self, img_file):
        """Detect text in an image file.

        Args:
            img_file: Path of the image to process.

        Returns:
            The list of quads remaining after NMS, in pixel coordinates of
            the original image.
        """
        img = caffe.io.load_image(img_file)
        self.net.blobs['data'].data[...] = \
            self.transformer.preprocess('data', img)

        img_height, img_width, _ = img.shape
        raw = self.net.forward()['detection_out']

        candidates = self.extract_detections(raw, img_height, img_width)
        # apply non-maximum suppression
        return self.apply_quad_nms(candidates)

    def vis_res(self, img_file, results):
        """Display the image with every detected quad outlined in red.

        Args:
            img_file: Path of the image that was processed.
            results: Boxes as returned by ``predict``.
        """
        img_cv2 = cv2.imread(img_file)

        plt.figure(figsize=(10, 8))
        # Reverse the channel axis of the cv2 image before showing it.
        plt.imshow(img_cv2[:, :, ::-1])
        axes = plt.gca()
        for result in results:
            # First eight values are the four (x, y) corners.
            corners = np.array(result[:8]).reshape(4, 2)
            outline = plt.Polygon(corners,
                                  fill=False,
                                  edgecolor='r',
                                  linewidth=2)
            axes.add_patch(outline)
        plt.axis('off')
        plt.show()


def analyze():
    """Run the TextBoxes++ demo on 'test.jpg' and show the detections."""
    detector = general_textboxes_detector()

    img_file = "test.jpg"
    detections = detector.predict(img_file)
    detector.vis_res(img_file, detections)

    print('[INFO]Detection Done.')


# Run the demo only when this file is executed as a script.
if __name__ == '__main__':
    analyze()

如:

image

从这张图片来看, TextBoxes++ 的检测效果比 TextBoxes 更好.

2.3. TextBoxes NMS 函数 (即上面两个 Demo 中 `from nms import nms` 所导入的 nms 模块)

import numpy as np
import shapely
from shapely.geometry import Polygon,MultiPoint


def polygon_from_list(line):
    """
    Build the convex hull of a quadrilateral given as eight coordinates.

    `line` holds x1, y1, ..., x4, y4 (e.g. the first eight values of a gt or
    dt box); the values are reshaped to four (x, y) points and the convex
    hull of the resulting shapely polygon is returned.
    """
    corners = np.array(line).reshape(4, 2)
    return Polygon(corners).convex_hull


def polygon_iou(list1, list2):
    """
    Overlap ratio between two quadrilaterals given as eight coordinates each.

    The numerator is the intersection area of the two convex hulls. The
    denominator is the area of the convex hull spanned by all eight corner
    points, not the exact union area. Returns 0 when the shapes do not
    intersect, when that hull has zero area, or when shapely raises a
    TopologicalError.
    """
    pts_a = np.array(list1).reshape(4, 2)
    hull_a = Polygon(pts_a).convex_hull
    pts_b = np.array(list2).reshape(4, 2)
    hull_b = Polygon(pts_b).convex_hull

    # Cheap rejection test before the more expensive area computations.
    if not hull_a.intersects(hull_b):
        return 0

    try:
        overlap_area = hull_a.intersection(hull_b).area
        all_points = np.concatenate((pts_a, pts_b))
        enclosing_area = MultiPoint(all_points).convex_hull.area
        if enclosing_area == 0:
            return 0
        return float(overlap_area) / enclosing_area
    # NOTE(review): the exception is looked up via `shapely.geos`; confirm
    # this attribute exists in the installed Shapely version.
    except shapely.geos.TopologicalError:
        print('shapely.geos.TopologicalError occured, iou set to 0')
        return 0

def nms(boxes,overlap):
    """
    Polygon non-maximum suppression.

    Args:
        boxes: Sequence of boxes; the first eight values of each box are the
            corner coordinates x1, y1, ..., x4, y4 and the last value is the
            score.
        overlap: Two boxes compete when polygon_iou() of their corners is
            strictly greater than this value.

    Returns:
        A list of booleans, one per entry of `boxes` and in the same order;
        True means the box is kept.

    Boxes are visited in descending score order. For a competing pair the
    lower-scored box is suppressed; on equal scores the box with the smaller
    polygon area is suppressed (the visited box loses when areas are equal).
    """
    rec_scores = [b[-1] for b in boxes]
    indices = sorted(range(len(rec_scores)),
                     key=lambda k: -rec_scores[k])
    box_num = len(boxes)
    nms_flag = [True] * box_num
    for i in range(box_num):
        ii = indices[i]
        if not nms_flag[ii]:
            continue
        box1 = boxes[ii]
        box1_score = rec_scores[ii]
        for j in range(box_num):
            jj = indices[j]
            if j == i or not nms_flag[jj]:
                continue
            box2 = boxes[jj]
            box2_score = rec_scores[jj]

            if polygon_iou(box1[0:8], box2[0:8]) > overlap:
                if box1_score > box2_score:
                    nms_flag[jj] = False
                elif box1_score == box2_score:
                    # Areas are only needed to break score ties, so the
                    # polygons are built here rather than for every pair.
                    area1 = polygon_from_list(box1[0:8]).area
                    area2 = polygon_from_list(box2[0:8]).area
                    if area1 > area2:
                        nms_flag[jj] = False
                    elif area1 <= area2:
                        # The visited box itself loses; stop comparing it.
                        nms_flag[ii] = False
                        break
    return nms_flag

3. 网络 deploy 结构可视化

利用 Netron 导入 deploy.prototxt 的网络结构.

Github 项目 - 深度网络可视化工具 - Netron - AIUAI

3.1. TextBoxes

image

点击放大.

3.2. TextBoxes++

image

点击放大.

Last modification:April 8th, 2019 at 06:10 pm