# Introduction to YOLO v3 algorithm

### Data preprocessing

Input picture dimension: (416, 416, 3)

Input picture label: $[(x_1, y_1, x_2, y_2, class{\_}index), (x_1, y_1, x_2, y_2, class{\_}index), \ldots, (x_1, y_1, x_2, y_2, class{\_}index)]$ represents all the real (ground-truth) boxes marked in the picture, where $class{\_}index$ is the category of the corresponding box, $(x_1, y_1)$ is the coordinate of the upper left corner of the box, and $(x_2, y_2)$ is the coordinate of the lower right corner of the box.

YOLO v3 has 9 anchor boxes in total, 3 for each detector. The anchor boxes in YOLO v3 are determined by clustering all real boxes in the training set. The clustering distance is defined through IoU: the larger the IoU, the smaller the distance: $$d(\text{box}, \text{centroid}) = 1 - \operatorname{IoU}(\text{box}, \text{centroid})$$ IoU is defined as shown in the following figure:

The most important step in preprocessing is to convert the image annotation into the output format of the model. First, determine which anchor box corresponds to each real box (the anchor box that has the largest IoU with the box), and then write the information of the box at the position of that anchor box.

########### The following code is for illustration only and does not consider performance and code structure ##########
# Converts the ground-truth boxes of ONE picture (`bboxes`, supplied by the
# caller; each entry is (x1, y1, x2, y2, class_index)) into the training
# targets of the three detectors:
#   label[i]       -> (size, size, 3, 85): xywh (4) + objectness (1) + one-hot class (80)
#   bboxes_xywh[i] -> (max_bbox_per_scale, 4): (xc, yc, w, h) of the boxes assigned to detector i
train_output_sizes = [52, 26, 13]
label = [np.zeros((size, size, 3, 85)) for size in train_output_sizes]
# Integer counters: they are used directly as row indices into bboxes_xywh.
bboxes_count = np.zeros((3,), dtype=int)
max_bbox_per_scale = 150  # maximum number of real boxes stored per detector
bboxes_xywh = [np.zeros((max_bbox_per_scale, 4)) for _ in range(3)]
# Default (width, height) of the 9 YOLO v3 anchor boxes, 3 per detector,
# listed in the same order as train_output_sizes.
anchors = [[(10, 13), (16, 30), (33, 23)], [(30, 61), (62, 45), (59, 119)], [(116, 90), (156, 198), (373, 326)]]
# bboxes: all the real boxes marked in one picture
for bbox in bboxes:
    # np.asarray lets tuples/lists work with the slice arithmetic below.
    bbox_coor = np.asarray(bbox[:4], dtype=float)
    # The class id is used as an index, so it must be an integer.
    bbox_class_ind = int(bbox[4])
    # One-hot encoding of the class (np.float was removed from NumPy; use float).
    onehot = np.zeros(80, dtype=float)
    onehot[bbox_class_ind] = 1.0
    # Convert the box coordinates (x1, y1, x2, y2) to (xc, yc, width, height)
    bbox_xywh = np.concatenate([(bbox_coor[2:] + bbox_coor[:2]) * 0.5, bbox_coor[2:] - bbox_coor[:2]], axis=-1)
    # Find the anchor box that has the largest IoU with the box. Only widths
    # and heights are compared, i.e. box and anchor are treated as aligned.
    iou = []
    for anchors_detector in anchors:
        for anchor in anchors_detector:
            intersection = min(bbox_xywh[2], anchor[0]) * min(bbox_xywh[3], anchor[1])
            box_area = bbox_xywh[2] * bbox_xywh[3]
            anchor_area = anchor[0] * anchor[1]
            iou.append(intersection / (box_area + anchor_area - intersection))
    anchor_idx = int(np.argmax(iou))
    # Map the flat anchor index to (detector, anchor within that detector)
    best_detect, best_anchor = divmod(anchor_idx, 3)
    grid_size = train_output_sizes[best_detect]
    scale = int(416 / grid_size)
    # Clamp so that a box centre lying exactly on the right/bottom image
    # border still falls into the last grid cell.
    xind = min(int(bbox_xywh[0] / scale), grid_size - 1)
    yind = min(int(bbox_xywh[1] / scale), grid_size - 1)
    label[best_detect][yind, xind, best_anchor, 0:4] = bbox_xywh
    label[best_detect][yind, xind, best_anchor, 4:5] = 1.0
    label[best_detect][yind, xind, best_anchor, 5:] = onehot
    # Store the box; wrap around instead of overflowing the fixed-size buffer
    # when a detector receives more than max_bbox_per_scale boxes.
    bbox_ind = bboxes_count[best_detect] % max_bbox_per_scale
    bboxes_xywh[best_detect][bbox_ind, :4] = bbox_xywh
    bboxes_count[best_detect] += 1
label_sbbox, label_mbbox, label_lbbox = label
sbboxes, mbboxes, lbboxes = bboxes_xywh

### Model architecture

The architecture of YOLO v3 is mainly divided into two parts. The first part constructs 52x52, 26x26 and 13x13 feature maps based on Darknet network, and the second part constructs detectors based on these three types of feature maps, as shown in the following figure:

Picture from https://towardsdatascience.com/dive-really-deep-into-yolo-v3-a-beginners-guide-9e3d2666280e (the error has been corrected)

### convolutional and residual blocks
def _conv_block(inp, convs, skip=True):
    """Apply a sequence of convolution layers, optionally with a residual add.

    Args:
        inp: input tensor.
        convs: list of dicts, one per convolution, with the keys ``filter``,
            ``kernel``, ``stride``, ``bnorm``, ``leaky`` and ``layer_idx``.
        skip: when true, the tensor entering the second-to-last convolution
            is added to the output of the last one.

    Returns:
        The output tensor of the block.
    """
    x = inp
    skip_start = len(convs) - 2
    for position, conv in enumerate(convs):
        # Remember the tensor that the residual connection skips over
        # (the last two layers of the block).
        if skip and position == skip_start:
            skip_connection = x

        stride = conv['stride']
        downsample = stride > 1
        layer_id = str(conv['layer_idx'])

        if downsample:
            # Pad left and top only, then convolve with 'valid' padding.
            x = ZeroPadding2D(((1, 0), (1, 0)))(x)
        x = Conv2D(conv['filter'],
                   conv['kernel'],
                   strides=stride,
                   padding='valid' if downsample else 'same',
                   name='conv_' + layer_id,
                   # The bias is dropped when batch normalization follows.
                   use_bias=not conv['bnorm'])(x)
        if conv['bnorm']:
            x = BatchNormalization(epsilon=0.001, name='bnorm_' + layer_id)(x)
        if conv['leaky']:
            x = LeakyReLU(alpha=0.1, name='leaky_' + layer_id)(x)

    if skip:
        return Add()([skip_connection, x])
    return x

### backbone
def make_yolov3_model():
    """Assemble the YOLO v3 network and return it as a Keras ``Model``.

    Part 1 stacks ``_conv_block`` calls and keeps the intermediate tensors
    ``skip_36`` and ``skip_61``; Part 2 builds the three detector outputs,
    upsampling and concatenating with those tensors between detectors.
    The shape comments refer to a 416x416x3 input; the ``Input`` layer
    itself leaves height and width unspecified.

    Returns:
        Model mapping the input image to ``[yolo_82, yolo_94, yolo_106]``.
    """

    def layer(filters, kernel, stride, idx, bnorm=True, leaky=True):
        # One convolution spec in the dict format consumed by _conv_block.
        return {'filter': filters, 'kernel': kernel, 'stride': stride,
                'bnorm': bnorm, 'leaky': leaky, 'layer_idx': idx}

    def detector_conv(idx):
        # Final 1x1 convolution of a detector: 255 filters, no batch norm, no activation.
        return layer(255, 1, 1, idx, bnorm=False, leaky=False)

    input_image = Input(shape=(None, None, 3))  # (416, 416, 3)

    ###### Part 1 ######
    # (208, 208, 64)
    x = _conv_block(input_image, [layer(32, 3, 1, 0),
                                  layer(64, 3, 2, 1),
                                  layer(32, 1, 1, 2),
                                  layer(64, 3, 1, 3)])
    # (104, 104, 128)
    x = _conv_block(x, [layer(128, 3, 2, 5),
                        layer(64, 1, 1, 6),
                        layer(128, 3, 1, 7)])
    # (104, 104, 128)
    x = _conv_block(x, [layer(64, 1, 1, 9),
                        layer(128, 3, 1, 10)])
    # (52, 52, 256)
    x = _conv_block(x, [layer(256, 3, 2, 12),
                        layer(128, 1, 1, 13),
                        layer(256, 3, 1, 14)])
    # (52, 52, 256): 7 residual blocks, layer indices 16/17, 19/20, ..., 34/35
    for first_idx in range(16, 16 + 7 * 3, 3):
        x = _conv_block(x, [layer(128, 1, 1, first_idx),
                            layer(256, 3, 1, first_idx + 1)])
    skip_36 = x  # 52x52 feature map
    # (26, 26, 512)
    x = _conv_block(x, [layer(512, 3, 2, 37),
                        layer(256, 1, 1, 38),
                        layer(512, 3, 1, 39)])
    # (26, 26, 512): 7 residual blocks, layer indices 41/42, 44/45, ..., 59/60
    for first_idx in range(41, 41 + 7 * 3, 3):
        x = _conv_block(x, [layer(256, 1, 1, first_idx),
                            layer(512, 3, 1, first_idx + 1)])
    skip_61 = x  # 26x26 feature map
    # (13, 13, 1024)
    x = _conv_block(x, [layer(1024, 3, 2, 62),
                        layer(512, 1, 1, 63),
                        layer(1024, 3, 1, 64)])
    # (13, 13, 1024): 3 residual blocks, layer indices 66/67, 69/70, 72/73
    for first_idx in range(66, 66 + 3 * 3, 3):
        x = _conv_block(x, [layer(512, 1, 1, first_idx),
                            layer(1024, 3, 1, first_idx + 1)])  # 13x13 feature map

    ###### Part 2 ######
    # (13, 13, 512)
    x = _conv_block(x, [layer(512, 1, 1, 75),
                        layer(1024, 3, 1, 76),
                        layer(512, 1, 1, 77),
                        layer(1024, 3, 1, 78),
                        layer(512, 1, 1, 79)], skip=False)
    # (13, 13, 255)
    yolo_82 = _conv_block(x, [layer(1024, 3, 1, 80),
                              detector_conv(81)], skip=False)  # 13x13 detector
    # concatenate with 26x26 feature map, (26, 26, 256+512)
    x = _conv_block(x, [layer(256, 1, 1, 84)], skip=False)
    x = UpSampling2D(2)(x)
    x = Concatenate()([x, skip_61])
    # (26, 26, 256)
    x = _conv_block(x, [layer(256, 1, 1, 87),
                        layer(512, 3, 1, 88),
                        layer(256, 1, 1, 89),
                        layer(512, 3, 1, 90),
                        layer(256, 1, 1, 91)], skip=False)
    # (26, 26, 255)
    yolo_94 = _conv_block(x, [layer(512, 3, 1, 92),
                              detector_conv(93)], skip=False)  # 26x26 detector
    # concatenate with 52x52 feature map, (52, 52, 128+256)
    x = _conv_block(x, [layer(128, 1, 1, 96)], skip=False)
    x = UpSampling2D(2)(x)
    x = Concatenate()([x, skip_36])
    # (52, 52, 255)
    yolo_106 = _conv_block(x, [layer(128, 1, 1, 99),
                               layer(256, 3, 1, 100),
                               layer(128, 1, 1, 101),
                               layer(256, 3, 1, 102),
                               layer(128, 1, 1, 103),
                               layer(256, 3, 1, 104),
                               detector_conv(105)], skip=False)  # 52x52 detector

    return Model(input_image, [yolo_82, yolo_94, yolo_106])

### loss function

There are many different variants of the loss function in YOLO v3. Here we choose a classic one to introduce. The loss function can be decomposed into the sum of frame loss, target loss and classification loss. These three items are introduced one by one below.

Frame loss: in the original paper, MSE (mean squared error) is used as the loss function of the frame, but MSE sometimes cannot distinguish prediction results of different quality. IoU reflects the quality of the regression box better and is scale-invariant, but IoU only describes how much two frames overlap, not the way in which they overlap; and if the two frames do not intersect at all, IoU is 0, which is not suitable for gradient optimization. GIoU (Generalized IoU) inherits the advantages of IoU and solves these problems of IoU to a certain extent: $$GIoU = IoU - \frac{|C \setminus (B_1 \cup B_2)|}{|C|}$$ where $C$ is the smallest box that encloses both $B_1$ and $B_2$. The frame loss can be expressed as $1 - GIoU$. Next, take the 13x13 detector as an example to calculate the frame loss; the total frame loss is the sum of the losses of the three detectors. To calculate the frame loss, first convert the network output of YOLO v3. Suppose the frame information output by the network is $(t_x, t_y, t_w, t_h)$, where $(t_x, t_y)$ is the information of the center point of the frame and $(t_w, t_h)$ is the width and height information of the frame. The conversion formula is as follows: $$b_x = \operatorname{sigmoid}(t_x) + c_x;\quad b_y = \operatorname{sigmoid}(t_y) + c_y;\quad b_w = p_w \exp(t_w);\quad b_h = p_h \exp(t_h)$$ where $(c_x, c_y)$ represents the coordinate of the upper left corner of the grid cell in which $(t_x, t_y)$ is located, and $(p_w, p_h)$ represents the width and height of the anchor box corresponding to the frame.
output_size = 13
# Anchor boxes of the 13x13 detector; see the data preprocessing code section.
anchors = np.array([[116, 90], [156, 198], [373, 326]])
# yolo_82_batch: 13x13 detector output, (batch_size, 13, 13, 255); see the
# model architecture code section for how yolo_82 is computed.
conv_output = tf.reshape(yolo_82_batch, (batch_size, output_size, output_size, 3, 85))  # (batch_size, 13, 13, 3, 85)
# t_xy: (batch_size, 13, 13, 3, 2); t_wh: (batch_size, 13, 13, 3, 2)
t_xy, t_wh, objectness, classes = tf.split(conv_output, (2, 2, 1, 80), axis=-1)
c_xy = tf.meshgrid(tf.range(output_size), tf.range(output_size))  # a list of two (13, 13) arrays
c_xy = tf.stack(c_xy, axis=-1)  # (13, 13, 2)
c_xy = tf.tile(c_xy[tf.newaxis, :, :, tf.newaxis, :], [batch_size, 1, 1, 3, 1])  # (batch_size, 13, 13, 3, 2)
# tf.range yields integers; TensorFlow does not promote dtypes implicitly, so
# the grid offsets are cast to the dtype of the network output before adding.
c_xy = tf.cast(c_xy, t_xy.dtype)
scale = int(416 / output_size)
b_xy = (tf.sigmoid(t_xy) + c_xy) * scale  # (batch_size, 13, 13, 3, 2)
# Same reason: the integer NumPy anchors are cast before the multiplication.
b_wh = tf.exp(t_wh) * tf.cast(anchors, t_wh.dtype)  # (batch_size, 13, 13, 3, 2)
b_xywh = tf.concat([b_xy, b_wh], axis=-1)  # (batch_size, 13, 13, 3, 4)


# Next, calculate the GIoU of the network output frame and the real frame,
# and then get the frame loss:
def bbox_giou(boxes1, boxes2):
    """Return the GIoU of two sets of boxes given as (xc, yc, w, h).

    Args:
        boxes1: tensor whose last axis holds (xc, yc, w, h).
        boxes2: tensor of the same layout, combined element-wise with boxes1.

    Returns:
        Tensor of GIoU values with the last (coordinate) axis removed.
    """
    # transform from (xc, yc, w, h) to (xmin, ymin, xmax, ymax)
    boxes1 = tf.concat([boxes1[..., :2] - boxes1[..., 2:] * 0.5,
                        boxes1[..., :2] + boxes1[..., 2:] * 0.5], axis=-1)
    boxes2 = tf.concat([boxes2[..., :2] - boxes2[..., 2:] * 0.5,
                        boxes2[..., :2] + boxes2[..., 2:] * 0.5], axis=-1)
    # areas of the two boxes
    boxes1_area = (boxes1[..., 2] - boxes1[..., 0]) * (boxes1[..., 3] - boxes1[..., 1])
    boxes2_area = (boxes2[..., 2] - boxes2[..., 0]) * (boxes2[..., 3] - boxes2[..., 1])
    # intersection area (clamped to 0 when the boxes do not overlap)
    left_up = tf.maximum(boxes1[..., :2], boxes2[..., :2])
    right_down = tf.minimum(boxes1[..., 2:], boxes2[..., 2:])
    inter_section = tf.maximum(right_down - left_up, 0.0)
    inter_area = inter_section[..., 0] * inter_section[..., 1]
    # compute iou
    union_area = boxes1_area + boxes2_area - inter_area
    iou = inter_area / union_area
    # area of the smallest box enclosing both boxes
    enclose_left_up = tf.minimum(boxes1[..., :2], boxes2[..., :2])
    enclose_right_down = tf.maximum(boxes1[..., 2:], boxes2[..., 2:])
    enclose = tf.maximum(enclose_right_down - enclose_left_up, 0.0)
    enclose_area = enclose[..., 0] * enclose[..., 1]
    # compute giou
    giou = iou - 1.0 * (enclose_area - union_area) / enclose_area
    return giou


### label_lbbox_batch: ground truth boxes in 13x13 detector, (batch_size, 13, 13, 3, 85); see the data preprocessing code section for how label_lbbox is computed
# NOTE(review): assumes label_lbbox_batch has the same dtype as the network output - TODO confirm
label_xywh = label_lbbox_batch[:, :, :, :, 0:4]  # ground truth box (xc, yc, w, h)
# Whether the corresponding anchor box holds a real object: if 1 the frame
# loss is calculated, if 0 it is ignored.
respond_bbox = label_lbbox_batch[:, :, :, :, 4:5]
giou = tf.expand_dims(bbox_giou(b_xywh, label_xywh), axis=-1)  # (batch_size, 13, 13, 3, 1)
input_size = tf.cast(416, tf.float32)
# Weight of the frame loss: the larger the area of the ground truth box, the
# higher the error tolerance and the smaller the weight.
bbox_loss_scale = 2.0 - 1.0 * label_xywh[:, :, :, :, 2:3] * label_xywh[:, :, :, :, 3:4] / (input_size ** 2)
giou_loss = respond_bbox * bbox_loss_scale * (1 - giou)  # giou loss, (batch_size, 13, 13, 3, 1)

# Target loss: taking the 13x13 detector as an example, the target loss is
# actually an unbalanced binary classification problem, because generally
# speaking, among the detector's 13x13x3 anchor boxes the number that contain
# a real object (positive samples) is much smaller than the number that do
# not (negative samples). Focal loss is used to deal with this problem.
Focal loss uses a larger weight for samples that are difficult to classify and a smaller weight for samples that are easy to classify: $$FL(p)=\left\{\begin{aligned} -(1-p)^{\gamma}\log(p), & \quad \text{positive samples} \\ -p^{\gamma}\log(1-p), & \quad \text{negative samples} \end{aligned}\right.$$ Another formulation of focal loss introduces a category weight $\alpha$: $$FL(p)=\left\{\begin{aligned} -\alpha(1-p)^{\gamma}\log(p), & \quad \text{positive samples} \\ -(1-\alpha)p^{\gamma}\log(1-p), & \quad \text{negative samples} \end{aligned}\right.$$ This article uses the first formula and sets $\gamma$ to 2. In addition, the definition of a negative sample is modified in the calculation of the target loss. If there is no real object in an anchor box, but its predicted frame has a large IoU with a real frame on the corresponding detector, it is not regarded as a negative sample and is ignored in the calculation of the loss, which also reduces the number of negative samples to a certain extent.

### lbboxes_batch: (xc, yc, w, h) of all ground truth boxes on the 13x13 detector, (batch_size, max_bbox_per_scale, 4); see the data preprocessing code section for how lbboxes is computed
### label_lbbox_batch: ground truth boxes in 13x13 detector, (batch_size, 13, 13, 3, 85); see the data preprocessing code section for how label_lbbox is computed
### objectness: predicted objectness logits, (batch_size, 13, 13, 3, 1); see the output conversion code in the frame loss section
### b_xywh: predicted frame information, (batch_size, 13, 13, 3, 4); see the output conversion code in the frame loss section
# 1 where the anchor box holds a real object (positive sample), 0 otherwise (negative sample)
respond_bbox = label_lbbox_batch[..., 4:5]

### Reduce the number of negative samples used in the calculation ###
### 1. Compute the IoU of every predicted box with every real box ###
gt_boxes = lbboxes_batch[:, tf.newaxis, tf.newaxis, tf.newaxis, :, :]
boxes1 = tf.tile(gt_boxes, [1, 13, 13, 3, 1, 1])  # (batch_size, 13, 13, 3, max_bbox_per_scale, 4)
pred_boxes = b_xywh[:, :, :, :, tf.newaxis, :]
boxes2 = tf.tile(pred_boxes, [1, 1, 1, 1, max_bbox_per_scale, 1])  # (batch_size, 13, 13, 3, max_bbox_per_scale, 4)
# Areas are taken from width * height before the boxes are rewritten as corners.
boxes1_area = boxes1[..., 2] * boxes1[..., 3]
boxes2_area = boxes2[..., 2] * boxes2[..., 3]
# (xc, yc, w, h) -> (xmin, ymin, xmax, ymax)
boxes1_half_wh = boxes1[..., 2:] * 0.5
boxes1 = tf.concat([boxes1[..., :2] - boxes1_half_wh, boxes1[..., :2] + boxes1_half_wh], axis=-1)
boxes2_half_wh = boxes2[..., 2:] * 0.5
boxes2 = tf.concat([boxes2[..., :2] - boxes2_half_wh, boxes2[..., :2] + boxes2_half_wh], axis=-1)
# compute IoU
left_up = tf.maximum(boxes1[..., :2], boxes2[..., :2])
right_down = tf.minimum(boxes1[..., 2:], boxes2[..., 2:])
inter_section = tf.maximum(right_down - left_up, 0.0)
inter_area = inter_section[..., 0] * inter_section[..., 1]
union_area = boxes1_area + boxes2_area - inter_area
iou = inter_area / union_area  # (batch_size, 13, 13, 3, max_bbox_per_scale)

### 2. Take the largest IoU; if it reaches the threshold, the sample is ignored in the loss ###
max_iou = tf.reduce_max(iou, axis=-1, keepdims=True)  # (batch_size, 13, 13, 3, 1)
IOU_LOSS_THRESH = 0.5
below_thresh = tf.cast(max_iou < IOU_LOSS_THRESH, tf.float32)
respond_bgd = (1.0 - respond_bbox) * below_thresh  # (batch_size, 13, 13, 3, 1)

###########################
pred_conf = tf.sigmoid(objectness)  # predicted probability of being a real object
conf_focal = tf.pow(respond_bbox - pred_conf, 2)  # gamma = 2
# The cross entropy term is the same for both parts; only the mask differs.
conf_ce = tf.nn.sigmoid_cross_entropy_with_logits(labels=respond_bbox, logits=objectness)
focal_loss_p = conf_focal * respond_bbox * conf_ce  # positive sample loss
focal_loss_n = conf_focal * respond_bgd * conf_ce  # negative sample loss
focal_loss = focal_loss_p + focal_loss_n  # (batch_size, 13, 13, 3, 1)

Classification loss: still take the detector 13x13 as an example and use the cross entropy loss function. It is worth noting that sigmoid is used as the activation function to replace the previous softmax in the category prediction of YOLO v3, mainly because different categories are not necessarily mutually exclusive, and an object may belong to multiple categories at the same time.

### label_lbbox_batch: ground truth boxes in 13x13 detector, (batch_size, 13, 13, 3, 85); see the data preprocessing code section for how label_lbbox is computed
### classes: predicted class logits, (batch_size, 13, 13, 3, 80); see the output conversion code in the frame loss section
# One-hot vector of the real category of each object
labels_onehot = label_lbbox_batch[..., 5:]
# 1 where the anchor box holds a real object (classification loss is
# calculated), 0 where it does not (ignored)
respond_bbox = label_lbbox_batch[..., 4:5]
# Predicted probability of belonging to each category
classes_prob = tf.sigmoid(classes)
# Per-class sigmoid cross entropy, masked so that only anchors with objects contribute
class_ce = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels_onehot, logits=classes)
ce_loss = respond_bbox * class_ce  # cross entropy loss, (batch_size, 13, 13, 3, 80)

Based on the above three types of losses, the total loss on the 13x13 detector can be calculated. The losses on the other two detectors (26x26, 52x52) can be calculated in the same way. The total loss of the three detectors is:

def _mean_of_per_sample_sums(loss):
    # Sum over axes 1-4 for each sample, then average over the batch axis.
    return tf.reduce_mean(tf.reduce_sum(loss, axis=[1, 2, 3, 4]))


giou_loss_13 = _mean_of_per_sample_sums(giou_loss)
focal_loss_13 = _mean_of_per_sample_sums(focal_loss)
ce_loss_13 = _mean_of_per_sample_sums(ce_loss)
total_loss_13 = giou_loss_13 + focal_loss_13 + ce_loss_13
# total loss (total_loss_26 and total_loss_52 are obtained in the same way on the other two detectors)
total_loss = total_loss_13 + total_loss_26 + total_loss_52

### model prediction

As described in the loss function, first convert the format of network output:

### Still taking the 13x13 detector as an example; the input picture to be predicted has dimension (1, 416, 416, 3)
### See the loss function code section for how b_xywh, pred_conf and classes_prob are computed
output_13 = tf.concat([b_xywh, pred_conf, classes_prob], axis=-1)  # (batch_size, 13, 13, 3, 85); batch_size is 1 here
### output_26 (26x26 detector) and output_52 (52x52 detector) can be calculated in the same way
### output_26: (1, 26, 26, 3, 85); output_52: (1, 52, 52, 3, 85)
# Flatten every detector output to (number of boxes, 85):
# [(13*13*3, 85), (26*26*3, 85), (52*52*3, 85)]
flat_outputs = []
for detector_output in (output_13, output_26, output_52):
    channels = tf.shape(detector_output)[-1]
    flat_outputs.append(tf.reshape(detector_output, (-1, channels)))
# All predicted boxes, (13*13*3 + 26*26*3 + 52*52*3, 85)
pred_bbox = tf.concat(flat_outputs, axis=0)

Next, delete the prediction box with low score, and the score is determined by multiplying the probability that the box is a real object by the maximum category probability

score_threshold = 0.5
# Work on a NumPy array from here on: boolean-mask indexing below needs it.
# NOTE(review): np.asarray is expected to accept the tensor produced by the
# previous step (e.g. a TF eager tensor) - confirm for the execution mode used.
pred_bbox = np.asarray(pred_bbox)
pred_xywh = pred_bbox[:, 0:4]
# (xc, yc, w, h) --> (xmin, ymin, xmax, ymax) for computing IoU
pred_coor = np.concatenate([pred_xywh[:, :2] - pred_xywh[:, 2:] * 0.5,
                            pred_xywh[:, :2] + pred_xywh[:, 2:] * 0.5], axis=-1)
# compute box score
pred_conf = pred_bbox[:, 4]
pred_prob = pred_bbox[:, 5:]
classes = np.argmax(pred_prob, axis=-1)  # category with the maximum predicted probability for each box
scores = pred_conf * np.max(pred_prob, axis=-1)
# discard boxes with low scores
score_mask = scores > score_threshold
coors, scores, classes = pred_coor[score_mask], scores[score_mask], classes[score_mask]
# Columns: xmin, ymin, xmax, ymax, score, class
filter_boxes = np.concatenate([coors, scores[:, np.newaxis], classes[:, np.newaxis]], axis=-1)  # (number of remaining boxes, 6)

Conduct non maximum suppression (NMS) on the remaining prediction boxes. The main purpose of NMS is to remove boxes with the same prediction categories but with large overlap:

# Non-maximum suppression, applied separately to each predicted category.
# filter_boxes columns: xmin, ymin, xmax, ymax, score, class.
iou_threshold = 0.5
classes_in_img = list(set(filter_boxes[:, 5]))  # all categories predicted on the picture
best_bboxes = []  # the boxes that finally remain
for cls in classes_in_img:
    cls_mask = (filter_boxes[:, 5] == cls)
    cls_bboxes = filter_boxes[cls_mask]  # all boxes predicted to be in this category
    while len(cls_bboxes) > 0:
        max_ind = np.argmax(cls_bboxes[:, 4])
        best_bbox = cls_bboxes[max_ind]  # the remaining box with the highest score
        best_bboxes.append(best_bbox)
        ### Compute the IoU of the highest-scoring box with the remaining boxes ###
        # remaining boxes (excluding the one with the highest score)
        cls_bboxes = np.concatenate([cls_bboxes[: max_ind], cls_bboxes[max_ind + 1:]], axis=0)
        best_bbox_xy = best_bbox[np.newaxis, :4]
        cls_bboxes_xy = cls_bboxes[:, :4]
        ### IoU
        best_bbox_area = (best_bbox_xy[:, 2] - best_bbox_xy[:, 0]) * (best_bbox_xy[:, 3] - best_bbox_xy[:, 1])
        cls_bboxes_area = (cls_bboxes_xy[:, 2] - cls_bboxes_xy[:, 0]) * (cls_bboxes_xy[:, 3] - cls_bboxes_xy[:, 1])
        left_up = np.maximum(best_bbox_xy[:, :2], cls_bboxes_xy[:, :2])
        right_down = np.minimum(best_bbox_xy[:, 2:], cls_bboxes_xy[:, 2:])
        inter_section = np.maximum(right_down - left_up, 0.0)
        inter_area = inter_section[:, 0] * inter_section[:, 1]
        union_area = cls_bboxes_area + best_bbox_area - inter_area
        ious = 1.0 * inter_area / union_area
        ### Delete the boxes whose IoU with the highest-scoring box is too large ###
        # Keep only the boxes that do not overlap the selected box by more than the threshold.
        iou_mask = ious <= iou_threshold
        cls_bboxes = cls_bboxes[iou_mask]