zynq7000版yolov4_tiny代码框架

算法框图

img

img

yolov4tinylable.drawio

​ YOLOv4-tiny结构是YOLOv4的精简版,属于轻量化模型,参数只有600万相当于原来的十分之一,这使得检测速度提升很大。整体网络结构共有38层,使用了三个残差单元,激活函数使用了LeakyReLU,目标的分类与回归改为使用两个特征层,合并有效特征层时使用了特征金字塔(FPN)网络。其同样使用了CSPnet结构,并对特征提取网络进行通道分割,将经过3x3卷积后输出的特征层通道划分为两部分,并取第二部分。在COCO数据集上得到了40.2%的AP50、371FPS,相较于其他版本的轻量化模型性能优势显著。

​ CNN卷积层的基本组成单元标配:Conv + BN + ReLU 三剑客,可以将BN层的运算融合到Conv层中,把三层合并为一层以减少运算量,加速推理。本质上是修改了卷积核的参数,在不增加Conv层计算量的同时,适用于模型推理。

​ BN(批归一化)层常用于在卷积层之后,对feature maps进行归一化,从而加速网络学习,也具有一定的正则化效果。训练时,BN需要学习一个minibatch数据的均值、方差,然后利用这些信息进行归一化。而在推理过程,通常为了加速,都会把BN融入到其上层卷积中,这样就将两步运算变成了一步,也就达到了加速目的。

CPU无加速版本

tools\predictor\yolo4-tiny

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
class Yolo4_Tiny{
// CPU (no hardware acceleration) forward pass.
//   in:   input image tensor (3x416x416 — TODO confirm layout against backbone)
//   out0: 13x13-grid detection head output
//   out1: 26x26-grid detection head output
void forward(float* in,float* out0,float* out1){
// Scratch buffers for intermediate feature maps. The original version leaked
// all of them on every call (no delete[]) and also allocated an unused
// feat1 buffer — the backbone writes feat1 directly into P4 instead.
float* feat2=new float[512*13*13];
float* P5=new float[256*13*13];
float* out0_tmp=new float[512*13*13];
float* P5_Upsample=new float[128*13*13];
float* P4=new float[384*26*26];
float* out1_tmp=new float[256*26*26];
// Backbone writes its 26x26 feature map into the upper part of P4,
// realising P4 = concat(Upsample_out, feat1) without an explicit concat.
this->backbone->forward(in,P4+128*26*26,feat2);
// P5 branch: conv on the deepest (13x13) feature map.
this->conv_forP5_conv->forward(feat2,P5);
// Head 0 (13x13 grid).
this->yolo_headP5_basic_conv1->forward(P5,out0_tmp);
conv(512,this->out_ch,0,1,1,13,13,out0_tmp,this->yolo_headP5_w2,this->yolo_headP5_b2,out0);
// Upsample path: conv then 13x13 -> 26x26 upsample into the lower part of P4.
this->upsample_conv->forward(P5,P5_Upsample);
upsample(P5_Upsample,P4,13,13,128);
// Head 1 (26x26 grid).
this->yolo_headP4_basic_conv1->forward(P4,out1_tmp);
conv(256,this->out_ch,0,1,1,26,26,out1_tmp,this->yolo_headP4_w2,this->yolo_headP4_b2,out1);
// Release all scratch buffers (fixes the per-call memory leak).
delete[] feat2;
delete[] P5;
delete[] out0_tmp;
delete[] P5_Upsample;
delete[] P4;
delete[] out1_tmp;
}
};

ZYNQ加速版本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
// Layer objects used by the accelerated forward pass below.
CSPDarkNet* backbone;               // feature extractor; produces feat1 (256x26x26) and feat2 (512x13x13)
BasicConv* conv_forP5_conv;         // conv applied to feat2 to produce P5
BasicConv* yolo_headP4_basic_conv1; // first conv of the 26x26 detection head
BasicConv* upsample_conv;           // conv applied to P5 before upsampling to 26x26

Yolo4_Tiny-forward()
{
static data_t feat1[256*26*26];
static data_t feat2[512*13*13];
static data_t P5[256*13*13];
static data_t out0_tmp[512*13*13];
static data_t P5_Upsample[128*13*13];
static data_t P4[384*26*26];
static data_t out1_tmp[256*26*26];

//相当于concat,P4=concat(Upsample_out,feat1)
this->backbone->forward(in,P4+128*26*26,feat2);

//conv_forP5
this->conv_forP5_conv->forward(feat2,P5);
//out0
this->yolo_headP5_basic_conv1->forward(P5,out0_tmp);
conv_leakyrelu(512,OUT_CH,0,1,1,13,13,out0_tmp,this->yolo_headP5_w2,this->yolo_headP5_b2,out0,0);

//conv_forP4
//upsample_conv
this->upsample_conv->forward(P5,P5_Upsample);
//upsample
sampling(P5_Upsample,P4,128,13,0);
//out1
this->yolo_headP4_basic_conv1->forward(P4,out1_tmp);
conv_leakyrelu(256,OUT_CH,0,1,1,26,26,out1_tmp,this->yolo_headP4_w2,this->yolo_headP4_b2,out1,0);
}

backbone

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
CSPDarkNet->forward()
{
BasicConclass CSPDarkNet{
public:
BasicConv* basic_conv1;
BasicConv* basic_conv2;
BasicConv* basic_conv3;
Resblock_body* resblock1;
Resblock_body* resblock2;
Resblock_body* resblock3;
CSPDarkNet(){
this->basic_conv1=new BasicConv(416,416,3,2,1,3,32);
this->basic_conv2=new BasicConv(208,208,3,2,1,32,64);
this->basic_conv3=new BasicConv(13,13,3,1,1,512,512);
this->resblock1=new Resblock_body(104,104,64,64);
this->resblock2=new Resblock_body(52,52,128,128);
this->resblock3=new Resblock_body(26,26,256,256);
}
void load_weight(string dir){
this->basic_conv1->load_weight(dir+"\\BasicConv1");
this->basic_conv2->load_weight(dir+"\\BasicConv2");
this->basic_conv3->load_weight(dir+"\\BasicConv3");
this->resblock1->load_weight(dir+"\\ResBlock1");
this->resblock2->load_weight(dir+"\\ResBlock2");
this->resblock3->load_weight(dir+"\\ResBlock3");
}
void forward(data_t* in,data_t* feat1,data_t* feat2){
static data_t basic_conv1_out[32*208*208];
static data_t basic_conv2_out[64*104*104];
static data_t resblock1_out[128*52*52];
static data_t resblock2_out[256*26*26];
static data_t resblock3_out[512*13*13];
static data_t tmp[64*104*104];
//forward
this->basic_conv1->forward(in,basic_conv1_out);
//cout<<"basic conv1 end\n";
this->basic_conv2->forward(basic_conv1_out,basic_conv2_out);
//cout<<"basic conv2 end\n";
this->resblock1->forward(basic_conv2_out,tmp,resblock1_out);
//cout<<"resblock1 end\n";
this->resblock2->forward(resblock1_out,tmp,resblock2_out);
//cout<<"resblock2 end\n";
this->resblock3->forward(resblock2_out,feat1,resblock3_out);
//cout<<"resblock3 end\n";
this->basic_conv3->forward(resblock3_out,feat2);
}
};

forP5和forP4

backbone、forP5、forP4 都由基本的 BasicConv 实现,其中 conv_leakyrelu() 由 zynq7000 硬件加速执行

1
2
3
4
5
class BasicConv{
// Folded Conv(+BN) followed by LeakyReLU, dispatched to the PL conv IP.
// Geometry (ch_in/ch_out/p/s/k/h/w) and the folded weight/bias pointers
// are members initialised elsewhere. act=1 selects LeakyReLU in hardware.
void forward(data_t* in,data_t* out){ //folded_conv+leakyrelu
conv_leakyrelu(this->ch_in,this->ch_out,this->p,this->s,this->k,this->h,this->w,in,this->weight,this->bias,out,1);
}
}; // fixed: original was missing the ';' after the class definition

SDK:conv_leakyrelu

SDK\src > basic_op.cpp

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
/*
 * Drive the PL conv IP for one convolution layer and block until it completes.
 *   ch_in/ch_out : input/output channel counts
 *   pad/stride/k : conv geometry (pad and w are NOT forwarded to the IP —
 *                  only h is passed as fsize, so square feature maps are
 *                  assumed; padding is presumably fixed inside the IP —
 *                  TODO confirm against the HLS top function)
 *   in/weight/bias/out : DDR buffer addresses the IP masters DMA from/to
 *   act          : 1 = apply LeakyReLU in hardware, 0 = linear output
 * NOTE(review): shares the global conv_inst and busy-waits with no timeout,
 * so this is non-reentrant and will hang forever if the IP stalls.
 */
void conv_leakyrelu(int ch_in,int ch_out,int pad,int stride,int k,int h,int w,data_t* in,data_t *weight,data_t *bias,data_t *out,int act){
/* NOTE(review): cache flush is disabled; correctness then depends on the
 * in/out buffers being cache-coherent (e.g. via ACP) or flushed by the
 * caller — TODO confirm the cache-maintenance strategy. */
//Xil_DCacheFlushRange((u32)in,sizeof(data_t)*ch_in*h*w);
/* The IP exposes four AXI master ports; all are pointed at the same
 * input/weight/output buffers (the kernel presumably splits work across
 * ports internally — verify against the HLS interface bundles). */
XConv_Set_in1_V(&conv_inst, (u32)in);
XConv_Set_in2_V(&conv_inst, (u32)in);
XConv_Set_in3_V(&conv_inst, (u32)in);
XConv_Set_in4_V(&conv_inst, (u32)in);

XConv_Set_w1_V(&conv_inst, (u32)weight);
XConv_Set_w2_V(&conv_inst, (u32)weight);
XConv_Set_w3_V(&conv_inst, (u32)weight);
XConv_Set_w4_V(&conv_inst, (u32)weight);

XConv_Set_b_V(&conv_inst, (u32)bias);
XConv_Set_out1_V(&conv_inst, (u32)out);
XConv_Set_out2_V(&conv_inst, (u32)out);
XConv_Set_out3_V(&conv_inst, (u32)out);
XConv_Set_out4_V(&conv_inst, (u32)out);

/* Scalar layer parameters via the AXI-Lite control interface. */
XConv_Set_ch_in(&conv_inst,ch_in);
XConv_Set_ch_out(&conv_inst,ch_out);
XConv_Set_fsize(&conv_inst,h);
XConv_Set_stride(&conv_inst,stride);
XConv_Set_kernel(&conv_inst,k);
XConv_Set_act(&conv_inst,act);
/* Kick off the IP and spin until it signals done. */
XConv_Start(&conv_inst);
while(XConv_IsDone(&conv_inst)==0);
}

HLS:conv_leakyrelu

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
/*
 * HLS top function for the PL conv accelerator.
 * Four replicated in/weight/out pointers map to four independent AXI master
 * bundles (FM1-FM4 / W1-W4) so the kernel can read/write on multiple ports;
 * bias shares bundle W1. Scalars arrive over the AXI-Lite CTRL interface.
 * The `kernel` argument doubles as the op selector:
 *   3 -> 3x3 standard convolution (Winograd variant is disabled)
 *   1 -> 1x1 pointwise convolution
 *   2 -> 2x2 max-pool
 *   anything else (0) -> upsample
 * `fsize` is the (square) feature-map side length; `act` selects LeakyReLU.
 */
void conv(data_t* in1,data_t* in2,data_t* in3,data_t* in4,data_t* w1,data_t* w2,data_t* w3,
data_t* w4,data_t* b,data_t* out1,data_t* out2,data_t* out3,data_t* out4,
int ch_in,int ch_out,int fsize,int stride,int kernel,int act){
//PRAGMA
#pragma HLS INTERFACE s_axilite port=return bundle=CTRL
#pragma HLS INTERFACE s_axilite port=fsize bundle=CTRL
#pragma HLS INTERFACE s_axilite port=ch_out bundle=CTRL
#pragma HLS INTERFACE s_axilite port=act bundle=CTRL
#pragma HLS INTERFACE s_axilite port=stride bundle=CTRL
#pragma HLS INTERFACE s_axilite port=ch_in bundle=CTRL
#pragma HLS INTERFACE s_axilite port=kernel bundle=CTRL
#pragma HLS INTERFACE m_axi depth=60000 port=in1 offset=slave bundle=FM1 max_read_burst_length=256 max_write_burst_length=256
#pragma HLS INTERFACE m_axi depth=60000 port=in2 offset=slave bundle=FM2 max_read_burst_length=256 max_write_burst_length=256
#pragma HLS INTERFACE m_axi depth=60000 port=in3 offset=slave bundle=FM3 max_read_burst_length=256 max_write_burst_length=256
#pragma HLS INTERFACE m_axi depth=60000 port=in4 offset=slave bundle=FM4 max_read_burst_length=256 max_write_burst_length=256
#pragma HLS INTERFACE m_axi depth=65536 port=w1 offset=slave bundle=W1 max_read_burst_length=256
#pragma HLS INTERFACE m_axi depth=65536 port=w2 offset=slave bundle=W2 max_read_burst_length=256
#pragma HLS INTERFACE m_axi depth=65536 port=w3 offset=slave bundle=W3 max_read_burst_length=256
#pragma HLS INTERFACE m_axi depth=65536 port=w4 offset=slave bundle=W4 max_read_burst_length=256
#pragma HLS INTERFACE m_axi depth=128 port=b offset=slave bundle=W1 max_read_burst_length=256
#pragma HLS INTERFACE m_axi port=out1 offset=slave bundle=FM1
#pragma HLS INTERFACE m_axi port=out2 offset=slave bundle=FM2
#pragma HLS INTERFACE m_axi port=out3 offset=slave bundle=FM3
#pragma HLS INTERFACE m_axi port=out4 offset=slave bundle=FM4
//content
if(kernel==3){
/* Winograd implementation exists but is disabled in favour of std_conv. */
//WinogradConv(in1,in2,in3,in4,w1,w2,w3,w4,b,out1,out2,ch_out,ch_in,fsize,stride,act);
std_conv(in1,in2,in3,in4,w1,w2,w3,w4,b,out1,out2,out3,out4,ch_in,ch_out,fsize,stride,act);
}
else if(kernel==1){
/* Pointwise conv only uses the first weight/output port. */
pwconv(in1,in2,in3,in4,w1,b,out1,ch_in,ch_out,fsize,act);
}
else if(kernel==2){ //kernel==2 maxpool; kernel==0 upsample
maxpool(in1,out1,ch_in,fsize,fsize);
}
else{
upsample(in1,out1);
}
}

zynq7000版yolov4_tiny代码框架
http://blog.uanet.cn/AI/zynq7000版yolov4_tiny代码框架.html
作者
dnsnat
发布于
2025年2月13日
许可协议