三个不同AXI IP核的实现的方法_性能的对比及差异的分析

本文先总结三个不同 AXI IP 核的实现方法、性能对比、性能差异的分析，以及可能改进的方面。使用的硬件平台是 ZedBoard。
不同的axi总线卷积加速模块的概况
这次实现并逐渐优化了三个版本的卷积加速模块，先简要描述各个版本的主要内容。
版本一
版本一主要是用来测试axi总线ip核的实现可能。
该模块拥有19个32位寄存器
其中前9个寄存器用来保存需要计算的值
后面9个寄存器用来保存卷积核
在读取第19个寄存器的地址的时候计算9个寄存器的卷积和（该计算可以在一个时钟周期内完成）
9个寄存器单独赋值，程序中分别向对应地址写入内容，通过总线进行传输。
故乐观地估算，获取一个输出需要10个总线周期（9次写入加1次读取）
可以从驱动的书写简单理解一下：
void conv_hw（int filter［3］［3］， int arr［100］［100］，
int filterw， int filterh， int arrw， int arrh） {
int i， j;
for （i = 2; i 《 filterh + arrh - 3; i++） {
for （j = 2; j 《 filterw + arrw - 3; j++） {
xil_out32（xpar_conv_0_s00_axi_baseaddr， arr［i］［j］）;
xil_out32（xpar_conv_0_s00_axi_baseaddr+4， arr［i］［j - 1］）;
xil_out32（xpar_conv_0_s00_axi_baseaddr+8， arr［i］［j - 2］）;
xil_out32（xpar_conv_0_s00_axi_baseaddr+12， arr［i - 1］［j］）;
xil_out32（xpar_conv_0_s00_axi_baseaddr+16， arr［i - 1］［j - 1］）;
xil_out32（xpar_conv_0_s00_axi_baseaddr+20， arr［i - 1］［j - 2］）;
xil_out32（xpar_conv_0_s00_axi_baseaddr+24， arr［i - 2］［j］）;
xil_out32（xpar_conv_0_s00_axi_baseaddr+28， arr［i - 2］［j - 1］）;
xil_out32（xpar_conv_0_s00_axi_baseaddr+32， arr［i - 2］［j - 2］）;
res［i］［j］ = xil_in32（xpar_conv_0_s00_axi_baseaddr + 72）;
}
if （i % 15 == 0）
printf（“=”）;
}
}
版本一性能
版本一性能最惨，由于没有时间戳，目测软件计算速度远远快于fpga核心运算速度。
版本一的改进方向就是引入滑动窗口，能够最大程度减少总线周期。
版本二
版本二引入滑动窗口，和初期设计的概念相同。
该模块拥有19个32位寄存器
其中前9个寄存器用来保存需要计算的值
后面9个寄存器用来保存卷积核
在读取第19个寄存器的地址的时候计算9个寄存器的卷积和（该计算可以在一个时钟周期内完成）
三个寄存器滑动赋值：计算窗口在矩阵上逐列滑动。除了每行开始时需要额外预载两列（共6次写入）来填充寄存器外，之后的每一个输出只需要四个总线周期（3次写入加1次读取）
可以通过写的驱动简单理解一下：
void conv_hw（int filter［3］［3］， int arr［100］［100］， int arrw， int arrh） {
int i， j;
i = 2; j = 2;
for （i = 2; i 《 arrh; i++） {
//pre load
xil_out32（xpar_conv_0_s00_axi_baseaddr + 8， arr［i - 1］［j - 1］）;
xil_out32（xpar_conv_0_s00_axi_baseaddr + 20， arr［i］［j - 1］）;
xil_out32（xpar_conv_0_s00_axi_baseaddr + 32， arr［i + 1］［j - 1］）;
xil_out32（xpar_conv_0_s00_axi_baseaddr + 8， arr［i - 1］［j］）;
xil_out32（xpar_conv_0_s00_axi_baseaddr + 20， arr［i］［j］）;
xil_out32（xpar_conv_0_s00_axi_baseaddr + 32， arr［i + 1］［j］）;
for （j = 2; j 《 arrw; j++） {
xil_out32（xpar_conv_0_s00_axi_baseaddr + 8， arr［i - 1］［j + 1］）;
xil_out32（xpar_conv_0_s00_axi_baseaddr + 20， arr［i］［j + 1］）;
xil_out32（xpar_conv_0_s00_axi_baseaddr + 32， arr［i + 1］［j + 1］）;
res［i］［j］ = xil_in32（xpar_conv_0_s00_axi_baseaddr + 72）;
}
}
}
版本二性能
测试样本 500*500的32bit单位的矩阵计算200次。
软件消耗33.78秒，卷积ip核心40.25秒
这样的结果还是非常不乐观，分析可能有两种因素限制了IP核的速度：
两个寄存器的乘法lut太大，无法硬件优化
总线周期太慢
版本三对于这两种可能进行探索。
版本二的fpga部分核心代码
// Memory-mapped register write logic (version two, sliding window).
// Write data is accepted when axi_awready, s_axi_awvalid, axi_wready and
// s_axi_wvalid are all asserted.  Write strobes select which byte lanes of
// the addressed slave register are updated.
// All registers are cleared while the active-low reset is applied.
assign slv_reg_wren = axi_wready && s_axi_wvalid && axi_awready && s_axi_awvalid;

// Returns `cur` with every byte lane whose write strobe is set replaced by
// the corresponding byte of `wdata`; lanes with a clear strobe keep `cur`.
function [c_s_axi_data_width-1 : 0] wstrb_merge;
  input [c_s_axi_data_width-1 : 0] cur;
  input [c_s_axi_data_width-1 : 0] wdata;
  input [(c_s_axi_data_width/8)-1 : 0] wstrb;
  integer lane;
  begin
    wstrb_merge = cur;
    for ( lane = 0; lane <= (c_s_axi_data_width/8)-1; lane = lane+1 )
      if ( wstrb[lane] == 1 )
        wstrb_merge[(lane*8) +: 8] = wdata[(lane*8) +: 8];
  end
endfunction

always @( posedge s_axi_aclk )
begin
  if ( s_axi_aresetn == 1'b0 )
    begin
      slv_reg0 <= 0;
      slv_reg1 <= 0;
      slv_reg2 <= 0;
      slv_reg3 <= 0;
      slv_reg4 <= 0;
      slv_reg5 <= 0;
      slv_reg6 <= 0;
      slv_reg7 <= 0;
      slv_reg8 <= 0;
      slv_reg9 <= 0;
      slv_reg10 <= 0;
      slv_reg11 <= 0;
      slv_reg12 <= 0;
      slv_reg13 <= 0;
      slv_reg14 <= 0;
      slv_reg15 <= 0;
      slv_reg16 <= 0;
      slv_reg17 <= 0;
      // slv_reg18 has no reset or write path in this block (it was
      // commented out in the original).
    end
  else if (slv_reg_wren)
    begin
      case ( axi_awaddr[addr_lsb+opt_mem_addr_bits:addr_lsb] )
        // Registers 0..8: the 3x3 data window, three registers per row.
        5'h00 : slv_reg0 <= wstrb_merge(slv_reg0, s_axi_wdata, s_axi_wstrb);
        5'h01 : slv_reg1 <= wstrb_merge(slv_reg1, s_axi_wdata, s_axi_wstrb);
        // A write to register 2 shifts row 0 (reg1 -> reg0, reg2 -> reg1)
        // and stores the new sample in reg2.
        5'h02 :
          begin
            slv_reg0 <= slv_reg1;
            slv_reg1 <= slv_reg2;
            slv_reg2 <= wstrb_merge(slv_reg2, s_axi_wdata, s_axi_wstrb);
          end
        5'h03 : slv_reg3 <= wstrb_merge(slv_reg3, s_axi_wdata, s_axi_wstrb);
        5'h04 : slv_reg4 <= wstrb_merge(slv_reg4, s_axi_wdata, s_axi_wstrb);
        // A write to register 5 shifts row 1 the same way.
        5'h05 :
          begin
            slv_reg3 <= slv_reg4;
            slv_reg4 <= slv_reg5;
            slv_reg5 <= wstrb_merge(slv_reg5, s_axi_wdata, s_axi_wstrb);
          end
        5'h06 : slv_reg6 <= wstrb_merge(slv_reg6, s_axi_wdata, s_axi_wstrb);
        5'h07 : slv_reg7 <= wstrb_merge(slv_reg7, s_axi_wdata, s_axi_wstrb);
        // A write to register 8 shifts row 2 the same way.
        5'h08 :
          begin
            slv_reg6 <= slv_reg7;
            slv_reg7 <= slv_reg8;
            slv_reg8 <= wstrb_merge(slv_reg8, s_axi_wdata, s_axi_wstrb);
          end
        // Registers 9..17: plain byte-strobed registers.
        5'h09 : slv_reg9  <= wstrb_merge(slv_reg9,  s_axi_wdata, s_axi_wstrb);
        5'h0a : slv_reg10 <= wstrb_merge(slv_reg10, s_axi_wdata, s_axi_wstrb);
        5'h0b : slv_reg11 <= wstrb_merge(slv_reg11, s_axi_wdata, s_axi_wstrb);
        5'h0c : slv_reg12 <= wstrb_merge(slv_reg12, s_axi_wdata, s_axi_wstrb);
        5'h0d : slv_reg13 <= wstrb_merge(slv_reg13, s_axi_wdata, s_axi_wstrb);
        5'h0e : slv_reg14 <= wstrb_merge(slv_reg14, s_axi_wdata, s_axi_wstrb);
        5'h0f : slv_reg15 <= wstrb_merge(slv_reg15, s_axi_wdata, s_axi_wstrb);
        5'h10 : slv_reg16 <= wstrb_merge(slv_reg16, s_axi_wdata, s_axi_wstrb);
        5'h11 : slv_reg17 <= wstrb_merge(slv_reg17, s_axi_wdata, s_axi_wstrb);
        // Any other address (including 5'h12): all registers hold their value.
        default : ;
      endcase
    end
end
// Memory-mapped register read logic.
// The read enable is asserted when a valid read address is presented, the
// slave is ready to accept it, and no read response is currently pending.
assign slv_reg_rden = axi_arready & s_axi_arvalid & ~axi_rvalid;

// Combinational read mux.  Blocking assignments are used because this is a
// combinational always block; non-blocking assignments here can cause
// simulation/synthesis mismatches.
always @(*)
begin
  // Address decoding for reading registers
  case ( axi_araddr[addr_lsb+opt_mem_addr_bits:addr_lsb] )
    5'h00 : reg_data_out = slv_reg0;
    5'h01 : reg_data_out = slv_reg1;
    5'h02 : reg_data_out = slv_reg2;
    5'h03 : reg_data_out = slv_reg3;
    5'h04 : reg_data_out = slv_reg4;
    5'h05 : reg_data_out = slv_reg5;
    5'h06 : reg_data_out = slv_reg6;
    5'h07 : reg_data_out = slv_reg7;
    5'h08 : reg_data_out = slv_reg8;
    5'h09 : reg_data_out = slv_reg9;
    5'h0a : reg_data_out = slv_reg10;
    5'h0b : reg_data_out = slv_reg11;
    5'h0c : reg_data_out = slv_reg12;
    5'h0d : reg_data_out = slv_reg13;
    5'h0e : reg_data_out = slv_reg14;
    5'h0f : reg_data_out = slv_reg15;
    5'h10 : reg_data_out = slv_reg16;
    5'h11 : reg_data_out = slv_reg17;
    // Address 18 returns the sum of products of registers 0..8 with
    // registers 9..17, computed combinationally.
    5'h12 : reg_data_out = slv_reg0 * slv_reg9 +
                           slv_reg1 * slv_reg10 +
                           slv_reg2 * slv_reg11 +
                           slv_reg3 * slv_reg12 +
                           slv_reg4 * slv_reg13 +
                           slv_reg5 * slv_reg14 +
                           slv_reg6 * slv_reg15 +
                           slv_reg7 * slv_reg16 +
                           slv_reg8 * slv_reg17;
    default : reg_data_out = 0;
  endcase
end
版本三
先尝试生成更小的lut
该模块拥有19个32位寄存器
其中前9个寄存器用来保存需要计算的值
卷积核固定在verilog中，用来生成更小的lut
一个计算只需要四个总线周期
性能测试
仍然软件消耗33秒，卷积ip核心40秒
基本可以排除是LUT的问题。
下面测试axi总线问题：
假设所有数据均来自于fpga，无需从总线写入：
void conv_hw（int filter［3］［3］， int arr［100］［100］， int arrw， int arrh） {
int i， j;
i = 2; j = 2;
for （i = 2; i 《 arrh; i++） {
for （j = 2; j 《 arrw; j++） {
res［i］［j］ = xil_in32（xpar_conv_0_s00_axi_baseaddr + 72）;
}
}
}
只需要9.47秒即可完成计算，并传回cpu ！！！
总结
至此，基本上可以否决利用axi传数据的可能，所有需要利用axi总线传输数据的模块均会被总线周期所连累，在优化了传输后，仍然无法解决该问题。确实需要一个更快的方式来传输数据。
在 Altera 的 Nios II 中，可以直接利用 IO 口传输数据，无需总线周期；再加上 Nios II 内核没有流水线优化，所以硬件加速相对于软件确实比较快。
附1:axi4 总线的 fpga 接口部分
先看总线接口：
// users to add ports here
// user ports ends
// do not modify the ports beyond this line
// global clock signal
// 全局时钟
input wire s_axi_aclk，
// global reset signal. this signal is active low
// 全局复位信号
input wire s_axi_aresetn，
// write address （issued by master， acceped by slave）
// 写地址
input wire ［c_s_axi_addr_width-1 ： 0］ s_axi_awaddr，
// 写地址的保护模式包括privilege和security level
// write channel protection type. this signal indicates the
// privilege and security level of the transaction， and whether
// the transaction is a data access or an instruction access.
input wire ［2 ： 0］ s_axi_awprot，
// 写地址有效信号。为高指示地址有效。
// write address valid. this signal indicates that the master signaling
// valid write address and control information.
input wire s_axi_awvalid，
// 写地址准备信号。为高表示从设备空闲，准备接收地址；为低表示从设备忙。
// ********** 注意这里是地址下面是数据 ********
// write address ready. this signal indicates that the slave is ready
// to accept an address and associated control signals.
output wire s_axi_awready，
// 写数据，32位到1024位宽
// 从主设备来的数据从设备接收
// write data （issued by master， acceped by slave）
input wire ［c_s_axi_data_width-1 ： 0］ s_axi_wdata，
// 写字节选通，用于表示更新存储器的字节通道，对于数据总线的每8位数据有一位写选通信号。
// write strobes. this signal indicates which byte lanes hold
// valid data. there is one write strobe bit for each eight
// bits of the write data bus.
input wire ［（c_s_axi_data_width/8）-1 ： 0］ s_axi_wstrb，
// 写有效。为高指示数据有效。
// write valid. this signal indicates that valid write
// data and strobes are available.
input wire s_axi_wvalid，
// 写准备。为高表示从设备空闲，准备接收数据；为低表示从设备忙。
// write ready. this signal indicates that the slave
// can accept the write data.
output wire s_axi_wready，
// 写响应。该信号表示写状态，可能的响应为 OKAY、EXOKAY、SLVERR、DECERR。
// write response. this signal indicates the status
// of the write transaction.
output wire ［1 ： 0］ s_axi_bresp，
// 写响应有效。为高指示响应数据有效
// write response valid. this signal indicates that the channel
// is signaling a valid write response.
output wire s_axi_bvalid，
// 写响应准备。为高表示主设备空闲，准备接收写响应；为低表示主设备忙。
// response ready. this signal indicates that the master
// can accept a write response.
input wire s_axi_bready，
//
// 读地址。读地址给出突发数据传输的第一个传输地址。
// read address （issued by master， acceped by slave）
input wire ［c_s_axi_addr_width-1 ： 0］ s_axi_araddr，
// 保护类型，建议值为000。
// protection type. this signal indicates the privilege
// and security level of the transaction， and whether the
// transaction is a data access or an instruction access.
input wire ［2 ： 0］ s_axi_arprot，
//
// read address valid. this signal indicates that the channel
// is signaling valid read address and control information.
input wire s_axi_arvalid，
// 读地址准备信号。为高表示从设备空闲，准备接收地址；为低表示从设备忙。
// read address ready. this signal indicates that the slave is
// ready to accept an address and associated control signals.
output wire s_axi_arready，
// read data （issued by slave）
output wire ［c_s_axi_data_width-1 ： 0］ s_axi_rdata，
// read response. this signal indicates the status of the
// read transfer.
output wire ［1 ： 0］ s_axi_rresp，
// read valid. this signal indicates that the channel is
// signaling the required read data.
output wire s_axi_rvalid，
// read ready. this signal indicates that the master can
// accept the read data and response information.
input wire s_axi_rready
）;
// Internal AXI4-Lite signal registers
reg [c_s_axi_addr_width-1 : 0] axi_awaddr;
reg axi_awready;
reg axi_wready;
reg [1 : 0] axi_bresp;
reg axi_bvalid;
reg [c_s_axi_addr_width-1 : 0] axi_araddr;
reg axi_arready;
reg [c_s_axi_data_width-1 : 0] axi_rdata;
reg [1 : 0] axi_rresp;
reg axi_rvalid;
其中最为重要的读取总线信号寻址的部分：
// A register write happens in the cycle where both the write-address and
// the write-data handshakes complete.
assign slv_reg_wren = axi_wready && s_axi_wvalid && axi_awready && s_axi_awvalid;

// Returns `cur` with the byte lanes selected by `wstrb` replaced by the
// matching bytes of `wdata`.  A lane is only written when its strobe bit
// is 1.
function [c_s_axi_data_width-1 : 0] merge_bytes;
  input [c_s_axi_data_width-1 : 0] cur;
  input [c_s_axi_data_width-1 : 0] wdata;
  input [(c_s_axi_data_width/8)-1 : 0] wstrb;
  integer lane;
  begin
    merge_bytes = cur;
    for ( lane = 0; lane <= (c_s_axi_data_width/8)-1; lane = lane+1 )
      if ( wstrb[lane] == 1 )
        merge_bytes[(lane*8) +: 8] = wdata[(lane*8) +: 8];
  end
endfunction

always @( posedge s_axi_aclk )
begin
  if ( s_axi_aresetn == 1'b0 )
    begin
      slv_reg0 <= 0;
      slv_reg1 <= 0;
      slv_reg2 <= 0;
      slv_reg3 <= 0;
      slv_reg4 <= 0;
      slv_reg5 <= 0;
      slv_reg6 <= 0;
      slv_reg7 <= 0;
      slv_reg8 <= 0;
      slv_reg9 <= 0;
    end
  else if (slv_reg_wren)
    begin
      // Address decoding:
      //  - the low addr_lsb bits are the byte offset inside one register:
      //    32-bit registers -> 4 bytes -> addr_lsb = 2,
      //    64-bit registers -> 8 bytes -> addr_lsb = 3;
      //  - the slice [addr_lsb+opt_mem_addr_bits : addr_lsb] is the register
      //    index.  Ten registers need a 4-bit index, matching the 4'hN
      //    labels below.
      case ( axi_awaddr[addr_lsb+opt_mem_addr_bits:addr_lsb] )
        4'h0 : slv_reg0 <= merge_bytes(slv_reg0, s_axi_wdata, s_axi_wstrb);
        4'h1 : slv_reg1 <= merge_bytes(slv_reg1, s_axi_wdata, s_axi_wstrb);
        4'h2 : slv_reg2 <= merge_bytes(slv_reg2, s_axi_wdata, s_axi_wstrb);
        4'h3 : slv_reg3 <= merge_bytes(slv_reg3, s_axi_wdata, s_axi_wstrb);
        4'h4 : slv_reg4 <= merge_bytes(slv_reg4, s_axi_wdata, s_axi_wstrb);
        4'h5 : slv_reg5 <= merge_bytes(slv_reg5, s_axi_wdata, s_axi_wstrb);
        4'h6 : slv_reg6 <= merge_bytes(slv_reg6, s_axi_wdata, s_axi_wstrb);
        4'h7 : slv_reg7 <= merge_bytes(slv_reg7, s_axi_wdata, s_axi_wstrb);
        4'h8 : slv_reg8 <= merge_bytes(slv_reg8, s_axi_wdata, s_axi_wstrb);
        4'h9 : slv_reg9 <= merge_bytes(slv_reg9, s_axi_wdata, s_axi_wstrb);
        // Any other address: all registers hold their value.
        default : ;
      endcase
    end
end
附2:axi4的测试模块与仿真测试
`timescale 1ns/1ns
// Testbench for conv_v1_0_s00_axi: writes nine values to byte addresses
// 0x00..0x20, then issues a read of byte address 0x24, and dumps all signals
// to test.vcd.
// NOTE(review): the stimulus is purely time-based (one new address/data pair
// every 4 ns = two clock periods) and does not wait for the ready/valid
// handshakes -- confirm this matches the latency of the slave under test.
module conv_axi_test();
  parameter integer c_s00_axi_data_width = 32;
  parameter integer c_s00_axi_addr_width = 6;

  reg s00_axi_aclk;
  // Global reset, active low
  reg s00_axi_aresetn;

  // Write address channel
  reg [c_s00_axi_addr_width-1 : 0] s00_axi_awaddr;
  // Protection bits are master outputs; they were left floating in the
  // original and are tied to 0 here.
  wire [2 : 0] s00_axi_awprot = 3'b000;
  reg s00_axi_awvalid;
  wire s00_axi_awready;

  // Write data channel
  reg [c_s00_axi_data_width-1 : 0] s00_axi_wdata;
  reg [(c_s00_axi_data_width/8)-1 : 0] s00_axi_wstrb;
  reg s00_axi_wvalid;
  wire s00_axi_wready;

  // Write response channel.  BREADY is a master output; it was undriven
  // (high-Z) in the original, so the response handshake could never complete.
  wire [1 : 0] s00_axi_bresp;
  wire s00_axi_bvalid;
  wire s00_axi_bready = 1'b1;

  // Read address channel
  reg [c_s00_axi_addr_width-1 : 0] s00_axi_araddr;
  wire [2 : 0] s00_axi_arprot = 3'b000;
  reg s00_axi_arvalid;
  wire s00_axi_arready;

  // Read data channel.  RREADY is a master output and was also undriven.
  wire [c_s00_axi_data_width-1 : 0] s00_axi_rdata;
  wire [1 : 0] s00_axi_rresp;
  wire s00_axi_rvalid;
  wire s00_axi_rready = 1'b1;

  conv_v1_0_s00_axi # (
    .c_s_axi_data_width(c_s00_axi_data_width),
    .c_s_axi_addr_width(c_s00_axi_addr_width)
  ) conv_v1_0_s00_axi_inst (
    .s_axi_aclk(s00_axi_aclk),
    .s_axi_aresetn(s00_axi_aresetn),
    .s_axi_awaddr(s00_axi_awaddr),
    .s_axi_awprot(s00_axi_awprot),
    .s_axi_awvalid(s00_axi_awvalid),
    .s_axi_awready(s00_axi_awready),
    .s_axi_wdata(s00_axi_wdata),
    .s_axi_wstrb(s00_axi_wstrb),
    .s_axi_wvalid(s00_axi_wvalid),
    .s_axi_wready(s00_axi_wready),
    .s_axi_bresp(s00_axi_bresp),
    .s_axi_bvalid(s00_axi_bvalid),
    .s_axi_bready(s00_axi_bready),
    .s_axi_araddr(s00_axi_araddr),
    .s_axi_arprot(s00_axi_arprot),
    .s_axi_arvalid(s00_axi_arvalid),
    .s_axi_arready(s00_axi_arready),
    .s_axi_rdata(s00_axi_rdata),
    .s_axi_rresp(s00_axi_rresp),
    .s_axi_rvalid(s00_axi_rvalid),
    .s_axi_rready(s00_axi_rready)
  );

  // Clock: toggles every 1 ns (2 ns period) for 1000 half-periods, then the
  // simulation ends.
  initial
    begin : d
      integer i;
      s00_axi_aclk = 1;
      for (i = 0; i < 1000; i = i + 1)
        begin
          #1 s00_axi_aclk = ~s00_axi_aclk;
        end
      $finish();
    end

  // Stimulus
  initial
    begin
      // Give every master-driven signal a defined value while in reset.
      s00_axi_aresetn = 0;
      s00_axi_arvalid = 0;
      s00_axi_araddr = 0;
      s00_axi_awvalid = 0;
      s00_axi_wvalid = 0;
      s00_axi_awaddr = 0;
      s00_axi_wdata = 0;
      s00_axi_wstrb = 0;

      // Release reset and start writing registers 0..8.
      #4 s00_axi_aresetn = 1;
      s00_axi_awvalid = 1;
      s00_axi_wvalid = 1;
      s00_axi_awaddr = 0;
      s00_axi_wstrb = 4'b1111;
      s00_axi_wdata = 3;
      #4 s00_axi_awaddr = 6'b000100;
      s00_axi_wdata = 21;
      #4 s00_axi_awaddr = 6'b001000;
      s00_axi_wdata = 19;
      #4 s00_axi_awaddr = 6'b001100;
      s00_axi_wdata = 22;
      #4 s00_axi_awaddr = 6'b010000;
      s00_axi_wdata = 20;
      #4 s00_axi_awaddr = 6'b010100;
      s00_axi_wdata = 13;
      #4 s00_axi_awaddr = 6'b011000;
      s00_axi_wdata = 16;
      #4 s00_axi_awaddr = 6'b011100;
      s00_axi_wdata = 14;
      #4 s00_axi_awaddr = 6'b100000;
      s00_axi_wdata = 7;

      // Read byte address 0x24.
      #4
      s00_axi_arvalid = 1;
      s00_axi_araddr = 6'b100100;
    end

  // Waveform dump
  initial
    begin
      $dumpfile("test.vcd");
      $dumpvars();
    end
endmodule
利用iverilog进行仿真gtkwave显示测试波形如下
新建ip核如下：
工程顶层图如下：
附3：软件驱动
#include <stdio.h>
#include <stdlib.h>
#include "platform.h"
#include "xbasic_types.h"
#include "xparameters.h"
#include "xil_io.h"
/* Build-time switch: selects the benchmark section of main. */
#define test_speed
/* Output buffer written by both the software and the hardware convolution. */
int res[1000][1000];
/*
 * Busy-wait for 1000 * 1000 * 100 loop iterations.
 *
 * The counters are volatile so an optimising compiler cannot delete the
 * otherwise empty loops, which it was free to do in the original.
 */
void delay() {
    volatile int i, j, k;

    for (i = 0; i < 1000; i++) {
        for (j = 0; j < 1000; j++) {
            for (k = 0; k < 100; k++) {
                /* intentionally empty */
            }
        }
    }
}
void show_reg（） {
int i;
u32 result;
printf（“============show reg ================”）;
for （i = 0; i 《 9; i++） {
result = xil_in32（xpar_conv_0_s00_axi_baseaddr + 4 * i）;
printf（“reg %3d ： %u”， i， result）;
}
}
void load_kernel（int filter［3］［3］） {
uintptr kernel_addr = （uintptr） xpar_conv_0_s00_axi_baseaddr + 36;
xil_out32（kernel_addr， filter［0］［0］）;
kernel_addr = kernel_addr + 0x4;
xil_out32（kernel_addr， filter［0］［1］）;
kernel_addr = kernel_addr + 0x4;
xil_out32（kernel_addr， filter［0］［2］）;
kernel_addr = kernel_addr + 0x4;
xil_out32（kernel_addr， filter［1］［0］）;
kernel_addr = kernel_addr + 0x4;
xil_out32（kernel_addr， filter［1］［1］）;
kernel_addr = kernel_addr + 0x4;
xil_out32（kernel_addr， filter［1］［2］）;
kernel_addr = kernel_addr + 0x4;
xil_out32（kernel_addr， filter［2］［0］）;
kernel_addr = kernel_addr + 0x4;
xil_out32（kernel_addr， filter［2］［1］）;
kernel_addr = kernel_addr + 0x4;
xil_out32（kernel_addr， filter［2］［2］）;
}
void test_set（） {
xil_out32（xpar_conv_0_s00_axi_baseaddr + 8， 3）;
xil_out32（xpar_conv_0_s00_axi_baseaddr + 20， 22）;
xil_out32（xpar_conv_0_s00_axi_baseaddr + 32， 16）;
printf（“1”）;
show_reg（）;
xil_out32（xpar_conv_0_s00_axi_baseaddr + 8， 21）;
xil_out32（xpar_conv_0_s00_axi_baseaddr + 20， 20）;
xil_out32（xpar_conv_0_s00_axi_baseaddr + 32， 14）;
printf（“2”）;
show_reg（）;
xil_out32（xpar_conv_0_s00_axi_baseaddr + 8， 19）;
xil_out32（xpar_conv_0_s00_axi_baseaddr + 20， 13）;
xil_out32（xpar_conv_0_s00_axi_baseaddr + 32， 7）;
printf（“3”）;
show_reg（）;
}
void conv_sw（int filter［3］［3］， int arr［100］［100］， int arrw， int arrh） {
int i， j;
i = 2; j = 2;
for （i = 2; i 《 arrh; i++） {
for （j = 2; j 《 arrw;j++）{
res［i］［j］ = 0;
res［i］［j］ += filter［0］［0］ * arr［i - 1］［j - 1］;
res［i］［j］ += filter［0］［1］ * arr［i - 1］［j］;
res［i］［j］ += filter［0］［2］ * arr［i - 1］［j + 1］;
res［i］［j］ += filter［1］［0］ * arr［i］［j - 1］;
res［i］［j］ += filter［1］［1］ * arr［i］［j］;
res［i］［j］ += filter［1］［2］ * arr［i］［j + 1］;
res［i］［j］ += filter［2］［0］ * arr［i + 1］［j - 1］;
res［i］［j］ += filter［2］［1］ * arr［i + 1］［j］;
res［i］［j］ += filter［2］［2］ * arr［i + 1］［j + 1］;
}
}
}
void conv_hw（int filter［3］［3］， int arr［100］［100］， int arrw， int arrh） {
int i， j;
i = 2; j = 2;
for （i = 2; i 《 arrh; i++） {
//pre load
xil_out32（xpar_conv_0_s00_axi_baseaddr + 8， arr［i - 1］［j - 1］）;
xil_out32（xpar_conv_0_s00_axi_baseaddr + 20， arr［i］［j - 1］）;
xil_out32（xpar_conv_0_s00_axi_baseaddr + 32， arr［i + 1］［j - 1］）;
xil_out32（xpar_conv_0_s00_axi_baseaddr + 8， arr［i - 1］［j］）;
xil_out32（xpar_conv_0_s00_axi_baseaddr + 20， arr［i］［j］）;
xil_out32（xpar_conv_0_s00_axi_baseaddr + 32， arr［i + 1］［j］）;
for （j = 2; j 《 arrw; j++） {
xil_out32（xpar_conv_0_s00_axi_baseaddr + 8， arr［i - 1］［j + 1］）;
xil_out32（xpar_conv_0_s00_axi_baseaddr + 20， arr［i］［j + 1］）;
xil_out32（xpar_conv_0_s00_axi_baseaddr + 32， arr［i + 1］［j + 1］）;
res［i］［j］ = xil_in32（xpar_conv_0_s00_axi_baseaddr + 72）;
}
}
}
int main（） {
printf（“hello world”）;
u32 result;
int filterw = 3;
int filterh = 3;
int arrw = 5;
int arrh = 5;
int resw = filterw + arrw - 1;
int resh = filterh + arrh - 1;
int i， j;
int pfilter［3］［3］;
int arr［100］［100］;
uintptr cur_addr = （uintptr） xpar_conv_0_s00_axi_baseaddr;
pfilter［0］［0］ = 1;
pfilter［0］［1］ = 3;
pfilter［0］［2］ = 1;
pfilter［1］［0］ = 0;
pfilter［1］［1］ = 5;
pfilter［1］［2］ = 0;
pfilter［2］［0］ = 2;
pfilter［2］［1］ = 1;
pfilter［2］［2］ = 2;
init_platform（）;
for （i = 0; i 《 9; i++） {
xil_out32（cur_addr， 0）;
cur_addr = cur_addr + 4;
}
load_kernel（pfilter）;
printf（“kernel loaded”）;
#ifdef test_single
test_set（）;
result = xil_in32（xpar_conv_0_s00_axi_baseaddr + 72）;
printf（“test set result %u”， result）;
show_reg（）;
#endif
#ifdef test_func
srand（10）;
arrw = 20;
arrh = 20;
resh = filterh + arrh - 1;
resw = filterw + arrw - 1;
for （i = 0; i 《 arrh; i++） {
for （j = 0; j 《 arrw; j++） {
arr［i］［j］ = rand（） % 20;
}
}
printf（“*********************************************** ”）;
printf（“filter： ”）;
for （i = filterh - 1; i 》= 0; i--） {
for （j = filterw - 1; j 》= 0; j--） {
printf（“%d ”， pfilter［i］［j］）;
}
printf（“”）;
}
printf（“*********************************************** ”）;
printf（“matrix： ”）;
for （i = 0; i 《 arrh; i++） {
for （j = 0; j 《 arrw; j++） {
printf（“%4d ”， arr［i］［j］）;
}
printf（“”）;
}
printf（“*********************************************** ”）;
printf（“software start！”）;
conv_sw（pfilter， arr， arrw， arrh）;
printf（“software end！”）;
printf（“*********************************************** ”）;
printf（“result1： ”）;
for （i = 0; i 《 resh; i++） {
for （j = 0; j 《 resw; j++） {
printf（“%5d ”， res［i］［j］）;
}
printf（“”）;
}
for （i = 0; i 《 resh; i++） {
for （j = 0; j 《 resw; j++） {
res［i］［j］ = 0;
}
}
printf（“*********************************************** ”）;
printf（“hardware start！”）;
conv_hw（pfilter， arr， arrw， arrh）;
printf（“hardware end！”）;
printf（“result2： ”）;
for （i = 0; i 《 resh; i++） {
for （j = 0; j 《 resw; j++） {
printf（“%5d ”， res［i］［j］）;
}
printf（“”）;
}
printf（“*********************************************** ”）;
#endif
#ifdef test_speed
arrw = 500;
arrh = 500;
resh = filterh + arrh - 1;
resw = filterw + arrw - 1;
printf（“software start！”）;
for（i = 0; i《 200;i++） {
conv_sw（pfilter， arr， arrw， arrh）;
}
printf（“software end！”）;
printf（“hardware start！”）;
for（i = 0; i《 200;i++） {
conv_hw（pfilter， arr， arrw， arrh）;
}
printf（“hardware end！”）;
cleanup_platform（）;
#endif
return 0;
}

洲明科技出资2亿在成都设立全资子公司
我国铁路营业里程将达13.9万公里高铁3.5万公里稳居世界第一
赵明：物联网将带来消费革命的下一个
电压互感器运行注意事项
M5256-000002-350BG变送器如何检测好坏
三个不同AXI IP核的实现的方法_性能的对比及差异的分析
美国政府启动10亿资金促进人工智能和量子计算的发展
Achronix和BittWare推出采用FPGA芯片的加速卡
iPhone7上市时间确定？预售竟是9月9日
Highly Integrated UMTS Femto B
仿生机器狗有什么用，它的实际用途有哪些
ADC0809的主要特性/内部结构以及转换过程
MIE-5408光纤交换机安全解决方案
高速视频处理系统中的信号完整性分析
奥迪威获颁ISO 26262：2018 ASIL-D功能安全流程认证证书
Wi-Fi承担接力最后百米重任摩托罗拉系统提出四个现代化
焦耳小偷制作图解
电焊台的工作原理
索尼 XZ Premium价格？中国市场最晚于6月正式开卖，你期待吗？
2020汽车电动化与智能化技术国际论坛顺利闭幕