FPGA神经网络加速开发流程模板涵盖Verilog模块、OpenCL内核和仿真测试框架。
一、 Verilog RTL级开发模板
1.1 顶层模块模板
// ============================================================
// File: dnn_accelerator_top.v
// Module: dnn_accelerator_top
// Function: 深度神经网络加速器顶层模块
// Author: Your Name
// Version: 1.0
// Date: 2026-04-24
// ============================================================
`timescale 1ns / 1ps
//
// Top-level module of the deep neural network accelerator.
// Control path: AXI-Lite slave registers (control/status/config).
// Data path:    AXI-Stream in -> dnn_engine -> AXI-Stream out.
// Single clock domain; no CDC is handled here.
//
module dnn_accelerator_top #(
    // ---------- System parameters ----------
    parameter CLK_FREQ       = 100_000_000, // system clock frequency (Hz)
    // ---------- Bus parameters ----------
    parameter AXI_DATA_WIDTH = 64,          // AXI data width
    parameter AXI_ADDR_WIDTH = 32,          // AXI address width
    // ---------- Network parameters ----------
    parameter INPUT_WIDTH    = 224,         // input image width
    parameter INPUT_HEIGHT   = 224,         // input image height
    parameter INPUT_CHANNEL  = 3,           // input channel count
    parameter OUTPUT_CLASS   = 1000,        // output class count
    // ---------- Arithmetic precision ----------
    parameter DATA_WIDTH     = 16,          // activation width (8/16/32)
    parameter WEIGHT_WIDTH   = 16,          // weight width
    parameter BIAS_WIDTH     = 16           // bias width
)(
    // ---------- Clock / reset ----------
    input  wire                          clk_i,     // system clock
    input  wire                          rst_n_i,   // active-low asynchronous reset
    // ---------- AXI-Lite control interface ----------
    // Write address channel
    input  wire [AXI_ADDR_WIDTH-1:0]     axi_awaddr_i,
    input  wire                          axi_awvalid_i,
    output wire                          axi_awready_o,
    // Write data channel
    input  wire [AXI_DATA_WIDTH-1:0]     axi_wdata_i,
    input  wire [(AXI_DATA_WIDTH/8)-1:0] axi_wstrb_i,
    input  wire                          axi_wvalid_i,
    output wire                          axi_wready_o,
    // Write response channel
    output wire [1:0]                    axi_bresp_o,
    output wire                          axi_bvalid_o,
    input  wire                          axi_bready_i,
    // Read address channel
    input  wire [AXI_ADDR_WIDTH-1:0]     axi_araddr_i,
    input  wire                          axi_arvalid_i,
    output wire                          axi_arready_o,
    // Read data channel
    output wire [AXI_DATA_WIDTH-1:0]     axi_rdata_o,
    output wire [1:0]                    axi_rresp_o,
    output wire                          axi_rvalid_o,
    input  wire                          axi_rready_i,
    // ---------- AXI-Stream data interface ----------
    // Input stream
    input  wire                          s_axis_tvalid_i,
    input  wire [AXI_DATA_WIDTH-1:0]     s_axis_tdata_i,
    input  wire                          s_axis_tlast_i,
    output wire                          s_axis_tready_o,
    // Output stream
    output wire                          m_axis_tvalid_o,
    output wire [AXI_DATA_WIDTH-1:0]     m_axis_tdata_o,
    output wire                          m_axis_tlast_o,
    input  wire                          m_axis_tready_i,
    // ---------- Status ----------
    output wire                          busy_o,    // accelerator busy flag
    output wire                          error_o,   // error flag
    output wire [3:0]                    status_o   // status code
);
    // ========== Internal signals ==========
    wire clk;
    wire rst_n;
    // Control/status registers.  These are driven by submodule outputs,
    // so they must be nets, not variables.
    wire [31:0] control_reg;
    wire [31:0] status_reg;
    wire [31:0] config_reg [0:15];
    // Engine completion handshake
    wire        dnn_done;
    // Data path (unpacked array ports require SystemVerilog)
    wire                  data_valid;
    wire [DATA_WIDTH-1:0] data_in  [0:INPUT_CHANNEL-1];
    wire [DATA_WIDTH-1:0] data_out [0:OUTPUT_CLASS-1];

    // ========== Clock and reset ==========
    assign clk   = clk_i;
    assign rst_n = rst_n_i;

    // Status register layout: bit 0 = done.  No error source is wired up
    // yet, so the error bit (31) stays low.
    assign status_reg = {31'd0, dnn_done};

    // ========== AXI-Lite slave ==========
    axi_lite_slave #(
        .DATA_WIDTH(AXI_DATA_WIDTH),
        .ADDR_WIDTH(AXI_ADDR_WIDTH),
        .REG_NUM   (16)
    ) u_axi_lite (
        .clk  (clk),
        .rst_n(rst_n),
        // AXI interface
        .axi_awaddr_i (axi_awaddr_i),
        .axi_awvalid_i(axi_awvalid_i),
        .axi_awready_o(axi_awready_o),
        .axi_wdata_i  (axi_wdata_i),
        .axi_wstrb_i  (axi_wstrb_i),
        .axi_wvalid_i (axi_wvalid_i),
        .axi_wready_o (axi_wready_o),
        .axi_bresp_o  (axi_bresp_o),
        .axi_bvalid_o (axi_bvalid_o),
        .axi_bready_i (axi_bready_i),
        .axi_araddr_i (axi_araddr_i),
        .axi_arvalid_i(axi_arvalid_i),
        .axi_arready_o(axi_arready_o),
        .axi_rdata_o  (axi_rdata_o),
        .axi_rresp_o  (axi_rresp_o),
        .axi_rvalid_o (axi_rvalid_o),
        .axi_rready_i (axi_rready_i),
        // Register interface
        .control_reg_o(control_reg),
        .status_reg_i (status_reg),
        .config_reg_o (config_reg)
    );

    // ========== Input stream interface ==========
    stream_interface #(
        .DATA_WIDTH(AXI_DATA_WIDTH)
    ) u_stream_in (
        .clk  (clk),
        .rst_n(rst_n),
        // AXI-Stream side
        .s_axis_tvalid_i(s_axis_tvalid_i),
        .s_axis_tdata_i (s_axis_tdata_i),
        .s_axis_tlast_i (s_axis_tlast_i),
        .s_axis_tready_o(s_axis_tready_o),
        // Internal side
        .data_valid_o   (data_valid),
        .data_o         (data_in)
    );

    // ========== DNN compute engine ==========
    dnn_engine #(
        .INPUT_WIDTH  (INPUT_WIDTH),
        .INPUT_HEIGHT (INPUT_HEIGHT),
        .INPUT_CHANNEL(INPUT_CHANNEL),
        .OUTPUT_CLASS (OUTPUT_CLASS),
        .DATA_WIDTH   (DATA_WIDTH),
        .WEIGHT_WIDTH (WEIGHT_WIDTH),
        .BIAS_WIDTH   (BIAS_WIDTH)
    ) u_dnn_engine (
        .clk  (clk),
        .rst_n(rst_n),
        // Control
        .start_i       (control_reg[0]),
        .done_o        (dnn_done),
        // Configuration
        .layer_config_i(config_reg),
        // Data
        .data_valid_i  (data_valid),
        .data_i        (data_in),
        .data_o        (data_out)
    );

    // ========== Output stream interface ==========
    stream_interface_out #(
        .DATA_WIDTH  (AXI_DATA_WIDTH),
        .OUTPUT_WIDTH(OUTPUT_CLASS)
    ) u_stream_out (
        .clk  (clk),
        .rst_n(rst_n),
        // Internal side: results become valid when the engine is done
        .data_valid_i   (dnn_done),
        .data_i         (data_out),
        // AXI-Stream side
        .m_axis_tvalid_o(m_axis_tvalid_o),
        .m_axis_tdata_o (m_axis_tdata_o),
        .m_axis_tlast_o (m_axis_tlast_o),
        .m_axis_tready_i(m_axis_tready_i)
    );

    // ========== Status monitoring ==========
    assign busy_o   = control_reg[0] & ~status_reg[0]; // started but not done
    assign error_o  = status_reg[31];
    assign status_o = status_reg[3:0];
endmodule
1.2 卷积计算单元模板
// ============================================================
// File: conv_3x3_pe.v
// Module: conv_3x3_pe
// Function: 3×3卷积处理单元 (参考图7-4)
// ============================================================
//
// 3x3 convolution processing-element array: 9 multiply PEs feed one
// adder tree per output channel.
//
module conv_3x3_pe #(
    parameter DATA_WIDTH   = 8,
    parameter CH_IN_WIDTH  = 16,
    parameter CH_OUT_WIDTH = 16,
    parameter PARALLEL_PE  = 9   // number of parallel PEs (3x3 = 9)
)(
    input  wire                    clk,
    input  wire                    rst_n,
    // Data input (one sample per kernel tap)
    input  wire                    data_valid_i,
    input  wire [DATA_WIDTH-1:0]   data_i   [PARALLEL_PE-1:0],
    // Weight input
    input  wire                    weight_valid_i,
    input  wire [DATA_WIDTH-1:0]   weight_i [PARALLEL_PE-1:0],
    // Control
    input  wire                    start_i,
    output wire                    done_o,
    // Results (one per output channel)
    output wire                    result_valid_o,
    output wire [DATA_WIDTH*2-1:0] result_o [CH_OUT_WIDTH-1:0]
);
    // ========== Registers ==========
    reg [DATA_WIDTH-1:0] weight_reg [PARALLEL_PE-1:0];
    reg                  compute_en;
    reg [7:0]            cycle_cnt;
    integer              k;

    // ========== PE array ==========
    wire [DATA_WIDTH*2-1:0] pe_result [PARALLEL_PE-1:0][CH_OUT_WIDTH-1:0];
    // Packed so that the reduction-AND below is legal.
    wire [PARALLEL_PE-1:0]  pe_valid;
    genvar i, j;
    generate
        for (i = 0; i < PARALLEL_PE; i = i + 1) begin : pe_array
            // Each PE multiplies one tap's input with its cached weight.
            pe_mult_add #(
                .DATA_WIDTH  (DATA_WIDTH),
                .CH_OUT_WIDTH(CH_OUT_WIDTH)
            ) u_pe (
                .clk     (clk),
                .rst_n   (rst_n),
                .data_i  (data_i[i]),
                .weight_i(weight_reg[i]),
                .valid_i (compute_en),
                .result_o(pe_result[i]),
                .valid_o (pe_valid[i])
            );
        end
    endgenerate

    // ========== Adder trees ==========
    // One tree per output channel, summing the 9 tap products.
    wire [DATA_WIDTH*2-1:0] accum_result [CH_OUT_WIDTH-1:0];
    wire [CH_OUT_WIDTH-1:0] tree_valid;
    generate
        for (j = 0; j < CH_OUT_WIDTH; j = j + 1) begin : accum_tree
            adder_tree #(
                .INPUT_NUM (PARALLEL_PE),
                .DATA_WIDTH(DATA_WIDTH*2)
            ) u_adder_tree (
                .clk  (clk),
                .rst_n(rst_n),
                // BUGFIX: feed one product per PE (pe_result[0..8][j])
                // instead of replicating the same element nine times.
                .data_i({
                    pe_result[8][j], pe_result[7][j], pe_result[6][j],
                    pe_result[5][j], pe_result[4][j], pe_result[3][j],
                    pe_result[2][j], pe_result[1][j], pe_result[0][j]
                }),
                .valid_i(&pe_valid),
                .sum_o  (accum_result[j]),
                // BUGFIX: each tree drives its own valid; result_valid_o
                // was previously multiply-driven by every instance.
                .valid_o(tree_valid[j])
            );
            assign result_o[j] = accum_result[j];
        end
    endgenerate
    // All trees share identical timing; tree 0 represents them all.
    assign result_valid_o = tree_valid[0];

    // ========== Control logic ==========
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            compute_en <= 1'b0;
            cycle_cnt  <= 8'd0;
            for (k = 0; k < PARALLEL_PE; k = k + 1) begin
                weight_reg[k] <= {DATA_WIDTH{1'b0}};
            end
        end else begin
            // Weight load
            if (weight_valid_i) begin
                for (k = 0; k < PARALLEL_PE; k = k + 1) begin
                    weight_reg[k] <= weight_i[k];
                end
            end
            // Compute window: starts on start+data, runs 10 cycles.
            if (start_i && data_valid_i) begin
                compute_en <= 1'b1;
                cycle_cnt  <= 8'd0;
            end else if (cycle_cnt == 8'd10) begin
                compute_en <= 1'b0;
            end
            if (compute_en) begin
                cycle_cnt <= cycle_cnt + 1'b1;
            end
        end
    end
    assign done_o = (cycle_cnt == 8'd10);
endmodule
二、 OpenCL内核开发模板
2.1 卷积内核模板
// ============================================================
// File: dnn_convolution.cl
// Kernel: convolution_3x3
// Function: 3×3卷积OpenCL内核 (参考8.4节)
// ============================================================
// 常量定义
// Fixed layer-shape constants
#define CHANNELS_IN  3
#define CHANNELS_OUT 64
#define KERNEL_SIZE  3
#define IMAGE_WIDTH  224
#define IMAGE_HEIGHT 224

// Data types
typedef short data_t;   // 16-bit fixed-point activation
typedef short weight_t; // 16-bit fixed-point weight
typedef int   acc_t;    // 32-bit accumulator

// 3x3 convolution kernel - ND-range version.
// One work item computes one output element at (gx, gy, gc).
// Output is assumed to have the same spatial size as the input
// (stride 1 with "same" padding).
__kernel void convolution_3x3_ndrange(
    __global const data_t   *restrict input,   // input image   [H][W][Ci]
    __global const weight_t *restrict weights, // weights       [Co][Ci][Kh][Kw]
    __global const data_t   *restrict biases,  // biases        [Co]
    __global acc_t          *restrict output,  // output map    [Ho][Wo][Co]
    __constant int          *restrict params   // params: stride, padding, ...
)
{
    // Global IDs
    const int gx = get_global_id(0); // output x
    const int gy = get_global_id(1); // output y
    const int gc = get_global_id(2); // output channel

    // Bounds check
    if (gx >= IMAGE_WIDTH || gy >= IMAGE_HEIGHT || gc >= CHANNELS_OUT) {
        return;
    }

    // Runtime parameters
    const int stride  = params[0];
    const int padding = params[1];

    // Top-left input coordinate of this output's receptive field
    const int in_x = gx * stride - padding;
    const int in_y = gy * stride - padding;

    acc_t sum = 0;

    // channel x kh x kw MAC loop, fully unrolled for the FPGA pipeline
    #pragma unroll
    for (int ci = 0; ci < CHANNELS_IN; ci++) {
        #pragma unroll
        for (int kh = 0; kh < KERNEL_SIZE; kh++) {
            #pragma unroll
            for (int kw = 0; kw < KERNEL_SIZE; kw++) {
                int ix = in_x + kw;
                int iy = in_y + kh;
                // Zero padding: skip taps that fall outside the image
                if (ix >= 0 && ix < IMAGE_WIDTH && iy >= 0 && iy < IMAGE_HEIGHT) {
                    int input_idx  = (iy * IMAGE_WIDTH + ix) * CHANNELS_IN + ci;
                    data_t in_val  = input[input_idx];
                    int weight_idx = ((gc * CHANNELS_IN + ci) * KERNEL_SIZE + kh) * KERNEL_SIZE + kw;
                    weight_t w_val = weights[weight_idx];
                    sum += (acc_t)in_val * (acc_t)w_val;
                }
            }
        }
    }

    // Bias then ReLU activation
    sum += (acc_t)biases[gc];
    if (sum < 0) sum = 0;

    int output_idx = (gy * IMAGE_WIDTH + gx) * CHANNELS_OUT + gc;
    output[output_idx] = sum;
}
// 3x3 convolution kernel - single work-item version (FPGA pipeline).
// Streams the image through a KERNEL_SIZE-row line buffer; computes
// "valid" convolution outputs (no padding) with stride 1.
__attribute__((max_global_work_dim(0)))
__kernel void convolution_3x3_single(
    __global const data_t   *restrict input,
    __global const weight_t *restrict weights,
    __global const data_t   *restrict biases,
    __global acc_t          *restrict output,
    __constant int          *restrict params
)
{
    // Line buffer holding the most recent KERNEL_SIZE image rows.
    data_t line_buffer[KERNEL_SIZE][IMAGE_WIDTH][CHANNELS_IN];

    // Local weight cache.
    weight_t weight_buffer[CHANNELS_OUT][CHANNELS_IN][KERNEL_SIZE][KERNEL_SIZE];

    // BUGFIX: the weight cache was never filled; preload it from global
    // memory before streaming the image.
    for (int co = 0; co < CHANNELS_OUT; co++) {
        for (int ci = 0; ci < CHANNELS_IN; ci++) {
            for (int kh = 0; kh < KERNEL_SIZE; kh++) {
                for (int kw = 0; kw < KERNEL_SIZE; kw++) {
                    int widx = ((co * CHANNELS_IN + ci) * KERNEL_SIZE + kh) * KERNEL_SIZE + kw;
                    weight_buffer[co][ci][kh][kw] = weights[widx];
                }
            }
        }
    }

    // Pipelined streaming loop over all pixels.
    #pragma ivdep array(line_buffer)
    for (int y = 0; y < IMAGE_HEIGHT; y++) {
        for (int x = 0; x < IMAGE_WIDTH; x++) {
            // Shift the new pixel (all channels) into the line buffer.
            #pragma unroll
            for (int c = 0; c < CHANNELS_IN; c++) {
                int idx = (y * IMAGE_WIDTH + x) * CHANNELS_IN + c;
                line_buffer[y % KERNEL_SIZE][x][c] = input[idx];
            }
            // Convolve once a full KERNEL_SIZE x KERNEL_SIZE window is
            // available.  BUGFIX: the x >= KERNEL_SIZE-1 condition must
            // guard the computation itself, otherwise line_buffer is
            // read at a negative x - kw index for the first columns.
            if (y >= KERNEL_SIZE - 1 && x >= KERNEL_SIZE - 1) {
                for (int co = 0; co < CHANNELS_OUT; co++) {
                    acc_t sum = 0;
                    #pragma unroll
                    for (int ci = 0; ci < CHANNELS_IN; ci++) {
                        #pragma unroll
                        for (int kh = 0; kh < KERNEL_SIZE; kh++) {
                            #pragma unroll
                            for (int kw = 0; kw < KERNEL_SIZE; kw++) {
                                data_t in_val  = line_buffer[(y - kh) % KERNEL_SIZE][x - kw][ci];
                                weight_t w_val = weight_buffer[co][ci][kh][kw];
                                sum += (acc_t)in_val * (acc_t)w_val;
                            }
                        }
                    }
                    // Bias then ReLU
                    sum += (acc_t)biases[co];
                    if (sum < 0) sum = 0;
                    int out_idx = ((y - KERNEL_SIZE + 1) * IMAGE_WIDTH + (x - KERNEL_SIZE + 1)) * CHANNELS_OUT + co;
                    output[out_idx] = sum;
                }
            }
        }
    }
}
2.2 主机代码模板
// ============================================================
// File: dnn_host.cpp
// Function: OpenCL主机代码框架 (参考8.2.2节开发流程)
// ============================================================
#include<CL/cl.h>
#include<iostream>
#include<vector>
#include<fstream>
#include<chrono>
classDNNOpenCLAccelerator {
private:
cl_platform_id platform;
cl_device_id device;
cl_context context;
cl_command_queue queue;
cl_program program;
// 内核对象
cl_kernel kernel_conv;
cl_kernel kernel_pool;
cl_kernel kernel_fc;
// 缓冲区
cl_mem buf_input;
cl_mem buf_weights;
cl_mem buf_biases;
cl_mem buf_output;
cl_mem buf_params;
public:
// 构造函数
DNNOpenCLAccelerator() {
initialize_opencl();
build_program();
create_buffers();
create_kernels();
}
// 析构函数
~DNNOpenCLAccelerator() {
cleanup();
}
private:
// 1. 初始化OpenCL环境
voidinitialize_opencl(){
cl_int err;
// 获取平台
err = clGetPlatformIDs(1, &platform, NULL);
check_error(err, "clGetPlatformIDs");
// 获取设备 (优先选择FPGA)
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ACCELERATOR, 1, &device, NULL);
if (err != CL_SUCCESS) {
std::cout << "No accelerator found, trying CPU..." << std::endl;
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL);
}
check_error(err, "clGetDeviceIDs");
// 创建设备上下文
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
check_error(err, "clCreateContext");
// 创建命令队列
queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
check_error(err, "clCreateCommandQueue");
}
// 2. 构建程序
voidbuild_program(){
cl_int err;
// 读取内核文件
std::ifstream kernel_file("dnn_kernels.cl");
std::string kernel_code(
(std::istreambuf_iterator<char>(kernel_file)),
std::istreambuf_iterator<char>()
);
kernel_file.close();
constchar* kernel_str = kernel_code.c_str();
size_t kernel_len = kernel_code.length();
// 创建程序对象
program = clCreateProgramWithSource(context, 1, &kernel_str, &kernel_len, &err);
check_error(err, "clCreateProgramWithSource");
// 构建程序
err = clBuildProgram(program, 1, &device, "-I ./include", NULL, NULL);
if (err != CL_SUCCESS) {
// 获取构建日志
char build_log[4096];
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
sizeof(build_log), build_log, NULL);
std::cerr << "Build failed:\n" << build_log << std::endl;
exit(1);
}
}
// 3. 创建缓冲区
voidcreate_buffers(){
cl_int err;
size_t input_size = 224 * 224 * 3 * sizeof(short);
size_t weight_size = 64 * 3 * 3 * 3 * sizeof(short);
size_t bias_size = 64 * sizeof(short);
size_t output_size = 224 * 224 * 64 * sizeof(int);
size_t param_size = 4 * sizeof(int);
// 创建缓冲区 (使用CL_MEM_USE_HOST_PTR优化传输)
buf_input = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
input_size, NULL, &err);
check_error(err, "clCreateBuffer input");
buf_weights = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
weight_size, NULL, &err);
check_error(err, "clCreateBuffer weights");
buf_biases = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
bias_size, NULL, &err);
check_error(err, "clCreateBuffer biases");
buf_output = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
output_size, NULL, &err);
check_error(err, "clCreateBuffer output");
buf_params = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
param_size, NULL, &err);
check_error(err, "clCreateBuffer params");
}
// 4. 创建内核
voidcreate_kernels(){
cl_int err;
kernel_conv = clCreateKernel(program, "convolution_3x3_ndrange", &err);
check_error(err, "clCreateKernel conv");
kernel_pool = clCreateKernel(program, "pooling_2x2", &err);
check_error(err, "clCreateKernel pool");
kernel_fc = clCreateKernel(program, "fully_connected", &err);
check_error(err, "clCreateKernel fc");
}
public:
// 执行卷积层
floatrun_convolution(short* input_data, short* weight_data,
short* bias_data, int* output_data){
cl_int err;
cl_event events[3];
// 设置卷积参数
int params[4] = {1, 1, 224, 224}; // stride, padding, width, height
// 设置内核参数
err = clSetKernelArg(kernel_conv, 0, sizeof(cl_mem), &buf_input);
err |= clSetKernelArg(kernel_conv, 1, sizeof(cl_mem), &buf_weights);
err |= clSetKernelArg(kernel_conv, 2, sizeof(cl_mem), &buf_biases);
err |= clSetKernelArg(kernel_conv, 3, sizeof(cl_mem), &buf_output);
err |= clSetKernelArg(kernel_conv, 4, sizeof(cl_mem), &buf_params);
check_error(err, "clSetKernelArg");
// 异步数据传输 (输入)
err = clEnqueueWriteBuffer(queue, buf_input, CL_FALSE, 0,
224*224*3*sizeof(short), input_data,
0, NULL, &events[0](@ref);
err |= clEnqueueWriteBuffer(queue, buf_weights, CL_FALSE, 0,
64*3*3*3*sizeof(short), weight_data,
0, NULL, &events[1](@ref);
err |= clEnqueueWriteBuffer(queue, buf_params, CL_FALSE, 0,
4*sizeof(int), params,
0, NULL, &events[2](@ref);
check_error(err, "clEnqueueWriteBuffer");
// 等待数据传输完成
clWaitForEvents(3, events);
// 设置工作组大小
size_t global_size[3] = {224, 224, 64};
size_t local_size[3] = {8, 8, 4}; // 根据FPGA资源调整
// 执行内核
auto start = std::chrono::high_resolution_clock::now();
err = clEnqueueNDRangeKernel(queue, kernel_conv, 3, NULL,
global_size, local_size,
0, NULL, NULL);
check_error(err, "clEnqueueNDRangeKernel");
clFinish(queue);
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<float> duration = end - start;
// 读取结果
err = clEnqueueReadBuffer(queue, buf_output, CL_TRUE, 0,
224*224*64*sizeof(int), output_data,
0, NULL, NULL);
check_error(err, "clEnqueueReadBuffer");
return duration.count();
}
private:
voidcheck_error(cl_int err, constchar* operation) {
if (err != CL_SUCCESS) {
std::cerr << "Error during " << operation << ": " << err << std::endl;
exit(1);
}
}
voidcleanup() {
clReleaseMemObject(buf_input);
clReleaseMemObject(buf_weights);
clReleaseMemObject(buf_biases);
clReleaseMemObject(buf_output);
clReleaseMemObject(buf_params);
clReleaseKernel(kernel_conv);
clReleaseKernel(kernel_pool);
clReleaseKernel(kernel_fc);
clReleaseProgram(program);
clReleaseCommandQueue(queue);
clReleaseContext(context);
}
};
// 使用示例
// Usage example: run one convolution layer with zero-filled data.
int main() {
    DNNOpenCLAccelerator accelerator;
    // BUGFIX: the original stack arrays were far too small (e.g.
    // short input[3] for a 224x224x3 image), causing the accelerator
    // to read/write far out of bounds.  Use correctly sized vectors.
    std::vector<short> input(224 * 224 * 3, 0);
    std::vector<short> weights(64 * 3 * 3 * 3, 0);
    std::vector<short> biases(64, 0);
    std::vector<int>   output(224 * 224 * 64, 0);
    // Execute the convolution and report the elapsed time.
    float time = accelerator.run_convolution(
        input.data(), weights.data(), biases.data(), output.data());
    std::cout << "Convolution completed in " << time * 1000 << " ms" << std::endl;
    return 0;
}
三、 仿真测试框架模板
3.1 Verilog Testbench模板
// ============================================================
// File: tb_dnn_accelerator.v
// Testbench: 深度神经网络加速器测试平台
// 参考: 5.2节视频图像处理仿真测试系统
// ============================================================
`timescale 1ns/1ps
//
// Testbench for the DNN accelerator top level: register access, single
// image inference against a golden reference, stream/boundary tests and
// a simple performance measurement.
//
module tb_dnn_accelerator;
    // ---------- Test parameters ----------
    parameter CLK_PERIOD  = 10; // 100 MHz clock
    parameter IMAGE_FILE  = "test_image.dat";
    parameter WEIGHT_FILE = "weights.dat";
    parameter RESULT_FILE = "fpga_result.dat";
    parameter GOLDEN_FILE = "golden_result.dat";

    // ---------- Clock / reset ----------
    reg clk;
    reg rst_n;

    // ---------- DUT interface ----------
    // AXI-Lite
    reg  [31:0] axi_awaddr;
    reg         axi_awvalid;
    wire        axi_awready;
    reg  [63:0] axi_wdata;
    reg  [7:0]  axi_wstrb;
    reg         axi_wvalid;
    wire        axi_wready;
    wire [1:0]  axi_bresp;
    wire        axi_bvalid;
    reg         axi_bready;
    reg  [31:0] axi_araddr;
    reg         axi_arvalid;
    wire        axi_arready;
    wire [63:0] axi_rdata;
    wire [1:0]  axi_rresp;
    wire        axi_rvalid;
    reg         axi_rready;
    // AXI-Stream
    reg         s_axis_tvalid;
    reg  [63:0] s_axis_tdata;
    reg         s_axis_tlast;
    wire        s_axis_tready;
    wire        m_axis_tvalid;
    wire [63:0] m_axis_tdata;
    wire        m_axis_tlast;
    reg         m_axis_tready;
    // Status
    wire        busy;
    wire        error;
    wire [3:0]  status;

    // ---------- Test bookkeeping ----------
    integer test_pass;
    integer test_fail;
    integer total_tests;

    // ---------- Test data ----------
    reg [7:0]  test_image    [0:224*224*3-1];
    reg [7:0]  test_weights  [0:64*3*3*3-1];
    reg [31:0] golden_result [0:224*224*64-1];
    reg [31:0] fpga_result   [0:224*224*64-1];

    // ---------- DUT instance ----------
    dnn_accelerator_top dut (
        .clk_i  (clk),
        .rst_n_i(rst_n),
        // AXI-Lite
        .axi_awaddr_i (axi_awaddr),
        .axi_awvalid_i(axi_awvalid),
        .axi_awready_o(axi_awready),
        .axi_wdata_i  (axi_wdata),
        .axi_wstrb_i  (axi_wstrb),
        .axi_wvalid_i (axi_wvalid),
        .axi_wready_o (axi_wready),
        .axi_bresp_o  (axi_bresp),
        .axi_bvalid_o (axi_bvalid),
        .axi_bready_i (axi_bready),
        .axi_araddr_i (axi_araddr),
        .axi_arvalid_i(axi_arvalid),
        .axi_arready_o(axi_arready),
        .axi_rdata_o  (axi_rdata),
        .axi_rresp_o  (axi_rresp),
        .axi_rvalid_o (axi_rvalid),
        .axi_rready_i (axi_rready),
        // AXI-Stream
        .s_axis_tvalid_i(s_axis_tvalid),
        .s_axis_tdata_i (s_axis_tdata),
        .s_axis_tlast_i (s_axis_tlast),
        .s_axis_tready_o(s_axis_tready),
        .m_axis_tvalid_o(m_axis_tvalid),
        .m_axis_tdata_o (m_axis_tdata),
        .m_axis_tlast_o (m_axis_tlast),
        .m_axis_tready_i(m_axis_tready),
        // Status
        .busy_o  (busy),
        .error_o (error),
        .status_o(status)
    );

    // ---------- Clock generation ----------
    initial begin
        clk = 0;
        forever #(CLK_PERIOD/2) clk = ~clk;
    end

    // ---------- Reset generation ----------
    initial begin
        rst_n = 0;
        #100;
        rst_n = 1;
    end

    // ---------- Main test sequence ----------
    initial begin
        test_pass   = 0;
        test_fail   = 0;
        total_tests = 0;
        $display("========================================");
        $display("DNN Accelerator Testbench Starting...");
        $display("Time: %t", $time);
        $display("========================================");
        // Initialise all driven signals
        init_signals();
        // Wait for reset deassertion
        wait(rst_n == 1);
        #100;
        // Load stimulus and reference data
        load_test_data();
        // Test case 1: register read/write
        test_reg_access();
        // Test case 2: single image inference
        test_single_image();
        // Test case 3: continuous image stream
        test_image_stream();
        // Test case 4: boundary conditions
        test_boundary_conditions();
        // Test case 5: performance
        test_performance();
        // Summary
        generate_report();
        // Done
        #1000;
        $finish;
    end

    // ---------- Tasks ----------
    // Drive all inputs to their idle values.
    task init_signals;
        begin
            axi_awaddr    = 0;
            axi_awvalid   = 0;
            axi_wdata     = 0;
            axi_wstrb     = 0;
            axi_wvalid    = 0;
            axi_bready    = 0;
            axi_araddr    = 0;
            axi_arvalid   = 0;
            axi_rready    = 0;
            s_axis_tvalid = 0;
            s_axis_tdata  = 0;
            s_axis_tlast  = 0;
            m_axis_tready = 0;
        end
    endtask

    // Load image, weights and golden reference from hex text files.
    // NOTE: $fscanf with "%h" reads text, so the files are opened in
    // text mode ("r"), not binary ("rb").
    task load_test_data;
        integer i, file;
        begin
            // Test image
            file = $fopen(IMAGE_FILE, "r");
            if (file == 0) begin
                $display("ERROR: Cannot open image file %s", IMAGE_FILE);
                $finish;
            end
            for (i = 0; i < 224*224*3; i = i + 1) begin
                $fscanf(file, "%h", test_image[i]);
            end
            $fclose(file);
            // Weights
            file = $fopen(WEIGHT_FILE, "r");
            if (file == 0) begin
                $display("ERROR: Cannot open weight file %s", WEIGHT_FILE);
                $finish;
            end
            for (i = 0; i < 64*3*3*3; i = i + 1) begin
                $fscanf(file, "%h", test_weights[i]);
            end
            $fclose(file);
            // Golden reference
            file = $fopen(GOLDEN_FILE, "r");
            if (file == 0) begin
                $display("ERROR: Cannot open golden file %s", GOLDEN_FILE);
                $finish;
            end
            for (i = 0; i < 224*224*64; i = i + 1) begin
                $fscanf(file, "%h", golden_result[i]);
            end
            $fclose(file);
            $display("Test data loaded successfully");
        end
    endtask

    // Single AXI-Lite write transaction.
    task axi_write;
        input [31:0] addr;
        input [63:0] data;
        begin
            @(posedge clk);
            axi_awaddr  = addr;
            axi_awvalid = 1;
            axi_wdata   = data;
            axi_wvalid  = 1;
            axi_wstrb   = 8'hFF;
            wait(axi_awready && axi_wready);
            @(posedge clk);
            axi_awvalid = 0;
            axi_wvalid  = 0;
            axi_bready  = 1;
            wait(axi_bvalid);
            @(posedge clk);
            axi_bready = 0;
            if (axi_bresp != 2'b00) begin
                $display("ERROR: Write response error at address %h", addr);
                test_fail = test_fail + 1;
            end
            total_tests = total_tests + 1;
        end
    endtask

    // Single AXI-Lite read transaction.
    task axi_read;
        input  [31:0] addr;
        output [63:0] data;
        begin
            @(posedge clk);
            axi_araddr  = addr;
            axi_arvalid = 1;
            wait(axi_arready);
            @(posedge clk);
            axi_arvalid = 0;
            axi_rready  = 1;
            wait(axi_rvalid);
            data = axi_rdata;
            @(posedge clk);
            axi_rready = 0;
            if (axi_rresp != 2'b00) begin
                $display("ERROR: Read response error at address %h", addr);
                test_fail = test_fail + 1;
            end
            total_tests = total_tests + 1;
        end
    endtask

    // Stream the whole test image into the DUT, 8 bytes per beat.
    task send_image_data;
        integer i;
        begin
            $display("Starting image data transmission...");
            for (i = 0; i < 224*224*3; i = i + 8) begin
                @(posedge clk);
                s_axis_tvalid = 1;
                s_axis_tdata  = {
                    test_image[i+7], test_image[i+6], test_image[i+5], test_image[i+4],
                    test_image[i+3], test_image[i+2], test_image[i+1], test_image[i]
                };
                if (i == 224*224*3 - 8) begin
                    s_axis_tlast = 1;
                end
                wait(s_axis_tready);
                @(posedge clk);
            end
            s_axis_tvalid = 0;
            s_axis_tlast  = 0;
            $display("Image data transmission completed");
        end
    endtask

    // Collect result words from the output stream (two 32-bit results
    // per 64-bit beat).  Uses a done flag instead of SV-only `break`.
    task receive_results;
        integer i;
        reg [63:0] temp_data;
        reg        done;
        begin
            $display("Waiting for results...");
            m_axis_tready = 1;
            done = 0;
            for (i = 0; (i < 224*224*64) && !done; i = i + 2) begin
                wait(m_axis_tvalid);
                @(posedge clk);
                temp_data      = m_axis_tdata;
                fpga_result[i] = temp_data[31:0];
                if (i+1 < 224*224*64) begin
                    fpga_result[i+1] = temp_data[63:32];
                end
                if (m_axis_tlast && i >= 224*224*64 - 2) begin
                    done = 1;
                end
            end
            m_axis_tready = 0;
            $display("Results received");
        end
    endtask

    // Compare FPGA results with the golden reference (0.1% tolerance).
    task verify_results;
        integer i;
        real abs_error, rel_error;
        integer error_count;
        begin
            error_count = 0;
            $display("Verifying results against golden reference...");
            for (i = 0; i < 224*224*64; i = i + 1) begin
                abs_error = $itor(fpga_result[i]) - $itor(golden_result[i]);
                if (abs_error < 0) abs_error = -abs_error;
                // +1.0 avoids division by zero for zero-valued outputs
                rel_error = abs_error / ($itor(golden_result[i]) + 1.0);
                if (rel_error > 0.001) begin // allow 0.1% relative error
                    error_count = error_count + 1;
                    if (error_count <= 10) begin // report first 10 only
                        $display("ERROR at pixel %d: FPGA=%h, Golden=%h, RelErr=%.4f%%",
                                 i, fpga_result[i], golden_result[i], rel_error*100);
                    end
                end
            end
            if (error_count == 0) begin
                $display("SUCCESS: All results match golden reference!");
                test_pass = test_pass + 1;
            end else begin
                $display("FAILURE: %d mismatches found", error_count);
                test_fail = test_fail + 1;
            end
            total_tests = total_tests + 1;
        end
    endtask

    // ---------- Test cases ----------
    // Test case 1: register read/write.
    task test_reg_access;
        reg [63:0] read_data;
        begin
            $display("\n=== Test Case 1: Register Access ===");
            // Control register: start bit
            axi_write(32'h0000, 64'h0000_0001);
            // Status register
            axi_read(32'h0004, read_data);
            if (read_data[0] == 1'b0) begin
                $display("Status register read successful");
                test_pass = test_pass + 1;
            end else begin
                $display("ERROR: Status register incorrect");
                test_fail = test_fail + 1;
            end
            total_tests = total_tests + 1;
        end
    endtask

    // Test case 2: single image inference.
    task test_single_image;
        begin
            $display("\n=== Test Case 2: Single Image Inference ===");
            // Configure the accelerator
            axi_write(32'h0010, 64'h0000_00E0); // image width  = 224
            axi_write(32'h0014, 64'h0000_00E0); // image height = 224
            axi_write(32'h0018, 64'h0000_0003); // input channels = 3
            // Start
            axi_write(32'h0000, 64'h0000_0001);
            // Stream in and collect results concurrently
            fork
                send_image_data();
                receive_results();
            join
            // Wait for completion
            wait(busy == 0);
            #100;
            // Check and save
            verify_results();
            save_results_to_file();
        end
    endtask

    // Dump the FPGA results to a hex text file.
    task save_results_to_file;
        integer file, i;
        begin
            file = $fopen(RESULT_FILE, "w");
            for (i = 0; i < 224*224*64; i = i + 1) begin
                $fwrite(file, "%h\n", fpga_result[i]);
            end
            $fclose(file);
            $display("Results saved to %s", RESULT_FILE);
        end
    endtask

    // Test case 3: repeated back-to-back image runs.
    task test_image_stream;
        integer iter;
        begin
            $display("\n=== Test Case 3: Image Stream Test ===");
            for (iter = 0; iter < 10; iter = iter + 1) begin
                $display("Iteration %d/%d", iter+1, 10);
                axi_write(32'h0000, 64'h0000_0001);
                fork
                    send_image_data();
                    receive_results();
                join
                wait(busy == 0);
                #10;
                if (error == 0) begin
                    $display("  Iteration passed");
                    test_pass = test_pass + 1;
                end else begin
                    $display("  ERROR: Error flag set");
                    test_fail = test_fail + 1;
                end
                total_tests = total_tests + 1;
            end
        end
    endtask

    // Test case 4: boundary conditions (minimum image size).
    task test_boundary_conditions;
        begin
            $display("\n=== Test Case 4: Boundary Conditions ===");
            // Minimum size
            axi_write(32'h0010, 64'h0000_0001);
            axi_write(32'h0014, 64'h0000_0001);
            axi_write(32'h0000, 64'h0000_0001);
            #100;
            if (status[0] == 1'b1) begin
                $display("Minimum size test passed");
                test_pass = test_pass + 1;
            end else begin
                $display("Minimum size test failed");
                test_fail = test_fail + 1;
            end
            total_tests = total_tests + 1;
            // Restore the normal size
            axi_write(32'h0010, 64'h0000_00E0);
            axi_write(32'h0014, 64'h0000_00E0);
        end
    endtask

    // Test case 5: latency / throughput measurement.
    task test_performance;
        integer start_time, end_time;
        real throughput, latency;
        begin
            $display("\n=== Test Case 5: Performance Test ===");
            start_time = $time;
            axi_write(32'h0000, 64'h0000_0001);
            fork
                send_image_data();
                receive_results();
            join
            end_time = $time;
            latency    = (end_time - start_time) / 1000.0;       // ns -> us
            throughput = (224.0 * 224.0 * 64.0) / latency;       // MAC/us
            $display("Latency: %.2f us", latency);
            $display("Throughput: %.2f MAC/us", throughput);
            $display("Equivalent FPS: %.2f", 1e6 / latency);
            if (throughput > 1000.0) begin // target: 1000 MAC/us
                $display("Performance target achieved");
                test_pass = test_pass + 1;
            end else begin
                $display("Performance below target");
                test_fail = test_fail + 1;
            end
            total_tests = total_tests + 1;
        end
    endtask

    // Final pass/fail summary.
    task generate_report;
        begin
            $display("\n========================================");
            $display("TEST REPORT");
            $display("========================================");
            $display("Total Tests: %0d", total_tests);
            $display("Passed:      %0d", test_pass);
            $display("Failed:      %0d", test_fail);
            $display("Pass Rate:   %.1f%%", (test_pass*100.0)/total_tests);
            $display("========================================");
            if (test_fail == 0) begin
                $display("ALL TESTS PASSED!");
            end else begin
                $display("SOME TESTS FAILED!");
            end
        end
    endtask

    // ---------- Waveform dump ----------
    initial begin
        $dumpfile("dnn_accelerator.vcd");
        $dumpvars(0, tb_dnn_accelerator);
        // Optional: dump only the DUT to shrink the file
        // $dumpvars(1, dut);
    end

    // ---------- Timeout guard ----------
    initial begin
        #5000000; // 5 ms timeout
        $display("TIMEOUT: Simulation took too long");
        $finish;
    end
endmodule
3.2 Makefile自动化脚本
# ============================================================
# File: Makefile
# Function: FPGA仿真自动化脚本
# 参考: 6.3.6节自动化仿真平台
# ============================================================
# Tool settings
VLOG    = vlog
VSIM    = vsim
QUARTUS = quartus
VIVADO  = vivado

# Directory layout
RTL_DIR    = ./rtl
TB_DIR     = ./tb
SIM_DIR    = ./sim
SYNTH_DIR  = ./synth
RESULT_DIR = ./results

# Source file lists
RTL_SOURCES = \
	$(RTL_DIR)/dnn_accelerator_top.v \
	$(RTL_DIR)/conv_3x3_pe.v \
	$(RTL_DIR)/pooling_unit.v \
	$(RTL_DIR)/fc_unit.v \
	$(RTL_DIR)/axi_interfaces.v \
	$(RTL_DIR)/memory_controller.v

TB_SOURCES = \
	$(TB_DIR)/tb_dnn_accelerator.v \
	$(TB_DIR)/test_data_gen.v

# Default target
all: compile simulate synthesize

# Compile RTL and testbench
compile:
	@echo "Compiling RTL and Testbench..."
	$(VLOG) -work work $(RTL_SOURCES) $(TB_SOURCES)
	@echo "Compilation complete"

# Run the simulation, then check the transcript for the pass banner.
# (The grep must stay inside this recipe; a column-0 comment would
# split the recipe and orphan the @if lines.)
simulate: compile
	@echo "Starting simulation..."
	$(VSIM) -c -do "run -all; quit" work.tb_dnn_accelerator
	@echo "Simulation complete"
	@if grep -q "ALL TESTS PASSED" transcript; then \
		echo "Simulation PASSED"; \
	else \
		echo "Simulation FAILED"; \
		exit 1; \
	fi

# Run synthesis, then check the timing report.
synthesize:
	@echo "Running synthesis..."
	cd $(SYNTH_DIR) && $(QUARTUS) --flow compile dnn_accelerator.qpf
	@echo "Synthesis complete"
	@if grep -q "Timing requirements were met" $(SYNTH_DIR)/output_files/dnn_accelerator.sta.rpt; then \
		echo "Timing MET"; \
	else \
		echo "Timing VIOLATED"; \
		exit 1; \
	fi

# Remove generated artifacts
clean:
	rm -rf work transcript *.vcd *.wlf
	rm -rf $(SIM_DIR)/*
	rm -rf $(RESULT_DIR)/*
	@echo "Cleanup complete"

# Generate a coverage report
coverage: compile
	$(VSIM) -c -do "coverage save -onexit coverage.ucdb; run -all; quit" work.tb_dnn_accelerator
	$(VSIM) -viewcov coverage.ucdb
	@echo "Coverage report generated"

# Run every test data file in tests/ through the simulator
batch_test:
	@echo "Running batch tests..."
	@for test in tests/*.dat; do \
		echo "Running test $$test"; \
		cp $$test test_input.dat; \
		$(VSIM) -c -do "run -all; quit" work.tb_dnn_accelerator; \
		if grep -q "ALL TESTS PASSED" transcript; then \
			echo "$$test: PASSED"; \
		else \
			echo "$$test: FAILED"; \
		fi; \
	done

# Performance profiling
profile:
	@echo "Generating performance profile..."
	$(VSIM) -c -do "profile on; run -all; profile report -file profile.txt; quit" work.tb_dnn_accelerator
	@head -20 profile.txt
	@echo "Profile saved to profile.txt"

.PHONY: all compile simulate synthesize clean coverage batch_test profile
四、 使用说明
4.1 快速开始步骤
环境搭建:
# 安装仿真工具
source /opt/intelFPGA/20.1/modelsim_ase/init_modelsim.sh
# 创建项目结构
mkdir -p {rtl,tb,sim,synth,results,tests}
文件放置:
项目根目录/
├── rtl/                    # RTL源代码
│   ├── dnn_accelerator_top.v
│   ├── conv_3x3_pe.v
│   └── ...
├── tb/                     # 测试平台
│   ├── tb_dnn_accelerator.v
│   └── test_data_gen.v
├── sim/                    # 仿真文件
├── synth/                  # 综合工程
├── tests/                  # 测试用例
└── Makefile                # 自动化脚本
运行测试:
# 编译并运行仿真
make simulate
# 生成覆盖率报告
make coverage
# 批量测试
make batch_test
4.2 模板定制要点
修改神经网络参数:
在顶层模块中调整 INPUT_WIDTH, INPUT_HEIGHT, INPUT_CHANNEL 等参数
在OpenCL内核中修改 #define 常量
适配不同FPGA平台:
Intel FPGA:使用Quartus和OpenCL for FPGA
Xilinx FPGA:使用Vivado和Vitis HLS
扩展测试用例:
在Testbench中添加新的测试任务
在tests/目录中添加新的测试数据文件
4.3 最佳实践建议
仿真优化:
对小分辨率图像进行仿真验证
适当提高timescale加速仿真
使用脚本自动化仿真流程
代码风格:
所有输出信号必须寄存器输出
使用参数化设计提高可重用性
添加充分的注释和文档
验证策略:
先做单元测试,再做集成测试
对比FPGA结果与软件黄金模型
覆盖边界条件和异常情况