FPGA实践教程(八)PS与PL共享DDR

背景:很多时候需要PS与PL共享DDR作为global memory,例如卷积之中,PS将weight in与feature写入DDR,然后PL调用DDR进行运算,再将结果写入DDR进行下一次迭代。

目的:1.  PS与PL共享DDR,既能读也能写,并且像卷积一样需要三个指针。2.  在IPcore中设置状态变量,使PS能够查看IPcore的运行位置。3. 运用BRAM实现一定的数据搬运。

一、IPcore编写
1.1 一种错误的接口
// Interface variant that the article rejects: it exposes two separate m_axi
// pointer arguments (write_ptr and read_ptr) instead of a single one.
// The listing stops after the interface pragmas; no function body is shown.
int share_dram_core(int write_nums,int read_nums,
volatile float * write_ptr,volatile float *read_ptr,
int location_idx,int write_loop_idx,int read_loop_idx,
int read_sum){
// Both pointers are declared as AXI master ports whose base address is
// supplied through a slave register (offset=slave).
#pragma HLS INTERFACE m_axi depth=4096 port=write_ptr offset=slave
#pragma HLS INTERFACE m_axi depth=4096 port=read_ptr offset=slave
// The return value and every scalar argument are placed on the AXI4-Lite
// slave interface.
#pragma HLS INTERFACE s_axilite port=return
#pragma HLS INTERFACE s_axilite port=write_nums
#pragma HLS INTERFACE s_axilite port=read_nums
#pragma HLS INTERFACE s_axilite port=location_idx
#pragma HLS INTERFACE s_axilite port=write_loop_idx
#pragma HLS INTERFACE s_axilite port=read_loop_idx
#pragma HLS INTERFACE s_axilite port=read_sum

DRAM上不能有两个m_axi类型的指针,否则可能会遇到重叠等问题。

1.2 IPcore代码
/*
 * share_dram_core - HLS top function that first reads from and then writes to
 * one shared DDR buffer through a single m_axi pointer.
 *
 * Reads data_ptr[0 .. read_nums-1], accumulating the values into read_sum,
 * then stores the values 0 .. write_nums-1 into
 * data_ptr[read_nums .. read_nums+write_nums-1].
 *
 * write_nums:     number of floats written after the read region.
 * read_nums:      number of floats read from the start of the buffer.
 * data_ptr:       base of the shared buffer; it must hold at least
 *                 read_nums + write_nums floats.
 * location_idx:   progress marker: 0 at start, 1 after the read loop,
 *                 2 after the write loop.
 * write_loop_idx: number of write iterations performed.
 * read_loop_idx:  number of read iterations performed.
 * read_sum:       sum of the values read, accumulated into an int.
 *
 * Returns 1 once both loops have completed.
 *
 * NOTE(review): location_idx, write_loop_idx, read_loop_idx and read_sum are
 * passed by value, so in plain C none of the assignments below are visible to
 * the caller. Whether the s_axilite registers ever reflect them in hardware
 * should be confirmed against the HLS documentation; pointer arguments are
 * the usual way to return values.
 */
int share_dram_core(int write_nums,int read_nums,
        volatile float * data_ptr,
        int location_idx,int write_loop_idx,int read_loop_idx,
        int read_sum){
#pragma HLS INTERFACE m_axi depth=4096 port=data_ptr offset=slave
#pragma HLS INTERFACE s_axilite port=return register
#pragma HLS INTERFACE s_axilite port=write_nums register
#pragma HLS INTERFACE s_axilite port=read_nums register
#pragma HLS INTERFACE s_axilite port=location_idx register
#pragma HLS INTERFACE s_axilite port=write_loop_idx register
#pragma HLS INTERFACE s_axilite port=read_loop_idx register
#pragma HLS INTERFACE s_axilite port=read_sum register

    location_idx=0;
    write_loop_idx=0;
    read_loop_idx=0;
    read_sum=0;

    /* Read phase: consume the first read_nums floats of the buffer. */
    for(int read_loc=0;read_loc<read_nums;read_loc++){
        read_sum+=data_ptr[read_loc];
        read_loop_idx++;
    }
    location_idx=1; /* read phase done */

    /* The write region starts directly behind the read region. */
    volatile float *write_ptr=&data_ptr[read_nums];

    /* Write phase: store the element index so the PS side can verify it. */
    for(int write_loc=0;write_loc<write_nums;write_loc++){
        write_ptr[write_loc]=write_loc;
        write_loop_idx++;
    }
    location_idx=2; /* write phase done */

    return 1; /* 1 means the core ran to completion */
}

只要一个指针指向DRAM。

1.3 位置信息
location_idx表示IPcore当前位置,0表示刚开始,1表示完成读操作,2表示完成写操作

read_loop_idx表示当前IPcore读出DRAM的次数

write_loop_idx表示当前IPcore写入DRAM的次数

return 1表示程序运行完成且成功。

1.4 接口
  s_axilite
运用带return的s_axilite接口来设置IPcore的参数,并查询IPcore是否运行完成;同时用它传输IPcore的位置信息。

  m_axi
运用AXI主接口(m_axi),使IPcore能够对DDR进行读写。本文只使用一个m_axi指针。

Depth的设置问题:depth以元素个数为单位(而不是字节),用于C/RTL协同仿真时确定该m_axi端口最多访问多少个元素。这里设为4096,即4096个float(共16384字节)。

二、testBench
2.1 程序编写
#include <stdio.h>
#include <stdlib.h>

/* Prototype of the HLS top function under test (defined with the IP core). */
int share_dram_core(int write_nums,int read_nums,
        volatile float * data_ptr,
        int location_idx,int write_loop_idx,int read_loop_idx,
        int read_sum);

/*
 * C-simulation test bench for share_dram_core.
 *
 * Allocates one buffer of PL_read_nums + PL_write_nums floats, fills the
 * read region with 0 .. PL_read_nums-1, runs the core, and checks that the
 * write region then holds 0 .. PL_write_nums-1.
 *
 * Returns 0 when the core returned 1 and every written value matched,
 * 1 otherwise, so that a failing check is visible to the simulation flow.
 */
int main(){
    int PL_write_nums=50;
    int PL_read_nums=50;

    /* A single allocation backs both regions; the core only gets one pointer. */
    volatile float *PL_read_ptr=(volatile float *)malloc(sizeof(float)*(PL_read_nums+PL_write_nums));
    if(PL_read_ptr==NULL){
        printf("Malloc PL_read_ptr failure \n");
        return 1;
    }
    /* The region written by PL starts right behind the region read by PL. */
    volatile float *PL_write_ptr=&PL_read_ptr[PL_read_nums];

    printf("Initilize SUCCESS!PL_write_num is %d,PL_read_num is %d\n",PL_write_nums,PL_read_nums);
    /* Arguments now follow the order of the labels; %p is the pointer conversion. */
    printf("PL_read_ptr is %p, PL_write_ptr is %p \n",(void *)PL_read_ptr,(void *)PL_write_ptr);

    for(int cur_PL_read_loc=0;cur_PL_read_loc<PL_read_nums;cur_PL_read_loc++){
        PL_read_ptr[cur_PL_read_loc]=cur_PL_read_loc;
    }
    printf("PS write on PL read loc SUCCESS!\n");

    int result=share_dram_core(PL_write_nums,PL_read_nums,
            PL_read_ptr,
            0,0,0,0);

    int error_count=0;
    for(int cur_PL_write_loc=0;cur_PL_write_loc<PL_write_nums;cur_PL_write_loc++){
        if(PL_write_ptr[cur_PL_write_loc]!=cur_PL_write_loc){
            printf("PL write ERROR!loc is %d, prt loc is %p \n",
                    cur_PL_write_loc,(void *)&PL_write_ptr[cur_PL_write_loc]);
            error_count++;
        }
    }
    printf("Check PL write done!\n");
    if(result==1){
        printf("IPcore result SUCCESS!\n");
    }

    free((void *)PL_read_ptr);
    return (error_count==0 && result==1) ? 0 : 1;
}

2.2 PS与PL的交互
PS传出数据很简单,但是PL传出数据不易。所以尽量以PS多输出信息来验证PL的正确性。

更多信息通过一些参数传出来,例如location_idx、write_loop_idx、read_loop_idx、read_sum。
INFO: [SIM 4] CSIM will launch GCC as the compiler.
Compiling ../../../../src/share_dram_HLS_test.cpp in debug mode
Generating csim.exe
Initilize SUCCESS!PL_write_num is 50,PL_read_num is 50
PL_read_ptr is a21748, PL_write_ptr is a21680
PS write on PL read loc SUCCESS!
Check PL write done!
IPcore result SUCCESS!

synthesis,然后export RTL

三、系统搭建与hdf生成
运用已有的样板文件,hello world。加入HLS的IP。搭建系统。

使能GP与HP0,自动连接,create HDL wrapper,生成比特流,export到 local include bitstream

四、SDK
//created by Xing Xiangrui on 2018.12.25
//This is the SDK code to test share DRAM
//Write through PS to DDR
//Run PL : read from DDR to PL and write from PL to DDR
//Then read from DDR to PS

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "xshare_dram_core.h"

XShare_dram_core XShare_dram_core_instance;

/* Reads the four status registers and the return register of the core and prints them. */
static void print_core_status(XShare_dram_core *core)
{
    int core_location_idx=XShare_dram_core_Get_location_idx(core);
    int core_write_loop_idx=XShare_dram_core_Get_write_loop_idx(core);
    int core_read_loop_idx=XShare_dram_core_Get_read_loop_idx(core);
    int core_read_sum=XShare_dram_core_Get_read_sum(core);
    int core_return_value=XShare_dram_core_Get_return(core);

    printf("core_location_idx=%d \n",core_location_idx);
    printf("core_write_loop_idx=%d \n",core_write_loop_idx);
    printf("core_read_loop_idx=%d \n",core_read_loop_idx);
    printf("core_read_sum=%d \n",core_read_sum);
    printf("core_return_value=%d \n",core_return_value);
}

/* Prints `count` floats starting at `values`, one per line with its index. */
static void print_values(volatile float *values, int count)
{
    for(int cur_print_loc=0;cur_print_loc<count;cur_print_loc++){
        printf("location %3d, value %f \n",cur_print_loc,values[cur_print_loc]);
    }
}

/*
 * SDK test program for the shared-DRAM core.
 *
 * Allocates one buffer holding the region the PS writes (read by the core)
 * followed by the region the PS reads (written by the core), programs the
 * core with the buffer address and both sizes, starts it, polls until it
 * reports done, and prints the core status registers and the PS-read region
 * before and after the run.
 *
 * Returns 0 on completion, 1 if the allocation or the driver
 * initialization fails.
 */
int main()
{
    printf("\n --------------program start------------- \n");

    /* read and write sizes, in floats */
    int ps_wirte_size=5; int ps_read_size=5;

    /* Allocate first and bail out on failure, before the pointer is used. */
    volatile float *ps_write_ptr=(volatile float *)malloc((ps_wirte_size+ps_read_size)*sizeof(float));
    if(ps_write_ptr==NULL){
        printf("Malloc ps_write_ptr failure \n");
        return 1;
    }
    /* The PS-read region starts right behind the PS-write region. */
    volatile float *ps_read_ptr=&ps_write_ptr[ps_wirte_size];
    memset((void*)ps_write_ptr,0,ps_wirte_size*sizeof(float));
    memset((void*)ps_read_ptr,0,ps_read_size*sizeof(float));

    printf("Initialize ps_read_ptr and ps_write_ptr SUCCESS!\n");
    printf("ps_read_ptr is %p \n",(void *)ps_read_ptr);
    printf("ps_write_ptr is %p \n",(void *)ps_write_ptr);

    print_values(ps_read_ptr,ps_read_size);

    /* initialize IPcore */
    /* NOTE(review): XPAR_SHARE_DRAM_CORE_0_DEVICE_ID is expected to come from
     * the BSP (normally xparameters.h) - confirm it is reachable through the
     * includes above. */
    int init_status=XShare_dram_core_Initialize(&XShare_dram_core_instance, XPAR_SHARE_DRAM_CORE_0_DEVICE_ID);
    if(init_status!=0){ /* assumes 0 is the driver's success code (XST_SUCCESS) - TODO confirm */
        printf("XShare_dram_core_Initialize failure, status %d \n",init_status);
        free((void *)ps_write_ptr);
        return 1;
    }
    printf("XShare_dram_core_Initialize SUCCESS!\n");

    print_core_status(&XShare_dram_core_instance);

    /* initialize IPcore values: what the PS reads is what the core writes,
     * and vice versa, hence the crossed sizes. */
    XShare_dram_core_Set_write_nums(&XShare_dram_core_instance, ps_read_size);
    XShare_dram_core_Set_read_nums(&XShare_dram_core_instance, ps_wirte_size);
    /* The register takes an integer address; convert the pointer explicitly. */
    XShare_dram_core_Set_data_ptr(&XShare_dram_core_instance, (uintptr_t)ps_write_ptr);
    printf("-------------Core value set SUCCESS! \n");

    print_core_status(&XShare_dram_core_instance);

    /* NOTE(review): no data-cache flush/invalidate is performed around the
     * run. If the buffer is cached on the PS side, the core may not see the
     * zeroed data and the PS may not see the core's writes - confirm whether
     * cache maintenance (e.g. the BSP's xil_cache API) is needed here. */

    /* IPcore start */
    XShare_dram_core_Start(&XShare_dram_core_instance);
    printf("-------------IPCore start SUCCESS! \n");

    print_core_status(&XShare_dram_core_instance);

    /* Busy-wait until the core reports completion. */
    while(!XShare_dram_core_IsDone(&XShare_dram_core_instance)){
        printf("Calculating...\n");
    }
    printf("IsDone done SUCCESS!\n");

    print_core_status(&XShare_dram_core_instance);

    print_values(ps_read_ptr,ps_read_size);

    free((void *)ps_write_ptr);
    printf("-----------Program end SUCCESS!- \n\n");
    return 0;
}

用SDK打开vivado生成的文件夹下的 .sdk文件夹然后加载相应的hdf,生成bsp,创建c程序,hello world。build它。

启动FPGA,program FPGA将比特流烧录进去,然后运行程序。

4.1 用malloc的方式开辟内存
// pointer initialization
// One malloc'ed buffer holds the PS-write region followed by the PS-read region.
ps_write_ptr=(volatile float *)malloc((ps_wirte_size+ps_read_size)*sizeof(float));
//ps_write_ptr= 0x10000000;
// ps_read_ptr points just past the first ps_wirte_size floats of that buffer.
ps_read_ptr=&ps_write_ptr[ps_wirte_size];
// NOTE(review): both NULL checks run only after ps_write_ptr has already been
// used to derive ps_read_ptr, and execution continues even when they fire.
if(ps_write_ptr==NULL)printf("Malloc ps_write_ptr failure \n");
if(ps_read_ptr==NULL)printf("Malloc ps_read_ptr failure \n");
// Zero both regions.
memset((void*)ps_write_ptr,0,ps_wirte_size*sizeof(float));
memset((void*)ps_read_ptr,0,ps_read_size*sizeof(float));

FPGA始终输出0,即IPcore并未有正确的动作。
--------------program start-------------
Initialize ps_read_ptr and ps_write_ptr SUCCESS!
ps_read_ptr is 114764
ps_write_ptr is 114750
location 0, value 0.000000
location 1, value 0.000000
location 2, value 0.000000
location 3, value 0.000000
location 4, value 0.000000
XShare_dram_core_Initialize SUCCESS!
core_location_idx=0
core_write_loop_idx=0
core_read_loop_idx=0
core_read_sum=0
core_return_value=0
-------------Core value set SUCCESS!
core_location_idx=0
core_write_loop_idx=0
core_read_loop_idx=0
core_read_sum=0
core_return_value=0
-------------IPCore start SUCCESS!
core_location_idx=0
core_write_loop_idx=0
core_read_loop_idx=0
core_read_sum=0
core_return_value=0
Calculating...
Calculating...
。。。

IPcore会一直不结束。

4.2 指定指针位置
// Use a fixed address for the buffer instead of calling malloc.
// NOTE(review): an integer constant is assigned to a pointer without a cast -
// confirm this is accepted by the toolchain settings in use.
ps_write_ptr= 0x10000000;
// ps_read_ptr points just past the first ps_wirte_size floats.
ps_read_ptr=&ps_write_ptr[ps_wirte_size];
// These checks and their "Malloc" messages are carried over from the malloc
// variant; no allocation happens in this version.
if(ps_write_ptr==NULL)printf("Malloc ps_write_ptr failure \n");
if(ps_read_ptr==NULL)printf("Malloc ps_read_ptr failure \n");
// Zero both regions.
memset((void*)ps_write_ptr,0,ps_wirte_size*sizeof(float));
memset((void*)ps_read_ptr,0,ps_read_size*sizeof(float));

依然无法用IPcore写入值。

--------------program start-------------
Initialize ps_read_ptr and ps_write_ptr SUCCESS!
ps_read_ptr is 10000014
ps_write_ptr is 10000000
location 0, value 0.000000
location 1, value 0.000000
location 2, value 0.000000
location 3, value 0.000000
location 4, value 0.000000
XShare_dram_core_Initialize SUCCESS!
core_location_idx=0
core_write_loop_idx=0
core_read_loop_idx=0
core_read_sum=0
core_return_value=1
-------------Core value set SUCCESS!
core_location_idx=0
core_write_loop_idx=0
core_read_loop_idx=0
core_read_sum=0
core_return_value=1
-------------IPCore start SUCCESS!
core_location_idx=0
core_write_loop_idx=0
core_read_loop_idx=0
core_read_sum=0
core_return_value=1
IsDone done SUCCESS!
core_location_idx=0
core_write_loop_idx=0
core_read_loop_idx=0
core_read_sum=0
core_return_value=1
location 0, value 0.000000
location 1, value 0.000000
location 2, value 0.000000
location 3, value 0.000000
location 4, value 0.000000
-----------Program end SUCCESS!-

五、SoC
SDK实现过程中会出现地址冲突的问题,难以实现共享DDR,我们用SoC的方法共享DDR。

5.1 交叉编译
MIZ7035交叉编译单片机程序运行  https://blog.csdn.net/weixin_36474809/article/details/86487043

5.2 驱动
驱动由HLS和vivado生成,相应的地址在vivado中可查。在zynqNet基础上更改:

#ifndef SHARED_DRAM_H_9B5B43B5
#define SHARED_DRAM_H_9B5B43B5

// Interface of the user-space driver that maps a fixed physical DRAM window
// (shared between PS and PL) into the process through /dev/mem.

// NOTE(review): the header names of the system includes were lost when this
// listing was extracted. The set below is what the declarations here and the
// accompanying implementation visibly use (printf, NULL, fixed-width integers,
// off_t, open, close, getpagesize, mmap/munmap, err, errno) - confirm against
// the original source.
#include <cstdio>
#include <cstdlib>
#include <cstddef>
#include <stdint.h>

#include <sys/types.h>
#include <sys/mman.h>
#include <unistd.h>
#include <fcntl.h>
#include <err.h>
#include <errno.h>

#include "xfpga_hw.hpp" // Register addresses

typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;

// Location + Size of SHARED DRAM segment:
// - from Vivado Block Designer (Address Editor):
// AXI M memory bus starts at 0x00000000 - 0xFFFFFFFF, SIZE: 4GB
// - from information by Simon Wright:
// top 128MB of 1GB system memory are not OS-managed
// - from "free -m" on Zynq:
// total mem 882MB -> 118MB not OS-managed
// -> place SHARED_DRAM at 896MB (-> max. activations ~100MB)
// -> 896MB = 896*1024*1024 = 0x3800'0000 bytes
// -> 96MB = 96*1024*1024 = 0x600'0000 bytes
//
// NOTE(review): the derivation above arrives at a base of 0x38000000, but the
// constant below is 0x20000000 (512MB). The size matches (0x06000000 = 96MB).
// The base must lie in DRAM that the OS on the target board does not manage -
// confirm that 0x20000000 satisfies this on the board in use.

const off_t SHARED_DRAM_BASE_ADDR = 0x20000000;  // physical base of the shared window
const size_t SHARED_DRAM_MEM_SIZE = 0x06000000;  // length of the mapping in bytes (96MB)
extern int SHARED_DRAM_FD;                 // /dev/mem file descriptor, -1 while closed
extern volatile u32* SHARED_DRAM_PTR;      // virtual address of the mapping

// External Interface
bool SHARED_DRAM_open();               // map the window; true if the file handle is set
bool SHARED_DRAM_close();              // unmap and close; true if the handle was released
volatile u32* SHARED_DRAM_virtual();   // address of the window in this process
volatile u32* SHARED_DRAM_physical();  // physical base address, as a pointer value

// Internal Functions
volatile u32* map_SHARED_DRAM(off_t base_addr);
void release_SHARED_DRAM(volatile u32* axilite);

// unused:
// 32-bit word read + write (other sizes not supported!)
/* void shared_DRAM_write(u32 byte_addr, u32 value);
u32 shared_DRAM_read(u32 byte_addr); */

#endif /* end of include guard: SHARED_DRAM_H_9B5B43B5 */

#include "shared_dram.hpp"

// File descriptor of /dev/mem; -1 means "not open".
int SHARED_DRAM_FD = -1;
// Virtual address of the mapped shared-DRAM window; NULL until it is mapped.
volatile u32* SHARED_DRAM_PTR = NULL;

// Maps the shared DRAM window at SHARED_DRAM_BASE_ADDR and stores the
// resulting address in SHARED_DRAM_PTR.
// Returns false if the window is already open; otherwise returns whether the
// /dev/mem file descriptor is set after mapping.
bool SHARED_DRAM_open() {
  printf("XFPGA Driver: open /dev/mem handle\n");
  // Check that it's not yet open
  if (SHARED_DRAM_FD > -1) {
    printf("SHARED_DRAM already open!\n");
    return false;
  }

  // Memory Map SHARED_DRAM
  SHARED_DRAM_PTR = map_SHARED_DRAM(SHARED_DRAM_BASE_ADDR);
  // %lX matches the unsigned long argument (%X expects unsigned int).
  printf("SHARED_DRAM_PTR=%lX\n", (unsigned long)SHARED_DRAM_PTR);

  // Make sure the file handle is really set
  return (SHARED_DRAM_FD > -1);
}

// Unmaps the shared DRAM window and closes the /dev/mem handle.
// Returns false if nothing is open; otherwise returns whether the file
// descriptor has been reset to -1.
bool SHARED_DRAM_close() {
  printf("XFPGA Driver: close /dev/mem handle\n");
  // Check that memory file is really open
  if (SHARED_DRAM_FD == -1) {
    printf("SHARED_DRAM bus not open!\n");
    return false;
  }
  // Release Memory Region and File handle
  release_SHARED_DRAM(SHARED_DRAM_PTR);
  // The mapping is gone; clear the pointer so SHARED_DRAM_virtual() does not
  // hand out a stale address.
  SHARED_DRAM_PTR = NULL;
  // Make sure file was correctly released
  return (SHARED_DRAM_FD == -1);
}

// Returns the current value of SHARED_DRAM_PTR, i.e. the address of the
// shared window inside this process.
volatile u32* SHARED_DRAM_virtual() {
  volatile u32* const mapped = SHARED_DRAM_PTR;
  return mapped;
}

// Returns SHARED_DRAM_BASE_ADDR converted to a pointer value, i.e. the
// physical base address of the shared window.
volatile u32* SHARED_DRAM_physical() {
  volatile u32* const phys = (volatile u32*) SHARED_DRAM_BASE_ADDR;
  return phys;
}

////////////////////////////////////////////////////
////////////////// Helper Functions ////////////////

// Opens /dev/mem, stores the descriptor in SHARED_DRAM_FD, and maps
// SHARED_DRAM_MEM_SIZE bytes of physical memory starting at base_addr
// (rounded down to a page boundary) as a shared read/write mapping.
// Returns the address of the mapping. If open() or mmap() fails, err()
// reports the failure and terminates the process.
volatile u32* map_SHARED_DRAM(off_t base_addr) {
  // %lX matches the unsigned long argument (%X expects unsigned int).
  printf("XFPGA Driver: map shared DRAM at base address %lX\n", (unsigned long)base_addr);
  // make sure that base addr is aligned to memory pages...
  base_addr &= ~(getpagesize() - 1);

  // Open /dev/mem file (need root privileges or setuid!)
  SHARED_DRAM_FD = open("/dev/mem", O_RDWR);
  if (SHARED_DRAM_FD < 0) err(errno, "could not open /dev/mem. need to be root");

  // Map SHARED_DRAM memory region to pointer.
  // Compare the raw mmap() result against MAP_FAILED before converting it.
  void* mapping = mmap(NULL, SHARED_DRAM_MEM_SIZE, PROT_READ | PROT_WRITE,
                       MAP_SHARED, SHARED_DRAM_FD, base_addr);
  if (mapping == MAP_FAILED) err(errno, "could not map memory for SHARED_DRAM bus");
  return (volatile u32*)mapping;
}

// Unmaps the SHARED_DRAM_MEM_SIZE-byte region at `pointer`, closes the
// /dev/mem descriptor and marks it closed by setting SHARED_DRAM_FD to -1.
// A failing munmap() or close() is reported through err().
void release_SHARED_DRAM(volatile u32* pointer) {
  printf("XFPGA Driver: unmap shared DRAM\n");

  // Drop the memory mapping first.
  const int unmap_status = munmap((void*)pointer, SHARED_DRAM_MEM_SIZE);
  if (unmap_status < 0) {
    err(errno, "could not unmap memory region for SHARED_DRAM bus");
  }

  // Then give back the file handle.
  const int close_status = close(SHARED_DRAM_FD);
  if (close_status < 0) {
    err(errno, "could not release /dev/mem file handle");
  }

  // -1 is the "closed" marker that the open/close functions test for.
  SHARED_DRAM_FD = -1;
}

5.3 运行
交叉编译,挂载,运行

---------------------
作者:邢翔瑞
来源:CSDN
原文:https://blog.csdn.net/weixin_36474809/article/details/85111550

最新文章

最新文章