|
@@ -0,0 +1,305 @@
|
|
|
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <pthread.h>

#include <pcilib.h>
#include <pcilib/kmem.h>
#include <pcilib/bar.h>

#include "common.h"
#include "kernels.h"
#include "gdrapi.h"
#include "ipedma.h"
|
|
|
+
|
|
|
+#define KMEM_DEFAULT_FLAGS (pcilib_kmem_flags_t)(PCILIB_KMEM_FLAG_HARDWARE | PCILIB_KMEM_FLAG_PERSISTENT | PCILIB_KMEM_FLAG_EXCLUSIVE)
|
|
|
+
|
|
|
+#define KMEM_USE_RING PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 1)
|
|
|
+#define KMEM_USE_DEFAULT PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 2)
|
|
|
+#define GPU_NAME_LENGTH 30
|
|
|
+
|
|
|
+#define GPU_PAGE 65536
|
|
|
+#define DATA 0xa2
|
|
|
+
|
|
|
/* Shared stop flag: polled by the four load threads and set to 1 by main()
 * to terminate their busy loops.  Fix: declared volatile so the compiler
 * cannot hoist the read out of the while(!stop_process) loops (a C11/C++11
 * atomic would be stricter, but volatile compiles under both nvcc and cc). */
volatile int stop_process = 0;
|
|
|
+
|
|
|
/* Thread body: keeps a CPU core busy by looping a square matrix
 * multiplication (MATRIX_ROW_SIZE x MATRIX_ROW_SIZE) until main() raises
 * stop_process.  arg is unused; always returns NULL. */
void* cpu_load_compute(void* arg)
{
    matrix I,matx,result;
    I.rows = MATRIX_ROW_SIZE;
    I.columns = I.rows;
    I.stride = I.columns;
    matx.rows = MATRIX_ROW_SIZE;
    matx.columns = matx.rows;
    matx.stride = matx.columns;
    result.rows = MATRIX_ROW_SIZE;
    result.columns = result.rows;
    result.stride = result.columns;
    I.elements = (float*)malloc(I.rows*I.columns*sizeof(float));
    matx.elements = (float*)malloc(matx.rows*matx.columns*sizeof(float));
    result.elements = (float*)malloc(result.rows*result.columns*sizeof(float));
    /* Fix: the original passed these buffers to fill_matrix_random without
     * checking that the allocations succeeded. */
    if( I.elements == NULL || matx.elements == NULL || result.elements == NULL ){
        printf("Cannot allocate matrices for the CPU compute load thread\n");
        free(I.elements);
        free(matx.elements);
        free(result.elements);
        return NULL;
    }

    fill_matrix_random(I);
    fill_matrix_random(matx);

    while(!stop_process){
        mult_matrix(I, matx, result);
    }
    free(I.elements);
    free(result.elements);
    free(matx.elements);
    return NULL;
}
|
|
|
+
|
|
|
/* Thread body: keeps the GPU's compute units busy by repeatedly launching a
 * shared-memory matrix-multiplication kernel until main() raises
 * stop_process.  All three matrices live in device memory only.  arg is
 * unused; always returns NULL. */
void* gpu_load_compute(void* arg)
{
    dim3 blocks_per_grid(MATRIX_ROW_SIZE/BLOCK_SIZE,MATRIX_ROW_SIZE/BLOCK_SIZE);
    dim3 threads_per_block(BLOCK_SIZE,BLOCK_SIZE);
    matrix dev_I,dev_matx,dev_result;

    dev_I.rows = MATRIX_ROW_SIZE;
    dev_I.columns = dev_I.rows;
    dev_I.stride = dev_I.columns;
    dev_matx.rows = MATRIX_ROW_SIZE;
    dev_matx.columns = dev_matx.rows;
    /* Fix: the original wrote "dev_matx.rows = dev_matx.columns;" here (a
     * redundant self-assignment), leaving dev_matx.stride uninitialized for
     * the kernels below. */
    dev_matx.stride = dev_matx.columns;
    dev_result.rows = MATRIX_ROW_SIZE;
    dev_result.columns = dev_result.rows;
    dev_result.stride = dev_result.columns;
    assert_cuda( cudaMalloc((void**)&dev_I.elements,MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(float)) );
    assert_cuda( cudaMalloc((void**)&dev_matx.elements,MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(float)) );
    assert_cuda( cudaMalloc((void**)&dev_result.elements,MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(float)) );

    /* Initialise both operands to the identity matrix on the device. */
    kern_identity_matrix<<< blocks_per_grid,threads_per_block >>>(dev_I);
    kern_identity_matrix<<< blocks_per_grid,threads_per_block >>>(dev_matx);

    while(!stop_process){
        kern_mult_matrix_shared<<< blocks_per_grid,threads_per_block >>>(dev_I,dev_matx,dev_result);
    }
    /* cudaFree synchronizes with any still-pending launches. */
    assert_cuda( cudaFree(dev_I.elements) );
    assert_cuda( cudaFree(dev_result.elements) );
    assert_cuda( cudaFree(dev_matx.elements) );
    return NULL;
}
|
|
|
+
|
|
|
/* Thread body: stresses the host memory subsystem by copying one
 * MATRIX_ROW_SIZE^2-byte buffer into another until main() raises
 * stop_process.  arg is unused; always returns NULL. */
void* cpu_load_memory(void* arg)
{
    size_t nbytes = MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(char);
    /* Fix: calloc instead of malloc so the source buffer holds defined
     * bytes (the original memcpy'd from uninitialized memory). */
    char* foo = (char*) calloc( nbytes, 1 );
    char* bar = (char*) calloc( nbytes, 1 );
    /* Fix: the original never checked the allocations before memcpy. */
    if( foo == NULL || bar == NULL ){
        printf("Cannot allocate buffers for the CPU memory load thread\n");
        free(foo);
        free(bar);
        return NULL;
    }
    while(!stop_process){
        memcpy(foo, bar, nbytes);
    }
    free(foo);
    free(bar);
    return NULL;
}
|
|
|
+
|
|
|
/* Thread body: hammers the GPU memory controller with back-to-back
 * device-to-device copies until main() raises stop_process.  arg is unused;
 * always returns NULL. */
void* gpu_load_memory(void* arg)
{
    const size_t chunk = MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(char);
    char *dev_dst;
    char *dev_src;

    assert_cuda( cudaMalloc((void**)&dev_dst,chunk) );
    assert_cuda( cudaMalloc((void**)&dev_src,chunk) );

    while(!stop_process){
        assert_cuda( cudaMemcpy(dev_dst,dev_src,chunk,cudaMemcpyDeviceToDevice) );
    }

    assert_cuda( cudaFree(dev_dst) );
    assert_cuda( cudaFree(dev_src) );
    return NULL;
}
|
|
|
+
|
|
|
+
|
|
|
/*
 * Benchmark entry point.
 *
 * Measures how long the FPGA needs to DMA nb_bytes (taken from the last
 * command-line argument) into GPU memory that was pinned and mapped through
 * GDRcopy, while four worker threads put compute and memory load on both
 * the CPU and the GPU.  The elapsed time, read from the FPGA performance
 * counter, is appended to loaded.csv.
 */
int main(int argc, char* argv[])
{
    /* Results file: one measurement appended per run. */
    FILE* fp = fopen("loaded.csv","a");
    if( fp == NULL ){
        printf("Cannot open file loaded.csv\n");
        exit( EXIT_FAILURE );
    }

    /* NOTE(review): argv[argc-1] is used unchecked -- with no arguments this
     * parses the program name, and atoi() yields 0 on bad input.  Confirm
     * the launch script always passes a byte count. */
    int nb_bytes = atoi(argv[argc -1]);
    printf("nb_bytes = %d\n",nb_bytes);
    int nb_transfer = nb_bytes/(4*64); //each transfer deals 64 words of 4 bytes

    /* Reference payload: nb_bytes of the constant DATA pattern, handed to
     * the generator side (init_to_send) before launching the sender script.
     * NOTE(review): data is never freed before exit. */
    unsigned char* data=(unsigned char*)calloc(nb_bytes,sizeof(*data));
    memset(data,DATA,nb_bytes);
    init_to_send(data,sizeof(char),nb_bytes);
    system("/home/mathiasb/sources/benchmarking/launch.sh");

    /* Initialisation of the APIs */
    assert_cu( cuInit(0) );
    gdr_t g = gdr_open();
    if( g==NULL){
        printf("Could not open gdr\n");
        exit( EXIT_FAILURE );
    }

    /* Manage NVIDIA GPU: pick device 0 and report its name. */
    printf("\nInitialisation of the GPU\n");
    CUdevice GPU;
    CUdevprop GPUProp;    /* queried below but otherwise unused */
    assert_cu( cuDeviceGet(&GPU,0) );
    assert_cu( cuDeviceGetProperties(&GPUProp,GPU) );
    char gpu_name[GPU_NAME_LENGTH] = {0};
    assert_cu (cuDeviceGetName (gpu_name, GPU_NAME_LENGTH, GPU));
    printf("GPU: %s\n", gpu_name);

    /* Create a context and make it current on this thread. */
    CUcontext cuCtx;
    assert_cu( cuCtxCreate(&cuCtx,CU_CTX_MAP_HOST|CU_CTX_SCHED_AUTO,GPU) );
    assert_cu( cuCtxSetCurrent(cuCtx) );

    /* Allocate memory on the device, pin it with GDRcopy and map it into
     * the process so the FPGA can DMA straight into GPU pages.
     * CU_POINTER_ATTRIBUTE_SYNC_MEMOPS forces synchronous memory operations
     * on these allocations, as required for the external DMA writes. */
    uint8_t flagValueToSet = 1;
    printf("\nMemory mapping with the GPU for pages\n");
    CUdeviceptr gpuPagePtr;
    assert_cu( cuMemAlloc(&gpuPagePtr,nb_bytes) );
    assert_cu( cuPointerSetAttribute(&flagValueToSet,CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,gpuPagePtr) );
    gdr_mh_t GPUMemHandlePage;
    assert_gdr( gdr_pin_buffer(g,gpuPagePtr,nb_bytes,0,0,&GPUMemHandlePage) );
    void* gpuPageVa;
    assert_gdr( gdr_map(g,GPUMemHandlePage,&gpuPageVa,nb_bytes) );
    gdr_info_t pageInfo;    /* pageInfo.bus_addr is what the FPGA writes to */
    assert_gdr( gdr_get_info(g,GPUMemHandlePage,&pageInfo) );

    /* Same pin+map dance for one GPU page that receives descriptor updates. */
    printf("Memory mapping with the GPU for descriptors\n");
    CUdeviceptr gpuDescPtr;
    assert_cu( cuMemAlloc(&gpuDescPtr,GPU_PAGE) );
    assert_cu( cuPointerSetAttribute(&flagValueToSet,CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,gpuDescPtr) );
    gdr_mh_t GPUMemHandleDesc;
    assert_gdr( gdr_pin_buffer(g,gpuDescPtr,GPU_PAGE,0,0,&GPUMemHandleDesc) );
    void* gpuDescVa;
    assert_gdr( gdr_map(g,GPUMemHandleDesc,&gpuDescVa,GPU_PAGE) );
    gdr_info_t descInfo;
    assert_gdr( gdr_get_info(g,GPUMemHandleDesc,&descInfo) );

    /* PCI: open the FPGA device and resolve BAR0 to a virtual address. */
    printf("\nSetting up the PCI\n");
    pcilib_t* pciCtx;
    char* pciVa;
    pciCtx = pcilib_open("/dev/fpga0",PCILIB_MODEL_DETECT);
    if( pciCtx == NULL ){
        printf("Cannot open a context for pci\n");
        exit( EXIT_FAILURE );
    }
    pciVa = pcilib_resolve_bar_address(pciCtx,0, 0);
    if( pciVa == NULL ){
        printf("Cannot resolve PCI physical adress to virtual\n");
        exit( EXIT_FAILURE );
    }
    /* Expose the BAR to the GPU as well (registered as I/O memory).
     * dBAR is not used again below -- presumably kept for kernels that poke
     * the FPGA directly; verify before removing. */
    CUdeviceptr dBAR;
    assert_cu( cuMemHostRegister((void*)pciVa,128,CU_MEMHOSTREGISTER_IOMEMORY) );
    assert_cu( cuMemHostGetDevicePointer(&dBAR,(void*)pciVa, 0) );

    /* Config PCI for Pages: one DMA-able kernel buffer, rounded up to a
     * whole number of 4096-byte pages. */
    pcilib_kmem_handle_t* pciHandlePage;
    pciHandlePage = pcilib_alloc_kernel_memory(pciCtx, PCILIB_KMEM_TYPE_DMA_C2S_PAGE, 1, ((nb_bytes%4096)?(4096 * (1 + nb_bytes/4096)):nb_bytes), 4096, KMEM_USE_DEFAULT, KMEM_DEFAULT_FLAGS);
    if( pciHandlePage == NULL ){
        printf("Cannot allocate PCI kernel memory\n");
        exit( EXIT_FAILURE );
    }
    volatile void* pciMemPtrPage;   /* CPU view of the kernel buffer */
    uintptr_t pciBusPage;           /* bus address the FPGA would use */
    pciMemPtrPage = (uint64_t*) pcilib_kmem_get_block_ua(pciCtx,pciHandlePage,0);
    if( pciMemPtrPage == NULL ){
        printf("Cannot get PCI pointer to kernel memory\n");
        exit( EXIT_FAILURE );
    }
    pciBusPage = pcilib_kmem_get_block_ba(pciCtx,pciHandlePage,0);
    if( pciBusPage == 0 ){
        printf("Cannot get PCI Bus address on kernel memory\n");
        exit( EXIT_FAILURE );
    }

    /* Config PCI for Desc: a small consistent buffer in which the FPGA
     * signals transfer completion (first int flips non-zero). */
    pcilib_kmem_handle_t* pciHandleDesc;
    pciHandleDesc = pcilib_alloc_kernel_memory(pciCtx,PCILIB_KMEM_TYPE_CONSISTENT, 1, 128, 4096,KMEM_USE_RING, KMEM_DEFAULT_FLAGS);
    if( pciHandleDesc == NULL ){
        printf("Cannot allocate PCI kernel memory\n");
        exit( EXIT_FAILURE );
    }
    volatile void* pciMemPtrDesc;
    uintptr_t pciBusDesc;
    pciMemPtrDesc = (uint64_t*) pcilib_kmem_get_block_ua(pciCtx,pciHandleDesc,0);
    if( pciMemPtrDesc == NULL ){
        printf("Cannot get PCI pointer to kernel memory\n");
        exit( EXIT_FAILURE );
    }
    pciBusDesc = pcilib_kmem_get_block_ba(pciCtx,pciHandleDesc,0);
    if( pciBusDesc == 0 ){
        printf("Cannot get PCI Bus address on kernel memory\n");
        exit( EXIT_FAILURE );
    }

    double simple_write_meas1;   /* counter reading just before the DMA kick */
    double simple_write_meas2;   /* counter reading at completion */
    double meas_result;
    /* NOTE(review): getBack is allocated and freed but never written/read. */
    unsigned char* getBack=(unsigned char*)calloc(nb_bytes,sizeof(*getBack));

    /* Start the background load before touching the FPGA so the measurement
     * happens under CPU+GPU pressure. */
    pthread_t cpu_compute,cpu_mem,gpu_compute,gpu_mem;
    pthread_create(&cpu_compute, NULL, cpu_load_compute, NULL);
    pthread_create(&gpu_compute, NULL, gpu_load_compute, NULL);
    pthread_create(&cpu_mem, NULL, cpu_load_memory, NULL);
    pthread_create(&gpu_mem, NULL, gpu_load_memory, NULL);

    /* FPGA DMA engine setup: reset, then program packet geometry, update
     * thresholds and the (bus) addresses for descriptor updates.  Register
     * sequencing and sleeps follow the ipedma bring-up procedure -- do not
     * reorder. */
    printf("\nWorking on the FPGA\n");
    WR32(REG_RESET_DMA, 1);
    usleep(100000);
    WR32(REG_RESET_DMA, 0);
    usleep(100000);
    WR32_sleep(REG_PERF_COUNTER,0);
    WR32_sleep(REG_NUM_PACKETS_PER_DESCRIPTOR,nb_transfer); //16);
    WR32_sleep(REG_PACKET_LENGTH,0x80000 | 64); // added flag
    WR32_sleep(REG_TIMER_THRESHOLD, 0x1);
    WR32_sleep(REG_UPDATE_THRESHOLD, 0x1);
    WR64_sleep(REG_UPDATE_COUNTER,pciBusDesc);
    /* Descriptor updates land in the GDRcopy-pinned GPU descriptor page. */
    WR64_sleep(REG_UPDATE_ADDRESS,descInfo.bus_addr+DESCRIPTOR_OFFSET);

    WR32_sleep(REG_CONTROL,CONTROL_ENABLE_READ|CONTROL_SOURCE_RX_FIFO);
    WR32_sleep(REG_DMA,1);

    WR32_sleep(REG_INTERCONNECT, 0x232); //0x262);
    WR32_sleep(REG_COUNTER,0x1);

    /* Clear the completion word, timestamp, then kick the DMA by handing the
     * FPGA the bus address of the pinned GPU page. */
    *(int*)pciMemPtrDesc=0;
    /* Performance counter at offset 0x14 ticks in 4 ns units; convert to us. */
    simple_write_meas1 = 4. *RD32 (0x14)/ 1000;
    WR64(REG_DESCRIPTOR_ADDRESS,pageInfo.bus_addr);
    /* Busy-poll until the FPGA flags completion; the loop body is the
     * assignment below, so meas2 ends up holding the counter value of the
     * last poll iteration.
     * NOTE(review): the cast drops the volatile qualifier of pciMemPtrDesc,
     * so the compiler could legally hoist this load out of the loop --
     * consider *(volatile int*) here. */
    while(!*(int*)pciMemPtrDesc)
        simple_write_meas2 = 4. *RD32 (0x14)/ 1000;
    meas_result=simple_write_meas2-simple_write_meas1;

    usleep(1000);

    /* NOTE(review): no separator/newline is written after the value, so
     * successive runs concatenate in loaded.csv -- confirm intended. */
    fprintf(fp,"%lf",meas_result);

    /* Close everything: stop the load threads first, then tear the DMA
     * engine down, then release PCI, GDRcopy and CUDA resources in reverse
     * order of acquisition. */
    printf("\nClosing the connections\n");

    stop_process = 1;
    pthread_join(cpu_compute,NULL);
    pthread_join(gpu_compute,NULL);
    pthread_join(cpu_mem,NULL);
    pthread_join(gpu_mem,NULL);

    free(getBack);
    WR32(REG_COUNTER, 0);
    WR32(REG_DMA, 0);
    WR32(REG_RESET_DMA, 1);
    usleep (100000);
    WR32(REG_RESET_DMA, 0);
    usleep (100000);

    pcilib_free_kernel_memory(pciCtx,pciHandleDesc,PCILIB_KMEM_FLAG_FORCE);
    pcilib_free_kernel_memory(pciCtx,pciHandlePage,PCILIB_KMEM_FLAG_FORCE);
    assert_cu( cuMemHostUnregister((void*) pciVa) );
    pcilib_close(pciCtx);
    assert_gdr( gdr_unmap(g,GPUMemHandlePage,gpuPageVa,nb_bytes) );
    assert_gdr( gdr_unpin_buffer(g,GPUMemHandlePage) );
    assert_gdr( gdr_unmap(g,GPUMemHandleDesc,gpuDescVa,GPU_PAGE) );
    assert_gdr( gdr_unpin_buffer(g,GPUMemHandleDesc) );
    assert_gdr( gdr_close(g) );
    assert_cu( cuMemFree(gpuPagePtr) );
    assert_cu( cuMemFree(gpuDescPtr) );
    assert_cu( cuCtxDestroy(cuCtx) );

    fclose(fp);

    printf("All Cleared\n");

    exit(EXIT_SUCCESS);
}
|