#include #include #include #include #include #include #include #include #include #include #define DEVICE "/dev/fpga0" #define BAR PCILIB_BAR0 #define initAssert(ans) { initError((ans), __FILE__, __LINE__); } inline int initError(CUresult code, const char *file, int line) { if (code != CUDA_SUCCESS) { const char *error = NULL; cuGetErrorString (code, &error); fprintf(stderr,"GPUassert: %s (Code: %i) %s %d\n", error, code, file, line); return code; } else { return 0; } } #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } inline int gpuAssert(cudaError_t code, const char *file, int line) { if (code != cudaSuccess) { fprintf(stderr,"GPUassert: %s (Code: %i) %s %d\n", cudaGetErrorString(code), code, file, line); return code; } else { return 0; } } int main(int argc, char *argv[]) { printf("\n"); int manual_init = 1; if (manual_init) { //CUDA initialization initAssert (cuInit(0)); int num_gpus; initAssert (cuDeviceGetCount (&num_gpus)); printf ("Found %i GPUs on the system\n", num_gpus); CUdevice gpu; //will be used to find the correct GPU for (num_gpus--; num_gpus >= 0; num_gpus--) { CUdevice current_gpu; initAssert (cuDeviceGet (¤t_gpu, num_gpus)); char gpu_name[30] = {0}; initAssert (cuDeviceGetName (gpu_name, 30, current_gpu)); printf("GPU %i: %s\n", num_gpus, gpu_name); if (strncmp (gpu_name, "Tesla K40", 9) == 0) { printf ("Found a Tesla GPU! I'll use that one.\n"); gpu = current_gpu; break; } } //The CU_CTX_MAP_HOST is what we are interested in! CUcontext context; initAssert (cuCtxCreate (&context, CU_CTX_MAP_HOST | CU_CTX_SCHED_AUTO, gpu)); initAssert (cuCtxSetCurrent (context)); //NOTE: API Version 3010 is problematic //(see https://www.cs.cmu.edu/afs/cs/academic/class/15668-s11/www/cuda-doc/html/group__CUDART__DRIVER.html) unsigned int api_version; initAssert (cuCtxGetApiVersion (context, &api_version)); printf ("CUDA API Version: %u\n", api_version); } else { int gpu_found = 0; int num_gpus; gpuErrchk (cudaGetDeviceCount(&num_gpus)); for (num_gpus--; num_gpus >= 0; num_gpus--) { struct cudaDeviceProp dev_props; gpuErrchk (cudaGetDeviceProperties (&dev_props, num_gpus)); if (dev_props.canMapHostMemory > 0) { printf ("%s is able to map host memory. I'll use that one.\n", dev_props.name); gpuErrchk (cudaSetDevice (num_gpus)); gpuErrchk (cudaSetDeviceFlags(cudaDeviceMapHost)); gpu_found = 1; break; } else { printf ("%s can not map host memory. Won't use it.\n", dev_props.name); } } if (!gpu_found) { printf ("None of the installed GPUs can map host memory. Aborting...\n"); exit(1); } } printf ("CUDA init done\n\n"); pcilib_t *pci; void* volatile bar; const pcilib_bar_info_t *bar_info; pci = pcilib_open(DEVICE, PCILIB_MODEL_DETECT); if (!pci) { printf("pcilib_open\n"); exit(1); } bar = pcilib_resolve_bar_address(pci, BAR, 0); if (!bar) { pcilib_close(pci); printf("map bar\n"); exit(1); } printf("BAR mapped to: %p\n", bar); bar_info = pcilib_get_bar_info(pci, BAR); printf("%p (Phys: 0x%lx, Size: 0x%x)\n", bar_info[BAR].virt_addr, bar_info[BAR].phys_addr, bar_info[BAR].size); //write some test data to the BAR int *test = ((int *)(((unsigned long)bar)+0x9100)); /* int *test = (int *)malloc(sizeof(int)); */ /* int *test = (int *)0xfb000000; */ printf ("Pointing to: 0x%lx\n", test); *(test) = 0xdeadbeef; printf ("Set BAR content to: 0x%x\n", *(test)); //try to map the BAR address as host memory if (manual_init) { /* initAssert (cuMemHostRegister (foobar, 0x1000, CU_MEMHOSTREGISTER_DEVICEMAP)); */ /* initAssert (cuMemHostRegister (bar, 0x4, CU_MEMHOSTREGISTER_DEVICEMAP)); */ initAssert (cuMemHostRegister (test, 0x4, CU_MEMHOSTREGISTER_IOMEMORY)); /* *(volatile int *)(test) = 0xdeadbeef; */ } else { /* gpuErrchk(cudaHostRegister(bar, bar_info[BAR].size, cudaHostRegisterMapped | cudaHostRegisterPortable)) */ gpuErrchk(cudaHostRegister(test, 0x1000, cudaHostRegisterMapped | cudaHostRegisterPortable)) /* gpuErrchk(cudaHostRegister(bar, 0x1000, cudaHostRegisterDefault)) */ /* gpuErrchk(cudaHostRegister((void *)bar_info[BAR].phys_addr, 4, cudaHostRegisterMapped)) */ /* gpuErrchk(cudaHostRegister(foobar, 0x4, cudaHostRegisterDefault)) */ } /* get the GPU-Mapped pointer */ void *bar_gpu_pointer; gpuErrchk (cudaHostGetDevicePointer (&bar_gpu_pointer, test, 0)); /* create 'overwrite' data and load it to a normal CUDA memory */ int *overwrite = (int *)malloc(sizeof(int)); *overwrite = 0xdefaced; printf ("Set overwrite value to: 0x%x\n", *overwrite); void *normal_gpu_mem; gpuErrchk (cudaMalloc(&normal_gpu_mem, sizeof(int))); cudaDeviceSynchronize(); gpuErrchk (cudaMemcpy (normal_gpu_mem, overwrite, sizeof(int), cudaMemcpyHostToDevice)); /* read the 'overwrite' data back to mapped bar address purely in GPU address space */ gpuErrchk (cudaMemcpy (bar_gpu_pointer, normal_gpu_mem, sizeof(int), cudaMemcpyDeviceToDevice)); cudaDeviceSynchronize(); /* if everything works, this should show the overwrite data */ printf ("Read BAR as: 0x%x\n", *(test)); free (overwrite); pcilib_close(pci); printf("PCI closed\n"); }