#define _XOPEN_SOURCE 500 #include #include #include #include #include #include #include #include #include #include #include #include "ocl.h" /* this should actually come from the distributed pcitool sources */ #include "pciDriver.h" typedef struct { /* pcilib */ pcilib_t *pci; uint8_t *bar; cl_ulong bar_phys; uint8_t board_gen; pcilib_kmem_handle_t *kdesc; uintptr_t kdesc_bus; volatile uint32_t *desc; /* OpenCL */ OclPlatform *ocl; cl_device_id device; cl_command_queue queue; cl_context context; cl_program program; cl_kernel kernel; cl_mem check_buffer; /* both */ cl_mem fpga_buffer; struct { cl_mem buffer; cl_bus_address_amd addr; } gpu; struct { pcilib_kmem_handle_t *kmem; uint32_t *buffer; uintptr_t addr; } cpu; } App; #define UNICODE_CHECK_MARK "o" #define UNICODE_CROSS "x" #define KMEM_DEFAULT_FLAGS PCILIB_KMEM_FLAG_HARDWARE | \ PCILIB_KMEM_FLAG_PERSISTENT | \ PCILIB_KMEM_FLAG_EXCLUSIVE #define KMEM_USE_RING PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 1) #define KMEM_USE_DEFAULT PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 2) #define REG_RESET_DMA 0x00 #define REG_DMA 0x04 #define REG_NUM_PACKETS_PER_DESCRIPTOR 0x10 #define REG_PERF_COUNTER 0x28 #define REG_PACKET_LENGTH 0x0C #define REG_DESCRIPTOR_ADDRESS 0x50 #define REG_UPDATE_ADDRESS 0x58 #define REG_UPDATE_THRESHOLD 0x60 #define REG_COUNTER 0x9000 #define REG_VERSION 0x9020 #define REG_CONTROL 0x9040 #define REG_LATENCY 0x9044 #define REG_NUM_ROWS 0x9168 #define REG_NUM_FRAMES 0x9170 #define REG_DBG_RQ_RST 0x9344 #define REG_DBG_CQ_RST 0x93A4 #define CMD_DMA_START 0x1 #define CMD_DMA_STOP 0x0 #define CMD_COUNTER_RESET 0xf0 #define CMD_COUNTER_START 0x1 #define CMD_COUNTER_STOP 0x0 #define CMD_LATENCY_RESET 0x0800000f #define CMD_LATENCY_STOP 0xf0000000 #define WR32(addr, value) *(uint32_t *) (app->bar + (addr)) = (value); #define RD32(addr) (*(uint32_t *) (app->bar + (addr))) #define WR32_sleep(addr, value) *(uint32_t *) (app->bar + (addr)) = (value); usleep (100); #define WR64(addr, value) *(uint64_t *) (app->bar + (addr)) = (value); #define RD64(addr) (*(uint64_t *) (app->bar + (addr))) #define WR64_sleep(addr, value) *(uint64_t *) (app->bar + (addr)) = (value); usleep (100); static clEnqueueMakeBuffersResidentAMD_fn clEnqueueMakeBuffersResidentAMD = NULL; static clEnqueueWaitSignalAMD_fn clEnqueueWaitSignalAMD = NULL; static const size_t TLP_SIZE = 64; static const size_t FPGA_BUFFER_SIZE = 1024 * 64; static uint32_t PAGE_SIZE = 4096; static uint32_t NUM_PAGES = 1; static size_t GPU_BUFFER_SIZE = 4096; static size_t CPU_BUFFER_SIZE = 4096; static size_t CHECK_BUFFER_SIZE = 8; /* declaration should actually come from a distributed header file */ const pcilib_board_info_t *pcilib_get_board_info (pcilib_t *); static bool init_pcilib (App *app) { static const char *DEVICE = "/dev/fpga0"; const pcilib_board_info_t *board; app->pci = pcilib_open (DEVICE, "pci"); if (app->pci == NULL) { printf ("Could not open `%s'", DEVICE); return false; } app->bar = pcilib_map_bar (app->pci, PCILIB_BAR0); if (app->bar == NULL) { printf ("Unable to map BAR\n"); pcilib_close (app->pci); return false; } board = pcilib_get_board_info (app->pci); app->bar_phys = board->bar_start[PCILIB_BAR0]; app->board_gen = RD32 (0x18) & 0xF; app->kdesc = pcilib_alloc_kernel_memory (app->pci, PCILIB_KMEM_TYPE_CONSISTENT, 1, 128, 4096, KMEM_USE_RING, KMEM_DEFAULT_FLAGS); app->kdesc_bus = pcilib_kmem_get_block_ba (app->pci, app->kdesc, 0); app->desc = (uint32_t *) pcilib_kmem_get_block_ua (app->pci, app->kdesc, 0); memset ((uint32_t *) app->desc, 0, 5 * sizeof (uint32_t)); printf ("%-16s 0x%X\n", "Firmware", RD32 (REG_VERSION)); return true; } static void close_pcilib (App *app) { pcilib_kmem_flags_t flags = PCILIB_KMEM_FLAG_HARDWARE | PCILIB_KMEM_FLAG_PERSISTENT | PCILIB_KMEM_FLAG_EXCLUSIVE; pcilib_free_kernel_memory (app->pci, app->kdesc, KMEM_DEFAULT_FLAGS); pcilib_free_kernel_memory (app->pci, app->cpu.kmem, KMEM_DEFAULT_FLAGS); pcilib_clean_kernel_memory (app->pci, KMEM_USE_DEFAULT, flags); pcilib_unmap_bar (app->pci, PCILIB_BAR0, (void *) app->bar); pcilib_close (app->pci); } static cl_int create_fpga_buffer (App *app, size_t size) { cl_mem_flags flags; cl_bus_address_amd addr; cl_int error; flags = CL_MEM_EXTERNAL_PHYSICAL_AMD; addr.surface_bus_address = (cl_ulong) app->bar_phys; addr.marker_bus_address = (cl_ulong) app->bar_phys; app->fpga_buffer = clCreateBuffer (app->context, flags, size, &addr, &error); return error; } static cl_int create_gpu_buffer (App *app, size_t size) { cl_mem_flags flags; cl_int error; char *data; data = malloc (size); memset (data, 42, size); flags = CL_MEM_BUS_ADDRESSABLE_AMD | CL_MEM_COPY_HOST_PTR; app->gpu.buffer = clCreateBuffer (app->context, flags, size, data, &error); if (error != CL_SUCCESS) return error; return clEnqueueMakeBuffersResidentAMD (app->queue, 1, &app->gpu.buffer, CL_TRUE, &app->gpu.addr, 0, NULL, NULL); } static cl_int create_cpu_buffer (App *app, size_t size) { pcilib_kmem_flags_t flags = PCILIB_KMEM_FLAG_HARDWARE | PCILIB_KMEM_FLAG_PERSISTENT | PCILIB_KMEM_FLAG_EXCLUSIVE; app->cpu.kmem = pcilib_alloc_kernel_memory (app->pci, /*PCILIB_KMEM_TYPE_DMA_C2S_PAGE*/ PCILIB_KMEM_TYPE_CONSISTENT, 1, size, CPU_BUFFER_SIZE, KMEM_USE_DEFAULT, flags); app->cpu.addr = pcilib_kmem_get_block_ba(app->pci, app->cpu.kmem, 0); app->cpu.buffer = (uint32_t*) pcilib_kmem_get_block_ua(app->pci, app->cpu.kmem, 0); memset (app->cpu.buffer, 42, CPU_BUFFER_SIZE); return 0; } static bool init_opencl (App *app) { cl_int error; cl_platform_id platform; app->ocl = ocl_new_with_queues (0, CL_DEVICE_TYPE_GPU, CL_QUEUE_PROFILING_ENABLE); platform = ocl_get_platform (app->ocl); clEnqueueMakeBuffersResidentAMD = clGetExtensionFunctionAddressForPlatform (platform, "clEnqueueMakeBuffersResidentAMD"); clEnqueueWaitSignalAMD = clGetExtensionFunctionAddressForPlatform (platform, "clEnqueueWaitSignalAMD"); app->device = ocl_get_devices (app->ocl)[0]; app->queue = ocl_get_cmd_queues (app->ocl)[0]; app->context = ocl_get_context (app->ocl); app->program = ocl_create_program_from_file (app->ocl, "kernel.cl", NULL, &error); OCL_CHECK_ERROR (error); app->kernel = clCreateKernel (app->program, "wait_and_write", &error); OCL_CHECK_ERROR (error); app->check_buffer = clCreateBuffer (app->context, CL_MEM_WRITE_ONLY, CHECK_BUFFER_SIZE, NULL, &error); OCL_CHECK_ERROR (error); error |= create_fpga_buffer (app, FPGA_BUFFER_SIZE); OCL_CHECK_ERROR (error); error |= create_gpu_buffer (app, GPU_BUFFER_SIZE); OCL_CHECK_ERROR (error); error |= create_cpu_buffer (app, CPU_BUFFER_SIZE); OCL_CHECK_ERROR (error); return error != CL_SUCCESS ? false : true; } static void close_opencl (App *app) { OCL_CHECK_ERROR (clReleaseKernel (app->kernel)); OCL_CHECK_ERROR (clReleaseProgram (app->program)); OCL_CHECK_ERROR (clReleaseMemObject (app->fpga_buffer)); OCL_CHECK_ERROR (clReleaseMemObject (app->check_buffer)); ocl_free (app->ocl); } static void debug_assert (const char *message, bool condition) { printf ("%-16s ", message); if (condition) printf (UNICODE_CHECK_MARK"\n"); else printf (UNICODE_CROSS"\n"); } static void debug_assert_cmp (const char *message, uint32_t value, uint32_t expected) { printf ("%-16s ", message); if (value != expected) printf (UNICODE_CROSS" [%x (%i) != %x (%i)]\n", value, value, expected, expected); else printf (UNICODE_CHECK_MARK"\n"); } #if 0 static void check_value (App *app, const char *message, uint32_t addr, uint32_t expected) { uint32_t value; value = RD32 (addr); debug_assert_cmp (message, value, expected); } #endif static void configure_dma (App *app) { uint32_t value; WR32 (REG_RESET_DMA, 1); usleep (100000); WR32 (REG_RESET_DMA, 0); usleep (100000); value = RD32 (REG_RESET_DMA); debug_assert ("PCIe check", value == 335746816 || value == 335681280); WR32 (REG_NUM_PACKETS_PER_DESCRIPTOR, NUM_PAGES * PAGE_SIZE / (4 * TLP_SIZE)); if (app->board_gen == 3) { WR32 (REG_PACKET_LENGTH, 0x80000 | TLP_SIZE); } else { WR32 (REG_PACKET_LENGTH, TLP_SIZE); } /* reset host side addr */ app->desc[app->board_gen == 3 ? 2 : 4] = 0; } static void configure_dma_descriptors (App *app) { WR32 (REG_UPDATE_THRESHOLD, 0); WR64 (REG_UPDATE_ADDRESS, app->kdesc_bus); usleep (100000); } static void setup_counter (App *app) { WR32_sleep (REG_NUM_ROWS, 0); WR32_sleep (REG_NUM_FRAMES, 0); WR32_sleep (REG_CONTROL, CMD_LATENCY_RESET); WR32_sleep (REG_CONTROL, CMD_COUNTER_STOP); WR32_sleep (REG_COUNTER, CMD_COUNTER_RESET); WR32_sleep (REG_COUNTER, CMD_COUNTER_START); } static void stop_dma (App *app) { WR32_sleep (REG_DMA, 0); } static void reset_dbg (App *app) { // TX WR32_sleep (REG_DBG_RQ_RST, 1); WR32_sleep (REG_DBG_RQ_RST, 0); // RX WR32_sleep (REG_DBG_CQ_RST, 1); WR32_sleep (REG_DBG_CQ_RST, 0); } static double compute_debug_latency (App *app) { uint64_t start, start_high, start_low; uint64_t end, end_high, end_low; WR32_sleep (0x93A0, 0); start_high = RD32 (0x93B8); start_low = RD32 (0x93BC); WR32_sleep (0x93A0,1) end_high = RD32 (0x93B8); end_low = RD32 (0x93BC); start = (start_high << 32) | start_low; end = (end_high << 32) | end_low; return ((end - start) * 4) / 1000.0; } static double elapsed_seconds (struct timespec *start, struct timespec *end) { return (end->tv_sec + end->tv_nsec / 1000000000.0) - (start->tv_sec + start->tv_nsec / 1000000000.0); } static double elapsed_gpu_seconds (cl_event event) { cl_ulong start; cl_ulong end; OCL_CHECK_ERROR (clGetEventProfilingInfo (event, CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &start, NULL)); OCL_CHECK_ERROR (clGetEventProfilingInfo (event, CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &end, NULL)); return (end - start) / 1000.0 / 1000.0 / 1000.0; } static void check_data_transfer (App *app, size_t check_size) { uint32_t *data; data = malloc (check_size); memset (data, 0, check_size); OCL_CHECK_ERROR (clEnqueueReadBuffer (app->queue, app->gpu.buffer, CL_TRUE, 0, check_size, data, 0, NULL, NULL)); for (uint32_t i = 0; i < check_size / 4; i++) { if (data[i] != i) { debug_assert_cmp ("FPGA->GPU write", data[i], i); goto finish_check_data_transfer; } } debug_assert ("FPGA->GPU write", true); finish_check_data_transfer: free (data); } static void check_data_transfer_cpu (App *app, size_t check_size) { for (uint32_t i = 0; i < check_size / 4; i++) { if (app->cpu.buffer[i] != i) { debug_assert_cmp ("FPGA->CPU write", app->cpu.buffer[i], i); return; } } debug_assert ("FPGA->CPU write", true); } static void fill_gpu_buffer (App *app) { uint32_t pattern; cl_event event; pattern = 42; OCL_CHECK_ERROR (clEnqueueFillBuffer (app->queue, app->gpu.buffer, &pattern, sizeof (pattern), 0, GPU_BUFFER_SIZE / 4, 0, NULL, &event)); OCL_CHECK_ERROR (clWaitForEvents (1, &event)); OCL_CHECK_ERROR (clReleaseEvent (event)); } static void measure_fpga_to_gpu_latency_with_marker (App *app) { uint32_t counter; struct timespec start; struct timespec end; double host_latency; double debug_latency; cl_event event; printf ("\n** FPGA to GPU latency [marker]\n\n"); fill_gpu_buffer (app); configure_dma (app); WR32 (REG_UPDATE_THRESHOLD, 0); WR64_sleep (REG_UPDATE_ADDRESS, app->gpu.addr.marker_bus_address); setup_counter (app); WR64_sleep (REG_DESCRIPTOR_ADDRESS, app->gpu.addr.surface_bus_address); reset_dbg (app); OCL_CHECK_ERROR (clEnqueueWaitSignalAMD (app->queue, app->gpu.buffer, 0xd0dad0da, 0, NULL, &event)); clock_gettime (CLOCK_MONOTONIC, &start); WR32 (REG_DMA, 1); OCL_CHECK_ERROR (clWaitForEvents (1, &event)); clock_gettime (CLOCK_MONOTONIC, &end); stop_dma (app); OCL_CHECK_ERROR (clReleaseEvent (event)); counter = RD32 (REG_PERF_COUNTER); host_latency = elapsed_seconds (&start, &end); debug_latency = compute_debug_latency (app); check_data_transfer (app, PAGE_SIZE); printf ("\n%-16s %f us\n", "Wall time", host_latency * 1000.0 * 1000.0); printf ("%-16s %f us\n", "FPGA [counter]", ((counter << 8) * 4) / 1000.0); printf ("%-16s %f us\n", "FPGA [debug]", debug_latency); printf ("%-16s %.2f MB/s\n", "Throughput", GPU_BUFFER_SIZE / host_latency / 1024. / 1024.); } static void measure_fpga_to_gpu_latency_with_kernel (App *app) { uint32_t counter; struct timespec start; struct timespec end; double host_latency; double debug_latency; cl_event event; uint32_t check[CHECK_BUFFER_SIZE / 4]; size_t work_size = 1; printf ("\n** FPGA to GPU latency [kernel]\n\n"); fill_gpu_buffer (app); configure_dma (app); configure_dma_descriptors (app); setup_counter (app); WR64_sleep (REG_DESCRIPTOR_ADDRESS, app->gpu.addr.surface_bus_address); reset_dbg (app); OCL_CHECK_ERROR (clSetKernelArg (app->kernel, 0, sizeof (cl_mem), &app->gpu.buffer)); OCL_CHECK_ERROR (clSetKernelArg (app->kernel, 1, sizeof (cl_mem), &app->fpga_buffer)); OCL_CHECK_ERROR (clSetKernelArg (app->kernel, 2, sizeof (cl_mem), &app->check_buffer)); OCL_CHECK_ERROR (clEnqueueNDRangeKernel (app->queue, app->kernel, 1, NULL, &work_size, NULL, 0, NULL, &event)); WR32 (REG_DMA, 1); clock_gettime (CLOCK_MONOTONIC, &start); clWaitForEvents (1, &event); clock_gettime (CLOCK_MONOTONIC, &end); OCL_CHECK_ERROR (clReleaseEvent (event)); counter = RD32 (REG_PERF_COUNTER); host_latency = elapsed_seconds (&start, &end); debug_latency = compute_debug_latency (app); check_data_transfer (app, PAGE_SIZE); OCL_CHECK_ERROR (clEnqueueReadBuffer (app->queue, app->check_buffer, CL_TRUE, 0, CHECK_BUFFER_SIZE, check, 0, NULL, NULL)); debug_assert_cmp ("Data check", check[0], 1); printf ("\n%-16s %i\n", "Kernel count", check[1]); printf ("%-16s %f us\n", "Wall time", host_latency * 1000.0 * 1000.0); printf ("%-16s %f us\n", "FPGA [counter]", ((counter << 8) * 4) / 1000.0); printf ("%-16s %f us\n", "FPGA [debug]", debug_latency); printf ("%-16s %.2f MB/s\n", "Throughput", GPU_BUFFER_SIZE / host_latency / 1024. / 1024.); } static void measure_fpga_to_gpu_latency_with_cpu (App *app) { uint32_t hardware_ptr; uint32_t counter; unsigned flag_index; struct timespec start; struct timespec end; double host_latency; double debug_latency; printf ("\n** FPGA to GPU latency [CPU]\n\n"); fill_gpu_buffer (app); hardware_ptr = 0; flag_index = app->board_gen == 3 ? 2 : 4; configure_dma (app); configure_dma_descriptors (app); setup_counter (app); WR64_sleep (REG_DESCRIPTOR_ADDRESS, app->gpu.addr.surface_bus_address); reset_dbg (app); clock_gettime (CLOCK_MONOTONIC, &start); WR32 (REG_DMA, 1); do { hardware_ptr = app->desc[flag_index]; } while (hardware_ptr != (app->gpu.addr.surface_bus_address & 0xFFFFFFFF)); clock_gettime (CLOCK_MONOTONIC, &end); stop_dma (app); counter = RD32 (REG_PERF_COUNTER); host_latency = elapsed_seconds (&start, &end); debug_latency = compute_debug_latency (app); check_data_transfer (app, PAGE_SIZE); printf ("\n%-16s %f us\n", "Wall time", host_latency * 1000.0 * 1000.0); printf ("%-16s %f us\n", "FPGA [counter]", ((counter << 8) * 4) / 1000.0); printf ("%-16s %f us\n", "FPGA [debug]", debug_latency); printf ("%-16s %.2f MB/s\n", "Throughput", CPU_BUFFER_SIZE / host_latency / 1024. / 1024.); } static void measure_fpga_to_cpu_latency (App *app) { uint32_t hardware_ptr; uint32_t counter; unsigned flag_index; struct timespec start; struct timespec end; double host_latency; double debug_latency; printf ("\n** FPGA to CPU latency\n\n"); hardware_ptr = 0; flag_index = app->board_gen == 3 ? 2 : 4; configure_dma (app); configure_dma_descriptors (app); setup_counter (app); WR64_sleep (REG_DESCRIPTOR_ADDRESS, app->cpu.addr); reset_dbg (app); clock_gettime (CLOCK_MONOTONIC, &start); WR32 (REG_DMA, 1); do { hardware_ptr = app->desc[flag_index]; } while (hardware_ptr != app->cpu.addr); clock_gettime (CLOCK_MONOTONIC, &end); stop_dma (app); // Data back to FPGA memcpy(app->bar + 0x9400, app->cpu.buffer, CPU_BUFFER_SIZE); counter = RD32 (REG_PERF_COUNTER); host_latency = elapsed_seconds (&start, &end); debug_latency = compute_debug_latency (app); check_data_transfer_cpu (app, PAGE_SIZE); printf ("\n%-16s %f us\n", "Wall time", host_latency * 1000.0 * 1000.0); printf ("%-16s %f us\n", "FPGA [counter]", ((counter << 8) * 4) / 1000.0); printf ("%-16s %f us\n", "FPGA [debug]", debug_latency); printf ("%-16s %.2f MB/s\n", "Throughput", CPU_BUFFER_SIZE / host_latency / 1024. / 1024.); } static void measure_fpga_to_gpu_throughput (App *app) { uint32_t current_ptr; uint32_t hardware_ptr; unsigned flag_index; size_t transferred; cl_ulong addr; double elapsed; struct timespec start; struct timespec end; printf ("\n** FPGA to GPU throughput\n\n"); configure_dma (app); configure_dma_descriptors (app); setup_counter (app); current_ptr = 0; hardware_ptr = 0; transferred = 0; addr = app->gpu.addr.surface_bus_address; flag_index = app->board_gen == 3 ? 2 : 4; WR64_sleep (REG_DESCRIPTOR_ADDRESS, addr); clock_gettime (CLOCK_MONOTONIC, &start); WR32 (REG_DMA, 1); do { do { hardware_ptr = app->desc[flag_index]; } while (hardware_ptr == current_ptr); /* it should work to check against `addr` but it never does */ addr += NUM_PAGES * PAGE_SIZE; transferred += NUM_PAGES * PAGE_SIZE; current_ptr = hardware_ptr; WR64 (REG_DESCRIPTOR_ADDRESS, addr); } while (transferred < GPU_BUFFER_SIZE); clock_gettime (CLOCK_MONOTONIC, &end); stop_dma (app); check_data_transfer (app, GPU_BUFFER_SIZE); elapsed = elapsed_seconds (&start, &end); printf ("\n%-16s %.2f MB/s [%3.5f us]\n", "Host", GPU_BUFFER_SIZE / elapsed / 1024. / 1024., elapsed * 1000 * 1000); } static void measure_gpu_to_fpga_latency (App *app) { cl_event event; cl_mem src_buffer; cl_int err; uint32_t pattern; struct timespec start; struct timespec end; printf ("\n** GPU to FPGA latency\n\n"); src_buffer = clCreateBuffer (app->context, CL_MEM_READ_ONLY, PAGE_SIZE, NULL, &err); OCL_CHECK_ERROR (err); pattern = 0xdeadf00d; OCL_CHECK_ERROR (clEnqueueFillBuffer (app->queue, src_buffer, &pattern, sizeof (pattern), 0, PAGE_SIZE, 0, NULL, &event)); OCL_CHECK_ERROR (clWaitForEvents (1, &event)); OCL_CHECK_ERROR (clReleaseEvent (event)); clock_gettime (CLOCK_MONOTONIC, &start); OCL_CHECK_ERROR (clEnqueueCopyBuffer (app->queue, src_buffer, app->fpga_buffer, 0, 0x9400, PAGE_SIZE, 0, NULL, &event)); OCL_CHECK_ERROR (clWaitForEvents (1, &event)); clock_gettime (CLOCK_MONOTONIC, &end); printf ("%-16s %f us\n", "Wall time", elapsed_seconds (&start, &end) * 1000 * 1000); printf ("%-16s %f us\n", "GPU time", elapsed_gpu_seconds (event) * 1000 * 1000); OCL_CHECK_ERROR (clReleaseEvent (event)); OCL_CHECK_ERROR (clReleaseMemObject (src_buffer)); } int main (int argc, char const* argv[]) { App app; if (argc > 1) { PAGE_SIZE = GPU_BUFFER_SIZE = atoi (argv[1]); } if (argc > 2) { NUM_PAGES = atoi (argv[2]); } printf ("** Parameters\n\n"); printf ("%-16s %u\n", "Pages", NUM_PAGES); printf ("%-16s %zu B\n", "Buffer size", GPU_BUFFER_SIZE); if (!init_pcilib (&app)) return 1; if (!init_opencl (&app)) return 1; measure_fpga_to_gpu_latency_with_marker (&app); measure_fpga_to_gpu_latency_with_kernel (&app); measure_fpga_to_gpu_latency_with_cpu (&app); measure_fpga_to_cpu_latency (&app); measure_gpu_to_fpga_latency (&app); measure_fpga_to_gpu_throughput (&app); close_opencl (&app); close_pcilib (&app); return 0; }