123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769 |
- #define _XOPEN_SOURCE 500
- #include <stdio.h>
- #include <stdlib.h>
- #include <stdbool.h>
- #include <string.h>
- #include <unistd.h>
- #include <time.h>
- #include <pcilib.h>
- #include <pcilib/bar.h>
- #include <pcilib/kmem.h>
- #include <CL/cl.h>
- #include <CL/cl_ext.h>
- #include "ocl.h"
- /* this should actually come from the distributed pcitool sources */
- #include "pciDriver.h"
- typedef struct {
- /* pcilib */
- pcilib_t *pci;
- uint8_t *bar;
- cl_ulong bar_phys;
- uint8_t board_gen;
- pcilib_kmem_handle_t *kdesc;
- uintptr_t kdesc_bus;
- volatile uint32_t *desc;
- /* OpenCL */
- OclPlatform *ocl;
- cl_device_id device;
- cl_command_queue queue;
- cl_context context;
- cl_program program;
- cl_kernel kernel;
- cl_mem check_buffer;
- /* both */
- cl_mem fpga_buffer;
- struct {
- cl_mem buffer;
- cl_bus_address_amd addr;
- } gpu;
- struct {
- pcilib_kmem_handle_t *kmem;
- uint32_t *buffer;
- uintptr_t addr;
- } cpu;
- } App;
- #define UNICODE_CHECK_MARK "o"
- #define UNICODE_CROSS "x"
- #define KMEM_DEFAULT_FLAGS PCILIB_KMEM_FLAG_HARDWARE | \
- PCILIB_KMEM_FLAG_PERSISTENT | \
- PCILIB_KMEM_FLAG_EXCLUSIVE
- #define KMEM_USE_RING PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 1)
- #define KMEM_USE_DEFAULT PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 2)
- #define REG_RESET_DMA 0x00
- #define REG_DMA 0x04
- #define REG_NUM_PACKETS_PER_DESCRIPTOR 0x10
- #define REG_PERF_COUNTER 0x28
- #define REG_PACKET_LENGTH 0x0C
- #define REG_DESCRIPTOR_ADDRESS 0x50
- #define REG_UPDATE_ADDRESS 0x58
- #define REG_UPDATE_THRESHOLD 0x60
- #define REG_COUNTER 0x9000
- #define REG_VERSION 0x9020
- #define REG_CONTROL 0x9040
- #define REG_LATENCY 0x9044
- #define REG_NUM_ROWS 0x9168
- #define REG_NUM_FRAMES 0x9170
- #define REG_DBG_RQ_RST 0x9344
- #define REG_DBG_CQ_RST 0x93A4
- #define CMD_DMA_START 0x1
- #define CMD_DMA_STOP 0x0
- #define CMD_COUNTER_RESET 0xf0
- #define CMD_COUNTER_START 0x1
- #define CMD_COUNTER_STOP 0x0
- #define CMD_LATENCY_RESET 0x0800000f
- #define CMD_LATENCY_STOP 0xf0000000
- #define WR32(addr, value) *(uint32_t *) (app->bar + (addr)) = (value);
- #define RD32(addr) (*(uint32_t *) (app->bar + (addr)))
- #define WR32_sleep(addr, value) *(uint32_t *) (app->bar + (addr)) = (value); usleep (100);
- #define WR64(addr, value) *(uint64_t *) (app->bar + (addr)) = (value);
- #define RD64(addr) (*(uint64_t *) (app->bar + (addr)))
- #define WR64_sleep(addr, value) *(uint64_t *) (app->bar + (addr)) = (value); usleep (100);
- static clEnqueueMakeBuffersResidentAMD_fn clEnqueueMakeBuffersResidentAMD = NULL;
- static clEnqueueWaitSignalAMD_fn clEnqueueWaitSignalAMD = NULL;
- static const size_t TLP_SIZE = 64;
- static const size_t FPGA_BUFFER_SIZE = 1024 * 64;
- static uint32_t PAGE_SIZE = 4096;
- static uint32_t NUM_PAGES = 1;
- static size_t GPU_BUFFER_SIZE = 4096;
- static size_t CPU_BUFFER_SIZE = 4096;
- static size_t CHECK_BUFFER_SIZE = 8;
- /* declaration should actually come from a distributed header file */
- const pcilib_board_info_t *pcilib_get_board_info (pcilib_t *);
- static bool
- init_pcilib (App *app)
- {
- static const char *DEVICE = "/dev/fpga0";
- const pcilib_board_info_t *board;
- app->pci = pcilib_open (DEVICE, "pci");
- if (app->pci == NULL) {
- printf ("Could not open `%s'", DEVICE);
- return false;
- }
- app->bar = pcilib_map_bar (app->pci, PCILIB_BAR0);
- if (app->bar == NULL) {
- printf ("Unable to map BAR\n");
- pcilib_close (app->pci);
- return false;
- }
- board = pcilib_get_board_info (app->pci);
- app->bar_phys = board->bar_start[PCILIB_BAR0];
- app->board_gen = RD32 (0x18) & 0xF;
- app->kdesc = pcilib_alloc_kernel_memory (app->pci,
- PCILIB_KMEM_TYPE_CONSISTENT, 1, 128,
- 4096, KMEM_USE_RING, KMEM_DEFAULT_FLAGS);
- app->kdesc_bus = pcilib_kmem_get_block_ba (app->pci, app->kdesc, 0);
- app->desc = (uint32_t *) pcilib_kmem_get_block_ua (app->pci, app->kdesc, 0);
- memset ((uint32_t *) app->desc, 0, 5 * sizeof (uint32_t));
- printf ("%-16s 0x%X\n", "Firmware", RD32 (REG_VERSION));
- return true;
- }
- static void
- close_pcilib (App *app)
- {
- pcilib_kmem_flags_t flags = PCILIB_KMEM_FLAG_HARDWARE | PCILIB_KMEM_FLAG_PERSISTENT | PCILIB_KMEM_FLAG_EXCLUSIVE;
- pcilib_free_kernel_memory (app->pci, app->kdesc, KMEM_DEFAULT_FLAGS);
- pcilib_free_kernel_memory (app->pci, app->cpu.kmem, KMEM_DEFAULT_FLAGS);
- pcilib_clean_kernel_memory (app->pci, KMEM_USE_DEFAULT, flags);
- pcilib_unmap_bar (app->pci, PCILIB_BAR0, (void *) app->bar);
- pcilib_close (app->pci);
- }
- static cl_int
- create_fpga_buffer (App *app, size_t size)
- {
- cl_mem_flags flags;
- cl_bus_address_amd addr;
- cl_int error;
- flags = CL_MEM_EXTERNAL_PHYSICAL_AMD;
- addr.surface_bus_address = (cl_ulong) app->bar_phys;
- addr.marker_bus_address = (cl_ulong) app->bar_phys;
- app->fpga_buffer = clCreateBuffer (app->context, flags, size, &addr, &error);
- return error;
- }
- static cl_int
- create_gpu_buffer (App *app, size_t size)
- {
- cl_mem_flags flags;
- cl_int error;
- char *data;
- data = malloc (size);
- memset (data, 42, size);
- flags = CL_MEM_BUS_ADDRESSABLE_AMD | CL_MEM_COPY_HOST_PTR;
- app->gpu.buffer = clCreateBuffer (app->context, flags, size, data, &error);
- if (error != CL_SUCCESS)
- return error;
- return clEnqueueMakeBuffersResidentAMD (app->queue, 1, &app->gpu.buffer, CL_TRUE, &app->gpu.addr, 0, NULL, NULL);
- }
- static cl_int
- create_cpu_buffer (App *app, size_t size)
- {
- pcilib_kmem_flags_t flags = PCILIB_KMEM_FLAG_HARDWARE | PCILIB_KMEM_FLAG_PERSISTENT | PCILIB_KMEM_FLAG_EXCLUSIVE;
- app->cpu.kmem = pcilib_alloc_kernel_memory (app->pci,
- /*PCILIB_KMEM_TYPE_DMA_C2S_PAGE*/ PCILIB_KMEM_TYPE_CONSISTENT,
- 1, size, CPU_BUFFER_SIZE, KMEM_USE_DEFAULT, flags);
- app->cpu.addr = pcilib_kmem_get_block_ba(app->pci, app->cpu.kmem, 0);
- app->cpu.buffer = (uint32_t*) pcilib_kmem_get_block_ua(app->pci, app->cpu.kmem, 0);
- memset (app->cpu.buffer, 42, CPU_BUFFER_SIZE);
- return 0;
- }
- static bool
- init_opencl (App *app)
- {
- cl_int error;
- cl_platform_id platform;
- app->ocl = ocl_new_with_queues (0, CL_DEVICE_TYPE_GPU, CL_QUEUE_PROFILING_ENABLE);
- platform = ocl_get_platform (app->ocl);
- clEnqueueMakeBuffersResidentAMD = clGetExtensionFunctionAddressForPlatform (platform, "clEnqueueMakeBuffersResidentAMD");
- clEnqueueWaitSignalAMD = clGetExtensionFunctionAddressForPlatform (platform, "clEnqueueWaitSignalAMD");
- app->device = ocl_get_devices (app->ocl)[0];
- app->queue = ocl_get_cmd_queues (app->ocl)[0];
- app->context = ocl_get_context (app->ocl);
- app->program = ocl_create_program_from_file (app->ocl, "kernel.cl", NULL, &error);
- OCL_CHECK_ERROR (error);
- app->kernel = clCreateKernel (app->program, "wait_and_write", &error);
- OCL_CHECK_ERROR (error);
- app->check_buffer = clCreateBuffer (app->context, CL_MEM_WRITE_ONLY, CHECK_BUFFER_SIZE, NULL, &error);
- OCL_CHECK_ERROR (error);
- error |= create_fpga_buffer (app, FPGA_BUFFER_SIZE);
- OCL_CHECK_ERROR (error);
- error |= create_gpu_buffer (app, GPU_BUFFER_SIZE);
- OCL_CHECK_ERROR (error);
- error |= create_cpu_buffer (app, CPU_BUFFER_SIZE);
- OCL_CHECK_ERROR (error);
- return error != CL_SUCCESS ? false : true;
- }
- static void
- close_opencl (App *app)
- {
- OCL_CHECK_ERROR (clReleaseKernel (app->kernel));
- OCL_CHECK_ERROR (clReleaseProgram (app->program));
- OCL_CHECK_ERROR (clReleaseMemObject (app->fpga_buffer));
- OCL_CHECK_ERROR (clReleaseMemObject (app->check_buffer));
- ocl_free (app->ocl);
- }
- static void
- debug_assert (const char *message, bool condition)
- {
- printf ("%-16s ", message);
- if (condition)
- printf (UNICODE_CHECK_MARK"\n");
- else
- printf (UNICODE_CROSS"\n");
- }
- static void
- debug_assert_cmp (const char *message, uint32_t value, uint32_t expected)
- {
- printf ("%-16s ", message);
- if (value != expected)
- printf (UNICODE_CROSS" [%x (%i) != %x (%i)]\n", value, value, expected, expected);
- else
- printf (UNICODE_CHECK_MARK"\n");
- }
- #if 0
- static void
- check_value (App *app, const char *message, uint32_t addr, uint32_t expected)
- {
- uint32_t value;
- value = RD32 (addr);
- debug_assert_cmp (message, value, expected);
- }
- #endif
- static void
- configure_dma (App *app)
- {
- uint32_t value;
- WR32 (REG_RESET_DMA, 1);
- usleep (100000);
- WR32 (REG_RESET_DMA, 0);
- usleep (100000);
- value = RD32 (REG_RESET_DMA);
- debug_assert ("PCIe check", value == 335746816 || value == 335681280);
- WR32 (REG_NUM_PACKETS_PER_DESCRIPTOR, NUM_PAGES * PAGE_SIZE / (4 * TLP_SIZE));
- if (app->board_gen == 3) {
- WR32 (REG_PACKET_LENGTH, 0x80000 | TLP_SIZE);
- }
- else {
- WR32 (REG_PACKET_LENGTH, TLP_SIZE);
- }
- /* reset host side addr */
- app->desc[app->board_gen == 3 ? 2 : 4] = 0;
- }
- static void
- configure_dma_descriptors (App *app)
- {
- WR32 (REG_UPDATE_THRESHOLD, 0);
- WR64 (REG_UPDATE_ADDRESS, app->kdesc_bus);
- usleep (100000);
- }
- static void
- setup_counter (App *app)
- {
- WR32_sleep (REG_NUM_ROWS, 0);
- WR32_sleep (REG_NUM_FRAMES, 0);
- WR32_sleep (REG_CONTROL, CMD_LATENCY_RESET);
- WR32_sleep (REG_CONTROL, CMD_COUNTER_STOP);
- WR32_sleep (REG_COUNTER, CMD_COUNTER_RESET);
- WR32_sleep (REG_COUNTER, CMD_COUNTER_START);
- }
- static void
- stop_dma (App *app)
- {
- WR32_sleep (REG_DMA, 0);
- }
- static void
- reset_dbg (App *app)
- {
- // TX
- WR32_sleep (REG_DBG_RQ_RST, 1);
- WR32_sleep (REG_DBG_RQ_RST, 0);
- // RX
- WR32_sleep (REG_DBG_CQ_RST, 1);
- WR32_sleep (REG_DBG_CQ_RST, 0);
- }
- static double
- compute_debug_latency (App *app)
- {
- uint64_t start, start_high, start_low;
- uint64_t end, end_high, end_low;
- WR32_sleep (0x93A0, 0);
- start_high = RD32 (0x93B8);
- start_low = RD32 (0x93BC);
- WR32_sleep (0x93A0,1)
- end_high = RD32 (0x93B8);
- end_low = RD32 (0x93BC);
- start = (start_high << 32) | start_low;
- end = (end_high << 32) | end_low;
- return ((end - start) * 4) / 1000.0;
- }
- static double
- elapsed_seconds (struct timespec *start, struct timespec *end)
- {
- return (end->tv_sec + end->tv_nsec / 1000000000.0) - (start->tv_sec + start->tv_nsec / 1000000000.0);
- }
- static double
- elapsed_gpu_seconds (cl_event event)
- {
- cl_ulong start;
- cl_ulong end;
- OCL_CHECK_ERROR (clGetEventProfilingInfo (event, CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &start, NULL));
- OCL_CHECK_ERROR (clGetEventProfilingInfo (event, CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &end, NULL));
- return (end - start) / 1000.0 / 1000.0 / 1000.0;
- }
- static void
- check_data_transfer (App *app, size_t check_size)
- {
- uint32_t *data;
- data = malloc (check_size);
- memset (data, 0, check_size);
- OCL_CHECK_ERROR (clEnqueueReadBuffer (app->queue, app->gpu.buffer, CL_TRUE, 0, check_size, data, 0, NULL, NULL));
- for (uint32_t i = 0; i < check_size / 4; i++) {
- if (data[i] != i) {
- debug_assert_cmp ("FPGA->GPU write", data[i], i);
- goto finish_check_data_transfer;
- }
- }
- debug_assert ("FPGA->GPU write", true);
- finish_check_data_transfer:
- free (data);
- }
- static void
- check_data_transfer_cpu (App *app, size_t check_size)
- {
- for (uint32_t i = 0; i < check_size / 4; i++) {
- if (app->cpu.buffer[i] != i) {
- debug_assert_cmp ("FPGA->CPU write", app->cpu.buffer[i], i);
- return;
- }
- }
- debug_assert ("FPGA->CPU write", true);
- }
- static void
- fill_gpu_buffer (App *app)
- {
- uint32_t pattern;
- cl_event event;
- pattern = 42;
- OCL_CHECK_ERROR (clEnqueueFillBuffer (app->queue, app->gpu.buffer, &pattern, sizeof (pattern), 0, GPU_BUFFER_SIZE / 4, 0, NULL, &event));
- OCL_CHECK_ERROR (clWaitForEvents (1, &event));
- OCL_CHECK_ERROR (clReleaseEvent (event));
- }
- static void
- measure_fpga_to_gpu_latency_with_marker (App *app)
- {
- uint32_t counter;
- struct timespec start;
- struct timespec end;
- double host_latency;
- double debug_latency;
- cl_event event;
- printf ("\n** FPGA to GPU latency [marker]\n\n");
- fill_gpu_buffer (app);
- configure_dma (app);
- WR32 (REG_UPDATE_THRESHOLD, 0);
- WR64_sleep (REG_UPDATE_ADDRESS, app->gpu.addr.marker_bus_address);
- setup_counter (app);
- WR64_sleep (REG_DESCRIPTOR_ADDRESS, app->gpu.addr.surface_bus_address);
- reset_dbg (app);
- OCL_CHECK_ERROR (clEnqueueWaitSignalAMD (app->queue, app->gpu.buffer, 0xd0dad0da, 0, NULL, &event));
- clock_gettime (CLOCK_MONOTONIC, &start);
- WR32 (REG_DMA, 1);
- OCL_CHECK_ERROR (clWaitForEvents (1, &event));
- clock_gettime (CLOCK_MONOTONIC, &end);
- stop_dma (app);
- OCL_CHECK_ERROR (clReleaseEvent (event));
- counter = RD32 (REG_PERF_COUNTER);
- host_latency = elapsed_seconds (&start, &end);
- debug_latency = compute_debug_latency (app);
- check_data_transfer (app, PAGE_SIZE);
- printf ("\n%-16s %f us\n", "Wall time", host_latency * 1000.0 * 1000.0);
- printf ("%-16s %f us\n", "FPGA [counter]", ((counter << 8) * 4) / 1000.0);
- printf ("%-16s %f us\n", "FPGA [debug]", debug_latency);
- printf ("%-16s %.2f MB/s\n", "Throughput", GPU_BUFFER_SIZE / host_latency / 1024. / 1024.);
- }
- static void
- measure_fpga_to_gpu_latency_with_kernel (App *app)
- {
- uint32_t counter;
- struct timespec start;
- struct timespec end;
- double host_latency;
- double debug_latency;
- cl_event event;
- uint32_t check[CHECK_BUFFER_SIZE / 4];
- size_t work_size = 1;
- printf ("\n** FPGA to GPU latency [kernel]\n\n");
- fill_gpu_buffer (app);
- configure_dma (app);
- configure_dma_descriptors (app);
- setup_counter (app);
- WR64_sleep (REG_DESCRIPTOR_ADDRESS, app->gpu.addr.surface_bus_address);
- reset_dbg (app);
- OCL_CHECK_ERROR (clSetKernelArg (app->kernel, 0, sizeof (cl_mem), &app->gpu.buffer));
- OCL_CHECK_ERROR (clSetKernelArg (app->kernel, 1, sizeof (cl_mem), &app->fpga_buffer));
- OCL_CHECK_ERROR (clSetKernelArg (app->kernel, 2, sizeof (cl_mem), &app->check_buffer));
- OCL_CHECK_ERROR (clEnqueueNDRangeKernel (app->queue, app->kernel, 1,
- NULL, &work_size, NULL,
- 0, NULL, &event));
- WR32 (REG_DMA, 1);
- clock_gettime (CLOCK_MONOTONIC, &start);
- clWaitForEvents (1, &event);
- clock_gettime (CLOCK_MONOTONIC, &end);
- OCL_CHECK_ERROR (clReleaseEvent (event));
- counter = RD32 (REG_PERF_COUNTER);
- host_latency = elapsed_seconds (&start, &end);
- debug_latency = compute_debug_latency (app);
- check_data_transfer (app, PAGE_SIZE);
- OCL_CHECK_ERROR (clEnqueueReadBuffer (app->queue, app->check_buffer, CL_TRUE,
- 0, CHECK_BUFFER_SIZE, check, 0, NULL, NULL));
- debug_assert_cmp ("Data check", check[0], 1);
- printf ("\n%-16s %i\n", "Kernel count", check[1]);
- printf ("%-16s %f us\n", "Wall time", host_latency * 1000.0 * 1000.0);
- printf ("%-16s %f us\n", "FPGA [counter]", ((counter << 8) * 4) / 1000.0);
- printf ("%-16s %f us\n", "FPGA [debug]", debug_latency);
- printf ("%-16s %.2f MB/s\n", "Throughput", GPU_BUFFER_SIZE / host_latency / 1024. / 1024.);
- }
- static void
- measure_fpga_to_gpu_latency_with_cpu (App *app)
- {
- uint32_t hardware_ptr;
- uint32_t counter;
- unsigned flag_index;
- struct timespec start;
- struct timespec end;
- double host_latency;
- double debug_latency;
- printf ("\n** FPGA to GPU latency [CPU]\n\n");
- fill_gpu_buffer (app);
- hardware_ptr = 0;
- flag_index = app->board_gen == 3 ? 2 : 4;
- configure_dma (app);
- configure_dma_descriptors (app);
- setup_counter (app);
- WR64_sleep (REG_DESCRIPTOR_ADDRESS, app->gpu.addr.surface_bus_address);
- reset_dbg (app);
- clock_gettime (CLOCK_MONOTONIC, &start);
- WR32 (REG_DMA, 1);
- do {
- hardware_ptr = app->desc[flag_index];
- }
- while (hardware_ptr != (app->gpu.addr.surface_bus_address & 0xFFFFFFFF));
- clock_gettime (CLOCK_MONOTONIC, &end);
- stop_dma (app);
- counter = RD32 (REG_PERF_COUNTER);
- host_latency = elapsed_seconds (&start, &end);
- debug_latency = compute_debug_latency (app);
- check_data_transfer (app, PAGE_SIZE);
- printf ("\n%-16s %f us\n", "Wall time", host_latency * 1000.0 * 1000.0);
- printf ("%-16s %f us\n", "FPGA [counter]", ((counter << 8) * 4) / 1000.0);
- printf ("%-16s %f us\n", "FPGA [debug]", debug_latency);
- printf ("%-16s %.2f MB/s\n", "Throughput", CPU_BUFFER_SIZE / host_latency / 1024. / 1024.);
- }
- static void
- measure_fpga_to_cpu_latency (App *app)
- {
- uint32_t hardware_ptr;
- uint32_t counter;
- unsigned flag_index;
- struct timespec start;
- struct timespec end;
- double host_latency;
- double debug_latency;
- printf ("\n** FPGA to CPU latency\n\n");
- hardware_ptr = 0;
- flag_index = app->board_gen == 3 ? 2 : 4;
- configure_dma (app);
- configure_dma_descriptors (app);
- setup_counter (app);
- WR64_sleep (REG_DESCRIPTOR_ADDRESS, app->cpu.addr);
- reset_dbg (app);
- clock_gettime (CLOCK_MONOTONIC, &start);
- WR32 (REG_DMA, 1);
- do {
- hardware_ptr = app->desc[flag_index];
- }
- while (hardware_ptr != app->cpu.addr);
- clock_gettime (CLOCK_MONOTONIC, &end);
- stop_dma (app);
- // Data back to FPGA
- memcpy(app->bar + 0x9400, app->cpu.buffer, CPU_BUFFER_SIZE);
- counter = RD32 (REG_PERF_COUNTER);
- host_latency = elapsed_seconds (&start, &end);
- debug_latency = compute_debug_latency (app);
- check_data_transfer_cpu (app, PAGE_SIZE);
- printf ("\n%-16s %f us\n", "Wall time", host_latency * 1000.0 * 1000.0);
- printf ("%-16s %f us\n", "FPGA [counter]", ((counter << 8) * 4) / 1000.0);
- printf ("%-16s %f us\n", "FPGA [debug]", debug_latency);
- printf ("%-16s %.2f MB/s\n", "Throughput", CPU_BUFFER_SIZE / host_latency / 1024. / 1024.);
- }
- static void
- measure_fpga_to_gpu_throughput (App *app)
- {
- uint32_t current_ptr;
- uint32_t hardware_ptr;
- unsigned flag_index;
- size_t transferred;
- cl_ulong addr;
- double elapsed;
- struct timespec start;
- struct timespec end;
- printf ("\n** FPGA to GPU throughput\n\n");
- configure_dma (app);
- configure_dma_descriptors (app);
- setup_counter (app);
- current_ptr = 0;
- hardware_ptr = 0;
- transferred = 0;
- addr = app->gpu.addr.surface_bus_address;
- flag_index = app->board_gen == 3 ? 2 : 4;
- WR64_sleep (REG_DESCRIPTOR_ADDRESS, addr);
- clock_gettime (CLOCK_MONOTONIC, &start);
- WR32 (REG_DMA, 1);
- do {
- do {
- hardware_ptr = app->desc[flag_index];
- }
- while (hardware_ptr == current_ptr); /* it should work to check against `addr` but it never does */
- addr += NUM_PAGES * PAGE_SIZE;
- transferred += NUM_PAGES * PAGE_SIZE;
- current_ptr = hardware_ptr;
- WR64 (REG_DESCRIPTOR_ADDRESS, addr);
- }
- while (transferred < GPU_BUFFER_SIZE);
- clock_gettime (CLOCK_MONOTONIC, &end);
- stop_dma (app);
- check_data_transfer (app, GPU_BUFFER_SIZE);
- elapsed = elapsed_seconds (&start, &end);
- printf ("\n%-16s %.2f MB/s [%3.5f us]\n", "Host", GPU_BUFFER_SIZE / elapsed / 1024. / 1024., elapsed * 1000 * 1000);
- }
- static void
- measure_gpu_to_fpga_latency (App *app)
- {
- cl_event event;
- cl_mem src_buffer;
- cl_int err;
- uint32_t pattern;
- struct timespec start;
- struct timespec end;
- printf ("\n** GPU to FPGA latency\n\n");
- src_buffer = clCreateBuffer (app->context, CL_MEM_READ_ONLY, PAGE_SIZE, NULL, &err);
- OCL_CHECK_ERROR (err);
- pattern = 0xdeadf00d;
- OCL_CHECK_ERROR (clEnqueueFillBuffer (app->queue, src_buffer, &pattern, sizeof (pattern), 0, PAGE_SIZE, 0, NULL, &event));
- OCL_CHECK_ERROR (clWaitForEvents (1, &event));
- OCL_CHECK_ERROR (clReleaseEvent (event));
- clock_gettime (CLOCK_MONOTONIC, &start);
- OCL_CHECK_ERROR (clEnqueueCopyBuffer (app->queue, src_buffer, app->fpga_buffer, 0, 0x9400, PAGE_SIZE, 0, NULL, &event));
- OCL_CHECK_ERROR (clWaitForEvents (1, &event));
- clock_gettime (CLOCK_MONOTONIC, &end);
- printf ("%-16s %f us\n", "Wall time", elapsed_seconds (&start, &end) * 1000 * 1000);
- printf ("%-16s %f us\n", "GPU time", elapsed_gpu_seconds (event) * 1000 * 1000);
- OCL_CHECK_ERROR (clReleaseEvent (event));
- OCL_CHECK_ERROR (clReleaseMemObject (src_buffer));
- }
- int
- main (int argc, char const* argv[])
- {
- App app;
- if (argc > 1) {
- PAGE_SIZE = GPU_BUFFER_SIZE = atoi (argv[1]);
- }
- if (argc > 2) {
- NUM_PAGES = atoi (argv[2]);
- }
- printf ("** Parameters\n\n");
- printf ("%-16s %u\n", "Pages", NUM_PAGES);
- printf ("%-16s %zu B\n", "Buffer size", GPU_BUFFER_SIZE);
- if (!init_pcilib (&app))
- return 1;
- if (!init_opencl (&app))
- return 1;
- measure_fpga_to_gpu_latency_with_marker (&app);
- measure_fpga_to_gpu_latency_with_kernel (&app);
- measure_fpga_to_gpu_latency_with_cpu (&app);
- measure_fpga_to_cpu_latency (&app);
- measure_gpu_to_fpga_latency (&app);
- measure_fpga_to_gpu_throughput (&app);
- close_opencl (&app);
- close_pcilib (&app);
- return 0;
- }
|