check.c 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769
  1. #define _XOPEN_SOURCE 500
  2. #include <stdio.h>
  3. #include <stdlib.h>
  4. #include <stdbool.h>
  5. #include <string.h>
  6. #include <unistd.h>
  7. #include <time.h>
  8. #include <pcilib.h>
  9. #include <pcilib/bar.h>
  10. #include <pcilib/kmem.h>
  11. #include <CL/cl.h>
  12. #include <CL/cl_ext.h>
  13. #include "ocl.h"
  14. /* this should actually come from the distributed pcitool sources */
  15. #include "pciDriver.h"
  16. typedef struct {
  17. /* pcilib */
  18. pcilib_t *pci;
  19. uint8_t *bar;
  20. cl_ulong bar_phys;
  21. uint8_t board_gen;
  22. pcilib_kmem_handle_t *kdesc;
  23. uintptr_t kdesc_bus;
  24. volatile uint32_t *desc;
  25. /* OpenCL */
  26. OclPlatform *ocl;
  27. cl_device_id device;
  28. cl_command_queue queue;
  29. cl_context context;
  30. cl_program program;
  31. cl_kernel kernel;
  32. cl_mem check_buffer;
  33. /* both */
  34. cl_mem fpga_buffer;
  35. struct {
  36. cl_mem buffer;
  37. cl_bus_address_amd addr;
  38. } gpu;
  39. struct {
  40. pcilib_kmem_handle_t *kmem;
  41. uint32_t *buffer;
  42. uintptr_t addr;
  43. } cpu;
  44. } App;
  45. #define UNICODE_CHECK_MARK "o"
  46. #define UNICODE_CROSS "x"
  47. #define KMEM_DEFAULT_FLAGS PCILIB_KMEM_FLAG_HARDWARE | \
  48. PCILIB_KMEM_FLAG_PERSISTENT | \
  49. PCILIB_KMEM_FLAG_EXCLUSIVE
  50. #define KMEM_USE_RING PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 1)
  51. #define KMEM_USE_DEFAULT PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 2)
  52. #define REG_RESET_DMA 0x00
  53. #define REG_DMA 0x04
  54. #define REG_NUM_PACKETS_PER_DESCRIPTOR 0x10
  55. #define REG_PERF_COUNTER 0x28
  56. #define REG_PACKET_LENGTH 0x0C
  57. #define REG_DESCRIPTOR_ADDRESS 0x50
  58. #define REG_UPDATE_ADDRESS 0x58
  59. #define REG_UPDATE_THRESHOLD 0x60
  60. #define REG_COUNTER 0x9000
  61. #define REG_VERSION 0x9020
  62. #define REG_CONTROL 0x9040
  63. #define REG_LATENCY 0x9044
  64. #define REG_NUM_ROWS 0x9168
  65. #define REG_NUM_FRAMES 0x9170
  66. #define REG_DBG_RQ_RST 0x9344
  67. #define REG_DBG_CQ_RST 0x93A4
  68. #define CMD_DMA_START 0x1
  69. #define CMD_DMA_STOP 0x0
  70. #define CMD_COUNTER_RESET 0xf0
  71. #define CMD_COUNTER_START 0x1
  72. #define CMD_COUNTER_STOP 0x0
  73. #define CMD_LATENCY_RESET 0x0800000f
  74. #define CMD_LATENCY_STOP 0xf0000000
  75. #define WR32(addr, value) *(uint32_t *) (app->bar + (addr)) = (value);
  76. #define RD32(addr) (*(uint32_t *) (app->bar + (addr)))
  77. #define WR32_sleep(addr, value) *(uint32_t *) (app->bar + (addr)) = (value); usleep (100);
  78. #define WR64(addr, value) *(uint64_t *) (app->bar + (addr)) = (value);
  79. #define RD64(addr) (*(uint64_t *) (app->bar + (addr)))
  80. #define WR64_sleep(addr, value) *(uint64_t *) (app->bar + (addr)) = (value); usleep (100);
  81. static clEnqueueMakeBuffersResidentAMD_fn clEnqueueMakeBuffersResidentAMD = NULL;
  82. static clEnqueueWaitSignalAMD_fn clEnqueueWaitSignalAMD = NULL;
  83. static const size_t TLP_SIZE = 64;
  84. static const size_t FPGA_BUFFER_SIZE = 1024 * 64;
  85. static uint32_t PAGE_SIZE = 4096;
  86. static uint32_t NUM_PAGES = 1;
  87. static size_t GPU_BUFFER_SIZE = 4096;
  88. static size_t CPU_BUFFER_SIZE = 4096;
  89. static size_t CHECK_BUFFER_SIZE = 8;
  90. /* declaration should actually come from a distributed header file */
  91. const pcilib_board_info_t *pcilib_get_board_info (pcilib_t *);
  92. static bool
  93. init_pcilib (App *app)
  94. {
  95. static const char *DEVICE = "/dev/fpga0";
  96. const pcilib_board_info_t *board;
  97. app->pci = pcilib_open (DEVICE, "pci");
  98. if (app->pci == NULL) {
  99. printf ("Could not open `%s'", DEVICE);
  100. return false;
  101. }
  102. app->bar = pcilib_map_bar (app->pci, PCILIB_BAR0);
  103. if (app->bar == NULL) {
  104. printf ("Unable to map BAR\n");
  105. pcilib_close (app->pci);
  106. return false;
  107. }
  108. board = pcilib_get_board_info (app->pci);
  109. app->bar_phys = board->bar_start[PCILIB_BAR0];
  110. app->board_gen = RD32 (0x18) & 0xF;
  111. app->kdesc = pcilib_alloc_kernel_memory (app->pci,
  112. PCILIB_KMEM_TYPE_CONSISTENT, 1, 128,
  113. 4096, KMEM_USE_RING, KMEM_DEFAULT_FLAGS);
  114. app->kdesc_bus = pcilib_kmem_get_block_ba (app->pci, app->kdesc, 0);
  115. app->desc = (uint32_t *) pcilib_kmem_get_block_ua (app->pci, app->kdesc, 0);
  116. memset ((uint32_t *) app->desc, 0, 5 * sizeof (uint32_t));
  117. printf ("%-16s 0x%X\n", "Firmware", RD32 (REG_VERSION));
  118. return true;
  119. }
  120. static void
  121. close_pcilib (App *app)
  122. {
  123. pcilib_kmem_flags_t flags = PCILIB_KMEM_FLAG_HARDWARE | PCILIB_KMEM_FLAG_PERSISTENT | PCILIB_KMEM_FLAG_EXCLUSIVE;
  124. pcilib_free_kernel_memory (app->pci, app->kdesc, KMEM_DEFAULT_FLAGS);
  125. pcilib_free_kernel_memory (app->pci, app->cpu.kmem, KMEM_DEFAULT_FLAGS);
  126. pcilib_clean_kernel_memory (app->pci, KMEM_USE_DEFAULT, flags);
  127. pcilib_unmap_bar (app->pci, PCILIB_BAR0, (void *) app->bar);
  128. pcilib_close (app->pci);
  129. }
  130. static cl_int
  131. create_fpga_buffer (App *app, size_t size)
  132. {
  133. cl_mem_flags flags;
  134. cl_bus_address_amd addr;
  135. cl_int error;
  136. flags = CL_MEM_EXTERNAL_PHYSICAL_AMD;
  137. addr.surface_bus_address = (cl_ulong) app->bar_phys;
  138. addr.marker_bus_address = (cl_ulong) app->bar_phys;
  139. app->fpga_buffer = clCreateBuffer (app->context, flags, size, &addr, &error);
  140. return error;
  141. }
  142. static cl_int
  143. create_gpu_buffer (App *app, size_t size)
  144. {
  145. cl_mem_flags flags;
  146. cl_int error;
  147. char *data;
  148. data = malloc (size);
  149. memset (data, 42, size);
  150. flags = CL_MEM_BUS_ADDRESSABLE_AMD | CL_MEM_COPY_HOST_PTR;
  151. app->gpu.buffer = clCreateBuffer (app->context, flags, size, data, &error);
  152. if (error != CL_SUCCESS)
  153. return error;
  154. return clEnqueueMakeBuffersResidentAMD (app->queue, 1, &app->gpu.buffer, CL_TRUE, &app->gpu.addr, 0, NULL, NULL);
  155. }
  156. static cl_int
  157. create_cpu_buffer (App *app, size_t size)
  158. {
  159. pcilib_kmem_flags_t flags = PCILIB_KMEM_FLAG_HARDWARE | PCILIB_KMEM_FLAG_PERSISTENT | PCILIB_KMEM_FLAG_EXCLUSIVE;
  160. app->cpu.kmem = pcilib_alloc_kernel_memory (app->pci,
  161. /*PCILIB_KMEM_TYPE_DMA_C2S_PAGE*/ PCILIB_KMEM_TYPE_CONSISTENT,
  162. 1, size, CPU_BUFFER_SIZE, KMEM_USE_DEFAULT, flags);
  163. app->cpu.addr = pcilib_kmem_get_block_ba(app->pci, app->cpu.kmem, 0);
  164. app->cpu.buffer = (uint32_t*) pcilib_kmem_get_block_ua(app->pci, app->cpu.kmem, 0);
  165. memset (app->cpu.buffer, 42, CPU_BUFFER_SIZE);
  166. return 0;
  167. }
  168. static bool
  169. init_opencl (App *app)
  170. {
  171. cl_int error;
  172. cl_platform_id platform;
  173. app->ocl = ocl_new_with_queues (0, CL_DEVICE_TYPE_GPU, CL_QUEUE_PROFILING_ENABLE);
  174. platform = ocl_get_platform (app->ocl);
  175. clEnqueueMakeBuffersResidentAMD = clGetExtensionFunctionAddressForPlatform (platform, "clEnqueueMakeBuffersResidentAMD");
  176. clEnqueueWaitSignalAMD = clGetExtensionFunctionAddressForPlatform (platform, "clEnqueueWaitSignalAMD");
  177. app->device = ocl_get_devices (app->ocl)[0];
  178. app->queue = ocl_get_cmd_queues (app->ocl)[0];
  179. app->context = ocl_get_context (app->ocl);
  180. app->program = ocl_create_program_from_file (app->ocl, "kernel.cl", NULL, &error);
  181. OCL_CHECK_ERROR (error);
  182. app->kernel = clCreateKernel (app->program, "wait_and_write", &error);
  183. OCL_CHECK_ERROR (error);
  184. app->check_buffer = clCreateBuffer (app->context, CL_MEM_WRITE_ONLY, CHECK_BUFFER_SIZE, NULL, &error);
  185. OCL_CHECK_ERROR (error);
  186. error |= create_fpga_buffer (app, FPGA_BUFFER_SIZE);
  187. OCL_CHECK_ERROR (error);
  188. error |= create_gpu_buffer (app, GPU_BUFFER_SIZE);
  189. OCL_CHECK_ERROR (error);
  190. error |= create_cpu_buffer (app, CPU_BUFFER_SIZE);
  191. OCL_CHECK_ERROR (error);
  192. return error != CL_SUCCESS ? false : true;
  193. }
  194. static void
  195. close_opencl (App *app)
  196. {
  197. OCL_CHECK_ERROR (clReleaseKernel (app->kernel));
  198. OCL_CHECK_ERROR (clReleaseProgram (app->program));
  199. OCL_CHECK_ERROR (clReleaseMemObject (app->fpga_buffer));
  200. OCL_CHECK_ERROR (clReleaseMemObject (app->check_buffer));
  201. ocl_free (app->ocl);
  202. }
  203. static void
  204. debug_assert (const char *message, bool condition)
  205. {
  206. printf ("%-16s ", message);
  207. if (condition)
  208. printf (UNICODE_CHECK_MARK"\n");
  209. else
  210. printf (UNICODE_CROSS"\n");
  211. }
  212. static void
  213. debug_assert_cmp (const char *message, uint32_t value, uint32_t expected)
  214. {
  215. printf ("%-16s ", message);
  216. if (value != expected)
  217. printf (UNICODE_CROSS" [%x (%i) != %x (%i)]\n", value, value, expected, expected);
  218. else
  219. printf (UNICODE_CHECK_MARK"\n");
  220. }
  221. #if 0
  222. static void
  223. check_value (App *app, const char *message, uint32_t addr, uint32_t expected)
  224. {
  225. uint32_t value;
  226. value = RD32 (addr);
  227. debug_assert_cmp (message, value, expected);
  228. }
  229. #endif
  230. static void
  231. configure_dma (App *app)
  232. {
  233. uint32_t value;
  234. WR32 (REG_RESET_DMA, 1);
  235. usleep (100000);
  236. WR32 (REG_RESET_DMA, 0);
  237. usleep (100000);
  238. value = RD32 (REG_RESET_DMA);
  239. debug_assert ("PCIe check", value == 335746816 || value == 335681280);
  240. WR32 (REG_NUM_PACKETS_PER_DESCRIPTOR, NUM_PAGES * PAGE_SIZE / (4 * TLP_SIZE));
  241. if (app->board_gen == 3) {
  242. WR32 (REG_PACKET_LENGTH, 0x80000 | TLP_SIZE);
  243. }
  244. else {
  245. WR32 (REG_PACKET_LENGTH, TLP_SIZE);
  246. }
  247. /* reset host side addr */
  248. app->desc[app->board_gen == 3 ? 2 : 4] = 0;
  249. }
  250. static void
  251. configure_dma_descriptors (App *app)
  252. {
  253. WR32 (REG_UPDATE_THRESHOLD, 0);
  254. WR64 (REG_UPDATE_ADDRESS, app->kdesc_bus);
  255. usleep (100000);
  256. }
  257. static void
  258. setup_counter (App *app)
  259. {
  260. WR32_sleep (REG_NUM_ROWS, 0);
  261. WR32_sleep (REG_NUM_FRAMES, 0);
  262. WR32_sleep (REG_CONTROL, CMD_LATENCY_RESET);
  263. WR32_sleep (REG_CONTROL, CMD_COUNTER_STOP);
  264. WR32_sleep (REG_COUNTER, CMD_COUNTER_RESET);
  265. WR32_sleep (REG_COUNTER, CMD_COUNTER_START);
  266. }
  267. static void
  268. stop_dma (App *app)
  269. {
  270. WR32_sleep (REG_DMA, 0);
  271. }
  272. static void
  273. reset_dbg (App *app)
  274. {
  275. // TX
  276. WR32_sleep (REG_DBG_RQ_RST, 1);
  277. WR32_sleep (REG_DBG_RQ_RST, 0);
  278. // RX
  279. WR32_sleep (REG_DBG_CQ_RST, 1);
  280. WR32_sleep (REG_DBG_CQ_RST, 0);
  281. }
  282. static double
  283. compute_debug_latency (App *app)
  284. {
  285. uint64_t start, start_high, start_low;
  286. uint64_t end, end_high, end_low;
  287. WR32_sleep (0x93A0, 0);
  288. start_high = RD32 (0x93B8);
  289. start_low = RD32 (0x93BC);
  290. WR32_sleep (0x93A0,1)
  291. end_high = RD32 (0x93B8);
  292. end_low = RD32 (0x93BC);
  293. start = (start_high << 32) | start_low;
  294. end = (end_high << 32) | end_low;
  295. return ((end - start) * 4) / 1000.0;
  296. }
  297. static double
  298. elapsed_seconds (struct timespec *start, struct timespec *end)
  299. {
  300. return (end->tv_sec + end->tv_nsec / 1000000000.0) - (start->tv_sec + start->tv_nsec / 1000000000.0);
  301. }
  302. static double
  303. elapsed_gpu_seconds (cl_event event)
  304. {
  305. cl_ulong start;
  306. cl_ulong end;
  307. OCL_CHECK_ERROR (clGetEventProfilingInfo (event, CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &start, NULL));
  308. OCL_CHECK_ERROR (clGetEventProfilingInfo (event, CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &end, NULL));
  309. return (end - start) / 1000.0 / 1000.0 / 1000.0;
  310. }
  311. static void
  312. check_data_transfer (App *app, size_t check_size)
  313. {
  314. uint32_t *data;
  315. data = malloc (check_size);
  316. memset (data, 0, check_size);
  317. OCL_CHECK_ERROR (clEnqueueReadBuffer (app->queue, app->gpu.buffer, CL_TRUE, 0, check_size, data, 0, NULL, NULL));
  318. for (uint32_t i = 0; i < check_size / 4; i++) {
  319. if (data[i] != i) {
  320. debug_assert_cmp ("FPGA->GPU write", data[i], i);
  321. goto finish_check_data_transfer;
  322. }
  323. }
  324. debug_assert ("FPGA->GPU write", true);
  325. finish_check_data_transfer:
  326. free (data);
  327. }
  328. static void
  329. check_data_transfer_cpu (App *app, size_t check_size)
  330. {
  331. for (uint32_t i = 0; i < check_size / 4; i++) {
  332. if (app->cpu.buffer[i] != i) {
  333. debug_assert_cmp ("FPGA->CPU write", app->cpu.buffer[i], i);
  334. return;
  335. }
  336. }
  337. debug_assert ("FPGA->CPU write", true);
  338. }
  339. static void
  340. fill_gpu_buffer (App *app)
  341. {
  342. uint32_t pattern;
  343. cl_event event;
  344. pattern = 42;
  345. OCL_CHECK_ERROR (clEnqueueFillBuffer (app->queue, app->gpu.buffer, &pattern, sizeof (pattern), 0, GPU_BUFFER_SIZE / 4, 0, NULL, &event));
  346. OCL_CHECK_ERROR (clWaitForEvents (1, &event));
  347. OCL_CHECK_ERROR (clReleaseEvent (event));
  348. }
  349. static void
  350. measure_fpga_to_gpu_latency_with_marker (App *app)
  351. {
  352. uint32_t counter;
  353. struct timespec start;
  354. struct timespec end;
  355. double host_latency;
  356. double debug_latency;
  357. cl_event event;
  358. printf ("\n** FPGA to GPU latency [marker]\n\n");
  359. fill_gpu_buffer (app);
  360. configure_dma (app);
  361. WR32 (REG_UPDATE_THRESHOLD, 0);
  362. WR64_sleep (REG_UPDATE_ADDRESS, app->gpu.addr.marker_bus_address);
  363. setup_counter (app);
  364. WR64_sleep (REG_DESCRIPTOR_ADDRESS, app->gpu.addr.surface_bus_address);
  365. reset_dbg (app);
  366. OCL_CHECK_ERROR (clEnqueueWaitSignalAMD (app->queue, app->gpu.buffer, 0xd0dad0da, 0, NULL, &event));
  367. clock_gettime (CLOCK_MONOTONIC, &start);
  368. WR32 (REG_DMA, 1);
  369. OCL_CHECK_ERROR (clWaitForEvents (1, &event));
  370. clock_gettime (CLOCK_MONOTONIC, &end);
  371. stop_dma (app);
  372. OCL_CHECK_ERROR (clReleaseEvent (event));
  373. counter = RD32 (REG_PERF_COUNTER);
  374. host_latency = elapsed_seconds (&start, &end);
  375. debug_latency = compute_debug_latency (app);
  376. check_data_transfer (app, PAGE_SIZE);
  377. printf ("\n%-16s %f us\n", "Wall time", host_latency * 1000.0 * 1000.0);
  378. printf ("%-16s %f us\n", "FPGA [counter]", ((counter << 8) * 4) / 1000.0);
  379. printf ("%-16s %f us\n", "FPGA [debug]", debug_latency);
  380. printf ("%-16s %.2f MB/s\n", "Throughput", GPU_BUFFER_SIZE / host_latency / 1024. / 1024.);
  381. }
  382. static void
  383. measure_fpga_to_gpu_latency_with_kernel (App *app)
  384. {
  385. uint32_t counter;
  386. struct timespec start;
  387. struct timespec end;
  388. double host_latency;
  389. double debug_latency;
  390. cl_event event;
  391. uint32_t check[CHECK_BUFFER_SIZE / 4];
  392. size_t work_size = 1;
  393. printf ("\n** FPGA to GPU latency [kernel]\n\n");
  394. fill_gpu_buffer (app);
  395. configure_dma (app);
  396. configure_dma_descriptors (app);
  397. setup_counter (app);
  398. WR64_sleep (REG_DESCRIPTOR_ADDRESS, app->gpu.addr.surface_bus_address);
  399. reset_dbg (app);
  400. OCL_CHECK_ERROR (clSetKernelArg (app->kernel, 0, sizeof (cl_mem), &app->gpu.buffer));
  401. OCL_CHECK_ERROR (clSetKernelArg (app->kernel, 1, sizeof (cl_mem), &app->fpga_buffer));
  402. OCL_CHECK_ERROR (clSetKernelArg (app->kernel, 2, sizeof (cl_mem), &app->check_buffer));
  403. OCL_CHECK_ERROR (clEnqueueNDRangeKernel (app->queue, app->kernel, 1,
  404. NULL, &work_size, NULL,
  405. 0, NULL, &event));
  406. WR32 (REG_DMA, 1);
  407. clock_gettime (CLOCK_MONOTONIC, &start);
  408. clWaitForEvents (1, &event);
  409. clock_gettime (CLOCK_MONOTONIC, &end);
  410. OCL_CHECK_ERROR (clReleaseEvent (event));
  411. counter = RD32 (REG_PERF_COUNTER);
  412. host_latency = elapsed_seconds (&start, &end);
  413. debug_latency = compute_debug_latency (app);
  414. check_data_transfer (app, PAGE_SIZE);
  415. OCL_CHECK_ERROR (clEnqueueReadBuffer (app->queue, app->check_buffer, CL_TRUE,
  416. 0, CHECK_BUFFER_SIZE, check, 0, NULL, NULL));
  417. debug_assert_cmp ("Data check", check[0], 1);
  418. printf ("\n%-16s %i\n", "Kernel count", check[1]);
  419. printf ("%-16s %f us\n", "Wall time", host_latency * 1000.0 * 1000.0);
  420. printf ("%-16s %f us\n", "FPGA [counter]", ((counter << 8) * 4) / 1000.0);
  421. printf ("%-16s %f us\n", "FPGA [debug]", debug_latency);
  422. printf ("%-16s %.2f MB/s\n", "Throughput", GPU_BUFFER_SIZE / host_latency / 1024. / 1024.);
  423. }
  424. static void
  425. measure_fpga_to_gpu_latency_with_cpu (App *app)
  426. {
  427. uint32_t hardware_ptr;
  428. uint32_t counter;
  429. unsigned flag_index;
  430. struct timespec start;
  431. struct timespec end;
  432. double host_latency;
  433. double debug_latency;
  434. printf ("\n** FPGA to GPU latency [CPU]\n\n");
  435. fill_gpu_buffer (app);
  436. hardware_ptr = 0;
  437. flag_index = app->board_gen == 3 ? 2 : 4;
  438. configure_dma (app);
  439. configure_dma_descriptors (app);
  440. setup_counter (app);
  441. WR64_sleep (REG_DESCRIPTOR_ADDRESS, app->gpu.addr.surface_bus_address);
  442. reset_dbg (app);
  443. clock_gettime (CLOCK_MONOTONIC, &start);
  444. WR32 (REG_DMA, 1);
  445. do {
  446. hardware_ptr = app->desc[flag_index];
  447. }
  448. while (hardware_ptr != (app->gpu.addr.surface_bus_address & 0xFFFFFFFF));
  449. clock_gettime (CLOCK_MONOTONIC, &end);
  450. stop_dma (app);
  451. counter = RD32 (REG_PERF_COUNTER);
  452. host_latency = elapsed_seconds (&start, &end);
  453. debug_latency = compute_debug_latency (app);
  454. check_data_transfer (app, PAGE_SIZE);
  455. printf ("\n%-16s %f us\n", "Wall time", host_latency * 1000.0 * 1000.0);
  456. printf ("%-16s %f us\n", "FPGA [counter]", ((counter << 8) * 4) / 1000.0);
  457. printf ("%-16s %f us\n", "FPGA [debug]", debug_latency);
  458. printf ("%-16s %.2f MB/s\n", "Throughput", CPU_BUFFER_SIZE / host_latency / 1024. / 1024.);
  459. }
  460. static void
  461. measure_fpga_to_cpu_latency (App *app)
  462. {
  463. uint32_t hardware_ptr;
  464. uint32_t counter;
  465. unsigned flag_index;
  466. struct timespec start;
  467. struct timespec end;
  468. double host_latency;
  469. double debug_latency;
  470. printf ("\n** FPGA to CPU latency\n\n");
  471. hardware_ptr = 0;
  472. flag_index = app->board_gen == 3 ? 2 : 4;
  473. configure_dma (app);
  474. configure_dma_descriptors (app);
  475. setup_counter (app);
  476. WR64_sleep (REG_DESCRIPTOR_ADDRESS, app->cpu.addr);
  477. reset_dbg (app);
  478. clock_gettime (CLOCK_MONOTONIC, &start);
  479. WR32 (REG_DMA, 1);
  480. do {
  481. hardware_ptr = app->desc[flag_index];
  482. }
  483. while (hardware_ptr != app->cpu.addr);
  484. clock_gettime (CLOCK_MONOTONIC, &end);
  485. stop_dma (app);
  486. // Data back to FPGA
  487. memcpy(app->bar + 0x9400, app->cpu.buffer, CPU_BUFFER_SIZE);
  488. counter = RD32 (REG_PERF_COUNTER);
  489. host_latency = elapsed_seconds (&start, &end);
  490. debug_latency = compute_debug_latency (app);
  491. check_data_transfer_cpu (app, PAGE_SIZE);
  492. printf ("\n%-16s %f us\n", "Wall time", host_latency * 1000.0 * 1000.0);
  493. printf ("%-16s %f us\n", "FPGA [counter]", ((counter << 8) * 4) / 1000.0);
  494. printf ("%-16s %f us\n", "FPGA [debug]", debug_latency);
  495. printf ("%-16s %.2f MB/s\n", "Throughput", CPU_BUFFER_SIZE / host_latency / 1024. / 1024.);
  496. }
  497. static void
  498. measure_fpga_to_gpu_throughput (App *app)
  499. {
  500. uint32_t current_ptr;
  501. uint32_t hardware_ptr;
  502. unsigned flag_index;
  503. size_t transferred;
  504. cl_ulong addr;
  505. double elapsed;
  506. struct timespec start;
  507. struct timespec end;
  508. printf ("\n** FPGA to GPU throughput\n\n");
  509. configure_dma (app);
  510. configure_dma_descriptors (app);
  511. setup_counter (app);
  512. current_ptr = 0;
  513. hardware_ptr = 0;
  514. transferred = 0;
  515. addr = app->gpu.addr.surface_bus_address;
  516. flag_index = app->board_gen == 3 ? 2 : 4;
  517. WR64_sleep (REG_DESCRIPTOR_ADDRESS, addr);
  518. clock_gettime (CLOCK_MONOTONIC, &start);
  519. WR32 (REG_DMA, 1);
  520. do {
  521. do {
  522. hardware_ptr = app->desc[flag_index];
  523. }
  524. while (hardware_ptr == current_ptr); /* it should work to check against `addr` but it never does */
  525. addr += NUM_PAGES * PAGE_SIZE;
  526. transferred += NUM_PAGES * PAGE_SIZE;
  527. current_ptr = hardware_ptr;
  528. WR64 (REG_DESCRIPTOR_ADDRESS, addr);
  529. }
  530. while (transferred < GPU_BUFFER_SIZE);
  531. clock_gettime (CLOCK_MONOTONIC, &end);
  532. stop_dma (app);
  533. check_data_transfer (app, GPU_BUFFER_SIZE);
  534. elapsed = elapsed_seconds (&start, &end);
  535. printf ("\n%-16s %.2f MB/s [%3.5f us]\n", "Host", GPU_BUFFER_SIZE / elapsed / 1024. / 1024., elapsed * 1000 * 1000);
  536. }
  537. static void
  538. measure_gpu_to_fpga_latency (App *app)
  539. {
  540. cl_event event;
  541. cl_mem src_buffer;
  542. cl_int err;
  543. uint32_t pattern;
  544. struct timespec start;
  545. struct timespec end;
  546. printf ("\n** GPU to FPGA latency\n\n");
  547. src_buffer = clCreateBuffer (app->context, CL_MEM_READ_ONLY, PAGE_SIZE, NULL, &err);
  548. OCL_CHECK_ERROR (err);
  549. pattern = 0xdeadf00d;
  550. OCL_CHECK_ERROR (clEnqueueFillBuffer (app->queue, src_buffer, &pattern, sizeof (pattern), 0, PAGE_SIZE, 0, NULL, &event));
  551. OCL_CHECK_ERROR (clWaitForEvents (1, &event));
  552. OCL_CHECK_ERROR (clReleaseEvent (event));
  553. clock_gettime (CLOCK_MONOTONIC, &start);
  554. OCL_CHECK_ERROR (clEnqueueCopyBuffer (app->queue, src_buffer, app->fpga_buffer, 0, 0x9400, PAGE_SIZE, 0, NULL, &event));
  555. OCL_CHECK_ERROR (clWaitForEvents (1, &event));
  556. clock_gettime (CLOCK_MONOTONIC, &end);
  557. printf ("%-16s %f us\n", "Wall time", elapsed_seconds (&start, &end) * 1000 * 1000);
  558. printf ("%-16s %f us\n", "GPU time", elapsed_gpu_seconds (event) * 1000 * 1000);
  559. OCL_CHECK_ERROR (clReleaseEvent (event));
  560. OCL_CHECK_ERROR (clReleaseMemObject (src_buffer));
  561. }
  562. int
  563. main (int argc, char const* argv[])
  564. {
  565. App app;
  566. if (argc > 1) {
  567. PAGE_SIZE = GPU_BUFFER_SIZE = atoi (argv[1]);
  568. }
  569. if (argc > 2) {
  570. NUM_PAGES = atoi (argv[2]);
  571. }
  572. printf ("** Parameters\n\n");
  573. printf ("%-16s %u\n", "Pages", NUM_PAGES);
  574. printf ("%-16s %zu B\n", "Buffer size", GPU_BUFFER_SIZE);
  575. if (!init_pcilib (&app))
  576. return 1;
  577. if (!init_opencl (&app))
  578. return 1;
  579. measure_fpga_to_gpu_latency_with_marker (&app);
  580. measure_fpga_to_gpu_latency_with_kernel (&app);
  581. measure_fpga_to_gpu_latency_with_cpu (&app);
  582. measure_fpga_to_cpu_latency (&app);
  583. measure_gpu_to_fpga_latency (&app);
  584. measure_fpga_to_gpu_throughput (&app);
  585. close_opencl (&app);
  586. close_pcilib (&app);
  587. return 0;
  588. }