check.c 27 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946
  1. #define _XOPEN_SOURCE 500
  2. #include <stdio.h>
  3. #include <stdlib.h>
  4. #include <stdbool.h>
  5. #include <string.h>
  6. #include <unistd.h>
  7. #include <time.h>
  8. #include <pcilib.h>
  9. #include <pcilib/bar.h>
  10. #include <pcilib/kmem.h>
  11. #include <CL/cl.h>
  12. #include <CL/cl_ext.h>
  13. #include "ocl.h"
  14. #include "hf-interface.h"
  15. /* this should actually come from the distributed pcitool sources */
  16. #include "pciDriver.h"
/*
 * Global application state shared by every benchmark routine.
 * Filled in by init_pcilib() / init_opencl(), torn down by
 * close_pcilib() / close_opencl().
 */
typedef struct {
    /* pcilib */
    pcilib_t *pci;                  /* device handle from pcilib_open() */
    uint8_t *bar;                   /* mapped BAR0 of the FPGA */
    cl_ulong bar_phys;              /* physical start address of BAR0 */
    uint8_t board_gen;              /* board generation, low nibble of reg 0x18 */
    pcilib_kmem_handle_t *kdesc;    /* kernel memory for the descriptor ring */
    uintptr_t kdesc_bus;            /* bus address of the descriptor ring */
    volatile uint32_t *desc;        /* CPU view of the ring; firmware writes progress words here */
    /* OpenCL */
    OclPlatform *ocl;
    cl_device_id device;
    cl_command_queue queue;
    cl_context context;
    cl_program program;
    cl_kernel kernel;               /* "wait_and_write" from kernel.cl */
    cl_mem check_buffer;            /* small result buffer written by the kernel */
    /* both */
    cl_mem fpga_buffer;             /* CL view of the FPGA BAR (CL_MEM_EXTERNAL_PHYSICAL_AMD) */
    struct {
        cl_mem buffer;              /* bus-addressable GPU buffer the FPGA writes to */
        cl_bus_address_amd addr;    /* its surface/marker bus addresses */
    } gpu;
    struct {
        cl_mem buffer;              /* GPU-side latency measurement data */
        cl_bus_address_amd addr;
    } latency_data;
    struct {
        pcilib_kmem_handle_t *kmem; /* CPU-side DMA target in kernel memory */
        uint32_t *buffer;           /* CPU view of that memory */
        uintptr_t addr;             /* its bus address */
    } cpu;
    struct {
        pcilib_kmem_handle_t *kmem; /* CPU-side latency measurement data */
        uint32_t *buffer;
        uintptr_t addr;
    } cpu_latency_data;
} App;
/* Console glyphs for pass/fail output (plain ASCII despite the names). */
#define UNICODE_CHECK_MARK "o"
#define UNICODE_CROSS "x"
/* Default flags for persistent, hardware-visible kernel memory allocations.
 * NOTE(review): expansion is unparenthesized — fine when used as a plain
 * function argument (as everywhere in this file), but would mis-bind if
 * combined with other operators at a use site. */
#define KMEM_DEFAULT_FLAGS PCILIB_KMEM_FLAG_HARDWARE | \
                           PCILIB_KMEM_FLAG_PERSISTENT | \
                           PCILIB_KMEM_FLAG_EXCLUSIVE
#define KMEM_USE_RING PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 1)
#define KMEM_USE_DEFAULT PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 2)
/*
 * BAR0 register accessors; a variable named `app` (App *) must be in scope.
 * WARNING: these are multi-statement macros whose expansions end in `;`.
 * At least one call site relies on that and omits its own trailing
 * semicolon (see compute_debug_latency), so they must NOT be converted to
 * do { } while (0) without fixing all call sites first; never use the
 * *_sleep variants inside an unbraced if/else.
 */
#define WR32(addr, value) *(uint32_t *) (app->bar + (addr)) = (value);
#define RD32(addr) (*(uint32_t *) (app->bar + (addr)))
#define WR32_sleep(addr, value) *(uint32_t *) (app->bar + (addr)) = (value); usleep (100);
#define WR64(addr, value) *(uint64_t *) (app->bar + (addr)) = (value);
#define RD64(addr) (*(uint64_t *) (app->bar + (addr)))
#define WR64_sleep(addr, value) *(uint64_t *) (app->bar + (addr)) = (value); usleep (100);
/* AMD bus-addressable-memory extension entry points, resolved in init_opencl(). */
static clEnqueueMakeBuffersResidentAMD_fn clEnqueueMakeBuffersResidentAMD = NULL;
static clEnqueueWaitSignalAMD_fn clEnqueueWaitSignalAMD = NULL;
/* TLP payload size; the firmware packet length is programmed as 4 * TLP_SIZE bytes. */
static const size_t TLP_SIZE = 64;
static const size_t FPGA_BUFFER_SIZE = 1024 * 64;
/* PAGE_SIZE and GPU_BUFFER_SIZE may be overridden by argv[1], NUM_PAGES by argv[2]. */
static uint32_t PAGE_SIZE = 4096;
static uint32_t NUM_PAGES = 1;
static size_t GPU_BUFFER_SIZE = 4096;
static size_t CPU_BUFFER_SIZE = 4096;
static size_t CHECK_BUFFER_SIZE = 8;
static size_t COUNTER_DATA_SIZE = 4096; /* NOTE(review): previous comment said "2 GB" but the value is 4096 — confirm intended size */
/* declaration should actually come from a distributed header file */
const pcilib_board_info_t *pcilib_get_board_info (pcilib_t *);
  80. static bool
  81. init_pcilib (App *app)
  82. {
  83. static const char *DEVICE = "/dev/fpga0";
  84. const pcilib_board_info_t *board;
  85. app->pci = pcilib_open (DEVICE, "pci");
  86. if (app->pci == NULL) {
  87. printf ("Could not open `%s'", DEVICE);
  88. return false;
  89. }
  90. app->bar = pcilib_map_bar (app->pci, PCILIB_BAR0);
  91. if (app->bar == NULL) {
  92. printf ("Unable to map BAR\n");
  93. pcilib_close (app->pci);
  94. return false;
  95. }
  96. board = pcilib_get_board_info (app->pci);
  97. app->bar_phys = board->bar_start[PCILIB_BAR0];
  98. app->board_gen = RD32 (0x18) & 0xF;
  99. app->kdesc = pcilib_alloc_kernel_memory (app->pci,
  100. PCILIB_KMEM_TYPE_CONSISTENT, 1, 128,
  101. 4096, KMEM_USE_RING, KMEM_DEFAULT_FLAGS);
  102. app->kdesc_bus = pcilib_kmem_get_block_ba (app->pci, app->kdesc, 0);
  103. app->desc = (uint32_t *) pcilib_kmem_get_block_ua (app->pci, app->kdesc, 0);
  104. memset ((uint32_t *) app->desc, 0, 5 * sizeof (uint32_t));
  105. printf ("%-16s 0x%X\n", "Firmware", RD32 (HF_REG_VERSION));
  106. return true;
  107. }
  108. static void
  109. close_pcilib (App *app)
  110. {
  111. pcilib_kmem_flags_t flags = PCILIB_KMEM_FLAG_HARDWARE | PCILIB_KMEM_FLAG_PERSISTENT | PCILIB_KMEM_FLAG_EXCLUSIVE;
  112. pcilib_free_kernel_memory (app->pci, app->kdesc, KMEM_DEFAULT_FLAGS);
  113. pcilib_free_kernel_memory (app->pci, app->cpu.kmem, KMEM_DEFAULT_FLAGS);
  114. pcilib_free_kernel_memory (app->pci, app->cpu_latency_data.kmem, KMEM_DEFAULT_FLAGS);
  115. pcilib_clean_kernel_memory (app->pci, KMEM_USE_DEFAULT, flags);
  116. pcilib_unmap_bar (app->pci, PCILIB_BAR0, (void *) app->bar);
  117. pcilib_close (app->pci);
  118. }
  119. static cl_int
  120. create_fpga_buffer (App *app, size_t size)
  121. {
  122. cl_mem_flags flags;
  123. cl_bus_address_amd addr;
  124. cl_int error;
  125. flags = CL_MEM_EXTERNAL_PHYSICAL_AMD;
  126. addr.surface_bus_address = (cl_ulong) app->bar_phys;
  127. addr.marker_bus_address = (cl_ulong) app->bar_phys;
  128. app->fpga_buffer = clCreateBuffer (app->context, flags, size, &addr, &error);
  129. return error;
  130. }
  131. static cl_int
  132. create_gpu_buffer (App *app, size_t size)
  133. {
  134. cl_mem_flags flags;
  135. cl_int error;
  136. char *data;
  137. data = malloc (size);
  138. memset (data, 42, size);
  139. flags = CL_MEM_BUS_ADDRESSABLE_AMD | CL_MEM_COPY_HOST_PTR;
  140. app->gpu.buffer = clCreateBuffer (app->context, flags, size, data, &error);
  141. if (error != CL_SUCCESS)
  142. return error;
  143. return clEnqueueMakeBuffersResidentAMD (app->queue, 1, &app->gpu.buffer, CL_TRUE, &app->gpu.addr, 0, NULL, NULL);
  144. }
  145. static cl_int
  146. create_latency_data_buffer (App *app, size_t size)
  147. {
  148. cl_mem_flags flags;
  149. cl_int error;
  150. char *data;
  151. data = malloc (size);
  152. memset (data, 0xAD, size);
  153. flags = CL_MEM_BUS_ADDRESSABLE_AMD | CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE;
  154. app->latency_data.buffer = clCreateBuffer (app->context, flags, size, data, &error);
  155. if (error != CL_SUCCESS)
  156. return error;
  157. return clEnqueueMakeBuffersResidentAMD (app->queue, 1, &app->latency_data.buffer, CL_TRUE, &app->latency_data.addr, 0, NULL, NULL);
  158. }
  159. static cl_int
  160. create_cpu_buffer (App *app, size_t size)
  161. {
  162. pcilib_kmem_flags_t flags = PCILIB_KMEM_FLAG_HARDWARE | PCILIB_KMEM_FLAG_PERSISTENT | PCILIB_KMEM_FLAG_EXCLUSIVE;
  163. app->cpu.kmem = pcilib_alloc_kernel_memory (app->pci,
  164. /*PCILIB_KMEM_TYPE_DMA_C2S_PAGE*/ PCILIB_KMEM_TYPE_CONSISTENT,
  165. 1, size, CPU_BUFFER_SIZE, KMEM_USE_DEFAULT, flags);
  166. app->cpu.addr = pcilib_kmem_get_block_ba(app->pci, app->cpu.kmem, 0);
  167. app->cpu.buffer = (uint32_t*) pcilib_kmem_get_block_ua(app->pci, app->cpu.kmem, 0);
  168. memset (app->cpu.buffer, 42, CPU_BUFFER_SIZE);
  169. return 0;
  170. }
  171. static cl_int
  172. create_cpu_latency_data_buffer (App *app, size_t size)
  173. {
  174. pcilib_kmem_flags_t flags = PCILIB_KMEM_FLAG_HARDWARE | PCILIB_KMEM_FLAG_PERSISTENT | PCILIB_KMEM_FLAG_EXCLUSIVE;
  175. app->cpu_latency_data.kmem = pcilib_alloc_kernel_memory (app->pci,
  176. /*PCILIB_KMEM_TYPE_DMA_C2S_PAGE*/ PCILIB_KMEM_TYPE_CONSISTENT,
  177. 1, size, 16, KMEM_USE_DEFAULT, flags);
  178. app->cpu_latency_data.addr = pcilib_kmem_get_block_ba(app->pci, app->cpu_latency_data.kmem, 0);
  179. app->cpu_latency_data.buffer = (uint32_t*) pcilib_kmem_get_block_ua(app->pci, app->cpu_latency_data.kmem, 0);
  180. memset (app->cpu_latency_data.buffer, 0x8A, 16);
  181. return 0;
  182. }
  183. static bool
  184. init_opencl (App *app)
  185. {
  186. cl_int error;
  187. cl_platform_id platform;
  188. app->ocl = ocl_new_with_queues (0, CL_DEVICE_TYPE_GPU, CL_QUEUE_PROFILING_ENABLE);
  189. platform = ocl_get_platform (app->ocl);
  190. clEnqueueMakeBuffersResidentAMD = clGetExtensionFunctionAddressForPlatform (platform, "clEnqueueMakeBuffersResidentAMD");
  191. clEnqueueWaitSignalAMD = clGetExtensionFunctionAddressForPlatform (platform, "clEnqueueWaitSignalAMD");
  192. app->device = ocl_get_devices (app->ocl)[0];
  193. app->queue = ocl_get_cmd_queues (app->ocl)[0];
  194. app->context = ocl_get_context (app->ocl);
  195. app->program = ocl_create_program_from_file (app->ocl, "kernel.cl", NULL, &error);
  196. OCL_CHECK_ERROR (error);
  197. app->kernel = clCreateKernel (app->program, "wait_and_write", &error);
  198. OCL_CHECK_ERROR (error);
  199. app->check_buffer = clCreateBuffer (app->context, CL_MEM_WRITE_ONLY, CHECK_BUFFER_SIZE, NULL, &error);
  200. OCL_CHECK_ERROR (error);
  201. error |= create_fpga_buffer (app, FPGA_BUFFER_SIZE);
  202. OCL_CHECK_ERROR (error);
  203. error |= create_gpu_buffer (app, GPU_BUFFER_SIZE);
  204. OCL_CHECK_ERROR (error);
  205. error |= create_latency_data_buffer (app, 16);
  206. OCL_CHECK_ERROR (error);
  207. error |= create_cpu_buffer (app, CPU_BUFFER_SIZE);
  208. OCL_CHECK_ERROR (error);
  209. error |= create_cpu_latency_data_buffer (app, 16);
  210. OCL_CHECK_ERROR (error);
  211. return error != CL_SUCCESS ? false : true;
  212. }
  213. static void
  214. close_opencl (App *app)
  215. {
  216. OCL_CHECK_ERROR (clReleaseKernel (app->kernel));
  217. OCL_CHECK_ERROR (clReleaseProgram (app->program));
  218. OCL_CHECK_ERROR (clReleaseMemObject (app->fpga_buffer));
  219. OCL_CHECK_ERROR (clReleaseMemObject (app->check_buffer));
  220. OCL_CHECK_ERROR (clReleaseMemObject (app->gpu.buffer));
  221. OCL_CHECK_ERROR (clReleaseMemObject (app->latency_data.buffer));
  222. ocl_free (app->ocl);
  223. }
  224. static void
  225. debug_assert (const char *message, bool condition)
  226. {
  227. printf ("%-16s ", message);
  228. if (condition)
  229. printf (UNICODE_CHECK_MARK"\n");
  230. else
  231. printf (UNICODE_CROSS"\n");
  232. }
  233. static void
  234. debug_assert_cmp (const char *message, uint32_t value, uint32_t expected)
  235. {
  236. printf ("%-16s ", message);
  237. if (value != expected)
  238. printf (UNICODE_CROSS" [%x (%i) != %x (%i)]\n", value, value, expected, expected);
  239. else
  240. printf (UNICODE_CHECK_MARK"\n");
  241. }
#if 0
/* Disabled helper: read register `addr` over BAR0 and report whether it
 * matches `expected`.  Kept for debugging; enable by removing the #if 0. */
static void
check_value (App *app, const char *message, uint32_t addr, uint32_t expected)
{
    uint32_t value;
    value = RD32 (addr);
    debug_assert_cmp (message, value, expected);
}
#endif
/*
 * Reset the DMA engine and program the packet geometry for the next
 * transfer.  HF_REG_DMA is toggled 1 -> 0 with 100 ms settling pauses,
 * then read back and compared against two known-good values.  The packet
 * count is NUM_PAGES * PAGE_SIZE bytes expressed in TLPs of 4 * TLP_SIZE
 * bytes; gen-3 boards additionally need bit 19 (0x80000) set in the
 * packet-length register.  Finally the host-side progress word in the
 * descriptor ring is cleared (index 2 on gen 3, index 4 otherwise).
 */
static void
configure_dma (App *app)
{
    uint32_t value;

    WR32 (HF_REG_DMA, 1);
    usleep (100000);
    WR32 (HF_REG_DMA, 0);
    usleep (100000);

    /* NOTE(review): 335746816 / 335681280 look like firmware-specific
     * status words — confirm against the firmware register map */
    value = RD32 (HF_REG_DMA);
    debug_assert ("PCIe check", value == 335746816 || value == 335681280);

    WR32 (HF_REG_NUM_PACKETS, NUM_PAGES * PAGE_SIZE / (4 * TLP_SIZE));

    if (app->board_gen == 3) {
        WR32 (HF_REG_PACKET_LENGTH, 0x80000 | TLP_SIZE);
    }
    else {
        WR32 (HF_REG_PACKET_LENGTH, TLP_SIZE);
    }

    /* reset host side addr */
    app->desc[app->board_gen == 3 ? 2 : 4] = 0;
}
/*
 * Point the firmware's progress updates at the host descriptor ring and
 * zero the update threshold.
 * NOTE(review): threshold semantics inferred from usage (0 presumably
 * means "update on every transfer") — confirm against firmware docs.
 */
static void
configure_dma_descriptors (App *app)
{
    WR32 (HF_REG_UPDATE_THRESHOLD, 0);
    WR64 (HF_REG_UPDATE_ADDRESS, app->kdesc_bus);
    usleep (100000);
}
/*
 * Route the data-counter generator through DDR to the DMA engine and
 * generate COUNTER_DATA_SIZE units of counter data (units per firmware
 * definition — confirm whether bytes or words).  The DCG is reset, started,
 * and stopped after a fixed 3 s sleep; no completion flag is polled.
 */
static void
setup_counter (App *app)
{
    WR32_sleep (HF_REG_INTERCONNECT,
                HF_INTERCONNECT_DDR_FROM_CNT |
                HF_INTERCONNECT_DDR_TO_DMA |
                HF_INTERCONNECT_MASTER_DMA);
    WR32_sleep (HF_REG_DCG_UPPER_LIMIT, COUNTER_DATA_SIZE);
    WR32_sleep (HF_REG_DDR_UPPER_ADDR, 0xFFFFFFFF);
    WR32_sleep (HF_REG_DCG, HF_DCG_RESET );
    WR32_sleep (HF_REG_DCG, HF_DCG_START);
    sleep (3); // sleep until ddr is complete
    WR32_sleep (HF_REG_DCG, HF_DCG_STOP);
}
/* Stop the DMA engine (write 0 to HF_REG_DMA, then settle for 100 us). */
static void
stop_dma (App *app)
{
    WR32_sleep (HF_REG_DMA, 0);
}
/*
 * Pulse the three firmware debug-counter reset registers (root complex,
 * requester/TX, completer/RX) so the next measurement starts from zero.
 */
static void
reset_dbg (App *app)
{
    // RC
    WR32_sleep (HF_REG_DEBUG_RC_RESET, 1);
    WR32_sleep (HF_REG_DEBUG_RC_RESET, 0);
    // TX
    WR32_sleep (HF_REG_DEBUG_REQUESTER_RESET, 1);
    WR32_sleep (HF_REG_DEBUG_REQUESTER_RESET, 0);
    // RX
    WR32_sleep (HF_REG_DEBUG_COMPLETER_RESET, 1);
    WR32_sleep (HF_REG_DEBUG_COMPLETER_RESET, 0);
}
  310. static double
  311. compute_debug_latency (App *app)
  312. {
  313. uint64_t start, start_high, start_low;
  314. uint64_t end, end_high, end_low;
  315. WR32_sleep (0x93A0, 0);
  316. start_high = RD32 (0x93B8);
  317. start_low = RD32 (0x93BC);
  318. WR32_sleep (0x93A0,1)
  319. end_high = RD32 (0x93B8);
  320. end_low = RD32 (0x93BC);
  321. start = (start_high << 32) | start_low;
  322. end = (end_high << 32) | end_low;
  323. return ((end - start) * 4) / 1000.0;
  324. }
  325. static double
  326. elapsed_seconds (struct timespec *start, struct timespec *end)
  327. {
  328. return (end->tv_sec + end->tv_nsec / 1000000000.0) - (start->tv_sec + start->tv_nsec / 1000000000.0);
  329. }
  330. static double
  331. elapsed_gpu_seconds (cl_event event)
  332. {
  333. cl_ulong start;
  334. cl_ulong end;
  335. OCL_CHECK_ERROR (clGetEventProfilingInfo (event, CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &start, NULL));
  336. OCL_CHECK_ERROR (clGetEventProfilingInfo (event, CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &end, NULL));
  337. return (end - start) / 1000.0 / 1000.0 / 1000.0;
  338. }
  339. static void
  340. check_data_transfer (App *app, size_t check_size)
  341. {
  342. uint32_t *data;
  343. data = malloc (check_size);
  344. memset (data, 0, check_size);
  345. OCL_CHECK_ERROR (clEnqueueReadBuffer (app->queue, app->gpu.buffer, CL_TRUE, 0, check_size, data, 0, NULL, NULL));
  346. for (uint32_t i = 0; i < check_size / 4; i++) {
  347. if (data[i] != i) {
  348. debug_assert_cmp ("FPGA->GPU write", data[i], i);
  349. goto finish_check_data_transfer;
  350. }
  351. }
  352. debug_assert ("FPGA->GPU write", true);
  353. finish_check_data_transfer:
  354. free (data);
  355. }
  356. static void
  357. check_data_transfer_cpu (App *app, size_t check_size)
  358. {
  359. for (uint32_t i = 0; i < check_size / 4; i++) {
  360. if (app->cpu.buffer[i] != i) {
  361. debug_assert_cmp ("FPGA->CPU write", app->cpu.buffer[i], i);
  362. return;
  363. }
  364. }
  365. debug_assert ("FPGA->CPU write", true);
  366. }
/*
 * Overwrite the start of the GPU buffer with the 32-bit pattern 42 so
 * stale data from a previous run cannot masquerade as a successful
 * transfer.
 * NOTE(review): the fill size passed is GPU_BUFFER_SIZE / 4 *bytes*
 * (clEnqueueFillBuffer's size argument is in bytes), i.e. only a quarter
 * of the buffer is reset — confirm whether the whole buffer was intended.
 */
static void
fill_gpu_buffer (App *app)
{
    uint32_t pattern;
    cl_event event;

    pattern = 42;
    OCL_CHECK_ERROR (clEnqueueFillBuffer (app->queue, app->gpu.buffer, &pattern, sizeof (pattern), 0, GPU_BUFFER_SIZE / 4, 0, NULL, &event));
    OCL_CHECK_ERROR (clWaitForEvents (1, &event));
    OCL_CHECK_ERROR (clReleaseEvent (event));
}
/*
 * Measure FPGA->GPU write latency using the AMD "marker" mechanism: the
 * firmware's progress update is steered at the GPU buffer's marker bus
 * address, and clEnqueueWaitSignalAMD completes once the expected signal
 * value arrives.  Wall time is measured from starting the DMA to the
 * wait-signal event completing.
 */
static void
measure_fpga_to_gpu_latency_with_marker (App *app)
{
    uint32_t counter;
    struct timespec start;
    struct timespec end;
    double host_latency;
    double debug_latency;
    cl_event event;

    printf ("\n** FPGA to GPU latency [marker]\n\n");
    fill_gpu_buffer (app);
    configure_dma (app);

    /* progress updates go straight to the GPU buffer's marker address */
    WR32 (HF_REG_UPDATE_THRESHOLD, 0);
    WR64_sleep (HF_REG_UPDATE_ADDRESS, app->gpu.addr.marker_bus_address);
    setup_counter (app);
    WR64_sleep (HF_REG_DESCRIPTOR_ADDRESS, app->gpu.addr.surface_bus_address);
    reset_dbg (app);

    /* complete when 0xd0dad0da appears at the marker location */
    OCL_CHECK_ERROR (clEnqueueWaitSignalAMD (app->queue, app->gpu.buffer, 0xd0dad0da, 0, NULL, &event));
    clock_gettime (CLOCK_MONOTONIC, &start);
    WR32 (HF_REG_DMA, 1);
    OCL_CHECK_ERROR (clWaitForEvents (1, &event));
    clock_gettime (CLOCK_MONOTONIC, &end);
    stop_dma (app);
    OCL_CHECK_ERROR (clReleaseEvent (event));

    counter = RD32 (HF_REG_PERF_COUNTER);
    host_latency = elapsed_seconds (&start, &end);
    debug_latency = compute_debug_latency (app);
    check_data_transfer (app, PAGE_SIZE);

    printf ("\n%-16s %f us\n", "Wall time", host_latency * 1000.0 * 1000.0);
    /* NOTE(review): counter << 8 can overflow 32 bits for large counts —
     * confirm the counter's intended scaling */
    printf ("%-16s %f us\n", "FPGA [counter]", ((counter << 8) * 4) / 1000.0);
    printf ("%-16s %f us\n", "FPGA [debug]", debug_latency);
    printf ("%-16s %.2f MB/s\n", "Throughput", GPU_BUFFER_SIZE / host_latency / 1024. / 1024.);
}
/*
 * Latency measurement in which a GPU kernel was meant to consume the data;
 * the NDRange launch is currently commented out, so in effect only the
 * AMD wait-signal path (signal value 1 written by the FPGA to the marker
 * address) is timed.
 */
static void
measure_fpga_to_gpu_latency_with_kernel (App *app)
{
    uint32_t counter;
    struct timespec start;
    struct timespec end;
    double host_latency;
    double debug_latency;
    cl_event signal_event;
    cl_event event;                      /* only used by the disabled kernel launch */
    uint32_t check[CHECK_BUFFER_SIZE / 4];
    size_t work_size = 1;                /* only used by the disabled kernel launch */
    cl_ulong times[4];                   /* queued/submit/start/end profiling stamps */

    printf ("\n** FPGA to GPU latency [kernel]\n\n");
    fill_gpu_buffer (app);
    configure_dma (app);
    configure_dma_descriptors (app);
    setup_counter (app);

    WR32_sleep (HF_REG_CONTROL, HF_CONTROL_ENABLE_READ | HF_CONTROL_ENABLE_MULTI_READ);
    WR64_sleep (HF_REG_PERF_COUNTER_FEEDBACK_1, 0);
    /* the FPGA signals completion by writing to the buffer's marker address */
    WR64_sleep (HF_REG_DESCRIPTOR_AMD_SIGNAL, app->gpu.addr.marker_bus_address);
    WR64_sleep (HF_REG_DESCRIPTOR_ADDRESS, app->gpu.addr.surface_bus_address);
    reset_dbg (app);

    OCL_CHECK_ERROR (clSetKernelArg (app->kernel, 0, sizeof (cl_mem), &app->gpu.buffer));
    OCL_CHECK_ERROR (clSetKernelArg (app->kernel, 1, sizeof (cl_mem), &app->fpga_buffer));
    OCL_CHECK_ERROR (clSetKernelArg (app->kernel, 2, sizeof (cl_mem), &app->check_buffer));

    WR32_sleep (HF_REG_DMA, 0);
    OCL_CHECK_ERROR (clEnqueueWaitSignalAMD (app->queue, app->gpu.buffer, 1, 0, NULL, &signal_event));
    /* OCL_CHECK_ERROR (clEnqueueNDRangeKernel (app->queue, app->kernel, 1, */
    /* NULL, &work_size, NULL, */
    /* 1, &signal_event, &event)); */
    usleep (100);
    WR32_sleep (HF_REG_DMA, 1);
    clock_gettime (CLOCK_MONOTONIC, &start);
    OCL_CHECK_ERROR (clWaitForEvents (1, &signal_event));
    clock_gettime (CLOCK_MONOTONIC, &end);
    WR32 ( HF_REG_DMA, 0);
    /* NOTE(review): writes to 0x20/0x24 appear to latch the FPGA latency
     * counter read back below — confirm against the register map */
    WR32 ( 0x20, 1);
    WR32 ( 0x24, 1);
    /* OCL_CHECK_ERROR (clReleaseEvent (event)); */
    counter = RD32 (HF_REG_PERF_COUNTER);
    host_latency = elapsed_seconds (&start, &end);
    debug_latency = compute_debug_latency (app);
    check_data_transfer (app, PAGE_SIZE);
    printf ("DMA running: %i\n", RD32 (HF_REG_DMA) & 0x1);
    OCL_CHECK_ERROR (clEnqueueReadBuffer (app->queue, app->check_buffer, CL_TRUE,
                                          0, CHECK_BUFFER_SIZE, check, 0, NULL, NULL));
    WR32 (HF_REG_DMA, 0);
    debug_assert_cmp ("Data check", check[0], 1);
    WR32_sleep (HF_REG_CONTROL, HF_CONTROL_ENABLE_READ);
    printf ("%-16s %f us\n", "FPGA latency", RD32 (0x20) * 4 / 1000.0);
    ocl_get_event_times (signal_event, &times[0], &times[1], &times[2], &times[3]);
    printf ("submit -> start = %f, queue -> start = %f, run_time=%lu\n",
            (times[0] - times[3]) / 1000.0, (times[0] - times[2]) / 1000.0, times[1] - times[0]);
    /* printf ("\n%-16s %i\n", "Kernel count", check[1]); */
    printf ("%-16s %f us\n", "Wall time", host_latency * 1000.0 * 1000.0);
    /* printf ("%-16s %f us\n", "FPGA [counter]", ((counter << 8) * 4) / 1000.0); */
    /* printf ("%-16s %f us\n", "FPGA [debug]", debug_latency); */
    /* printf ("%-16s %.2f MB/s\n", "Throughput", GPU_BUFFER_SIZE / host_latency / 1024. / 1024.); */
}
/*
 * Measure FPGA->GPU latency with the CPU acting as the completion
 * detector: the host busy-polls the descriptor-ring progress word until it
 * equals the low 32 bits of the GPU buffer's surface bus address, i.e.
 * until the firmware reports the write finished.
 */
static void
measure_fpga_to_gpu_latency_with_cpu (App *app)
{
    uint32_t hardware_ptr;
    uint32_t counter;
    unsigned flag_index;
    struct timespec start;
    struct timespec end;
    double host_latency;
    double debug_latency;

    printf ("\n** FPGA to GPU latency [CPU]\n\n");
    fill_gpu_buffer (app);
    hardware_ptr = 0;
    /* progress word lives at ring index 2 on gen-3 firmware, 4 otherwise */
    flag_index = app->board_gen == 3 ? 2 : 4;
    configure_dma (app);
    configure_dma_descriptors (app);
    setup_counter (app);
    WR64_sleep (HF_REG_DESCRIPTOR_ADDRESS, app->gpu.addr.surface_bus_address);
    reset_dbg (app);

    clock_gettime (CLOCK_MONOTONIC, &start);
    WR32 (HF_REG_DMA, 1);
    /* busy-wait: firmware writes the target address back when done */
    do {
        hardware_ptr = app->desc[flag_index];
    }
    while (hardware_ptr != (app->gpu.addr.surface_bus_address & 0xFFFFFFFF));
    clock_gettime (CLOCK_MONOTONIC, &end);
    stop_dma (app);

    counter = RD32 (HF_REG_PERF_COUNTER);
    host_latency = elapsed_seconds (&start, &end);
    debug_latency = compute_debug_latency (app);
    check_data_transfer (app, PAGE_SIZE);

    printf ("\n%-16s %f us\n", "Wall time", host_latency * 1000.0 * 1000.0);
    /* NOTE(review): counter << 8 can overflow 32 bits — confirm scaling */
    printf ("%-16s %f us\n", "FPGA [counter]", ((counter << 8) * 4) / 1000.0);
    printf ("%-16s %f us\n", "FPGA [debug]", debug_latency);
    printf ("%-16s %.2f MB/s\n", "Throughput", CPU_BUFFER_SIZE / host_latency / 1024. / 1024.);
}
/*
 * Measure FPGA->CPU latency: DMA into the CPU-side kernel-memory buffer
 * and busy-poll the descriptor-ring progress word until it equals the
 * buffer's bus address.  Afterwards the received data is copied back into
 * the FPGA BAR (offset 0x9400) and the standard counters are reported.
 */
static void
measure_fpga_to_cpu_latency (App *app)
{
    uint32_t hardware_ptr;
    uint32_t counter;
    unsigned flag_index;
    struct timespec start;
    struct timespec end;
    double host_latency;
    double debug_latency;

    printf ("\n** FPGA to CPU latency\n\n");
    hardware_ptr = 0;
    /* progress word lives at ring index 2 on gen-3 firmware, 4 otherwise */
    flag_index = app->board_gen == 3 ? 2 : 4;
    configure_dma (app);
    configure_dma_descriptors (app);
    setup_counter (app);
    WR64_sleep (HF_REG_DESCRIPTOR_ADDRESS, app->cpu.addr);
    reset_dbg (app);

    clock_gettime (CLOCK_MONOTONIC, &start);
    WR32 (HF_REG_DMA, 1);
    /* busy-wait: firmware writes the target address back when done */
    do {
        hardware_ptr = app->desc[flag_index];
    }
    while (hardware_ptr != app->cpu.addr);
    clock_gettime (CLOCK_MONOTONIC, &end);
    stop_dma (app);

    // Data back to FPGA
    memcpy (app->bar + 0x9400, app->cpu.buffer, CPU_BUFFER_SIZE);

    counter = RD32 (HF_REG_PERF_COUNTER);
    host_latency = elapsed_seconds (&start, &end);
    debug_latency = compute_debug_latency (app);
    check_data_transfer_cpu (app, PAGE_SIZE);

    printf ("\n%-16s %f us\n", "Wall time", host_latency * 1000.0 * 1000.0);
    /* NOTE(review): counter << 8 can overflow 32 bits — confirm scaling */
    printf ("%-16s %f us\n", "FPGA [counter]", ((counter << 8) * 4) / 1000.0);
    printf ("%-16s %f us\n", "FPGA [debug]", debug_latency);
    printf ("%-16s %.2f MB/s\n", "Throughput", CPU_BUFFER_SIZE / host_latency / 1024. / 1024.);
}
  543. static void
  544. check_latency_results (App *app, size_t check_size)
  545. {
  546. uint32_t *cnt;
  547. cnt = malloc (16);
  548. memset (cnt, 0, 16);
  549. OCL_CHECK_ERROR (clEnqueueReadBuffer (app->queue, app->latency_data.buffer, CL_TRUE, 0, 16, cnt, 0, NULL, NULL));
  550. for (uint32_t i = 0; i < 4; i++) {
  551. printf("cnt %x\n", cnt[i]);
  552. }
  553. uint32_t *data;
  554. printf ("size %d\n", check_size);
  555. data = malloc (check_size);
  556. memset (data, 0xab, check_size);
  557. OCL_CHECK_ERROR (clEnqueueReadBuffer (app->queue, app->gpu.buffer, CL_TRUE, 0, check_size, data, 0, NULL, NULL));
  558. for (uint32_t i = 0; i < check_size / 4; i++) {
  559. if (i < check_size / 4) {
  560. printf ("%-16s %x \n", "Latency FPGA->GPU [FPGA Master]", data[i]);
  561. }
  562. }
  563. debug_assert ("FPGA->GPU->FPGA latency", true);
  564. free (data);
  565. }
/*
 * FPGA-master latency measurement against the GPU: the FPGA itself issues
 * the writes/reads and stores results, so the host only configures the
 * engine and reads the final latency register.  Each register tick is
 * 4 ns.
 */
static void
measure_fpga_to_gpu_latency_master (App *app)
{
    printf ("\n** FPGA to GPU latency [FPGA MASTER]\n\n");
    // reset
    reset_dbg (app);
    WR32_sleep (HF_REG_CONF_DMA_TX_ENGINE, 0x10);
    WR32_sleep ( HF_REG_INTERCONNECT,
                 HF_INTERCONNECT_DDR_FROM_RX_MASTER |
                 HF_INTERCONNECT_DDR_TO_DMA |
                 HF_INTERCONNECT_MASTER_DMA);
    // results array size in WORDS (32 bits)
    WR32_sleep (HF_REG_LATENCY_NUM_MEAS, GPU_BUFFER_SIZE / 4);
    // 3FF clock cycles * 4 ~= 4 us -> every 4 us the read req is repeated
    WR32_sleep (HF_REG_LATENCY_REPEAT_MASK, 0x3FFFFF);
    // FFFFF clock cycles * 4 ~= 4 ms -> timeout -> WR is repeated then RD until HF_REG_CONF_DMA_TX_ENGINE != 1
    WR32_sleep (HF_REG_LATENCY_TIMEOUT, 0xFFFFF);
    WR64_sleep (HF_REG_DESC_LATENCY_DATA, app->latency_data.addr.surface_bus_address);
    WR64_sleep (HF_REG_DESC_LATENCY_RESULTS, app->gpu.addr.surface_bus_address);
    sleep (1);
    // Measurement starts with rising edge of bit 0 in HF_REG_CONF_DMA_TX_ENGINE
    WR32_sleep (HF_REG_CONF_DMA_TX_ENGINE, 0x11); // Mode of operation for taking latency measurements, any other number goes to the default mode
    //check_latency_results (app, GPU_BUFFER_SIZE);
    uint32_t result;
    result = RD32 (HF_REG_RESULT_LATENCY_MASTER);
    printf ("latency result : %u ns\n", result * 4 );
}
/*
 * FPGA-master latency measurement against the CPU-side latency buffer.
 * Mirrors measure_fpga_to_gpu_latency_master() but points the data
 * descriptor at the CPU kernel-memory block.
 * NOTE(review): the results descriptor still targets the GPU buffer
 * (app->gpu.addr) — confirm whether it should use a CPU address here.
 */
static void
measure_fpga_to_cpu_latency_master (App *app)
{
    printf ("\n** FPGA to CPU latency [FPGA MASTER]\n\n");
    // reset
    reset_dbg (app);
    WR32_sleep (HF_REG_CONF_DMA_TX_ENGINE, 0x10);
    // results array size in WORDS (32 bits)
    WR32_sleep (HF_REG_LATENCY_NUM_MEAS, GPU_BUFFER_SIZE / 4);
    // 3FF clock cycles * 4 ~= 4 us -> every 4 us the read req is repeated
    WR32_sleep (HF_REG_LATENCY_REPEAT_MASK, 0x3FFFFF);
    // FFFFF clock cycles * 4 ~= 4 ms -> timeout -> WR is repeated then RD until HF_REG_CONF_DMA_TX_ENGINE != 1
    WR32_sleep (HF_REG_LATENCY_TIMEOUT, 0xFFFFF);
    WR64_sleep (HF_REG_DESC_LATENCY_DATA, app->cpu_latency_data.addr);
    WR64_sleep (HF_REG_DESC_LATENCY_RESULTS, app->gpu.addr.surface_bus_address);
    sleep (1);
    // Measurement starts with rising edge of bit 0 in HF_REG_CONF_DMA_TX_ENGINE
    WR32_sleep (HF_REG_CONF_DMA_TX_ENGINE, 0x11); // Mode of operation for taking latency measurements, any other number goes to the default mode
    //check_latency_results (app, GPU_BUFFER_SIZE);
    uint32_t result;
    result = RD32 (HF_REG_RESULT_LATENCY_MASTER);
    printf ("latency result : %u ns\n", result * 4 );
}
/*
 * Stream GPU_BUFFER_SIZE bytes FPGA->GPU page by page and report
 * throughput.  After each page the firmware writes a new progress value
 * into the descriptor ring; the host then advances the target bus address
 * by one page block and re-arms the descriptor register.
 */
static void
measure_fpga_to_gpu_throughput (App *app)
{
    uint32_t current_ptr;
    uint32_t hardware_ptr;
    unsigned flag_index;
    size_t transferred;
    cl_ulong addr;
    double elapsed;
    struct timespec start;
    struct timespec end;

    printf ("\n** FPGA to GPU throughput\n\n");
    configure_dma (app);
    configure_dma_descriptors (app);
    setup_counter (app);

    current_ptr = 0;
    hardware_ptr = 0;
    transferred = 0;
    addr = app->gpu.addr.surface_bus_address;
    /* progress word lives at ring index 2 on gen-3 firmware, 4 otherwise */
    flag_index = app->board_gen == 3 ? 2 : 4;

    WR64_sleep (HF_REG_DESCRIPTOR_ADDRESS, addr);
    clock_gettime (CLOCK_MONOTONIC, &start);
    WR32 (HF_REG_DMA, 1);
    do {
        /* busy-wait until the firmware reports any new progress value */
        do {
            hardware_ptr = app->desc[flag_index];
        }
        while (hardware_ptr == current_ptr); /* it should work to check against `addr` but it never does */
        addr += NUM_PAGES * PAGE_SIZE;
        transferred += NUM_PAGES * PAGE_SIZE;
        current_ptr = hardware_ptr;
        /* re-arm the descriptor with the next page's target address */
        WR64 (HF_REG_DESCRIPTOR_ADDRESS, addr);
    }
    while (transferred < GPU_BUFFER_SIZE);
    clock_gettime (CLOCK_MONOTONIC, &end);
    stop_dma (app);

    check_data_transfer (app, GPU_BUFFER_SIZE);
    elapsed = elapsed_seconds (&start, &end);
    printf ("\n%-16s %.2f MB/s [%3.5f us]\n", "Host", GPU_BUFFER_SIZE / elapsed / 1024. / 1024., elapsed * 1000 * 1000);
}
/*
 * Measure GPU->FPGA latency: fill a scratch GPU buffer with a pattern,
 * then time a device-side copy of one page into the FPGA BAR buffer at
 * offset 0x9400.  Reports both wall time and the OpenCL profiling time of
 * the copy.
 */
static void
measure_gpu_to_fpga_latency (App *app)
{
    cl_event event;
    cl_mem src_buffer;
    cl_int err;
    uint32_t pattern;
    struct timespec start;
    struct timespec end;

    printf ("\n** GPU to FPGA latency\n\n");
    src_buffer = clCreateBuffer (app->context, CL_MEM_READ_ONLY, PAGE_SIZE, NULL, &err);
    OCL_CHECK_ERROR (err);

    pattern = 0xdeadf00d;
    OCL_CHECK_ERROR (clEnqueueFillBuffer (app->queue, src_buffer, &pattern, sizeof (pattern), 0, PAGE_SIZE, 0, NULL, &event));
    OCL_CHECK_ERROR (clWaitForEvents (1, &event));
    OCL_CHECK_ERROR (clReleaseEvent (event));

    clock_gettime (CLOCK_MONOTONIC, &start);
    /* destination offset 0x9400 inside the BAR-backed fpga_buffer */
    OCL_CHECK_ERROR (clEnqueueCopyBuffer (app->queue, src_buffer, app->fpga_buffer, 0, 0x9400, PAGE_SIZE, 0, NULL, &event));
    OCL_CHECK_ERROR (clWaitForEvents (1, &event));
    clock_gettime (CLOCK_MONOTONIC, &end);

    printf ("%-16s %f us\n", "Wall time", elapsed_seconds (&start, &end) * 1000 * 1000);
    printf ("%-16s %f us\n", "GPU time", elapsed_gpu_seconds (event) * 1000 * 1000);

    OCL_CHECK_ERROR (clReleaseEvent (event));
    OCL_CHECK_ERROR (clReleaseMemObject (src_buffer));
}
  681. int
  682. main (int argc, char const* argv[])
  683. {
  684. App app;
  685. if (argc > 1) {
  686. PAGE_SIZE = GPU_BUFFER_SIZE = atoi (argv[1]);
  687. }
  688. if (argc > 2) {
  689. NUM_PAGES = atoi (argv[2]);
  690. }
  691. printf ("** Parameters\n\n");
  692. printf ("%-16s %u\n", "Pages", NUM_PAGES);
  693. printf ("%-16s %zu B\n", "Buffer size", GPU_BUFFER_SIZE);
  694. if (!init_pcilib (&app))
  695. return 1;
  696. if (!init_opencl (&app))
  697. return 1;
  698. /*
  699. measure_fpga_to_gpu_latency_with_marker (&app);
  700. measure_fpga_to_gpu_latency_with_kernel (&app);
  701. measure_fpga_to_gpu_latency_with_cpu (&app);
  702. measure_fpga_to_cpu_latency (&app);
  703. measure_gpu_to_fpga_latency (&app);
  704. measure_fpga_to_gpu_throughput (&app);
  705. */
  706. measure_fpga_to_gpu_latency_master (&app);
  707. //measure_fpga_to_cpu_latency_master (&app);
  708. close_opencl (&app);
  709. close_pcilib (&app);
  710. return 0;
  711. }