signal.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425
  1. #define _XOPEN_SOURCE 500
  2. #include <stdio.h>
  3. #include <stdlib.h>
  4. #include <stdbool.h>
  5. #include <string.h>
  6. #include <unistd.h>
  7. #include <pcilib.h>
  8. #include <pcilib/bar.h>
  9. #include <pcilib/kmem.h>
  10. #include <CL/cl.h>
  11. #include <CL/cl_ext.h>
  12. #include "ocl.h"
  13. /* this should actually come from the distributed pcitool sources */
  14. #include "pciDriver.h"
  15. typedef struct {
  16. /* pcilib */
  17. pcilib_t *pci;
  18. uint8_t *bar;
  19. cl_ulong bar_phys;
  20. uint8_t board_gen;
  21. pcilib_kmem_handle_t *kdesc;
  22. uintptr_t kdesc_bus;
  23. volatile uint32_t *desc;
  24. /* OpenCL */
  25. OclPlatform *ocl;
  26. cl_device_id device;
  27. cl_command_queue queue;
  28. cl_context context;
  29. cl_program program;
  30. cl_kernel kernel;
  31. cl_mem check_buffer;
  32. /* both */
  33. cl_mem fpga_buffer;
  34. struct {
  35. cl_mem buffer;
  36. cl_bus_address_amd addr;
  37. } gpu;
  38. } App;
  39. #define UNICODE_CHECK_MARK "\u2713"
  40. #define UNICODE_CROSS "\u2717"
  41. #define KMEM_DEFAULT_FLAGS PCILIB_KMEM_FLAG_HARDWARE | \
  42. PCILIB_KMEM_FLAG_PERSISTENT | \
  43. PCILIB_KMEM_FLAG_EXCLUSIVE
  44. #define KMEM_USE_RING PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 1)
  45. #define KMEM_USE_DEFAULT PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 2)
  46. #define REG_RESET_DMA 0x00
  47. #define REG_START_DMA 0x04
  48. #define REG_NUM_PACKETS_PER_DESCRIPTOR 0x10
  49. #define REG_PACKET_LENGTH 0x0C
  50. #define REG_DESCRIPTOR_ADDRESS 0x50
  51. #define REG_UPDATE_ADDRESS 0x54
  52. #define REG_LAST_DESCRIPTOR 0x58
  53. #define REG_NUM_DESCRIPTORS 0x5C
  54. #define REG_UPDATE_THRESHOLD 0x60
  55. #define REG_ENABLE_COUNTER 0x9000
  56. #define REG_CONTROL 0x9040
  57. #define REG_NUM_ROWS 0x9168
  58. #define REG_NUM_FRAMES 0x9170
  59. #define WR32(addr, value) *(uint32_t *) (app->bar + (addr)) = (value);
  60. #define RD32(addr) (*(uint32_t *) (app->bar + (addr)))
  61. #define WR32_sleep(addr, value) *(uint32_t *) (app->bar + (addr)) = (value); usleep (100);
  62. static clEnqueueMakeBuffersResidentAMD_fn clEnqueueMakeBuffersResidentAMD = NULL;
  63. /* declaration should actually come from a distributed header file */
  64. const pcilib_board_info_t *pcilib_get_board_info (pcilib_t *);
  65. static bool
  66. init_pcilib (App *app)
  67. {
  68. static const char *DEVICE = "/dev/fpga0";
  69. const pcilib_board_info_t *board;
  70. app->pci = pcilib_open (DEVICE, "pci");
  71. if (app->pci == NULL) {
  72. printf ("Could not open `%s'", DEVICE);
  73. return false;
  74. }
  75. app->bar = pcilib_map_bar (app->pci, PCILIB_BAR0);
  76. if (app->bar == NULL) {
  77. printf ("Unable to map BAR\n");
  78. pcilib_close (app->pci);
  79. return false;
  80. }
  81. board = pcilib_get_board_info (app->pci);
  82. app->bar_phys = board->bar_start[PCILIB_BAR0];
  83. app->board_gen = RD32 (0x18) & 0xF;
  84. app->kdesc = pcilib_alloc_kernel_memory (app->pci, PCILIB_KMEM_TYPE_CONSISTENT, 1, 128, 4096, KMEM_USE_RING, KMEM_DEFAULT_FLAGS);
  85. app->kdesc_bus = pcilib_kmem_get_block_ba (app->pci, app->kdesc, 0);
  86. app->desc = (uint32_t *) pcilib_kmem_get_block_ua (app->pci, app->kdesc, 0);
  87. memset ((uint32_t *) app->desc, 0, 5 * sizeof (uint32_t));
  88. return true;
  89. }
  90. static void
  91. close_pcilib (App *app)
  92. {
  93. pcilib_free_kernel_memory (app->pci, app->kdesc, KMEM_DEFAULT_FLAGS);
  94. pcilib_unmap_bar (app->pci, PCILIB_BAR0, (void *) app->bar);
  95. pcilib_close (app->pci);
  96. }
  97. static cl_int
  98. create_fpga_buffer (App *app, size_t size)
  99. {
  100. cl_mem_flags flags;
  101. cl_bus_address_amd addr;
  102. cl_int error;
  103. flags = CL_MEM_EXTERNAL_PHYSICAL_AMD;
  104. addr.surface_bus_address = (cl_ulong) app->bar_phys;
  105. addr.marker_bus_address = (cl_ulong) app->bar_phys;
  106. app->fpga_buffer = clCreateBuffer (app->context, flags, size, &addr, &error);
  107. return error;
  108. }
  109. static cl_int
  110. create_gpu_buffer (App *app, size_t size)
  111. {
  112. cl_mem_flags flags;
  113. cl_int error;
  114. char *data;
  115. data = malloc (size);
  116. memset (data, 0, size);
  117. flags = CL_MEM_BUS_ADDRESSABLE_AMD | CL_MEM_COPY_HOST_PTR;
  118. app->gpu.buffer = clCreateBuffer (app->context, flags, size, data, &error);
  119. if (error != CL_SUCCESS)
  120. return error;
  121. return clEnqueueMakeBuffersResidentAMD (app->queue, 1, &app->gpu.buffer, CL_TRUE, &app->gpu.addr, 0, NULL, NULL);
  122. }
  123. static bool
  124. init_opencl (App *app)
  125. {
  126. cl_int error;
  127. cl_platform_id platform;
  128. app->ocl = ocl_new_with_queues (0, CL_DEVICE_TYPE_GPU, CL_QUEUE_PROFILING_ENABLE);
  129. platform = ocl_get_platform (app->ocl);
  130. clEnqueueMakeBuffersResidentAMD = clGetExtensionFunctionAddressForPlatform (platform, "clEnqueueMakeBuffersResidentAMD");
  131. app->device = ocl_get_devices (app->ocl)[0];
  132. app->queue = ocl_get_cmd_queues (app->ocl)[0];
  133. app->context = ocl_get_context (app->ocl);
  134. app->program = ocl_create_program_from_file (app->ocl, "kernel.cl", NULL, &error);
  135. OCL_CHECK_ERROR (error);
  136. app->kernel = clCreateKernel (app->program, "write_to_fpga", &error);
  137. OCL_CHECK_ERROR (error);
  138. app->check_buffer = clCreateBuffer (app->context, CL_MEM_WRITE_ONLY, 8, NULL, &error);
  139. OCL_CHECK_ERROR (error);
  140. OCL_CHECK_ERROR (create_fpga_buffer (app, 1024 * 64));
  141. OCL_CHECK_ERROR (create_gpu_buffer (app, 1024 * 64));
  142. return error != CL_SUCCESS ? false : true;
  143. }
  144. static void
  145. close_opencl (App *app)
  146. {
  147. OCL_CHECK_ERROR (clReleaseKernel (app->kernel));
  148. OCL_CHECK_ERROR (clReleaseProgram (app->program));
  149. OCL_CHECK_ERROR (clReleaseMemObject (app->fpga_buffer));
  150. OCL_CHECK_ERROR (clReleaseMemObject (app->check_buffer));
  151. ocl_free (app->ocl);
  152. }
  153. static void
  154. debug_wait (const char *message)
  155. {
  156. printf ("%-32s", message);
  157. fflush (stdout);
  158. }
  159. static void
  160. debug_assert (const char *message, bool condition)
  161. {
  162. printf ("%-32s", message);
  163. if (condition)
  164. printf (UNICODE_CHECK_MARK"\n");
  165. else
  166. printf (UNICODE_CROSS"\n");
  167. }
  168. static void
  169. debug_assert_cmp (const char *message, uint32_t value, uint32_t expected)
  170. {
  171. printf ("%-32s", message);
  172. if (value != expected)
  173. printf (UNICODE_CROSS" [%u != %u]\n", value, expected);
  174. else
  175. printf (UNICODE_CHECK_MARK"\n");
  176. }
  177. static void
  178. debug_done (void)
  179. {
  180. printf (UNICODE_CHECK_MARK"\n");
  181. }
  182. static void
  183. check_pcie (App *app)
  184. {
  185. uint32_t value;
  186. WR32 (REG_RESET_DMA, 1);
  187. usleep (100000);
  188. WR32 (REG_RESET_DMA, 0);
  189. usleep (100000);
  190. value = RD32 (REG_RESET_DMA);
  191. debug_assert ("PCIe ready?", value == 335746816 || value == 335681280);
  192. }
  193. static void
  194. configure_dma (App *app)
  195. {
  196. const unsigned TLP_SIZE = 32;
  197. debug_wait ("Configure DMA ...");
  198. WR32 (REG_NUM_PACKETS_PER_DESCRIPTOR, 1);
  199. if (app->board_gen == 3) {
  200. WR32 (REG_PACKET_LENGTH, 0x80000 | TLP_SIZE);
  201. }
  202. else {
  203. WR32 (REG_PACKET_LENGTH, TLP_SIZE);
  204. }
  205. WR32 (REG_NUM_DESCRIPTORS, 0);
  206. debug_done ();
  207. }
  208. static void
  209. configure_dma_descriptors (App *app)
  210. {
  211. debug_wait ("Configure DMA descriptors ...");
  212. WR32 (REG_LAST_DESCRIPTOR, 0);
  213. WR32 (REG_UPDATE_THRESHOLD, 1);
  214. WR32 (REG_UPDATE_ADDRESS, app->kdesc_bus);
  215. usleep (100000);
  216. usleep (1000);
  217. WR32 (REG_DESCRIPTOR_ADDRESS, app->gpu.addr.surface_bus_address);
  218. debug_done ();
  219. debug_assert_cmp ("Descriptor address correct?", RD32 (REG_DESCRIPTOR_ADDRESS), app->gpu.addr.surface_bus_address);
  220. }
  221. static void
  222. start_dma (App *app)
  223. {
  224. debug_wait ("Start DMA ... ");
  225. WR32_sleep (REG_NUM_ROWS, 0);
  226. WR32_sleep (REG_NUM_FRAMES, 0);
  227. WR32_sleep (REG_CONTROL, 0);
  228. WR32_sleep (REG_ENABLE_COUNTER, 0xFF);
  229. WR32_sleep (REG_ENABLE_COUNTER, 1);
  230. WR32 (REG_START_DMA, 1);
  231. debug_done ();
  232. }
  233. static void
  234. stop_dma (App *app)
  235. {
  236. debug_wait ("Stop DMA ... ");
  237. WR32_sleep (REG_START_DMA, 0);
  238. WR32 (REG_RESET_DMA, 1);
  239. debug_done ();
  240. }
  241. static void
  242. transfer_data (App *app)
  243. {
  244. uint32_t current_ptr;
  245. uint32_t hardware_ptr;
  246. debug_wait ("Transfer data ... ");
  247. current_ptr = 0;
  248. do {
  249. hardware_ptr = app->desc[3]; /* only valid for board gen 3 */
  250. } while (hardware_ptr == current_ptr);
  251. debug_done ();
  252. }
  253. static void
  254. print_mem_words (App *app, cl_mem buffer, unsigned offset, unsigned range)
  255. {
  256. uint32_t *data;
  257. size_t size;
  258. size = range * sizeof (uint32_t);
  259. data = malloc (size);
  260. OCL_CHECK_ERROR (clEnqueueReadBuffer (app->queue, buffer, CL_TRUE, offset, size, data, 0, NULL, NULL));
  261. for (unsigned i = 0; i < range; i++) {
  262. printf ("0x%08x ", data[i]);
  263. }
  264. printf ("\n");
  265. free (data);
  266. }
  267. static void
  268. check_value (App *app, const char *message, uint32_t addr, uint32_t expected)
  269. {
  270. uint32_t value;
  271. value = RD32 (addr);
  272. debug_assert_cmp (message, value, expected);
  273. }
  274. static void
  275. launch_signal (App *app)
  276. {
  277. cl_event event;
  278. uint32_t value;
  279. uintptr_t addr;
  280. uint32_t check[2];
  281. size_t global_work_size;
  282. cl_ulong start, end, queued, submitted;
  283. configure_dma (app);
  284. configure_dma_descriptors (app);
  285. start_dma (app);
  286. transfer_data (app);
  287. addr = 0x9168;
  288. /* try to override defaultvalue */
  289. value = 0xc001;
  290. WR32 (addr, value);
  291. check_value (app, "CPU write check ...", addr, value);
  292. value = 0xdeadf00d;
  293. OCL_CHECK_ERROR (clSetKernelArg (app->kernel, 0, sizeof (cl_mem), &app->fpga_buffer));
  294. OCL_CHECK_ERROR (clSetKernelArg (app->kernel, 1, sizeof (cl_mem), &app->check_buffer));
  295. OCL_CHECK_ERROR (clSetKernelArg (app->kernel, 2, sizeof (uint32_t), &addr));
  296. OCL_CHECK_ERROR (clSetKernelArg (app->kernel, 3, sizeof (uint32_t), &value));
  297. global_work_size = 1;
  298. OCL_CHECK_ERROR (clEnqueueNDRangeKernel (app->queue, app->kernel, 1,
  299. NULL, &global_work_size, NULL,
  300. 0, NULL, &event));
  301. OCL_CHECK_ERROR (clWaitForEvents (1, &event));
  302. ocl_get_event_times (event, &start, &end, &queued, &submitted);
  303. OCL_CHECK_ERROR (clReleaseEvent (event));
  304. /* let's see if the GPU wrote anything */
  305. check_value (app, "GPU write check ...", addr, value);
  306. /* let's see if the kernel did at least something */
  307. printf ("%-32s", "Sanity check ...");
  308. check[0] = check[1] = 0;
  309. OCL_CHECK_ERROR (clEnqueueReadBuffer (app->queue, app->check_buffer, CL_TRUE, 0, 8, check, 0, NULL, NULL));
  310. if (check[0] == addr && check[1] == value)
  311. printf (UNICODE_CHECK_MARK"\n");
  312. else
  313. printf (UNICODE_CROSS" [0x%x != %p || 0x%x != 0x%x]\n", check[0], (void *) addr, check[1], value);
  314. printf ("> exec : %lu ns\n", end - start);
  315. printf ("> submit: %lu ns\n", end - submitted);
  316. printf ("> queue : %lu ns\n", end - queued);
  317. stop_dma (app);
  318. }
  319. int
  320. main (int argc, char const* argv[])
  321. {
  322. App app;
  323. if (!init_pcilib (&app))
  324. return 1;
  325. if (!init_opencl (&app))
  326. return 1;
  327. check_pcie (&app); /* FIXME: Without this, there are no data transfers */
  328. launch_signal (&app);
  329. close_opencl (&app);
  330. close_pcilib (&app);
  331. return 0;
  332. }