opencl_fft.c 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272
  1. #include <stdio.h>
  2. #include <stdlib.h>
  3. #include "opencl_fft.h"
  /*
   * Sum of absolute element-wise differences between `a` and `b`.
   *
   * a, b:  arrays holding at least `n` floats each.
   * n:     number of floats to compare; a non-positive `n` yields 0.0.
   * scale: when true, every b[i] is divided by n / 2 before it is compared
   *        with a[i]; when false, `b` is compared unchanged.
   *
   * Returns the accumulated sum as a double.
   */
  double sum_of_absolute_differences (float *a, float *b, int n, bool scale)
  {
      /* Dividing by 1.0 is exact, so the unscaled path leaves b[i] untouched. */
      const double divisor = scale ? n / 2. : 1.0;
      double sum = 0.0;

      for (int i = 0; i < n; i++) {
          sum += fabs (a[i] - b[i] / divisor);
      }

      return sum;
  }
  11. void loop_data_opencl (const char *vendor,
  12. OclBenchmarkFunc func,
  13. cl_context context,
  14. cl_command_queue *queues,
  15. int n_devices,
  16. OutputType outputType,
  17. TimeEntry *time_entries)
  18. {
  19. Timer *timer;
  20. cl_int err;
  21. char *device_name;
  22. size_t device_name_size;
  23. cl_device_id *devices;
  24. size_t device_list_size;
  25. clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &device_list_size);
  26. devices = (cl_device_id *) malloc(device_list_size);
  27. clGetContextInfo(context, CL_CONTEXT_DEVICES, device_list_size, devices, NULL);
  28. timer = timer_new ();
  29. for (int j = 0; j < n_devices; j++) {
  30. clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &device_name_size);
  31. device_name = (char *) malloc(device_name_size);
  32. clGetDeviceInfo(devices[j], CL_DEVICE_NAME, device_name_size, device_name, NULL);
  33. char vendor_name[100];
  34. int v_len = sprintf(vendor_name, "%s (%s)", device_name, vendor);
  35. time_entries[j].lib_name = (char *)malloc(sizeof(char) * (v_len + 1));
  36. strcpy(time_entries[j].lib_name, vendor_name);
  37. time_entries[j].dim_entries = (DimEntry *)malloc(N_DIMS * sizeof(DimEntry));
  38. printf ("Device: %s\n", time_entries[j].lib_name);
  39. fflush (stdout);
  40. for (int k = 0; k < N_DIMS; k++) {
  41. int dim = DIMS[k];
  42. int power_min = N_POWERS_INTERVALS[k][0];
  43. int power_max = N_POWERS_INTERVALS[k][1];
  44. int num_entries = power_max - power_min + 1;
  45. time_entries[j].dim_entries[k].n_dims = dim;
  46. time_entries[j].dim_entries[k].sizes = (unsigned int **)malloc(sizeof(unsigned int *) * num_entries);
  47. time_entries[j].dim_entries[k].times = (double *)malloc(sizeof(double) * num_entries);
  48. time_entries[j].dim_entries[k].errors = (double *)malloc(sizeof(double) * num_entries);
  49. PRINT_DIM (dim);
  50. fflush (stdout);
  51. for (int m = power_min, i = 0; m <= power_max; m++, i++) {
  52. size_t size_bytes;
  53. float *host_orig_mem;
  54. float *host_result_mem;
  55. cl_mem dev_mem;
  56. cl_mem dev_out_mem;
  57. size_t side_size = pow(2,m);
  58. size_t size = pow(side_size,dim);
  59. size_bytes = size * 2 * sizeof (float);
  60. host_orig_mem = malloc (size_bytes);
  61. host_result_mem = malloc (size_bytes);
  62. for (int l = 0; l < size * 2; l++) {
  63. host_orig_mem[l] = rand() / ((float) RAND_MAX);
  64. }
  65. dev_mem = clCreateBuffer (context, CL_MEM_READ_WRITE, size_bytes, NULL, &err);
  66. OCL_CHECK_ERROR (err);
  67. dev_out_mem = clCreateBuffer (context, CL_MEM_READ_WRITE, size_bytes, NULL, &err);
  68. OCL_CHECK_ERROR (err);
  69. PRINT_DIMS(dim, side_size);
  70. fflush (stdout);
  71. double time_sec;
  72. double sum;
  73. bool scale;
  74. printf (".");
  75. fflush (stdout);
  76. OCL_CHECK_ERROR (clEnqueueWriteBuffer (queues[j], dev_mem, CL_TRUE, 0, size_bytes, host_orig_mem, 0, NULL, NULL));
  77. size_t fft_size[3] = { 1, 1, 1};
  78. time_entries[j].dim_entries[k].sizes[i] = (unsigned int *)malloc(sizeof(unsigned int) * dim);
  79. for (int l = 0; l < dim; l++) {
  80. fft_size[l] = side_size;
  81. time_entries[j].dim_entries[k].sizes[i][j] = side_size;
  82. }
  83. scale = func (context, queues[j], dev_mem, dev_out_mem, dim, fft_size, N_RUNS, timer);
  84. /* Check precision */
  85. OCL_CHECK_ERROR (clEnqueueReadBuffer (queues[j], dev_out_mem, CL_TRUE, 0, size_bytes, host_result_mem, 0, NULL, NULL));
  86. sum = sum_of_absolute_differences (host_orig_mem, host_result_mem, size * 2, scale);
  87. time_sec = timer_get_seconds (timer) / N_RUNS;
  88. time_entries[j].dim_entries[k].times[i] = get_measurements_with_format(outputType, size_bytes, time_sec);
  89. time_entries[j].dim_entries[k].errors[i] = sum / size;
  90. free (host_orig_mem);
  91. free (host_result_mem);
  92. OCL_CHECK_ERROR (clReleaseMemObject (dev_mem));
  93. OCL_CHECK_ERROR (clReleaseMemObject (dev_out_mem));
  94. }
  95. printf ("\n");
  96. fflush (stdout);
  97. }
  98. }
  99. printf ("\n");
  100. timer_destroy (timer);
  101. }
  102. #ifdef HAVE_AMD_FFT
  103. bool compute_amd_fft (cl_context context,
  104. cl_command_queue queue,
  105. cl_mem dev_mem,
  106. cl_mem out_mem,
  107. int n_dims,
  108. size_t *dims,
  109. int n_runs,
  110. Timer *timer)
  111. {
  112. clfftSetupData setup;
  113. clfftPlanHandle plan;
  114. clfftDim dim;
  115. cl_event event;
  116. size_t size;
  117. switch (n_dims) {
  118. case 1:
  119. dim = CLFFT_1D;
  120. break;
  121. case 2:
  122. dim = CLFFT_2D;
  123. break;
  124. case 3:
  125. dim = CLFFT_3D;
  126. break;
  127. default:
  128. fprintf (stderr, "Unknown FFT dimensions\n");
  129. return false;
  130. }
  131. OCL_CHECK_ERROR (clfftSetup (&setup));
  132. OCL_CHECK_ERROR (clfftCreateDefaultPlan (&plan, context, dim, dims));
  133. OCL_CHECK_ERROR (clfftSetPlanPrecision (plan, CLFFT_SINGLE));
  134. OCL_CHECK_ERROR (clfftSetLayout (plan, CLFFT_COMPLEX_INTERLEAVED, CLFFT_COMPLEX_INTERLEAVED));
  135. OCL_CHECK_ERROR (clfftSetResultLocation (plan, CLFFT_OUTOFPLACE));
  136. OCL_CHECK_ERROR (clfftBakePlan (plan, 1, &queue, NULL, NULL));
  137. timer_start (timer);
  138. for (int i = 0; i < n_runs; i++) {
  139. OCL_CHECK_ERROR (clfftEnqueueTransform (plan, CLFFT_FORWARD, 1, &queue, 0, NULL, &event, &dev_mem, &out_mem, NULL));
  140. OCL_CHECK_ERROR (clWaitForEvents (1, &event));
  141. OCL_CHECK_ERROR (clReleaseEvent (event));
  142. }
  143. timer_stop (timer);
  144. OCL_CHECK_ERROR (clfftEnqueueTransform (plan, CLFFT_BACKWARD, 1, &queue, 0, NULL, &event, &out_mem, &dev_mem, NULL));
  145. OCL_CHECK_ERROR (clWaitForEvents (1, &event));
  146. OCL_CHECK_ERROR (clReleaseEvent (event));
  147. /*
  148. * We rely on the fact, that out_mem contains the inverse which currently
  149. * lies in dev_mem, so let's copy it back.
  150. */
  151. OCL_CHECK_ERROR (clGetMemObjectInfo (dev_mem, CL_MEM_SIZE, sizeof (size_t), &size, NULL));
  152. OCL_CHECK_ERROR (clEnqueueCopyBuffer (queue, dev_mem, out_mem, 0, 0, size, 0, NULL, &event));
  153. OCL_CHECK_ERROR (clWaitForEvents (1, &event));
  154. OCL_CHECK_ERROR (clReleaseEvent (event));
  155. OCL_CHECK_ERROR (clfftDestroyPlan (&plan));
  156. clfftTeardown ();
  157. return false;
  158. }
  159. #endif
  160. #ifdef HAVE_APPLE_FFT
  161. bool compute_apple_fft (cl_context context,
  162. cl_command_queue queue,
  163. cl_mem dev_mem,
  164. cl_mem out_mem,
  165. int n_dims,
  166. size_t *dims,
  167. int n_runs,
  168. Timer *timer)
  169. {
  170. clFFT_Plan plan;
  171. clFFT_Dimension dim;
  172. clFFT_Dim3 dim_sizes = {.x = 1, .y = 1, .z = 1};
  173. cl_event event;
  174. cl_int err;
  175. size_t size;
  176. switch (n_dims) {
  177. case 1:
  178. dim = clFFT_1D;
  179. dim_sizes.x = dims[0];
  180. break;
  181. case 2:
  182. dim = clFFT_2D;
  183. dim_sizes.x = dims[0];
  184. dim_sizes.y = dims[1];
  185. break;
  186. case 3:
  187. dim = clFFT_3D;
  188. dim_sizes.x = dims[0];
  189. dim_sizes.y = dims[1];
  190. dim_sizes.z = dims[2];
  191. break;
  192. default:
  193. fprintf (stderr, "Unknown FFT dimensions\n");
  194. return true;
  195. }
  196. plan = clFFT_CreatePlan (context, dim_sizes, dim, clFFT_InterleavedComplexFormat, &err);
  197. OCL_CHECK_ERROR (err);
  198. timer_start (timer);
  199. for (int i = 0; i < n_runs; i++) {
  200. err = clFFT_ExecuteInterleaved (queue, plan, 1, clFFT_Forward, dev_mem, out_mem,
  201. 0, NULL, NULL);
  202. OCL_CHECK_ERROR (err);
  203. /* Apple FFT does not return events, hence we need the hammer */
  204. OCL_CHECK_ERROR (clFinish (queue));
  205. }
  206. timer_stop (timer);
  207. err = clFFT_ExecuteInterleaved (queue, plan, 1, clFFT_Inverse, out_mem, dev_mem, 0, NULL, NULL);
  208. OCL_CHECK_ERROR (err);
  209. OCL_CHECK_ERROR (clFinish (queue));
  210. OCL_CHECK_ERROR (clGetMemObjectInfo (dev_mem, CL_MEM_SIZE, sizeof (size_t), &size, NULL));
  211. OCL_CHECK_ERROR (clEnqueueCopyBuffer (queue, dev_mem, out_mem, 0, 0, size, 0, NULL, &event));
  212. OCL_CHECK_ERROR (clWaitForEvents (1, &event));
  213. OCL_CHECK_ERROR (clReleaseEvent (event));
  214. clFFT_DestroyPlan (plan);
  215. return true;
  216. }
  217. #endif