cuda_fft.c 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172
  1. #include "cuda_fft.h"
  2. void loop_data_cuda (const char *vendor,
  3. CudaBenchmarkFunc func,
  4. int n_devices,
  5. OutputType outputType,
  6. TimeEntry *time_entries)
  7. {
  8. Timer *timer;
  9. timer = timer_new ();
  10. for (int j = 0; j < n_devices; j++) {
  11. struct cudaDeviceProp device_properties;
  12. cudaGetDeviceProperties(&device_properties, j);
  13. char vendor_name[100];
  14. int v_len = sprintf(vendor_name, "%s (%s)", device_properties.name, vendor);
  15. time_entries[j].lib_name = (char *)malloc(sizeof(char) * (v_len + 1));
  16. strcpy(time_entries[j].lib_name, vendor_name);
  17. time_entries[j].dim_entries = (DimEntry *)malloc(N_DIMS * sizeof(DimEntry));
  18. printf ("Device: %s\n", time_entries[j].lib_name);
  19. fflush (stdout);
  20. cudaThreadExit();
  21. cudaSetDevice(j);
  22. for (int k = 0; k < N_DIMS; k++) {
  23. int dim = DIMS[k];
  24. int power_min = N_POWERS_INTERVALS[k][0];
  25. int power_max = N_POWERS_INTERVALS[k][1];
  26. int num_entries = power_max - power_min + 1;
  27. time_entries[j].dim_entries[k].n_dims = dim;
  28. time_entries[j].dim_entries[k].sizes = (unsigned int **)malloc(sizeof(unsigned int *) * num_entries);
  29. time_entries[j].dim_entries[k].times = (double *)malloc(sizeof(double) * num_entries);
  30. time_entries[j].dim_entries[k].errors = (double *)malloc(sizeof(double) * num_entries);
  31. PRINT_DIM (dim);
  32. fflush (stdout);
  33. for (int m = power_min, i = 0; m <= power_max; m++, i++) {
  34. size_t size_bytes;
  35. cufftComplex *host_orig_mem;
  36. cufftComplex *host_result_mem;
  37. cufftComplex *dev_mem;
  38. cufftComplex *dev_out_mem;
  39. size_t side_size = pow(2,m);
  40. size_t size = pow(side_size,dim);
  41. size_bytes = size * sizeof (cufftComplex);
  42. host_orig_mem = (cufftComplex *)malloc(size_bytes);
  43. host_result_mem = (cufftComplex *)malloc(size_bytes);
  44. for (int l = 0; l < size; l++) {
  45. host_orig_mem[l].x = rand() / ((float) RAND_MAX);
  46. host_orig_mem[l].y = rand() / ((float) RAND_MAX);
  47. }
  48. CUDA_SAFE_CALL (cudaMalloc ((void **)&dev_mem, size_bytes));
  49. CUDA_SAFE_CALL (cudaMalloc ((void **)&dev_out_mem, size_bytes));
  50. PRINT_DIMS(dim, side_size);
  51. fflush (stdout);
  52. double time_sec;
  53. double sum;
  54. bool scale;
  55. printf (".");
  56. fflush (stdout);
  57. CUDA_SAFE_CALL (cudaMemcpy (dev_mem, host_orig_mem, size_bytes, cudaMemcpyHostToDevice));
  58. size_t fft_size[3] = { 1, 1, 1};
  59. for (int l = 0; l < dim; l++) {
  60. fft_size[l] = side_size;
  61. }
  62. scale = func (dev_mem, dev_out_mem, dim, fft_size, N_RUNS, timer);
  63. /* Check precision */
  64. CUDA_SAFE_CALL (cudaMemcpy (host_result_mem, dev_out_mem, size_bytes, cudaMemcpyDeviceToHost));
  65. sum = sum_of_absolute_differences_complex (host_orig_mem, host_result_mem, size, scale);
  66. time_sec = timer_get_seconds (timer) / N_RUNS;
  67. time_entries[j].dim_entries[k].times[i] = get_measurements_with_format(outputType, size_bytes, time_sec);
  68. time_entries[j].dim_entries[k].errors[i] = sum / size;
  69. free (host_orig_mem);
  70. free (host_result_mem);
  71. CUDA_SAFE_CALL (cudaFree (dev_mem));
  72. CUDA_SAFE_CALL (cudaFree (dev_out_mem));
  73. }
  74. printf ("\n");
  75. fflush (stdout);
  76. }
  77. }
  78. printf ("\n");
  79. timer_destroy (timer);
  80. }
  81. bool compute_cuda_fft (cufftComplex *dev_mem,
  82. cufftComplex *out_mem,
  83. int n_dims,
  84. size_t *dims,
  85. int n_runs,
  86. Timer *timer)
  87. {
  88. cufftHandle plan;
  89. int dim_sizes[3] = {1, 1, 1};
  90. switch (n_dims) {
  91. case 1:
  92. dim_sizes[0] = dims[0];
  93. break;
  94. case 2:
  95. dim_sizes[0] = dims[0];
  96. dim_sizes[1] = dims[1];
  97. break;
  98. case 3:
  99. dim_sizes[0] = dims[0];
  100. dim_sizes[1] = dims[1];
  101. dim_sizes[2] = dims[2];
  102. break;
  103. default:
  104. fprintf (stderr, "Unknown FFT dimensions\n");
  105. return true;
  106. }
  107. CUFFT_SAFE_CALL (cufftPlanMany(&plan, n_dims, dim_sizes, NULL, 1, 0, NULL, 1, 0, CUFFT_C2C, 1));
  108. timer_start (timer);
  109. for (int i = 0; i < n_runs; i++) {
  110. CUFFT_SAFE_CALL (cufftExecC2C (plan, (cufftComplex *)dev_mem, (cufftComplex *)out_mem, CUFFT_FORWARD));
  111. CUDA_SAFE_CALL (cudaDeviceSynchronize ());
  112. }
  113. timer_stop (timer);
  114. CUFFT_SAFE_CALL (cufftExecC2C (plan, (cufftComplex *)out_mem, (cufftComplex *)dev_mem, CUFFT_INVERSE));
  115. CUDA_SAFE_CALL (cudaDeviceSynchronize ());
  116. CUDA_SAFE_CALL (cudaMemcpy (out_mem, dev_mem, dim_sizes[0] * dim_sizes[1] * dim_sizes[2] * sizeof(cufftComplex), cudaMemcpyDeviceToDevice));
  117. CUFFT_SAFE_CALL (cufftDestroy (plan));
  118. return true;
  119. }
  120. double sum_of_absolute_differences_complex (cufftComplex *a, cufftComplex *b, int n, bool scale)
  121. {
  122. double sum = 0.0;
  123. for (int i = 0; i < n; i++) {
  124. sum += fabs (a[i].x - b[i].x / ((float)n));
  125. sum += fabs (a[i].y - b[i].y / ((float)n));
  126. }
  127. return sum;
  128. }