|
@@ -11,855 +11,18 @@
|
|
|
#endif
|
|
|
|
|
|
#ifdef HAVE_OPENCL
|
|
|
-#include <CL/cl.h>
|
|
|
-#endif
|
|
|
-
|
|
|
-#ifdef HAVE_CUDA_FFT
|
|
|
-#include <cufft.h>
|
|
|
-#endif
|
|
|
-
|
|
|
-#ifdef HAVE_CUDA
|
|
|
-#include <cuda.h>
|
|
|
-#include <cuda_runtime.h>
|
|
|
-#include <cuda_runtime_api.h>
|
|
|
-
|
|
|
-#define CUDA_SAFE_CALL( call) do { \
|
|
|
-cudaError_t err = call; \
|
|
|
-if (cudaSuccess != err) { \
|
|
|
-fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
|
|
|
-__FILE__, __LINE__, cudaGetErrorString( err) ); \
|
|
|
-exit(EXIT_FAILURE); \
|
|
|
-} \
|
|
|
-} while (0)
|
|
|
-
|
|
|
-#define CUFFT_SAFE_CALL( call) do { \
|
|
|
-cufftResult err = call; \
|
|
|
-if (err != CUFFT_SUCCESS) { \
|
|
|
-fprintf(stderr, "Cufft error in file '%s' in line %i : %s.\n", \
|
|
|
-__FILE__, __LINE__, "error" ); \
|
|
|
-exit(EXIT_FAILURE); \
|
|
|
-} \
|
|
|
-} while (0)
|
|
|
-#endif
|
|
|
-
|
|
|
-#ifdef HAVE_AMD_FFT
|
|
|
-#include <clFFT.h>
|
|
|
-#endif
|
|
|
-
|
|
|
-#ifdef HAVE_APPLE_FFT
|
|
|
-#include <oclfft/clFFT.h>
|
|
|
+#include "opencl_fft.h"
|
|
|
#endif
|
|
|
|
|
|
#ifdef HAVE_FFTW
|
|
|
-#include <fftw3.h>
|
|
|
-#endif
|
|
|
-
|
|
|
-#include "timer.h"
|
|
|
-#include "time_entry.h"
|
|
|
-
|
|
|
-#define N_DIMS 3
|
|
|
-#define MIN_POW2 1
|
|
|
-#define MAX_POW2 15
|
|
|
-#define MIN_RUNS 1
|
|
|
-#define MAX_RUNS 100
|
|
|
-
|
|
|
-const int DIMS[N_DIMS] = {1, 2, 3};
|
|
|
-
|
|
|
-static int N_RUNS = 4;
|
|
|
-static int N_POWERS_INTERVALS[N_DIMS][2] = {{5, 11}, {8, 11}, {7, 7}};
|
|
|
-
|
|
|
-typedef enum _OutputType {
|
|
|
- OUT_MILLISECONDS,
|
|
|
- OUT_SECONDS,
|
|
|
- OUT_MFLOPS,
|
|
|
- OUT_GFLOPS,
|
|
|
- OUT_THROUGHTPUT_GBS,
|
|
|
- OUT_THROUGHTPUT_MBS,
|
|
|
- OUT_NONE
|
|
|
-} OutputType;
|
|
|
-
|
|
|
-#define PRINT_DIM(dim) printf ("%dD:", dim);
|
|
|
-
|
|
|
-#define PRINT_DIM_SIZE(side_size,dim) { \
|
|
|
- printf(" %zu", side_size); while (dim != 1) { printf("x%zu", side_size);dim--; } printf("."); }
|
|
|
-
|
|
|
-#ifdef HAVE_OPENCL
|
|
|
-#define OCL_CHECK_ERROR(error) { \
|
|
|
- if ((error) != CL_SUCCESS) fprintf (stderr, "OpenCL error <%s:%i>\n", __FILE__, __LINE__); }
|
|
|
-
|
|
|
-#define PRINT_DIMS(dim,side_size) { \
|
|
|
- if (dim == 1) { printf (" %zu", side_size); } \
|
|
|
- else if (dim == 2) { printf (" %zux%zu", side_size, side_size); } \
|
|
|
- else { printf (" %zux%zux%zu", side_size, side_size, side_size); } }
|
|
|
-
|
|
|
-typedef bool (*OclBenchmarkFunc) (cl_context context, cl_command_queue queue, cl_mem dev_mem, cl_mem dev_out_mem, int n_dims, size_t *dims, int n_runs, Timer *timer);
|
|
|
-
|
|
|
-#ifdef HAVE_CUDA
|
|
|
-typedef bool (*CudaBenchmarkFunc) (cufftComplex *dev_mem, cufftComplex *dev_out_mem, int n_dims, size_t *dims, int n_runs, Timer *timer);
|
|
|
-#endif
|
|
|
-
|
|
|
-static double
|
|
|
-sum_of_absolute_differences (float *a, float *b, int n, bool scale)
|
|
|
-{
|
|
|
- double sum = 0.0;
|
|
|
-
|
|
|
- for (int i = 0; i < n; i++)
|
|
|
- sum += fabs (a[i] - b[i] / (n / 2.));
|
|
|
-
|
|
|
- return sum;
|
|
|
-}
|
|
|
-
|
|
|
-#ifdef HAVE_CUDA
|
|
|
-static double
|
|
|
-sum_of_absolute_differences_complex (cufftComplex *a, cufftComplex *b, int n, bool scale)
|
|
|
-{
|
|
|
- double sum = 0.0;
|
|
|
-
|
|
|
- for (int i = 0; i < n; i++) {
|
|
|
- sum += fabs (a[i].x - b[i].x / ((float)n));
|
|
|
- sum += fabs (a[i].y - b[i].y / ((float)n));
|
|
|
- }
|
|
|
-
|
|
|
- return sum;
|
|
|
-}
|
|
|
-#endif
|
|
|
-
|
|
|
-static double
|
|
|
-get_measurements_with_format (OutputType outputType, size_t size_bytes, double time_sec)
|
|
|
-{
|
|
|
- double out_result = -1;
|
|
|
-
|
|
|
- if (outputType == OUT_MFLOPS) {
|
|
|
- size_t size = size_bytes / 2 / sizeof (float);
|
|
|
- out_result = 5 * size * log (size) / log (2) / (time_sec / 1000.0);
|
|
|
- }
|
|
|
- else if (outputType == OUT_GFLOPS) {
|
|
|
- out_result = -1;
|
|
|
- }
|
|
|
- else if (outputType == OUT_THROUGHTPUT_MBS) {
|
|
|
- out_result = ((double)size_bytes) / time_sec / 1000.0 / 1000.0;
|
|
|
- }
|
|
|
- else if (outputType == OUT_THROUGHTPUT_GBS) {
|
|
|
- out_result = ((double)size_bytes) / time_sec / 1000.0 / 1000.0 / 1000.0;
|
|
|
- }
|
|
|
- else if (outputType == OUT_MILLISECONDS) {
|
|
|
- out_result = time_sec * 1000.0;
|
|
|
- }
|
|
|
- else if (outputType == OUT_SECONDS) {
|
|
|
- out_result = time_sec * 1000.0;
|
|
|
- }
|
|
|
- else {
|
|
|
- fprintf (stderr, "Unknown output type of OpenCL routines!\n");
|
|
|
- }
|
|
|
-
|
|
|
- return out_result;
|
|
|
-}
|
|
|
-
|
|
|
-static void
|
|
|
-loop_data_opencl (const char *vendor,
|
|
|
- OclBenchmarkFunc func,
|
|
|
- cl_context context,
|
|
|
- cl_command_queue *queues,
|
|
|
- int n_devices,
|
|
|
- OutputType outputType,
|
|
|
- TimeEntry *time_entries)
|
|
|
-{
|
|
|
- Timer *timer;
|
|
|
- cl_int err;
|
|
|
- char *device_name;
|
|
|
- size_t device_name_size;
|
|
|
- cl_device_id *devices;
|
|
|
- size_t device_list_size;
|
|
|
-
|
|
|
- clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &device_list_size);
|
|
|
- devices = (cl_device_id *) malloc(device_list_size);
|
|
|
- clGetContextInfo(context, CL_CONTEXT_DEVICES, device_list_size, devices, NULL);
|
|
|
-
|
|
|
- timer = timer_new ();
|
|
|
-
|
|
|
- for (int j = 0; j < n_devices; j++) {
|
|
|
- clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &device_name_size);
|
|
|
- device_name = (char *) malloc(device_name_size);
|
|
|
- clGetDeviceInfo(devices[j], CL_DEVICE_NAME, device_name_size, device_name, NULL);
|
|
|
-
|
|
|
- char vendor_name[100];
|
|
|
- int v_len = sprintf(vendor_name, "%s (%s)", device_name, vendor);
|
|
|
- time_entries[j].lib_name = (char *)malloc(sizeof(char) * (v_len + 1));
|
|
|
- strcpy(time_entries[j].lib_name, vendor_name);
|
|
|
-
|
|
|
- time_entries[j].dim_entries = (DimEntry *)malloc(N_DIMS * sizeof(DimEntry));
|
|
|
-
|
|
|
- if (n_devices > 1) {
|
|
|
- printf ("Device: %s\n", time_entries[j].lib_name);
|
|
|
- fflush (stdout);
|
|
|
- }
|
|
|
-
|
|
|
- for (int k = 0; k < N_DIMS; k++) {
|
|
|
- int dim = DIMS[k];
|
|
|
- int power_min = N_POWERS_INTERVALS[k][0];
|
|
|
- int power_max = N_POWERS_INTERVALS[k][1];
|
|
|
- int num_entries = power_max - power_min + 1;
|
|
|
-
|
|
|
- time_entries[j].dim_entries[k].n_dims = dim;
|
|
|
- time_entries[j].dim_entries[k].sizes = (unsigned int **)malloc(sizeof(unsigned int *) * num_entries);
|
|
|
- time_entries[j].dim_entries[k].times = (double *)malloc(sizeof(double) * num_entries);
|
|
|
- time_entries[j].dim_entries[k].errors = (double *)malloc(sizeof(double) * num_entries);
|
|
|
-
|
|
|
- PRINT_DIM (dim);
|
|
|
- fflush (stdout);
|
|
|
-
|
|
|
- for (int m = power_min, i = 0; m <= power_max; m++, i++) {
|
|
|
- size_t size_bytes;
|
|
|
- float *host_orig_mem;
|
|
|
- float *host_result_mem;
|
|
|
- cl_mem dev_mem;
|
|
|
- cl_mem dev_out_mem;
|
|
|
-
|
|
|
- size_t side_size = pow(2,m);
|
|
|
- size_t size = pow(side_size,dim);
|
|
|
-
|
|
|
- size_bytes = size * 2 * sizeof (float);
|
|
|
- host_orig_mem = malloc (size_bytes);
|
|
|
- host_result_mem = malloc (size_bytes);
|
|
|
-
|
|
|
- for (int l = 0; l < size * 2; l++) {
|
|
|
- host_orig_mem[l] = rand() / ((float) RAND_MAX);
|
|
|
- }
|
|
|
-
|
|
|
- dev_mem = clCreateBuffer (context, CL_MEM_READ_WRITE, size_bytes, NULL, &err);
|
|
|
- OCL_CHECK_ERROR (err);
|
|
|
-
|
|
|
- dev_out_mem = clCreateBuffer (context, CL_MEM_READ_WRITE, size_bytes, NULL, &err);
|
|
|
- OCL_CHECK_ERROR (err);
|
|
|
-
|
|
|
- PRINT_DIMS(dim, side_size);
|
|
|
-
|
|
|
- fflush (stdout);
|
|
|
-
|
|
|
- double time_sec;
|
|
|
- double sum;
|
|
|
- bool scale;
|
|
|
-
|
|
|
- printf (".");
|
|
|
- fflush (stdout);
|
|
|
-
|
|
|
- OCL_CHECK_ERROR (clEnqueueWriteBuffer (queues[j], dev_mem, CL_TRUE, 0, size_bytes, host_orig_mem, 0, NULL, NULL));
|
|
|
-
|
|
|
- size_t fft_size[3] = { 1, 1, 1};
|
|
|
- time_entries[j].dim_entries[k].sizes[i] = (unsigned int *)malloc(sizeof(unsigned int) * dim);
|
|
|
-
|
|
|
- for (int l = 0; l < dim; l++) {
|
|
|
- fft_size[l] = side_size;
|
|
|
- time_entries[j].dim_entries[k].sizes[i][j] = side_size;
|
|
|
- }
|
|
|
-
|
|
|
- scale = func (context, queues[j], dev_mem, dev_out_mem, dim, fft_size, N_RUNS, timer);
|
|
|
-
|
|
|
- /* Check precision */
|
|
|
- OCL_CHECK_ERROR (clEnqueueReadBuffer (queues[j], dev_out_mem, CL_TRUE, 0, size_bytes, host_result_mem, 0, NULL, NULL));
|
|
|
- sum = sum_of_absolute_differences (host_orig_mem, host_result_mem, size * 2, scale);
|
|
|
-
|
|
|
- time_sec = timer_get_seconds (timer) / N_RUNS;
|
|
|
-
|
|
|
- time_entries[j].dim_entries[k].times[i] = get_measurements_with_format(outputType, size_bytes, time_sec);
|
|
|
- time_entries[j].dim_entries[k].errors[i] = sum / size;
|
|
|
-
|
|
|
- free (host_orig_mem);
|
|
|
- free (host_result_mem);
|
|
|
- OCL_CHECK_ERROR (clReleaseMemObject (dev_mem));
|
|
|
- OCL_CHECK_ERROR (clReleaseMemObject (dev_out_mem));
|
|
|
- }
|
|
|
-
|
|
|
- printf ("\n");
|
|
|
- fflush (stdout);
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- printf ("\n");
|
|
|
- timer_destroy (timer);
|
|
|
-}
|
|
|
-#endif
|
|
|
-
|
|
|
-#ifdef HAVE_CUDA
|
|
|
-static void
|
|
|
-loop_data_cuda (const char *vendor,
|
|
|
- CudaBenchmarkFunc func,
|
|
|
- int n_devices,
|
|
|
- OutputType outputType,
|
|
|
- TimeEntry *time_entries)
|
|
|
-{
|
|
|
- Timer *timer;
|
|
|
-
|
|
|
- timer = timer_new ();
|
|
|
-
|
|
|
- for (int j = 0; j < n_devices; j++) {
|
|
|
- char vendor_name[50];
|
|
|
- int v_len = sprintf(vendor_name, "%s_%d", vendor, j);
|
|
|
- time_entries[j].lib_name = (char *)malloc(sizeof(char) * (v_len + 1));
|
|
|
- strcpy(time_entries[j].lib_name, vendor_name);
|
|
|
-
|
|
|
- time_entries[j].dim_entries = (DimEntry *)malloc(N_DIMS * sizeof(DimEntry));
|
|
|
-
|
|
|
- for (int k = 0; k < N_DIMS; k++) {
|
|
|
- int dim = DIMS[k];
|
|
|
- int power_min = N_POWERS_INTERVALS[k][0];
|
|
|
- int power_max = N_POWERS_INTERVALS[k][1];
|
|
|
- int num_entries = power_max - power_min + 1;
|
|
|
-
|
|
|
- time_entries[j].dim_entries[k].n_dims = dim;
|
|
|
- time_entries[j].dim_entries[k].sizes = (unsigned int **)malloc(sizeof(unsigned int *) * num_entries);
|
|
|
- time_entries[j].dim_entries[k].times = (double *)malloc(sizeof(double) * num_entries);
|
|
|
- time_entries[j].dim_entries[k].errors = (double *)malloc(sizeof(double) * num_entries);
|
|
|
-
|
|
|
- PRINT_DIM (dim);
|
|
|
- fflush (stdout);
|
|
|
-
|
|
|
- for (int m = power_min, i = 0; m <= power_max; m++, i++) {
|
|
|
- size_t size_bytes;
|
|
|
- cufftComplex *host_orig_mem;
|
|
|
- cufftComplex *host_result_mem;
|
|
|
- cufftComplex *dev_mem;
|
|
|
- cufftComplex *dev_out_mem;
|
|
|
-
|
|
|
- size_t side_size = pow(2,m);
|
|
|
- size_t size = pow(side_size,dim);
|
|
|
-
|
|
|
- size_bytes = size * sizeof (cufftComplex);
|
|
|
- host_orig_mem = (cufftComplex *)malloc(size_bytes);
|
|
|
- host_result_mem = (cufftComplex *)malloc(size_bytes);
|
|
|
-
|
|
|
- for (int l = 0; l < size; l++) {
|
|
|
- host_orig_mem[l].x = rand() / ((float) RAND_MAX);
|
|
|
- host_orig_mem[l].y = rand() / ((float) RAND_MAX);
|
|
|
- }
|
|
|
-
|
|
|
- CUDA_SAFE_CALL (cudaMalloc ((void **)&dev_mem, size_bytes));
|
|
|
- CUDA_SAFE_CALL (cudaMalloc ((void **)&dev_out_mem, size_bytes));
|
|
|
-
|
|
|
- PRINT_DIMS(dim, side_size);
|
|
|
-
|
|
|
- fflush (stdout);
|
|
|
-
|
|
|
- double time_sec;
|
|
|
- double sum;
|
|
|
- bool scale;
|
|
|
-
|
|
|
- printf (".");
|
|
|
- fflush (stdout);
|
|
|
-
|
|
|
- CUDA_SAFE_CALL (cudaMemcpy (dev_mem, host_orig_mem, size_bytes, cudaMemcpyHostToDevice));
|
|
|
-
|
|
|
- size_t fft_size[3] = { 1, 1, 1};
|
|
|
- for (int l = 0; l < dim; l++) {
|
|
|
- fft_size[l] = side_size;
|
|
|
- }
|
|
|
-
|
|
|
- scale = func (dev_mem, dev_out_mem, dim, fft_size, N_RUNS, timer);
|
|
|
-
|
|
|
- /* Check precision */
|
|
|
- CUDA_SAFE_CALL (cudaMemcpy (host_result_mem, dev_out_mem, size_bytes, cudaMemcpyDeviceToHost));
|
|
|
- sum = sum_of_absolute_differences_complex (host_orig_mem, host_result_mem, size, scale);
|
|
|
-
|
|
|
- time_sec = timer_get_seconds (timer) / N_RUNS;
|
|
|
-
|
|
|
- time_entries[j].dim_entries[k].times[i] = get_measurements_with_format(outputType, size_bytes, time_sec);
|
|
|
- time_entries[j].dim_entries[k].errors[i] = sum / size;
|
|
|
-
|
|
|
- free (host_orig_mem);
|
|
|
- free (host_result_mem);
|
|
|
-
|
|
|
- CUDA_SAFE_CALL (cudaFree (dev_mem));
|
|
|
- CUDA_SAFE_CALL (cudaFree (dev_out_mem));
|
|
|
- }
|
|
|
-
|
|
|
- printf ("\n");
|
|
|
- fflush (stdout);
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- printf ("\n");
|
|
|
- timer_destroy (timer);
|
|
|
-}
|
|
|
-#endif
|
|
|
-
|
|
|
-#ifdef HAVE_FFTW
|
|
|
-static void
|
|
|
-loop_data_fftw (OutputType outputType, TimeEntry *time_entry)
|
|
|
-{
|
|
|
- Timer *timer;
|
|
|
-
|
|
|
- timer = timer_new ();
|
|
|
-
|
|
|
- time_entry->lib_name = "FFTW";
|
|
|
- time_entry->dim_entries = (DimEntry *)malloc(N_DIMS * sizeof(DimEntry));
|
|
|
-
|
|
|
- for (int k = 0; k < N_DIMS; k++) {
|
|
|
- int dim = DIMS[k];
|
|
|
- int power_min = N_POWERS_INTERVALS[k][0];
|
|
|
- int power_max = N_POWERS_INTERVALS[k][1];
|
|
|
- int num_entries = power_max - power_min + 1;
|
|
|
-
|
|
|
- time_entry->dim_entries[k].n_dims = dim;
|
|
|
- time_entry->dim_entries[k].sizes = (unsigned int **)malloc(sizeof(unsigned int *) * num_entries);
|
|
|
- time_entry->dim_entries[k].times = (double *)malloc(sizeof(double) * num_entries);
|
|
|
- time_entry->dim_entries[k].errors = (double *)malloc(sizeof(double) * num_entries);
|
|
|
-
|
|
|
- PRINT_DIM (dim);
|
|
|
- fflush (stdout);
|
|
|
-
|
|
|
- for (int m = power_min, i = 0; m <= power_max; m++, i++) {
|
|
|
- fftw_complex *host_orig_mem;
|
|
|
- fftw_complex *host_result_mem;
|
|
|
- fftw_complex *host_immediate_mem;
|
|
|
- fftw_plan plan;
|
|
|
- fftw_plan inverse_plan;
|
|
|
- double time_sec;
|
|
|
- double sum = 0.0;
|
|
|
-
|
|
|
- size_t side_size = pow(2,m);
|
|
|
- size_t size = pow(side_size,dim);
|
|
|
- size_t size_bytes = sizeof (fftw_complex) * size;
|
|
|
-
|
|
|
- host_orig_mem = fftw_malloc (size_bytes);
|
|
|
- host_immediate_mem = fftw_malloc (size_bytes);
|
|
|
- host_result_mem = fftw_malloc (size_bytes);
|
|
|
-
|
|
|
- switch (dim) {
|
|
|
- case 1:
|
|
|
- plan = fftw_plan_dft_1d (side_size,
|
|
|
- host_orig_mem, host_immediate_mem,
|
|
|
- FFTW_FORWARD, FFTW_ESTIMATE);
|
|
|
- inverse_plan = fftw_plan_dft_1d (side_size,
|
|
|
- host_immediate_mem,
|
|
|
- host_result_mem,
|
|
|
- FFTW_BACKWARD, FFTW_ESTIMATE);
|
|
|
- break;
|
|
|
- case 2:
|
|
|
- plan = fftw_plan_dft_2d (side_size, side_size,
|
|
|
- host_orig_mem, host_immediate_mem,
|
|
|
- FFTW_FORWARD, FFTW_ESTIMATE);
|
|
|
- inverse_plan = fftw_plan_dft_2d (side_size, side_size,
|
|
|
- host_immediate_mem, host_result_mem,
|
|
|
- FFTW_BACKWARD, FFTW_ESTIMATE);
|
|
|
- break;
|
|
|
- case 3:
|
|
|
- plan = fftw_plan_dft_3d (side_size, side_size, side_size,
|
|
|
- host_orig_mem, host_immediate_mem,
|
|
|
- FFTW_FORWARD, FFTW_ESTIMATE);
|
|
|
- inverse_plan = fftw_plan_dft_3d (side_size, side_size, side_size,
|
|
|
- host_immediate_mem, host_result_mem,
|
|
|
- FFTW_BACKWARD, FFTW_ESTIMATE);
|
|
|
- break;
|
|
|
- default:
|
|
|
- fprintf (stderr, "Unknown FFT dimensions\n");
|
|
|
- return;
|
|
|
- }
|
|
|
-
|
|
|
- for (int j = 0; j < size; j++) {
|
|
|
- host_orig_mem[j][0] = rand() / ((double) RAND_MAX);
|
|
|
- host_orig_mem[j][1] = rand() / ((double) RAND_MAX);
|
|
|
- }
|
|
|
-
|
|
|
- PRINT_DIMS(dim, side_size);
|
|
|
-
|
|
|
- fflush (stdout);
|
|
|
-
|
|
|
- printf (".");
|
|
|
- fflush (stdout);
|
|
|
-
|
|
|
- timer_start (timer);
|
|
|
-
|
|
|
- for (int j = 0; j < N_RUNS; j++) {
|
|
|
- fftw_execute (plan);
|
|
|
- }
|
|
|
-
|
|
|
- timer_stop (timer);
|
|
|
-
|
|
|
- /* Check precision */
|
|
|
- fftw_execute (inverse_plan);
|
|
|
-
|
|
|
- for (int j = 0; j < size; j++) {
|
|
|
- sum += fabs (host_result_mem[j][0] / size - host_orig_mem[j][0]);
|
|
|
- sum += fabs (host_result_mem[j][1] / size - host_orig_mem[j][1]);
|
|
|
- }
|
|
|
-
|
|
|
- time_sec = timer_get_seconds (timer) / N_RUNS;
|
|
|
-
|
|
|
- time_entry->dim_entries[k].sizes[i] = (unsigned int *)malloc(sizeof(unsigned int) * dim);
|
|
|
- for (int j = 0; j < dim; j++) {
|
|
|
- time_entry->dim_entries[k].sizes[i][j] = side_size;
|
|
|
- }
|
|
|
-
|
|
|
- time_entry->dim_entries[k].times[i] = get_measurements_with_format(outputType, size_bytes, time_sec);
|
|
|
- time_entry->dim_entries[k].errors[i] = sum / size;
|
|
|
-
|
|
|
- fftw_destroy_plan (inverse_plan);
|
|
|
- fftw_destroy_plan (plan);
|
|
|
- fftw_free (host_orig_mem);
|
|
|
- fftw_free (host_immediate_mem);
|
|
|
- fftw_free (host_result_mem);
|
|
|
- }
|
|
|
- printf ("\n");
|
|
|
- fflush (stdout);
|
|
|
- }
|
|
|
-
|
|
|
- printf ("\n");
|
|
|
- timer_destroy (timer);
|
|
|
-}
|
|
|
-#endif
|
|
|
-
|
|
|
-#ifdef HAVE_AMD_FFT
|
|
|
-static bool
|
|
|
-compute_amd_fft (cl_context context,
|
|
|
- cl_command_queue queue,
|
|
|
- cl_mem dev_mem,
|
|
|
- cl_mem out_mem,
|
|
|
- int n_dims,
|
|
|
- size_t *dims,
|
|
|
- int n_runs,
|
|
|
- Timer *timer)
|
|
|
-{
|
|
|
- clfftSetupData setup;
|
|
|
- clfftPlanHandle plan;
|
|
|
- clfftDim dim;
|
|
|
- cl_event event;
|
|
|
- size_t size;
|
|
|
-
|
|
|
- switch (n_dims) {
|
|
|
- case 1:
|
|
|
- dim = CLFFT_1D;
|
|
|
- break;
|
|
|
- case 2:
|
|
|
- dim = CLFFT_2D;
|
|
|
- break;
|
|
|
- case 3:
|
|
|
- dim = CLFFT_3D;
|
|
|
- break;
|
|
|
- default:
|
|
|
- fprintf (stderr, "Unknown FFT dimensions\n");
|
|
|
- return false;
|
|
|
- }
|
|
|
-
|
|
|
- OCL_CHECK_ERROR (clfftSetup (&setup));
|
|
|
- OCL_CHECK_ERROR (clfftCreateDefaultPlan (&plan, context, dim, dims));
|
|
|
- OCL_CHECK_ERROR (clfftSetPlanPrecision (plan, CLFFT_SINGLE));
|
|
|
- OCL_CHECK_ERROR (clfftSetLayout (plan, CLFFT_COMPLEX_INTERLEAVED, CLFFT_COMPLEX_INTERLEAVED));
|
|
|
- OCL_CHECK_ERROR (clfftSetResultLocation (plan, CLFFT_OUTOFPLACE));
|
|
|
- OCL_CHECK_ERROR (clfftBakePlan (plan, 1, &queue, NULL, NULL));
|
|
|
-
|
|
|
- timer_start (timer);
|
|
|
-
|
|
|
- for (int i = 0; i < n_runs; i++) {
|
|
|
- OCL_CHECK_ERROR (clfftEnqueueTransform (plan, CLFFT_FORWARD, 1, &queue, 0, NULL, &event, &dev_mem, &out_mem, NULL));
|
|
|
- OCL_CHECK_ERROR (clWaitForEvents (1, &event));
|
|
|
- OCL_CHECK_ERROR (clReleaseEvent (event));
|
|
|
- }
|
|
|
-
|
|
|
- timer_stop (timer);
|
|
|
-
|
|
|
- OCL_CHECK_ERROR (clfftEnqueueTransform (plan, CLFFT_BACKWARD, 1, &queue, 0, NULL, &event, &out_mem, &dev_mem, NULL));
|
|
|
- OCL_CHECK_ERROR (clWaitForEvents (1, &event));
|
|
|
- OCL_CHECK_ERROR (clReleaseEvent (event));
|
|
|
-
|
|
|
- /*
|
|
|
- * We rely on the fact, that out_mem contains the inverse which currently
|
|
|
- * lies in dev_mem, so let's copy it back.
|
|
|
- */
|
|
|
- OCL_CHECK_ERROR (clGetMemObjectInfo (dev_mem, CL_MEM_SIZE, sizeof (size_t), &size, NULL));
|
|
|
- OCL_CHECK_ERROR (clEnqueueCopyBuffer (queue, dev_mem, out_mem, 0, 0, size, 0, NULL, &event));
|
|
|
- OCL_CHECK_ERROR (clWaitForEvents (1, &event));
|
|
|
- OCL_CHECK_ERROR (clReleaseEvent (event));
|
|
|
-
|
|
|
- OCL_CHECK_ERROR (clfftDestroyPlan (&plan));
|
|
|
- clfftTeardown ();
|
|
|
-
|
|
|
- return false;
|
|
|
-}
|
|
|
-#endif
|
|
|
-
|
|
|
-#ifdef HAVE_APPLE_FFT
|
|
|
-static bool
|
|
|
-compute_apple_fft (cl_context context,
|
|
|
- cl_command_queue queue,
|
|
|
- cl_mem dev_mem,
|
|
|
- cl_mem out_mem,
|
|
|
- int n_dims,
|
|
|
- size_t *dims,
|
|
|
- int n_runs,
|
|
|
- Timer *timer)
|
|
|
-{
|
|
|
- clFFT_Plan plan;
|
|
|
- clFFT_Dimension dim;
|
|
|
- clFFT_Dim3 dim_sizes = {.x = 1, .y = 1, .z = 1};
|
|
|
- cl_event event;
|
|
|
- cl_int err;
|
|
|
- size_t size;
|
|
|
-
|
|
|
- switch (n_dims) {
|
|
|
- case 1:
|
|
|
- dim = clFFT_1D;
|
|
|
- dim_sizes.x = dims[0];
|
|
|
- break;
|
|
|
- case 2:
|
|
|
- dim = clFFT_2D;
|
|
|
- dim_sizes.x = dims[0];
|
|
|
- dim_sizes.y = dims[1];
|
|
|
- break;
|
|
|
- case 3:
|
|
|
- dim = clFFT_3D;
|
|
|
- dim_sizes.x = dims[0];
|
|
|
- dim_sizes.y = dims[1];
|
|
|
- dim_sizes.z = dims[2];
|
|
|
- break;
|
|
|
- default:
|
|
|
- fprintf (stderr, "Unknown FFT dimensions\n");
|
|
|
- return true;
|
|
|
- }
|
|
|
-
|
|
|
- plan = clFFT_CreatePlan (context, dim_sizes, dim, clFFT_InterleavedComplexFormat, &err);
|
|
|
- OCL_CHECK_ERROR (err);
|
|
|
-
|
|
|
- timer_start (timer);
|
|
|
-
|
|
|
- for (int i = 0; i < n_runs; i++) {
|
|
|
- err = clFFT_ExecuteInterleaved (queue, plan, 1, clFFT_Forward, dev_mem, out_mem,
|
|
|
- 0, NULL, NULL);
|
|
|
- OCL_CHECK_ERROR (err);
|
|
|
-
|
|
|
- /* Apple FFT does not return events, hence we need the hammer */
|
|
|
- OCL_CHECK_ERROR (clFinish (queue));
|
|
|
- }
|
|
|
-
|
|
|
- timer_stop (timer);
|
|
|
-
|
|
|
- err = clFFT_ExecuteInterleaved (queue, plan, 1, clFFT_Inverse, out_mem, dev_mem, 0, NULL, NULL);
|
|
|
- OCL_CHECK_ERROR (err);
|
|
|
- OCL_CHECK_ERROR (clFinish (queue));
|
|
|
-
|
|
|
- OCL_CHECK_ERROR (clGetMemObjectInfo (dev_mem, CL_MEM_SIZE, sizeof (size_t), &size, NULL));
|
|
|
- OCL_CHECK_ERROR (clEnqueueCopyBuffer (queue, dev_mem, out_mem, 0, 0, size, 0, NULL, &event));
|
|
|
- OCL_CHECK_ERROR (clWaitForEvents (1, &event));
|
|
|
- OCL_CHECK_ERROR (clReleaseEvent (event));
|
|
|
-
|
|
|
- clFFT_DestroyPlan (plan);
|
|
|
-
|
|
|
- return true;
|
|
|
-}
|
|
|
+#include "cpu_fft.h"
|
|
|
#endif
|
|
|
|
|
|
#ifdef HAVE_CUDA_FFT
|
|
|
-static bool
|
|
|
-compute_cuda_fft (cufftComplex *dev_mem,
|
|
|
- cufftComplex *out_mem,
|
|
|
- int n_dims,
|
|
|
- size_t *dims,
|
|
|
- int n_runs,
|
|
|
- Timer *timer)
|
|
|
-{
|
|
|
- cufftHandle plan;
|
|
|
-
|
|
|
- int dim_sizes[3] = {1, 1, 1};
|
|
|
-
|
|
|
- switch (n_dims) {
|
|
|
- case 1:
|
|
|
- dim_sizes[0] = dims[0];
|
|
|
- break;
|
|
|
- case 2:
|
|
|
- dim_sizes[0] = dims[0];
|
|
|
- dim_sizes[1] = dims[1];
|
|
|
- break;
|
|
|
- case 3:
|
|
|
- dim_sizes[0] = dims[0];
|
|
|
- dim_sizes[1] = dims[1];
|
|
|
- dim_sizes[2] = dims[2];
|
|
|
- break;
|
|
|
- default:
|
|
|
- fprintf (stderr, "Unknown FFT dimensions\n");
|
|
|
- return true;
|
|
|
- }
|
|
|
-
|
|
|
- CUFFT_SAFE_CALL (cufftPlanMany(&plan, n_dims, dim_sizes, NULL, 1, 0, NULL, 1, 0, CUFFT_C2C, 1));
|
|
|
-
|
|
|
- timer_start (timer);
|
|
|
-
|
|
|
- for (int i = 0; i < n_runs; i++) {
|
|
|
- CUFFT_SAFE_CALL (cufftExecC2C (plan, (cufftComplex *)dev_mem, (cufftComplex *)out_mem, CUFFT_FORWARD));
|
|
|
- CUDA_SAFE_CALL (cudaDeviceSynchronize ());
|
|
|
- }
|
|
|
-
|
|
|
- timer_stop (timer);
|
|
|
-
|
|
|
- CUFFT_SAFE_CALL (cufftExecC2C (plan, (cufftComplex *)out_mem, (cufftComplex *)dev_mem, CUFFT_INVERSE));
|
|
|
-
|
|
|
- CUDA_SAFE_CALL (cudaDeviceSynchronize ());
|
|
|
-
|
|
|
- CUDA_SAFE_CALL (cudaMemcpy (out_mem, dev_mem, dim_sizes[0] * dim_sizes[1] * dim_sizes[2] * sizeof(cufftComplex), cudaMemcpyDeviceToDevice));
|
|
|
-
|
|
|
- CUFFT_SAFE_CALL (cufftDestroy (plan));
|
|
|
-
|
|
|
- return true;
|
|
|
-}
|
|
|
+#include "cuda_fft.h"
|
|
|
#endif
|
|
|
|
|
|
-static void
|
|
|
-write_headers_in_file (int n_dims, int only_time, FILE *fp)
|
|
|
-{
|
|
|
- fprintf (fp, "# ");
|
|
|
-
|
|
|
- for (int i = 0; i < n_dims; i++) {
|
|
|
- int min_power = N_POWERS_INTERVALS[i][0];
|
|
|
- int max_power = N_POWERS_INTERVALS[i][1];
|
|
|
-
|
|
|
- for (int j = min_power; j <= max_power; j++) {
|
|
|
- int side_size = pow(2,j);
|
|
|
- switch (DIMS[i]) {
|
|
|
- case 1:
|
|
|
- fprintf (fp, ";");
|
|
|
- fprintf (fp, "%d ", side_size);
|
|
|
- if (!only_time) {
|
|
|
- fprintf (fp, ";");
|
|
|
- fprintf (fp, "%d(Error) ", side_size);
|
|
|
- }
|
|
|
- break;
|
|
|
- case 2:
|
|
|
- fprintf (fp, ";");
|
|
|
- fprintf (fp, "%dx%d ", side_size, side_size);
|
|
|
- if (!only_time) {
|
|
|
- fprintf (fp, ";");
|
|
|
- fprintf (fp, "%dx%d(Error) ", side_size, side_size);
|
|
|
- }
|
|
|
- break;
|
|
|
- case 3:
|
|
|
- fprintf (fp, ";");
|
|
|
- fprintf (fp, "%dx%dx%d ", side_size, side_size, side_size);
|
|
|
- if (!only_time) {
|
|
|
- fprintf (fp, ";");
|
|
|
- fprintf (fp, "%dx%dx%d(Error) ", side_size, side_size, side_size);
|
|
|
- }
|
|
|
- break;
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
-}
|
|
|
-
|
|
|
-static void
|
|
|
-write_time_entries_in_file (TimeEntry* time_entries, int num_entries, int n_dims, int only_time, bool new_line, FILE *fp)
|
|
|
-{
|
|
|
- if (new_line) {
|
|
|
- fprintf (fp, "\n");
|
|
|
- }
|
|
|
-
|
|
|
- for (int i = 0; i < num_entries; i++) {
|
|
|
- fprintf (fp, "%s ", time_entries[i].lib_name);
|
|
|
-
|
|
|
- DimEntry *dim_entries = time_entries[i].dim_entries;
|
|
|
-
|
|
|
- for (int dim = 0; dim < n_dims; dim++) {
|
|
|
- DimEntry dim_entry = dim_entries[dim];
|
|
|
-
|
|
|
- for (int j = 0; j < (N_POWERS_INTERVALS[dim][1] - N_POWERS_INTERVALS[dim][0] + 1); j++) {
|
|
|
- if (only_time) {
|
|
|
- fprintf (fp, ";");
|
|
|
- fprintf (fp, "%f", dim_entry.times[j]);
|
|
|
- }
|
|
|
- else {
|
|
|
- fprintf (fp, ";%f;%f ", dim_entry.times[j], dim_entry.errors[j]);
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- if (i != num_entries -1) {
|
|
|
- fprintf (fp, "\n");
|
|
|
- }
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
-static OutputType
|
|
|
-get_output_type_by_measure(char *measure) {
|
|
|
- OutputType outputType = OUT_MILLISECONDS;
|
|
|
- bool invalid_format = false;
|
|
|
-
|
|
|
- if (measure != NULL) {
|
|
|
- if (!strcmp(optarg, "ms"))
|
|
|
- outputType = OUT_MILLISECONDS;
|
|
|
- else if (!strcmp(optarg, "sec"))
|
|
|
- outputType = OUT_SECONDS;
|
|
|
- else if (!strcmp(optarg, "gflops"))
|
|
|
- outputType = OUT_GFLOPS;
|
|
|
- else if (!strcmp(optarg, "mflops"))
|
|
|
- outputType = OUT_MFLOPS;
|
|
|
- else if (!strcmp(optarg, "MBs") || !strcmp(optarg, "mbs"))
|
|
|
- outputType = OUT_THROUGHTPUT_MBS;
|
|
|
- else if (!strcmp(optarg, "GBs") || !strcmp(optarg, "gbs"))
|
|
|
- outputType = OUT_THROUGHTPUT_GBS;
|
|
|
- else {
|
|
|
- invalid_format = true;
|
|
|
- fprintf (stderr, "Incorrect measure format [%s], [ms] will be used.\nUse the following formats: ms,sec,mflops,gflops,GBs,MBs.\n\n", measure);
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- if (!invalid_format || (measure == NULL)) {
|
|
|
- fprintf (stdout, "Output will be use measure [%s].\n\n", measure == NULL ? "ms" : measure);
|
|
|
- }
|
|
|
-
|
|
|
- return outputType;
|
|
|
-}
|
|
|
-
|
|
|
-static bool
|
|
|
-get_fft_range(char *val1, char *val2, int *out_range) {
|
|
|
- int out_val1;
|
|
|
- int out_val2;
|
|
|
- bool range_is_valid = true;
|
|
|
-
|
|
|
- out_val1 = (int) strtol (val1, NULL, 10);
|
|
|
- out_val2 = (int) strtol (val2, NULL, 10);
|
|
|
-
|
|
|
- if (out_val1 != 0L &&
|
|
|
- out_val2 != 0L &&
|
|
|
- (out_val1 >= MIN_POW2 && out_val1 <= MAX_POW2) &&
|
|
|
- (out_val2 >= MIN_POW2 && out_val2 <= MAX_POW2) &&
|
|
|
- (out_val1 < out_val2)) {
|
|
|
- out_range[0] = out_val1;
|
|
|
- out_range[1] = out_val2;
|
|
|
- }
|
|
|
- else {
|
|
|
- fprintf (stderr, "Incorrect the range of powers, it should be in range [%d %d].\n\n", MIN_POW2, MAX_POW2);
|
|
|
- range_is_valid = false;
|
|
|
- }
|
|
|
-
|
|
|
- return range_is_valid;
|
|
|
-}
|
|
|
-
|
|
|
-static bool
|
|
|
-get_number_of_runs(char *val, int *out) {
|
|
|
- bool value_valid = true;
|
|
|
- int out_val;
|
|
|
-
|
|
|
- out_val = (int) strtol (val, NULL, 10);
|
|
|
-
|
|
|
- if (out_val != 0L && (out_val >= MIN_RUNS && out_val <= MAX_RUNS)) {
|
|
|
- *out = out_val;
|
|
|
- }
|
|
|
- else {
|
|
|
- fprintf (stderr, "Incorrect the number of runs, the value should be from %d to %d.\n\n", MIN_RUNS, MAX_RUNS);
|
|
|
- value_valid = false;
|
|
|
- }
|
|
|
-
|
|
|
- return value_valid;
|
|
|
-}
|
|
|
-
|
|
|
-static void
|
|
|
-print_usage(char *app_name, struct option long_options[], char **options_descritions, int exit_code)
|
|
|
-{
|
|
|
- printf ("Usage: %s [OPTIONS]\n", app_name);
|
|
|
- printf ("Options:\n");
|
|
|
- for (int i = 0; long_options[i].name != 0; i++) {
|
|
|
- printf(" --%-20s %s\n", long_options[i].name, options_descritions[i]);
|
|
|
- }
|
|
|
-
|
|
|
- exit (exit_code);
|
|
|
-}
|
|
|
+#include "utilities.h"
|
|
|
|
|
|
int
|
|
|
main (int argc, char **argv)
|