Browse Source

Splitting files on CUDA, OpenCL, CPU and Utilities

Roman Shkarin 9 years ago
parent
commit
487f0e7b84
10 changed files with 915 additions and 845 deletions
  1. 9 4
      Makefile
  2. 4 841
      benchmark.c
  3. 127 0
      cpu_fft.c
  4. 10 0
      cpu_fft.h
  5. 167 0
      cuda_fft.c
  6. 32 0
      cuda_fft.h
  7. 274 0
      opencl_fft.c
  8. 45 0
      opencl_fft.h
  9. 188 0
      utilities.c
  10. 59 0
      utilities.h

+ 9 - 4
Makefile

@@ -8,6 +8,11 @@ DEP_OCLFFT = .deps/oclfft/src/liboclfft.so
 DEP_CLFFT = .deps/clFFT/src/library/libclFFT.so
 DEPS = $(DEP_OCLFFT)
 
+CC_MARK := @
+ifeq ($(V), 1)
+	override CC_MARK =
+endif
+
 # Get bits of OS
 ARCH := $(shell getconf LONG_BIT 2> /dev/null)
 ifeq ($(ARCH),32)
@@ -19,7 +24,7 @@ LIBS_MSG    :=
 
 # Common flags definition
 CPPFLAGS    :=
-CFLAGS      := -O3 -Wall -Werror -std=c99 -fmessage-length=0
+CFLAGS      := -O3 -Wall -Werror -Wno-unused-variable -std=c99 -fmessage-length=0
 LDFLAGS     := -lm
 
 # Detect OpenCL
@@ -36,7 +41,7 @@ CUDA_FFT := $(shell find $(CUDA_PATH)/ -maxdepth 2 -type f -name "libcufft.so*"
 
 # If true then try to use CUDA, otherwise AMD
 ifneq ($(CUDA_FFT),)
-	override CPPFLAGS += -DHAVE_CUDA -DHAVE_CUDA_FFT
+	override CPPFLAGS += -DHAVE_CUDA_FFT
 	override CFLAGS += -I$(CUDA_PATH)/include
 	override LDFLAGS += -L$(CUDA_PATH)/lib$(ARCH) -lcufft -lcudart -Wl,-rpath=$(CUDA_PATH)/lib$(ARCH) 
 	override LIBS_MSG += " +cuda"
@@ -97,11 +102,11 @@ all: $(BIN)
 
 %.o: %.c Makefile $(DEPS)
 	@echo [CC] $@
-	@$(CC) $(ALL_CPPFLAGS) $(ALL_CFLAGS) -c $< -o $@
+	$(CC_MARK)$(CC) $(ALL_CPPFLAGS) $(ALL_CFLAGS) -c $< -o $@
 
 $(BIN): $(OBJS) $(DEPS)
 	@echo [LD] $@
-	@$(CC) $(OBJS) -o $@ $(ALL_LDFLAGS)
+	$(CC_MARK)$(CC) $(OBJS) -o $@ $(ALL_LDFLAGS)
 	@echo "     built with:$(LIBS_MSG)"
 
 run: $(BIN)

+ 4 - 841
benchmark.c

@@ -11,855 +11,18 @@
 #endif
 
 #ifdef HAVE_OPENCL
-#include <CL/cl.h>
-#endif
-
-#ifdef HAVE_CUDA_FFT
-#include <cufft.h>
-#endif
-
-#ifdef HAVE_CUDA
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <cuda_runtime_api.h>
-
-#define CUDA_SAFE_CALL( call) do { \
-cudaError_t err = call; \
-if (cudaSuccess != err) { \
-fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
-__FILE__, __LINE__, cudaGetErrorString( err) ); \
-exit(EXIT_FAILURE); \
-} \
-} while (0)
-
-#define CUFFT_SAFE_CALL( call) do { \
-cufftResult err = call; \
-if (err != CUFFT_SUCCESS) { \
-fprintf(stderr, "Cufft error in file '%s' in line %i : %s.\n", \
-__FILE__, __LINE__, "error" ); \
-exit(EXIT_FAILURE); \
-} \
-} while (0)
-#endif
-
-#ifdef HAVE_AMD_FFT
-#include <clFFT.h>
-#endif
-
-#ifdef HAVE_APPLE_FFT
-#include <oclfft/clFFT.h>
+#include "opencl_fft.h"
 #endif
 
 #ifdef HAVE_FFTW
-#include <fftw3.h>
-#endif
-
-#include "timer.h"
-#include "time_entry.h"
-
-#define N_DIMS 3
-#define MIN_POW2 1
-#define MAX_POW2 15
-#define MIN_RUNS 1
-#define MAX_RUNS 100
-
-const int DIMS[N_DIMS] = {1, 2, 3};
-
-static int N_RUNS = 4;
-static int N_POWERS_INTERVALS[N_DIMS][2] = {{5, 11}, {8, 11}, {7, 7}};
-
-typedef enum _OutputType {
-    OUT_MILLISECONDS,
-    OUT_SECONDS,
-    OUT_MFLOPS,
-    OUT_GFLOPS,
-    OUT_THROUGHTPUT_GBS,
-    OUT_THROUGHTPUT_MBS,
-    OUT_NONE
-} OutputType;
-
-#define PRINT_DIM(dim) printf ("%dD:", dim);
-
-#define PRINT_DIM_SIZE(side_size,dim) { \
-    printf(" %zu", side_size); while (dim != 1) { printf("x%zu", side_size);dim--; } printf("."); }
-
-#ifdef HAVE_OPENCL
-#define OCL_CHECK_ERROR(error) { \
-    if ((error) != CL_SUCCESS) fprintf (stderr, "OpenCL error <%s:%i>\n", __FILE__, __LINE__); }
-
-#define PRINT_DIMS(dim,side_size) { \
-    if (dim == 1) { printf (" %zu", side_size); } \
-    else if (dim == 2) { printf (" %zux%zu", side_size, side_size); } \
-    else { printf (" %zux%zux%zu", side_size, side_size, side_size); } }
-
-typedef bool (*OclBenchmarkFunc) (cl_context context, cl_command_queue queue, cl_mem dev_mem, cl_mem dev_out_mem, int n_dims, size_t *dims, int n_runs, Timer *timer);
-
-#ifdef HAVE_CUDA
-typedef bool (*CudaBenchmarkFunc) (cufftComplex *dev_mem, cufftComplex *dev_out_mem, int n_dims, size_t *dims, int n_runs, Timer *timer);
-#endif
-
-static double
-sum_of_absolute_differences (float *a, float *b, int n, bool scale)
-{
-    double sum = 0.0;
-
-    for (int i = 0; i < n; i++)
-        sum += fabs (a[i] - b[i] / (n / 2.));
-
-    return sum;
-}
-
-#ifdef HAVE_CUDA
-static double
-sum_of_absolute_differences_complex (cufftComplex *a, cufftComplex *b, int n, bool scale)
-{
-    double sum = 0.0;
-
-    for (int i = 0; i < n; i++) {
-        sum += fabs (a[i].x - b[i].x / ((float)n));
-        sum += fabs (a[i].y - b[i].y / ((float)n));
-    }
-
-    return sum;
-}
-#endif
-
-static double
-get_measurements_with_format (OutputType outputType, size_t size_bytes, double time_sec)
-{
-    double out_result = -1;
-
-    if (outputType == OUT_MFLOPS) {
-        size_t size = size_bytes / 2 / sizeof (float);
-        out_result = 5 * size * log (size) / log (2) / (time_sec / 1000.0);
-    }
-    else if (outputType == OUT_GFLOPS) {
-        out_result = -1;
-    }
-    else if (outputType == OUT_THROUGHTPUT_MBS) {
-        out_result = ((double)size_bytes) / time_sec / 1000.0 / 1000.0;
-    }
-    else if (outputType == OUT_THROUGHTPUT_GBS) {
-        out_result = ((double)size_bytes) / time_sec / 1000.0 / 1000.0 / 1000.0;
-    }
-    else if (outputType == OUT_MILLISECONDS) {
-        out_result = time_sec * 1000.0;
-    }
-    else if (outputType == OUT_SECONDS) {
-        out_result = time_sec * 1000.0;
-    }
-    else {
-        fprintf (stderr, "Unknown output type of OpenCL routines!\n");
-    }
-
-    return out_result;
-}
-
-static void
-loop_data_opencl (const char *vendor,
-                  OclBenchmarkFunc func,
-                  cl_context context,
-                  cl_command_queue *queues,
-                  int n_devices,
-                  OutputType outputType,
-                  TimeEntry *time_entries)
-{
-    Timer *timer;
-    cl_int err;
-    char *device_name;
-    size_t device_name_size;
-    cl_device_id *devices;
-    size_t device_list_size;
-
-    clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &device_list_size);
-    devices = (cl_device_id *) malloc(device_list_size);
-    clGetContextInfo(context, CL_CONTEXT_DEVICES, device_list_size, devices, NULL);
-
-    timer = timer_new ();
-
-    for (int j = 0; j < n_devices; j++) {
-        clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &device_name_size);
-        device_name = (char *) malloc(device_name_size);
-        clGetDeviceInfo(devices[j], CL_DEVICE_NAME, device_name_size, device_name, NULL);
-
-        char vendor_name[100];
-        int v_len = sprintf(vendor_name, "%s (%s)", device_name, vendor);
-        time_entries[j].lib_name = (char *)malloc(sizeof(char) * (v_len + 1));
-        strcpy(time_entries[j].lib_name, vendor_name);
-
-        time_entries[j].dim_entries = (DimEntry *)malloc(N_DIMS * sizeof(DimEntry));
-
-        if (n_devices > 1) {
-            printf ("Device: %s\n", time_entries[j].lib_name);
-            fflush (stdout);
-        }
-
-        for (int k = 0; k < N_DIMS; k++) {
-            int dim = DIMS[k];
-            int power_min = N_POWERS_INTERVALS[k][0];
-            int power_max = N_POWERS_INTERVALS[k][1];
-            int num_entries = power_max - power_min + 1;
-
-            time_entries[j].dim_entries[k].n_dims = dim;
-            time_entries[j].dim_entries[k].sizes  = (unsigned int **)malloc(sizeof(unsigned int *) * num_entries);
-            time_entries[j].dim_entries[k].times  = (double *)malloc(sizeof(double) * num_entries);
-            time_entries[j].dim_entries[k].errors = (double *)malloc(sizeof(double) * num_entries);
-
-            PRINT_DIM (dim);
-            fflush (stdout);
-
-            for (int m = power_min, i = 0; m <= power_max; m++, i++) {
-                size_t size_bytes;
-                float *host_orig_mem;
-                float *host_result_mem;
-                cl_mem dev_mem;
-                cl_mem dev_out_mem;
-
-                size_t side_size = pow(2,m);
-                size_t size = pow(side_size,dim);
-
-                size_bytes = size * 2 * sizeof (float);
-                host_orig_mem = malloc (size_bytes);
-                host_result_mem = malloc (size_bytes);
-
-                for (int l = 0; l < size * 2; l++) {
-                    host_orig_mem[l] = rand() / ((float) RAND_MAX);
-                }
-
-                dev_mem = clCreateBuffer (context, CL_MEM_READ_WRITE, size_bytes, NULL, &err);
-                OCL_CHECK_ERROR (err); 
-
-                dev_out_mem = clCreateBuffer (context, CL_MEM_READ_WRITE, size_bytes, NULL, &err);
-                OCL_CHECK_ERROR (err);
-
-                PRINT_DIMS(dim, side_size);
-
-                fflush (stdout);
-
-                double time_sec;
-                double sum;
-                bool scale;
-
-                printf (".");
-                fflush (stdout);
-
-                OCL_CHECK_ERROR (clEnqueueWriteBuffer (queues[j], dev_mem, CL_TRUE, 0, size_bytes, host_orig_mem, 0, NULL, NULL));
-
-                size_t fft_size[3] = { 1, 1, 1};
-                time_entries[j].dim_entries[k].sizes[i] = (unsigned int *)malloc(sizeof(unsigned int) * dim);
-
-                for (int l = 0; l < dim; l++) {
-                    fft_size[l] = side_size;
-                    time_entries[j].dim_entries[k].sizes[i][j] = side_size;
-                }
-
-                scale = func (context, queues[j], dev_mem, dev_out_mem, dim, fft_size, N_RUNS, timer);
-
-                /* Check precision */
-                OCL_CHECK_ERROR (clEnqueueReadBuffer (queues[j], dev_out_mem, CL_TRUE, 0, size_bytes, host_result_mem, 0, NULL, NULL));
-                sum = sum_of_absolute_differences (host_orig_mem, host_result_mem, size * 2, scale);
-
-                time_sec = timer_get_seconds (timer) / N_RUNS;
-
-                time_entries[j].dim_entries[k].times[i] = get_measurements_with_format(outputType, size_bytes, time_sec);
-                time_entries[j].dim_entries[k].errors[i] = sum / size;
-
-                free (host_orig_mem);
-                free (host_result_mem);
-                OCL_CHECK_ERROR (clReleaseMemObject (dev_mem));
-                OCL_CHECK_ERROR (clReleaseMemObject (dev_out_mem));
-            }
-
-            printf ("\n");
-            fflush (stdout);
-        }
-    }
-
-    printf ("\n");
-    timer_destroy (timer);
-}
-#endif
-
-#ifdef HAVE_CUDA
-static void
-loop_data_cuda (const char *vendor,
-                CudaBenchmarkFunc func,
-                int n_devices,
-                OutputType outputType,
-                TimeEntry *time_entries)
-{
-    Timer *timer;
-
-    timer = timer_new ();
-
-    for (int j = 0; j < n_devices; j++) {
-        char vendor_name[50];
-        int v_len = sprintf(vendor_name, "%s_%d", vendor, j);
-        time_entries[j].lib_name = (char *)malloc(sizeof(char) * (v_len + 1));
-        strcpy(time_entries[j].lib_name, vendor_name);
-
-        time_entries[j].dim_entries = (DimEntry *)malloc(N_DIMS * sizeof(DimEntry));
-
-        for (int k = 0; k < N_DIMS; k++) {
-            int dim = DIMS[k];
-            int power_min = N_POWERS_INTERVALS[k][0];
-            int power_max = N_POWERS_INTERVALS[k][1];
-            int num_entries = power_max - power_min + 1;
-
-            time_entries[j].dim_entries[k].n_dims = dim;
-            time_entries[j].dim_entries[k].sizes  = (unsigned int **)malloc(sizeof(unsigned int *) * num_entries);
-            time_entries[j].dim_entries[k].times  = (double *)malloc(sizeof(double) * num_entries);
-            time_entries[j].dim_entries[k].errors = (double *)malloc(sizeof(double) * num_entries);
-
-            PRINT_DIM (dim);
-            fflush (stdout);
-
-            for (int m = power_min, i = 0; m <= power_max; m++, i++) {
-                size_t size_bytes;
-                cufftComplex *host_orig_mem;
-                cufftComplex *host_result_mem;
-                cufftComplex *dev_mem;
-                cufftComplex *dev_out_mem;
-
-                size_t side_size = pow(2,m);
-                size_t size = pow(side_size,dim);
-
-                size_bytes = size * sizeof (cufftComplex);
-                host_orig_mem = (cufftComplex *)malloc(size_bytes);
-                host_result_mem = (cufftComplex *)malloc(size_bytes);
-
-                for (int l = 0; l < size; l++) {
-                    host_orig_mem[l].x = rand() / ((float) RAND_MAX);
-                    host_orig_mem[l].y = rand() / ((float) RAND_MAX);
-                }
-
-                CUDA_SAFE_CALL (cudaMalloc ((void **)&dev_mem, size_bytes));
-                CUDA_SAFE_CALL (cudaMalloc ((void **)&dev_out_mem, size_bytes));
-
-                PRINT_DIMS(dim, side_size);
-
-                fflush (stdout);
-
-                double time_sec;
-                double sum;
-                bool scale;
-
-                printf (".");
-                fflush (stdout);
-
-                CUDA_SAFE_CALL (cudaMemcpy (dev_mem, host_orig_mem, size_bytes, cudaMemcpyHostToDevice));
-
-                size_t fft_size[3] = { 1, 1, 1};
-                for (int l = 0; l < dim; l++) {
-                    fft_size[l] = side_size;
-                }
-
-                scale = func (dev_mem, dev_out_mem, dim, fft_size, N_RUNS, timer);
-
-                /* Check precision */
-                CUDA_SAFE_CALL (cudaMemcpy (host_result_mem, dev_out_mem, size_bytes, cudaMemcpyDeviceToHost));
-                sum = sum_of_absolute_differences_complex (host_orig_mem, host_result_mem, size, scale);
-        
-                time_sec = timer_get_seconds (timer) / N_RUNS;
-
-                time_entries[j].dim_entries[k].times[i] = get_measurements_with_format(outputType, size_bytes, time_sec);
-                time_entries[j].dim_entries[k].errors[i] = sum / size;
-
-                free (host_orig_mem);
-                free (host_result_mem);
-
-                CUDA_SAFE_CALL (cudaFree (dev_mem));
-                CUDA_SAFE_CALL (cudaFree (dev_out_mem));
-            }
-
-            printf ("\n");
-            fflush (stdout);
-        }
-    }
-
-    printf ("\n");
-    timer_destroy (timer);
-}
-#endif
-
-#ifdef HAVE_FFTW
-static void
-loop_data_fftw (OutputType outputType, TimeEntry *time_entry)
-{
-    Timer *timer;
-
-    timer = timer_new ();
-
-    time_entry->lib_name = "FFTW";
-    time_entry->dim_entries = (DimEntry *)malloc(N_DIMS * sizeof(DimEntry));
-
-    for (int k = 0; k < N_DIMS; k++) {
-        int dim = DIMS[k];
-        int power_min = N_POWERS_INTERVALS[k][0];
-        int power_max = N_POWERS_INTERVALS[k][1];
-        int num_entries = power_max - power_min + 1;
-
-        time_entry->dim_entries[k].n_dims = dim;
-        time_entry->dim_entries[k].sizes  = (unsigned int **)malloc(sizeof(unsigned int *) * num_entries);
-        time_entry->dim_entries[k].times  = (double *)malloc(sizeof(double) * num_entries);
-        time_entry->dim_entries[k].errors = (double *)malloc(sizeof(double) * num_entries);
-
-        PRINT_DIM (dim);
-        fflush (stdout);
-
-        for (int m = power_min, i = 0; m <= power_max; m++, i++) {
-            fftw_complex *host_orig_mem;
-            fftw_complex *host_result_mem;
-            fftw_complex *host_immediate_mem;
-            fftw_plan plan;
-            fftw_plan inverse_plan;
-            double time_sec;
-            double sum = 0.0;
-
-            size_t side_size = pow(2,m);
-            size_t size = pow(side_size,dim);
-            size_t size_bytes = sizeof (fftw_complex) * size;
-
-            host_orig_mem = fftw_malloc (size_bytes);
-            host_immediate_mem = fftw_malloc (size_bytes);
-            host_result_mem = fftw_malloc (size_bytes);
-
-            switch (dim) {
-                case 1:
-                    plan = fftw_plan_dft_1d (side_size, 
-                                             host_orig_mem, host_immediate_mem,
-                                             FFTW_FORWARD, FFTW_ESTIMATE);
-                    inverse_plan = fftw_plan_dft_1d (side_size, 
-                                                     host_immediate_mem,
-                                                     host_result_mem,
-                                                     FFTW_BACKWARD, FFTW_ESTIMATE);
-                    break;
-                case 2:
-                    plan = fftw_plan_dft_2d (side_size, side_size,
-                                             host_orig_mem, host_immediate_mem,
-                                             FFTW_FORWARD, FFTW_ESTIMATE);
-                    inverse_plan = fftw_plan_dft_2d (side_size, side_size,
-                                                     host_immediate_mem, host_result_mem,
-                                                     FFTW_BACKWARD, FFTW_ESTIMATE);
-                    break;
-                case 3:
-                    plan = fftw_plan_dft_3d (side_size, side_size, side_size,
-                                             host_orig_mem, host_immediate_mem,
-                                             FFTW_FORWARD, FFTW_ESTIMATE);
-                    inverse_plan = fftw_plan_dft_3d (side_size, side_size, side_size,
-                                                     host_immediate_mem, host_result_mem,
-                                                     FFTW_BACKWARD, FFTW_ESTIMATE);
-                    break;
-                default:
-                    fprintf (stderr, "Unknown FFT dimensions\n");
-                    return;
-            }
-
-            for (int j = 0; j < size; j++) {
-                host_orig_mem[j][0] = rand() / ((double) RAND_MAX);
-                host_orig_mem[j][1] = rand() / ((double) RAND_MAX);
-            }
-
-            PRINT_DIMS(dim, side_size);
-
-            fflush (stdout);
-
-            printf (".");
-            fflush (stdout);
-
-            timer_start (timer);
-
-            for (int j = 0; j < N_RUNS; j++) {
-                fftw_execute (plan);
-            }
-
-            timer_stop (timer);
-
-            /* Check precision */
-            fftw_execute (inverse_plan);
-
-            for (int j = 0; j < size; j++) {
-                sum += fabs (host_result_mem[j][0] / size - host_orig_mem[j][0]);
-                sum += fabs (host_result_mem[j][1] / size - host_orig_mem[j][1]);
-            }
-
-            time_sec = timer_get_seconds (timer) / N_RUNS;
-
-            time_entry->dim_entries[k].sizes[i] = (unsigned int *)malloc(sizeof(unsigned int) * dim);
-            for (int j = 0; j < dim; j++) {
-                time_entry->dim_entries[k].sizes[i][j] = side_size;
-            }
-
-            time_entry->dim_entries[k].times[i] = get_measurements_with_format(outputType, size_bytes, time_sec);
-            time_entry->dim_entries[k].errors[i] = sum / size;
-            
-            fftw_destroy_plan (inverse_plan);
-            fftw_destroy_plan (plan);
-            fftw_free (host_orig_mem);
-            fftw_free (host_immediate_mem);
-            fftw_free (host_result_mem);
-        }
-        printf ("\n");
-        fflush (stdout);
-    }
-
-    printf ("\n");
-    timer_destroy (timer);
-}
-#endif
-
-#ifdef HAVE_AMD_FFT
-static bool
-compute_amd_fft (cl_context context,
-                 cl_command_queue queue,
-                 cl_mem dev_mem,
-                 cl_mem out_mem,
-                 int n_dims,
-                 size_t *dims,
-                 int n_runs,
-                 Timer *timer)
-{
-    clfftSetupData setup;
-    clfftPlanHandle plan;
-    clfftDim dim;
-    cl_event event;
-    size_t size;
-
-    switch (n_dims) {
-        case 1:
-            dim = CLFFT_1D;
-            break;
-        case 2:
-            dim = CLFFT_2D;
-            break;
-        case 3:
-            dim = CLFFT_3D;
-            break;
-        default:
-            fprintf (stderr, "Unknown FFT dimensions\n");
-            return false;
-    }
-
-    OCL_CHECK_ERROR (clfftSetup (&setup));
-    OCL_CHECK_ERROR (clfftCreateDefaultPlan (&plan, context, dim, dims));
-    OCL_CHECK_ERROR (clfftSetPlanPrecision (plan, CLFFT_SINGLE));
-    OCL_CHECK_ERROR (clfftSetLayout (plan, CLFFT_COMPLEX_INTERLEAVED, CLFFT_COMPLEX_INTERLEAVED));
-    OCL_CHECK_ERROR (clfftSetResultLocation (plan, CLFFT_OUTOFPLACE));
-    OCL_CHECK_ERROR (clfftBakePlan (plan, 1, &queue, NULL, NULL));
-
-    timer_start (timer);
-
-    for (int i = 0; i < n_runs; i++) {
-        OCL_CHECK_ERROR (clfftEnqueueTransform (plan, CLFFT_FORWARD, 1, &queue, 0, NULL, &event, &dev_mem, &out_mem, NULL));
-        OCL_CHECK_ERROR (clWaitForEvents (1, &event));
-        OCL_CHECK_ERROR (clReleaseEvent (event));
-    }
-
-    timer_stop (timer);
-
-    OCL_CHECK_ERROR (clfftEnqueueTransform (plan, CLFFT_BACKWARD, 1, &queue, 0, NULL, &event, &out_mem, &dev_mem, NULL));
-    OCL_CHECK_ERROR (clWaitForEvents (1, &event));
-    OCL_CHECK_ERROR (clReleaseEvent (event));
-
-    /*
-     * We rely on the fact, that out_mem contains the inverse which currently
-     * lies in dev_mem, so let's copy it back.
-     */
-    OCL_CHECK_ERROR (clGetMemObjectInfo (dev_mem, CL_MEM_SIZE, sizeof (size_t), &size, NULL));
-    OCL_CHECK_ERROR (clEnqueueCopyBuffer (queue, dev_mem, out_mem, 0, 0, size, 0, NULL, &event));
-    OCL_CHECK_ERROR (clWaitForEvents (1, &event));
-    OCL_CHECK_ERROR (clReleaseEvent (event));
-
-    OCL_CHECK_ERROR (clfftDestroyPlan (&plan));
-    clfftTeardown ();
-
-    return false;
-}
-#endif
-
-#ifdef HAVE_APPLE_FFT
-static bool
-compute_apple_fft (cl_context context,
-                   cl_command_queue queue,
-                   cl_mem dev_mem,
-                   cl_mem out_mem,
-                   int n_dims,
-                   size_t *dims,
-                   int n_runs,
-                   Timer *timer)
-{
-    clFFT_Plan plan;
-    clFFT_Dimension dim;
-    clFFT_Dim3 dim_sizes = {.x = 1, .y = 1, .z = 1};
-    cl_event event;
-    cl_int err;
-    size_t size;
-
-    switch (n_dims) {
-        case 1:
-            dim = clFFT_1D;
-            dim_sizes.x = dims[0];
-            break;
-        case 2:
-            dim = clFFT_2D;
-            dim_sizes.x = dims[0];
-            dim_sizes.y = dims[1];
-            break;
-        case 3:
-            dim = clFFT_3D;
-            dim_sizes.x = dims[0];
-            dim_sizes.y = dims[1];
-            dim_sizes.z = dims[2];
-            break;
-        default:
-            fprintf (stderr, "Unknown FFT dimensions\n");
-            return true;
-    }
-
-    plan = clFFT_CreatePlan (context, dim_sizes, dim, clFFT_InterleavedComplexFormat, &err);
-    OCL_CHECK_ERROR (err);
-
-    timer_start (timer);
-
-    for (int i = 0; i < n_runs; i++) {
-        err = clFFT_ExecuteInterleaved (queue, plan, 1, clFFT_Forward, dev_mem, out_mem,
-                                        0, NULL, NULL);
-        OCL_CHECK_ERROR (err);
-
-        /* Apple FFT does not return events, hence we need the hammer */
-        OCL_CHECK_ERROR (clFinish (queue));
-    }
-
-    timer_stop (timer);
-
-    err = clFFT_ExecuteInterleaved (queue, plan, 1, clFFT_Inverse, out_mem, dev_mem, 0, NULL, NULL);
-    OCL_CHECK_ERROR (err);
-    OCL_CHECK_ERROR (clFinish (queue));
-
-    OCL_CHECK_ERROR (clGetMemObjectInfo (dev_mem, CL_MEM_SIZE, sizeof (size_t), &size, NULL));
-    OCL_CHECK_ERROR (clEnqueueCopyBuffer (queue, dev_mem, out_mem, 0, 0, size, 0, NULL, &event));
-    OCL_CHECK_ERROR (clWaitForEvents (1, &event));
-    OCL_CHECK_ERROR (clReleaseEvent (event));
-
-    clFFT_DestroyPlan (plan);
-
-    return true;
-}
+#include "cpu_fft.h"
 #endif
 
 #ifdef HAVE_CUDA_FFT
-static bool
-compute_cuda_fft (cufftComplex *dev_mem,
-                  cufftComplex *out_mem,
-                  int n_dims,
-                  size_t *dims,
-                  int n_runs,
-                  Timer *timer)
-{
-    cufftHandle plan;
-
-    int dim_sizes[3] = {1, 1, 1};
-
-    switch (n_dims) {
-        case 1:
-            dim_sizes[0] = dims[0];
-            break;
-        case 2:
-            dim_sizes[0] = dims[0];
-            dim_sizes[1] = dims[1];
-            break;
-        case 3:
-            dim_sizes[0] = dims[0];
-            dim_sizes[1] = dims[1];
-            dim_sizes[2] = dims[2];
-            break;
-        default:
-            fprintf (stderr, "Unknown FFT dimensions\n");
-            return true;
-    }
-
-    CUFFT_SAFE_CALL (cufftPlanMany(&plan, n_dims, dim_sizes, NULL, 1, 0, NULL, 1, 0, CUFFT_C2C, 1));
-
-    timer_start (timer);
-
-    for (int i = 0; i < n_runs; i++) {
-        CUFFT_SAFE_CALL (cufftExecC2C (plan, (cufftComplex *)dev_mem, (cufftComplex *)out_mem, CUFFT_FORWARD));
-        CUDA_SAFE_CALL (cudaDeviceSynchronize ());
-    }
-
-    timer_stop (timer);
-
-    CUFFT_SAFE_CALL (cufftExecC2C (plan, (cufftComplex *)out_mem, (cufftComplex *)dev_mem, CUFFT_INVERSE));
-
-    CUDA_SAFE_CALL (cudaDeviceSynchronize ());
-
-    CUDA_SAFE_CALL (cudaMemcpy (out_mem, dev_mem, dim_sizes[0] * dim_sizes[1] * dim_sizes[2] * sizeof(cufftComplex), cudaMemcpyDeviceToDevice));   
-
-    CUFFT_SAFE_CALL (cufftDestroy (plan));
-
-    return true;
-}
+#include "cuda_fft.h"
 #endif
 
-static void
-write_headers_in_file (int n_dims, int only_time, FILE *fp)
-{    
-    fprintf (fp, "# ");
-
-    for (int i = 0; i < n_dims; i++) {
-        int min_power = N_POWERS_INTERVALS[i][0];
-        int max_power = N_POWERS_INTERVALS[i][1];
-
-        for (int j = min_power; j <= max_power; j++) {
-            int side_size = pow(2,j);
-            switch (DIMS[i]) {
-                case 1:
-                fprintf (fp, ";");
-                fprintf (fp, "%d ", side_size);
-                if (!only_time) {
-                    fprintf (fp, ";");
-                    fprintf (fp, "%d(Error) ", side_size);
-                }
-                break;
-                case 2:
-                fprintf (fp, ";");
-                fprintf (fp, "%dx%d ", side_size, side_size);
-                if (!only_time) {
-                    fprintf (fp, ";");
-                    fprintf (fp, "%dx%d(Error) ", side_size, side_size);
-                }
-                break;
-                case 3:
-                fprintf (fp, ";");
-                fprintf (fp, "%dx%dx%d ", side_size, side_size, side_size);
-                if (!only_time) {
-                    fprintf (fp, ";");
-                    fprintf (fp, "%dx%dx%d(Error) ", side_size, side_size, side_size);
-                }
-                break;
-            }   
-        } 
-    }
-
-}
-
-static void
-write_time_entries_in_file (TimeEntry* time_entries, int num_entries, int n_dims, int only_time, bool new_line, FILE *fp)
-{
-    if (new_line) {
-        fprintf (fp, "\n");
-    }
-
-    for (int i = 0; i < num_entries; i++) {
-        fprintf (fp, "%s ", time_entries[i].lib_name);
-
-        DimEntry *dim_entries = time_entries[i].dim_entries;
-
-        for (int dim = 0; dim < n_dims; dim++) {
-            DimEntry dim_entry = dim_entries[dim];
-
-            for (int j = 0; j < (N_POWERS_INTERVALS[dim][1] - N_POWERS_INTERVALS[dim][0] + 1); j++) {
-                if (only_time) {
-                    fprintf (fp, ";");
-                    fprintf (fp, "%f", dim_entry.times[j]);
-                }
-                else {
-                    fprintf (fp, ";%f;%f ", dim_entry.times[j], dim_entry.errors[j]);
-                }
-            }
-        }
-
-        if (i != num_entries -1) {
-            fprintf (fp, "\n");
-        }
-    }
-}
-
-static OutputType
-get_output_type_by_measure(char *measure) {
-    OutputType outputType = OUT_MILLISECONDS;
-    bool invalid_format = false;
-
-    if (measure != NULL) {
-        if (!strcmp(optarg, "ms"))
-            outputType = OUT_MILLISECONDS;
-        else if (!strcmp(optarg, "sec"))
-            outputType = OUT_SECONDS;
-        else if (!strcmp(optarg, "gflops"))
-            outputType = OUT_GFLOPS;
-        else if (!strcmp(optarg, "mflops"))
-            outputType = OUT_MFLOPS;
-        else if (!strcmp(optarg, "MBs") || !strcmp(optarg, "mbs"))
-            outputType = OUT_THROUGHTPUT_MBS;
-        else if (!strcmp(optarg, "GBs") || !strcmp(optarg, "gbs"))
-            outputType = OUT_THROUGHTPUT_GBS;
-        else {
-            invalid_format = true;
-            fprintf (stderr, "Incorrect measure format [%s], [ms] will be used.\nUse the following formats: ms,sec,mflops,gflops,GBs,MBs.\n\n", measure);
-        }
-    }
-    
-    if (!invalid_format || (measure == NULL)) {
-        fprintf (stdout, "Output will be use measure [%s].\n\n", measure == NULL ? "ms" : measure);
-    }
-    
-    return outputType;
-}
-
-static bool
-get_fft_range(char *val1, char *val2, int *out_range) {
-    int out_val1;
-    int out_val2;
-    bool range_is_valid = true;
-
-    out_val1 = (int) strtol (val1, NULL, 10);
-    out_val2 = (int) strtol (val2, NULL, 10);
-
-    if (out_val1 != 0L &&
-        out_val2 != 0L &&
-        (out_val1 >= MIN_POW2 && out_val1 <= MAX_POW2) &&
-        (out_val2 >= MIN_POW2 && out_val2 <= MAX_POW2) &&
-        (out_val1 < out_val2)) {
-        out_range[0] = out_val1;
-        out_range[1] = out_val2;
-    }
-    else {
-        fprintf (stderr, "Incorrect the range of powers, it should be in range [%d %d].\n\n", MIN_POW2, MAX_POW2);
-        range_is_valid = false;
-    }
-
-    return range_is_valid;
-}
-
-static bool
-get_number_of_runs(char *val, int *out) {
-    bool value_valid = true;
-    int out_val;
-
-    out_val = (int) strtol (val, NULL, 10);
-
-    if (out_val != 0L && (out_val >= MIN_RUNS && out_val <= MAX_RUNS)) {
-        *out = out_val;
-    }
-    else {
-        fprintf (stderr, "Incorrect the number of runs, the value should be from %d to %d.\n\n", MIN_RUNS, MAX_RUNS);
-        value_valid = false;
-    }
-
-    return value_valid;
-}
-
-static void
-print_usage(char *app_name, struct option long_options[], char **options_descritions, int exit_code)
-{
-    printf ("Usage: %s [OPTIONS]\n", app_name);
-    printf ("Options:\n");
-    for (int i = 0; long_options[i].name != 0; i++) {
-        printf("  --%-20s %s\n", long_options[i].name, options_descritions[i]);
-    }
-
-    exit (exit_code);
-}
+#include "utilities.h"
 
 int
 main (int argc, char **argv)

+ 127 - 0
cpu_fft.c

@@ -0,0 +1,127 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "cpu_fft.h"
+
+void loop_data_fftw (OutputType outputType, TimeEntry *time_entry)
+{
+    
+    Timer *timer;
+
+    timer = timer_new ();
+
+    time_entry->lib_name = "FFTW";
+    time_entry->dim_entries = (DimEntry *)malloc(N_DIMS * sizeof(DimEntry));
+
+    for (int k = 0; k < N_DIMS; k++) {
+        int dim = DIMS[k];
+        int power_min = N_POWERS_INTERVALS[k][0];
+        int power_max = N_POWERS_INTERVALS[k][1];
+        int num_entries = power_max - power_min + 1;
+
+        time_entry->dim_entries[k].n_dims = dim;
+        time_entry->dim_entries[k].sizes  = (unsigned int **)malloc(sizeof(unsigned int *) * num_entries);
+        time_entry->dim_entries[k].times  = (double *)malloc(sizeof(double) * num_entries);
+        time_entry->dim_entries[k].errors = (double *)malloc(sizeof(double) * num_entries);
+
+        PRINT_DIM (dim);
+        fflush (stdout);
+
+        for (int m = power_min, i = 0; m <= power_max; m++, i++) {
+            fftw_complex *host_orig_mem;
+            fftw_complex *host_result_mem;
+            fftw_complex *host_immediate_mem;
+            fftw_plan plan;
+            fftw_plan inverse_plan;
+            double time_sec;
+            double sum = 0.0;
+
+            size_t side_size = pow(2,m);
+            size_t size = pow(side_size,dim);
+            size_t size_bytes = sizeof (fftw_complex) * size;
+
+            host_orig_mem = fftw_malloc (size_bytes);
+            host_immediate_mem = fftw_malloc (size_bytes);
+            host_result_mem = fftw_malloc (size_bytes);
+
+            switch (dim) {
+                case 1:
+                    plan = fftw_plan_dft_1d (side_size, 
+                                             host_orig_mem, host_immediate_mem,
+                                             FFTW_FORWARD, FFTW_ESTIMATE);
+                    inverse_plan = fftw_plan_dft_1d (side_size, 
+                                                     host_immediate_mem,
+                                                     host_result_mem,
+                                                     FFTW_BACKWARD, FFTW_ESTIMATE);
+                    break;
+                case 2:
+                    plan = fftw_plan_dft_2d (side_size, side_size,
+                                             host_orig_mem, host_immediate_mem,
+                                             FFTW_FORWARD, FFTW_ESTIMATE);
+                    inverse_plan = fftw_plan_dft_2d (side_size, side_size,
+                                                     host_immediate_mem, host_result_mem,
+                                                     FFTW_BACKWARD, FFTW_ESTIMATE);
+                    break;
+                case 3:
+                    plan = fftw_plan_dft_3d (side_size, side_size, side_size,
+                                             host_orig_mem, host_immediate_mem,
+                                             FFTW_FORWARD, FFTW_ESTIMATE);
+                    inverse_plan = fftw_plan_dft_3d (side_size, side_size, side_size,
+                                                     host_immediate_mem, host_result_mem,
+                                                     FFTW_BACKWARD, FFTW_ESTIMATE);
+                    break;
+                default:
+                    fprintf (stderr, "Unknown FFT dimensions\n");
+                    return;
+            }
+
+            for (int j = 0; j < size; j++) {
+                host_orig_mem[j][0] = rand() / ((double) RAND_MAX);
+                host_orig_mem[j][1] = rand() / ((double) RAND_MAX);
+            }
+
+            PRINT_DIMS(dim, side_size);
+
+            fflush (stdout);
+
+            printf (".");
+            fflush (stdout);
+
+            timer_start (timer);
+
+            for (int j = 0; j < N_RUNS; j++) {
+                fftw_execute (plan);
+            }
+
+            timer_stop (timer);
+
+            fftw_execute (inverse_plan);
+
+            for (int j = 0; j < size; j++) {
+                sum += fabs (host_result_mem[j][0] / size - host_orig_mem[j][0]);
+                sum += fabs (host_result_mem[j][1] / size - host_orig_mem[j][1]);
+            }
+
+            time_sec = timer_get_seconds (timer) / N_RUNS;
+
+            time_entry->dim_entries[k].sizes[i] = (unsigned int *)malloc(sizeof(unsigned int) * dim);
+            for (int j = 0; j < dim; j++) {
+                time_entry->dim_entries[k].sizes[i][j] = side_size;
+            }
+
+            time_entry->dim_entries[k].times[i] = get_measurements_with_format(outputType, size_bytes, time_sec);
+            time_entry->dim_entries[k].errors[i] = sum / size;
+            
+            fftw_destroy_plan (inverse_plan);
+            fftw_destroy_plan (plan);
+            fftw_free (host_orig_mem);
+            fftw_free (host_immediate_mem);
+            fftw_free (host_result_mem);
+        }
+        printf ("\n");
+        fflush (stdout);
+    }
+
+    printf ("\n");
+    timer_destroy (timer);
+}

+ 10 - 0
cpu_fft.h

@@ -0,0 +1,10 @@
+#ifndef CPU_FFT_H
+#define CPU_FFT_H
+
+#include <fftw3.h>
+
+#include "utilities.h"
+
+void loop_data_fftw (OutputType outputType, TimeEntry *time_entry);
+
+#endif

+ 167 - 0
cuda_fft.c

@@ -0,0 +1,167 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cuda_runtime_api.h>
+
+#include "cuda_fft.h"
+
+void loop_data_cuda (const char *vendor,
+	                 CudaBenchmarkFunc func,
+	                 int n_devices,
+	                 OutputType outputType,
+	                 TimeEntry *time_entries)
+{
+    Timer *timer;
+
+    timer = timer_new ();
+
+    for (int j = 0; j < n_devices; j++) {
+        char vendor_name[50];
+        int v_len = sprintf(vendor_name, "%s_%d", vendor, j);
+        time_entries[j].lib_name = (char *)malloc(sizeof(char) * (v_len + 1));
+        strcpy(time_entries[j].lib_name, vendor_name);
+
+        time_entries[j].dim_entries = (DimEntry *)malloc(N_DIMS * sizeof(DimEntry));
+
+        for (int k = 0; k < N_DIMS; k++) {
+            int dim = DIMS[k];
+            int power_min = N_POWERS_INTERVALS[k][0];
+            int power_max = N_POWERS_INTERVALS[k][1];
+            int num_entries = power_max - power_min + 1;
+
+            time_entries[j].dim_entries[k].n_dims = dim;
+            time_entries[j].dim_entries[k].sizes  = (unsigned int **)malloc(sizeof(unsigned int *) * num_entries);
+            time_entries[j].dim_entries[k].times  = (double *)malloc(sizeof(double) * num_entries);
+            time_entries[j].dim_entries[k].errors = (double *)malloc(sizeof(double) * num_entries);
+
+            PRINT_DIM (dim);
+            fflush (stdout);
+
+            for (int m = power_min, i = 0; m <= power_max; m++, i++) {
+                size_t size_bytes;
+                cufftComplex *host_orig_mem;
+                cufftComplex *host_result_mem;
+                cufftComplex *dev_mem;
+                cufftComplex *dev_out_mem;
+
+                size_t side_size = pow(2,m);
+                size_t size = pow(side_size,dim);
+
+                size_bytes = size * sizeof (cufftComplex);
+                host_orig_mem = (cufftComplex *)malloc(size_bytes);
+                host_result_mem = (cufftComplex *)malloc(size_bytes);
+
+                for (int l = 0; l < size; l++) {
+                    host_orig_mem[l].x = rand() / ((float) RAND_MAX);
+                    host_orig_mem[l].y = rand() / ((float) RAND_MAX);
+                }
+
+                CUDA_SAFE_CALL (cudaMalloc ((void **)&dev_mem, size_bytes));
+                CUDA_SAFE_CALL (cudaMalloc ((void **)&dev_out_mem, size_bytes));
+
+                PRINT_DIMS(dim, side_size);
+
+                fflush (stdout);
+
+                double time_sec;
+                double sum;
+                bool scale;
+
+                printf (".");
+                fflush (stdout);
+
+                CUDA_SAFE_CALL (cudaMemcpy (dev_mem, host_orig_mem, size_bytes, cudaMemcpyHostToDevice));
+
+                size_t fft_size[3] = { 1, 1, 1};
+                for (int l = 0; l < dim; l++) {
+                    fft_size[l] = side_size;
+                }
+
+                scale = func (dev_mem, dev_out_mem, dim, fft_size, N_RUNS, timer);
+
+                /* Check precision */
+                CUDA_SAFE_CALL (cudaMemcpy (host_result_mem, dev_out_mem, size_bytes, cudaMemcpyDeviceToHost));
+                sum = sum_of_absolute_differences_complex (host_orig_mem, host_result_mem, size, scale);
+        
+                time_sec = timer_get_seconds (timer) / N_RUNS;
+
+                time_entries[j].dim_entries[k].times[i] = get_measurements_with_format(outputType, size_bytes, time_sec);
+                time_entries[j].dim_entries[k].errors[i] = sum / size;
+
+                free (host_orig_mem);
+                free (host_result_mem);
+
+                CUDA_SAFE_CALL (cudaFree (dev_mem));
+                CUDA_SAFE_CALL (cudaFree (dev_out_mem));
+            }
+
+            printf ("\n");
+            fflush (stdout);
+        }
+    }
+
+    printf ("\n");
+    timer_destroy (timer);
+}
+
+bool compute_cuda_fft (cufftComplex *dev_mem,
+	                   cufftComplex *out_mem,
+	                   int n_dims,
+	                   size_t *dims,
+	                   int n_runs,
+	                   Timer *timer)
+{
+    cufftHandle plan;
+
+    int dim_sizes[3] = {1, 1, 1};
+
+    switch (n_dims) {
+        case 1:
+            dim_sizes[0] = dims[0];
+            break;
+        case 2:
+            dim_sizes[0] = dims[0];
+            dim_sizes[1] = dims[1];
+            break;
+        case 3:
+            dim_sizes[0] = dims[0];
+            dim_sizes[1] = dims[1];
+            dim_sizes[2] = dims[2];
+            break;
+        default:
+            fprintf (stderr, "Unknown FFT dimensions\n");
+            return true;
+    }
+
+    CUFFT_SAFE_CALL (cufftPlanMany(&plan, n_dims, dim_sizes, NULL, 1, 0, NULL, 1, 0, CUFFT_C2C, 1));
+
+    timer_start (timer);
+
+    for (int i = 0; i < n_runs; i++) {
+        CUFFT_SAFE_CALL (cufftExecC2C (plan, (cufftComplex *)dev_mem, (cufftComplex *)out_mem, CUFFT_FORWARD));
+        CUDA_SAFE_CALL (cudaDeviceSynchronize ());
+    }
+
+    timer_stop (timer);
+
+    CUFFT_SAFE_CALL (cufftExecC2C (plan, (cufftComplex *)out_mem, (cufftComplex *)dev_mem, CUFFT_INVERSE));
+
+    CUDA_SAFE_CALL (cudaDeviceSynchronize ());
+
+    CUDA_SAFE_CALL (cudaMemcpy (out_mem, dev_mem, dim_sizes[0] * dim_sizes[1] * dim_sizes[2] * sizeof(cufftComplex), cudaMemcpyDeviceToDevice));   
+
+    CUFFT_SAFE_CALL (cufftDestroy (plan));
+
+    return true;
+}
+
+double sum_of_absolute_differences_complex (cufftComplex *a, cufftComplex *b, int n, bool scale)
+{
+    double sum = 0.0;
+
+    for (int i = 0; i < n; i++) {
+        sum += fabs (a[i].x - b[i].x / ((float)n));
+        sum += fabs (a[i].y - b[i].y / ((float)n));
+    }
+
+    return sum;
+}

+ 32 - 0
cuda_fft.h

@@ -0,0 +1,32 @@
+#ifndef CUDA_FFT_H
+#define CUDA_FFT_H
+
+#include <cufft.h>
+
+#include "utilities.h"
+
+#define CUDA_SAFE_CALL( call) do { \
+cudaError_t err = call; \
+if (cudaSuccess != err) { \
+fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
+__FILE__, __LINE__, cudaGetErrorString( err) ); \
+exit(EXIT_FAILURE); \
+} \
+} while (0)
+
+#define CUFFT_SAFE_CALL( call) do { \
+cufftResult err = call; \
+if (err != CUFFT_SUCCESS) { \
+fprintf(stderr, "Cufft error in file '%s' in line %i : %s.\n", \
+__FILE__, __LINE__, "error" ); \
+exit(EXIT_FAILURE); \
+} \
+} while (0)
+
+typedef bool (*CudaBenchmarkFunc) (cufftComplex *dev_mem, cufftComplex *dev_out_mem, int n_dims, size_t *dims, int n_runs, Timer *timer);
+
+void loop_data_cuda (const char *vendor, CudaBenchmarkFunc func, int n_devices, OutputType outputType, TimeEntry *time_entries);
+bool compute_cuda_fft (cufftComplex *dev_mem, cufftComplex *out_mem, int n_dims, size_t *dims, int n_runs, Timer *timer);
+double sum_of_absolute_differences_complex (cufftComplex *a, cufftComplex *b, int n, bool scale);
+
+#endif

+ 274 - 0
opencl_fft.c

@@ -0,0 +1,274 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "opencl_fft.h"
+
+double sum_of_absolute_differences (float *a, float *b, int n, bool scale)
+{
+    double sum = 0.0;
+
+    for (int i = 0; i < n; i++)
+        sum += fabs (a[i] - b[i] / (n / 2.));
+
+    return sum;
+}
+
+void loop_data_opencl (const char *vendor,
+	                  OclBenchmarkFunc func,
+	                  cl_context context,
+	                  cl_command_queue *queues,
+	                  int n_devices,
+	                  OutputType outputType,
+	                  TimeEntry *time_entries)
+{
+    Timer *timer;
+    cl_int err;
+    char *device_name;
+    size_t device_name_size;
+    cl_device_id *devices;
+    size_t device_list_size;
+
+    clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &device_list_size);
+    devices = (cl_device_id *) malloc(device_list_size);
+    clGetContextInfo(context, CL_CONTEXT_DEVICES, device_list_size, devices, NULL);
+
+    timer = timer_new ();
+
+    for (int j = 0; j < n_devices; j++) {
+        clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &device_name_size);
+        device_name = (char *) malloc(device_name_size);
+        clGetDeviceInfo(devices[j], CL_DEVICE_NAME, device_name_size, device_name, NULL);
+
+        char vendor_name[100];
+        int v_len = sprintf(vendor_name, "%s (%s)", device_name, vendor);
+        time_entries[j].lib_name = (char *)malloc(sizeof(char) * (v_len + 1));
+        strcpy(time_entries[j].lib_name, vendor_name);
+
+        time_entries[j].dim_entries = (DimEntry *)malloc(N_DIMS * sizeof(DimEntry));
+
+        if (n_devices > 1) {
+            printf ("Device: %s\n", time_entries[j].lib_name);
+            fflush (stdout);
+        }
+
+        for (int k = 0; k < N_DIMS; k++) {
+            int dim = DIMS[k];
+            int power_min = N_POWERS_INTERVALS[k][0];
+            int power_max = N_POWERS_INTERVALS[k][1];
+            int num_entries = power_max - power_min + 1;
+
+            time_entries[j].dim_entries[k].n_dims = dim;
+            time_entries[j].dim_entries[k].sizes  = (unsigned int **)malloc(sizeof(unsigned int *) * num_entries);
+            time_entries[j].dim_entries[k].times  = (double *)malloc(sizeof(double) * num_entries);
+            time_entries[j].dim_entries[k].errors = (double *)malloc(sizeof(double) * num_entries);
+
+            PRINT_DIM (dim);
+            fflush (stdout);
+
+            for (int m = power_min, i = 0; m <= power_max; m++, i++) {
+                size_t size_bytes;
+                float *host_orig_mem;
+                float *host_result_mem;
+                cl_mem dev_mem;
+                cl_mem dev_out_mem;
+
+                size_t side_size = pow(2,m);
+                size_t size = pow(side_size,dim);
+
+                size_bytes = size * 2 * sizeof (float);
+                host_orig_mem = malloc (size_bytes);
+                host_result_mem = malloc (size_bytes);
+
+                for (int l = 0; l < size * 2; l++) {
+                    host_orig_mem[l] = rand() / ((float) RAND_MAX);
+                }
+
+                dev_mem = clCreateBuffer (context, CL_MEM_READ_WRITE, size_bytes, NULL, &err);
+                OCL_CHECK_ERROR (err); 
+
+                dev_out_mem = clCreateBuffer (context, CL_MEM_READ_WRITE, size_bytes, NULL, &err);
+                OCL_CHECK_ERROR (err);
+
+                PRINT_DIMS(dim, side_size);
+
+                fflush (stdout);
+
+                double time_sec;
+                double sum;
+                bool scale;
+
+                printf (".");
+                fflush (stdout);
+
+                OCL_CHECK_ERROR (clEnqueueWriteBuffer (queues[j], dev_mem, CL_TRUE, 0, size_bytes, host_orig_mem, 0, NULL, NULL));
+
+                size_t fft_size[3] = { 1, 1, 1};
+                time_entries[j].dim_entries[k].sizes[i] = (unsigned int *)malloc(sizeof(unsigned int) * dim);
+
+                for (int l = 0; l < dim; l++) {
+                    fft_size[l] = side_size;
+                    time_entries[j].dim_entries[k].sizes[i][j] = side_size;
+                }
+
+                scale = func (context, queues[j], dev_mem, dev_out_mem, dim, fft_size, N_RUNS, timer);
+
+                /* Check precision */
+                OCL_CHECK_ERROR (clEnqueueReadBuffer (queues[j], dev_out_mem, CL_TRUE, 0, size_bytes, host_result_mem, 0, NULL, NULL));
+                sum = sum_of_absolute_differences (host_orig_mem, host_result_mem, size * 2, scale);
+
+                time_sec = timer_get_seconds (timer) / N_RUNS;
+
+                time_entries[j].dim_entries[k].times[i] = get_measurements_with_format(outputType, size_bytes, time_sec);
+                time_entries[j].dim_entries[k].errors[i] = sum / size;
+
+                free (host_orig_mem);
+                free (host_result_mem);
+                OCL_CHECK_ERROR (clReleaseMemObject (dev_mem));
+                OCL_CHECK_ERROR (clReleaseMemObject (dev_out_mem));
+            }
+
+            printf ("\n");
+            fflush (stdout);
+        }
+    }
+
+    printf ("\n");
+    timer_destroy (timer);
+}
+
+#ifdef HAVE_AMD_FFT
+bool compute_amd_fft (cl_context context,
+	                  cl_command_queue queue,
+	                  cl_mem dev_mem,
+	                  cl_mem out_mem,
+	                  int n_dims,
+	                  size_t *dims,
+	                  int n_runs,
+	                  Timer *timer)
+{
+    clfftSetupData setup;
+    clfftPlanHandle plan;
+    clfftDim dim;
+    cl_event event;
+    size_t size;
+
+    switch (n_dims) {
+        case 1:
+            dim = CLFFT_1D;
+            break;
+        case 2:
+            dim = CLFFT_2D;
+            break;
+        case 3:
+            dim = CLFFT_3D;
+            break;
+        default:
+            fprintf (stderr, "Unknown FFT dimensions\n");
+            return false;
+    }
+
+    OCL_CHECK_ERROR (clfftSetup (&setup));
+    OCL_CHECK_ERROR (clfftCreateDefaultPlan (&plan, context, dim, dims));
+    OCL_CHECK_ERROR (clfftSetPlanPrecision (plan, CLFFT_SINGLE));
+    OCL_CHECK_ERROR (clfftSetLayout (plan, CLFFT_COMPLEX_INTERLEAVED, CLFFT_COMPLEX_INTERLEAVED));
+    OCL_CHECK_ERROR (clfftSetResultLocation (plan, CLFFT_OUTOFPLACE));
+    OCL_CHECK_ERROR (clfftBakePlan (plan, 1, &queue, NULL, NULL));
+
+    timer_start (timer);
+
+    for (int i = 0; i < n_runs; i++) {
+        OCL_CHECK_ERROR (clfftEnqueueTransform (plan, CLFFT_FORWARD, 1, &queue, 0, NULL, &event, &dev_mem, &out_mem, NULL));
+        OCL_CHECK_ERROR (clWaitForEvents (1, &event));
+        OCL_CHECK_ERROR (clReleaseEvent (event));
+    }
+
+    timer_stop (timer);
+
+    OCL_CHECK_ERROR (clfftEnqueueTransform (plan, CLFFT_BACKWARD, 1, &queue, 0, NULL, &event, &out_mem, &dev_mem, NULL));
+    OCL_CHECK_ERROR (clWaitForEvents (1, &event));
+    OCL_CHECK_ERROR (clReleaseEvent (event));
+
+    /*
+     * We rely on the fact, that out_mem contains the inverse which currently
+     * lies in dev_mem, so let's copy it back.
+     */
+    OCL_CHECK_ERROR (clGetMemObjectInfo (dev_mem, CL_MEM_SIZE, sizeof (size_t), &size, NULL));
+    OCL_CHECK_ERROR (clEnqueueCopyBuffer (queue, dev_mem, out_mem, 0, 0, size, 0, NULL, &event));
+    OCL_CHECK_ERROR (clWaitForEvents (1, &event));
+    OCL_CHECK_ERROR (clReleaseEvent (event));
+
+    OCL_CHECK_ERROR (clfftDestroyPlan (&plan));
+    clfftTeardown ();
+
+    return false;
+}
+#endif
+
+#ifdef HAVE_APPLE_FFT
+bool compute_apple_fft (cl_context context,
+	                    cl_command_queue queue,
+	                    cl_mem dev_mem,
+	                    cl_mem out_mem,
+	                    int n_dims,
+	                    size_t *dims,
+	                    int n_runs,
+	                    Timer *timer)
+{
+    clFFT_Plan plan;
+    clFFT_Dimension dim;
+    clFFT_Dim3 dim_sizes = {.x = 1, .y = 1, .z = 1};
+    cl_event event;
+    cl_int err;
+    size_t size;
+
+    switch (n_dims) {
+        case 1:
+            dim = clFFT_1D;
+            dim_sizes.x = dims[0];
+            break;
+        case 2:
+            dim = clFFT_2D;
+            dim_sizes.x = dims[0];
+            dim_sizes.y = dims[1];
+            break;
+        case 3:
+            dim = clFFT_3D;
+            dim_sizes.x = dims[0];
+            dim_sizes.y = dims[1];
+            dim_sizes.z = dims[2];
+            break;
+        default:
+            fprintf (stderr, "Unknown FFT dimensions\n");
+            return true;
+    }
+
+    plan = clFFT_CreatePlan (context, dim_sizes, dim, clFFT_InterleavedComplexFormat, &err);
+    OCL_CHECK_ERROR (err);
+
+    timer_start (timer);
+
+    for (int i = 0; i < n_runs; i++) {
+        err = clFFT_ExecuteInterleaved (queue, plan, 1, clFFT_Forward, dev_mem, out_mem,
+                                        0, NULL, NULL);
+        OCL_CHECK_ERROR (err);
+
+        /* Apple FFT does not return events, hence we need the hammer */
+        OCL_CHECK_ERROR (clFinish (queue));
+    }
+
+    timer_stop (timer);
+
+    err = clFFT_ExecuteInterleaved (queue, plan, 1, clFFT_Inverse, out_mem, dev_mem, 0, NULL, NULL);
+    OCL_CHECK_ERROR (err);
+    OCL_CHECK_ERROR (clFinish (queue));
+
+    OCL_CHECK_ERROR (clGetMemObjectInfo (dev_mem, CL_MEM_SIZE, sizeof (size_t), &size, NULL));
+    OCL_CHECK_ERROR (clEnqueueCopyBuffer (queue, dev_mem, out_mem, 0, 0, size, 0, NULL, &event));
+    OCL_CHECK_ERROR (clWaitForEvents (1, &event));
+    OCL_CHECK_ERROR (clReleaseEvent (event));
+
+    clFFT_DestroyPlan (plan);
+
+    return true;
+}
+#endif

+ 45 - 0
opencl_fft.h

@@ -0,0 +1,45 @@
+#ifndef OPENCL_FFT_H
+#define OPENCL_FFT_H
+
+#include <CL/cl.h>
+
+#ifdef HAVE_AMD_FFT
+#include <clFFT.h>
+#endif
+
+#ifdef HAVE_APPLE_FFT
+#include <oclfft/clFFT.h>
+#endif
+
+#include "utilities.h"
+
+typedef bool (*OclBenchmarkFunc) (cl_context context, cl_command_queue queue, 
+								  cl_mem dev_mem, cl_mem dev_out_mem,
+								  int n_dims, size_t *dims, int n_runs, Timer *timer);
+
+double sum_of_absolute_differences (float *a, float *b, int n, bool scale);
+void loop_data_opencl (const char *vendor,
+			           OclBenchmarkFunc func,
+			           cl_context context,
+			           cl_command_queue *queues,
+			           int n_devices,
+			           OutputType outputType,
+			           TimeEntry *time_entries);
+bool compute_amd_fft (cl_context context,
+			          cl_command_queue queue,
+			          cl_mem dev_mem,
+			          cl_mem out_mem,
+			          int n_dims,
+			          size_t *dims,
+			          int n_runs,
+			          Timer *timer);
+bool compute_apple_fft (cl_context context,
+			            cl_command_queue queue,
+			            cl_mem dev_mem,
+			            cl_mem out_mem,
+			            int n_dims,
+			            size_t *dims,
+			            int n_runs,
+			            Timer *timer);
+
+#endif

+ 188 - 0
utilities.c

@@ -0,0 +1,188 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "utilities.h"
+
+void write_headers_in_file (int n_dims, int only_time, FILE *fp)
+{    
+    fprintf (fp, "# ");
+
+    for (int i = 0; i < n_dims; i++) {
+        int min_power = N_POWERS_INTERVALS[i][0];
+        int max_power = N_POWERS_INTERVALS[i][1];
+
+        for (int j = min_power; j <= max_power; j++) {
+            int side_size = pow(2,j);
+            switch (DIMS[i]) {
+                case 1:
+                fprintf (fp, ";");
+                fprintf (fp, "%d ", side_size);
+                if (!only_time) {
+                    fprintf (fp, ";");
+                    fprintf (fp, "%d(Error) ", side_size);
+                }
+                break;
+                case 2:
+                fprintf (fp, ";");
+                fprintf (fp, "%dx%d ", side_size, side_size);
+                if (!only_time) {
+                    fprintf (fp, ";");
+                    fprintf (fp, "%dx%d(Error) ", side_size, side_size);
+                }
+                break;
+                case 3:
+                fprintf (fp, ";");
+                fprintf (fp, "%dx%dx%d ", side_size, side_size, side_size);
+                if (!only_time) {
+                    fprintf (fp, ";");
+                    fprintf (fp, "%dx%dx%d(Error) ", side_size, side_size, side_size);
+                }
+                break;
+            }   
+        } 
+    }
+
+}
+
+void write_time_entries_in_file (TimeEntry* time_entries, int num_entries, int n_dims, int only_time, bool new_line, FILE *fp)
+{
+    if (new_line) {
+        fprintf (fp, "\n");
+    }
+
+    for (int i = 0; i < num_entries; i++) {
+        fprintf (fp, "%s ", time_entries[i].lib_name);
+
+        DimEntry *dim_entries = time_entries[i].dim_entries;
+
+        for (int dim = 0; dim < n_dims; dim++) {
+            DimEntry dim_entry = dim_entries[dim];
+
+            for (int j = 0; j < (N_POWERS_INTERVALS[dim][1] - N_POWERS_INTERVALS[dim][0] + 1); j++) {
+                if (only_time) {
+                    fprintf (fp, ";");
+                    fprintf (fp, "%f", dim_entry.times[j]);
+                }
+                else {
+                    fprintf (fp, ";%f;%f ", dim_entry.times[j], dim_entry.errors[j]);
+                }
+            }
+        }
+
+        if (i != num_entries -1) {
+            fprintf (fp, "\n");
+        }
+    }
+}
+
+OutputType get_output_type_by_measure(char *measure) {
+    OutputType outputType = OUT_MILLISECONDS;
+    bool invalid_format = false;
+
+    if (measure != NULL) {
+        if (!strcmp(optarg, "ms"))
+            outputType = OUT_MILLISECONDS;
+        else if (!strcmp(optarg, "sec"))
+            outputType = OUT_SECONDS;
+        else if (!strcmp(optarg, "gflops"))
+            outputType = OUT_GFLOPS;
+        else if (!strcmp(optarg, "mflops"))
+            outputType = OUT_MFLOPS;
+        else if (!strcmp(optarg, "MBs") || !strcmp(optarg, "mbs"))
+            outputType = OUT_THROUGHTPUT_MBS;
+        else if (!strcmp(optarg, "GBs") || !strcmp(optarg, "gbs"))
+            outputType = OUT_THROUGHTPUT_GBS;
+        else {
+            invalid_format = true;
+            fprintf (stderr, "Incorrect measure format [%s], [ms] will be used.\nUse the following formats: ms,sec,mflops,gflops,GBs,MBs.\n\n", measure);
+        }
+    }
+    
+    if (!invalid_format || (measure == NULL)) {
+        fprintf (stdout, "Output will be use measure [%s].\n\n", measure == NULL ? "ms" : measure);
+    }
+    
+    return outputType;
+}
+
+bool get_fft_range(char *val1, char *val2, int *out_range) {
+    int out_val1;
+    int out_val2;
+    bool range_is_valid = true;
+
+    out_val1 = (int) strtol (val1, NULL, 10);
+    out_val2 = (int) strtol (val2, NULL, 10);
+
+    if (out_val1 != 0L &&
+        out_val2 != 0L &&
+        (out_val1 >= MIN_POW2 && out_val1 <= MAX_POW2) &&
+        (out_val2 >= MIN_POW2 && out_val2 <= MAX_POW2) &&
+        (out_val1 < out_val2)) {
+        out_range[0] = out_val1;
+        out_range[1] = out_val2;
+    }
+    else {
+        fprintf (stderr, "Incorrect the range of powers, it should be in range [%d %d].\n\n", MIN_POW2, MAX_POW2);
+        range_is_valid = false;
+    }
+
+    return range_is_valid;
+}
+
+bool get_number_of_runs(char *val, int *out) {
+    bool value_valid = true;
+    int out_val;
+
+    out_val = (int) strtol (val, NULL, 10);
+
+    if (out_val != 0L && (out_val >= MIN_RUNS && out_val <= MAX_RUNS)) {
+        *out = out_val;
+    }
+    else {
+        fprintf (stderr, "Incorrect the number of runs, the value should be from %d to %d.\n\n", MIN_RUNS, MAX_RUNS);
+        value_valid = false;
+    }
+
+    return value_valid;
+}
+
+void print_usage(char *app_name, struct option long_options[], char **options_descritions, int exit_code)
+{
+    printf ("Usage: %s [OPTIONS]\n", app_name);
+    printf ("Options:\n");
+    for (int i = 0; long_options[i].name != 0; i++) {
+        printf("  --%-20s %s\n", long_options[i].name, options_descritions[i]);
+    }
+
+    exit (exit_code);
+}
+
+double get_measurements_with_format (OutputType outputType, size_t size_bytes, double time_sec)
+{
+    double out_result = -1;
+
+    if (outputType == OUT_MFLOPS) {
+        size_t size = size_bytes / 2 / sizeof (float);
+        out_result = 5 * size * log (size) / log (2) / (time_sec / 1000.0);
+    }
+    else if (outputType == OUT_GFLOPS) {
+        out_result = -1;
+    }
+    else if (outputType == OUT_THROUGHTPUT_MBS) {
+        out_result = ((double)size_bytes) / time_sec / 1000.0 / 1000.0;
+    }
+    else if (outputType == OUT_THROUGHTPUT_GBS) {
+        out_result = ((double)size_bytes) / time_sec / 1000.0 / 1000.0 / 1000.0;
+    }
+    else if (outputType == OUT_MILLISECONDS) {
+        out_result = time_sec * 1000.0;
+    }
+    else if (outputType == OUT_SECONDS) {
+        out_result = time_sec * 1000.0;
+    }
+    else {
+        fprintf (stderr, "Unknown output type of OpenCL routines!\n");
+    }
+
+    return out_result;
+}

+ 59 - 0
utilities.h

@@ -0,0 +1,59 @@
+#ifndef UTILITIES_H
+#define UTILITIES_H
+
+#include <stdio.h>
+#include <stdbool.h>
+#include <string.h>
+#include <getopt.h>
+#include <math.h>
+
+#include "time_entry.h"
+#include "timer.h"
+
+typedef enum _OutputType {
+    OUT_MILLISECONDS,
+    OUT_SECONDS,
+    OUT_MFLOPS,
+    OUT_GFLOPS,
+    OUT_THROUGHTPUT_GBS,
+    OUT_THROUGHTPUT_MBS,
+    OUT_NONE
+} OutputType;
+
+#define N_DIMS 3
+#define MIN_POW2 1
+#define MAX_POW2 15
+#define MIN_RUNS 1
+#define MAX_RUNS 100
+
+static int N_RUNS = 4;
+static const int DIMS[N_DIMS] = {1, 2, 3};
+static int N_POWERS_INTERVALS[N_DIMS][2] = {{5, 11}, {8, 11}, {7, 7}};
+
+#define PRINT_DIM(dim) printf ("%dD:", dim);
+
+#define PRINT_DIM_SIZE(side_size,dim) { \
+    printf(" %zu", side_size); while (dim != 1) { printf("x%zu", side_size);dim--; } printf("."); }
+
+#define OCL_CHECK_ERROR(error) { \
+    if ((error) != CL_SUCCESS) fprintf (stderr, "OpenCL error <%s:%i>\n", __FILE__, __LINE__); }
+
+#define PRINT_DIMS(dim,side_size) { \
+    if (dim == 1) { printf (" %zu", side_size); } \
+    else if (dim == 2) { printf (" %zux%zu", side_size, side_size); } \
+    else { printf (" %zux%zux%zu", side_size, side_size, side_size); } }
+
+void write_headers_in_file (int n_dims, int only_time, FILE *fp);
+void write_time_entries_in_file (TimeEntry* time_entries, 
+										int num_entries, int n_dims, int only_time,
+										bool new_line,
+										FILE *fp);
+OutputType get_output_type_by_measure(char *measure);
+bool get_fft_range(char *val1, char *val2, int *out_range);
+bool get_number_of_runs(char *val, int *out);
+void print_usage(char *app_name, struct option long_options[],
+						char **options_descritions, int exit_code);
+double get_measurements_with_format (OutputType outputType, 
+											size_t size_bytes, double time_sec);
+
+#endif