9 anos atrás · 82b5f54905
--- a/Makefile
+++ b/Makefile
@@ -8,8 +8,7 @@ OBJS = $(subst .c,.o,$(SRC))
 
				 # Dependencies 
			
 
				 DEP_OCLFFT = .deps/oclfft/src/liboclfft.so
			
 
				 DEP_CLFFT = .deps/clFFT/src/library/libclFFT.so
			
 
				-DEP_TOPEFFT = .deps/tope-fft/libtopefft.so
			
 
				-DEPS = $(DEP_OCLFFT) $(DEP_TOPEFFT)
			
 
				+DEPS = $(DEP_OCLFFT)
			
 
				 
			
 
				 # Common flags definition
			
 
				 NVCCFLAGS   := -m${OS_SIZE}
			
@@ -70,10 +69,6 @@ override CFLAGS += -I.deps/
 
				 override LDFLAGS += -L.deps/oclfft/src -loclfft
			
 
				 LIBS_MSG += " +apple"
			
 
				 
			
 
				-# Tope FFT library
			
 
				-override CPPFLAGS += -DHAVE_TOPE_FFT
			
 
				-LIBS_MSG += " +tope"
			
 
				-
			
 
				 # FFTW library
			
 
				 FFTW_EXISTS = $(shell pkg-config --exists fftw3 && echo "1" || echo "0")
			
 
				 
			
@@ -145,9 +140,4 @@ $(DEP_OCLFFT):
 
				 $(DEP_CLFFT):
			
 
				 	@mkdir -p .deps
			
 
				 	@git clone https://github.com/clMathLibraries/clFFT .deps/clFFT
			
 
				-	@cd .deps/clFFT/src; cmake .; make
			
 
				-
			
 
				-$(DEP_TOPEFFT):
			
 
				-	@mkdir -p .deps
			
 
				-	@git clone -b changes https://github.com/matze/tope-fft .deps/tope-fft
			
 
				-	cd .deps/tope-fft; CFLAGS="$(CFLAGS)" make
			
 
				+	@cd .deps/clFFT/src; cmake .; make
			
--- a/benchmark.c
+++ b/benchmark.c
@@ -1,4 +1,5 @@
 
				 #include <stdlib.h>
			
 
				+#include <string.h>
			
 
				 #include <stdio.h>
			
 
				 #include <stdbool.h>
			
 
				 #include <math.h>
			
@@ -66,6 +67,13 @@ const int DIMS[N_DIMS] = {1, 2, 3};
 
				 const int N_DIM_ARRAYS[N_DIMS] = {5, 4, 4};
			
 
				 const int N_POWERS_INTERVALS[N_DIMS][2] = {{5, 11}, {8, 11}, {7, 7}};
			
 
				 
			
 
				/*
				 * Unit in which get_measurements_with_format() reports a measurement.
				 *
				 * The enumerator values are unchanged; OUT_THROUGHPUT_MBS is a correctly
				 * spelled alias for the historical OUT_THROUGHTPUT_MBS, which is kept so
				 * that existing code continues to compile.
				 */
				typedef enum _OutputType {
				    OUT_MILLISECONDS,       /* elapsed time, in milliseconds */
				    OUT_MFLOPS,             /* "mflops" figure derived from size and time */
				    OUT_THROUGHTPUT_MBS,    /* bytes per second, scaled to MB/s (legacy spelling) */
				    OUT_NONE,               /* no unit; rejected by get_measurements_with_format() */
				    OUT_THROUGHPUT_MBS = OUT_THROUGHTPUT_MBS    /* preferred spelling, same value */
				} OutputType;
			
 
				+
			
 
				 #define UPDATE_SIZE(size) size *= 8;
			
 
				 #define PRINT_DIM_SIZE(side_size,dim) { \
			
 
				     printf(" %zu", side_size); while (dim != 1) { printf("x%zu", side_size);dim--; } printf("."); }
			
@@ -101,93 +109,116 @@ sum_of_absolute_differences_complex (cufftComplex *a, cufftComplex *b, int n, bo
 
				     return sum;
			
 
				 }
			
 
				 
			
 
				+static double
			
 
				+get_measurements_with_format (OutputType outputType, size_t size_bytes, double time_sec)
			
 
				+{
			
 
				+    double out_result = -1;
			
 
				+
			
 
				+    if (outputType == OUT_MFLOPS) {
			
 
				+        size_t size = size_bytes / 2 / sizeof (float);
			
 
				+        out_result = 5 * size * log (size) / log (2) / (time_sec / 1000.0);
			
 
				+    }
			
 
				+    else if (outputType == OUT_THROUGHTPUT_MBS) {
			
 
				+        out_result = ((double)size_bytes) / time_sec / 1000.0 / 1000.0;
			
 
				+    }
			
 
				+    else if (outputType == OUT_MILLISECONDS) {
			
 
				+        out_result = time_sec * 1000.0;
			
 
				+    }
			
 
				+    else {
			
 
				+        fprintf (stderr, "Unknown output type of OpenCL routines!\n");
			
 
				+    }
			
 
				+
			
 
				+    return out_result;
			
 
				+}
			
 
				+
			
 
				 static void
			
 
				 loop_data_opencl (const char *vendor,
			
 
				                   OclBenchmarkFunc func,
			
 
				                   cl_context context,
			
 
				                   cl_command_queue *queues,
			
 
				                   int n_devices,
			
 
				-                  double *times,
			
 
				-                  double *errors,
			
 
				-                  FILE *fp)
			
 
				+                  OutputType outputType,
			
 
				+                  TimeEntry *time_entries)
			
 
				 {
			
 
				     Timer *timer;
			
 
				     cl_int err;
			
 
				 
			
 
				     timer = timer_new ();
			
 
				 
			
 
				-    //for (int j = 0; j < n_devices; j++) {
			
 
				-    //    fprintf (fp, "%s_%i_bw %s_%i_err ", vendor, j, vendor, j);
			
 
				-    //}
			
 
				+    for (int j = 0; j < n_devices; j++) {
			
 
				+        char vendor_name[50];
			
 
				+        int v_len = sprintf(vendor_name, "%s_%d", vendor, j);
			
 
				+        time_entries[j].lib_name = (char *)malloc(sizeof(char) * (v_len + 1));
			
 
				+        strcpy(time_entries[j].lib_name, vendor_name);
			
 
				 
			
 
				-    //time_entry->lib_name = "FFTW";
			
 
				-    //time_entry->dim_entries = (DimEntry *)malloc(N_DIMS * sizeof(DimEntry));
			
 
				+        time_entries[j].dim_entries = (DimEntry *)malloc(N_DIMS * sizeof(DimEntry));
			
 
				 
			
 
				-    for (int k = 0; k < N_DIMS; k++) {
			
 
				-        int dim = DIMS[k];
			
 
				-        int power_min = N_POWERS_INTERVALS[k][0];
			
 
				-        int power_max = N_POWERS_INTERVALS[k][1];
			
 
				-        //int num_entries = power_max - power_min + 1;
			
 
				+        for (int k = 0; k < N_DIMS; k++) {
			
 
				+            int dim = DIMS[k];
			
 
				+            int power_min = N_POWERS_INTERVALS[k][0];
			
 
				+            int power_max = N_POWERS_INTERVALS[k][1];
			
 
				+            int num_entries = power_max - power_min + 1;
			
 
				 
			
 
				-        //time_entry->dim_entries[k].n_dims = dim;
			
 
				-        //time_entry->dim_entries[k].sizes  = (unsigned int **)malloc(sizeof(unsigned int *) * num_entries);
			
 
				-        //time_entry->dim_entries[k].times  = (double *)malloc(sizeof(double) * num_entries);
			
 
				-        //time_entry->dim_entries[k].errors = (double *)malloc(sizeof(double) * num_entries);
			
 
				+            time_entries[j].dim_entries[k].n_dims = dim;
			
 
				+            time_entries[j].dim_entries[k].sizes  = (unsigned int **)malloc(sizeof(unsigned int *) * num_entries);
			
 
				+            time_entries[j].dim_entries[k].times  = (double *)malloc(sizeof(double) * num_entries);
			
 
				+            time_entries[j].dim_entries[k].errors = (double *)malloc(sizeof(double) * num_entries);
			
 
				 
			
 
				-        printf ("%dD:", dim);
			
 
				-        fflush (stdout);
			
 
				+            printf ("%dD:", dim);
			
 
				+            fflush (stdout);
			
 
				 
			
 
				-        for (int m = power_min, i = 0; m <= power_max; m++, i++) {
			
 
				-            size_t size_bytes;
			
 
				-            float *host_orig_mem;
			
 
				-            float *host_result_mem;
			
 
				-            cl_mem dev_mem;
			
 
				-            cl_mem dev_out_mem;
			
 
				+            for (int m = power_min, i = 0; m <= power_max; m++, i++) {
			
 
				+                size_t size_bytes;
			
 
				+                float *host_orig_mem;
			
 
				+                float *host_result_mem;
			
 
				+                cl_mem dev_mem;
			
 
				+                cl_mem dev_out_mem;
			
 
				 
			
 
				-            size_t side_size = pow(2,m);
			
 
				-            size_t size = pow(side_size,dim);
			
 
				+                size_t side_size = pow(2,m);
			
 
				+                size_t size = pow(side_size,dim);
			
 
				 
			
 
				-            size_bytes = size * 2 * sizeof (float);
			
 
				-            host_orig_mem = malloc (size_bytes);
			
 
				-            host_result_mem = malloc (size_bytes);
			
 
				+                size_bytes = size * 2 * sizeof (float);
			
 
				+                host_orig_mem = malloc (size_bytes);
			
 
				+                host_result_mem = malloc (size_bytes);
			
 
				 
			
 
				-            for (int j = 0; j < size * 2; j++) {
			
 
				-                host_orig_mem[j] = rand() / ((float) RAND_MAX);
			
 
				-            }
			
 
				+                for (int l = 0; l < size * 2; l++) {
			
 
				+                    host_orig_mem[l] = rand() / ((float) RAND_MAX);
			
 
				+                }
			
 
				 
			
 
				-            dev_mem = clCreateBuffer (context, CL_MEM_READ_WRITE, size_bytes, NULL, &err);
			
 
				-            OCL_CHECK_ERROR (err); 
			
 
				+                dev_mem = clCreateBuffer (context, CL_MEM_READ_WRITE, size_bytes, NULL, &err);
			
 
				+                OCL_CHECK_ERROR (err); 
			
 
				 
			
 
				-            dev_out_mem = clCreateBuffer (context, CL_MEM_READ_WRITE, size_bytes, NULL, &err);
			
 
				-            OCL_CHECK_ERROR (err);
			
 
				+                dev_out_mem = clCreateBuffer (context, CL_MEM_READ_WRITE, size_bytes, NULL, &err);
			
 
				+                OCL_CHECK_ERROR (err);
			
 
				 
			
 
				-            if (dim == 1) {
			
 
				-                printf (" %zu", side_size);
			
 
				-            }
			
 
				-            else if (dim == 2) {
			
 
				-                printf (" %zux%zu", side_size, side_size);
			
 
				-            }
			
 
				-            else {
			
 
				-                printf (" %zux%zux%zu", side_size, side_size, side_size);
			
 
				-            }
			
 
				+                if (dim == 1) {
			
 
				+                    printf (" %zu", side_size);
			
 
				+                }
			
 
				+                else if (dim == 2) {
			
 
				+                    printf (" %zux%zu", side_size, side_size);
			
 
				+                }
			
 
				+                else {
			
 
				+                    printf (" %zux%zux%zu", side_size, side_size, side_size);
			
 
				+                }
			
 
				 
			
 
				-            fflush (stdout);
			
 
				+                fflush (stdout);
			
 
				 
			
 
				-            for (int j = 0; j < n_devices; j++) {
			
 
				-                //double time;
			
 
				-                //double mflops;
			
 
				+
			
 
				+                double time_sec;
			
 
				                 double sum;
			
 
				                 bool scale;
			
 
				 
			
 
				                 printf (".");
			
 
				                 fflush (stdout);
			
 
				 
			
 
				-                OCL_CHECK_ERROR (clEnqueueWriteBuffer (queues[j], dev_mem, CL_TRUE, 0, size_bytes, host_orig_mem,
			
 
				-                                                        0, NULL, NULL));
			
 
				+                OCL_CHECK_ERROR (clEnqueueWriteBuffer (queues[j], dev_mem, CL_TRUE, 0, size_bytes, host_orig_mem, 0, NULL, NULL));
			
 
				 
			
 
				                 size_t fft_size[3] = { 1, 1, 1};
			
 
				+                time_entries[j].dim_entries[k].sizes[i] = (unsigned int *)malloc(sizeof(unsigned int) * dim);
			
 
				+
			
 
				                 for (int l = 0; l < dim; l++) {
			
 
				                     fft_size[l] = side_size;
			
 
				+                    time_entries[j].dim_entries[k].sizes[i][j] = side_size;
			
 
				                 }
			
 
				 
			
 
				                 scale = func (context, queues[j], dev_mem, dev_out_mem, dim, fft_size, N_RUNS, timer);
			
@@ -196,25 +227,20 @@ loop_data_opencl (const char *vendor,
 
				                 OCL_CHECK_ERROR (clEnqueueReadBuffer (queues[j], dev_out_mem, CL_TRUE, 0, size_bytes, host_result_mem, 0, NULL, NULL));
			
 
				                 sum = sum_of_absolute_differences (host_orig_mem, host_result_mem, size * 2, scale);
			
 
				 
			
 
				-                /*
			
 
				-                * We use the "mflops" methodology from FFTW, which states that
			
 
				-                *   mflops = 5 * N * log_2 (N) / (time for one FFT in microseconds)
			
 
				-                */
			
 
				-                //time = timer_get_seconds (timer) / N_RUNS / 1000.0;
			
 
				-                //mflops = 5 * size * log (size) / log (2) / time;
			
 
				+                time_sec = timer_get_seconds (timer) / N_RUNS;
			
 
				+
			
 
				+                time_entries[j].dim_entries[k].times[i] = get_measurements_with_format(outputType, size_bytes, time_sec);
			
 
				+                time_entries[j].dim_entries[k].errors[i] = sum / size;
			
 
				 
			
 
				-                //times[j * N_ARRAYS + i] = mflops;
			
 
				-                errors[j * N_ARRAYS + i] = sum / size;
			
 
				+                free (host_orig_mem);
			
 
				+                free (host_result_mem);
			
 
				+                OCL_CHECK_ERROR (clReleaseMemObject (dev_mem));
			
 
				+                OCL_CHECK_ERROR (clReleaseMemObject (dev_out_mem));
			
 
				             }
			
 
				 
			
 
				-            free (host_orig_mem);
			
 
				-            free (host_result_mem);
			
 
				-            OCL_CHECK_ERROR (clReleaseMemObject (dev_mem));
			
 
				-            OCL_CHECK_ERROR (clReleaseMemObject (dev_out_mem));
			
 
				+            printf ("\n");
			
 
				+            fflush (stdout);
			
 
				         }
			
 
				-
			
 
				-        printf ("\n");
			
 
				-        fflush (stdout);
			
 
				     }
			
 
				 
			
 
				     printf ("\n");
			
@@ -227,109 +253,104 @@ static void
 
				 loop_data_cuda (const char *vendor,
			
 
				                 CudaBenchmarkFunc func,
			
 
				                 int n_devices,
			
 
				-                double *times,
			
 
				-                double *errors,
			
 
				-                FILE *fp)
			
 
				+                OutputType outputType,
			
 
				+                TimeEntry *time_entries)
			
 
				 {
			
 
				     Timer *timer;
			
 
				 
			
 
				     timer = timer_new ();
			
 
				 
			
 
				-    //for (int j = 0; j < n_devices; j++) {
			
 
				-    //    fprintf (fp, "%s_%i_bw %s_%i_err ", vendor, j, vendor, j);
			
 
				-    //}
			
 
				+    for (int j = 0; j < n_devices; j++) {
			
 
				+        char vendor_name[50];
			
 
				+        int v_len = sprintf(vendor_name, "%s_%d", vendor, j);
			
 
				+        time_entries[j].lib_name = (char *)malloc(sizeof(char) * (v_len + 1));
			
 
				+        strcpy(time_entries[j].lib_name, vendor_name);
			
 
				 
			
 
				-    //time_entry->lib_name = "FFTW";
			
 
				-    //time_entry->dim_entries = (DimEntry *)malloc(N_DIMS * sizeof(DimEntry));
			
 
				+        time_entries[j].dim_entries = (DimEntry *)malloc(N_DIMS * sizeof(DimEntry));
			
 
				 
			
 
				-    for (int k = 0; k < N_DIMS; k++) {
			
 
				-        int dim = DIMS[k];
			
 
				-        int power_min = N_POWERS_INTERVALS[k][0];
			
 
				-        int power_max = N_POWERS_INTERVALS[k][1];
			
 
				-        //int num_entries = power_max - power_min + 1;
			
 
				+        for (int k = 0; k < N_DIMS; k++) {
			
 
				+            int dim = DIMS[k];
			
 
				+            int power_min = N_POWERS_INTERVALS[k][0];
			
 
				+            int power_max = N_POWERS_INTERVALS[k][1];
			
 
				+            int num_entries = power_max - power_min + 1;
			
 
				 
			
 
				-        //time_entry->dim_entries[k].n_dims = dim;
			
 
				-        //time_entry->dim_entries[k].sizes  = (unsigned int **)malloc(sizeof(unsigned int *) * num_entries);
			
 
				-        //time_entry->dim_entries[k].times  = (double *)malloc(sizeof(double) * num_entries);
			
 
				-        //time_entry->dim_entries[k].errors = (double *)malloc(sizeof(double) * num_entries);
			
 
				+            time_entries[j].dim_entries[k].n_dims = dim;
			
 
				+            time_entries[j].dim_entries[k].sizes  = (unsigned int **)malloc(sizeof(unsigned int *) * num_entries);
			
 
				+            time_entries[j].dim_entries[k].times  = (double *)malloc(sizeof(double) * num_entries);
			
 
				+            time_entries[j].dim_entries[k].errors = (double *)malloc(sizeof(double) * num_entries);
			
 
				 
			
 
				-        printf ("%dD:", dim);
			
 
				-        fflush (stdout);
			
 
				+            printf ("%dD:", dim);
			
 
				+            fflush (stdout);
			
 
				 
			
 
				-        for (int m = power_min, i = 0; m <= power_max; m++, i++) {
			
 
				-            size_t size_bytes;
			
 
				-            cufftComplex *host_orig_mem;
			
 
				-            cufftComplex *host_result_mem;
			
 
				-            cufftComplex *dev_mem;
			
 
				-            cufftComplex *dev_out_mem;
			
 
				+            for (int m = power_min, i = 0; m <= power_max; m++, i++) {
			
 
				+                size_t size_bytes;
			
 
				+                cufftComplex *host_orig_mem;
			
 
				+                cufftComplex *host_result_mem;
			
 
				+                cufftComplex *dev_mem;
			
 
				+                cufftComplex *dev_out_mem;
			
 
				 
			
 
				-            size_t side_size = pow(2,m);
			
 
				-            size_t size = pow(side_size,dim);
			
 
				+                size_t side_size = pow(2,m);
			
 
				+                size_t size = pow(side_size,dim);
			
 
				 
			
 
				-            size_bytes = size * sizeof (cufftComplex);
			
 
				-            host_orig_mem = (cufftComplex *)malloc(size_bytes);
			
 
				-            host_result_mem = (cufftComplex *)malloc(size_bytes);
			
 
				+                size_bytes = size * sizeof (cufftComplex);
			
 
				+                host_orig_mem = (cufftComplex *)malloc(size_bytes);
			
 
				+                host_result_mem = (cufftComplex *)malloc(size_bytes);
			
 
				 
			
 
				-            for (int j = 0; j < size; j++) {
			
 
				-                host_orig_mem[j].x = rand() / ((float) 10);
			
 
				-                host_orig_mem[j].y = rand() / ((float) 10);
			
 
				-            }
			
 
				+                for (int l = 0; l < size; l++) {
			
 
				+                    host_orig_mem[l].x = rand() / ((float) 10);
			
 
				+                    host_orig_mem[l].y = rand() / ((float) 10);
			
 
				+                }
			
 
				 
			
 
				-            CUDA_SAFE_CALL (cudaMalloc ((void **)&dev_mem, size_bytes));
			
 
				-            CUDA_SAFE_CALL (cudaMalloc ((void **)&dev_out_mem, size_bytes));
			
 
				+                CUDA_SAFE_CALL (cudaMalloc ((void **)&dev_mem, size_bytes));
			
 
				+                CUDA_SAFE_CALL (cudaMalloc ((void **)&dev_out_mem, size_bytes));
			
 
				 
			
 
				-            if (dim == 1) {
			
 
				-                printf (" %zu", side_size);
			
 
				-            }
			
 
				-            else if (dim == 2) {
			
 
				-                printf (" %zux%zu", side_size, side_size);
			
 
				-            }
			
 
				-            else {
			
 
				-                printf (" %zux%zux%zu", side_size, side_size, side_size);
			
 
				-            }
			
 
				+                if (dim == 1) {
			
 
				+                    printf (" %zu", side_size);
			
 
				+                }
			
 
				+                else if (dim == 2) {
			
 
				+                    printf (" %zux%zu", side_size, side_size);
			
 
				+                }
			
 
				+                else {
			
 
				+                    printf (" %zux%zux%zu", side_size, side_size, side_size);
			
 
				+                }
			
 
				 
			
 
				-            fflush (stdout);
			
 
				+                fflush (stdout);
			
 
				 
			
 
				-            //double time;
			
 
				-            //double mflops;
			
 
				-            double sum;
			
 
				-            bool scale;
			
 
				+                double time_sec;
			
 
				+                double sum;
			
 
				+                bool scale;
			
 
				 
			
 
				-            printf (".");
			
 
				-            fflush (stdout);
			
 
				+                printf (".");
			
 
				+                fflush (stdout);
			
 
				 
			
 
				-            CUDA_SAFE_CALL (cudaMemcpy (dev_mem, host_orig_mem, size_bytes, cudaMemcpyHostToDevice));
			
 
				+                CUDA_SAFE_CALL (cudaMemcpy (dev_mem, host_orig_mem, size_bytes, cudaMemcpyHostToDevice));
			
 
				 
			
 
				-            size_t fft_size[3] = { 1, 1, 1};
			
 
				-            for (int l = 0; l < dim; l++) {
			
 
				-                fft_size[l] = side_size;
			
 
				-            }
			
 
				+                size_t fft_size[3] = { 1, 1, 1};
			
 
				+                for (int l = 0; l < dim; l++) {
			
 
				+                    fft_size[l] = side_size;
			
 
				+                }
			
 
				 
			
 
				-            scale = func (dev_mem, dev_out_mem, dim, fft_size, N_RUNS, timer);
			
 
				+                scale = func (dev_mem, dev_out_mem, dim, fft_size, N_RUNS, timer);
			
 
				 
			
 
				-            /* Check precision */
			
 
				-            CUDA_SAFE_CALL (cudaMemcpy (host_result_mem, dev_out_mem, size_bytes, cudaMemcpyDeviceToHost));
			
 
				-            sum = sum_of_absolute_differences_complex (host_orig_mem, host_result_mem, size, scale);
			
 
				-
			
 
				-            /*
			
 
				-            * We use the "mflops" methodology from FFTW, which states that
			
 
				-            *   mflops = 5 * N * log_2 (N) / (time for one FFT in microseconds)
			
 
				-            */
			
 
				-            //time = timer_get_seconds (timer) / N_RUNS / 1000.0;
			
 
				-            //mflops = 5 * size * log (size) / log (2) / time;
			
 
				-
			
 
				-            //times[i] = mflops;
			
 
				-            errors[i] = sum / size;
			
 
				+                /* Check precision */
			
 
				+                CUDA_SAFE_CALL (cudaMemcpy (host_result_mem, dev_out_mem, size_bytes, cudaMemcpyDeviceToHost));
			
 
				+                sum = sum_of_absolute_differences_complex (host_orig_mem, host_result_mem, size, scale);
			
 
				         
			
 
				-            free (host_orig_mem);
			
 
				-            free (host_result_mem);
			
 
				+                time_sec = timer_get_seconds (timer) / N_RUNS;
			
 
				 
			
 
				-            CUDA_SAFE_CALL (cudaFree (dev_mem));
			
 
				-            CUDA_SAFE_CALL (cudaFree (dev_out_mem));
			
 
				-        }
			
 
				+                time_entries[j].dim_entries[k].times[i] = get_measurements_with_format(outputType, size_bytes, time_sec);
			
 
				+                time_entries[j].dim_entries[k].errors[i] = sum / size;
			
 
				 
			
 
				-        printf ("\n");
			
 
				-        fflush (stdout);
			
 
				+                free (host_orig_mem);
			
 
				+                free (host_result_mem);
			
 
				+
			
 
				+                CUDA_SAFE_CALL (cudaFree (dev_mem));
			
 
				+                CUDA_SAFE_CALL (cudaFree (dev_out_mem));
			
 
				+            }
			
 
				+
			
 
				+            printf ("\n");
			
 
				+            fflush (stdout);
			
 
				+        }
			
 
				     }
			
 
				 
			
 
				     printf ("\n");
			
@@ -339,7 +360,7 @@ loop_data_cuda (const char *vendor,
 
				 
			
 
				 #ifdef HAVE_FFTW
			
 
				 static void
			
 
				-loop_data_fftw (TimeEntry *time_entry)
			
 
				+loop_data_fftw (OutputType outputType, TimeEntry *time_entry)
			
 
				 {
			
 
				     Timer *timer;
			
 
				 
			
@@ -368,16 +389,16 @@ loop_data_fftw (TimeEntry *time_entry)
 
				             fftw_complex *host_immediate_mem;
			
 
				             fftw_plan plan;
			
 
				             fftw_plan inverse_plan;
			
 
				-            double time;
			
 
				-            double mflops;
			
 
				+            double time_sec;
			
 
				             double sum = 0.0;
			
 
				 
			
 
				             size_t side_size = pow(2,m);
			
 
				             size_t size = pow(side_size,dim);
			
 
				+            size_t size_bytes = sizeof (fftw_complex) * size;
			
 
				 
			
 
				-            host_orig_mem = fftw_malloc (sizeof (fftw_complex) * size);
			
 
				-            host_immediate_mem = fftw_malloc (sizeof (fftw_complex) * size);
			
 
				-            host_result_mem = fftw_malloc (sizeof (fftw_complex) * size);
			
 
				+            host_orig_mem = fftw_malloc (size_bytes);
			
 
				+            host_immediate_mem = fftw_malloc (size_bytes);
			
 
				+            host_result_mem = fftw_malloc (size_bytes);
			
 
				 
			
 
				             switch (dim) {
			
 
				                 case 1:
			
@@ -445,15 +466,14 @@ loop_data_fftw (TimeEntry *time_entry)
 
				                 sum += fabs (host_result_mem[j][1] / size - host_orig_mem[j][1]);
			
 
				             }
			
 
				 
			
 
				-            time = timer_get_seconds (timer) / N_RUNS / 1000.0;
			
 
				-            mflops = 5 * size * log (size) / log (2) / time;
			
 
				+            time_sec = timer_get_seconds (timer) / N_RUNS;
			
 
				 
			
 
				             time_entry->dim_entries[k].sizes[i] = (unsigned int *)malloc(sizeof(unsigned int) * dim);
			
 
				             for (int j = 0; j < dim; j++) {
			
 
				                 time_entry->dim_entries[k].sizes[i][j] = side_size;
			
 
				             }
			
 
				 
			
 
				-            time_entry->dim_entries[k].times[i] = mflops;
			
 
				+            time_entry->dim_entries[k].times[i] = get_measurements_with_format(outputType, size_bytes, time_sec);
			
 
				             time_entry->dim_entries[k].errors[i] = sum / size;
			
 
				             
			
 
				             fftw_destroy_plan (inverse_plan);
			
@@ -664,9 +684,75 @@ compute_cuda_fft (cufftComplex *dev_mem,
 
				 }
			
 
				 #endif
			
 
				 
			
 
				+static void
			
 
				+write_headers_in_file (int n_dims, bool only_time, FILE *fp)
			
 
				+{    
			
 
				+    fprintf (fp, "# ");
			
 
				+
			
 
				+    for (int i = 0; i < n_dims; i++) {
			
 
				+        int min_power = N_POWERS_INTERVALS[i][0];
			
 
				+        int max_power = N_POWERS_INTERVALS[i][1];
			
 
				+
			
 
				+        for (int j = min_power; j <= max_power; j++) {
			
 
				+            int side_size = pow(2,j);
			
 
				+            switch (DIMS[i]) {
			
 
				+                case 1:
			
 
				+                fprintf (fp, "%d ", side_size);
			
 
				+                if (!only_time) {
			
 
				+                    fprintf (fp, "%d(Error) ", side_size);
			
 
				+                }
			
 
				+                break;
			
 
				+                case 2:
			
 
				+                fprintf (fp, "%dx%d ", side_size, side_size);
			
 
				+                if (!only_time) {
			
 
				+                    fprintf (fp, "%dx%d(Error) ", side_size, side_size);
			
 
				+                }
			
 
				+                break;
			
 
				+                case 3:
			
 
				+                fprintf (fp, "%dx%dx%d ", side_size, side_size, side_size);
			
 
				+                if (!only_time) {
			
 
				+                    fprintf (fp, "%dx%dx%d(Error) ", side_size, side_size, side_size);
			
 
				+                }
			
 
				+                break;
			
 
				+            }   
			
 
				+        } 
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+static void
			
 
				+write_time_entries_in_file (TimeEntry* time_entries, int num_entries, int n_dims, bool only_time, bool new_line, FILE *fp)
			
 
				+{
			
 
				+    if (new_line) {
			
 
				+        fprintf (fp, "\n");
			
 
				+    }
			
 
				+
			
 
				+    for (int i = 0; i < num_entries; i++) {
			
 
				+        fprintf (fp, "%s ", time_entries[i].lib_name);
			
 
				+
			
 
				+        DimEntry *dim_entries = time_entries[i].dim_entries;
			
 
				+
			
 
				+        for (int dim = 0; dim < n_dims; dim++) {
			
 
				+            DimEntry dim_entry = dim_entries[dim];
			
 
				+
			
 
				+            for (int j = 0; j < (N_POWERS_INTERVALS[dim][1] - N_POWERS_INTERVALS[dim][0] + 1); j++) {
			
 
				+                if (only_time) {
			
 
				+                    fprintf (fp, "%f ", dim_entry.times[j]);
			
 
				+                }
			
 
				+                else {
			
 
				+                    fprintf (fp, "%f %f ", dim_entry.times[j], dim_entry.errors[j]);
			
 
				+                }
			
 
				+            }
			
 
				+        }   
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				 int
			
 
				 main (int argc, char **argv)
			
 
				 {
			
 
				+    OutputType outputType = OUT_THROUGHTPUT_MBS;
			
 
				+    bool only_time = true;
			
 
				+    bool new_line = true;
			
 
				+
			
 
				 #ifdef HAVE_OPENCL
			
 
				     cl_platform_id platform;
			
 
				     cl_uint n_devices;
			
@@ -678,25 +764,22 @@ main (int argc, char **argv)
 
				 
			
 
				 #ifdef HAVE_AMD_FFT
			
 
				     static int with_amd_fft = 1;
			
 
				-    double *amd_times;
			
 
				-    double *amd_errors;
			
 
				+    TimeEntry *amd_time_entries;
			
 
				 #endif
			
 
				 
			
 
				 #ifdef HAVE_CUDA_FFT
			
 
				     static int with_cuda_fft = 1;
			
 
				-    double *cuda_times;
			
 
				-    double *cuda_errors;
			
 
				+    TimeEntry *cuda_time_entries;
			
 
				 #endif
			
 
				 
			
 
				 #ifdef HAVE_APPLE_FFT
			
 
				     static int with_apple_fft = 1;
			
 
				-    double *apple_times;
			
 
				-    double *apple_errors;
			
 
				+    TimeEntry *apple_time_entries;
			
 
				 #endif
			
 
				 
			
 
				 #ifdef HAVE_FFTW
			
 
				     static int with_fftw = 1;
			
 
				-    TimeEntry *time_entries_fftw;
			
 
				+    TimeEntry *fftw_time_entries;
			
 
				 #endif
			
 
				 
			
 
				     static int show_help = 0;
			
@@ -719,10 +802,6 @@ main (int argc, char **argv)
 
				         {0, 0, 0, 0}
			
 
				     };
			
 
				 
			
 
				-    size_t size = INITIAL_SIZE;
			
 
				-    FILE *fp;
			
 
				-    FILE *fp_new;
			
 
				-
			
 
				     /* Parse options */
			
 
				     while (getopt_long (argc, argv, "", long_options, NULL) != -1)
			
 
				         ;
			
@@ -736,13 +815,10 @@ main (int argc, char **argv)
 
				         return 0;
			
 
				     }
			
 
				 
			
 
				-    /* Write header */
			
 
				-    fp = fopen ("result.txt", "w");
			
 
				-    fp_new = fopen ("result_new.txt", "w");
			
 
				-
			
 
				-    fprintf (fp, "# size ");
			
 
				+    /* Open output file */
			
 
				+    FILE *fp;
			
 
				 
			
 
				-    fprintf (fp_new, "# ");
			
 
				+    fp = fopen ("result.txt", "w");
			
 
				 
			
 
				 #ifdef HAVE_OPENCL
			
 
				     OCL_CHECK_ERROR (clGetPlatformIDs (1, &platform, NULL));
			
@@ -763,123 +839,73 @@ main (int argc, char **argv)
 
				 #endif
			
 
				 
			
 
				 #ifdef HAVE_AMD_FFT
			
 
				-    amd_times = malloc (n_devices * N_ARRAYS * sizeof (double));
			
 
				-    amd_errors = malloc (n_devices * N_ARRAYS * sizeof (double));
			
 
				+    amd_time_entries = (TimeEntry *)malloc(sizeof(TimeEntry) * n_devices);
			
 
				 
			
 
				     if (with_amd_fft) {
			
 
				         printf ("Testing AMD FFT ...\n");
			
 
				-        loop_data_opencl ("AMD", compute_amd_fft, context, queues, n_devices, amd_times, amd_errors, fp);
			
 
				+        loop_data_opencl ("AMD", compute_amd_fft, context, queues, n_devices, outputType, amd_time_entries);
			
 
				     }
			
 
				 #endif
			
 
				 
			
 
				 #ifdef HAVE_APPLE_FFT
			
 
				-    apple_times = malloc (n_devices * N_ARRAYS * sizeof (double));
			
 
				-    apple_errors = malloc (n_devices * N_ARRAYS * sizeof (double));
			
 
				+    apple_time_entries = (TimeEntry *)malloc(sizeof(TimeEntry) * n_devices);
			
 
				 
			
 
				     if (with_apple_fft) {
			
 
				         printf ("Testing Apple FFT ...\n");
			
 
				-        loop_data_opencl ("APP", compute_apple_fft, context, queues, n_devices, apple_times, apple_errors, fp);
			
 
				+        loop_data_opencl ("APP", compute_apple_fft, context, queues, n_devices, outputType, apple_time_entries);
			
 
				     }
			
 
				 #endif
			
 
				 
			
 
				 #ifdef HAVE_CUDA_FFT
			
 
				-    cuda_times = malloc (1 * N_ARRAYS * sizeof (double));
			
 
				-    cuda_errors = malloc (1 * N_ARRAYS * sizeof (double));
			
 
				+    cuda_time_entries = (TimeEntry *)malloc(sizeof(TimeEntry) * 1);
			
 
				 
			
 
				     if (with_cuda_fft) {
			
 
				         printf ("Testing CUDA FFT ...\n");
			
 
				-        loop_data_cuda ("CUDA", compute_cuda_fft, 1, cuda_times, cuda_errors, fp);
			
 
				+        loop_data_cuda ("CUDA", compute_cuda_fft, 1, outputType, cuda_time_entries);
			
 
				     }
			
 
				 #endif
			
 
				 
			
 
				 #ifdef HAVE_FFTW
			
 
				-    time_entries_fftw = (TimeEntry *)malloc(sizeof(TimeEntry));
			
 
				+    fftw_time_entries = (TimeEntry *)malloc(sizeof(TimeEntry));
			
 
				 
			
 
				     if (with_fftw) {
			
 
				         printf ("Testing FFTW3 ...\n");
			
 
				-        loop_data_fftw (&(time_entries_fftw[0]));
			
 
				+        loop_data_fftw (outputType, &(fftw_time_entries[0]));
			
 
				     }
			
 
				 #endif
			
 
				 
			
 
				-    for (int i = 0; i < N_DIMS; i++) {
			
 
				-        int min_power = N_POWERS_INTERVALS[i][0];
			
 
				-        int max_power = N_POWERS_INTERVALS[i][1];
			
 
				-
			
 
				-        for (int j = min_power; j <= max_power; j++) {
			
 
				-            int side_size = pow(2,j);
			
 
				-            switch (DIMS[i]) {
			
 
				-                case 1:
			
 
				-                 fprintf (fp_new, "%d ", side_size);
			
 
				-                 fprintf (fp_new, "%d(Error) ", side_size);
			
 
				-                 break;
			
 
				-                case 2:
			
 
				-                 fprintf (fp_new, "%dx%d ", side_size, side_size);
			
 
				-                 fprintf (fp_new, "%dx%d(Error) ", side_size, side_size);
			
 
				-                 break;
			
 
				-                case 3:
			
 
				-                 fprintf (fp_new, "%dx%dx%d ", side_size, side_size, side_size);
			
 
				-                 fprintf (fp_new, "%dx%dx%d(Error) ", side_size, side_size, side_size);
			
 
				-                 break;
			
 
				-            }   
			
 
				-        }
			
 
				-        
			
 
				-    }
			
 
				-
			
 
				-    fprintf (fp_new, "\n");
			
 
				-
			
 
				-    for (int i = 0; i < 1; i++) { //loop over time entries
			
 
				-        fprintf (fp_new, "%s ", time_entries_fftw[i].lib_name);
			
 
				-
			
 
				-        DimEntry *dim_entries = time_entries_fftw[i].dim_entries;
			
 
				-
			
 
				-        for (int dim = 0; dim < N_DIMS; dim++) {
			
 
				-            DimEntry dim_entry = dim_entries[dim];
			
 
				-
			
 
				-            for (int j = 0; j < (N_POWERS_INTERVALS[dim][1] - N_POWERS_INTERVALS[dim][0] + 1); j++) {
			
 
				-                fprintf (fp_new, "%f %f ", dim_entry.times[j], dim_entry.errors[j]);
			
 
				-            }
			
 
				-        }   
			
 
				-    }
			
 
				-
			
 
				-    for (int i = 0; i < N_ARRAYS; i++) {
			
 
				-        UPDATE_SIZE (size);
			
 
				-
			
 
				-        fprintf (fp, "\n%zu ", size);
			
 
				-
			
 
				-        for (int j = 0; j < n_devices; j++) {
			
 
				-            int index = j * N_ARRAYS + i;
			
 
				+    /* Write headers */
			
 
				+    write_headers_in_file (N_DIMS, only_time, fp);
			
 
				 
			
 
				 #ifdef HAVE_AMD_FFT
			
 
				-            if (with_amd_fft) {
			
 
				-                fprintf (fp, "%f %f ", amd_times[index], amd_errors[index]);
			
 
				-            }
			
 
				+    if (with_amd_fft) {
			
 
				+        write_time_entries_in_file (amd_time_entries, n_devices, N_DIMS, only_time, new_line, fp);
			
 
				+    }
			
 
				 #endif
			
 
				 
			
 
				 #ifdef HAVE_APPLE_FFT
			
 
				-            if (with_apple_fft) {
			
 
				-                fprintf (fp, "%f %f ", apple_times[index], apple_errors[index]);
			
 
				-            }
			
 
				-#endif
			
 
				-        }
			
 
				-/*
			
 
				-#ifdef HAVE_FFTW
			
 
				-        if (with_fftw) {
			
 
				-            fprintf (fp, "%f %f ", fftw_times[i], fftw_errors[i]);
			
 
				-        }
			
 
				+    if (with_apple_fft) {
			
 
				+        write_time_entries_in_file (apple_time_entries, n_devices, N_DIMS, only_time, new_line, fp);
			
 
				+    }
			
 
				 #endif
			
 
				-*/
			
 
				+
			
 
				 #ifdef HAVE_CUDA_FFT
			
 
				-        if (with_cuda_fft) {
			
 
				-            fprintf (fp, "%f %f ", cuda_times[i], cuda_times[i]);
			
 
				-        }
			
 
				+    if (with_cuda_fft) {
			
 
				+        write_time_entries_in_file (cuda_time_entries, 1, N_DIMS, only_time, new_line, fp);
			
 
				+    }
			
 
				 #endif
			
 
				+
			
 
				+#ifdef HAVE_FFTW
			
 
				+    if (with_fftw) {
			
 
				+        write_time_entries_in_file (fftw_time_entries, 1, N_DIMS, only_time, new_line, fp);
			
 
				     }
			
 
				+#endif
			
 
				 
			
 
				-    fprintf (fp, "\n");
			
 
				 
			
 
				 #ifdef HAVE_OPENCL
			
 
				-    for (int i = 0; i < n_devices; i++)
			
 
				+    for (int i = 0; i < n_devices; i++) {
			
 
				         clReleaseCommandQueue (queues[i]);
			
 
				+    }
			
 
				 
			
 
				     clReleaseContext (context);
			
 
				 
			
@@ -889,6 +915,5 @@ main (int argc, char **argv)
 
				 
			
 
				     fclose (fp);
			
 
				 
			
 
				-    fclose (fp_new);
			
 
				     return 0;
			
 
				 }