Ver código fonte

Add new logging approach

Roman Shkarin 9 anos atrás
pai
commit
82b5f54905
2 arquivos alterados com 276 adições e 261 exclusões
  1. 2 12
      Makefile
  2. 274 249
      benchmark.c

+ 2 - 12
Makefile

@@ -8,8 +8,7 @@ OBJS = $(subst .c,.o,$(SRC))
 # Dependencies 
 DEP_OCLFFT = .deps/oclfft/src/liboclfft.so
 DEP_CLFFT = .deps/clFFT/src/library/libclFFT.so
-DEP_TOPEFFT = .deps/tope-fft/libtopefft.so
-DEPS = $(DEP_OCLFFT) $(DEP_TOPEFFT)
+DEPS = $(DEP_OCLFFT)
 
 # Common flags definition
 NVCCFLAGS   := -m${OS_SIZE}
@@ -70,10 +69,6 @@ override CFLAGS += -I.deps/
 override LDFLAGS += -L.deps/oclfft/src -loclfft
 LIBS_MSG += " +apple"
 
-# Tope FFT library
-override CPPFLAGS += -DHAVE_TOPE_FFT
-LIBS_MSG += " +tope"
-
 # FFTW library
 FFTW_EXISTS = $(shell pkg-config --exists fftw3 && echo "1" || echo "0")
 
@@ -145,9 +140,4 @@ $(DEP_OCLFFT):
 $(DEP_CLFFT):
 	@mkdir -p .deps
 	@git clone https://github.com/clMathLibraries/clFFT .deps/clFFT
-	@cd .deps/clFFT/src; cmake .; make
-
-$(DEP_TOPEFFT):
-	@mkdir -p .deps
-	@git clone -b changes https://github.com/matze/tope-fft .deps/tope-fft
-	cd .deps/tope-fft; CFLAGS="$(CFLAGS)" make
+	@cd .deps/clFFT/src; cmake .; make

+ 274 - 249
benchmark.c

@@ -1,4 +1,5 @@
 #include <stdlib.h>
+#include <string.h>
 #include <stdio.h>
 #include <stdbool.h>
 #include <math.h>
@@ -66,6 +67,13 @@ const int DIMS[N_DIMS] = {1, 2, 3};
 const int N_DIM_ARRAYS[N_DIMS] = {5, 4, 4};
 const int N_POWERS_INTERVALS[N_DIMS][2] = {{5, 11}, {8, 11}, {7, 7}};
 
+typedef enum OutputType {    /* unit in which a benchmark measurement is reported */
+    OUT_MILLISECONDS,        /* elapsed time, in milliseconds */
+    OUT_MFLOPS,              /* 5 * N * log2 (N) based "mflops" estimate */
+    OUT_THROUGHTPUT_MBS,     /* bytes processed per second, in units of 10^6 bytes */
+    OUT_NONE                 /* no valid unit: formatting reports an error and yields -1 */
+} OutputType;
+
 #define UPDATE_SIZE(size) size *= 8;
 #define PRINT_DIM_SIZE(side_size,dim) { \
     printf(" %zu", side_size); while (dim != 1) { printf("x%zu", side_size);dim--; } printf("."); }
@@ -101,93 +109,116 @@ sum_of_absolute_differences_complex (cufftComplex *a, cufftComplex *b, int n, bo
     return sum;
 }
 
+/* Convert an elapsed time (time_sec, in seconds) for size_bytes of data to the outputType unit. */
+static double
+get_measurements_with_format (OutputType outputType, size_t size_bytes, double time_sec)
+{
+    double out_result = -1; /* returned as-is when outputType is not recognised */
+    if (outputType == OUT_MFLOPS) {
+        /* FFTW "mflops": 5 * N * log_2 (N) / (time for one FFT in microseconds). */
+        size_t size = size_bytes / 2 / sizeof (float); /* N, assuming two floats per point */
+        out_result = 5.0 * size * log2 ((double) size) / (time_sec * 1.0e6);
+    }
+    else if (outputType == OUT_THROUGHTPUT_MBS) {
+        out_result = ((double)size_bytes) / time_sec / 1000.0 / 1000.0; /* 10^6 bytes/s */
+    }
+    else if (outputType == OUT_MILLISECONDS) {
+        out_result = time_sec * 1000.0;
+    }
+    else {
+        fprintf (stderr, "Unknown output type of OpenCL routines!\n");
+    }
+    return out_result;
+}
+
 static void
 loop_data_opencl (const char *vendor,
                   OclBenchmarkFunc func,
                   cl_context context,
                   cl_command_queue *queues,
                   int n_devices,
-                  double *times,
-                  double *errors,
-                  FILE *fp)
+                  OutputType outputType,
+                  TimeEntry *time_entries)
 {
     Timer *timer;
     cl_int err;
 
     timer = timer_new ();
 
-    //for (int j = 0; j < n_devices; j++) {
-    //    fprintf (fp, "%s_%i_bw %s_%i_err ", vendor, j, vendor, j);
-    //}
+    for (int j = 0; j < n_devices; j++) {
+        char vendor_name[50];
+        int v_len = sprintf(vendor_name, "%s_%d", vendor, j);
+        time_entries[j].lib_name = (char *)malloc(sizeof(char) * (v_len + 1));
+        strcpy(time_entries[j].lib_name, vendor_name);
 
-    //time_entry->lib_name = "FFTW";
-    //time_entry->dim_entries = (DimEntry *)malloc(N_DIMS * sizeof(DimEntry));
+        time_entries[j].dim_entries = (DimEntry *)malloc(N_DIMS * sizeof(DimEntry));
 
-    for (int k = 0; k < N_DIMS; k++) {
-        int dim = DIMS[k];
-        int power_min = N_POWERS_INTERVALS[k][0];
-        int power_max = N_POWERS_INTERVALS[k][1];
-        //int num_entries = power_max - power_min + 1;
+        for (int k = 0; k < N_DIMS; k++) {
+            int dim = DIMS[k];
+            int power_min = N_POWERS_INTERVALS[k][0];
+            int power_max = N_POWERS_INTERVALS[k][1];
+            int num_entries = power_max - power_min + 1;
 
-        //time_entry->dim_entries[k].n_dims = dim;
-        //time_entry->dim_entries[k].sizes  = (unsigned int **)malloc(sizeof(unsigned int *) * num_entries);
-        //time_entry->dim_entries[k].times  = (double *)malloc(sizeof(double) * num_entries);
-        //time_entry->dim_entries[k].errors = (double *)malloc(sizeof(double) * num_entries);
+            time_entries[j].dim_entries[k].n_dims = dim;
+            time_entries[j].dim_entries[k].sizes  = (unsigned int **)malloc(sizeof(unsigned int *) * num_entries);
+            time_entries[j].dim_entries[k].times  = (double *)malloc(sizeof(double) * num_entries);
+            time_entries[j].dim_entries[k].errors = (double *)malloc(sizeof(double) * num_entries);
 
-        printf ("%dD:", dim);
-        fflush (stdout);
+            printf ("%dD:", dim);
+            fflush (stdout);
 
-        for (int m = power_min, i = 0; m <= power_max; m++, i++) {
-            size_t size_bytes;
-            float *host_orig_mem;
-            float *host_result_mem;
-            cl_mem dev_mem;
-            cl_mem dev_out_mem;
+            for (int m = power_min, i = 0; m <= power_max; m++, i++) {
+                size_t size_bytes;
+                float *host_orig_mem;
+                float *host_result_mem;
+                cl_mem dev_mem;
+                cl_mem dev_out_mem;
 
-            size_t side_size = pow(2,m);
-            size_t size = pow(side_size,dim);
+                size_t side_size = pow(2,m);
+                size_t size = pow(side_size,dim);
 
-            size_bytes = size * 2 * sizeof (float);
-            host_orig_mem = malloc (size_bytes);
-            host_result_mem = malloc (size_bytes);
+                size_bytes = size * 2 * sizeof (float);
+                host_orig_mem = malloc (size_bytes);
+                host_result_mem = malloc (size_bytes);
 
-            for (int j = 0; j < size * 2; j++) {
-                host_orig_mem[j] = rand() / ((float) RAND_MAX);
-            }
+                for (int l = 0; l < size * 2; l++) {
+                    host_orig_mem[l] = rand() / ((float) RAND_MAX);
+                }
 
-            dev_mem = clCreateBuffer (context, CL_MEM_READ_WRITE, size_bytes, NULL, &err);
-            OCL_CHECK_ERROR (err); 
+                dev_mem = clCreateBuffer (context, CL_MEM_READ_WRITE, size_bytes, NULL, &err);
+                OCL_CHECK_ERROR (err); 
 
-            dev_out_mem = clCreateBuffer (context, CL_MEM_READ_WRITE, size_bytes, NULL, &err);
-            OCL_CHECK_ERROR (err);
+                dev_out_mem = clCreateBuffer (context, CL_MEM_READ_WRITE, size_bytes, NULL, &err);
+                OCL_CHECK_ERROR (err);
 
-            if (dim == 1) {
-                printf (" %zu", side_size);
-            }
-            else if (dim == 2) {
-                printf (" %zux%zu", side_size, side_size);
-            }
-            else {
-                printf (" %zux%zux%zu", side_size, side_size, side_size);
-            }
+                if (dim == 1) {
+                    printf (" %zu", side_size);
+                }
+                else if (dim == 2) {
+                    printf (" %zux%zu", side_size, side_size);
+                }
+                else {
+                    printf (" %zux%zux%zu", side_size, side_size, side_size);
+                }
 
-            fflush (stdout);
+                fflush (stdout);
 
-            for (int j = 0; j < n_devices; j++) {
-                //double time;
-                //double mflops;
+
+                double time_sec;
                 double sum;
                 bool scale;
 
                 printf (".");
                 fflush (stdout);
 
-                OCL_CHECK_ERROR (clEnqueueWriteBuffer (queues[j], dev_mem, CL_TRUE, 0, size_bytes, host_orig_mem,
-                                                        0, NULL, NULL));
+                OCL_CHECK_ERROR (clEnqueueWriteBuffer (queues[j], dev_mem, CL_TRUE, 0, size_bytes, host_orig_mem, 0, NULL, NULL));
 
                 size_t fft_size[3] = { 1, 1, 1};
+                time_entries[j].dim_entries[k].sizes[i] = (unsigned int *)malloc(sizeof(unsigned int) * dim);
+
                 for (int l = 0; l < dim; l++) {
                     fft_size[l] = side_size;
+                    time_entries[j].dim_entries[k].sizes[i][l] = side_size; /* l: axis; j is the device */
                 }
 
                 scale = func (context, queues[j], dev_mem, dev_out_mem, dim, fft_size, N_RUNS, timer);
@@ -196,25 +227,20 @@ loop_data_opencl (const char *vendor,
                 OCL_CHECK_ERROR (clEnqueueReadBuffer (queues[j], dev_out_mem, CL_TRUE, 0, size_bytes, host_result_mem, 0, NULL, NULL));
                 sum = sum_of_absolute_differences (host_orig_mem, host_result_mem, size * 2, scale);
 
-                /*
-                * We use the "mflops" methodology from FFTW, which states that
-                *   mflops = 5 * N * log_2 (N) / (time for one FFT in microseconds)
-                */
-                //time = timer_get_seconds (timer) / N_RUNS / 1000.0;
-                //mflops = 5 * size * log (size) / log (2) / time;
+                time_sec = timer_get_seconds (timer) / N_RUNS;
+
+                time_entries[j].dim_entries[k].times[i] = get_measurements_with_format(outputType, size_bytes, time_sec);
+                time_entries[j].dim_entries[k].errors[i] = sum / size;
 
-                //times[j * N_ARRAYS + i] = mflops;
-                errors[j * N_ARRAYS + i] = sum / size;
+                free (host_orig_mem);
+                free (host_result_mem);
+                OCL_CHECK_ERROR (clReleaseMemObject (dev_mem));
+                OCL_CHECK_ERROR (clReleaseMemObject (dev_out_mem));
             }
 
-            free (host_orig_mem);
-            free (host_result_mem);
-            OCL_CHECK_ERROR (clReleaseMemObject (dev_mem));
-            OCL_CHECK_ERROR (clReleaseMemObject (dev_out_mem));
+            printf ("\n");
+            fflush (stdout);
         }
-
-        printf ("\n");
-        fflush (stdout);
     }
 
     printf ("\n");
@@ -227,109 +253,104 @@ static void
 loop_data_cuda (const char *vendor,
                 CudaBenchmarkFunc func,
                 int n_devices,
-                double *times,
-                double *errors,
-                FILE *fp)
+                OutputType outputType,
+                TimeEntry *time_entries)
 {
     Timer *timer;
 
     timer = timer_new ();
 
-    //for (int j = 0; j < n_devices; j++) {
-    //    fprintf (fp, "%s_%i_bw %s_%i_err ", vendor, j, vendor, j);
-    //}
+    for (int j = 0; j < n_devices; j++) {
+        char vendor_name[50];
+        int v_len = sprintf(vendor_name, "%s_%d", vendor, j);
+        time_entries[j].lib_name = (char *)malloc(sizeof(char) * (v_len + 1));
+        strcpy(time_entries[j].lib_name, vendor_name);
 
-    //time_entry->lib_name = "FFTW";
-    //time_entry->dim_entries = (DimEntry *)malloc(N_DIMS * sizeof(DimEntry));
+        time_entries[j].dim_entries = (DimEntry *)malloc(N_DIMS * sizeof(DimEntry));
 
-    for (int k = 0; k < N_DIMS; k++) {
-        int dim = DIMS[k];
-        int power_min = N_POWERS_INTERVALS[k][0];
-        int power_max = N_POWERS_INTERVALS[k][1];
-        //int num_entries = power_max - power_min + 1;
+        for (int k = 0; k < N_DIMS; k++) {
+            int dim = DIMS[k];
+            int power_min = N_POWERS_INTERVALS[k][0];
+            int power_max = N_POWERS_INTERVALS[k][1];
+            int num_entries = power_max - power_min + 1;
 
-        //time_entry->dim_entries[k].n_dims = dim;
-        //time_entry->dim_entries[k].sizes  = (unsigned int **)malloc(sizeof(unsigned int *) * num_entries);
-        //time_entry->dim_entries[k].times  = (double *)malloc(sizeof(double) * num_entries);
-        //time_entry->dim_entries[k].errors = (double *)malloc(sizeof(double) * num_entries);
+            time_entries[j].dim_entries[k].n_dims = dim;
+            time_entries[j].dim_entries[k].sizes  = (unsigned int **)malloc(sizeof(unsigned int *) * num_entries);
+            time_entries[j].dim_entries[k].times  = (double *)malloc(sizeof(double) * num_entries);
+            time_entries[j].dim_entries[k].errors = (double *)malloc(sizeof(double) * num_entries);
 
-        printf ("%dD:", dim);
-        fflush (stdout);
+            printf ("%dD:", dim);
+            fflush (stdout);
 
-        for (int m = power_min, i = 0; m <= power_max; m++, i++) {
-            size_t size_bytes;
-            cufftComplex *host_orig_mem;
-            cufftComplex *host_result_mem;
-            cufftComplex *dev_mem;
-            cufftComplex *dev_out_mem;
+            for (int m = power_min, i = 0; m <= power_max; m++, i++) {
+                size_t size_bytes;
+                cufftComplex *host_orig_mem;
+                cufftComplex *host_result_mem;
+                cufftComplex *dev_mem;
+                cufftComplex *dev_out_mem;
 
-            size_t side_size = pow(2,m);
-            size_t size = pow(side_size,dim);
+                size_t side_size = pow(2,m);
+                size_t size = pow(side_size,dim);
 
-            size_bytes = size * sizeof (cufftComplex);
-            host_orig_mem = (cufftComplex *)malloc(size_bytes);
-            host_result_mem = (cufftComplex *)malloc(size_bytes);
+                size_bytes = size * sizeof (cufftComplex);
+                host_orig_mem = (cufftComplex *)malloc(size_bytes);
+                host_result_mem = (cufftComplex *)malloc(size_bytes);
 
-            for (int j = 0; j < size; j++) {
-                host_orig_mem[j].x = rand() / ((float) 10);
-                host_orig_mem[j].y = rand() / ((float) 10);
-            }
+                for (int l = 0; l < size; l++) {
+                    host_orig_mem[l].x = rand() / ((float) 10);
+                    host_orig_mem[l].y = rand() / ((float) 10);
+                }
 
-            CUDA_SAFE_CALL (cudaMalloc ((void **)&dev_mem, size_bytes));
-            CUDA_SAFE_CALL (cudaMalloc ((void **)&dev_out_mem, size_bytes));
+                CUDA_SAFE_CALL (cudaMalloc ((void **)&dev_mem, size_bytes));
+                CUDA_SAFE_CALL (cudaMalloc ((void **)&dev_out_mem, size_bytes));
 
-            if (dim == 1) {
-                printf (" %zu", side_size);
-            }
-            else if (dim == 2) {
-                printf (" %zux%zu", side_size, side_size);
-            }
-            else {
-                printf (" %zux%zux%zu", side_size, side_size, side_size);
-            }
+                if (dim == 1) {
+                    printf (" %zu", side_size);
+                }
+                else if (dim == 2) {
+                    printf (" %zux%zu", side_size, side_size);
+                }
+                else {
+                    printf (" %zux%zux%zu", side_size, side_size, side_size);
+                }
 
-            fflush (stdout);
+                fflush (stdout);
 
-            //double time;
-            //double mflops;
-            double sum;
-            bool scale;
+                double time_sec;
+                double sum;
+                bool scale;
 
-            printf (".");
-            fflush (stdout);
+                printf (".");
+                fflush (stdout);
 
-            CUDA_SAFE_CALL (cudaMemcpy (dev_mem, host_orig_mem, size_bytes, cudaMemcpyHostToDevice));
+                CUDA_SAFE_CALL (cudaMemcpy (dev_mem, host_orig_mem, size_bytes, cudaMemcpyHostToDevice));
 
-            size_t fft_size[3] = { 1, 1, 1};
-            for (int l = 0; l < dim; l++) {
-                fft_size[l] = side_size;
-            }
+                size_t fft_size[3] = { 1, 1, 1};
+                for (int l = 0; l < dim; l++) {
+                    fft_size[l] = side_size;
+                }
 
-            scale = func (dev_mem, dev_out_mem, dim, fft_size, N_RUNS, timer);
+                scale = func (dev_mem, dev_out_mem, dim, fft_size, N_RUNS, timer);
 
-            /* Check precision */
-            CUDA_SAFE_CALL (cudaMemcpy (host_result_mem, dev_out_mem, size_bytes, cudaMemcpyDeviceToHost));
-            sum = sum_of_absolute_differences_complex (host_orig_mem, host_result_mem, size, scale);
-
-            /*
-            * We use the "mflops" methodology from FFTW, which states that
-            *   mflops = 5 * N * log_2 (N) / (time for one FFT in microseconds)
-            */
-            //time = timer_get_seconds (timer) / N_RUNS / 1000.0;
-            //mflops = 5 * size * log (size) / log (2) / time;
-
-            //times[i] = mflops;
-            errors[i] = sum / size;
+                /* Check precision */
+                CUDA_SAFE_CALL (cudaMemcpy (host_result_mem, dev_out_mem, size_bytes, cudaMemcpyDeviceToHost));
+                sum = sum_of_absolute_differences_complex (host_orig_mem, host_result_mem, size, scale);
         
-            free (host_orig_mem);
-            free (host_result_mem);
+                time_sec = timer_get_seconds (timer) / N_RUNS;
 
-            CUDA_SAFE_CALL (cudaFree (dev_mem));
-            CUDA_SAFE_CALL (cudaFree (dev_out_mem));
-        }
+                time_entries[j].dim_entries[k].times[i] = get_measurements_with_format(outputType, size_bytes, time_sec);
+                time_entries[j].dim_entries[k].errors[i] = sum / size;
 
-        printf ("\n");
-        fflush (stdout);
+                free (host_orig_mem);
+                free (host_result_mem);
+
+                CUDA_SAFE_CALL (cudaFree (dev_mem));
+                CUDA_SAFE_CALL (cudaFree (dev_out_mem));
+            }
+
+            printf ("\n");
+            fflush (stdout);
+        }
     }
 
     printf ("\n");
@@ -339,7 +360,7 @@ loop_data_cuda (const char *vendor,
 
 #ifdef HAVE_FFTW
 static void
-loop_data_fftw (TimeEntry *time_entry)
+loop_data_fftw (OutputType outputType, TimeEntry *time_entry)
 {
     Timer *timer;
 
@@ -368,16 +389,16 @@ loop_data_fftw (TimeEntry *time_entry)
             fftw_complex *host_immediate_mem;
             fftw_plan plan;
             fftw_plan inverse_plan;
-            double time;
-            double mflops;
+            double time_sec;
             double sum = 0.0;
 
             size_t side_size = pow(2,m);
             size_t size = pow(side_size,dim);
+            size_t size_bytes = sizeof (fftw_complex) * size;
 
-            host_orig_mem = fftw_malloc (sizeof (fftw_complex) * size);
-            host_immediate_mem = fftw_malloc (sizeof (fftw_complex) * size);
-            host_result_mem = fftw_malloc (sizeof (fftw_complex) * size);
+            host_orig_mem = fftw_malloc (size_bytes);
+            host_immediate_mem = fftw_malloc (size_bytes);
+            host_result_mem = fftw_malloc (size_bytes);
 
             switch (dim) {
                 case 1:
@@ -445,15 +466,14 @@ loop_data_fftw (TimeEntry *time_entry)
                 sum += fabs (host_result_mem[j][1] / size - host_orig_mem[j][1]);
             }
 
-            time = timer_get_seconds (timer) / N_RUNS / 1000.0;
-            mflops = 5 * size * log (size) / log (2) / time;
+            time_sec = timer_get_seconds (timer) / N_RUNS;
 
             time_entry->dim_entries[k].sizes[i] = (unsigned int *)malloc(sizeof(unsigned int) * dim);
             for (int j = 0; j < dim; j++) {
                 time_entry->dim_entries[k].sizes[i][j] = side_size;
             }
 
-            time_entry->dim_entries[k].times[i] = mflops;
+            time_entry->dim_entries[k].times[i] = get_measurements_with_format(outputType, size_bytes, time_sec);
             time_entry->dim_entries[k].errors[i] = sum / size;
             
             fftw_destroy_plan (inverse_plan);
@@ -664,9 +684,75 @@ compute_cuda_fft (cufftComplex *dev_mem,
 }
 #endif
 
+/*
+ * Write the header row to fp: "# " followed by one label per benchmarked size
+ * ("N", "NxN" or "NxNxN"), for each of the first n_dims entries of DIMS. Unless
+ * only_time is set, every label is followed by a matching "(Error)" column label.
+ */
+static void
+write_headers_in_file (int n_dims, bool only_time, FILE *fp)
+{
+    fputs ("# ", fp);
+
+    for (int d = 0; d < n_dims; d++) {
+        const int rank = DIMS[d];
+
+        /* Only ranks 1 to 3 have a label format; any other rank writes nothing. */
+        if (rank < 1 || rank > 3) {
+            continue;
+        }
+
+        for (int p = N_POWERS_INTERVALS[d][0]; p <= N_POWERS_INTERVALS[d][1]; p++) {
+            char label[64];
+            int side = pow (2, p);
+            int len = snprintf (label, sizeof label, "%d", side);
+
+            for (int r = 1; r < rank; r++) {
+                len += snprintf (label + len, sizeof label - len, "x%d", side);
+            }
+
+            fprintf (fp, "%s ", label);
+            if (!only_time) {
+                fprintf (fp, "%s(Error) ", label);
+            }
+        }
+    }
+}
+
+/* Append one row per entry to fp: lib_name, then each dimension's values (plus errors
+ * unless only_time). With new_line set, every row starts on its own line so that
+ * multi-device results do not run together on a single line. */
+static void
+write_time_entries_in_file (TimeEntry* time_entries, int num_entries, int n_dims, bool only_time, bool new_line, FILE *fp)
+{
+    for (int i = 0; i < num_entries; i++) {
+        const DimEntry *dim_entries = time_entries[i].dim_entries;
+
+        if (new_line) {
+            fprintf (fp, "\n");
+        }
+        fprintf (fp, "%s ", time_entries[i].lib_name);
+
+        for (int dim = 0; dim < n_dims; dim++) {
+            /* Inclusive power range -> number of sizes measured for this dimension. */
+            int count = N_POWERS_INTERVALS[dim][1] - N_POWERS_INTERVALS[dim][0] + 1;
+            for (int j = 0; j < count; j++) {
+                fprintf (fp, "%f ", dim_entries[dim].times[j]);
+                if (!only_time) {
+                    fprintf (fp, "%f ", dim_entries[dim].errors[j]);
+                }
+            }
+        }
+    }
+}
+
 int
 main (int argc, char **argv)
 {
+    OutputType outputType = OUT_THROUGHTPUT_MBS;
+    bool only_time = true;
+    bool new_line = true;
+
 #ifdef HAVE_OPENCL
     cl_platform_id platform;
     cl_uint n_devices;
@@ -678,25 +764,22 @@ main (int argc, char **argv)
 
 #ifdef HAVE_AMD_FFT
     static int with_amd_fft = 1;
-    double *amd_times;
-    double *amd_errors;
+    TimeEntry *amd_time_entries;
 #endif
 
 #ifdef HAVE_CUDA_FFT
     static int with_cuda_fft = 1;
-    double *cuda_times;
-    double *cuda_errors;
+    TimeEntry *cuda_time_entries;
 #endif
 
 #ifdef HAVE_APPLE_FFT
     static int with_apple_fft = 1;
-    double *apple_times;
-    double *apple_errors;
+    TimeEntry *apple_time_entries;
 #endif
 
 #ifdef HAVE_FFTW
     static int with_fftw = 1;
-    TimeEntry *time_entries_fftw;
+    TimeEntry *fftw_time_entries;
 #endif
 
     static int show_help = 0;
@@ -719,10 +802,6 @@ main (int argc, char **argv)
         {0, 0, 0, 0}
     };
 
-    size_t size = INITIAL_SIZE;
-    FILE *fp;
-    FILE *fp_new;
-
     /* Parse options */
     while (getopt_long (argc, argv, "", long_options, NULL) != -1)
         ;
@@ -736,13 +815,10 @@ main (int argc, char **argv)
         return 0;
     }
 
-    /* Write header */
-    fp = fopen ("result.txt", "w");
-    fp_new = fopen ("result_new.txt", "w");
-
-    fprintf (fp, "# size ");
+    /* Open output file */
+    FILE *fp;
 
-    fprintf (fp_new, "# ");
+    fp = fopen ("result.txt", "w");
 
 #ifdef HAVE_OPENCL
     OCL_CHECK_ERROR (clGetPlatformIDs (1, &platform, NULL));
@@ -763,123 +839,73 @@ main (int argc, char **argv)
 #endif
 
 #ifdef HAVE_AMD_FFT
-    amd_times = malloc (n_devices * N_ARRAYS * sizeof (double));
-    amd_errors = malloc (n_devices * N_ARRAYS * sizeof (double));
+    amd_time_entries = (TimeEntry *)malloc(sizeof(TimeEntry) * n_devices);
 
     if (with_amd_fft) {
         printf ("Testing AMD FFT ...\n");
-        loop_data_opencl ("AMD", compute_amd_fft, context, queues, n_devices, amd_times, amd_errors, fp);
+        loop_data_opencl ("AMD", compute_amd_fft, context, queues, n_devices, outputType, amd_time_entries);
     }
 #endif
 
 #ifdef HAVE_APPLE_FFT
-    apple_times = malloc (n_devices * N_ARRAYS * sizeof (double));
-    apple_errors = malloc (n_devices * N_ARRAYS * sizeof (double));
+    apple_time_entries = (TimeEntry *)malloc(sizeof(TimeEntry) * n_devices);
 
     if (with_apple_fft) {
         printf ("Testing Apple FFT ...\n");
-        loop_data_opencl ("APP", compute_apple_fft, context, queues, n_devices, apple_times, apple_errors, fp);
+        loop_data_opencl ("APP", compute_apple_fft, context, queues, n_devices, outputType, apple_time_entries);
     }
 #endif
 
 #ifdef HAVE_CUDA_FFT
-    cuda_times = malloc (1 * N_ARRAYS * sizeof (double));
-    cuda_errors = malloc (1 * N_ARRAYS * sizeof (double));
+    cuda_time_entries = (TimeEntry *)malloc(sizeof(TimeEntry) * 1);
 
     if (with_cuda_fft) {
         printf ("Testing CUDA FFT ...\n");
-        loop_data_cuda ("CUDA", compute_cuda_fft, 1, cuda_times, cuda_errors, fp);
+        loop_data_cuda ("CUDA", compute_cuda_fft, 1, outputType, cuda_time_entries);
     }
 #endif
 
 #ifdef HAVE_FFTW
-    time_entries_fftw = (TimeEntry *)malloc(sizeof(TimeEntry));
+    fftw_time_entries = (TimeEntry *)malloc(sizeof(TimeEntry));
 
     if (with_fftw) {
         printf ("Testing FFTW3 ...\n");
-        loop_data_fftw (&(time_entries_fftw[0]));
+        loop_data_fftw (outputType, &(fftw_time_entries[0]));
     }
 #endif
 
-    for (int i = 0; i < N_DIMS; i++) {
-        int min_power = N_POWERS_INTERVALS[i][0];
-        int max_power = N_POWERS_INTERVALS[i][1];
-
-        for (int j = min_power; j <= max_power; j++) {
-            int side_size = pow(2,j);
-            switch (DIMS[i]) {
-                case 1:
-                 fprintf (fp_new, "%d ", side_size);
-                 fprintf (fp_new, "%d(Error) ", side_size);
-                 break;
-                case 2:
-                 fprintf (fp_new, "%dx%d ", side_size, side_size);
-                 fprintf (fp_new, "%dx%d(Error) ", side_size, side_size);
-                 break;
-                case 3:
-                 fprintf (fp_new, "%dx%dx%d ", side_size, side_size, side_size);
-                 fprintf (fp_new, "%dx%dx%d(Error) ", side_size, side_size, side_size);
-                 break;
-            }   
-        }
-        
-    }
-
-    fprintf (fp_new, "\n");
-
-    for (int i = 0; i < 1; i++) { //loop over time entries
-        fprintf (fp_new, "%s ", time_entries_fftw[i].lib_name);
-
-        DimEntry *dim_entries = time_entries_fftw[i].dim_entries;
-
-        for (int dim = 0; dim < N_DIMS; dim++) {
-            DimEntry dim_entry = dim_entries[dim];
-
-            for (int j = 0; j < (N_POWERS_INTERVALS[dim][1] - N_POWERS_INTERVALS[dim][0] + 1); j++) {
-                fprintf (fp_new, "%f %f ", dim_entry.times[j], dim_entry.errors[j]);
-            }
-        }   
-    }
-
-    for (int i = 0; i < N_ARRAYS; i++) {
-        UPDATE_SIZE (size);
-
-        fprintf (fp, "\n%zu ", size);
-
-        for (int j = 0; j < n_devices; j++) {
-            int index = j * N_ARRAYS + i;
+    /* Write headers */
+    write_headers_in_file (N_DIMS, only_time, fp);
 
 #ifdef HAVE_AMD_FFT
-            if (with_amd_fft) {
-                fprintf (fp, "%f %f ", amd_times[index], amd_errors[index]);
-            }
+    if (with_amd_fft) {
+        write_time_entries_in_file (amd_time_entries, n_devices, N_DIMS, only_time, new_line, fp);
+    }
 #endif
 
 #ifdef HAVE_APPLE_FFT
-            if (with_apple_fft) {
-                fprintf (fp, "%f %f ", apple_times[index], apple_errors[index]);
-            }
-#endif
-        }
-/*
-#ifdef HAVE_FFTW
-        if (with_fftw) {
-            fprintf (fp, "%f %f ", fftw_times[i], fftw_errors[i]);
-        }
+    if (with_apple_fft) {
+        write_time_entries_in_file (apple_time_entries, n_devices, N_DIMS, only_time, new_line, fp);
+    }
 #endif
-*/
+
 #ifdef HAVE_CUDA_FFT
-        if (with_cuda_fft) {
-            fprintf (fp, "%f %f ", cuda_times[i], cuda_times[i]);
-        }
+    if (with_cuda_fft) {
+        write_time_entries_in_file (cuda_time_entries, 1, N_DIMS, only_time, new_line, fp);
+    }
 #endif
+
+#ifdef HAVE_FFTW
+    if (with_fftw) {
+        write_time_entries_in_file (fftw_time_entries, 1, N_DIMS, only_time, new_line, fp);
     }
+#endif
 
-    fprintf (fp, "\n");
 
 #ifdef HAVE_OPENCL
-    for (int i = 0; i < n_devices; i++)
+    for (int i = 0; i < n_devices; i++) {
         clReleaseCommandQueue (queues[i]);
+    }
 
     clReleaseContext (context);
 
@@ -889,6 +915,5 @@ main (int argc, char **argv)
 
     fclose (fp);
 
-    fclose (fp_new);
     return 0;
 }