|
@@ -1,4 +1,5 @@
|
|
|
#include <stdlib.h>
|
|
|
+#include <string.h>
|
|
|
#include <stdio.h>
|
|
|
#include <stdbool.h>
|
|
|
#include <math.h>
|
|
@@ -66,6 +67,13 @@ const int DIMS[N_DIMS] = {1, 2, 3};
|
|
|
const int N_DIM_ARRAYS[N_DIMS] = {5, 4, 4};
|
|
|
const int N_POWERS_INTERVALS[N_DIMS][2] = {{5, 11}, {8, 11}, {7, 7}};
|
|
|
|
|
|
/*
 * Selects which metric a timing measurement is converted to before it is
 * stored in a result table (see get_measurements_with_format).
 *
 * The numeric values are the ones the compiler would assign implicitly;
 * they are spelled out so the order can never change them by accident.
 */
typedef enum _OutputType {
    OUT_MILLISECONDS    = 0,    /* elapsed time, in milliseconds */
    OUT_MFLOPS          = 1,    /* FFTW-style "mflops" figure */
    OUT_THROUGHTPUT_MBS = 2,    /* bytes processed per second, in units of 10^6 bytes */
    OUT_NONE            = 3     /* no metric; get_measurements_with_format has no case for it */
} OutputType;
|
|
|
+
|
|
|
#define UPDATE_SIZE(size) size *= 8;
|
|
|
#define PRINT_DIM_SIZE(side_size,dim) { \
|
|
|
printf(" %zu", side_size); while (dim != 1) { printf("x%zu", side_size);dim--; } printf("."); }
|
|
@@ -101,93 +109,116 @@ sum_of_absolute_differences_complex (cufftComplex *a, cufftComplex *b, int n, bo
|
|
|
return sum;
|
|
|
}
|
|
|
|
|
|
+static double
|
|
|
+get_measurements_with_format (OutputType outputType, size_t size_bytes, double time_sec)
|
|
|
+{
|
|
|
+ double out_result = -1;
|
|
|
+
|
|
|
+ if (outputType == OUT_MFLOPS) {
|
|
|
+ size_t size = size_bytes / 2 / sizeof (float);
|
|
|
+ out_result = 5 * size * log (size) / log (2) / (time_sec / 1000.0);
|
|
|
+ }
|
|
|
+ else if (outputType == OUT_THROUGHTPUT_MBS) {
|
|
|
+ out_result = ((double)size_bytes) / time_sec / 1000.0 / 1000.0;
|
|
|
+ }
|
|
|
+ else if (outputType == OUT_MILLISECONDS) {
|
|
|
+ out_result = time_sec * 1000.0;
|
|
|
+ }
|
|
|
+ else {
|
|
|
+ fprintf (stderr, "Unknown output type of OpenCL routines!\n");
|
|
|
+ }
|
|
|
+
|
|
|
+ return out_result;
|
|
|
+}
|
|
|
+
|
|
|
static void
|
|
|
loop_data_opencl (const char *vendor,
|
|
|
OclBenchmarkFunc func,
|
|
|
cl_context context,
|
|
|
cl_command_queue *queues,
|
|
|
int n_devices,
|
|
|
- double *times,
|
|
|
- double *errors,
|
|
|
- FILE *fp)
|
|
|
+ OutputType outputType,
|
|
|
+ TimeEntry *time_entries)
|
|
|
{
|
|
|
Timer *timer;
|
|
|
cl_int err;
|
|
|
|
|
|
timer = timer_new ();
|
|
|
|
|
|
- //for (int j = 0; j < n_devices; j++) {
|
|
|
- // fprintf (fp, "%s_%i_bw %s_%i_err ", vendor, j, vendor, j);
|
|
|
- //}
|
|
|
+ for (int j = 0; j < n_devices; j++) {
|
|
|
+ char vendor_name[50];
|
|
|
+ int v_len = sprintf(vendor_name, "%s_%d", vendor, j);
|
|
|
+ time_entries[j].lib_name = (char *)malloc(sizeof(char) * (v_len + 1));
|
|
|
+ strcpy(time_entries[j].lib_name, vendor_name);
|
|
|
|
|
|
- //time_entry->lib_name = "FFTW";
|
|
|
- //time_entry->dim_entries = (DimEntry *)malloc(N_DIMS * sizeof(DimEntry));
|
|
|
+ time_entries[j].dim_entries = (DimEntry *)malloc(N_DIMS * sizeof(DimEntry));
|
|
|
|
|
|
- for (int k = 0; k < N_DIMS; k++) {
|
|
|
- int dim = DIMS[k];
|
|
|
- int power_min = N_POWERS_INTERVALS[k][0];
|
|
|
- int power_max = N_POWERS_INTERVALS[k][1];
|
|
|
- //int num_entries = power_max - power_min + 1;
|
|
|
+ for (int k = 0; k < N_DIMS; k++) {
|
|
|
+ int dim = DIMS[k];
|
|
|
+ int power_min = N_POWERS_INTERVALS[k][0];
|
|
|
+ int power_max = N_POWERS_INTERVALS[k][1];
|
|
|
+ int num_entries = power_max - power_min + 1;
|
|
|
|
|
|
- //time_entry->dim_entries[k].n_dims = dim;
|
|
|
- //time_entry->dim_entries[k].sizes = (unsigned int **)malloc(sizeof(unsigned int *) * num_entries);
|
|
|
- //time_entry->dim_entries[k].times = (double *)malloc(sizeof(double) * num_entries);
|
|
|
- //time_entry->dim_entries[k].errors = (double *)malloc(sizeof(double) * num_entries);
|
|
|
+ time_entries[j].dim_entries[k].n_dims = dim;
|
|
|
+ time_entries[j].dim_entries[k].sizes = (unsigned int **)malloc(sizeof(unsigned int *) * num_entries);
|
|
|
+ time_entries[j].dim_entries[k].times = (double *)malloc(sizeof(double) * num_entries);
|
|
|
+ time_entries[j].dim_entries[k].errors = (double *)malloc(sizeof(double) * num_entries);
|
|
|
|
|
|
- printf ("%dD:", dim);
|
|
|
- fflush (stdout);
|
|
|
+ printf ("%dD:", dim);
|
|
|
+ fflush (stdout);
|
|
|
|
|
|
- for (int m = power_min, i = 0; m <= power_max; m++, i++) {
|
|
|
- size_t size_bytes;
|
|
|
- float *host_orig_mem;
|
|
|
- float *host_result_mem;
|
|
|
- cl_mem dev_mem;
|
|
|
- cl_mem dev_out_mem;
|
|
|
+ for (int m = power_min, i = 0; m <= power_max; m++, i++) {
|
|
|
+ size_t size_bytes;
|
|
|
+ float *host_orig_mem;
|
|
|
+ float *host_result_mem;
|
|
|
+ cl_mem dev_mem;
|
|
|
+ cl_mem dev_out_mem;
|
|
|
|
|
|
- size_t side_size = pow(2,m);
|
|
|
- size_t size = pow(side_size,dim);
|
|
|
+ size_t side_size = pow(2,m);
|
|
|
+ size_t size = pow(side_size,dim);
|
|
|
|
|
|
- size_bytes = size * 2 * sizeof (float);
|
|
|
- host_orig_mem = malloc (size_bytes);
|
|
|
- host_result_mem = malloc (size_bytes);
|
|
|
+ size_bytes = size * 2 * sizeof (float);
|
|
|
+ host_orig_mem = malloc (size_bytes);
|
|
|
+ host_result_mem = malloc (size_bytes);
|
|
|
|
|
|
- for (int j = 0; j < size * 2; j++) {
|
|
|
- host_orig_mem[j] = rand() / ((float) RAND_MAX);
|
|
|
- }
|
|
|
+ for (int l = 0; l < size * 2; l++) {
|
|
|
+ host_orig_mem[l] = rand() / ((float) RAND_MAX);
|
|
|
+ }
|
|
|
|
|
|
- dev_mem = clCreateBuffer (context, CL_MEM_READ_WRITE, size_bytes, NULL, &err);
|
|
|
- OCL_CHECK_ERROR (err);
|
|
|
+ dev_mem = clCreateBuffer (context, CL_MEM_READ_WRITE, size_bytes, NULL, &err);
|
|
|
+ OCL_CHECK_ERROR (err);
|
|
|
|
|
|
- dev_out_mem = clCreateBuffer (context, CL_MEM_READ_WRITE, size_bytes, NULL, &err);
|
|
|
- OCL_CHECK_ERROR (err);
|
|
|
+ dev_out_mem = clCreateBuffer (context, CL_MEM_READ_WRITE, size_bytes, NULL, &err);
|
|
|
+ OCL_CHECK_ERROR (err);
|
|
|
|
|
|
- if (dim == 1) {
|
|
|
- printf (" %zu", side_size);
|
|
|
- }
|
|
|
- else if (dim == 2) {
|
|
|
- printf (" %zux%zu", side_size, side_size);
|
|
|
- }
|
|
|
- else {
|
|
|
- printf (" %zux%zux%zu", side_size, side_size, side_size);
|
|
|
- }
|
|
|
+ if (dim == 1) {
|
|
|
+ printf (" %zu", side_size);
|
|
|
+ }
|
|
|
+ else if (dim == 2) {
|
|
|
+ printf (" %zux%zu", side_size, side_size);
|
|
|
+ }
|
|
|
+ else {
|
|
|
+ printf (" %zux%zux%zu", side_size, side_size, side_size);
|
|
|
+ }
|
|
|
|
|
|
- fflush (stdout);
|
|
|
+ fflush (stdout);
|
|
|
|
|
|
- for (int j = 0; j < n_devices; j++) {
|
|
|
- //double time;
|
|
|
- //double mflops;
|
|
|
+
|
|
|
+ double time_sec;
|
|
|
double sum;
|
|
|
bool scale;
|
|
|
|
|
|
printf (".");
|
|
|
fflush (stdout);
|
|
|
|
|
|
- OCL_CHECK_ERROR (clEnqueueWriteBuffer (queues[j], dev_mem, CL_TRUE, 0, size_bytes, host_orig_mem,
|
|
|
- 0, NULL, NULL));
|
|
|
+ OCL_CHECK_ERROR (clEnqueueWriteBuffer (queues[j], dev_mem, CL_TRUE, 0, size_bytes, host_orig_mem, 0, NULL, NULL));
|
|
|
|
|
|
size_t fft_size[3] = { 1, 1, 1};
|
|
|
+ time_entries[j].dim_entries[k].sizes[i] = (unsigned int *)malloc(sizeof(unsigned int) * dim);
|
|
|
+
|
|
|
for (int l = 0; l < dim; l++) {
|
|
|
fft_size[l] = side_size;
|
|
|
+ time_entries[j].dim_entries[k].sizes[i][j] = side_size;
|
|
|
}
|
|
|
|
|
|
scale = func (context, queues[j], dev_mem, dev_out_mem, dim, fft_size, N_RUNS, timer);
|
|
@@ -196,25 +227,20 @@ loop_data_opencl (const char *vendor,
|
|
|
OCL_CHECK_ERROR (clEnqueueReadBuffer (queues[j], dev_out_mem, CL_TRUE, 0, size_bytes, host_result_mem, 0, NULL, NULL));
|
|
|
sum = sum_of_absolute_differences (host_orig_mem, host_result_mem, size * 2, scale);
|
|
|
|
|
|
- /*
|
|
|
- * We use the "mflops" methodology from FFTW, which states that
|
|
|
- * mflops = 5 * N * log_2 (N) / (time for one FFT in microseconds)
|
|
|
- */
|
|
|
- //time = timer_get_seconds (timer) / N_RUNS / 1000.0;
|
|
|
- //mflops = 5 * size * log (size) / log (2) / time;
|
|
|
+ time_sec = timer_get_seconds (timer) / N_RUNS;
|
|
|
+
|
|
|
+ time_entries[j].dim_entries[k].times[i] = get_measurements_with_format(outputType, size_bytes, time_sec);
|
|
|
+ time_entries[j].dim_entries[k].errors[i] = sum / size;
|
|
|
|
|
|
- //times[j * N_ARRAYS + i] = mflops;
|
|
|
- errors[j * N_ARRAYS + i] = sum / size;
|
|
|
+ free (host_orig_mem);
|
|
|
+ free (host_result_mem);
|
|
|
+ OCL_CHECK_ERROR (clReleaseMemObject (dev_mem));
|
|
|
+ OCL_CHECK_ERROR (clReleaseMemObject (dev_out_mem));
|
|
|
}
|
|
|
|
|
|
- free (host_orig_mem);
|
|
|
- free (host_result_mem);
|
|
|
- OCL_CHECK_ERROR (clReleaseMemObject (dev_mem));
|
|
|
- OCL_CHECK_ERROR (clReleaseMemObject (dev_out_mem));
|
|
|
+ printf ("\n");
|
|
|
+ fflush (stdout);
|
|
|
}
|
|
|
-
|
|
|
- printf ("\n");
|
|
|
- fflush (stdout);
|
|
|
}
|
|
|
|
|
|
printf ("\n");
|
|
@@ -227,109 +253,104 @@ static void
|
|
|
loop_data_cuda (const char *vendor,
|
|
|
CudaBenchmarkFunc func,
|
|
|
int n_devices,
|
|
|
- double *times,
|
|
|
- double *errors,
|
|
|
- FILE *fp)
|
|
|
+ OutputType outputType,
|
|
|
+ TimeEntry *time_entries)
|
|
|
{
|
|
|
Timer *timer;
|
|
|
|
|
|
timer = timer_new ();
|
|
|
|
|
|
- //for (int j = 0; j < n_devices; j++) {
|
|
|
- // fprintf (fp, "%s_%i_bw %s_%i_err ", vendor, j, vendor, j);
|
|
|
- //}
|
|
|
+ for (int j = 0; j < n_devices; j++) {
|
|
|
+ char vendor_name[50];
|
|
|
+ int v_len = sprintf(vendor_name, "%s_%d", vendor, j);
|
|
|
+ time_entries[j].lib_name = (char *)malloc(sizeof(char) * (v_len + 1));
|
|
|
+ strcpy(time_entries[j].lib_name, vendor_name);
|
|
|
|
|
|
- //time_entry->lib_name = "FFTW";
|
|
|
- //time_entry->dim_entries = (DimEntry *)malloc(N_DIMS * sizeof(DimEntry));
|
|
|
+ time_entries[j].dim_entries = (DimEntry *)malloc(N_DIMS * sizeof(DimEntry));
|
|
|
|
|
|
- for (int k = 0; k < N_DIMS; k++) {
|
|
|
- int dim = DIMS[k];
|
|
|
- int power_min = N_POWERS_INTERVALS[k][0];
|
|
|
- int power_max = N_POWERS_INTERVALS[k][1];
|
|
|
- //int num_entries = power_max - power_min + 1;
|
|
|
+ for (int k = 0; k < N_DIMS; k++) {
|
|
|
+ int dim = DIMS[k];
|
|
|
+ int power_min = N_POWERS_INTERVALS[k][0];
|
|
|
+ int power_max = N_POWERS_INTERVALS[k][1];
|
|
|
+ int num_entries = power_max - power_min + 1;
|
|
|
|
|
|
- //time_entry->dim_entries[k].n_dims = dim;
|
|
|
- //time_entry->dim_entries[k].sizes = (unsigned int **)malloc(sizeof(unsigned int *) * num_entries);
|
|
|
- //time_entry->dim_entries[k].times = (double *)malloc(sizeof(double) * num_entries);
|
|
|
- //time_entry->dim_entries[k].errors = (double *)malloc(sizeof(double) * num_entries);
|
|
|
+ time_entries[j].dim_entries[k].n_dims = dim;
|
|
|
+ time_entries[j].dim_entries[k].sizes = (unsigned int **)malloc(sizeof(unsigned int *) * num_entries);
|
|
|
+ time_entries[j].dim_entries[k].times = (double *)malloc(sizeof(double) * num_entries);
|
|
|
+ time_entries[j].dim_entries[k].errors = (double *)malloc(sizeof(double) * num_entries);
|
|
|
|
|
|
- printf ("%dD:", dim);
|
|
|
- fflush (stdout);
|
|
|
+ printf ("%dD:", dim);
|
|
|
+ fflush (stdout);
|
|
|
|
|
|
- for (int m = power_min, i = 0; m <= power_max; m++, i++) {
|
|
|
- size_t size_bytes;
|
|
|
- cufftComplex *host_orig_mem;
|
|
|
- cufftComplex *host_result_mem;
|
|
|
- cufftComplex *dev_mem;
|
|
|
- cufftComplex *dev_out_mem;
|
|
|
+ for (int m = power_min, i = 0; m <= power_max; m++, i++) {
|
|
|
+ size_t size_bytes;
|
|
|
+ cufftComplex *host_orig_mem;
|
|
|
+ cufftComplex *host_result_mem;
|
|
|
+ cufftComplex *dev_mem;
|
|
|
+ cufftComplex *dev_out_mem;
|
|
|
|
|
|
- size_t side_size = pow(2,m);
|
|
|
- size_t size = pow(side_size,dim);
|
|
|
+ size_t side_size = pow(2,m);
|
|
|
+ size_t size = pow(side_size,dim);
|
|
|
|
|
|
- size_bytes = size * sizeof (cufftComplex);
|
|
|
- host_orig_mem = (cufftComplex *)malloc(size_bytes);
|
|
|
- host_result_mem = (cufftComplex *)malloc(size_bytes);
|
|
|
+ size_bytes = size * sizeof (cufftComplex);
|
|
|
+ host_orig_mem = (cufftComplex *)malloc(size_bytes);
|
|
|
+ host_result_mem = (cufftComplex *)malloc(size_bytes);
|
|
|
|
|
|
- for (int j = 0; j < size; j++) {
|
|
|
- host_orig_mem[j].x = rand() / ((float) 10);
|
|
|
- host_orig_mem[j].y = rand() / ((float) 10);
|
|
|
- }
|
|
|
+ for (int l = 0; l < size; l++) {
|
|
|
+ host_orig_mem[l].x = rand() / ((float) 10);
|
|
|
+ host_orig_mem[l].y = rand() / ((float) 10);
|
|
|
+ }
|
|
|
|
|
|
- CUDA_SAFE_CALL (cudaMalloc ((void **)&dev_mem, size_bytes));
|
|
|
- CUDA_SAFE_CALL (cudaMalloc ((void **)&dev_out_mem, size_bytes));
|
|
|
+ CUDA_SAFE_CALL (cudaMalloc ((void **)&dev_mem, size_bytes));
|
|
|
+ CUDA_SAFE_CALL (cudaMalloc ((void **)&dev_out_mem, size_bytes));
|
|
|
|
|
|
- if (dim == 1) {
|
|
|
- printf (" %zu", side_size);
|
|
|
- }
|
|
|
- else if (dim == 2) {
|
|
|
- printf (" %zux%zu", side_size, side_size);
|
|
|
- }
|
|
|
- else {
|
|
|
- printf (" %zux%zux%zu", side_size, side_size, side_size);
|
|
|
- }
|
|
|
+ if (dim == 1) {
|
|
|
+ printf (" %zu", side_size);
|
|
|
+ }
|
|
|
+ else if (dim == 2) {
|
|
|
+ printf (" %zux%zu", side_size, side_size);
|
|
|
+ }
|
|
|
+ else {
|
|
|
+ printf (" %zux%zux%zu", side_size, side_size, side_size);
|
|
|
+ }
|
|
|
|
|
|
- fflush (stdout);
|
|
|
+ fflush (stdout);
|
|
|
|
|
|
- //double time;
|
|
|
- //double mflops;
|
|
|
- double sum;
|
|
|
- bool scale;
|
|
|
+ double time_sec;
|
|
|
+ double sum;
|
|
|
+ bool scale;
|
|
|
|
|
|
- printf (".");
|
|
|
- fflush (stdout);
|
|
|
+ printf (".");
|
|
|
+ fflush (stdout);
|
|
|
|
|
|
- CUDA_SAFE_CALL (cudaMemcpy (dev_mem, host_orig_mem, size_bytes, cudaMemcpyHostToDevice));
|
|
|
+ CUDA_SAFE_CALL (cudaMemcpy (dev_mem, host_orig_mem, size_bytes, cudaMemcpyHostToDevice));
|
|
|
|
|
|
- size_t fft_size[3] = { 1, 1, 1};
|
|
|
- for (int l = 0; l < dim; l++) {
|
|
|
- fft_size[l] = side_size;
|
|
|
- }
|
|
|
+ size_t fft_size[3] = { 1, 1, 1};
|
|
|
+ for (int l = 0; l < dim; l++) {
|
|
|
+ fft_size[l] = side_size;
|
|
|
+ }
|
|
|
|
|
|
- scale = func (dev_mem, dev_out_mem, dim, fft_size, N_RUNS, timer);
|
|
|
+ scale = func (dev_mem, dev_out_mem, dim, fft_size, N_RUNS, timer);
|
|
|
|
|
|
- /* Check precision */
|
|
|
- CUDA_SAFE_CALL (cudaMemcpy (host_result_mem, dev_out_mem, size_bytes, cudaMemcpyDeviceToHost));
|
|
|
- sum = sum_of_absolute_differences_complex (host_orig_mem, host_result_mem, size, scale);
|
|
|
-
|
|
|
- /*
|
|
|
- * We use the "mflops" methodology from FFTW, which states that
|
|
|
- * mflops = 5 * N * log_2 (N) / (time for one FFT in microseconds)
|
|
|
- */
|
|
|
- //time = timer_get_seconds (timer) / N_RUNS / 1000.0;
|
|
|
- //mflops = 5 * size * log (size) / log (2) / time;
|
|
|
-
|
|
|
- //times[i] = mflops;
|
|
|
- errors[i] = sum / size;
|
|
|
+ /* Check precision */
|
|
|
+ CUDA_SAFE_CALL (cudaMemcpy (host_result_mem, dev_out_mem, size_bytes, cudaMemcpyDeviceToHost));
|
|
|
+ sum = sum_of_absolute_differences_complex (host_orig_mem, host_result_mem, size, scale);
|
|
|
|
|
|
- free (host_orig_mem);
|
|
|
- free (host_result_mem);
|
|
|
+ time_sec = timer_get_seconds (timer) / N_RUNS;
|
|
|
|
|
|
- CUDA_SAFE_CALL (cudaFree (dev_mem));
|
|
|
- CUDA_SAFE_CALL (cudaFree (dev_out_mem));
|
|
|
- }
|
|
|
+ time_entries[j].dim_entries[k].times[i] = get_measurements_with_format(outputType, size_bytes, time_sec);
|
|
|
+ time_entries[j].dim_entries[k].errors[i] = sum / size;
|
|
|
|
|
|
- printf ("\n");
|
|
|
- fflush (stdout);
|
|
|
+ free (host_orig_mem);
|
|
|
+ free (host_result_mem);
|
|
|
+
|
|
|
+ CUDA_SAFE_CALL (cudaFree (dev_mem));
|
|
|
+ CUDA_SAFE_CALL (cudaFree (dev_out_mem));
|
|
|
+ }
|
|
|
+
|
|
|
+ printf ("\n");
|
|
|
+ fflush (stdout);
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
printf ("\n");
|
|
@@ -339,7 +360,7 @@ loop_data_cuda (const char *vendor,
|
|
|
|
|
|
#ifdef HAVE_FFTW
|
|
|
static void
|
|
|
-loop_data_fftw (TimeEntry *time_entry)
|
|
|
+loop_data_fftw (OutputType outputType, TimeEntry *time_entry)
|
|
|
{
|
|
|
Timer *timer;
|
|
|
|
|
@@ -368,16 +389,16 @@ loop_data_fftw (TimeEntry *time_entry)
|
|
|
fftw_complex *host_immediate_mem;
|
|
|
fftw_plan plan;
|
|
|
fftw_plan inverse_plan;
|
|
|
- double time;
|
|
|
- double mflops;
|
|
|
+ double time_sec;
|
|
|
double sum = 0.0;
|
|
|
|
|
|
size_t side_size = pow(2,m);
|
|
|
size_t size = pow(side_size,dim);
|
|
|
+ size_t size_bytes = sizeof (fftw_complex) * size;
|
|
|
|
|
|
- host_orig_mem = fftw_malloc (sizeof (fftw_complex) * size);
|
|
|
- host_immediate_mem = fftw_malloc (sizeof (fftw_complex) * size);
|
|
|
- host_result_mem = fftw_malloc (sizeof (fftw_complex) * size);
|
|
|
+ host_orig_mem = fftw_malloc (size_bytes);
|
|
|
+ host_immediate_mem = fftw_malloc (size_bytes);
|
|
|
+ host_result_mem = fftw_malloc (size_bytes);
|
|
|
|
|
|
switch (dim) {
|
|
|
case 1:
|
|
@@ -445,15 +466,14 @@ loop_data_fftw (TimeEntry *time_entry)
|
|
|
sum += fabs (host_result_mem[j][1] / size - host_orig_mem[j][1]);
|
|
|
}
|
|
|
|
|
|
- time = timer_get_seconds (timer) / N_RUNS / 1000.0;
|
|
|
- mflops = 5 * size * log (size) / log (2) / time;
|
|
|
+ time_sec = timer_get_seconds (timer) / N_RUNS;
|
|
|
|
|
|
time_entry->dim_entries[k].sizes[i] = (unsigned int *)malloc(sizeof(unsigned int) * dim);
|
|
|
for (int j = 0; j < dim; j++) {
|
|
|
time_entry->dim_entries[k].sizes[i][j] = side_size;
|
|
|
}
|
|
|
|
|
|
- time_entry->dim_entries[k].times[i] = mflops;
|
|
|
+ time_entry->dim_entries[k].times[i] = get_measurements_with_format(outputType, size_bytes, time_sec);
|
|
|
time_entry->dim_entries[k].errors[i] = sum / size;
|
|
|
|
|
|
fftw_destroy_plan (inverse_plan);
|
|
@@ -664,9 +684,75 @@ compute_cuda_fft (cufftComplex *dev_mem,
|
|
|
}
|
|
|
#endif
|
|
|
|
|
|
+static void
|
|
|
+write_headers_in_file (int n_dims, bool only_time, FILE *fp)
|
|
|
+{
|
|
|
+ fprintf (fp, "# ");
|
|
|
+
|
|
|
+ for (int i = 0; i < n_dims; i++) {
|
|
|
+ int min_power = N_POWERS_INTERVALS[i][0];
|
|
|
+ int max_power = N_POWERS_INTERVALS[i][1];
|
|
|
+
|
|
|
+ for (int j = min_power; j <= max_power; j++) {
|
|
|
+ int side_size = pow(2,j);
|
|
|
+ switch (DIMS[i]) {
|
|
|
+ case 1:
|
|
|
+ fprintf (fp, "%d ", side_size);
|
|
|
+ if (!only_time) {
|
|
|
+ fprintf (fp, "%d(Error) ", side_size);
|
|
|
+ }
|
|
|
+ break;
|
|
|
+ case 2:
|
|
|
+ fprintf (fp, "%dx%d ", side_size, side_size);
|
|
|
+ if (!only_time) {
|
|
|
+ fprintf (fp, "%dx%d(Error) ", side_size, side_size);
|
|
|
+ }
|
|
|
+ break;
|
|
|
+ case 3:
|
|
|
+ fprintf (fp, "%dx%dx%d ", side_size, side_size, side_size);
|
|
|
+ if (!only_time) {
|
|
|
+ fprintf (fp, "%dx%dx%d(Error) ", side_size, side_size, side_size);
|
|
|
+ }
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+static void
|
|
|
+write_time_entries_in_file (TimeEntry* time_entries, int num_entries, int n_dims, bool only_time, bool new_line, FILE *fp)
|
|
|
+{
|
|
|
+ if (new_line) {
|
|
|
+ fprintf (fp, "\n");
|
|
|
+ }
|
|
|
+
|
|
|
+ for (int i = 0; i < num_entries; i++) {
|
|
|
+ fprintf (fp, "%s ", time_entries[i].lib_name);
|
|
|
+
|
|
|
+ DimEntry *dim_entries = time_entries[i].dim_entries;
|
|
|
+
|
|
|
+ for (int dim = 0; dim < n_dims; dim++) {
|
|
|
+ DimEntry dim_entry = dim_entries[dim];
|
|
|
+
|
|
|
+ for (int j = 0; j < (N_POWERS_INTERVALS[dim][1] - N_POWERS_INTERVALS[dim][0] + 1); j++) {
|
|
|
+ if (only_time) {
|
|
|
+ fprintf (fp, "%f ", dim_entry.times[j]);
|
|
|
+ }
|
|
|
+ else {
|
|
|
+ fprintf (fp, "%f %f ", dim_entry.times[j], dim_entry.errors[j]);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
int
|
|
|
main (int argc, char **argv)
|
|
|
{
|
|
|
+ OutputType outputType = OUT_THROUGHTPUT_MBS;
|
|
|
+ bool only_time = true;
|
|
|
+ bool new_line = true;
|
|
|
+
|
|
|
#ifdef HAVE_OPENCL
|
|
|
cl_platform_id platform;
|
|
|
cl_uint n_devices;
|
|
@@ -678,25 +764,22 @@ main (int argc, char **argv)
|
|
|
|
|
|
#ifdef HAVE_AMD_FFT
|
|
|
static int with_amd_fft = 1;
|
|
|
- double *amd_times;
|
|
|
- double *amd_errors;
|
|
|
+ TimeEntry *amd_time_entries;
|
|
|
#endif
|
|
|
|
|
|
#ifdef HAVE_CUDA_FFT
|
|
|
static int with_cuda_fft = 1;
|
|
|
- double *cuda_times;
|
|
|
- double *cuda_errors;
|
|
|
+ TimeEntry *cuda_time_entries;
|
|
|
#endif
|
|
|
|
|
|
#ifdef HAVE_APPLE_FFT
|
|
|
static int with_apple_fft = 1;
|
|
|
- double *apple_times;
|
|
|
- double *apple_errors;
|
|
|
+ TimeEntry *apple_time_entries;
|
|
|
#endif
|
|
|
|
|
|
#ifdef HAVE_FFTW
|
|
|
static int with_fftw = 1;
|
|
|
- TimeEntry *time_entries_fftw;
|
|
|
+ TimeEntry *fftw_time_entries;
|
|
|
#endif
|
|
|
|
|
|
static int show_help = 0;
|
|
@@ -719,10 +802,6 @@ main (int argc, char **argv)
|
|
|
{0, 0, 0, 0}
|
|
|
};
|
|
|
|
|
|
- size_t size = INITIAL_SIZE;
|
|
|
- FILE *fp;
|
|
|
- FILE *fp_new;
|
|
|
-
|
|
|
/* Parse options */
|
|
|
while (getopt_long (argc, argv, "", long_options, NULL) != -1)
|
|
|
;
|
|
@@ -736,13 +815,10 @@ main (int argc, char **argv)
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
- /* Write header */
|
|
|
- fp = fopen ("result.txt", "w");
|
|
|
- fp_new = fopen ("result_new.txt", "w");
|
|
|
-
|
|
|
- fprintf (fp, "# size ");
|
|
|
+ /* Open output file */
|
|
|
+ FILE *fp;
|
|
|
|
|
|
- fprintf (fp_new, "# ");
|
|
|
+ fp = fopen ("result.txt", "w");
|
|
|
|
|
|
#ifdef HAVE_OPENCL
|
|
|
OCL_CHECK_ERROR (clGetPlatformIDs (1, &platform, NULL));
|
|
@@ -763,123 +839,73 @@ main (int argc, char **argv)
|
|
|
#endif
|
|
|
|
|
|
#ifdef HAVE_AMD_FFT
|
|
|
- amd_times = malloc (n_devices * N_ARRAYS * sizeof (double));
|
|
|
- amd_errors = malloc (n_devices * N_ARRAYS * sizeof (double));
|
|
|
+ amd_time_entries = (TimeEntry *)malloc(sizeof(TimeEntry) * n_devices);
|
|
|
|
|
|
if (with_amd_fft) {
|
|
|
printf ("Testing AMD FFT ...\n");
|
|
|
- loop_data_opencl ("AMD", compute_amd_fft, context, queues, n_devices, amd_times, amd_errors, fp);
|
|
|
+ loop_data_opencl ("AMD", compute_amd_fft, context, queues, n_devices, outputType, amd_time_entries);
|
|
|
}
|
|
|
#endif
|
|
|
|
|
|
#ifdef HAVE_APPLE_FFT
|
|
|
- apple_times = malloc (n_devices * N_ARRAYS * sizeof (double));
|
|
|
- apple_errors = malloc (n_devices * N_ARRAYS * sizeof (double));
|
|
|
+ apple_time_entries = (TimeEntry *)malloc(sizeof(TimeEntry) * n_devices);
|
|
|
|
|
|
if (with_apple_fft) {
|
|
|
printf ("Testing Apple FFT ...\n");
|
|
|
- loop_data_opencl ("APP", compute_apple_fft, context, queues, n_devices, apple_times, apple_errors, fp);
|
|
|
+ loop_data_opencl ("APP", compute_apple_fft, context, queues, n_devices, outputType, apple_time_entries);
|
|
|
}
|
|
|
#endif
|
|
|
|
|
|
#ifdef HAVE_CUDA_FFT
|
|
|
- cuda_times = malloc (1 * N_ARRAYS * sizeof (double));
|
|
|
- cuda_errors = malloc (1 * N_ARRAYS * sizeof (double));
|
|
|
+ cuda_time_entries = (TimeEntry *)malloc(sizeof(TimeEntry) * 1);
|
|
|
|
|
|
if (with_cuda_fft) {
|
|
|
printf ("Testing CUDA FFT ...\n");
|
|
|
- loop_data_cuda ("CUDA", compute_cuda_fft, 1, cuda_times, cuda_errors, fp);
|
|
|
+ loop_data_cuda ("CUDA", compute_cuda_fft, 1, outputType, cuda_time_entries);
|
|
|
}
|
|
|
#endif
|
|
|
|
|
|
#ifdef HAVE_FFTW
|
|
|
- time_entries_fftw = (TimeEntry *)malloc(sizeof(TimeEntry));
|
|
|
+ fftw_time_entries = (TimeEntry *)malloc(sizeof(TimeEntry));
|
|
|
|
|
|
if (with_fftw) {
|
|
|
printf ("Testing FFTW3 ...\n");
|
|
|
- loop_data_fftw (&(time_entries_fftw[0]));
|
|
|
+ loop_data_fftw (outputType, &(fftw_time_entries[0]));
|
|
|
}
|
|
|
#endif
|
|
|
|
|
|
- for (int i = 0; i < N_DIMS; i++) {
|
|
|
- int min_power = N_POWERS_INTERVALS[i][0];
|
|
|
- int max_power = N_POWERS_INTERVALS[i][1];
|
|
|
-
|
|
|
- for (int j = min_power; j <= max_power; j++) {
|
|
|
- int side_size = pow(2,j);
|
|
|
- switch (DIMS[i]) {
|
|
|
- case 1:
|
|
|
- fprintf (fp_new, "%d ", side_size);
|
|
|
- fprintf (fp_new, "%d(Error) ", side_size);
|
|
|
- break;
|
|
|
- case 2:
|
|
|
- fprintf (fp_new, "%dx%d ", side_size, side_size);
|
|
|
- fprintf (fp_new, "%dx%d(Error) ", side_size, side_size);
|
|
|
- break;
|
|
|
- case 3:
|
|
|
- fprintf (fp_new, "%dx%dx%d ", side_size, side_size, side_size);
|
|
|
- fprintf (fp_new, "%dx%dx%d(Error) ", side_size, side_size, side_size);
|
|
|
- break;
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- }
|
|
|
-
|
|
|
- fprintf (fp_new, "\n");
|
|
|
-
|
|
|
- for (int i = 0; i < 1; i++) { //loop over time entries
|
|
|
- fprintf (fp_new, "%s ", time_entries_fftw[i].lib_name);
|
|
|
-
|
|
|
- DimEntry *dim_entries = time_entries_fftw[i].dim_entries;
|
|
|
-
|
|
|
- for (int dim = 0; dim < N_DIMS; dim++) {
|
|
|
- DimEntry dim_entry = dim_entries[dim];
|
|
|
-
|
|
|
- for (int j = 0; j < (N_POWERS_INTERVALS[dim][1] - N_POWERS_INTERVALS[dim][0] + 1); j++) {
|
|
|
- fprintf (fp_new, "%f %f ", dim_entry.times[j], dim_entry.errors[j]);
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- for (int i = 0; i < N_ARRAYS; i++) {
|
|
|
- UPDATE_SIZE (size);
|
|
|
-
|
|
|
- fprintf (fp, "\n%zu ", size);
|
|
|
-
|
|
|
- for (int j = 0; j < n_devices; j++) {
|
|
|
- int index = j * N_ARRAYS + i;
|
|
|
+ /* Write headers */
|
|
|
+ write_headers_in_file (N_DIMS, only_time, fp);
|
|
|
|
|
|
#ifdef HAVE_AMD_FFT
|
|
|
- if (with_amd_fft) {
|
|
|
- fprintf (fp, "%f %f ", amd_times[index], amd_errors[index]);
|
|
|
- }
|
|
|
+ if (with_amd_fft) {
|
|
|
+ write_time_entries_in_file (amd_time_entries, n_devices, N_DIMS, only_time, new_line, fp);
|
|
|
+ }
|
|
|
#endif
|
|
|
|
|
|
#ifdef HAVE_APPLE_FFT
|
|
|
- if (with_apple_fft) {
|
|
|
- fprintf (fp, "%f %f ", apple_times[index], apple_errors[index]);
|
|
|
- }
|
|
|
-#endif
|
|
|
- }
|
|
|
-/*
|
|
|
-#ifdef HAVE_FFTW
|
|
|
- if (with_fftw) {
|
|
|
- fprintf (fp, "%f %f ", fftw_times[i], fftw_errors[i]);
|
|
|
- }
|
|
|
+ if (with_apple_fft) {
|
|
|
+ write_time_entries_in_file (apple_time_entries, n_devices, N_DIMS, only_time, new_line, fp);
|
|
|
+ }
|
|
|
#endif
|
|
|
-*/
|
|
|
+
|
|
|
#ifdef HAVE_CUDA_FFT
|
|
|
- if (with_cuda_fft) {
|
|
|
- fprintf (fp, "%f %f ", cuda_times[i], cuda_times[i]);
|
|
|
- }
|
|
|
+ if (with_cuda_fft) {
|
|
|
+ write_time_entries_in_file (cuda_time_entries, 1, N_DIMS, only_time, new_line, fp);
|
|
|
+ }
|
|
|
#endif
|
|
|
+
|
|
|
+#ifdef HAVE_FFTW
|
|
|
+ if (with_fftw) {
|
|
|
+ write_time_entries_in_file (fftw_time_entries, 1, N_DIMS, only_time, new_line, fp);
|
|
|
}
|
|
|
+#endif
|
|
|
|
|
|
- fprintf (fp, "\n");
|
|
|
|
|
|
#ifdef HAVE_OPENCL
|
|
|
- for (int i = 0; i < n_devices; i++)
|
|
|
+ for (int i = 0; i < n_devices; i++) {
|
|
|
clReleaseCommandQueue (queues[i]);
|
|
|
+ }
|
|
|
|
|
|
clReleaseContext (context);
|
|
|
|
|
@@ -889,6 +915,5 @@ main (int argc, char **argv)
|
|
|
|
|
|
fclose (fp);
|
|
|
|
|
|
- fclose (fp_new);
|
|
|
return 0;
|
|
|
}
|