|
@@ -0,0 +1,443 @@
|
|
|
+#include <stdlib.h>
|
|
|
+#include <stdio.h>
|
|
|
+#include <stdbool.h>
|
|
|
+#include <math.h>
|
|
|
+#include <unistd.h>
|
|
|
+#include <getopt.h>
|
|
|
+
|
|
|
+#if defined HAVE_AMD_FFT || defined HAVE_APPLE_FFT
|
|
|
+#define HAVE_OPENCL 1
|
|
|
+#endif
|
|
|
+
|
|
|
+#ifdef HAVE_OPENCL
|
|
|
+#include <CL/cl.h>
|
|
|
+#endif
|
|
|
+
|
|
|
+#ifdef HAVE_AMD_FFT
|
|
|
+#include <clFFT.h>
|
|
|
+#endif
|
|
|
+
|
|
|
+#ifdef HAVE_APPLE_FFT
|
|
|
+#include <oclfft/clFFT.h>
|
|
|
+#endif
|
|
|
+
|
|
|
+#ifdef HAVE_FFTW
|
|
|
+#include <fftw3.h>
|
|
|
+#endif
|
|
|
+
|
|
|
+#include "timer.h"
|
|
|
+
|
|
|
+
|
|
|
/* Number of timed forward transforms executed for every problem size. */
static const int N_RUNS = 4;

/*
 * Number of problem sizes that are benchmarked. The benchmark loops start at
 * a size of 8 and multiply it by 8 before every iteration.
 */
static const int N_ARRAYS = 6;
|
|
|
+
|
|
|
+
|
|
|
+#ifdef HAVE_OPENCL
|
|
|
/*
 * Evaluate an OpenCL (or clFFT) status expression exactly once and, if it is
 * not CL_SUCCESS, report the source location on stderr. The macro only
 * reports the failure; it neither aborts nor changes control flow.
 *
 * The body is wrapped in do { ... } while (0) so that an invocation followed
 * by a semicolon is a single statement and can be used safely as the unbraced
 * body of an if/else.
 */
#define OCL_CHECK_ERROR(error) \
    do { \
        if ((error) != CL_SUCCESS) \
            fprintf (stderr, "OpenCL error <%s:%i>\n", __FILE__, __LINE__); \
    } while (0)
|
|
|
+
|
|
|
+typedef bool (*OclBenchmarkFunc) (cl_context context, cl_command_queue queue, cl_mem dev_mem, cl_mem dev_out_mem, int n_dims, size_t *dims, int n_runs, Timer *timer);
|
|
|
+
|
|
|
/*
 * Sum the absolute element-wise differences between a reference array and a
 * result array.
 *
 * a     - reference values, n floats
 * b     - values compared against the reference, n floats
 * n     - number of floats in each array
 * scale - when true, every element of b is divided by n / 2 before the
 *         comparison; when false, b is compared as is
 *
 * Returns the accumulated sum of |a[i] - b[i]| (with b optionally scaled) in
 * double precision; 0.0 when n is 0.
 */
static double
sum_of_absolute_differences (const float *a, const float *b, int n, bool scale)
{
    /* Honour the scale flag instead of unconditionally dividing by n / 2. */
    const double divisor = scale ? n / 2.0 : 1.0;
    double sum = 0.0;

    for (int i = 0; i < n; i++)
        sum += fabs (a[i] - b[i] / divisor);

    return sum;
}
|
|
|
+
|
|
|
+static void
|
|
|
+loop_data_opencl (const char *vendor,
|
|
|
+ OclBenchmarkFunc func,
|
|
|
+ cl_context context,
|
|
|
+ cl_command_queue *queues,
|
|
|
+ int n_devices,
|
|
|
+ FILE *fp)
|
|
|
+{
|
|
|
+ Timer *timer;
|
|
|
+ cl_int err;
|
|
|
+ size_t size = 8;
|
|
|
+
|
|
|
+ timer = timer_new ();
|
|
|
+
|
|
|
+ for (int i = 0; i < N_ARRAYS; i++) {
|
|
|
+ size_t size_bytes;
|
|
|
+ float *host_orig_mem;
|
|
|
+ float *host_result_mem;
|
|
|
+ cl_mem dev_mem;
|
|
|
+ cl_mem dev_out_mem;
|
|
|
+
|
|
|
+ size *= 8;
|
|
|
+ size_bytes = size * 2 * sizeof (float);
|
|
|
+ host_orig_mem = malloc (size_bytes);
|
|
|
+ host_result_mem = malloc (size_bytes);
|
|
|
+
|
|
|
+ for (int j = 0; j < size * 2; j++)
|
|
|
+ host_orig_mem[j] = rand() / ((float) RAND_MAX);
|
|
|
+
|
|
|
+ dev_mem = clCreateBuffer (context, CL_MEM_READ_WRITE, size_bytes, NULL, &err);
|
|
|
+ OCL_CHECK_ERROR (err);
|
|
|
+
|
|
|
+ dev_out_mem = clCreateBuffer (context, CL_MEM_READ_WRITE, size_bytes, NULL, &err);
|
|
|
+ OCL_CHECK_ERROR (err);
|
|
|
+ printf (" %zu", size);
|
|
|
+
|
|
|
+ for (int j = 0; j < n_devices; j++) {
|
|
|
+ double time;
|
|
|
+ double mflops;
|
|
|
+ double sum;
|
|
|
+ bool scale;
|
|
|
+
|
|
|
+ printf (".");
|
|
|
+ fflush (stdout);
|
|
|
+
|
|
|
+ OCL_CHECK_ERROR (clEnqueueWriteBuffer (queues[j], dev_mem, CL_TRUE, 0, size_bytes, host_orig_mem,
|
|
|
+ 0, NULL, NULL));
|
|
|
+
|
|
|
+ scale = func (context, queues[j], dev_mem, dev_out_mem, 1, &size, N_RUNS, timer);
|
|
|
+
|
|
|
+ /* Check precision */
|
|
|
+ OCL_CHECK_ERROR (clEnqueueReadBuffer (queues[j], dev_out_mem, CL_TRUE, 0, size_bytes, host_result_mem, 0, NULL, NULL));
|
|
|
+ sum = sum_of_absolute_differences (host_orig_mem, host_result_mem, size * 2, scale);
|
|
|
+
|
|
|
+ /*
|
|
|
+ * We use the "mflops" methodology from FFTW, which states that
|
|
|
+ * mflops = 5 * N * log_2 (N) / (time for one FFT in microseconds)
|
|
|
+ */
|
|
|
+ time = timer_get_seconds (timer) / N_RUNS / 1000.0;
|
|
|
+ mflops = 5 * size * log (size) / log (2) / time;
|
|
|
+ fprintf (fp, "\n%s %i %zu %f %f", vendor, j, size, mflops, sum / size);
|
|
|
+ }
|
|
|
+
|
|
|
+ free (host_orig_mem);
|
|
|
+ free (host_result_mem);
|
|
|
+ OCL_CHECK_ERROR (clReleaseMemObject (dev_mem));
|
|
|
+ OCL_CHECK_ERROR (clReleaseMemObject (dev_out_mem));
|
|
|
+ }
|
|
|
+
|
|
|
+ printf ("\n");
|
|
|
+ timer_destroy (timer);
|
|
|
+}
|
|
|
+#endif
|
|
|
+
|
|
|
+#ifdef HAVE_FFTW
|
|
|
+static void
|
|
|
+loop_data_fftw (FILE *fp)
|
|
|
+{
|
|
|
+ Timer *timer;
|
|
|
+ size_t size = 8;
|
|
|
+
|
|
|
+ timer = timer_new ();
|
|
|
+
|
|
|
+ for (int i = 0; i < N_ARRAYS; i++) {
|
|
|
+ fftw_complex *host_orig_mem;
|
|
|
+ fftw_complex *host_result_mem;
|
|
|
+ fftw_complex *host_immediate_mem;
|
|
|
+ fftw_plan plan;
|
|
|
+ fftw_plan inverse_plan;
|
|
|
+ double time;
|
|
|
+ double mflops;
|
|
|
+ double sum = 0.0;
|
|
|
+
|
|
|
+ size *= 8;
|
|
|
+ host_orig_mem = fftw_malloc (sizeof (fftw_complex) * size);
|
|
|
+ host_immediate_mem = fftw_malloc (sizeof (fftw_complex) * size);
|
|
|
+ host_result_mem = fftw_malloc (sizeof (fftw_complex) * size);
|
|
|
+
|
|
|
+ plan = fftw_plan_dft_1d (size, host_orig_mem, host_immediate_mem, FFTW_FORWARD, FFTW_ESTIMATE);
|
|
|
+
|
|
|
+ for (int j = 0; j < size; j++) {
|
|
|
+ host_orig_mem[j][0] = rand() / ((double) RAND_MAX);
|
|
|
+ host_orig_mem[j][1] = rand() / ((double) RAND_MAX);
|
|
|
+ }
|
|
|
+
|
|
|
+ printf (" %zu...", size);
|
|
|
+ fflush (stdout);
|
|
|
+
|
|
|
+ timer_start (timer);
|
|
|
+
|
|
|
+ for (int j = 0; j < N_RUNS; j++) {
|
|
|
+ fftw_execute (plan);
|
|
|
+ }
|
|
|
+
|
|
|
+ timer_stop (timer);
|
|
|
+
|
|
|
+ /* Check precision */
|
|
|
+ inverse_plan = fftw_plan_dft_1d (size, host_immediate_mem, host_result_mem, FFTW_BACKWARD, FFTW_ESTIMATE);
|
|
|
+ fftw_execute (inverse_plan);
|
|
|
+
|
|
|
+ for (int j = 0; j < size; j++) {
|
|
|
+ sum += fabs (host_result_mem[j][0] / size - host_orig_mem[j][0]);
|
|
|
+ sum += fabs (host_result_mem[j][1] / size - host_orig_mem[j][1]);
|
|
|
+ }
|
|
|
+
|
|
|
+ time = timer_get_seconds (timer) / N_RUNS / 1000.0;
|
|
|
+ mflops = 5 * size * log (size) / log (2) / time;
|
|
|
+ fprintf (fp, "\nFFTW 0 %zu %f %f", size, mflops, sum / size);
|
|
|
+
|
|
|
+ fftw_destroy_plan (inverse_plan);
|
|
|
+ fftw_destroy_plan (plan);
|
|
|
+ fftw_free (host_orig_mem);
|
|
|
+ fftw_free (host_immediate_mem);
|
|
|
+ fftw_free (host_result_mem);
|
|
|
+ }
|
|
|
+
|
|
|
+ printf ("\n");
|
|
|
+ timer_destroy (timer);
|
|
|
+}
|
|
|
+#endif
|
|
|
+
|
|
|
+#ifdef HAVE_AMD_FFT
|
|
|
+static bool
|
|
|
+compute_amd_fft (cl_context context,
|
|
|
+ cl_command_queue queue,
|
|
|
+ cl_mem dev_mem,
|
|
|
+ cl_mem out_mem,
|
|
|
+ int n_dims,
|
|
|
+ size_t *dims,
|
|
|
+ int n_runs,
|
|
|
+ Timer *timer)
|
|
|
+{
|
|
|
+ clfftSetupData setup;
|
|
|
+ clfftPlanHandle plan;
|
|
|
+ clfftDim dim;
|
|
|
+ cl_event event;
|
|
|
+ size_t size;
|
|
|
+
|
|
|
+ switch (n_dims) {
|
|
|
+ case 1:
|
|
|
+ dim = CLFFT_1D;
|
|
|
+ break;
|
|
|
+ case 2:
|
|
|
+ dim = CLFFT_2D;
|
|
|
+ break;
|
|
|
+ case 3:
|
|
|
+ dim = CLFFT_3D;
|
|
|
+ break;
|
|
|
+ default:
|
|
|
+ fprintf (stderr, "Unknown FFT dimensions\n");
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+
|
|
|
+ OCL_CHECK_ERROR (clfftSetup (&setup));
|
|
|
+ OCL_CHECK_ERROR (clfftCreateDefaultPlan (&plan, context, dim, dims));
|
|
|
+ OCL_CHECK_ERROR (clfftSetPlanPrecision (plan, CLFFT_SINGLE));
|
|
|
+ OCL_CHECK_ERROR (clfftSetLayout (plan, CLFFT_COMPLEX_INTERLEAVED, CLFFT_COMPLEX_INTERLEAVED));
|
|
|
+ OCL_CHECK_ERROR (clfftSetResultLocation (plan, CLFFT_OUTOFPLACE));
|
|
|
+ OCL_CHECK_ERROR (clfftBakePlan (plan, 1, &queue, NULL, NULL));
|
|
|
+
|
|
|
+ timer_start (timer);
|
|
|
+
|
|
|
+ for (int i = 0; i < n_runs; i++) {
|
|
|
+ OCL_CHECK_ERROR (clfftEnqueueTransform (plan, CLFFT_FORWARD, 1, &queue, 0, NULL, &event, &dev_mem, &out_mem, NULL));
|
|
|
+ OCL_CHECK_ERROR (clWaitForEvents (1, &event));
|
|
|
+ OCL_CHECK_ERROR (clReleaseEvent (event));
|
|
|
+ }
|
|
|
+
|
|
|
+ timer_stop (timer);
|
|
|
+
|
|
|
+ OCL_CHECK_ERROR (clfftEnqueueTransform (plan, CLFFT_BACKWARD, 1, &queue, 0, NULL, &event, &out_mem, &dev_mem, NULL));
|
|
|
+ OCL_CHECK_ERROR (clWaitForEvents (1, &event));
|
|
|
+ OCL_CHECK_ERROR (clReleaseEvent (event));
|
|
|
+
|
|
|
+ /*
|
|
|
+ * We rely on the fact, that out_mem contains the inverse which currently
|
|
|
+ * lies in dev_mem, so let's copy it back.
|
|
|
+ */
|
|
|
+ OCL_CHECK_ERROR (clGetMemObjectInfo (dev_mem, CL_MEM_SIZE, sizeof (size_t), &size, NULL));
|
|
|
+ OCL_CHECK_ERROR (clEnqueueCopyBuffer (queue, dev_mem, out_mem, 0, 0, size, 0, NULL, &event));
|
|
|
+ OCL_CHECK_ERROR (clWaitForEvents (1, &event));
|
|
|
+ OCL_CHECK_ERROR (clReleaseEvent (event));
|
|
|
+
|
|
|
+ OCL_CHECK_ERROR (clfftDestroyPlan (&plan));
|
|
|
+ clfftTeardown ();
|
|
|
+
|
|
|
+ return false;
|
|
|
+}
|
|
|
+#endif
|
|
|
+
|
|
|
+#ifdef HAVE_APPLE_FFT
|
|
|
+static bool
|
|
|
+compute_apple_fft (cl_context context,
|
|
|
+ cl_command_queue queue,
|
|
|
+ cl_mem dev_mem,
|
|
|
+ cl_mem out_mem,
|
|
|
+ int n_dims,
|
|
|
+ size_t *dims,
|
|
|
+ int n_runs,
|
|
|
+ Timer *timer)
|
|
|
+{
|
|
|
+ clFFT_Plan plan;
|
|
|
+ clFFT_Dimension dim;
|
|
|
+ clFFT_Dim3 dim_sizes = {.x = 1, .y = 1, .z = 1};
|
|
|
+ cl_event event;
|
|
|
+ cl_int err;
|
|
|
+ size_t size;
|
|
|
+
|
|
|
+ switch (n_dims) {
|
|
|
+ case 1:
|
|
|
+ dim = clFFT_1D;
|
|
|
+ dim_sizes.x = dims[0];
|
|
|
+ break;
|
|
|
+ case 2:
|
|
|
+ dim = clFFT_2D;
|
|
|
+ dim_sizes.x = dims[0];
|
|
|
+ dim_sizes.y = dims[1];
|
|
|
+ break;
|
|
|
+ case 3:
|
|
|
+ dim = clFFT_3D;
|
|
|
+ dim_sizes.x = dims[0];
|
|
|
+ dim_sizes.y = dims[1];
|
|
|
+ dim_sizes.z = dims[2];
|
|
|
+ break;
|
|
|
+ default:
|
|
|
+ fprintf (stderr, "Unknown FFT dimensions\n");
|
|
|
+ return true;
|
|
|
+ }
|
|
|
+
|
|
|
+ plan = clFFT_CreatePlan (context, dim_sizes, dim, clFFT_InterleavedComplexFormat, &err);
|
|
|
+ OCL_CHECK_ERROR (err);
|
|
|
+
|
|
|
+ timer_start (timer);
|
|
|
+
|
|
|
+ for (int i = 0; i < n_runs; i++) {
|
|
|
+ err = clFFT_ExecuteInterleaved (queue, plan, 1, clFFT_Forward, dev_mem, out_mem,
|
|
|
+ 0, NULL, NULL);
|
|
|
+ OCL_CHECK_ERROR (err);
|
|
|
+
|
|
|
+ /* Apple FFT does not return events, hence we need the hammer */
|
|
|
+ OCL_CHECK_ERROR (clFinish (queue));
|
|
|
+ }
|
|
|
+
|
|
|
+ timer_stop (timer);
|
|
|
+
|
|
|
+ err = clFFT_ExecuteInterleaved (queue, plan, 1, clFFT_Inverse, out_mem, dev_mem, 0, NULL, NULL);
|
|
|
+ OCL_CHECK_ERROR (err);
|
|
|
+ OCL_CHECK_ERROR (clFinish (queue));
|
|
|
+
|
|
|
+ OCL_CHECK_ERROR (clGetMemObjectInfo (dev_mem, CL_MEM_SIZE, sizeof (size_t), &size, NULL));
|
|
|
+ OCL_CHECK_ERROR (clEnqueueCopyBuffer (queue, dev_mem, out_mem, 0, 0, size, 0, NULL, &event));
|
|
|
+ OCL_CHECK_ERROR (clWaitForEvents (1, &event));
|
|
|
+ OCL_CHECK_ERROR (clReleaseEvent (event));
|
|
|
+
|
|
|
+ clFFT_DestroyPlan (plan);
|
|
|
+
|
|
|
+ return true;
|
|
|
+}
|
|
|
+#endif
|
|
|
+
|
|
|
/*
 * Entry point of the FFT benchmark.
 *
 * Parses the long options (--disable-<backend> for every compiled-in back
 * end and --help), writes the header line to "result.txt", sets up one
 * OpenCL command queue per device of the first platform when an OpenCL back
 * end is compiled in, runs every enabled back end and releases all
 * resources.
 *
 * Returns 0 on success and EXIT_FAILURE when the result file cannot be
 * opened or written, or when the host-side device bookkeeping cannot be
 * allocated. OpenCL failures are only reported through OCL_CHECK_ERROR.
 */
int
main (int argc, char **argv)
{
#ifdef HAVE_OPENCL
    cl_platform_id platform = NULL;
    cl_uint n_devices = 0;      /* stays 0 if the device query fails */
    cl_device_id *devices;
    cl_context context;
    cl_command_queue *queues;
    cl_int err;
#endif

#ifdef HAVE_AMD_FFT
    static int with_amd_fft = 1;
#endif

#ifdef HAVE_APPLE_FFT
    static int with_apple_fft = 1;
#endif

#ifdef HAVE_FFTW
    static int with_fftw = 1;
#endif

    static int show_help = 0;

    /* Every option only stores a value into its flag variable. */
    static struct option long_options[] =
    {
#ifdef HAVE_AMD_FFT
        {"disable-amd", no_argument, &with_amd_fft, 0},
#endif
#ifdef HAVE_APPLE_FFT
        {"disable-apple", no_argument, &with_apple_fft, 0},
#endif
#ifdef HAVE_FFTW
        {"disable-fftw", no_argument, &with_fftw, 0},
#endif
        {"help", no_argument, &show_help, 1},
        {0, 0, 0, 0}
    };

    FILE *fp;

    /* Parse options */
    while (getopt_long (argc, argv, "", long_options, NULL) != -1)
        ;

    if (show_help) {
        printf ("Usage: %s [OPTIONS]\n", argv[0]);
        printf ("Options:\n");

        for (int i = 0; long_options[i].name != 0; i++)
            printf("  --%s\n", long_options[i].name);

        return 0;
    }

    /* Write header */
    fp = fopen ("result.txt", "w");

    if (fp == NULL) {
        perror ("result.txt");
        return EXIT_FAILURE;
    }

    fprintf (fp, "# vendor device size mflops accuracy");

#ifdef HAVE_OPENCL
    OCL_CHECK_ERROR (clGetPlatformIDs (1, &platform, NULL));

    OCL_CHECK_ERROR (clGetDeviceIDs (platform, CL_DEVICE_TYPE_ALL, 0, NULL, &n_devices));
    devices = malloc (n_devices * sizeof (cl_device_id));
    queues = malloc (n_devices * sizeof (cl_command_queue));

    /* malloc (0) may legitimately return NULL, so only treat NULL as fatal when devices exist. */
    if (n_devices > 0 && (devices == NULL || queues == NULL)) {
        fprintf (stderr, "Could not allocate memory for %u OpenCL devices\n", (unsigned) n_devices);
        free (devices);
        free (queues);
        fclose (fp);
        return EXIT_FAILURE;
    }

    OCL_CHECK_ERROR (clGetDeviceIDs (platform, CL_DEVICE_TYPE_ALL, n_devices, devices, NULL));

    context = clCreateContext (NULL, n_devices, devices, NULL, NULL, &err);
    OCL_CHECK_ERROR (err);

    for (cl_uint i = 0; i < n_devices; i++) {
        queues[i] = clCreateCommandQueue (context, devices[i], 0, &err);
        OCL_CHECK_ERROR (err);
    }
#endif

#ifdef HAVE_AMD_FFT
    if (with_amd_fft) {
        printf ("Testing AMD FFT ...\n");
        loop_data_opencl ("AMD", compute_amd_fft, context, queues, (int) n_devices, fp);
    }
#endif

#ifdef HAVE_APPLE_FFT
    if (with_apple_fft) {
        printf ("Testing Apple FFT ...\n");
        loop_data_opencl ("APP", compute_apple_fft, context, queues, (int) n_devices, fp);
    }
#endif

#ifdef HAVE_FFTW
    if (with_fftw) {
        printf ("Testing FFTW3 ...\n");
        loop_data_fftw (fp);
    }
#endif

#ifdef HAVE_OPENCL
    for (cl_uint i = 0; i < n_devices; i++)
        clReleaseCommandQueue (queues[i]);

    clReleaseContext (context);

    free (queues);
    free (devices);
#endif

    /* fclose() flushes buffered results, so a failure here means lost output. */
    if (fclose (fp) != 0) {
        perror ("result.txt");
        return EXIT_FAILURE;
    }

    return 0;
}
|