Browse Source

Initial commit

Matthias Vogelgesang 10 years ago
commit
8e7cf950fa
4 changed files with 555 additions and 0 deletions
  1. 57 0
      Makefile
  2. 443 0
      benchmark.c
  3. 41 0
      timer.c
  4. 14 0
      timer.h

+ 57 - 0
Makefile

@@ -0,0 +1,57 @@
+# Build the FFT benchmark. Each optional FFT back end (clFFT, Apple oclfft,
+# FFTW3) is enabled when pkg-config can find it; the matching HAVE_* macro
+# is then passed to the C sources.
+BIN = benchmark
+SRC = $(wildcard *.c)
+# A substitution reference only rewrites a trailing ".c"; $(subst) would
+# also rewrite a ".c" in the middle of a file name.
+OBJS = $(SRC:.c=.o)
+
+CFLAGS ?= -O3 -Wall -Werror -std=c99 -fmessage-length=0 -I/usr/local/cuda/include
+LDFLAGS ?= -lOpenCL
+LIBS_MSG = ""
+
+
+AMD_FFT_EXISTS = $(shell pkg-config --exists clFFT && echo "1" || echo "0")
+
+ifeq ($(AMD_FFT_EXISTS),1)
+	override CPPFLAGS += -DHAVE_AMD_FFT
+	override CFLAGS += $(shell pkg-config --cflags clFFT)
+	override LDFLAGS += $(shell pkg-config --libs clFFT)
+	LIBS_MSG += " +amd"
+endif
+
+
+APPLE_FFT_EXISTS = $(shell pkg-config --exists oclfft && echo "1" || echo "0")
+
+ifeq ($(APPLE_FFT_EXISTS),1)
+	override CPPFLAGS += -DHAVE_APPLE_FFT
+	override CFLAGS += $(shell pkg-config --cflags oclfft)
+	override LDFLAGS += $(shell pkg-config --libs oclfft)
+	LIBS_MSG += " +apple"
+endif
+
+
+FFTW_EXISTS = $(shell pkg-config --exists fftw3 && echo "1" || echo "0")
+
+ifeq ($(FFTW_EXISTS),1)
+	override CPPFLAGS += -DHAVE_FFTW
+	override CFLAGS += $(shell pkg-config --cflags fftw3)
+	override LDFLAGS += $(shell pkg-config --libs fftw3)
+	LIBS_MSG += " +fftw"
+endif
+
+.PHONY: all clean run
+
+all: $(BIN)
+
+# Objects also depend on the Makefile so that a change in the detected
+# libraries or flags triggers a rebuild.
+%.o: %.c Makefile
+	@echo [CC] $<
+	@$(CC) $(CPPFLAGS) $(CFLAGS) -c $< -o $@
+
+# benchmark.c calls log() and fabs(), so libm is linked explicitly; it comes
+# last so that it resolves references from everything before it.
+$(BIN): $(OBJS)
+	@echo [LD] $@
+	@$(CC) $(OBJS) -o $@ $(LDFLAGS) -lm
+	@echo "     built with:$(LIBS_MSG)"
+
+run: $(BIN)
+	@echo "Run benchmark"
+	@./$(BIN)
+
+clean:
+	rm -f $(BIN) $(OBJS)

+ 443 - 0
benchmark.c

@@ -0,0 +1,443 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <math.h>
+#include <unistd.h>
+#include <getopt.h>
+
+#if defined HAVE_AMD_FFT || defined HAVE_APPLE_FFT
+#define HAVE_OPENCL     1
+#endif
+
+#ifdef HAVE_OPENCL
+#include <CL/cl.h>
+#endif
+
+#ifdef HAVE_AMD_FFT
+#include <clFFT.h>
+#endif
+
+#ifdef HAVE_APPLE_FFT
+#include <oclfft/clFFT.h>
+#endif
+
+#ifdef HAVE_FFTW
+#include <fftw3.h>
+#endif
+
+#include "timer.h"
+
+
+const int N_RUNS = 4;      /* number of timed forward transforms per measurement */
+const int N_ARRAYS = 6;    /* number of array sizes tested; the size is multiplied by 8 for each one */
+
+
+#ifdef HAVE_OPENCL
+#define OCL_CHECK_ERROR(error) { \
+    if ((error) != CL_SUCCESS) fprintf (stderr, "OpenCL error <%s:%i>\n", __FILE__, __LINE__); }
+/*
+ * Signature of an OpenCL FFT benchmark callback as invoked by
+ * loop_data_opencl(): it receives the context, one command queue, an input
+ * and an output buffer, the number of dimensions with their sizes, the
+ * number of timed runs and the timer to fill in. The returned bool is
+ * passed on as the `scale` argument of sum_of_absolute_differences().
+ */
+typedef bool (*OclBenchmarkFunc) (cl_context context, cl_command_queue queue, cl_mem dev_mem, cl_mem dev_out_mem, int n_dims, size_t *dims, int n_runs, Timer *timer);
+
+/*
+ * Sum of |a[i] - b[i]| over `n` floats, used to compare the original input
+ * with the result of a forward + inverse transform.
+ *
+ * a:     reference data (interleaved complex, n floats)
+ * b:     round-trip result (same layout)
+ * n:     number of floats, i.e. twice the number of complex samples
+ * scale: true if `b` comes from an unnormalized inverse FFT and must first
+ *        be divided by the transform length (n / 2); false if `b` is
+ *        already normalized and is compared as is
+ *
+ * Returns the accumulated absolute error as a double.
+ */
+static double
+sum_of_absolute_differences (float *a, float *b, int n, bool scale)
+{
+    /* Dividing by 1.0 is exact, so the unscaled path compares b unchanged. */
+    const double divisor = scale ? n / 2. : 1.0;
+    double sum = 0.0;
+
+    for (int i = 0; i < n; i++)
+        sum += fabs (a[i] - b[i] / divisor);
+
+    return sum;
+}
+
+/*
+ * Run one OpenCL FFT implementation over N_ARRAYS sizes (64, 512, ...,
+ * multiplying by 8 each time) on every device and append one result line
+ * per (size, device) pair to `fp`.
+ *
+ * vendor:    label written in the first column of each result line
+ * func:      benchmark callback; it fills `timer` and leaves the round-trip
+ *            result in the output buffer
+ * context:   OpenCL context shared by all queues
+ * queues:    one command queue per device
+ * n_devices: number of entries in `queues`
+ * fp:        open result stream
+ *
+ * Progress is printed to stdout. OpenCL errors are only reported through
+ * OCL_CHECK_ERROR; a failed host allocation ends the size loop early.
+ */
+static void
+loop_data_opencl (const char *vendor,
+                  OclBenchmarkFunc func,
+                  cl_context context,
+                  cl_command_queue *queues,
+                  int n_devices,
+                  FILE *fp)
+{
+    Timer *timer;
+    cl_int err;
+    size_t size = 8;
+
+    timer = timer_new ();
+
+    if (timer == NULL) {
+        fprintf (stderr, "Could not allocate timer\n");
+        return;
+    }
+
+    for (int i = 0; i < N_ARRAYS; i++) {
+        size_t n_floats;
+        size_t size_bytes;
+        float *host_orig_mem;
+        float *host_result_mem;
+        cl_mem dev_mem;
+        cl_mem dev_out_mem;
+
+        size *= 8;
+        n_floats = size * 2;    /* interleaved complex: two floats per sample */
+        size_bytes = n_floats * sizeof (float);
+        host_orig_mem = malloc (size_bytes);
+        host_result_mem = malloc (size_bytes);
+
+        if (host_orig_mem == NULL || host_result_mem == NULL) {
+            fprintf (stderr, "Could not allocate %zu bytes of host memory\n", size_bytes);
+            free (host_orig_mem);
+            free (host_result_mem);
+            break;
+        }
+
+        for (size_t j = 0; j < n_floats; j++)
+            host_orig_mem[j] = rand() / ((float) RAND_MAX);
+
+        dev_mem = clCreateBuffer (context, CL_MEM_READ_WRITE, size_bytes, NULL, &err);
+        OCL_CHECK_ERROR (err);
+
+        dev_out_mem = clCreateBuffer (context, CL_MEM_READ_WRITE, size_bytes, NULL, &err);
+        OCL_CHECK_ERROR (err);
+        printf (" %zu", size);
+
+        for (int j = 0; j < n_devices; j++) {
+            double time;
+            double mflops;
+            double sum;
+            bool scale;
+
+            printf (".");
+            fflush (stdout);
+
+            /* The callback overwrites dev_mem, so upload the input again for every device. */
+            OCL_CHECK_ERROR (clEnqueueWriteBuffer (queues[j], dev_mem, CL_TRUE, 0, size_bytes, host_orig_mem,
+                                                   0, NULL, NULL));
+
+            scale = func (context, queues[j], dev_mem, dev_out_mem, 1, &size, N_RUNS, timer);
+
+            /* Check precision */
+            OCL_CHECK_ERROR (clEnqueueReadBuffer (queues[j], dev_out_mem, CL_TRUE, 0, size_bytes, host_result_mem, 0, NULL, NULL));
+            sum = sum_of_absolute_differences (host_orig_mem, host_result_mem, (int) n_floats, scale);
+
+            /*
+             * We use the "mflops" methodology from FFTW, which states that
+             *   mflops = 5 * N * log_2 (N) / (time for one FFT in microseconds)
+             * so the averaged run time is converted from seconds to
+             * microseconds here.
+             */
+            time = timer_get_seconds (timer) / N_RUNS * 1000.0 * 1000.0;
+            mflops = 5 * size * log2 ((double) size) / time;
+            fprintf (fp, "\n%s %i %zu %f %f", vendor, j, size, mflops, sum / size);
+        }
+
+        free (host_orig_mem);
+        free (host_result_mem);
+        OCL_CHECK_ERROR (clReleaseMemObject (dev_mem));
+        OCL_CHECK_ERROR (clReleaseMemObject (dev_out_mem));
+    }
+
+    printf ("\n");
+    timer_destroy (timer);
+}
+#endif
+
+#ifdef HAVE_FFTW
+/*
+ * Benchmark FFTW's 1-D complex transform on the host for N_ARRAYS sizes
+ * (64, 512, ..., multiplying by 8 each time) and append one result line per
+ * size to `fp`.
+ *
+ * For each size, N_RUNS forward transforms are timed. A single inverse
+ * transform, divided by the length, is then compared with the input to
+ * give an accuracy figure. Progress is printed to stdout. A failed
+ * allocation ends the size loop early.
+ */
+static void
+loop_data_fftw (FILE *fp)
+{
+    Timer *timer;
+    size_t size = 8;
+
+    timer = timer_new ();
+
+    if (timer == NULL) {
+        fprintf (stderr, "Could not allocate timer\n");
+        return;
+    }
+
+    for (int i = 0; i < N_ARRAYS; i++) {
+        fftw_complex *host_orig_mem;
+        fftw_complex *host_result_mem;
+        fftw_complex *host_immediate_mem;
+        fftw_plan plan;
+        fftw_plan inverse_plan;
+        double time;
+        double mflops;
+        double sum = 0.0;
+
+        size *= 8;
+        host_orig_mem = fftw_malloc (sizeof (fftw_complex) * size);
+        host_immediate_mem = fftw_malloc (sizeof (fftw_complex) * size);
+        host_result_mem = fftw_malloc (sizeof (fftw_complex) * size);
+
+        if (host_orig_mem == NULL || host_immediate_mem == NULL || host_result_mem == NULL) {
+            fprintf (stderr, "Could not allocate FFTW buffers for %zu samples\n", size);
+
+            if (host_orig_mem != NULL)
+                fftw_free (host_orig_mem);
+
+            if (host_immediate_mem != NULL)
+                fftw_free (host_immediate_mem);
+
+            if (host_result_mem != NULL)
+                fftw_free (host_result_mem);
+
+            break;
+        }
+
+        plan = fftw_plan_dft_1d (size, host_orig_mem, host_immediate_mem, FFTW_FORWARD, FFTW_ESTIMATE);
+
+        for (size_t j = 0; j < size; j++) {
+            host_orig_mem[j][0] = rand() / ((double) RAND_MAX);
+            host_orig_mem[j][1] = rand() / ((double) RAND_MAX);
+        }
+
+        printf (" %zu...", size);
+        fflush (stdout);
+
+        timer_start (timer);
+
+        for (int j = 0; j < N_RUNS; j++) {
+            fftw_execute (plan);
+        }
+
+        timer_stop (timer);
+
+        /* Check precision */
+        inverse_plan = fftw_plan_dft_1d (size, host_immediate_mem, host_result_mem, FFTW_BACKWARD, FFTW_ESTIMATE);
+        fftw_execute (inverse_plan);
+
+        /* The inverse result is divided by the length before comparing. */
+        for (size_t j = 0; j < size; j++) {
+            sum += fabs (host_result_mem[j][0] / size - host_orig_mem[j][0]);
+            sum += fabs (host_result_mem[j][1] / size - host_orig_mem[j][1]);
+        }
+
+        /*
+         * mflops = 5 * N * log_2 (N) / (time for one FFT in microseconds),
+         * so the averaged run time is converted from seconds to microseconds.
+         */
+        time = timer_get_seconds (timer) / N_RUNS * 1000.0 * 1000.0;
+        mflops = 5 * size * log2 ((double) size) / time;
+        fprintf (fp, "\nFFTW 0 %zu %f %f", size, mflops, sum / size);
+
+        fftw_destroy_plan (inverse_plan);
+        fftw_destroy_plan (plan);
+        fftw_free (host_orig_mem);
+        fftw_free (host_immediate_mem);
+        fftw_free (host_result_mem);
+    }
+
+    printf ("\n");
+    timer_destroy (timer);
+}
+#endif
+
+#ifdef HAVE_AMD_FFT
+/*
+ * Benchmark callback for AMD's clFFT library.
+ *
+ * Times `n_runs` out-of-place forward transforms from `dev_mem` to
+ * `out_mem`, then runs one backward transform from `out_mem` to `dev_mem`
+ * and copies that round-trip result into `out_mem`, where the caller reads
+ * it.
+ *
+ * context: OpenCL context used to create the plan
+ * queue:   command queue on which all work is enqueued
+ * dev_mem: input buffer (single-precision interleaved complex); overwritten
+ * out_mem: output buffer; holds the round-trip result on return
+ * n_dims:  number of dimensions, 1 to 3
+ * dims:    `n_dims` transform lengths
+ * n_runs:  number of timed forward transforms
+ * timer:   started before and stopped after the timed loop
+ *
+ * Returns false in every case, including an unsupported `n_dims`. The
+ * caller uses the value to decide whether the result still needs to be
+ * divided by the transform length; presumably clFFT's default backward
+ * scale already does that (confirm against the clFFT documentation).
+ */
+static bool
+compute_amd_fft (cl_context context,
+                 cl_command_queue queue,
+                 cl_mem dev_mem,
+                 cl_mem out_mem,
+                 int n_dims,
+                 size_t *dims,
+                 int n_runs,
+                 Timer *timer)
+{
+    clfftSetupData setup;
+    clfftPlanHandle plan;
+    clfftDim dim;
+    cl_event event;
+    size_t size;
+
+    switch (n_dims) {
+        case 1:
+            dim = CLFFT_1D;
+            break;
+        case 2:
+            dim = CLFFT_2D;
+            break;
+        case 3:
+            dim = CLFFT_3D;
+            break;
+        default:
+            fprintf (stderr, "Unknown FFT dimensions\n");
+            return false;
+    }
+
+    /* clfftSetup() reads the struct, so fill in the library defaults first. */
+    OCL_CHECK_ERROR (clfftInitSetupData (&setup));
+    OCL_CHECK_ERROR (clfftSetup (&setup));
+    OCL_CHECK_ERROR (clfftCreateDefaultPlan (&plan, context, dim, dims));
+    OCL_CHECK_ERROR (clfftSetPlanPrecision (plan, CLFFT_SINGLE));
+    OCL_CHECK_ERROR (clfftSetLayout (plan, CLFFT_COMPLEX_INTERLEAVED, CLFFT_COMPLEX_INTERLEAVED));
+    OCL_CHECK_ERROR (clfftSetResultLocation (plan, CLFFT_OUTOFPLACE));
+    OCL_CHECK_ERROR (clfftBakePlan (plan, 1, &queue, NULL, NULL));
+
+    timer_start (timer);
+
+    for (int i = 0; i < n_runs; i++) {
+        OCL_CHECK_ERROR (clfftEnqueueTransform (plan, CLFFT_FORWARD, 1, &queue, 0, NULL, &event, &dev_mem, &out_mem, NULL));
+        OCL_CHECK_ERROR (clWaitForEvents (1, &event));
+        OCL_CHECK_ERROR (clReleaseEvent (event));
+    }
+
+    timer_stop (timer);
+
+    OCL_CHECK_ERROR (clfftEnqueueTransform (plan, CLFFT_BACKWARD, 1, &queue, 0, NULL, &event, &out_mem, &dev_mem, NULL));
+    OCL_CHECK_ERROR (clWaitForEvents (1, &event));
+    OCL_CHECK_ERROR (clReleaseEvent (event));
+
+    /*
+     * The caller reads the round-trip result from out_mem, but the backward
+     * transform has just written it to dev_mem, so copy it over.
+     */
+    OCL_CHECK_ERROR (clGetMemObjectInfo (dev_mem, CL_MEM_SIZE, sizeof (size_t), &size, NULL));
+    OCL_CHECK_ERROR (clEnqueueCopyBuffer (queue, dev_mem, out_mem, 0, 0, size, 0, NULL, &event));
+    OCL_CHECK_ERROR (clWaitForEvents (1, &event));
+    OCL_CHECK_ERROR (clReleaseEvent (event));
+
+    OCL_CHECK_ERROR (clfftDestroyPlan (&plan));
+    OCL_CHECK_ERROR (clfftTeardown ());
+
+    return false;
+}
+#endif
+
+#ifdef HAVE_APPLE_FFT
+/*
+ * Benchmark callback for the Apple OpenCL FFT (oclfft) library.
+ *
+ * Times `n_runs` forward transforms from `dev_mem` to `out_mem`, then runs
+ * one inverse transform from `out_mem` to `dev_mem` and copies that
+ * round-trip result into `out_mem`.
+ *
+ * context: OpenCL context used to create the plan
+ * queue:   command queue on which all work is enqueued
+ * dev_mem: input buffer (interleaved complex); overwritten
+ * out_mem: output buffer; holds the round-trip result on return
+ * n_dims:  number of dimensions, 1 to 3
+ * dims:    `n_dims` transform lengths
+ * n_runs:  number of timed forward transforms
+ * timer:   started before and stopped after the timed loop
+ *
+ * Returns true in every case, including an unsupported `n_dims`.
+ */
+static bool
+compute_apple_fft (cl_context context,
+                   cl_command_queue queue,
+                   cl_mem dev_mem,
+                   cl_mem out_mem,
+                   int n_dims,
+                   size_t *dims,
+                   int n_runs,
+                   Timer *timer)
+{
+    clFFT_Dim3 extent = {.x = 1, .y = 1, .z = 1};
+    clFFT_Dimension rank;
+    clFFT_Plan fft_plan;
+    cl_event copy_done;
+    cl_int status;
+    size_t n_bytes;
+    int run = 0;
+
+    if (n_dims < 1 || n_dims > 3) {
+        fprintf (stderr, "Unknown FFT dimensions\n");
+        return true;
+    }
+
+    /* Axes beyond n_dims keep their initial length of 1. */
+    extent.x = dims[0];
+
+    if (n_dims == 1) {
+        rank = clFFT_1D;
+    }
+    else if (n_dims == 2) {
+        rank = clFFT_2D;
+        extent.y = dims[1];
+    }
+    else {
+        rank = clFFT_3D;
+        extent.y = dims[1];
+        extent.z = dims[2];
+    }
+
+    fft_plan = clFFT_CreatePlan (context, extent, rank, clFFT_InterleavedComplexFormat, &status);
+    OCL_CHECK_ERROR (status);
+
+    timer_start (timer);
+
+    while (run < n_runs) {
+        status = clFFT_ExecuteInterleaved (queue, fft_plan, 1, clFFT_Forward, dev_mem, out_mem,
+                                           0, NULL, NULL);
+        OCL_CHECK_ERROR (status);
+
+        /* No event is requested from the call above, so drain the whole queue instead. */
+        OCL_CHECK_ERROR (clFinish (queue));
+        run++;
+    }
+
+    timer_stop (timer);
+
+    /* Inverse transform back into dev_mem for the precision check. */
+    status = clFFT_ExecuteInterleaved (queue, fft_plan, 1, clFFT_Inverse, out_mem, dev_mem, 0, NULL, NULL);
+    OCL_CHECK_ERROR (status);
+    OCL_CHECK_ERROR (clFinish (queue));
+
+    /* Copy the round-trip result from dev_mem into out_mem. */
+    OCL_CHECK_ERROR (clGetMemObjectInfo (dev_mem, CL_MEM_SIZE, sizeof (size_t), &n_bytes, NULL));
+    OCL_CHECK_ERROR (clEnqueueCopyBuffer (queue, dev_mem, out_mem, 0, 0, n_bytes, 0, NULL, &copy_done));
+    OCL_CHECK_ERROR (clWaitForEvents (1, &copy_done));
+    OCL_CHECK_ERROR (clReleaseEvent (copy_done));
+
+    clFFT_DestroyPlan (fft_plan);
+
+    return true;
+}
+#endif
+
+/*
+ * Entry point: parse the --disable-* / --help options, open result.txt,
+ * set up one OpenCL context with one queue per device (when an OpenCL FFT
+ * back end is compiled in), run every enabled benchmark and clean up.
+ *
+ * Returns 0 on success or after printing help, and 1 if result.txt cannot
+ * be opened or written, or if the device tables cannot be allocated.
+ */
+int
+main (int argc, char **argv)
+{
+#ifdef HAVE_OPENCL
+    /* Initialized so that a failed query leaves well-defined values behind. */
+    cl_platform_id platform = NULL;
+    cl_uint n_devices = 0;
+    cl_device_id *devices;
+    cl_context context;
+    cl_command_queue *queues;
+    cl_int err;
+#endif
+
+#ifdef HAVE_AMD_FFT
+    static int with_amd_fft = 1;
+#endif
+
+#ifdef HAVE_APPLE_FFT
+    static int with_apple_fft = 1;
+#endif
+
+#ifdef HAVE_FFTW
+    static int with_fftw = 1;
+#endif
+
+    static int show_help = 0;
+
+    static struct option long_options[] =
+    {
+#ifdef HAVE_AMD_FFT
+        {"disable-amd", no_argument, &with_amd_fft, 0},
+#endif
+#ifdef HAVE_APPLE_FFT
+        {"disable-apple", no_argument, &with_apple_fft, 0},
+#endif
+#ifdef HAVE_FFTW
+        {"disable-fftw", no_argument, &with_fftw, 0},
+#endif
+        {"help", no_argument, &show_help, 1},
+        {0, 0, 0, 0}
+    };
+
+    FILE *fp;
+
+    /* Parse options; every option only sets its flag, so the loop body is empty. */
+    while (getopt_long (argc, argv, "", long_options, NULL) != -1)
+        ;
+
+    if (show_help) {
+        printf ("Usage: %s [OPTIONS]\n", argv[0]);
+        printf ("Options:\n");
+
+        for (int i = 0; long_options[i].name != 0; i++)
+            printf("  --%s\n", long_options[i].name);
+        return 0;
+    }
+
+    /* Write header */
+    fp = fopen ("result.txt", "w");
+
+    if (fp == NULL) {
+        perror ("result.txt");
+        return 1;
+    }
+
+    fprintf (fp, "# vendor device size mflops accuracy");
+
+#ifdef HAVE_OPENCL
+    OCL_CHECK_ERROR (clGetPlatformIDs (1, &platform, NULL));
+
+    OCL_CHECK_ERROR (clGetDeviceIDs (platform, CL_DEVICE_TYPE_ALL, 0, NULL, &n_devices));
+    devices = malloc (n_devices * sizeof (cl_device_id));
+    queues = malloc (n_devices * sizeof (cl_command_queue));
+
+    /* malloc (0) may legitimately return NULL, so only treat NULL as failure when devices exist. */
+    if (n_devices > 0 && (devices == NULL || queues == NULL)) {
+        fprintf (stderr, "Could not allocate memory for %u OpenCL devices\n", (unsigned) n_devices);
+        free (devices);
+        free (queues);
+        fclose (fp);
+        return 1;
+    }
+
+    OCL_CHECK_ERROR (clGetDeviceIDs (platform, CL_DEVICE_TYPE_ALL, n_devices, devices, NULL));
+
+    context = clCreateContext (NULL, n_devices, devices, NULL, NULL, &err);
+    OCL_CHECK_ERROR (err);
+
+    for (cl_uint i = 0; i < n_devices; i++) {
+        queues[i] = clCreateCommandQueue (context, devices[i], 0, &err);
+        OCL_CHECK_ERROR (err);
+    }
+#endif
+
+#ifdef HAVE_AMD_FFT
+    if (with_amd_fft) {
+        printf ("Testing AMD FFT ...\n");
+        loop_data_opencl ("AMD", compute_amd_fft, context, queues, n_devices, fp);
+    }
+#endif
+
+#ifdef HAVE_APPLE_FFT
+    if (with_apple_fft) {
+        printf ("Testing Apple FFT ...\n");
+        loop_data_opencl ("APP", compute_apple_fft, context, queues, n_devices, fp);
+    }
+#endif
+
+#ifdef HAVE_FFTW
+    if (with_fftw) {
+        printf ("Testing FFTW3 ...\n");
+        loop_data_fftw (fp);
+    }
+#endif
+
+#ifdef HAVE_OPENCL
+    for (cl_uint i = 0; i < n_devices; i++)
+        clReleaseCommandQueue (queues[i]);
+
+    clReleaseContext (context);
+
+    free (queues);
+    free (devices);
+#endif
+
+    /* fclose flushes the buffered results; a failure here means they were lost. */
+    if (fclose (fp) != 0) {
+        perror ("result.txt");
+        return 1;
+    }
+
+    return 0;
+}

+ 41 - 0
timer.c

@@ -0,0 +1,41 @@
+#include <stdlib.h>
+#include "timer.h"
+
+struct _Timer {
+    struct timeval start;   /* written by timer_start() */
+    struct timeval end;     /* written by timer_stop() */
+};
+
+
+/*
+ * Allocate a new timer whose start and end times are both zeroed, so that
+ * timer_get_seconds() reads defined values (and returns 0) before the first
+ * start/stop pair.
+ *
+ * Returns NULL if the allocation fails. Release with timer_destroy().
+ */
+Timer *
+timer_new (void)
+{
+    return calloc (1, sizeof (Timer));
+}
+/* Free a timer obtained from timer_new(). */
+void
+timer_destroy (Timer *t)
+{
+    free (t);
+}
+/* Store the current time of day as the timer's start time. */
+void
+timer_start (Timer *t)
+{
+    gettimeofday (&t->start, NULL);
+}
+/* Store the current time of day as the timer's end time. */
+void
+timer_stop (Timer *t)
+{
+    gettimeofday (&t->end, NULL);
+}
+/*
+ * Return the difference between the stored end and start times in seconds,
+ * including the fractional part taken from the microsecond fields.
+ */
+double
+timer_get_seconds (Timer *t)
+{
+    long seconds = t->end.tv_sec - t->start.tv_sec;
+    long useconds = t->end.tv_usec - t->start.tv_usec;  /* may be negative; the sum below then subtracts it */
+    return seconds + useconds / 1000.0 / 1000.0;
+}

+ 14 - 0
timer.h

@@ -0,0 +1,14 @@
+#ifndef TIMER_H
+#define TIMER_H
+
+#include <sys/time.h>
+/* Opaque stopwatch type; the struct itself is defined in timer.c. */
+typedef struct _Timer Timer;
+
+Timer * timer_new           (void);         /* allocate a timer; release with timer_destroy() */
+void    timer_destroy       (Timer *t);     /* free a timer */
+void    timer_start         (Timer *t);     /* record the start time */
+void    timer_stop          (Timer *t);     /* record the end time */
+double  timer_get_seconds   (Timer *t);     /* end time minus start time, in seconds */
+
+#endif