#include #include #include "opencl_fft.h" double sum_of_absolute_differences (float *a, float *b, int n, bool scale) { double sum = 0.0; for (int i = 0; i < n; i++) sum += fabs (a[i] - b[i] / (n / 2.)); return sum; } void loop_data_opencl (const char *vendor, OclBenchmarkFunc func, cl_context context, cl_command_queue *queues, int n_devices, OutputType outputType, TimeEntry *time_entries) { Timer *timer; cl_int err; char *device_name; size_t device_name_size; cl_device_id *devices; size_t device_list_size; clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &device_list_size); devices = (cl_device_id *) malloc(device_list_size); clGetContextInfo(context, CL_CONTEXT_DEVICES, device_list_size, devices, NULL); timer = timer_new (); for (int j = 0; j < n_devices; j++) { clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &device_name_size); device_name = (char *) malloc(device_name_size); clGetDeviceInfo(devices[j], CL_DEVICE_NAME, device_name_size, device_name, NULL); char vendor_name[100]; int v_len = sprintf(vendor_name, "%s (%s)", device_name, vendor); time_entries[j].lib_name = (char *)malloc(sizeof(char) * (v_len + 1)); strcpy(time_entries[j].lib_name, vendor_name); time_entries[j].dim_entries = (DimEntry *)malloc(N_DIMS * sizeof(DimEntry)); printf ("Device: %s\n", time_entries[j].lib_name); fflush (stdout); for (int k = 0; k < N_DIMS; k++) { int dim = DIMS[k]; int power_min = N_POWERS_INTERVALS[k][0]; int power_max = N_POWERS_INTERVALS[k][1]; int num_entries = power_max - power_min + 1; time_entries[j].dim_entries[k].n_dims = dim; time_entries[j].dim_entries[k].sizes = (unsigned int **)malloc(sizeof(unsigned int *) * num_entries); time_entries[j].dim_entries[k].times = (double *)malloc(sizeof(double) * num_entries); time_entries[j].dim_entries[k].errors = (double *)malloc(sizeof(double) * num_entries); PRINT_DIM (dim); fflush (stdout); for (int m = power_min, i = 0; m <= power_max; m++, i++) { size_t size_bytes; float *host_orig_mem; float *host_result_mem; cl_mem dev_mem; cl_mem dev_out_mem; size_t side_size = pow(2,m); size_t size = pow(side_size,dim); size_bytes = size * 2 * sizeof (float); host_orig_mem = malloc (size_bytes); host_result_mem = malloc (size_bytes); for (int l = 0; l < size * 2; l++) { host_orig_mem[l] = rand() / ((float) RAND_MAX); } dev_mem = clCreateBuffer (context, CL_MEM_READ_WRITE, size_bytes, NULL, &err); OCL_CHECK_ERROR (err); dev_out_mem = clCreateBuffer (context, CL_MEM_READ_WRITE, size_bytes, NULL, &err); OCL_CHECK_ERROR (err); PRINT_DIMS(dim, side_size); fflush (stdout); double time_sec; double sum; bool scale; printf ("."); fflush (stdout); OCL_CHECK_ERROR (clEnqueueWriteBuffer (queues[j], dev_mem, CL_TRUE, 0, size_bytes, host_orig_mem, 0, NULL, NULL)); size_t fft_size[3] = { 1, 1, 1}; time_entries[j].dim_entries[k].sizes[i] = (unsigned int *)malloc(sizeof(unsigned int) * dim); for (int l = 0; l < dim; l++) { fft_size[l] = side_size; time_entries[j].dim_entries[k].sizes[i][j] = side_size; } scale = func (context, queues[j], dev_mem, dev_out_mem, dim, fft_size, N_RUNS, timer); /* Check precision */ OCL_CHECK_ERROR (clEnqueueReadBuffer (queues[j], dev_out_mem, CL_TRUE, 0, size_bytes, host_result_mem, 0, NULL, NULL)); sum = sum_of_absolute_differences (host_orig_mem, host_result_mem, size * 2, scale); time_sec = timer_get_seconds (timer) / N_RUNS; time_entries[j].dim_entries[k].times[i] = get_measurements_with_format(outputType, size_bytes, time_sec); time_entries[j].dim_entries[k].errors[i] = sum / size; free (host_orig_mem); free (host_result_mem); OCL_CHECK_ERROR (clReleaseMemObject (dev_mem)); OCL_CHECK_ERROR (clReleaseMemObject (dev_out_mem)); } printf ("\n"); fflush (stdout); } } printf ("\n"); timer_destroy (timer); } #ifdef HAVE_AMD_FFT bool compute_amd_fft (cl_context context, cl_command_queue queue, cl_mem dev_mem, cl_mem out_mem, int n_dims, size_t *dims, int n_runs, Timer *timer) { clfftSetupData setup; clfftPlanHandle plan; clfftDim dim; cl_event event; size_t size; switch (n_dims) { case 1: dim = CLFFT_1D; break; case 2: dim = CLFFT_2D; break; case 3: dim = CLFFT_3D; break; default: fprintf (stderr, "Unknown FFT dimensions\n"); return false; } OCL_CHECK_ERROR (clfftSetup (&setup)); OCL_CHECK_ERROR (clfftCreateDefaultPlan (&plan, context, dim, dims)); OCL_CHECK_ERROR (clfftSetPlanPrecision (plan, CLFFT_SINGLE)); OCL_CHECK_ERROR (clfftSetLayout (plan, CLFFT_COMPLEX_INTERLEAVED, CLFFT_COMPLEX_INTERLEAVED)); OCL_CHECK_ERROR (clfftSetResultLocation (plan, CLFFT_OUTOFPLACE)); OCL_CHECK_ERROR (clfftBakePlan (plan, 1, &queue, NULL, NULL)); timer_start (timer); for (int i = 0; i < n_runs; i++) { OCL_CHECK_ERROR (clfftEnqueueTransform (plan, CLFFT_FORWARD, 1, &queue, 0, NULL, &event, &dev_mem, &out_mem, NULL)); OCL_CHECK_ERROR (clWaitForEvents (1, &event)); OCL_CHECK_ERROR (clReleaseEvent (event)); } timer_stop (timer); OCL_CHECK_ERROR (clfftEnqueueTransform (plan, CLFFT_BACKWARD, 1, &queue, 0, NULL, &event, &out_mem, &dev_mem, NULL)); OCL_CHECK_ERROR (clWaitForEvents (1, &event)); OCL_CHECK_ERROR (clReleaseEvent (event)); /* * We rely on the fact, that out_mem contains the inverse which currently * lies in dev_mem, so let's copy it back. */ OCL_CHECK_ERROR (clGetMemObjectInfo (dev_mem, CL_MEM_SIZE, sizeof (size_t), &size, NULL)); OCL_CHECK_ERROR (clEnqueueCopyBuffer (queue, dev_mem, out_mem, 0, 0, size, 0, NULL, &event)); OCL_CHECK_ERROR (clWaitForEvents (1, &event)); OCL_CHECK_ERROR (clReleaseEvent (event)); OCL_CHECK_ERROR (clfftDestroyPlan (&plan)); clfftTeardown (); return false; } #endif #ifdef HAVE_APPLE_FFT bool compute_apple_fft (cl_context context, cl_command_queue queue, cl_mem dev_mem, cl_mem out_mem, int n_dims, size_t *dims, int n_runs, Timer *timer) { clFFT_Plan plan; clFFT_Dimension dim; clFFT_Dim3 dim_sizes = {.x = 1, .y = 1, .z = 1}; cl_event event; cl_int err; size_t size; switch (n_dims) { case 1: dim = clFFT_1D; dim_sizes.x = dims[0]; break; case 2: dim = clFFT_2D; dim_sizes.x = dims[0]; dim_sizes.y = dims[1]; break; case 3: dim = clFFT_3D; dim_sizes.x = dims[0]; dim_sizes.y = dims[1]; dim_sizes.z = dims[2]; break; default: fprintf (stderr, "Unknown FFT dimensions\n"); return true; } plan = clFFT_CreatePlan (context, dim_sizes, dim, clFFT_InterleavedComplexFormat, &err); OCL_CHECK_ERROR (err); timer_start (timer); for (int i = 0; i < n_runs; i++) { err = clFFT_ExecuteInterleaved (queue, plan, 1, clFFT_Forward, dev_mem, out_mem, 0, NULL, NULL); OCL_CHECK_ERROR (err); /* Apple FFT does not return events, hence we need the hammer */ OCL_CHECK_ERROR (clFinish (queue)); } timer_stop (timer); err = clFFT_ExecuteInterleaved (queue, plan, 1, clFFT_Inverse, out_mem, dev_mem, 0, NULL, NULL); OCL_CHECK_ERROR (err); OCL_CHECK_ERROR (clFinish (queue)); OCL_CHECK_ERROR (clGetMemObjectInfo (dev_mem, CL_MEM_SIZE, sizeof (size_t), &size, NULL)); OCL_CHECK_ERROR (clEnqueueCopyBuffer (queue, dev_mem, out_mem, 0, 0, size, 0, NULL, &event)); OCL_CHECK_ERROR (clWaitForEvents (1, &event)); OCL_CHECK_ERROR (clReleaseEvent (event)); clFFT_DestroyPlan (plan); return true; } #endif