|
@@ -1,9 +1,13 @@
|
|
|
+#define _XOPEN_SOURCE 500
|
|
|
+
|
|
|
#include <stdio.h>
|
|
|
#include <stdlib.h>
|
|
|
#include <stdbool.h>
|
|
|
#include <string.h>
|
|
|
+#include <unistd.h>
|
|
|
#include <pcilib.h>
|
|
|
#include <pcilib/bar.h>
|
|
|
+#include <pcilib/kmem.h>
|
|
|
#include <CL/cl.h>
|
|
|
#include <CL/cl_ext.h>
|
|
|
#include "ocl.h"
|
|
@@ -16,6 +20,10 @@ typedef struct {
|
|
|
pcilib_t *pci;
|
|
|
uint8_t *bar;
|
|
|
cl_ulong bar_phys;
|
|
|
+ uint8_t board_gen;
|
|
|
+ pcilib_kmem_handle_t *kdesc;
|
|
|
+ uintptr_t kdesc_bus;
|
|
|
+ volatile uint32_t *desc;
|
|
|
|
|
|
/* OpenCL */
|
|
|
OclPlatform *ocl;
|
|
@@ -24,14 +32,49 @@ typedef struct {
|
|
|
cl_context context;
|
|
|
cl_program program;
|
|
|
cl_kernel kernel;
|
|
|
- cl_mem fpga_write_buffer;
|
|
|
cl_mem check_buffer;
|
|
|
+
|
|
|
+ /* both */
|
|
|
+ cl_mem fpga_buffer;
|
|
|
+
|
|
|
+ struct {
|
|
|
+ cl_mem buffer;
|
|
|
+ cl_bus_address_amd addr;
|
|
|
+ } gpu;
|
|
|
} App;
|
|
|
|
|
|
/* Glyphs printed by the debug_* helpers: U+2717 marks a failed check,
   U+2713 a passed one. */
#define UNICODE_CROSS "\u2717"
#define UNICODE_CHECK_MARK "\u2713"
|
|
|
+
|
|
|
/* Flag set passed to pcilib both when allocating and when freeing the
   descriptor memory. The expansion is parenthesised so the OR-chain stays a
   single operand next to higher-precedence operators (&, ==, ~, ...). */
#define KMEM_DEFAULT_FLAGS (PCILIB_KMEM_FLAG_HARDWARE | \
                            PCILIB_KMEM_FLAG_PERSISTENT | \
                            PCILIB_KMEM_FLAG_EXCLUSIVE)

/* pcilib "use" identifiers built from PCILIB_KMEM_USE_USER; KMEM_USE_RING
   tags the descriptor allocation made in init_pcilib. */
#define KMEM_USE_RING PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 1)
#define KMEM_USE_DEFAULT PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 2)
|
|
|
+
|
|
|
/* Register byte offsets, relative to app->bar, for use with WR32/RD32. */

/* low block, sorted by offset */
#define REG_RESET_DMA                   0x00
#define REG_START_DMA                   0x04
#define REG_PACKET_LENGTH               0x0C
#define REG_NUM_PACKETS_PER_DESCRIPTOR  0x10
#define REG_DESCRIPTOR_ADDRESS          0x50
#define REG_UPDATE_ADDRESS              0x54
#define REG_LAST_DESCRIPTOR             0x58
#define REG_NUM_DESCRIPTORS             0x5C
#define REG_UPDATE_THRESHOLD            0x60

/* 0x9000 block, written by start_dma */
#define REG_ENABLE_COUNTER              0x9000
#define REG_CONTROL                     0x9040
#define REG_NUM_ROWS                    0x9168
#define REG_NUM_FRAMES                  0x9170
|
|
|
|
|
|
/*
 * 32-bit accessors for the register window at app->bar. Each macro expects a
 * variable named `app` with a byte pointer member `bar` in scope where it is
 * expanded.
 *
 * All accesses go through a volatile-qualified pointer so the compiler can
 * neither drop a store nor answer a read-back from the value it just wrote.
 */

/* Store `value` at byte offset `addr`. The trailing semicolon is kept from
   the original definition so every existing use expands as before; as a
   consequence, do not use this as the unbraced body of an if/else. */
#define WR32(addr, value) *(volatile uint32_t *) (app->bar + (addr)) = (value);

/* Load the 32-bit word at byte offset `addr`. */
#define RD32(addr) (*(volatile uint32_t *) (app->bar + (addr)))

/* Store `value` at byte offset `addr`, then pause for 100 microseconds.
   Wrapped in do/while (0) so the two steps form one statement. */
#define WR32_sleep(addr, value) \
    do { \
        *(volatile uint32_t *) (app->bar + (addr)) = (value); \
        usleep (100); \
    } while (0)
|
|
|
+
|
|
|
+static clEnqueueMakeBuffersResidentAMD_fn clEnqueueMakeBuffersResidentAMD = NULL;
|
|
|
+
|
|
|
|
|
|
/* declaration should actually come from a distributed header file */
|
|
|
const pcilib_board_info_t *pcilib_get_board_info (pcilib_t *);
|
|
@@ -60,6 +103,12 @@ init_pcilib (App *app)
|
|
|
|
|
|
board = pcilib_get_board_info (app->pci);
|
|
|
app->bar_phys = board->bar_start[PCILIB_BAR0];
|
|
|
+ app->board_gen = RD32 (0x18) & 0xF;
|
|
|
+
|
|
|
+ app->kdesc = pcilib_alloc_kernel_memory (app->pci, PCILIB_KMEM_TYPE_CONSISTENT, 1, 128, 4096, KMEM_USE_RING, KMEM_DEFAULT_FLAGS);
|
|
|
+ app->kdesc_bus = pcilib_kmem_get_block_ba (app->pci, app->kdesc, 0);
|
|
|
+ app->desc = (uint32_t *) pcilib_kmem_get_block_ua (app->pci, app->kdesc, 0);
|
|
|
+ memset ((uint32_t *) app->desc, 0, 5 * sizeof (uint32_t));
|
|
|
|
|
|
return true;
|
|
|
}
|
|
@@ -67,29 +116,56 @@ init_pcilib (App *app)
|
|
|
static void
|
|
|
close_pcilib (App *app)
|
|
|
{
|
|
|
+ pcilib_free_kernel_memory (app->pci, app->kdesc, KMEM_DEFAULT_FLAGS);
|
|
|
pcilib_unmap_bar (app->pci, PCILIB_BAR0, (void *) app->bar);
|
|
|
pcilib_close (app->pci);
|
|
|
}
|
|
|
|
|
|
-static cl_mem
|
|
|
-create_fpga_write_buffer (App *app, size_t size, cl_int *error)
|
|
|
+static cl_int
|
|
|
+create_fpga_buffer (App *app, size_t size)
|
|
|
{
|
|
|
cl_mem_flags flags;
|
|
|
cl_bus_address_amd addr;
|
|
|
+ cl_int error;
|
|
|
|
|
|
flags = CL_MEM_EXTERNAL_PHYSICAL_AMD;
|
|
|
addr.surface_bus_address = (cl_ulong) app->bar_phys;
|
|
|
addr.marker_bus_address = (cl_ulong) app->bar_phys;
|
|
|
|
|
|
- return clCreateBuffer (app->context, flags, size, &addr, error);
|
|
|
+ app->fpga_buffer = clCreateBuffer (app->context, flags, size, &addr, &error);
|
|
|
+ return error;
|
|
|
+}
|
|
|
+
|
|
|
+static cl_int
|
|
|
+create_gpu_buffer (App *app, size_t size)
|
|
|
+{
|
|
|
+ cl_mem_flags flags;
|
|
|
+ cl_int error;
|
|
|
+ char *data;
|
|
|
+
|
|
|
+ data = malloc (size);
|
|
|
+ memset (data, 0, size);
|
|
|
+ flags = CL_MEM_BUS_ADDRESSABLE_AMD | CL_MEM_COPY_HOST_PTR;
|
|
|
+
|
|
|
+ app->gpu.buffer = clCreateBuffer (app->context, flags, size, data, &error);
|
|
|
+
|
|
|
+ if (error != CL_SUCCESS)
|
|
|
+ return error;
|
|
|
+
|
|
|
+ return clEnqueueMakeBuffersResidentAMD (app->queue, 1, &app->gpu.buffer, CL_TRUE, &app->gpu.addr, 0, NULL, NULL);
|
|
|
}
|
|
|
|
|
|
static bool
|
|
|
init_opencl (App *app)
|
|
|
{
|
|
|
cl_int error;
|
|
|
+ cl_platform_id platform;
|
|
|
|
|
|
app->ocl = ocl_new_with_queues (0, CL_DEVICE_TYPE_GPU, CL_QUEUE_PROFILING_ENABLE);
|
|
|
+ platform = ocl_get_platform (app->ocl);
|
|
|
+
|
|
|
+ clEnqueueMakeBuffersResidentAMD = clGetExtensionFunctionAddressForPlatform (platform, "clEnqueueMakeBuffersResidentAMD");
|
|
|
+
|
|
|
app->device = ocl_get_devices (app->ocl)[0];
|
|
|
app->queue = ocl_get_cmd_queues (app->ocl)[0];
|
|
|
app->context = ocl_get_context (app->ocl);
|
|
@@ -103,8 +179,8 @@ init_opencl (App *app)
|
|
|
app->check_buffer = clCreateBuffer (app->context, CL_MEM_WRITE_ONLY, 8, NULL, &error);
|
|
|
OCL_CHECK_ERROR (error);
|
|
|
|
|
|
- app->fpga_write_buffer = create_fpga_write_buffer (app, 1024 * 64, &error);
|
|
|
- OCL_CHECK_ERROR (error);
|
|
|
+ OCL_CHECK_ERROR (create_fpga_buffer (app, 1024 * 64));
|
|
|
+ OCL_CHECK_ERROR (create_gpu_buffer (app, 1024 * 64));
|
|
|
|
|
|
return error != CL_SUCCESS ? false : true;
|
|
|
}
|
|
@@ -114,22 +190,159 @@ close_opencl (App *app)
|
|
|
{
|
|
|
OCL_CHECK_ERROR (clReleaseKernel (app->kernel));
|
|
|
OCL_CHECK_ERROR (clReleaseProgram (app->program));
|
|
|
- OCL_CHECK_ERROR (clReleaseMemObject (app->fpga_write_buffer));
|
|
|
+ OCL_CHECK_ERROR (clReleaseMemObject (app->fpga_buffer));
|
|
|
OCL_CHECK_ERROR (clReleaseMemObject (app->check_buffer));
|
|
|
ocl_free (app->ocl);
|
|
|
}
|
|
|
|
|
|
/*
 * Print `message` left-aligned in a 32-character column without a newline and
 * flush stdout, so the text is visible before a later call finishes the line.
 */
static void
debug_wait (const char *message)
{
    fprintf (stdout, "%-32s", message);
    fflush (stdout);
}
|
|
|
|
|
|
- value = RD32 (addr);
|
|
|
+static void
|
|
|
+debug_assert (const char *message, bool condition)
|
|
|
+{
|
|
|
+ printf ("%-32s", message);
|
|
|
+
|
|
|
+ if (condition)
|
|
|
+ printf (UNICODE_CHECK_MARK"\n");
|
|
|
+ else
|
|
|
+ printf (UNICODE_CROSS"\n");
|
|
|
+}
|
|
|
+
|
|
|
+static void
|
|
|
+debug_assert_cmp (const char *message, uint32_t value, uint32_t expected)
|
|
|
+{
|
|
|
+ printf ("%-32s", message);
|
|
|
|
|
|
if (value != expected)
|
|
|
- printf ("failed [%u != %u]\n", value, expected);
|
|
|
+ printf (UNICODE_CROSS" [%u != %u]\n", value, expected);
|
|
|
else
|
|
|
- printf ("success\n");
|
|
|
+ printf (UNICODE_CHECK_MARK"\n");
|
|
|
+}
|
|
|
+
|
|
|
+static void
|
|
|
+debug_done (void)
|
|
|
+{
|
|
|
+ printf (UNICODE_CHECK_MARK"\n");
|
|
|
+}
|
|
|
+
|
|
|
+static void
|
|
|
+check_pcie (App *app)
|
|
|
+{
|
|
|
+ uint32_t value;
|
|
|
+
|
|
|
+ WR32 (REG_RESET_DMA, 1);
|
|
|
+ usleep (100000);
|
|
|
+ WR32 (REG_RESET_DMA, 0);
|
|
|
+ usleep (100000);
|
|
|
+
|
|
|
+ value = RD32 (REG_RESET_DMA);
|
|
|
+ debug_assert ("PCIe ready?", value == 335746816 || value == 335681280);
|
|
|
+}
|
|
|
+
|
|
|
+static void
|
|
|
+configure_dma (App *app)
|
|
|
+{
|
|
|
+ const unsigned TLP_SIZE = 32;
|
|
|
+
|
|
|
+ debug_wait ("Configure DMA ...");
|
|
|
+ WR32 (REG_NUM_PACKETS_PER_DESCRIPTOR, 1);
|
|
|
+
|
|
|
+ if (app->board_gen == 3) {
|
|
|
+ WR32 (REG_PACKET_LENGTH, 0x80000 | TLP_SIZE);
|
|
|
+ }
|
|
|
+ else {
|
|
|
+ WR32 (REG_PACKET_LENGTH, TLP_SIZE);
|
|
|
+ }
|
|
|
+
|
|
|
+ WR32 (REG_NUM_DESCRIPTORS, 0);
|
|
|
+ debug_done ();
|
|
|
+}
|
|
|
+
|
|
|
+static void
|
|
|
+configure_dma_descriptors (App *app)
|
|
|
+{
|
|
|
+ debug_wait ("Configure DMA descriptors ...");
|
|
|
+ WR32 (REG_LAST_DESCRIPTOR, 0);
|
|
|
+ WR32 (REG_UPDATE_THRESHOLD, 1);
|
|
|
+ WR32 (REG_UPDATE_ADDRESS, app->kdesc_bus);
|
|
|
+ usleep (100000);
|
|
|
+
|
|
|
+ usleep (1000);
|
|
|
+ WR32 (REG_DESCRIPTOR_ADDRESS, app->gpu.addr.surface_bus_address);
|
|
|
+ debug_done ();
|
|
|
+
|
|
|
+ debug_assert_cmp ("Descriptor address correct?", RD32 (REG_DESCRIPTOR_ADDRESS), app->gpu.addr.surface_bus_address);
|
|
|
+}
|
|
|
+
|
|
|
+static void
|
|
|
+start_dma (App *app)
|
|
|
+{
|
|
|
+ debug_wait ("Start DMA ... ");
|
|
|
+ WR32_sleep (REG_NUM_ROWS, 0);
|
|
|
+ WR32_sleep (REG_NUM_FRAMES, 0);
|
|
|
+ WR32_sleep (REG_CONTROL, 0);
|
|
|
+ WR32_sleep (REG_ENABLE_COUNTER, 0xFF);
|
|
|
+ WR32_sleep (REG_ENABLE_COUNTER, 1);
|
|
|
+ WR32 (REG_START_DMA, 1);
|
|
|
+ debug_done ();
|
|
|
+}
|
|
|
+
|
|
|
+static void
|
|
|
+stop_dma (App *app)
|
|
|
+{
|
|
|
+ debug_wait ("Stop DMA ... ");
|
|
|
+ WR32_sleep (REG_START_DMA, 0);
|
|
|
+ WR32 (REG_RESET_DMA, 1);
|
|
|
+ debug_done ();
|
|
|
+}
|
|
|
+
|
|
|
+static void
|
|
|
+transfer_data (App *app)
|
|
|
+{
|
|
|
+ uint32_t current_ptr;
|
|
|
+ uint32_t hardware_ptr;
|
|
|
+
|
|
|
+ debug_wait ("Transfer data ... ");
|
|
|
+ current_ptr = 0;
|
|
|
+
|
|
|
+ do {
|
|
|
+ hardware_ptr = app->desc[3]; /* only valid for board gen 3 */
|
|
|
+ } while (hardware_ptr == current_ptr);
|
|
|
+
|
|
|
+ debug_done ();
|
|
|
+}
|
|
|
+
|
|
|
+static void
|
|
|
+print_mem_words (App *app, cl_mem buffer, unsigned offset, unsigned range)
|
|
|
+{
|
|
|
+ uint32_t *data;
|
|
|
+ size_t size;
|
|
|
+
|
|
|
+ size = range * sizeof (uint32_t);
|
|
|
+ data = malloc (size);
|
|
|
+
|
|
|
+ OCL_CHECK_ERROR (clEnqueueReadBuffer (app->queue, buffer, CL_TRUE, offset, size, data, 0, NULL, NULL));
|
|
|
+
|
|
|
+ for (unsigned i = 0; i < range; i++) {
|
|
|
+ printf ("0x%08x ", data[i]);
|
|
|
+ }
|
|
|
+
|
|
|
+ printf ("\n");
|
|
|
+ free (data);
|
|
|
+}
|
|
|
+
|
|
|
+static void
|
|
|
+check_value (App *app, const char *message, uint32_t addr, uint32_t expected)
|
|
|
+{
|
|
|
+ uint32_t value;
|
|
|
+
|
|
|
+ value = RD32 (addr);
|
|
|
+ debug_assert_cmp (message, value, expected);
|
|
|
}
|
|
|
|
|
|
static void
|
|
@@ -142,17 +355,21 @@ launch_signal (App *app)
|
|
|
size_t global_work_size;
|
|
|
cl_ulong start, end, queued, submitted;
|
|
|
|
|
|
+ configure_dma (app);
|
|
|
+ configure_dma_descriptors (app);
|
|
|
+ start_dma (app);
|
|
|
+ transfer_data (app);
|
|
|
+
|
|
|
addr = 0x9168;
|
|
|
|
|
|
/* try to override defaultvalue */
|
|
|
value = 0xc001;
|
|
|
|
|
|
WR32 (addr, value);
|
|
|
- printf ("CPU WRITE ... ");
|
|
|
- check_value (app, addr, value);
|
|
|
+ check_value (app, "CPU write check ...", addr, value);
|
|
|
|
|
|
value = 0xdeadf00d;
|
|
|
- OCL_CHECK_ERROR (clSetKernelArg (app->kernel, 0, sizeof (cl_mem), &app->fpga_write_buffer));
|
|
|
+ OCL_CHECK_ERROR (clSetKernelArg (app->kernel, 0, sizeof (cl_mem), &app->fpga_buffer));
|
|
|
OCL_CHECK_ERROR (clSetKernelArg (app->kernel, 1, sizeof (cl_mem), &app->check_buffer));
|
|
|
OCL_CHECK_ERROR (clSetKernelArg (app->kernel, 2, sizeof (uint32_t), &addr));
|
|
|
OCL_CHECK_ERROR (clSetKernelArg (app->kernel, 3, sizeof (uint32_t), &value));
|
|
@@ -168,23 +385,24 @@ launch_signal (App *app)
|
|
|
OCL_CHECK_ERROR (clReleaseEvent (event));
|
|
|
|
|
|
/* let's see if the GPU wrote anything */
|
|
|
- printf ("GPU WRITE ... ");
|
|
|
- check_value (app, addr, value);
|
|
|
+ check_value (app, "GPU write check ...", addr, value);
|
|
|
|
|
|
/* let's see if the kernel did at least something */
|
|
|
- printf ("SANITY ...... ");
|
|
|
+ printf ("%-32s", "Sanity check ...");
|
|
|
|
|
|
check[0] = check[1] = 0;
|
|
|
OCL_CHECK_ERROR (clEnqueueReadBuffer (app->queue, app->check_buffer, CL_TRUE, 0, 8, check, 0, NULL, NULL));
|
|
|
|
|
|
if (check[0] == addr && check[1] == value)
|
|
|
- printf ("success\n");
|
|
|
+ printf (UNICODE_CHECK_MARK"\n");
|
|
|
else
|
|
|
- printf ("failed [0x%x != %p || 0x%x != 0x%x]\n", check[0], (void *) addr, check[1], value);
|
|
|
+ printf (UNICODE_CROSS" [0x%x != %p || 0x%x != 0x%x]\n", check[0], (void *) addr, check[1], value);
|
|
|
|
|
|
printf ("> exec : %lu ns\n", end - start);
|
|
|
printf ("> submit: %lu ns\n", end - submitted);
|
|
|
printf ("> queue : %lu ns\n", end - queued);
|
|
|
+
|
|
|
+ stop_dma (app);
|
|
|
}
|
|
|
|
|
|
int
|
|
@@ -198,6 +416,7 @@ main (int argc, char const* argv[])
|
|
|
if (!init_opencl (&app))
|
|
|
return 1;
|
|
|
|
|
|
+ check_pcie (&app); /* FIXME: Without this, there are no data transfers */
|
|
|
launch_signal (&app);
|
|
|
|
|
|
close_opencl (&app);
|