Browse Source

Write from FPGA to GPU

Matthias Vogelgesang 8 years ago
parent
commit
fecd3db378
4 changed files with 252 additions and 22 deletions
  1. 6 2
      kernel.cl
  2. 6 0
      ocl.c
  3. 1 0
      ocl.h
  4. 239 20
      signal.c

+ 6 - 2
kernel.cl

@@ -1,9 +1,13 @@
+#define REG_DESCRIPTOR_ADDRESS  0x50
+
+/* Store `value` at byte offset `addr` of a uint buffer (element index is
+ * addr / 4).  Every argument and the whole expansion are parenthesised so
+ * that expression arguments such as `base + 4` index correctly; the caller
+ * supplies the terminating ';'. */
+#define WR32(buffer, addr, value) ((buffer)[(addr) / 4] = (value))
+
 kernel void
-write_to_fpga (global uint *buffer, global uint *check, uint addr, uint value)
+write_to_fpga (global uint *remote, global uint *check, uint addr, uint value)
 {
     if (get_global_id (0) == 0) {
         /* divide by four to avoid word addressing */
-        buffer[addr / 4] = value;
+        WR32 (remote, addr, value);
         check[0] = addr;
         check[1] = value;
     }

+ 6 - 0
ocl.c

@@ -298,6 +298,12 @@ ocl_free (OclPlatform *ocl)
     free (ocl);
 }
 
+/* Return the platform handle stored in `ocl`. */
+cl_platform_id
+ocl_get_platform (OclPlatform *ocl)
+{
+    cl_platform_id platform = ocl->platform;
+
+    return platform;
+}
+
 char *
 ocl_get_platform_info (OclPlatform *ocl,
                        cl_platform_info param)

+ 1 - 0
ocl.h

@@ -42,6 +42,7 @@ OclPlatform *       ocl_new_from_args   (int                 argc,
                                                              queue_properties);
 void                ocl_print_usage     (void);
 void                ocl_free            (OclPlatform        *ocl);
+cl_platform_id      ocl_get_platform    (OclPlatform        *ocl);
 char *              ocl_get_platform_info
                                         (OclPlatform        *ocl,
                                          cl_platform_info    param);

+ 239 - 20
signal.c

@@ -1,9 +1,13 @@
+#define _XOPEN_SOURCE 500
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdbool.h>
 #include <string.h>
+#include <unistd.h>
 #include <pcilib.h>
 #include <pcilib/bar.h>
+#include <pcilib/kmem.h>
 #include <CL/cl.h>
 #include <CL/cl_ext.h>
 #include "ocl.h"
@@ -16,6 +20,10 @@ typedef struct {
     pcilib_t *pci;
     uint8_t *bar;
     cl_ulong bar_phys;
+    uint8_t board_gen;
+    pcilib_kmem_handle_t *kdesc;
+    uintptr_t kdesc_bus;
+    volatile uint32_t *desc;
 
     /* OpenCL */
     OclPlatform *ocl;
@@ -24,14 +32,49 @@ typedef struct {
     cl_context context;
     cl_program program;
     cl_kernel kernel;
-    cl_mem fpga_write_buffer;
     cl_mem check_buffer;
+
+    /* both */
+    cl_mem fpga_buffer;
+
+    struct {
+        cl_mem buffer;
+        cl_bus_address_amd addr;
+    } gpu;
 } App;
 
+#define UNICODE_CHECK_MARK      "\u2713"
+#define UNICODE_CROSS           "\u2717"
+
+/* Flag set passed when allocating and freeing kernel memory.  Parenthesised
+ * so the OR-ed list stays a single operand wherever the macro is expanded. */
+#define KMEM_DEFAULT_FLAGS      (PCILIB_KMEM_FLAG_HARDWARE | \
+                                 PCILIB_KMEM_FLAG_PERSISTENT | \
+                                 PCILIB_KMEM_FLAG_EXCLUSIVE)
+
+#define KMEM_USE_RING           PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 1)
+#define KMEM_USE_DEFAULT        PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 2)
+
+#define REG_RESET_DMA                   0x00
+#define REG_START_DMA                   0x04
+#define REG_NUM_PACKETS_PER_DESCRIPTOR  0x10
+#define REG_PACKET_LENGTH               0x0C
+#define REG_DESCRIPTOR_ADDRESS          0x50
+#define REG_UPDATE_ADDRESS              0x54
+#define REG_LAST_DESCRIPTOR             0x58
+#define REG_NUM_DESCRIPTORS             0x5C
+#define REG_UPDATE_THRESHOLD            0x60
+
+#define REG_ENABLE_COUNTER              0x9000
+#define REG_CONTROL                     0x9040
+#define REG_NUM_ROWS                    0x9168
+#define REG_NUM_FRAMES                  0x9170
 
 #define WR32(addr, value) *(uint32_t *) (app->bar + (addr)) = (value);
 #define RD32(addr) (*(uint32_t *) (app->bar + (addr)))
 
+#define WR32_sleep(addr, value) *(uint32_t *) (app->bar + (addr)) = (value); usleep (100);
+
+static clEnqueueMakeBuffersResidentAMD_fn clEnqueueMakeBuffersResidentAMD = NULL;
+
 
 /* declaration should actually come from a distributed header file */
 const pcilib_board_info_t *pcilib_get_board_info (pcilib_t *);
@@ -60,6 +103,12 @@ init_pcilib (App *app)
 
     board = pcilib_get_board_info (app->pci);
     app->bar_phys = board->bar_start[PCILIB_BAR0];
+    app->board_gen = RD32 (0x18) & 0xF;
+
+    app->kdesc = pcilib_alloc_kernel_memory (app->pci, PCILIB_KMEM_TYPE_CONSISTENT, 1, 128, 4096, KMEM_USE_RING, KMEM_DEFAULT_FLAGS);
+    app->kdesc_bus = pcilib_kmem_get_block_ba (app->pci, app->kdesc, 0);
+    app->desc = (uint32_t *) pcilib_kmem_get_block_ua (app->pci, app->kdesc, 0);
+    memset ((uint32_t *) app->desc, 0, 5 * sizeof (uint32_t));
 
     return true;
 }
@@ -67,29 +116,56 @@ init_pcilib (App *app)
 static void
 close_pcilib (App *app)
 {
+    pcilib_free_kernel_memory (app->pci, app->kdesc, KMEM_DEFAULT_FLAGS);
     pcilib_unmap_bar (app->pci, PCILIB_BAR0, (void *) app->bar);
     pcilib_close (app->pci);
 }
 
-static cl_mem
-create_fpga_write_buffer (App *app, size_t size, cl_int *error)
+static cl_int
+create_fpga_buffer (App *app, size_t size)
 {
     cl_mem_flags flags;
     cl_bus_address_amd addr;
+    cl_int error;
 
     flags = CL_MEM_EXTERNAL_PHYSICAL_AMD;
     addr.surface_bus_address = (cl_ulong) app->bar_phys;
     addr.marker_bus_address = (cl_ulong) app->bar_phys;
 
-    return clCreateBuffer (app->context, flags, size, &addr, error);
+    app->fpga_buffer = clCreateBuffer (app->context, flags, size, &addr, &error);
+    return error;
+}
+
+/*
+ * Create a zero-initialised buffer of `size` bytes with the
+ * CL_MEM_BUS_ADDRESSABLE_AMD flag, store it in app->gpu.buffer and make it
+ * resident; &app->gpu.addr is handed to the driver to receive the buffer's
+ * bus address.
+ *
+ * Returns CL_SUCCESS, or the OpenCL error code of the first step that failed.
+ */
+static cl_int
+create_gpu_buffer (App *app, size_t size)
+{
+    cl_mem_flags flags;
+    cl_int error;
+    char *data;
+
+    /* the entry point is looked up at run time and may be unavailable */
+    if (clEnqueueMakeBuffersResidentAMD == NULL)
+        return CL_INVALID_OPERATION;
+
+    data = calloc (1, size);
+
+    if (data == NULL)
+        return CL_OUT_OF_HOST_MEMORY;
+
+    flags = CL_MEM_BUS_ADDRESSABLE_AMD | CL_MEM_COPY_HOST_PTR;
+    app->gpu.buffer = clCreateBuffer (app->context, flags, size, data, &error);
+
+    /* CL_MEM_COPY_HOST_PTR copies the host data while the buffer is created,
+     * so the staging memory can be released right away */
+    free (data);
+
+    if (error != CL_SUCCESS)
+        return error;
+
+    return clEnqueueMakeBuffersResidentAMD (app->queue, 1, &app->gpu.buffer, CL_TRUE, &app->gpu.addr, 0, NULL, NULL);
 }
 
 static bool
 init_opencl (App *app)
 {
     cl_int error;
+    cl_platform_id platform;
 
     app->ocl = ocl_new_with_queues (0, CL_DEVICE_TYPE_GPU, CL_QUEUE_PROFILING_ENABLE);
+    platform = ocl_get_platform (app->ocl);
+
+    clEnqueueMakeBuffersResidentAMD = clGetExtensionFunctionAddressForPlatform (platform, "clEnqueueMakeBuffersResidentAMD");
+
     app->device = ocl_get_devices (app->ocl)[0];
     app->queue = ocl_get_cmd_queues (app->ocl)[0];
     app->context = ocl_get_context (app->ocl);
@@ -103,8 +179,8 @@ init_opencl (App *app)
     app->check_buffer = clCreateBuffer (app->context, CL_MEM_WRITE_ONLY, 8, NULL, &error);
     OCL_CHECK_ERROR (error);
 
-    app->fpga_write_buffer = create_fpga_write_buffer (app, 1024 * 64, &error);
-    OCL_CHECK_ERROR (error);
+    OCL_CHECK_ERROR (create_fpga_buffer (app, 1024 * 64));
+    OCL_CHECK_ERROR (create_gpu_buffer (app, 1024 * 64));
 
     return error != CL_SUCCESS ? false : true;
 }
@@ -114,22 +190,159 @@ close_opencl (App *app)
 {
     OCL_CHECK_ERROR (clReleaseKernel (app->kernel));
     OCL_CHECK_ERROR (clReleaseProgram (app->program));
-    OCL_CHECK_ERROR (clReleaseMemObject (app->fpga_write_buffer));
+    OCL_CHECK_ERROR (clReleaseMemObject (app->fpga_buffer));
     OCL_CHECK_ERROR (clReleaseMemObject (app->check_buffer));
     ocl_free (app->ocl);
 }
 
 static void
-check_value (App *app, uint32_t addr, uint32_t expected)
+debug_wait (const char *message)
 {
-    uint32_t value;
+    printf ("%-32s", message);
+    fflush (stdout);
+}
 
-    value = RD32 (addr);
+/* Print `message` left-aligned and padded to 32 characters, followed by a
+ * check mark when `condition` is true and a cross otherwise. */
+static void
+debug_assert (const char *message, bool condition)
+{
+    const char *verdict;
+
+    verdict = condition ? UNICODE_CHECK_MARK"\n" : UNICODE_CROSS"\n";
+
+    printf ("%-32s", message);
+    fputs (verdict, stdout);
+}
+
+static void
+debug_assert_cmp (const char *message, uint32_t value, uint32_t expected)
+{
+    printf ("%-32s", message);
 
     if (value != expected)
-        printf ("failed [%u != %u]\n", value, expected);
+        printf (UNICODE_CROSS" [%u != %u]\n", value, expected);
     else
-        printf ("success\n");
+        printf (UNICODE_CHECK_MARK"\n");
+}
+
+/* Finish a status line with a check mark and a newline. */
+static void
+debug_done (void)
+{
+    fputs (UNICODE_CHECK_MARK"\n", stdout);
+}
+
+/* Pulse the DMA reset register (write 1, then 0, waiting 100 ms after each
+ * write) and report whether it reads back one of the two accepted values. */
+static void
+check_pcie (App *app)
+{
+    /* NOTE(review): the meaning of these two read-back values is not visible
+     * in this file — confirm against the hardware documentation */
+    const uint32_t ready_a = 0x14031700;    /* 335746816 */
+    const uint32_t ready_b = 0x14021700;    /* 335681280 */
+    uint32_t status;
+    bool ready;
+
+    WR32 (REG_RESET_DMA, 1);
+    usleep (100000);
+    WR32 (REG_RESET_DMA, 0);
+    usleep (100000);
+
+    status = RD32 (REG_RESET_DMA);
+    ready = (status == ready_a) || (status == ready_b);
+    debug_assert ("PCIe ready?", ready);
+}
+
+/* Program one packet per descriptor, the packet length and a descriptor
+ * count of zero, reporting progress on stdout. */
+static void
+configure_dma (App *app)
+{
+    const unsigned tlp_size = 32;
+    uint32_t packet_length;
+
+    debug_wait ("Configure DMA ...");
+    WR32 (REG_NUM_PACKETS_PER_DESCRIPTOR, 1);
+
+    /* boards reporting generation 3 additionally get bit 0x80000 set */
+    packet_length = tlp_size;
+
+    if (app->board_gen == 3)
+        packet_length |= 0x80000;
+
+    WR32 (REG_PACKET_LENGTH, packet_length);
+    WR32 (REG_NUM_DESCRIPTORS, 0);
+    debug_done ();
+}
+
+/* Write the descriptor update address (kernel memory bus address) and the
+ * descriptor address (GPU buffer bus address) to the device, then read the
+ * descriptor address back and report whether it matches. */
+static void
+configure_dma_descriptors (App *app)
+{
+    uint32_t gpu_bus_addr;
+
+    /* NOTE(review): the registers are accessed as 32-bit words, so only the
+     * low 32 bits of the bus addresses are written and compared — confirm
+     * that the addresses handed out always fit */
+    gpu_bus_addr = (uint32_t) app->gpu.addr.surface_bus_address;
+
+    debug_wait ("Configure DMA descriptors ...");
+    WR32 (REG_LAST_DESCRIPTOR, 0);
+    WR32 (REG_UPDATE_THRESHOLD, 1);
+    WR32 (REG_UPDATE_ADDRESS, app->kdesc_bus);
+    usleep (100000);
+
+    usleep (1000);
+    WR32 (REG_DESCRIPTOR_ADDRESS, gpu_bus_addr);
+    debug_done ();
+
+    debug_assert_cmp ("Descriptor address correct?", RD32 (REG_DESCRIPTOR_ADDRESS), gpu_bus_addr);
+}
+
+/* Run the register write sequence that precedes a transfer (each write is
+ * followed by a short sleep) and then set the start register. */
+static void
+start_dma (App *app)
+{
+    /* written strictly in this order */
+    static const struct {
+        uint32_t addr;
+        uint32_t value;
+    } setup[] = {
+        { REG_NUM_ROWS,       0    },
+        { REG_NUM_FRAMES,     0    },
+        { REG_CONTROL,        0    },
+        { REG_ENABLE_COUNTER, 0xFF },
+        { REG_ENABLE_COUNTER, 1    },
+    };
+
+    debug_wait ("Start DMA ... ");
+
+    for (size_t i = 0; i < sizeof setup / sizeof setup[0]; i++) {
+        WR32_sleep (setup[i].addr, setup[i].value);
+    }
+
+    WR32 (REG_START_DMA, 1);
+    debug_done ();
+}
+
+/* Clear the start register, wait 100 us, then assert the DMA reset. */
+static void
+stop_dma (App *app)
+{
+    debug_wait ("Stop DMA ... ");
+
+    WR32 (REG_START_DMA, 0);
+    usleep (100);
+
+    WR32 (REG_RESET_DMA, 1);
+    debug_done ();
+}
+
+/* Block until word 3 of the descriptor memory becomes non-zero.
+ * NOTE(review): this is a busy-wait without a timeout; it never returns if
+ * the word is never updated. */
+static void
+transfer_data (App *app)
+{
+    const uint32_t initial_ptr = 0;
+    uint32_t hw_ptr;
+
+    debug_wait ("Transfer data ... ");
+
+    for (;;) {
+        hw_ptr = app->desc[3];   /* only valid for board gen 3 */
+
+        if (hw_ptr != initial_ptr)
+            break;
+    }
+
+    debug_done ();
+}
+
+/*
+ * Read `range` 32-bit words from `buffer` with a blocking read and print
+ * them in hex on one line.  `offset` is passed unchanged to
+ * clEnqueueReadBuffer, i.e. it is a byte offset, not a word index.
+ *
+ * Prints a message on stderr and returns without output if the temporary
+ * host buffer cannot be allocated.
+ */
+static void
+print_mem_words (App *app, cl_mem buffer, unsigned offset, unsigned range)
+{
+    uint32_t *data;
+    size_t size;
+
+    size = range * sizeof *data;
+    data = malloc (size);
+
+    if (data == NULL) {
+        fprintf (stderr, "print_mem_words: cannot allocate %zu bytes\n", size);
+        return;
+    }
+
+    OCL_CHECK_ERROR (clEnqueueReadBuffer (app->queue, buffer, CL_TRUE, offset, size, data, 0, NULL, NULL));
+
+    for (unsigned i = 0; i < range; i++) {
+        /* cast so the argument matches the %x conversion exactly */
+        printf ("0x%08x ", (unsigned) data[i]);
+    }
+
+    printf ("\n");
+    free (data);
+}
+
+/* Read the 32-bit register at `addr` and report, labelled with `message`,
+ * whether it equals `expected`. */
+static void
+check_value (App *app, const char *message, uint32_t addr, uint32_t expected)
+{
+    debug_assert_cmp (message, RD32 (addr), expected);
 }
 
 static void
@@ -142,17 +355,21 @@ launch_signal (App *app)
     size_t global_work_size;
     cl_ulong start, end, queued, submitted;
 
+    configure_dma (app);
+    configure_dma_descriptors (app);
+    start_dma (app);
+    transfer_data (app);
+
     addr = 0x9168;
 
     /* try to override the default value */
     value = 0xc001;
 
     WR32 (addr, value);
-    printf ("CPU WRITE ... ");
-    check_value (app, addr, value);
+    check_value (app, "CPU write check ...", addr, value);
 
     value = 0xdeadf00d;
-    OCL_CHECK_ERROR (clSetKernelArg (app->kernel, 0, sizeof (cl_mem), &app->fpga_write_buffer));
+    OCL_CHECK_ERROR (clSetKernelArg (app->kernel, 0, sizeof (cl_mem), &app->fpga_buffer));
     OCL_CHECK_ERROR (clSetKernelArg (app->kernel, 1, sizeof (cl_mem), &app->check_buffer));
     OCL_CHECK_ERROR (clSetKernelArg (app->kernel, 2, sizeof (uint32_t), &addr));
     OCL_CHECK_ERROR (clSetKernelArg (app->kernel, 3, sizeof (uint32_t), &value));
@@ -168,23 +385,24 @@ launch_signal (App *app)
     OCL_CHECK_ERROR (clReleaseEvent (event));
 
     /* let's see if the GPU wrote anything */
-    printf ("GPU WRITE ... ");
-    check_value (app, addr, value);
+    check_value (app, "GPU write check ...", addr, value);
 
     /* let's see if the kernel did at least something */
-    printf ("SANITY ...... ");
+    printf ("%-32s", "Sanity check ...");
 
     check[0] = check[1] = 0;
     OCL_CHECK_ERROR (clEnqueueReadBuffer (app->queue, app->check_buffer, CL_TRUE, 0, 8, check, 0, NULL, NULL));
 
     if (check[0] == addr && check[1] == value)
-        printf ("success\n");
+        printf (UNICODE_CHECK_MARK"\n");
     else
-        printf ("failed [0x%x != %p || 0x%x != 0x%x]\n", check[0], (void *) addr, check[1], value);
+        printf (UNICODE_CROSS" [0x%x != %p || 0x%x != 0x%x]\n", check[0], (void *) addr, check[1], value);
 
     printf ("> exec  : %lu ns\n", end - start);
     printf ("> submit: %lu ns\n", end - submitted);
     printf ("> queue : %lu ns\n", end - queued);
+
+    stop_dma (app);
 }
 
 int
@@ -198,6 +416,7 @@ main (int argc, char const* argv[])
     if (!init_opencl (&app))
         return 1;
 
+    check_pcie (&app);   /* FIXME: Without this, there are no data transfers */
     launch_signal (&app);
 
     close_opencl (&app);