SHA1
--- a/check.c
+++ b/check.c
@@ -13,6 +13,8 @@
 
				 #include <CL/cl_ext.h>
			
 
				 #include "ocl.h"
			
 
				 
			
 
				+#include "hf-interface.h"
			
 
				+
			
 
				 /* this should actually come from the distributed pcitool sources */
			
 
				 #include "pciDriver.h"
			
 
				 
			
@@ -42,12 +44,24 @@ typedef struct {
 
				         cl_mem buffer;
			
 
				         cl_bus_address_amd addr;
			
 
				     } gpu;
			
 
				+    
			
 
				+    struct {
			
 
				+        cl_mem buffer;
			
 
				+        cl_bus_address_amd addr;
			
 
				+    } latency_data;
			
 
				 
			
 
				     struct {
			
 
				         pcilib_kmem_handle_t *kmem;
			
 
				         uint32_t *buffer;
			
 
				         uintptr_t addr;
			
 
				     } cpu;
			
 
				+    
			
 
				+    struct {
			
 
				+        pcilib_kmem_handle_t *kmem;
			
 
				+        uint32_t *buffer;
			
 
				+        uintptr_t addr;
			
 
				+    } cpu_latency_data;
			
 
				+    
			
 
				 } App;
			
 
				 
			
 
				 #define UNICODE_CHECK_MARK      "o"
			
@@ -60,33 +74,6 @@ typedef struct {
 
				 #define KMEM_USE_RING           PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 1)
			
 
				 #define KMEM_USE_DEFAULT        PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 2)
			
 
				 
			
 
				-#define REG_RESET_DMA                   0x00
			
 
				-#define REG_DMA                         0x04
			
 
				-#define REG_NUM_PACKETS_PER_DESCRIPTOR  0x10
			
 
				-#define REG_PERF_COUNTER                0x28
			
 
				-#define REG_PACKET_LENGTH               0x0C
			
 
				-#define REG_DESCRIPTOR_ADDRESS          0x50
			
 
				-#define REG_UPDATE_ADDRESS              0x58
			
 
				-#define REG_UPDATE_THRESHOLD            0x60
			
 
				-
			
 
				-#define REG_COUNTER                     0x9000
			
 
				-#define REG_VERSION                     0x9020
			
 
				-#define REG_CONTROL                     0x9040
			
 
				-#define REG_LATENCY                     0x9044
			
 
				-#define REG_NUM_ROWS                    0x9168
			
 
				-#define REG_NUM_FRAMES                  0x9170
			
 
				-
			
 
				-#define REG_DBG_RQ_RST                  0x9344
			
 
				-#define REG_DBG_CQ_RST                  0x93A4
			
 
				-
			
 
				-#define CMD_DMA_START           0x1
			
 
				-#define CMD_DMA_STOP            0x0
			
 
				-#define CMD_COUNTER_RESET       0xf0
			
 
				-#define CMD_COUNTER_START       0x1
			
 
				-#define CMD_COUNTER_STOP        0x0
			
 
				-#define CMD_LATENCY_RESET       0x0800000f
			
 
				-#define CMD_LATENCY_STOP        0xf0000000
			
 
				-
			
 
				 #define WR32(addr, value) *(uint32_t *) (app->bar + (addr)) = (value);
			
 
				 #define RD32(addr) (*(uint32_t *) (app->bar + (addr)))
			
 
				 #define WR32_sleep(addr, value) *(uint32_t *) (app->bar + (addr)) = (value); usleep (100);
			
@@ -106,7 +93,7 @@ static uint32_t         NUM_PAGES           = 1;
 
				 static size_t           GPU_BUFFER_SIZE     = 4096;
			
 
				 static size_t           CPU_BUFFER_SIZE     = 4096;
			
 
				 static size_t           CHECK_BUFFER_SIZE   = 8;
			
 
				-
			
 
				+static size_t           COUNTER_DATA_SIZE   = 4096; // 2 GB
			
 
				 
			
 
				 /* declaration should actually come from a distributed header file */
			
 
				 const pcilib_board_info_t *pcilib_get_board_info (pcilib_t *);
			
@@ -147,7 +134,7 @@ init_pcilib (App *app)
 
				 
			
 
				     memset ((uint32_t *) app->desc, 0, 5 * sizeof (uint32_t));
			
 
				 
			
 
				-    printf ("%-16s 0x%X\n", "Firmware", RD32 (REG_VERSION));
			
 
				+    printf ("%-16s 0x%X\n", "Firmware", RD32 (HF_REG_VERSION));
			
 
				 
			
 
				     return true;
			
 
				 }
			
@@ -159,7 +146,9 @@ close_pcilib (App *app)
 
				 
			
 
				     pcilib_free_kernel_memory (app->pci, app->kdesc, KMEM_DEFAULT_FLAGS);
			
 
				     pcilib_free_kernel_memory (app->pci, app->cpu.kmem, KMEM_DEFAULT_FLAGS);
			
 
				-
			
 
				+    
			
 
				+    pcilib_free_kernel_memory (app->pci, app->cpu_latency_data.kmem, KMEM_DEFAULT_FLAGS);
			
 
				+    
			
 
				     pcilib_clean_kernel_memory (app->pci, KMEM_USE_DEFAULT, flags);
			
 
				 
			
 
				     pcilib_unmap_bar (app->pci, PCILIB_BAR0, (void *) app->bar);
			
@@ -200,6 +189,26 @@ create_gpu_buffer (App *app, size_t size)
 
				     return clEnqueueMakeBuffersResidentAMD (app->queue, 1, &app->gpu.buffer, CL_TRUE, &app->gpu.addr, 0, NULL, NULL);
			
 
				 }
			
 
				 
			
 
				+static cl_int
			
 
				+create_latency_data_buffer (App *app, size_t size)
			
 
				+{
			
 
				+    cl_mem_flags flags;
			
 
				+    cl_int error;
			
 
				+    char *data;
			
 
				+
			
 
				+    data = malloc (size);
			
 
				+    memset (data, 0xAD, size);
			
 
				+    flags = CL_MEM_BUS_ADDRESSABLE_AMD | CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE;
			
 
				+
			
 
				+    app->latency_data.buffer = clCreateBuffer (app->context, flags, size, data, &error);
			
 
				+
			
 
				+    if (error != CL_SUCCESS)
			
 
				+        return error;
			
 
				+
			
 
				+    return clEnqueueMakeBuffersResidentAMD (app->queue, 1, &app->latency_data.buffer, CL_TRUE, &app->latency_data.addr, 0, NULL, NULL);
			
 
				+}
			
 
				+
			
 
				+
			
 
				 static cl_int
			
 
				 create_cpu_buffer (App *app, size_t size)
			
 
				 {
			
@@ -217,6 +226,24 @@ create_cpu_buffer (App *app, size_t size)
 
				     return 0;
			
 
				 }
			
 
				 
			
 
				+static cl_int
			
 
				+create_cpu_latency_data_buffer (App *app, size_t size)
			
 
				+{
			
 
				+    pcilib_kmem_flags_t flags = PCILIB_KMEM_FLAG_HARDWARE | PCILIB_KMEM_FLAG_PERSISTENT | PCILIB_KMEM_FLAG_EXCLUSIVE;
			
 
				+
			
 
				+    app->cpu_latency_data.kmem = pcilib_alloc_kernel_memory (app->pci,
			
 
				+                     /*PCILIB_KMEM_TYPE_DMA_C2S_PAGE*/ PCILIB_KMEM_TYPE_CONSISTENT,
			
 
				+                     1, size, 16, KMEM_USE_DEFAULT, flags);
			
 
				+
			
 
				+    app->cpu_latency_data.addr = pcilib_kmem_get_block_ba(app->pci, app->cpu_latency_data.kmem, 0);
			
 
				+    app->cpu_latency_data.buffer = (uint32_t*) pcilib_kmem_get_block_ua(app->pci, app->cpu_latency_data.kmem, 0);
			
 
				+
			
 
				+    memset (app->cpu_latency_data.buffer, 0x8A, 16);
			
 
				+
			
 
				+    return 0;
			
 
				+}
			
 
				+
			
 
				+
			
 
				 static bool
			
 
				 init_opencl (App *app)
			
 
				 {
			
@@ -247,9 +274,15 @@ init_opencl (App *app)
 
				 
			
 
				     error |= create_gpu_buffer (app, GPU_BUFFER_SIZE);
			
 
				     OCL_CHECK_ERROR (error);
			
 
				+    
			
 
				+    error |= create_latency_data_buffer (app, 16);
			
 
				+    OCL_CHECK_ERROR (error);
			
 
				 
			
 
				     error |= create_cpu_buffer (app, CPU_BUFFER_SIZE);
			
 
				     OCL_CHECK_ERROR (error);
			
 
				+    
			
 
				+    error |= create_cpu_latency_data_buffer (app, 16);
			
 
				+    OCL_CHECK_ERROR (error);
			
 
				 
			
 
				     return error != CL_SUCCESS ? false : true;
			
 
				 }
			
@@ -261,6 +294,8 @@ close_opencl (App *app)
 
				     OCL_CHECK_ERROR (clReleaseProgram (app->program));
			
 
				     OCL_CHECK_ERROR (clReleaseMemObject (app->fpga_buffer));
			
 
				     OCL_CHECK_ERROR (clReleaseMemObject (app->check_buffer));
			
 
				+    OCL_CHECK_ERROR (clReleaseMemObject (app->gpu.buffer));
			
 
				+    OCL_CHECK_ERROR (clReleaseMemObject (app->latency_data.buffer));
			
 
				     ocl_free (app->ocl);
			
 
				 }
			
 
				 
			
@@ -302,21 +337,21 @@ configure_dma (App *app)
 
				 {
			
 
				     uint32_t value;
			
 
				 
			
 
				-    WR32 (REG_RESET_DMA, 1);
			
 
				+    WR32 (HF_REG_DMA, 1);
			
 
				     usleep (100000);
			
 
				-    WR32 (REG_RESET_DMA, 0);
			
 
				+    WR32 (HF_REG_DMA, 0);
			
 
				     usleep (100000);
			
 
				 
			
 
				-    value = RD32 (REG_RESET_DMA);
			
 
				+    value = RD32 (HF_REG_DMA);
			
 
				     debug_assert ("PCIe check", value == 335746816 || value == 335681280);
			
 
				 
			
 
				-    WR32 (REG_NUM_PACKETS_PER_DESCRIPTOR, NUM_PAGES * PAGE_SIZE / (4 * TLP_SIZE));
			
 
				+    WR32 (HF_REG_NUM_PACKETS, NUM_PAGES * PAGE_SIZE / (4 * TLP_SIZE));
			
 
				 
			
 
				     if (app->board_gen == 3) {
			
 
				-        WR32 (REG_PACKET_LENGTH, 0x80000 | TLP_SIZE);
			
 
				+        WR32 (HF_REG_PACKET_LENGTH, 0x80000 | TLP_SIZE);
			
 
				     }
			
 
				     else {
			
 
				-        WR32 (REG_PACKET_LENGTH, TLP_SIZE);
			
 
				+        WR32 (HF_REG_PACKET_LENGTH, TLP_SIZE);
			
 
				     }
			
 
				 
			
 
				     /* reset host side addr */
			
@@ -326,37 +361,45 @@ configure_dma (App *app)
 
				 static void
			
 
				 configure_dma_descriptors (App *app)
			
 
				 {
			
 
				-    WR32 (REG_UPDATE_THRESHOLD, 0);
			
 
				-    WR64 (REG_UPDATE_ADDRESS, app->kdesc_bus);
			
 
				+    WR32 (HF_REG_UPDATE_THRESHOLD, 0);
			
 
				+    WR64 (HF_REG_UPDATE_ADDRESS, app->kdesc_bus);
			
 
				     usleep (100000);
			
 
				 }
			
 
				 
			
 
				 static void
			
 
				 setup_counter (App *app)
			
 
				 {
			
 
				-    WR32_sleep (REG_NUM_ROWS, 0);
			
 
				-    WR32_sleep (REG_NUM_FRAMES, 0);
			
 
				-    WR32_sleep (REG_CONTROL, CMD_LATENCY_RESET);
			
 
				-    WR32_sleep (REG_CONTROL, CMD_COUNTER_STOP);
			
 
				-    WR32_sleep (REG_COUNTER, CMD_COUNTER_RESET);
			
 
				-    WR32_sleep (REG_COUNTER, CMD_COUNTER_START);
			
 
				+    WR32_sleep (HF_REG_INTERCONNECT, 
			
 
				+                HF_INTERCONNECT_DDR_FROM_CNT | 
			
 
				+                HF_INTERCONNECT_DDR_TO_DMA |
			
 
				+                HF_INTERCONNECT_MASTER_DMA);
			
 
				+                
			
 
				+    WR32_sleep (HF_REG_DCG_UPPER_LIMIT, COUNTER_DATA_SIZE);
			
 
				+    WR32_sleep (HF_REG_DDR_UPPER_ADDR, 0xFFFFFFFF);
			
 
				+    WR32_sleep (HF_REG_DCG, HF_DCG_RESET );
			
 
				+    WR32_sleep (HF_REG_DCG, HF_DCG_START);
			
 
				+    sleep (3);  // sleep until ddr is complete 
			
 
				+    WR32_sleep (HF_REG_DCG, HF_DCG_STOP);   
			
 
				 }
			
 
				 
			
 
				 static void
			
 
				 stop_dma (App *app)
			
 
				 {
			
 
				-    WR32_sleep (REG_DMA, 0);
			
 
				+    WR32_sleep (HF_REG_DMA, 0);
			
 
				 }
			
 
				 
			
 
				 static void
			
 
				 reset_dbg (App *app)
			
 
				 {
			
 
				+    // RC
			
 
				+    WR32_sleep (HF_REG_DEBUG_RC_RESET, 1);
			
 
				+    WR32_sleep (HF_REG_DEBUG_RC_RESET, 0);
			
 
				     // TX
			
 
				-    WR32_sleep (REG_DBG_RQ_RST, 1);
			
 
				-    WR32_sleep (REG_DBG_RQ_RST, 0);
			
 
				+    WR32_sleep (HF_REG_DEBUG_REQUESTER_RESET, 1);
			
 
				+    WR32_sleep (HF_REG_DEBUG_REQUESTER_RESET, 0);
			
 
				     // RX
			
 
				-    WR32_sleep (REG_DBG_CQ_RST, 1);
			
 
				-    WR32_sleep (REG_DBG_CQ_RST, 0);
			
 
				+    WR32_sleep (HF_REG_DEBUG_COMPLETER_RESET, 1);
			
 
				+    WR32_sleep (HF_REG_DEBUG_COMPLETER_RESET, 0);
			
 
				 }
			
 
				 
			
 
				 static double
			
@@ -460,18 +503,18 @@ measure_fpga_to_gpu_latency_with_marker (App *app)
 
				 
			
 
				     configure_dma (app);
			
 
				 
			
 
				-    WR32 (REG_UPDATE_THRESHOLD, 0);
			
 
				-    WR64_sleep (REG_UPDATE_ADDRESS, app->gpu.addr.marker_bus_address);
			
 
				+    WR32 (HF_REG_UPDATE_THRESHOLD, 0);
			
 
				+    WR64_sleep (HF_REG_UPDATE_ADDRESS, app->gpu.addr.marker_bus_address);
			
 
				 
			
 
				     setup_counter (app);
			
 
				 
			
 
				-    WR64_sleep (REG_DESCRIPTOR_ADDRESS, app->gpu.addr.surface_bus_address);
			
 
				+    WR64_sleep (HF_REG_DESCRIPTOR_ADDRESS, app->gpu.addr.surface_bus_address);
			
 
				 
			
 
				     reset_dbg (app);
			
 
				     OCL_CHECK_ERROR (clEnqueueWaitSignalAMD (app->queue, app->gpu.buffer, 0xd0dad0da, 0, NULL, &event));
			
 
				 
			
 
				     clock_gettime (CLOCK_MONOTONIC, &start);
			
 
				-    WR32 (REG_DMA, 1);
			
 
				+    WR32 (HF_REG_DMA, 1);
			
 
				 
			
 
				     OCL_CHECK_ERROR (clWaitForEvents (1, &event));
			
 
				 
			
@@ -479,7 +522,7 @@ measure_fpga_to_gpu_latency_with_marker (App *app)
 
				     stop_dma (app);
			
 
				     OCL_CHECK_ERROR (clReleaseEvent (event));
			
 
				 
			
 
				-    counter = RD32 (REG_PERF_COUNTER);
			
 
				+    counter = RD32 (HF_REG_PERF_COUNTER);
			
 
				     host_latency = elapsed_seconds (&start, &end);
			
 
				     debug_latency = compute_debug_latency (app);
			
 
				 
			
@@ -499,9 +542,11 @@ measure_fpga_to_gpu_latency_with_kernel (App *app)
 
				     struct timespec end;
			
 
				     double host_latency;
			
 
				     double debug_latency;
			
 
				+    cl_event signal_event;
			
 
				     cl_event event;
			
 
				     uint32_t check[CHECK_BUFFER_SIZE / 4];
			
 
				     size_t work_size = 1;
			
 
				+    cl_ulong times[4];
			
 
				 
			
 
				     printf ("\n** FPGA to GPU latency [kernel]\n\n");
			
 
				     fill_gpu_buffer (app);
			
@@ -510,7 +555,10 @@ measure_fpga_to_gpu_latency_with_kernel (App *app)
 
				     configure_dma_descriptors (app);
			
 
				     setup_counter (app);
			
 
				 
			
 
				-    WR64_sleep (REG_DESCRIPTOR_ADDRESS, app->gpu.addr.surface_bus_address);
			
 
				+    WR32_sleep (HF_REG_CONTROL, HF_CONTROL_ENABLE_READ | HF_CONTROL_ENABLE_MULTI_READ);
			
 
				+    WR64_sleep (HF_REG_PERF_COUNTER_FEEDBACK_1, 0);
			
 
				+    WR64_sleep (HF_REG_DESCRIPTOR_AMD_SIGNAL, app->gpu.addr.marker_bus_address);
			
 
				+    WR64_sleep (HF_REG_DESCRIPTOR_ADDRESS, app->gpu.addr.surface_bus_address);
			
 
				 
			
 
				     reset_dbg (app);
			
 
				 
			
@@ -518,37 +566,57 @@ measure_fpga_to_gpu_latency_with_kernel (App *app)
 
				     OCL_CHECK_ERROR (clSetKernelArg (app->kernel, 1, sizeof (cl_mem), &app->fpga_buffer));
			
 
				     OCL_CHECK_ERROR (clSetKernelArg (app->kernel, 2, sizeof (cl_mem), &app->check_buffer));
			
 
				 
			
 
				-    OCL_CHECK_ERROR (clEnqueueNDRangeKernel (app->queue, app->kernel, 1,
			
 
				-                                             NULL, &work_size, NULL,
			
 
				-                                             0, NULL, &event));
			
 
				+    WR32_sleep (HF_REG_DMA, 0);
			
 
				 
			
 
				-    WR32 (REG_DMA, 1);
			
 
				-    clock_gettime (CLOCK_MONOTONIC, &start);
			
 
				+    OCL_CHECK_ERROR (clEnqueueWaitSignalAMD (app->queue, app->gpu.buffer, 1, 0, NULL, &signal_event));
			
 
				 
			
 
				-    clWaitForEvents (1, &event);
			
 
				+    /* OCL_CHECK_ERROR (clEnqueueNDRangeKernel (app->queue, app->kernel, 1, */
			
 
				+    /*                                          NULL, &work_size, NULL, */
			
 
				+    /*                                          1, &signal_event, &event)); */
			
 
				+    usleep (100);
			
 
				+    WR32_sleep (HF_REG_DMA, 1);
			
 
				 
			
 
				+    clock_gettime (CLOCK_MONOTONIC, &start);
			
 
				+    OCL_CHECK_ERROR (clWaitForEvents (1, &signal_event));
			
 
				     clock_gettime (CLOCK_MONOTONIC, &end);
			
 
				-    OCL_CHECK_ERROR (clReleaseEvent (event));
			
 
				 
			
 
				-    counter = RD32 (REG_PERF_COUNTER);
			
 
				+    WR32 ( HF_REG_DMA, 0);
			
 
				+    WR32 ( 0x20, 1);
			
 
				+    WR32 ( 0x24, 1);
			
 
				+
			
 
				+    /* OCL_CHECK_ERROR (clReleaseEvent (event)); */
			
 
				+
			
 
				+    counter = RD32 (HF_REG_PERF_COUNTER);
			
 
				     host_latency = elapsed_seconds (&start, &end);
			
 
				     debug_latency = compute_debug_latency (app);
			
 
				 
			
 
				     check_data_transfer (app, PAGE_SIZE);
			
 
				 
			
 
				+    printf ("DMA running: %i\n", RD32 (HF_REG_DMA) & 0x1);
			
 
				+
			
 
				     OCL_CHECK_ERROR (clEnqueueReadBuffer (app->queue, app->check_buffer, CL_TRUE,
			
 
				                                           0, CHECK_BUFFER_SIZE, check, 0, NULL, NULL));
			
 
				 
			
 
				+    WR32 (HF_REG_DMA, 0);
			
 
				+
			
 
				     debug_assert_cmp ("Data check", check[0], 1);
			
 
				+    WR32_sleep (HF_REG_CONTROL, HF_CONTROL_ENABLE_READ);
			
 
				 
			
 
				-    printf ("\n%-16s %i\n", "Kernel count", check[1]);
			
 
				+    printf ("%-16s %f us\n", "FPGA latency", RD32 (0x20) * 4 / 1000.0);
			
 
				+
			
 
				+    ocl_get_event_times (signal_event, &times[0], &times[1], &times[2], &times[3]);
			
 
				+
			
 
				+    printf ("submit -> start = %f, queue -> start = %f, run_time=%lu\n",
			
 
				+            (times[0] - times[3]) / 1000.0, (times[0] - times[2]) / 1000.0, times[1] - times[0]);
			
 
				+    /* printf ("\n%-16s %i\n", "Kernel count", check[1]); */
			
 
				     printf ("%-16s %f us\n", "Wall time", host_latency * 1000.0 * 1000.0);
			
 
				-    printf ("%-16s %f us\n", "FPGA [counter]", ((counter << 8) * 4) / 1000.0);
			
 
				-    printf ("%-16s %f us\n", "FPGA [debug]", debug_latency);
			
 
				-    printf ("%-16s %.2f MB/s\n", "Throughput", GPU_BUFFER_SIZE / host_latency / 1024. / 1024.);
			
 
				+    /* printf ("%-16s %f us\n", "FPGA [counter]", ((counter << 8) * 4) / 1000.0); */
			
 
				+    /* printf ("%-16s %f us\n", "FPGA [debug]", debug_latency); */
			
 
				+    /* printf ("%-16s %.2f MB/s\n", "Throughput", GPU_BUFFER_SIZE / host_latency / 1024. / 1024.); */
			
 
				 }
			
 
				 
			
 
				 
			
 
				+
			
 
				 static void
			
 
				 measure_fpga_to_gpu_latency_with_cpu (App *app)
			
 
				 {
			
@@ -570,12 +638,12 @@ measure_fpga_to_gpu_latency_with_cpu (App *app)
 
				     configure_dma_descriptors (app);
			
 
				     setup_counter (app);
			
 
				 
			
 
				-    WR64_sleep (REG_DESCRIPTOR_ADDRESS, app->gpu.addr.surface_bus_address);
			
 
				+    WR64_sleep (HF_REG_DESCRIPTOR_ADDRESS, app->gpu.addr.surface_bus_address);
			
 
				 
			
 
				     reset_dbg (app);
			
 
				 
			
 
				     clock_gettime (CLOCK_MONOTONIC, &start);
			
 
				-    WR32 (REG_DMA, 1);
			
 
				+    WR32 (HF_REG_DMA, 1);
			
 
				 
			
 
				     do {
			
 
				         hardware_ptr = app->desc[flag_index];
			
@@ -585,7 +653,7 @@ measure_fpga_to_gpu_latency_with_cpu (App *app)
 
				     clock_gettime (CLOCK_MONOTONIC, &end);
			
 
				     stop_dma (app);
			
 
				 
			
 
				-    counter = RD32 (REG_PERF_COUNTER);
			
 
				+    counter = RD32 (HF_REG_PERF_COUNTER);
			
 
				     host_latency = elapsed_seconds (&start, &end);
			
 
				     debug_latency = compute_debug_latency (app);
			
 
				 
			
@@ -617,12 +685,12 @@ measure_fpga_to_cpu_latency (App *app)
 
				     configure_dma_descriptors (app);
			
 
				     setup_counter (app);
			
 
				 
			
 
				-    WR64_sleep (REG_DESCRIPTOR_ADDRESS, app->cpu.addr);
			
 
				+    WR64_sleep (HF_REG_DESCRIPTOR_ADDRESS, app->cpu.addr);
			
 
				 
			
 
				     reset_dbg (app);
			
 
				 
			
 
				     clock_gettime (CLOCK_MONOTONIC, &start);
			
 
				-    WR32 (REG_DMA, 1);
			
 
				+    WR32 (HF_REG_DMA, 1);
			
 
				 
			
 
				     do {
			
 
				         hardware_ptr = app->desc[flag_index];
			
@@ -635,7 +703,7 @@ measure_fpga_to_cpu_latency (App *app)
 
				     // Data back to FPGA
			
 
				     memcpy(app->bar + 0x9400, app->cpu.buffer, CPU_BUFFER_SIZE);
			
 
				 
			
 
				-    counter = RD32 (REG_PERF_COUNTER);
			
 
				+    counter = RD32 (HF_REG_PERF_COUNTER);
			
 
				     host_latency = elapsed_seconds (&start, &end);
			
 
				     debug_latency = compute_debug_latency (app);
			
 
				 
			
@@ -647,6 +715,112 @@ measure_fpga_to_cpu_latency (App *app)
 
				     printf ("%-16s %.2f MB/s\n", "Throughput", CPU_BUFFER_SIZE / host_latency / 1024. / 1024.);
			
 
				 }
			
 
				 
			
 
				+static void
			
 
				+check_latency_results (App *app, size_t check_size)
			
 
				+{
			
 
				+    uint32_t *cnt;
			
 
				+
			
 
				+    cnt = malloc (16);
			
 
				+    memset (cnt, 0, 16);
			
 
				+
			
 
				+    OCL_CHECK_ERROR (clEnqueueReadBuffer (app->queue, app->latency_data.buffer, CL_TRUE, 0, 16, cnt, 0, NULL, NULL));
			
 
				+
			
 
				+    for (uint32_t i = 0; i < 4; i++) {
			
 
				+        printf("cnt %x\n", cnt[i]);
			
 
				+    }
			
 
				+    
			
 
				+    uint32_t *data;
			
 
				+    printf ("size %d\n", check_size);
			
 
				+    
			
 
				+    data = malloc (check_size);
			
 
				+    memset (data, 0xab, check_size);
			
 
				+    
			
 
				+    OCL_CHECK_ERROR (clEnqueueReadBuffer (app->queue, app->gpu.buffer, CL_TRUE, 0, check_size, data, 0, NULL, NULL));
			
 
				+
			
 
				+    for (uint32_t i = 0; i < check_size / 4; i++) {
			
 
				+        if (i < check_size / 4) {
			
 
				+            printf ("%-16s %x \n", "Latency FPGA->GPU [FPGA Master]", data[i]);
			
 
				+        }            
			
 
				+    }
			
 
				+
			
 
				+    debug_assert ("FPGA->GPU->FPGA latency", true);
			
 
				+
			
 
				+    free (data);
			
 
				+}
			
 
				+
			
 
				+static void
			
 
				+measure_fpga_to_gpu_latency_master (App *app)
			
 
				+{
			
 
				+    
			
 
				+    printf ("\n** FPGA to GPU latency [FPGA MASTER]\n\n");
			
 
				+    
			
 
				+    // reset
			
 
				+    reset_dbg (app);
			
 
				+    WR32_sleep(HF_REG_CONF_DMA_TX_ENGINE, 0x10);
			
 
				+    
			
 
				+    WR32_sleep( HF_REG_INTERCONNECT, 
			
 
				+		HF_INTERCONNECT_DDR_FROM_RX_MASTER |
			
 
				+		HF_INTERCONNECT_DDR_TO_DMA |
			
 
				+		HF_INTERCONNECT_MASTER_DMA); 
			
 
				+    
			
 
				+    // results array size in WORDS (32 bits)      
			
 
				+    WR32_sleep(HF_REG_LATENCY_NUM_MEAS, GPU_BUFFER_SIZE / 4); 
			
 
				+    
			
 
				+    // 3FF clock cycles * 4 ~= 4 us  -> every 4 us the read req is repeated                            
			
 
				+    WR32_sleep(HF_REG_LATENCY_REPEAT_MASK, 0x3FFFFF);                          
			
 
				+    
			
 
				+    // FFFFF clock cycles * 4 ~= 4 ms -> timeout -> WR is repeated then RD until HF_REG_CONF_DMA_TX_ENGINE != 1    
			
 
				+    WR32_sleep(HF_REG_LATENCY_TIMEOUT, 0xFFFFF);    
			
 
				+    
			
 
				+    WR64_sleep(HF_REG_DESC_LATENCY_DATA, app->latency_data.addr.surface_bus_address);
			
 
				+    WR64_sleep(HF_REG_DESC_LATENCY_RESULTS, app->gpu.addr.surface_bus_address);
			
 
				+    
			
 
				+    sleep(1);
			
 
				+    
			
 
				+    // Measurement starts with rising edge of bit 0 in HF_REG_CONF_DMA_TX_ENGINE
			
 
				+    WR32_sleep(HF_REG_CONF_DMA_TX_ENGINE, 0x11);       // Mode of operation for taking latency measurements, any other number goes to the default mode
			
 
				+    
			
 
				+    //check_latency_results (app, GPU_BUFFER_SIZE);
			
 
				+    
			
 
				+    uint32_t result;
			
 
				+    result = RD32 (HF_REG_RESULT_LATENCY_MASTER);
			
 
				+    printf ("latency result : %u ns\n", result * 4 );
			
 
				+    
			
 
				+}
			
 
				+
			
 
				+static void
			
 
				+measure_fpga_to_cpu_latency_master (App *app)
			
 
				+{
			
 
				+
			
 
				+    printf ("\n** FPGA to CPU latency [FPGA MASTER]\n\n");
			
 
				+    
			
 
				+    // reset
			
 
				+    reset_dbg (app);
			
 
				+    WR32_sleep(HF_REG_CONF_DMA_TX_ENGINE, 0x10);
			
 
				+    
			
 
				+    // results array size in WORDS (32 bits)      
			
 
				+    WR32_sleep(HF_REG_LATENCY_NUM_MEAS, GPU_BUFFER_SIZE / 4);    
			
 
				+    
			
 
				+    // 3FF clock cycles * 4 ~= 4 us  -> every 4 us the read req is repeated                            
			
 
				+    WR32_sleep(HF_REG_LATENCY_REPEAT_MASK, 0x3FFFFF);                          
			
 
				+    
			
 
				+    // FFFFF clock cycles * 4 ~= 4 ms -> timeout -> WR is repeated then RD until HF_REG_CONF_DMA_TX_ENGINE != 1    
			
 
				+    WR32_sleep(HF_REG_LATENCY_TIMEOUT, 0xFFFFF);    
			
 
				+    
			
 
				+    WR64_sleep(HF_REG_DESC_LATENCY_DATA, app->cpu_latency_data.addr);
			
 
				+    WR64_sleep(HF_REG_DESC_LATENCY_RESULTS, app->gpu.addr.surface_bus_address);
			
 
				+    
			
 
				+    sleep(1);
			
 
				+    // Measurement starts with rising edge of bit 0 in HF_REG_CONF_DMA_TX_ENGINE
			
 
				+    WR32_sleep(HF_REG_CONF_DMA_TX_ENGINE, 0x11);       // Mode of operation for taking latency measurements, any other number goes to the default mode
			
 
				+    
			
 
				+    //check_latency_results (app, GPU_BUFFER_SIZE);
			
 
				+    uint32_t result;
			
 
				+    result = RD32 (HF_REG_RESULT_LATENCY_MASTER);
			
 
				+    printf ("latency result : %u ns\n", result * 4 );
			
 
				+    
			
 
				+}
			
 
				+
			
 
				 static void
			
 
				 measure_fpga_to_gpu_throughput (App *app)
			
 
				 {
			
@@ -671,10 +845,10 @@ measure_fpga_to_gpu_throughput (App *app)
 
				     addr = app->gpu.addr.surface_bus_address;
			
 
				     flag_index = app->board_gen == 3 ? 2 : 4;
			
 
				 
			
 
				-    WR64_sleep (REG_DESCRIPTOR_ADDRESS, addr);
			
 
				+    WR64_sleep (HF_REG_DESCRIPTOR_ADDRESS, addr);
			
 
				 
			
 
				     clock_gettime (CLOCK_MONOTONIC, &start);
			
 
				-    WR32 (REG_DMA, 1);
			
 
				+    WR32 (HF_REG_DMA, 1);
			
 
				 
			
 
				     do {
			
 
				         do {
			
@@ -685,7 +859,7 @@ measure_fpga_to_gpu_throughput (App *app)
 
				         addr += NUM_PAGES * PAGE_SIZE;
			
 
				         transferred += NUM_PAGES * PAGE_SIZE;
			
 
				         current_ptr = hardware_ptr;
			
 
				-        WR64 (REG_DESCRIPTOR_ADDRESS, addr);
			
 
				+        WR64 (HF_REG_DESCRIPTOR_ADDRESS, addr);
			
 
				     }
			
 
				     while (transferred < GPU_BUFFER_SIZE);
			
 
				 
			
@@ -755,13 +929,16 @@ main (int argc, char const* argv[])
 
				 
			
 
				     if (!init_opencl (&app))
			
 
				         return 1;
			
 
				-
			
 
				+/*
			
 
				     measure_fpga_to_gpu_latency_with_marker (&app);
			
 
				     measure_fpga_to_gpu_latency_with_kernel (&app);
			
 
				     measure_fpga_to_gpu_latency_with_cpu (&app);
			
 
				     measure_fpga_to_cpu_latency (&app);
			
 
				     measure_gpu_to_fpga_latency (&app);
			
 
				     measure_fpga_to_gpu_throughput (&app);
			
 
				+*/    
			
 
				+    measure_fpga_to_gpu_latency_master (&app);
			
 
				+    //measure_fpga_to_cpu_latency_master (&app);
			
 
				 
			
 
				     close_opencl (&app);
			
 
				     close_pcilib (&app);
Author	SHA1 Message	Date
Luis Ardila	22e075a1ab latency master updated	8 years ago
Luis Ardila	5d5fc8f86b updated latency meass with kernel	8 years ago
Luis Ardila	631b7b5659 update WR and RD with appropiate names and added latency_master meassurement	8 years ago