|
@@ -13,6 +13,8 @@
|
|
|
#include <CL/cl_ext.h>
|
|
|
#include "ocl.h"
|
|
|
|
|
|
+#include "hf-interface.h"
|
|
|
+
|
|
|
/* this should actually come from the distributed pcitool sources */
|
|
|
#include "pciDriver.h"
|
|
|
|
|
@@ -42,12 +44,24 @@ typedef struct {
|
|
|
cl_mem buffer;
|
|
|
cl_bus_address_amd addr;
|
|
|
} gpu;
|
|
|
+
|
|
|
+ struct {
|
|
|
+ cl_mem buffer;
|
|
|
+ cl_bus_address_amd addr;
|
|
|
+ } latency_data;
|
|
|
|
|
|
struct {
|
|
|
pcilib_kmem_handle_t *kmem;
|
|
|
uint32_t *buffer;
|
|
|
uintptr_t addr;
|
|
|
} cpu;
|
|
|
+
|
|
|
+ struct {
|
|
|
+ pcilib_kmem_handle_t *kmem;
|
|
|
+ uint32_t *buffer;
|
|
|
+ uintptr_t addr;
|
|
|
+ } cpu_latency_data;
|
|
|
+
|
|
|
} App;
|
|
|
|
|
|
#define UNICODE_CHECK_MARK "o"
|
|
@@ -60,33 +74,6 @@ typedef struct {
|
|
|
#define KMEM_USE_RING PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 1)
|
|
|
#define KMEM_USE_DEFAULT PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 2)
|
|
|
|
|
|
-#define REG_RESET_DMA 0x00
|
|
|
-#define REG_DMA 0x04
|
|
|
-#define REG_NUM_PACKETS_PER_DESCRIPTOR 0x10
|
|
|
-#define REG_PERF_COUNTER 0x28
|
|
|
-#define REG_PACKET_LENGTH 0x0C
|
|
|
-#define REG_DESCRIPTOR_ADDRESS 0x50
|
|
|
-#define REG_UPDATE_ADDRESS 0x58
|
|
|
-#define REG_UPDATE_THRESHOLD 0x60
|
|
|
-
|
|
|
-#define REG_COUNTER 0x9000
|
|
|
-#define REG_VERSION 0x9020
|
|
|
-#define REG_CONTROL 0x9040
|
|
|
-#define REG_LATENCY 0x9044
|
|
|
-#define REG_NUM_ROWS 0x9168
|
|
|
-#define REG_NUM_FRAMES 0x9170
|
|
|
-
|
|
|
-#define REG_DBG_RQ_RST 0x9344
|
|
|
-#define REG_DBG_CQ_RST 0x93A4
|
|
|
-
|
|
|
-#define CMD_DMA_START 0x1
|
|
|
-#define CMD_DMA_STOP 0x0
|
|
|
-#define CMD_COUNTER_RESET 0xf0
|
|
|
-#define CMD_COUNTER_START 0x1
|
|
|
-#define CMD_COUNTER_STOP 0x0
|
|
|
-#define CMD_LATENCY_RESET 0x0800000f
|
|
|
-#define CMD_LATENCY_STOP 0xf0000000
|
|
|
-
|
|
|
#define WR32(addr, value) *(uint32_t *) (app->bar + (addr)) = (value);
|
|
|
#define RD32(addr) (*(uint32_t *) (app->bar + (addr)))
|
|
|
#define WR32_sleep(addr, value) *(uint32_t *) (app->bar + (addr)) = (value); usleep (100);
|
|
@@ -106,7 +93,7 @@ static uint32_t NUM_PAGES = 1;
|
|
|
static size_t GPU_BUFFER_SIZE = 4096;
|
|
|
static size_t CPU_BUFFER_SIZE = 4096;
|
|
|
static size_t CHECK_BUFFER_SIZE = 8;
|
|
|
-
|
|
|
+static size_t COUNTER_DATA_SIZE = 4096; // 2 GB
|
|
|
|
|
|
/* declaration should actually come from a distributed header file */
|
|
|
const pcilib_board_info_t *pcilib_get_board_info (pcilib_t *);
|
|
@@ -147,7 +134,7 @@ init_pcilib (App *app)
|
|
|
|
|
|
memset ((uint32_t *) app->desc, 0, 5 * sizeof (uint32_t));
|
|
|
|
|
|
- printf ("%-16s 0x%X\n", "Firmware", RD32 (REG_VERSION));
|
|
|
+ printf ("%-16s 0x%X\n", "Firmware", RD32 (HF_REG_VERSION));
|
|
|
|
|
|
return true;
|
|
|
}
|
|
@@ -159,7 +146,9 @@ close_pcilib (App *app)
|
|
|
|
|
|
pcilib_free_kernel_memory (app->pci, app->kdesc, KMEM_DEFAULT_FLAGS);
|
|
|
pcilib_free_kernel_memory (app->pci, app->cpu.kmem, KMEM_DEFAULT_FLAGS);
|
|
|
-
|
|
|
+
|
|
|
+ pcilib_free_kernel_memory (app->pci, app->cpu_latency_data.kmem, KMEM_DEFAULT_FLAGS);
|
|
|
+
|
|
|
pcilib_clean_kernel_memory (app->pci, KMEM_USE_DEFAULT, flags);
|
|
|
|
|
|
pcilib_unmap_bar (app->pci, PCILIB_BAR0, (void *) app->bar);
|
|
@@ -200,6 +189,26 @@ create_gpu_buffer (App *app, size_t size)
|
|
|
return clEnqueueMakeBuffersResidentAMD (app->queue, 1, &app->gpu.buffer, CL_TRUE, &app->gpu.addr, 0, NULL, NULL);
|
|
|
}
|
|
|
|
|
|
+static cl_int
|
|
|
+create_latency_data_buffer (App *app, size_t size)
|
|
|
+{
|
|
|
+ cl_mem_flags flags;
|
|
|
+ cl_int error;
|
|
|
+ char *data;
|
|
|
+
|
|
|
+ data = malloc (size);
|
|
|
+ memset (data, 0xAD, size);
|
|
|
+ flags = CL_MEM_BUS_ADDRESSABLE_AMD | CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE;
|
|
|
+
|
|
|
+ app->latency_data.buffer = clCreateBuffer (app->context, flags, size, data, &error);
|
|
|
+
|
|
|
+ if (error != CL_SUCCESS)
|
|
|
+ return error;
|
|
|
+
|
|
|
+ return clEnqueueMakeBuffersResidentAMD (app->queue, 1, &app->latency_data.buffer, CL_TRUE, &app->latency_data.addr, 0, NULL, NULL);
|
|
|
+}
|
|
|
+
|
|
|
+
|
|
|
static cl_int
|
|
|
create_cpu_buffer (App *app, size_t size)
|
|
|
{
|
|
@@ -217,6 +226,24 @@ create_cpu_buffer (App *app, size_t size)
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
+static cl_int
|
|
|
+create_cpu_latency_data_buffer (App *app, size_t size)
|
|
|
+{
|
|
|
+ pcilib_kmem_flags_t flags = PCILIB_KMEM_FLAG_HARDWARE | PCILIB_KMEM_FLAG_PERSISTENT | PCILIB_KMEM_FLAG_EXCLUSIVE;
|
|
|
+
|
|
|
+ app->cpu_latency_data.kmem = pcilib_alloc_kernel_memory (app->pci,
|
|
|
+ /*PCILIB_KMEM_TYPE_DMA_C2S_PAGE*/ PCILIB_KMEM_TYPE_CONSISTENT,
|
|
|
+ 1, size, 16, KMEM_USE_DEFAULT, flags);
|
|
|
+
|
|
|
+ app->cpu_latency_data.addr = pcilib_kmem_get_block_ba(app->pci, app->cpu_latency_data.kmem, 0);
|
|
|
+ app->cpu_latency_data.buffer = (uint32_t*) pcilib_kmem_get_block_ua(app->pci, app->cpu_latency_data.kmem, 0);
|
|
|
+
|
|
|
+ memset (app->cpu_latency_data.buffer, 0x8A, 16);
|
|
|
+
|
|
|
+ return 0;
|
|
|
+}
|
|
|
+
|
|
|
+
|
|
|
static bool
|
|
|
init_opencl (App *app)
|
|
|
{
|
|
@@ -247,9 +274,15 @@ init_opencl (App *app)
|
|
|
|
|
|
error |= create_gpu_buffer (app, GPU_BUFFER_SIZE);
|
|
|
OCL_CHECK_ERROR (error);
|
|
|
+
|
|
|
+ error |= create_latency_data_buffer (app, 16);
|
|
|
+ OCL_CHECK_ERROR (error);
|
|
|
|
|
|
error |= create_cpu_buffer (app, CPU_BUFFER_SIZE);
|
|
|
OCL_CHECK_ERROR (error);
|
|
|
+
|
|
|
+ error |= create_cpu_latency_data_buffer (app, 16);
|
|
|
+ OCL_CHECK_ERROR (error);
|
|
|
|
|
|
return error != CL_SUCCESS ? false : true;
|
|
|
}
|
|
@@ -261,6 +294,8 @@ close_opencl (App *app)
|
|
|
OCL_CHECK_ERROR (clReleaseProgram (app->program));
|
|
|
OCL_CHECK_ERROR (clReleaseMemObject (app->fpga_buffer));
|
|
|
OCL_CHECK_ERROR (clReleaseMemObject (app->check_buffer));
|
|
|
+ OCL_CHECK_ERROR (clReleaseMemObject (app->gpu.buffer));
|
|
|
+ OCL_CHECK_ERROR (clReleaseMemObject (app->latency_data.buffer));
|
|
|
ocl_free (app->ocl);
|
|
|
}
|
|
|
|
|
@@ -302,21 +337,21 @@ configure_dma (App *app)
|
|
|
{
|
|
|
uint32_t value;
|
|
|
|
|
|
- WR32 (REG_RESET_DMA, 1);
|
|
|
+ WR32 (HF_REG_DMA, 1);
|
|
|
usleep (100000);
|
|
|
- WR32 (REG_RESET_DMA, 0);
|
|
|
+ WR32 (HF_REG_DMA, 0);
|
|
|
usleep (100000);
|
|
|
|
|
|
- value = RD32 (REG_RESET_DMA);
|
|
|
+ value = RD32 (HF_REG_DMA);
|
|
|
debug_assert ("PCIe check", value == 335746816 || value == 335681280);
|
|
|
|
|
|
- WR32 (REG_NUM_PACKETS_PER_DESCRIPTOR, NUM_PAGES * PAGE_SIZE / (4 * TLP_SIZE));
|
|
|
+ WR32 (HF_REG_NUM_PACKETS, NUM_PAGES * PAGE_SIZE / (4 * TLP_SIZE));
|
|
|
|
|
|
if (app->board_gen == 3) {
|
|
|
- WR32 (REG_PACKET_LENGTH, 0x80000 | TLP_SIZE);
|
|
|
+ WR32 (HF_REG_PACKET_LENGTH, 0x80000 | TLP_SIZE);
|
|
|
}
|
|
|
else {
|
|
|
- WR32 (REG_PACKET_LENGTH, TLP_SIZE);
|
|
|
+ WR32 (HF_REG_PACKET_LENGTH, TLP_SIZE);
|
|
|
}
|
|
|
|
|
|
/* reset host side addr */
|
|
@@ -326,37 +361,45 @@ configure_dma (App *app)
|
|
|
static void
|
|
|
configure_dma_descriptors (App *app)
|
|
|
{
|
|
|
- WR32 (REG_UPDATE_THRESHOLD, 0);
|
|
|
- WR64 (REG_UPDATE_ADDRESS, app->kdesc_bus);
|
|
|
+ WR32 (HF_REG_UPDATE_THRESHOLD, 0);
|
|
|
+ WR64 (HF_REG_UPDATE_ADDRESS, app->kdesc_bus);
|
|
|
usleep (100000);
|
|
|
}
|
|
|
|
|
|
static void
|
|
|
setup_counter (App *app)
|
|
|
{
|
|
|
- WR32_sleep (REG_NUM_ROWS, 0);
|
|
|
- WR32_sleep (REG_NUM_FRAMES, 0);
|
|
|
- WR32_sleep (REG_CONTROL, CMD_LATENCY_RESET);
|
|
|
- WR32_sleep (REG_CONTROL, CMD_COUNTER_STOP);
|
|
|
- WR32_sleep (REG_COUNTER, CMD_COUNTER_RESET);
|
|
|
- WR32_sleep (REG_COUNTER, CMD_COUNTER_START);
|
|
|
+ WR32_sleep (HF_REG_INTERCONNECT,
|
|
|
+ HF_INTERCONNECT_DDR_FROM_CNT |
|
|
|
+ HF_INTERCONNECT_DDR_TO_DMA |
|
|
|
+ HF_INTERCONNECT_MASTER_DMA);
|
|
|
+
|
|
|
+ WR32_sleep (HF_REG_DCG_UPPER_LIMIT, COUNTER_DATA_SIZE);
|
|
|
+ WR32_sleep (HF_REG_DDR_UPPER_ADDR, 0xFFFFFFFF);
|
|
|
+ WR32_sleep (HF_REG_DCG, HF_DCG_RESET );
|
|
|
+ WR32_sleep (HF_REG_DCG, HF_DCG_START);
|
|
|
+ sleep (3); // sleep until ddr is complete
|
|
|
+ WR32_sleep (HF_REG_DCG, HF_DCG_STOP);
|
|
|
}
|
|
|
|
|
|
static void
|
|
|
stop_dma (App *app)
|
|
|
{
|
|
|
- WR32_sleep (REG_DMA, 0);
|
|
|
+ WR32_sleep (HF_REG_DMA, 0);
|
|
|
}
|
|
|
|
|
|
static void
|
|
|
reset_dbg (App *app)
|
|
|
{
|
|
|
+ // RC
|
|
|
+ WR32_sleep (HF_REG_DEBUG_RC_RESET, 1);
|
|
|
+ WR32_sleep (HF_REG_DEBUG_RC_RESET, 0);
|
|
|
// TX
|
|
|
- WR32_sleep (REG_DBG_RQ_RST, 1);
|
|
|
- WR32_sleep (REG_DBG_RQ_RST, 0);
|
|
|
+ WR32_sleep (HF_REG_DEBUG_REQUESTER_RESET, 1);
|
|
|
+ WR32_sleep (HF_REG_DEBUG_REQUESTER_RESET, 0);
|
|
|
// RX
|
|
|
- WR32_sleep (REG_DBG_CQ_RST, 1);
|
|
|
- WR32_sleep (REG_DBG_CQ_RST, 0);
|
|
|
+ WR32_sleep (HF_REG_DEBUG_COMPLETER_RESET, 1);
|
|
|
+ WR32_sleep (HF_REG_DEBUG_COMPLETER_RESET, 0);
|
|
|
}
|
|
|
|
|
|
static double
|
|
@@ -460,18 +503,18 @@ measure_fpga_to_gpu_latency_with_marker (App *app)
|
|
|
|
|
|
configure_dma (app);
|
|
|
|
|
|
- WR32 (REG_UPDATE_THRESHOLD, 0);
|
|
|
- WR64_sleep (REG_UPDATE_ADDRESS, app->gpu.addr.marker_bus_address);
|
|
|
+ WR32 (HF_REG_UPDATE_THRESHOLD, 0);
|
|
|
+ WR64_sleep (HF_REG_UPDATE_ADDRESS, app->gpu.addr.marker_bus_address);
|
|
|
|
|
|
setup_counter (app);
|
|
|
|
|
|
- WR64_sleep (REG_DESCRIPTOR_ADDRESS, app->gpu.addr.surface_bus_address);
|
|
|
+ WR64_sleep (HF_REG_DESCRIPTOR_ADDRESS, app->gpu.addr.surface_bus_address);
|
|
|
|
|
|
reset_dbg (app);
|
|
|
OCL_CHECK_ERROR (clEnqueueWaitSignalAMD (app->queue, app->gpu.buffer, 0xd0dad0da, 0, NULL, &event));
|
|
|
|
|
|
clock_gettime (CLOCK_MONOTONIC, &start);
|
|
|
- WR32 (REG_DMA, 1);
|
|
|
+ WR32 (HF_REG_DMA, 1);
|
|
|
|
|
|
OCL_CHECK_ERROR (clWaitForEvents (1, &event));
|
|
|
|
|
@@ -479,7 +522,7 @@ measure_fpga_to_gpu_latency_with_marker (App *app)
|
|
|
stop_dma (app);
|
|
|
OCL_CHECK_ERROR (clReleaseEvent (event));
|
|
|
|
|
|
- counter = RD32 (REG_PERF_COUNTER);
|
|
|
+ counter = RD32 (HF_REG_PERF_COUNTER);
|
|
|
host_latency = elapsed_seconds (&start, &end);
|
|
|
debug_latency = compute_debug_latency (app);
|
|
|
|
|
@@ -510,7 +553,7 @@ measure_fpga_to_gpu_latency_with_kernel (App *app)
|
|
|
configure_dma_descriptors (app);
|
|
|
setup_counter (app);
|
|
|
|
|
|
- WR64_sleep (REG_DESCRIPTOR_ADDRESS, app->gpu.addr.surface_bus_address);
|
|
|
+ WR64_sleep (HF_REG_DESCRIPTOR_ADDRESS, app->gpu.addr.surface_bus_address);
|
|
|
|
|
|
reset_dbg (app);
|
|
|
|
|
@@ -522,7 +565,7 @@ measure_fpga_to_gpu_latency_with_kernel (App *app)
|
|
|
NULL, &work_size, NULL,
|
|
|
0, NULL, &event));
|
|
|
|
|
|
- WR32 (REG_DMA, 1);
|
|
|
+ WR32 (HF_REG_DMA, 1);
|
|
|
clock_gettime (CLOCK_MONOTONIC, &start);
|
|
|
|
|
|
clWaitForEvents (1, &event);
|
|
@@ -530,7 +573,7 @@ measure_fpga_to_gpu_latency_with_kernel (App *app)
|
|
|
clock_gettime (CLOCK_MONOTONIC, &end);
|
|
|
OCL_CHECK_ERROR (clReleaseEvent (event));
|
|
|
|
|
|
- counter = RD32 (REG_PERF_COUNTER);
|
|
|
+ counter = RD32 (HF_REG_PERF_COUNTER);
|
|
|
host_latency = elapsed_seconds (&start, &end);
|
|
|
debug_latency = compute_debug_latency (app);
|
|
|
|
|
@@ -570,12 +613,12 @@ measure_fpga_to_gpu_latency_with_cpu (App *app)
|
|
|
configure_dma_descriptors (app);
|
|
|
setup_counter (app);
|
|
|
|
|
|
- WR64_sleep (REG_DESCRIPTOR_ADDRESS, app->gpu.addr.surface_bus_address);
|
|
|
+ WR64_sleep (HF_REG_DESCRIPTOR_ADDRESS, app->gpu.addr.surface_bus_address);
|
|
|
|
|
|
reset_dbg (app);
|
|
|
|
|
|
clock_gettime (CLOCK_MONOTONIC, &start);
|
|
|
- WR32 (REG_DMA, 1);
|
|
|
+ WR32 (HF_REG_DMA, 1);
|
|
|
|
|
|
do {
|
|
|
hardware_ptr = app->desc[flag_index];
|
|
@@ -585,7 +628,7 @@ measure_fpga_to_gpu_latency_with_cpu (App *app)
|
|
|
clock_gettime (CLOCK_MONOTONIC, &end);
|
|
|
stop_dma (app);
|
|
|
|
|
|
- counter = RD32 (REG_PERF_COUNTER);
|
|
|
+ counter = RD32 (HF_REG_PERF_COUNTER);
|
|
|
host_latency = elapsed_seconds (&start, &end);
|
|
|
debug_latency = compute_debug_latency (app);
|
|
|
|
|
@@ -617,12 +660,12 @@ measure_fpga_to_cpu_latency (App *app)
|
|
|
configure_dma_descriptors (app);
|
|
|
setup_counter (app);
|
|
|
|
|
|
- WR64_sleep (REG_DESCRIPTOR_ADDRESS, app->cpu.addr);
|
|
|
+ WR64_sleep (HF_REG_DESCRIPTOR_ADDRESS, app->cpu.addr);
|
|
|
|
|
|
reset_dbg (app);
|
|
|
|
|
|
clock_gettime (CLOCK_MONOTONIC, &start);
|
|
|
- WR32 (REG_DMA, 1);
|
|
|
+ WR32 (HF_REG_DMA, 1);
|
|
|
|
|
|
do {
|
|
|
hardware_ptr = app->desc[flag_index];
|
|
@@ -635,7 +678,7 @@ measure_fpga_to_cpu_latency (App *app)
|
|
|
// Data back to FPGA
|
|
|
memcpy(app->bar + 0x9400, app->cpu.buffer, CPU_BUFFER_SIZE);
|
|
|
|
|
|
- counter = RD32 (REG_PERF_COUNTER);
|
|
|
+ counter = RD32 (HF_REG_PERF_COUNTER);
|
|
|
host_latency = elapsed_seconds (&start, &end);
|
|
|
debug_latency = compute_debug_latency (app);
|
|
|
|
|
@@ -647,6 +690,120 @@ measure_fpga_to_cpu_latency (App *app)
|
|
|
printf ("%-16s %.2f MB/s\n", "Throughput", CPU_BUFFER_SIZE / host_latency / 1024. / 1024.);
|
|
|
}
|
|
|
|
|
|
+static void
|
|
|
+check_latency_results (App *app, size_t check_size)
|
|
|
+{
|
|
|
+ uint32_t *cnt;
|
|
|
+
|
|
|
+ cnt = malloc (16);
|
|
|
+ memset (cnt, 0, 16);
|
|
|
+
|
|
|
+ OCL_CHECK_ERROR (clEnqueueReadBuffer (app->queue, app->latency_data.buffer, CL_TRUE, 0, 16, cnt, 0, NULL, NULL));
|
|
|
+
|
|
|
+ for (uint32_t i = 0; i < 4; i++) {
|
|
|
+ printf("cnt %x\n", cnt[i]);
|
|
|
+ }
|
|
|
+
|
|
|
+ uint32_t *data;
|
|
|
+ printf ("size %d\n", check_size);
|
|
|
+
|
|
|
+ data = malloc (check_size);
|
|
|
+ memset (data, 0xab, check_size);
|
|
|
+
|
|
|
+ OCL_CHECK_ERROR (clEnqueueReadBuffer (app->queue, app->gpu.buffer, CL_TRUE, 0, check_size, data, 0, NULL, NULL));
|
|
|
+
|
|
|
+ for (uint32_t i = 0; i < check_size / 4; i++) {
|
|
|
+ if (i < check_size / 4) {
|
|
|
+ printf ("%-16s %x \n", "Latency FPGA->GPU [FPGA Master]", data[i]);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ debug_assert ("FPGA->GPU->FPGA latency", true);
|
|
|
+
|
|
|
+ free (data);
|
|
|
+}
|
|
|
+
|
|
|
+static void
|
|
|
+measure_fpga_to_gpu_latency_master (App *app)
|
|
|
+{
|
|
|
+
|
|
|
+ printf ("\n** FPGA to GPU latency [FPGA MASTER]\n\n");
|
|
|
+
|
|
|
+ // reset
|
|
|
+ reset_dbg (app);
|
|
|
+ WR32_sleep(HF_REG_CONF_DMA_TX_ENGINE, 0x10);
|
|
|
+
|
|
|
+ WR32_sleep( HF_REG_INTERCONNECT,
|
|
|
+ HF_INTERCONNECT_DDR_FROM_RX_MASTER |
|
|
|
+ HF_INTERCONNECT_DDR_TO_DMA |
|
|
|
+ HF_INTERCONNECT_MASTER_DMA);
|
|
|
+
|
|
|
+ printf ("FPGA_bar=0x%x\n", app->bar);
|
|
|
+ printf ("latency_data_bar=0x%x\n", app->latency_data.addr.surface_bus_address);
|
|
|
+
|
|
|
+
|
|
|
+ // results array size in WORDS (32 bits)
|
|
|
+ WR32_sleep(HF_REG_LATENCY_NUM_MEAS, GPU_BUFFER_SIZE / 4);
|
|
|
+
|
|
|
+ // 3FF clock cycles * 4 ~= 4 us -> every 4 us the read req is repeated
|
|
|
+ WR32_sleep(HF_REG_LATENCY_REPEAT_MASK, 0x3FFFFF);
|
|
|
+
|
|
|
+ // FFFFF clock cycles * 4 ~= 4 ms -> timeout -> WR is repeated then RD until HF_REG_CONF_DMA_TX_ENGINE != 1
|
|
|
+ WR32_sleep(HF_REG_LATENCY_TIMEOUT, 0xFFFFF);
|
|
|
+
|
|
|
+ WR64_sleep(HF_REG_DESC_LATENCY_DATA, app->latency_data.addr.surface_bus_address);
|
|
|
+ WR64_sleep(HF_REG_DESC_LATENCY_RESULTS, app->gpu.addr.surface_bus_address);
|
|
|
+
|
|
|
+ sleep(1);
|
|
|
+
|
|
|
+ // Measurement starts with rising edge of bit 0 in HF_REG_CONF_DMA_TX_ENGINE
|
|
|
+ WR32_sleep(HF_REG_CONF_DMA_TX_ENGINE, 0x11); // Mode of operation for taking latency measurements, any other number goes to the default mode
|
|
|
+
|
|
|
+ //check_latency_results (app, LATENCY_RESULTS_BUFFER_SIZE);
|
|
|
+ uint32_t result;
|
|
|
+ result = RD32 (HF_REG_RESULT_LATENCY_MASTER);
|
|
|
+ printf ("latency result : %u ns\n", result * 4 );
|
|
|
+
|
|
|
+}
|
|
|
+
|
|
|
+static void
|
|
|
+measure_fpga_to_cpu_latency_master (App *app)
|
|
|
+{
|
|
|
+
|
|
|
+ printf ("\n** FPGA to CPU latency [FPGA MASTER]\n\n");
|
|
|
+
|
|
|
+ // reset
|
|
|
+ reset_dbg (app);
|
|
|
+ WR32_sleep(HF_REG_CONF_DMA_TX_ENGINE, 0x10);
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+ printf ("FPGA_bar=0x%x\n", app->bar);
|
|
|
+ printf ("latency_data_bar=0x%x\n", app->cpu_latency_data.addr);
|
|
|
+
|
|
|
+ // results array size in WORDS (32 bits)
|
|
|
+ WR32_sleep(HF_REG_LATENCY_NUM_MEAS, GPU_BUFFER_SIZE / 4);
|
|
|
+
|
|
|
+ // 3FF clock cycles * 4 ~= 4 us -> every 4 us the read req is repeated
|
|
|
+ WR32_sleep(HF_REG_LATENCY_REPEAT_MASK, 0x3FFFFF);
|
|
|
+
|
|
|
+ // FFFFF clock cycles * 4 ~= 4 ms -> timeout -> WR is repeated then RD until HF_REG_CONF_DMA_TX_ENGINE != 1
|
|
|
+ WR32_sleep(HF_REG_LATENCY_TIMEOUT, 0xFFFFF);
|
|
|
+
|
|
|
+ WR64_sleep(HF_REG_DESC_LATENCY_DATA, app->cpu_latency_data.addr);
|
|
|
+ WR64_sleep(HF_REG_DESC_LATENCY_RESULTS, app->gpu.addr.surface_bus_address);
|
|
|
+
|
|
|
+ sleep(1);
|
|
|
+ // Measurement starts with rising edge of bit 0 in HF_REG_CONF_DMA_TX_ENGINE
|
|
|
+ WR32_sleep(HF_REG_CONF_DMA_TX_ENGINE, 0x11); // Mode of operation for taking latency measurements, any other number goes to the default mode
|
|
|
+
|
|
|
+ //check_latency_results (app, LATENCY_RESULTS_BUFFER_SIZE);
|
|
|
+ uint32_t result;
|
|
|
+ result = RD32 (HF_REG_RESULT_LATENCY_MASTER);
|
|
|
+ printf ("latency result : %u ns\n", result * 4 );
|
|
|
+
|
|
|
+}
|
|
|
+
|
|
|
static void
|
|
|
measure_fpga_to_gpu_throughput (App *app)
|
|
|
{
|
|
@@ -671,10 +828,10 @@ measure_fpga_to_gpu_throughput (App *app)
|
|
|
addr = app->gpu.addr.surface_bus_address;
|
|
|
flag_index = app->board_gen == 3 ? 2 : 4;
|
|
|
|
|
|
- WR64_sleep (REG_DESCRIPTOR_ADDRESS, addr);
|
|
|
+ WR64_sleep (HF_REG_DESCRIPTOR_ADDRESS, addr);
|
|
|
|
|
|
clock_gettime (CLOCK_MONOTONIC, &start);
|
|
|
- WR32 (REG_DMA, 1);
|
|
|
+ WR32 (HF_REG_DMA, 1);
|
|
|
|
|
|
do {
|
|
|
do {
|
|
@@ -685,7 +842,7 @@ measure_fpga_to_gpu_throughput (App *app)
|
|
|
addr += NUM_PAGES * PAGE_SIZE;
|
|
|
transferred += NUM_PAGES * PAGE_SIZE;
|
|
|
current_ptr = hardware_ptr;
|
|
|
- WR64 (REG_DESCRIPTOR_ADDRESS, addr);
|
|
|
+ WR64 (HF_REG_DESCRIPTOR_ADDRESS, addr);
|
|
|
}
|
|
|
while (transferred < GPU_BUFFER_SIZE);
|
|
|
|
|
@@ -755,13 +912,16 @@ main (int argc, char const* argv[])
|
|
|
|
|
|
if (!init_opencl (&app))
|
|
|
return 1;
|
|
|
-
|
|
|
+/*
|
|
|
measure_fpga_to_gpu_latency_with_marker (&app);
|
|
|
measure_fpga_to_gpu_latency_with_kernel (&app);
|
|
|
measure_fpga_to_gpu_latency_with_cpu (&app);
|
|
|
measure_fpga_to_cpu_latency (&app);
|
|
|
measure_gpu_to_fpga_latency (&app);
|
|
|
measure_fpga_to_gpu_throughput (&app);
|
|
|
+*/
|
|
|
+ measure_fpga_to_gpu_latency_master (&app);
|
|
|
+ //measure_fpga_to_cpu_latency_master (&app);
|
|
|
|
|
|
close_opencl (&app);
|
|
|
close_pcilib (&app);
|