check.c 27 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946
  1. #define _XOPEN_SOURCE 500
  2. #include <stdio.h>
  3. #include <stdlib.h>
  4. #include <stdbool.h>
  5. #include <string.h>
  6. #include <unistd.h>
  7. #include <time.h>
  8. #include <pcilib.h>
  9. #include <pcilib/bar.h>
  10. #include <pcilib/kmem.h>
  11. #include <CL/cl.h>
  12. #include <CL/cl_ext.h>
  13. #include "ocl.h"
  14. #include "hf-interface.h"
  15. /* this should actually come from the distributed pcitool sources */
  16. #include "pciDriver.h"
/*
 * Global application state shared by every benchmark routine.
 * Filled in by init_pcilib() / init_opencl(), torn down by
 * close_pcilib() / close_opencl().
 */
typedef struct {
    /* pcilib */
    pcilib_t *pci;                  /* device handle from pcilib_open() */
    uint8_t *bar;                   /* mapped BAR0 of the FPGA */
    cl_ulong bar_phys;              /* physical start address of BAR0 */
    uint8_t board_gen;              /* board generation, low nibble of reg 0x18 */
    pcilib_kmem_handle_t *kdesc;    /* kernel memory for the descriptor ring */
    uintptr_t kdesc_bus;            /* bus address of the descriptor ring */
    volatile uint32_t *desc;        /* CPU view of the ring; firmware writes progress words here */
    /* OpenCL */
    OclPlatform *ocl;
    cl_device_id device;
    cl_command_queue queue;
    cl_context context;
    cl_program program;
    cl_kernel kernel;               /* "wait_and_write" from kernel.cl */
    cl_mem check_buffer;            /* small result buffer written by the kernel */
    /* both */
    cl_mem fpga_buffer;             /* CL view of the FPGA BAR (CL_MEM_EXTERNAL_PHYSICAL_AMD) */
    struct {
        cl_mem buffer;              /* bus-addressable GPU buffer the FPGA writes to */
        cl_bus_address_amd addr;    /* its surface/marker bus addresses */
    } gpu;
    struct {
        cl_mem buffer;              /* GPU-side latency measurement data */
        cl_bus_address_amd addr;
    } latency_data;
    struct {
        pcilib_kmem_handle_t *kmem; /* CPU-side DMA target in kernel memory */
        uint32_t *buffer;           /* CPU view of that memory */
        uintptr_t addr;             /* its bus address */
    } cpu;
    struct {
        pcilib_kmem_handle_t *kmem; /* CPU-side latency measurement data */
        uint32_t *buffer;
        uintptr_t addr;
    } cpu_latency_data;
} App;
/* Console glyphs for pass/fail output (plain ASCII despite the names). */
#define UNICODE_CHECK_MARK "o"
#define UNICODE_CROSS "x"
/* Default flags for persistent, hardware-visible kernel memory allocations.
 * NOTE(review): expansion is unparenthesized — fine when used as a plain
 * function argument (as everywhere in this file), but would mis-bind if
 * combined with other operators at a use site. */
#define KMEM_DEFAULT_FLAGS PCILIB_KMEM_FLAG_HARDWARE | \
                           PCILIB_KMEM_FLAG_PERSISTENT | \
                           PCILIB_KMEM_FLAG_EXCLUSIVE
#define KMEM_USE_RING PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 1)
#define KMEM_USE_DEFAULT PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 2)
/*
 * BAR0 register accessors; a variable named `app` (App *) must be in scope.
 * WARNING: these are multi-statement macros whose expansions end in `;`.
 * At least one call site relies on that and omits its own trailing
 * semicolon (see compute_debug_latency), so they must NOT be converted to
 * do { } while (0) without fixing all call sites first; never use the
 * *_sleep variants inside an unbraced if/else.
 */
#define WR32(addr, value) *(uint32_t *) (app->bar + (addr)) = (value);
#define RD32(addr) (*(uint32_t *) (app->bar + (addr)))
#define WR32_sleep(addr, value) *(uint32_t *) (app->bar + (addr)) = (value); usleep (100);
#define WR64(addr, value) *(uint64_t *) (app->bar + (addr)) = (value);
#define RD64(addr) (*(uint64_t *) (app->bar + (addr)))
#define WR64_sleep(addr, value) *(uint64_t *) (app->bar + (addr)) = (value); usleep (100);
/* AMD bus-addressable-memory extension entry points, resolved in init_opencl(). */
static clEnqueueMakeBuffersResidentAMD_fn clEnqueueMakeBuffersResidentAMD = NULL;
static clEnqueueWaitSignalAMD_fn clEnqueueWaitSignalAMD = NULL;
/* TLP payload size; the firmware packet length is programmed as 4 * TLP_SIZE bytes. */
static const size_t TLP_SIZE = 64;
static const size_t FPGA_BUFFER_SIZE = 1024 * 64;
/* PAGE_SIZE and GPU_BUFFER_SIZE may be overridden by argv[1], NUM_PAGES by argv[2]. */
static uint32_t PAGE_SIZE = 4096;
static uint32_t NUM_PAGES = 1;
static size_t GPU_BUFFER_SIZE = 4096;
static size_t CPU_BUFFER_SIZE = 4096;
static size_t CHECK_BUFFER_SIZE = 8;
static size_t COUNTER_DATA_SIZE = 4096; /* NOTE(review): previous comment said "2 GB" but the value is 4096 — confirm intended size */
/* declaration should actually come from a distributed header file */
const pcilib_board_info_t *pcilib_get_board_info (pcilib_t *);
  80. static bool
  81. init_pcilib (App *app)
  82. {
  83. static const char *DEVICE = "/dev/fpga0";
  84. const pcilib_board_info_t *board;
  85. app->pci = pcilib_open (DEVICE, "pci");
  86. if (app->pci == NULL) {
  87. printf ("Could not open `%s'", DEVICE);
  88. return false;
  89. }
  90. app->bar = pcilib_map_bar (app->pci, PCILIB_BAR0);
  91. if (app->bar == NULL) {
  92. printf ("Unable to map BAR\n");
  93. pcilib_close (app->pci);
  94. return false;
  95. }
  96. board = pcilib_get_board_info (app->pci);
  97. app->bar_phys = board->bar_start[PCILIB_BAR0];
  98. app->board_gen = RD32 (0x18) & 0xF;
  99. app->kdesc = pcilib_alloc_kernel_memory (app->pci,
  100. PCILIB_KMEM_TYPE_CONSISTENT, 1, 128,
  101. 4096, KMEM_USE_RING, KMEM_DEFAULT_FLAGS);
  102. app->kdesc_bus = pcilib_kmem_get_block_ba (app->pci, app->kdesc, 0);
  103. app->desc = (uint32_t *) pcilib_kmem_get_block_ua (app->pci, app->kdesc, 0);
  104. memset ((uint32_t *) app->desc, 0, 5 * sizeof (uint32_t));
  105. printf ("%-16s 0x%X\n", "Firmware", RD32 (HF_REG_VERSION));
  106. return true;
  107. }
  108. static void
  109. close_pcilib (App *app)
  110. {
  111. pcilib_kmem_flags_t flags = PCILIB_KMEM_FLAG_HARDWARE | PCILIB_KMEM_FLAG_PERSISTENT | PCILIB_KMEM_FLAG_EXCLUSIVE;
  112. pcilib_free_kernel_memory (app->pci, app->kdesc, KMEM_DEFAULT_FLAGS);
  113. pcilib_free_kernel_memory (app->pci, app->cpu.kmem, KMEM_DEFAULT_FLAGS);
  114. pcilib_free_kernel_memory (app->pci, app->cpu_latency_data.kmem, KMEM_DEFAULT_FLAGS);
  115. pcilib_clean_kernel_memory (app->pci, KMEM_USE_DEFAULT, flags);
  116. pcilib_unmap_bar (app->pci, PCILIB_BAR0, (void *) app->bar);
  117. pcilib_close (app->pci);
  118. }
  119. static cl_int
  120. create_fpga_buffer (App *app, size_t size)
  121. {
  122. cl_mem_flags flags;
  123. cl_bus_address_amd addr;
  124. cl_int error;
  125. flags = CL_MEM_EXTERNAL_PHYSICAL_AMD;
  126. addr.surface_bus_address = (cl_ulong) app->bar_phys;
  127. addr.marker_bus_address = (cl_ulong) app->bar_phys;
  128. app->fpga_buffer = clCreateBuffer (app->context, flags, size, &addr, &error);
  129. return error;
  130. }
  131. static cl_int
  132. create_gpu_buffer (App *app, size_t size)
  133. {
  134. cl_mem_flags flags;
  135. cl_int error;
  136. char *data;
  137. data = malloc (size);
  138. memset (data, 42, size);
  139. flags = CL_MEM_BUS_ADDRESSABLE_AMD | CL_MEM_COPY_HOST_PTR;
  140. app->gpu.buffer = clCreateBuffer (app->context, flags, size, data, &error);
  141. if (error != CL_SUCCESS)
  142. return error;
  143. return clEnqueueMakeBuffersResidentAMD (app->queue, 1, &app->gpu.buffer, CL_TRUE, &app->gpu.addr, 0, NULL, NULL);
  144. }
  145. static cl_int
  146. create_latency_data_buffer (App *app, size_t size)
  147. {
  148. cl_mem_flags flags;
  149. cl_int error;
  150. char *data;
  151. data = malloc (size);
  152. memset (data, 0xAD, size);
  153. flags = CL_MEM_BUS_ADDRESSABLE_AMD | CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE;
  154. app->latency_data.buffer = clCreateBuffer (app->context, flags, size, data, &error);
  155. if (error != CL_SUCCESS)
  156. return error;
  157. return clEnqueueMakeBuffersResidentAMD (app->queue, 1, &app->latency_data.buffer, CL_TRUE, &app->latency_data.addr, 0, NULL, NULL);
  158. }
  159. static cl_int
  160. create_cpu_buffer (App *app, size_t size)
  161. {
  162. pcilib_kmem_flags_t flags = PCILIB_KMEM_FLAG_HARDWARE | PCILIB_KMEM_FLAG_PERSISTENT | PCILIB_KMEM_FLAG_EXCLUSIVE;
  163. app->cpu.kmem = pcilib_alloc_kernel_memory (app->pci,
  164. /*PCILIB_KMEM_TYPE_DMA_C2S_PAGE*/ PCILIB_KMEM_TYPE_CONSISTENT,
  165. 1, size, CPU_BUFFER_SIZE, KMEM_USE_DEFAULT, flags);
  166. app->cpu.addr = pcilib_kmem_get_block_ba(app->pci, app->cpu.kmem, 0);
  167. app->cpu.buffer = (uint32_t*) pcilib_kmem_get_block_ua(app->pci, app->cpu.kmem, 0);
  168. memset (app->cpu.buffer, 42, CPU_BUFFER_SIZE);
  169. return 0;
  170. }
  171. static cl_int
  172. create_cpu_latency_data_buffer (App *app, size_t size)
  173. {
  174. pcilib_kmem_flags_t flags = PCILIB_KMEM_FLAG_HARDWARE | PCILIB_KMEM_FLAG_PERSISTENT | PCILIB_KMEM_FLAG_EXCLUSIVE;
  175. app->cpu_latency_data.kmem = pcilib_alloc_kernel_memory (app->pci,
  176. /*PCILIB_KMEM_TYPE_DMA_C2S_PAGE*/ PCILIB_KMEM_TYPE_CONSISTENT,
  177. 1, size, 16, KMEM_USE_DEFAULT, flags);
  178. app->cpu_latency_data.addr = pcilib_kmem_get_block_ba(app->pci, app->cpu_latency_data.kmem, 0);
  179. app->cpu_latency_data.buffer = (uint32_t*) pcilib_kmem_get_block_ua(app->pci, app->cpu_latency_data.kmem, 0);
  180. memset (app->cpu_latency_data.buffer, 0x8A, 16);
  181. return 0;
  182. }
  183. static bool
  184. init_opencl (App *app)
  185. {
  186. cl_int error;
  187. cl_platform_id platform;
  188. app->ocl = ocl_new_with_queues (0, CL_DEVICE_TYPE_GPU, CL_QUEUE_PROFILING_ENABLE);
  189. platform = ocl_get_platform (app->ocl);
  190. clEnqueueMakeBuffersResidentAMD = clGetExtensionFunctionAddressForPlatform (platform, "clEnqueueMakeBuffersResidentAMD");
  191. clEnqueueWaitSignalAMD = clGetExtensionFunctionAddressForPlatform (platform, "clEnqueueWaitSignalAMD");
  192. app->device = ocl_get_devices (app->ocl)[0];
  193. app->queue = ocl_get_cmd_queues (app->ocl)[0];
  194. app->context = ocl_get_context (app->ocl);
  195. app->program = ocl_create_program_from_file (app->ocl, "kernel.cl", NULL, &error);
  196. OCL_CHECK_ERROR (error);
  197. app->kernel = clCreateKernel (app->program, "wait_and_write", &error);
  198. OCL_CHECK_ERROR (error);
  199. app->check_buffer = clCreateBuffer (app->context, CL_MEM_WRITE_ONLY, CHECK_BUFFER_SIZE, NULL, &error);
  200. OCL_CHECK_ERROR (error);
  201. error |= create_fpga_buffer (app, FPGA_BUFFER_SIZE);
  202. OCL_CHECK_ERROR (error);
  203. error |= create_gpu_buffer (app, GPU_BUFFER_SIZE);
  204. OCL_CHECK_ERROR (error);
  205. error |= create_latency_data_buffer (app, 16);
  206. OCL_CHECK_ERROR (error);
  207. error |= create_cpu_buffer (app, CPU_BUFFER_SIZE);
  208. OCL_CHECK_ERROR (error);
  209. error |= create_cpu_latency_data_buffer (app, 16);
  210. OCL_CHECK_ERROR (error);
  211. return error != CL_SUCCESS ? false : true;
  212. }
  213. static void
  214. close_opencl (App *app)
  215. {
  216. OCL_CHECK_ERROR (clReleaseKernel (app->kernel));
  217. OCL_CHECK_ERROR (clReleaseProgram (app->program));
  218. OCL_CHECK_ERROR (clReleaseMemObject (app->fpga_buffer));
  219. OCL_CHECK_ERROR (clReleaseMemObject (app->check_buffer));
  220. OCL_CHECK_ERROR (clReleaseMemObject (app->gpu.buffer));
  221. OCL_CHECK_ERROR (clReleaseMemObject (app->latency_data.buffer));
  222. ocl_free (app->ocl);
  223. }
  224. static void
  225. debug_assert (const char *message, bool condition)
  226. {
  227. printf ("%-16s ", message);
  228. if (condition)
  229. printf (UNICODE_CHECK_MARK"\n");
  230. else
  231. printf (UNICODE_CROSS"\n");
  232. }
  233. static void
  234. debug_assert_cmp (const char *message, uint32_t value, uint32_t expected)
  235. {
  236. printf ("%-16s ", message);
  237. if (value != expected)
  238. printf (UNICODE_CROSS" [%x (%i) != %x (%i)]\n", value, value, expected, expected);
  239. else
  240. printf (UNICODE_CHECK_MARK"\n");
  241. }
#if 0
/* Disabled helper: read register `addr` over BAR0 and report whether it
 * matches `expected`.  Kept for debugging; enable by removing the #if 0. */
static void
check_value (App *app, const char *message, uint32_t addr, uint32_t expected)
{
    uint32_t value;
    value = RD32 (addr);
    debug_assert_cmp (message, value, expected);
}
#endif
/*
 * Reset the DMA engine and program the packet geometry for the next
 * transfer.  HF_REG_DMA is toggled 1 -> 0 with 100 ms settling pauses,
 * then read back and compared against two known-good values.  The packet
 * count is NUM_PAGES * PAGE_SIZE bytes expressed in TLPs of 4 * TLP_SIZE
 * bytes; gen-3 boards additionally need bit 19 (0x80000) set in the
 * packet-length register.  Finally the host-side progress word in the
 * descriptor ring is cleared (index 2 on gen 3, index 4 otherwise).
 */
static void
configure_dma (App *app)
{
    uint32_t value;

    WR32 (HF_REG_DMA, 1);
    usleep (100000);
    WR32 (HF_REG_DMA, 0);
    usleep (100000);

    /* NOTE(review): 335746816 / 335681280 look like firmware-specific
     * status words — confirm against the firmware register map */
    value = RD32 (HF_REG_DMA);
    debug_assert ("PCIe check", value == 335746816 || value == 335681280);

    WR32 (HF_REG_NUM_PACKETS, NUM_PAGES * PAGE_SIZE / (4 * TLP_SIZE));

    if (app->board_gen == 3) {
        WR32 (HF_REG_PACKET_LENGTH, 0x80000 | TLP_SIZE);
    }
    else {
        WR32 (HF_REG_PACKET_LENGTH, TLP_SIZE);
    }

    /* reset host side addr */
    app->desc[app->board_gen == 3 ? 2 : 4] = 0;
}
/*
 * Point the firmware's progress updates at the host descriptor ring and
 * zero the update threshold.
 * NOTE(review): threshold semantics inferred from usage (0 presumably
 * means "update on every transfer") — confirm against firmware docs.
 */
static void
configure_dma_descriptors (App *app)
{
    WR32 (HF_REG_UPDATE_THRESHOLD, 0);
    WR64 (HF_REG_UPDATE_ADDRESS, app->kdesc_bus);
    usleep (100000);
}
/*
 * Route the data-counter generator through DDR to the DMA engine and
 * generate COUNTER_DATA_SIZE units of counter data (units per firmware
 * definition — confirm whether bytes or words).  The DCG is reset, started,
 * and stopped after a fixed 3 s sleep; no completion flag is polled.
 */
static void
setup_counter (App *app)
{
    WR32_sleep (HF_REG_INTERCONNECT,
                HF_INTERCONNECT_DDR_FROM_CNT |
                HF_INTERCONNECT_DDR_TO_DMA |
                HF_INTERCONNECT_MASTER_DMA);
    WR32_sleep (HF_REG_DCG_UPPER_LIMIT, COUNTER_DATA_SIZE);
    WR32_sleep (HF_REG_DDR_UPPER_ADDR, 0xFFFFFFFF);
    WR32_sleep (HF_REG_DCG, HF_DCG_RESET );
    WR32_sleep (HF_REG_DCG, HF_DCG_START);
    sleep (3); // sleep until ddr is complete
    WR32_sleep (HF_REG_DCG, HF_DCG_STOP);
}
/* Stop the DMA engine (write 0 to HF_REG_DMA, then settle for 100 us). */
static void
stop_dma (App *app)
{
    WR32_sleep (HF_REG_DMA, 0);
}
/*
 * Pulse the three firmware debug-counter reset registers (root complex,
 * requester/TX, completer/RX) so the next measurement starts from zero.
 */
static void
reset_dbg (App *app)
{
    // RC
    WR32_sleep (HF_REG_DEBUG_RC_RESET, 1);
    WR32_sleep (HF_REG_DEBUG_RC_RESET, 0);
    // TX
    WR32_sleep (HF_REG_DEBUG_REQUESTER_RESET, 1);
    WR32_sleep (HF_REG_DEBUG_REQUESTER_RESET, 0);
    // RX
    WR32_sleep (HF_REG_DEBUG_COMPLETER_RESET, 1);
    WR32_sleep (HF_REG_DEBUG_COMPLETER_RESET, 0);
}
  310. static double
  311. compute_debug_latency (App *app)
  312. {
  313. uint64_t start, start_high, start_low;
  314. uint64_t end, end_high, end_low;
  315. WR32_sleep (0x93A0, 0);
  316. start_high = RD32 (0x93B8);
  317. start_low = RD32 (0x93BC);
  318. WR32_sleep (0x93A0,1)
  319. end_high = RD32 (0x93B8);
  320. end_low = RD32 (0x93BC);
  321. start = (start_high << 32) | start_low;
  322. end = (end_high << 32) | end_low;
  323. return ((end - start) * 4) / 1000.0;
  324. }
  325. static double
  326. elapsed_seconds (struct timespec *start, struct timespec *end)
  327. {
  328. return (end->tv_sec + end->tv_nsec / 1000000000.0) - (start->tv_sec + start->tv_nsec / 1000000000.0);
  329. }
  330. static double
  331. elapsed_gpu_seconds (cl_event event)
  332. {
  333. cl_ulong start;
  334. cl_ulong end;
  335. OCL_CHECK_ERROR (clGetEventProfilingInfo (event, CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &start, NULL));
  336. OCL_CHECK_ERROR (clGetEventProfilingInfo (event, CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &end, NULL));
  337. return (end - start) / 1000.0 / 1000.0 / 1000.0;
  338. }
  339. static void
  340. check_data_transfer (App *app, size_t check_size)
  341. {
  342. uint32_t *data;
  343. data = malloc (check_size);
  344. memset (data, 0, check_size);
  345. OCL_CHECK_ERROR (clEnqueueReadBuffer (app->queue, app->gpu.buffer, CL_TRUE, 0, check_size, data, 0, NULL, NULL));
  346. for (uint32_t i = 0; i < check_size / 4; i++) {
  347. if (data[i] != i) {
  348. debug_assert_cmp ("FPGA->GPU write", data[i], i);
  349. goto finish_check_data_transfer;
  350. }
  351. }
  352. debug_assert ("FPGA->GPU write", true);
  353. finish_check_data_transfer:
  354. free (data);
  355. }
  356. static void
  357. check_data_transfer_cpu (App *app, size_t check_size)
  358. {
  359. for (uint32_t i = 0; i < check_size / 4; i++) {
  360. if (app->cpu.buffer[i] != i) {
  361. debug_assert_cmp ("FPGA->CPU write", app->cpu.buffer[i], i);
  362. return;
  363. }
  364. }
  365. debug_assert ("FPGA->CPU write", true);
  366. }
/*
 * Overwrite the start of the GPU buffer with the 32-bit pattern 42 so
 * stale data from a previous run cannot masquerade as a successful
 * transfer.
 * NOTE(review): the fill size passed is GPU_BUFFER_SIZE / 4 *bytes*
 * (clEnqueueFillBuffer's size argument is in bytes), i.e. only a quarter
 * of the buffer is reset — confirm whether the whole buffer was intended.
 */
static void
fill_gpu_buffer (App *app)
{
    uint32_t pattern;
    cl_event event;

    pattern = 42;
    OCL_CHECK_ERROR (clEnqueueFillBuffer (app->queue, app->gpu.buffer, &pattern, sizeof (pattern), 0, GPU_BUFFER_SIZE / 4, 0, NULL, &event));
    OCL_CHECK_ERROR (clWaitForEvents (1, &event));
    OCL_CHECK_ERROR (clReleaseEvent (event));
}
/*
 * Measure FPGA->GPU write latency using the AMD "marker" mechanism: the
 * firmware's progress update is steered at the GPU buffer's marker bus
 * address, and clEnqueueWaitSignalAMD completes once the expected signal
 * value arrives.  Wall time is measured from starting the DMA to the
 * wait-signal event completing.
 */
static void
measure_fpga_to_gpu_latency_with_marker (App *app)
{
    uint32_t counter;
    struct timespec start;
    struct timespec end;
    double host_latency;
    double debug_latency;
    cl_event event;

    printf ("\n** FPGA to GPU latency [marker]\n\n");
    fill_gpu_buffer (app);
    configure_dma (app);

    /* progress updates go straight to the GPU buffer's marker address */
    WR32 (HF_REG_UPDATE_THRESHOLD, 0);
    WR64_sleep (HF_REG_UPDATE_ADDRESS, app->gpu.addr.marker_bus_address);
    setup_counter (app);
    WR64_sleep (HF_REG_DESCRIPTOR_ADDRESS, app->gpu.addr.surface_bus_address);
    reset_dbg (app);

    /* complete when 0xd0dad0da appears at the marker location */
    OCL_CHECK_ERROR (clEnqueueWaitSignalAMD (app->queue, app->gpu.buffer, 0xd0dad0da, 0, NULL, &event));
    clock_gettime (CLOCK_MONOTONIC, &start);
    WR32 (HF_REG_DMA, 1);
    OCL_CHECK_ERROR (clWaitForEvents (1, &event));
    clock_gettime (CLOCK_MONOTONIC, &end);
    stop_dma (app);
    OCL_CHECK_ERROR (clReleaseEvent (event));

    counter = RD32 (HF_REG_PERF_COUNTER);
    host_latency = elapsed_seconds (&start, &end);
    debug_latency = compute_debug_latency (app);
    check_data_transfer (app, PAGE_SIZE);

    printf ("\n%-16s %f us\n", "Wall time", host_latency * 1000.0 * 1000.0);
    /* NOTE(review): counter << 8 can overflow 32 bits for large counts —
     * confirm the counter's intended scaling */
    printf ("%-16s %f us\n", "FPGA [counter]", ((counter << 8) * 4) / 1000.0);
    printf ("%-16s %f us\n", "FPGA [debug]", debug_latency);
    printf ("%-16s %.2f MB/s\n", "Throughput", GPU_BUFFER_SIZE / host_latency / 1024. / 1024.);
}
/*
 * Latency measurement in which a GPU kernel was meant to consume the data;
 * the NDRange launch is currently commented out, so in effect only the
 * AMD wait-signal path (signal value 1 written by the FPGA to the marker
 * address) is timed.
 */
static void
measure_fpga_to_gpu_latency_with_kernel (App *app)
{
    uint32_t counter;
    struct timespec start;
    struct timespec end;
    double host_latency;
    double debug_latency;
    cl_event signal_event;
    cl_event event;                      /* only used by the disabled kernel launch */
    uint32_t check[CHECK_BUFFER_SIZE / 4];
    size_t work_size = 1;                /* only used by the disabled kernel launch */
    cl_ulong times[4];                   /* queued/submit/start/end profiling stamps */

    printf ("\n** FPGA to GPU latency [kernel]\n\n");
    fill_gpu_buffer (app);
    configure_dma (app);
    configure_dma_descriptors (app);
    setup_counter (app);

    WR32_sleep (HF_REG_CONTROL, HF_CONTROL_ENABLE_READ | HF_CONTROL_ENABLE_MULTI_READ);
    WR64_sleep (HF_REG_PERF_COUNTER_FEEDBACK_1, 0);
    /* the FPGA signals completion by writing to the buffer's marker address */
    WR64_sleep (HF_REG_DESCRIPTOR_AMD_SIGNAL, app->gpu.addr.marker_bus_address);
    WR64_sleep (HF_REG_DESCRIPTOR_ADDRESS, app->gpu.addr.surface_bus_address);
    reset_dbg (app);

    OCL_CHECK_ERROR (clSetKernelArg (app->kernel, 0, sizeof (cl_mem), &app->gpu.buffer));
    OCL_CHECK_ERROR (clSetKernelArg (app->kernel, 1, sizeof (cl_mem), &app->fpga_buffer));
    OCL_CHECK_ERROR (clSetKernelArg (app->kernel, 2, sizeof (cl_mem), &app->check_buffer));

    WR32_sleep (HF_REG_DMA, 0);
    OCL_CHECK_ERROR (clEnqueueWaitSignalAMD (app->queue, app->gpu.buffer, 1, 0, NULL, &signal_event));
    /* OCL_CHECK_ERROR (clEnqueueNDRangeKernel (app->queue, app->kernel, 1, */
    /* NULL, &work_size, NULL, */
    /* 1, &signal_event, &event)); */
    usleep (100);
    WR32_sleep (HF_REG_DMA, 1);
    clock_gettime (CLOCK_MONOTONIC, &start);
    OCL_CHECK_ERROR (clWaitForEvents (1, &signal_event));
    clock_gettime (CLOCK_MONOTONIC, &end);
    WR32 ( HF_REG_DMA, 0);
    /* NOTE(review): writes to 0x20/0x24 appear to latch the FPGA latency
     * counter read back below — confirm against the register map */
    WR32 ( 0x20, 1);
    WR32 ( 0x24, 1);
    /* OCL_CHECK_ERROR (clReleaseEvent (event)); */
    counter = RD32 (HF_REG_PERF_COUNTER);
    host_latency = elapsed_seconds (&start, &end);
    debug_latency = compute_debug_latency (app);
    check_data_transfer (app, PAGE_SIZE);
    printf ("DMA running: %i\n", RD32 (HF_REG_DMA) & 0x1);
    OCL_CHECK_ERROR (clEnqueueReadBuffer (app->queue, app->check_buffer, CL_TRUE,
                                          0, CHECK_BUFFER_SIZE, check, 0, NULL, NULL));
    WR32 (HF_REG_DMA, 0);
    debug_assert_cmp ("Data check", check[0], 1);
    WR32_sleep (HF_REG_CONTROL, HF_CONTROL_ENABLE_READ);
    printf ("%-16s %f us\n", "FPGA latency", RD32 (0x20) * 4 / 1000.0);
    ocl_get_event_times (signal_event, &times[0], &times[1], &times[2], &times[3]);
    printf ("submit -> start = %f, queue -> start = %f, run_time=%lu\n",
            (times[0] - times[3]) / 1000.0, (times[0] - times[2]) / 1000.0, times[1] - times[0]);
    /* printf ("\n%-16s %i\n", "Kernel count", check[1]); */
    printf ("%-16s %f us\n", "Wall time", host_latency * 1000.0 * 1000.0);
    /* printf ("%-16s %f us\n", "FPGA [counter]", ((counter << 8) * 4) / 1000.0); */
    /* printf ("%-16s %f us\n", "FPGA [debug]", debug_latency); */
    /* printf ("%-16s %.2f MB/s\n", "Throughput", GPU_BUFFER_SIZE / host_latency / 1024. / 1024.); */
}
/*
 * Measure FPGA->GPU latency with the CPU acting as the completion
 * detector: the host busy-polls the descriptor-ring progress word until it
 * equals the low 32 bits of the GPU buffer's surface bus address, i.e.
 * until the firmware reports the write finished.
 */
static void
measure_fpga_to_gpu_latency_with_cpu (App *app)
{
    uint32_t hardware_ptr;
    uint32_t counter;
    unsigned flag_index;
    struct timespec start;
    struct timespec end;
    double host_latency;
    double debug_latency;

    printf ("\n** FPGA to GPU latency [CPU]\n\n");
    fill_gpu_buffer (app);
    hardware_ptr = 0;
    /* progress word lives at ring index 2 on gen-3 firmware, 4 otherwise */
    flag_index = app->board_gen == 3 ? 2 : 4;
    configure_dma (app);
    configure_dma_descriptors (app);
    setup_counter (app);
    WR64_sleep (HF_REG_DESCRIPTOR_ADDRESS, app->gpu.addr.surface_bus_address);
    reset_dbg (app);

    clock_gettime (CLOCK_MONOTONIC, &start);
    WR32 (HF_REG_DMA, 1);
    /* busy-wait: firmware writes the target address back when done */
    do {
        hardware_ptr = app->desc[flag_index];
    }
    while (hardware_ptr != (app->gpu.addr.surface_bus_address & 0xFFFFFFFF));
    clock_gettime (CLOCK_MONOTONIC, &end);
    stop_dma (app);

    counter = RD32 (HF_REG_PERF_COUNTER);
    host_latency = elapsed_seconds (&start, &end);
    debug_latency = compute_debug_latency (app);
    check_data_transfer (app, PAGE_SIZE);

    printf ("\n%-16s %f us\n", "Wall time", host_latency * 1000.0 * 1000.0);
    /* NOTE(review): counter << 8 can overflow 32 bits — confirm scaling */
    printf ("%-16s %f us\n", "FPGA [counter]", ((counter << 8) * 4) / 1000.0);
    printf ("%-16s %f us\n", "FPGA [debug]", debug_latency);
    printf ("%-16s %.2f MB/s\n", "Throughput", CPU_BUFFER_SIZE / host_latency / 1024. / 1024.);
}
/*
 * Measure FPGA->CPU latency: DMA into the CPU-side kernel-memory buffer
 * and busy-poll the descriptor-ring progress word until it equals the
 * buffer's bus address.  Afterwards the received data is copied back into
 * the FPGA BAR (offset 0x9400) and the standard counters are reported.
 */
static void
measure_fpga_to_cpu_latency (App *app)
{
    uint32_t hardware_ptr;
    uint32_t counter;
    unsigned flag_index;
    struct timespec start;
    struct timespec end;
    double host_latency;
    double debug_latency;

    printf ("\n** FPGA to CPU latency\n\n");
    hardware_ptr = 0;
    /* progress word lives at ring index 2 on gen-3 firmware, 4 otherwise */
    flag_index = app->board_gen == 3 ? 2 : 4;
    configure_dma (app);
    configure_dma_descriptors (app);
    setup_counter (app);
    WR64_sleep (HF_REG_DESCRIPTOR_ADDRESS, app->cpu.addr);
    reset_dbg (app);

    clock_gettime (CLOCK_MONOTONIC, &start);
    WR32 (HF_REG_DMA, 1);
    /* busy-wait: firmware writes the target address back when done */
    do {
        hardware_ptr = app->desc[flag_index];
    }
    while (hardware_ptr != app->cpu.addr);
    clock_gettime (CLOCK_MONOTONIC, &end);
    stop_dma (app);

    // Data back to FPGA
    memcpy (app->bar + 0x9400, app->cpu.buffer, CPU_BUFFER_SIZE);

    counter = RD32 (HF_REG_PERF_COUNTER);
    host_latency = elapsed_seconds (&start, &end);
    debug_latency = compute_debug_latency (app);
    check_data_transfer_cpu (app, PAGE_SIZE);

    printf ("\n%-16s %f us\n", "Wall time", host_latency * 1000.0 * 1000.0);
    /* NOTE(review): counter << 8 can overflow 32 bits — confirm scaling */
    printf ("%-16s %f us\n", "FPGA [counter]", ((counter << 8) * 4) / 1000.0);
    printf ("%-16s %f us\n", "FPGA [debug]", debug_latency);
    printf ("%-16s %.2f MB/s\n", "Throughput", CPU_BUFFER_SIZE / host_latency / 1024. / 1024.);
}
  543. static void
  544. check_latency_results (App *app, size_t check_size)
  545. {
  546. uint32_t *cnt;
  547. cnt = malloc (16);
  548. memset (cnt, 0, 16);
  549. OCL_CHECK_ERROR (clEnqueueReadBuffer (app->queue, app->latency_data.buffer, CL_TRUE, 0, 16, cnt, 0, NULL, NULL));
  550. for (uint32_t i = 0; i < 4; i++) {
  551. printf("cnt %x\n", cnt[i]);
  552. }
  553. uint32_t *data;
  554. printf ("size %d\n", check_size);
  555. data = malloc (check_size);
  556. memset (data, 0xab, check_size);
  557. OCL_CHECK_ERROR (clEnqueueReadBuffer (app->queue, app->gpu.buffer, CL_TRUE, 0, check_size, data, 0, NULL, NULL));
  558. for (uint32_t i = 0; i < check_size / 4; i++) {
  559. if (i < check_size / 4) {
  560. printf ("%-16s %x \n", "Latency FPGA->GPU [FPGA Master]", data[i]);
  561. }
  562. }
  563. debug_assert ("FPGA->GPU->FPGA latency", true);
  564. free (data);
  565. }
/*
 * FPGA-master latency measurement against the GPU: the FPGA itself issues
 * the writes/reads and stores results, so the host only configures the
 * engine and reads the final latency register.  Each register tick is
 * 4 ns.
 */
static void
measure_fpga_to_gpu_latency_master (App *app)
{
    printf ("\n** FPGA to GPU latency [FPGA MASTER]\n\n");
    // reset
    reset_dbg (app);
    WR32_sleep (HF_REG_CONF_DMA_TX_ENGINE, 0x10);
    WR32_sleep ( HF_REG_INTERCONNECT,
                 HF_INTERCONNECT_DDR_FROM_RX_MASTER |
                 HF_INTERCONNECT_DDR_TO_DMA |
                 HF_INTERCONNECT_MASTER_DMA);
    // results array size in WORDS (32 bits)
    WR32_sleep (HF_REG_LATENCY_NUM_MEAS, GPU_BUFFER_SIZE / 4);
    // 3FF clock cycles * 4 ~= 4 us -> every 4 us the read req is repeated
    WR32_sleep (HF_REG_LATENCY_REPEAT_MASK, 0x3FFFFF);
    // FFFFF clock cycles * 4 ~= 4 ms -> timeout -> WR is repeated then RD until HF_REG_CONF_DMA_TX_ENGINE != 1
    WR32_sleep (HF_REG_LATENCY_TIMEOUT, 0xFFFFF);
    WR64_sleep (HF_REG_DESC_LATENCY_DATA, app->latency_data.addr.surface_bus_address);
    WR64_sleep (HF_REG_DESC_LATENCY_RESULTS, app->gpu.addr.surface_bus_address);
    sleep (1);
    // Measurement starts with rising edge of bit 0 in HF_REG_CONF_DMA_TX_ENGINE
    WR32_sleep (HF_REG_CONF_DMA_TX_ENGINE, 0x11); // Mode of operation for taking latency measurements, any other number goes to the default mode
    //check_latency_results (app, GPU_BUFFER_SIZE);
    uint32_t result;
    result = RD32 (HF_REG_RESULT_LATENCY_MASTER);
    printf ("latency result : %u ns\n", result * 4 );
}
/*
 * FPGA-master latency measurement against the CPU-side latency buffer.
 * Mirrors measure_fpga_to_gpu_latency_master() but points the data
 * descriptor at the CPU kernel-memory block.
 * NOTE(review): the results descriptor still targets the GPU buffer
 * (app->gpu.addr) — confirm whether it should use a CPU address here.
 */
static void
measure_fpga_to_cpu_latency_master (App *app)
{
    printf ("\n** FPGA to CPU latency [FPGA MASTER]\n\n");
    // reset
    reset_dbg (app);
    WR32_sleep (HF_REG_CONF_DMA_TX_ENGINE, 0x10);
    // results array size in WORDS (32 bits)
    WR32_sleep (HF_REG_LATENCY_NUM_MEAS, GPU_BUFFER_SIZE / 4);
    // 3FF clock cycles * 4 ~= 4 us -> every 4 us the read req is repeated
    WR32_sleep (HF_REG_LATENCY_REPEAT_MASK, 0x3FFFFF);
    // FFFFF clock cycles * 4 ~= 4 ms -> timeout -> WR is repeated then RD until HF_REG_CONF_DMA_TX_ENGINE != 1
    WR32_sleep (HF_REG_LATENCY_TIMEOUT, 0xFFFFF);
    WR64_sleep (HF_REG_DESC_LATENCY_DATA, app->cpu_latency_data.addr);
    WR64_sleep (HF_REG_DESC_LATENCY_RESULTS, app->gpu.addr.surface_bus_address);
    sleep (1);
    // Measurement starts with rising edge of bit 0 in HF_REG_CONF_DMA_TX_ENGINE
    WR32_sleep (HF_REG_CONF_DMA_TX_ENGINE, 0x11); // Mode of operation for taking latency measurements, any other number goes to the default mode
    //check_latency_results (app, GPU_BUFFER_SIZE);
    uint32_t result;
    result = RD32 (HF_REG_RESULT_LATENCY_MASTER);
    printf ("latency result : %u ns\n", result * 4 );
}
/*
 * Stream GPU_BUFFER_SIZE bytes FPGA->GPU page by page and report
 * throughput.  After each page the firmware writes a new progress value
 * into the descriptor ring; the host then advances the target bus address
 * by one page block and re-arms the descriptor register.
 */
static void
measure_fpga_to_gpu_throughput (App *app)
{
    uint32_t current_ptr;
    uint32_t hardware_ptr;
    unsigned flag_index;
    size_t transferred;
    cl_ulong addr;
    double elapsed;
    struct timespec start;
    struct timespec end;

    printf ("\n** FPGA to GPU throughput\n\n");
    configure_dma (app);
    configure_dma_descriptors (app);
    setup_counter (app);

    current_ptr = 0;
    hardware_ptr = 0;
    transferred = 0;
    addr = app->gpu.addr.surface_bus_address;
    /* progress word lives at ring index 2 on gen-3 firmware, 4 otherwise */
    flag_index = app->board_gen == 3 ? 2 : 4;

    WR64_sleep (HF_REG_DESCRIPTOR_ADDRESS, addr);
    clock_gettime (CLOCK_MONOTONIC, &start);
    WR32 (HF_REG_DMA, 1);
    do {
        /* busy-wait until the firmware reports any new progress value */
        do {
            hardware_ptr = app->desc[flag_index];
        }
        while (hardware_ptr == current_ptr); /* it should work to check against `addr` but it never does */
        addr += NUM_PAGES * PAGE_SIZE;
        transferred += NUM_PAGES * PAGE_SIZE;
        current_ptr = hardware_ptr;
        /* re-arm the descriptor with the next page's target address */
        WR64 (HF_REG_DESCRIPTOR_ADDRESS, addr);
    }
    while (transferred < GPU_BUFFER_SIZE);
    clock_gettime (CLOCK_MONOTONIC, &end);
    stop_dma (app);

    check_data_transfer (app, GPU_BUFFER_SIZE);
    elapsed = elapsed_seconds (&start, &end);
    printf ("\n%-16s %.2f MB/s [%3.5f us]\n", "Host", GPU_BUFFER_SIZE / elapsed / 1024. / 1024., elapsed * 1000 * 1000);
}
/*
 * Measure GPU->FPGA latency: fill a scratch GPU buffer with a pattern,
 * then time a device-side copy of one page into the FPGA BAR buffer at
 * offset 0x9400.  Reports both wall time and the OpenCL profiling time of
 * the copy.
 */
static void
measure_gpu_to_fpga_latency (App *app)
{
    cl_event event;
    cl_mem src_buffer;
    cl_int err;
    uint32_t pattern;
    struct timespec start;
    struct timespec end;

    printf ("\n** GPU to FPGA latency\n\n");
    src_buffer = clCreateBuffer (app->context, CL_MEM_READ_ONLY, PAGE_SIZE, NULL, &err);
    OCL_CHECK_ERROR (err);

    pattern = 0xdeadf00d;
    OCL_CHECK_ERROR (clEnqueueFillBuffer (app->queue, src_buffer, &pattern, sizeof (pattern), 0, PAGE_SIZE, 0, NULL, &event));
    OCL_CHECK_ERROR (clWaitForEvents (1, &event));
    OCL_CHECK_ERROR (clReleaseEvent (event));

    clock_gettime (CLOCK_MONOTONIC, &start);
    /* destination offset 0x9400 inside the BAR-backed fpga_buffer */
    OCL_CHECK_ERROR (clEnqueueCopyBuffer (app->queue, src_buffer, app->fpga_buffer, 0, 0x9400, PAGE_SIZE, 0, NULL, &event));
    OCL_CHECK_ERROR (clWaitForEvents (1, &event));
    clock_gettime (CLOCK_MONOTONIC, &end);

    printf ("%-16s %f us\n", "Wall time", elapsed_seconds (&start, &end) * 1000 * 1000);
    printf ("%-16s %f us\n", "GPU time", elapsed_gpu_seconds (event) * 1000 * 1000);

    OCL_CHECK_ERROR (clReleaseEvent (event));
    OCL_CHECK_ERROR (clReleaseMemObject (src_buffer));
}
  681. int
  682. main (int argc, char const* argv[])
  683. {
  684. App app;
  685. if (argc > 1) {
  686. PAGE_SIZE = GPU_BUFFER_SIZE = atoi (argv[1]);
  687. }
  688. if (argc > 2) {
  689. NUM_PAGES = atoi (argv[2]);
  690. }
  691. printf ("** Parameters\n\n");
  692. printf ("%-16s %u\n", "Pages", NUM_PAGES);
  693. printf ("%-16s %zu B\n", "Buffer size", GPU_BUFFER_SIZE);
  694. if (!init_pcilib (&app))
  695. return 1;
  696. if (!init_opencl (&app))
  697. return 1;
  698. /*
  699. measure_fpga_to_gpu_latency_with_marker (&app);
  700. measure_fpga_to_gpu_latency_with_kernel (&app);
  701. measure_fpga_to_gpu_latency_with_cpu (&app);
  702. measure_fpga_to_cpu_latency (&app);
  703. measure_gpu_to_fpga_latency (&app);
  704. measure_fpga_to_gpu_throughput (&app);
  705. */
  706. measure_fpga_to_gpu_latency_master (&app);
  707. //measure_fpga_to_cpu_latency_master (&app);
  708. close_opencl (&app);
  709. close_pcilib (&app);
  710. return 0;
  711. }