#define _XOPEN_SOURCE   500

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <time.h>
#include <pcilib.h>
#include <pcilib/bar.h>
#include <pcilib/kmem.h>
#include <CL/cl.h>
#include <CL/cl_ext.h>
#include "ocl.h"

/* this should actually come from the distributed pcitool sources */
#include "pciDriver.h"

typedef struct {
    /* pcilib */
    pcilib_t *pci;
    uint8_t *bar;
    cl_ulong bar_phys;
    uint8_t board_gen;
    pcilib_kmem_handle_t *kdesc;
    uintptr_t kdesc_bus;
    volatile uint32_t *desc;

    /* OpenCL */
    OclPlatform *ocl;
    cl_device_id device;
    cl_command_queue queue;
    cl_context context;
    cl_program program;
    cl_kernel kernel;
    cl_mem check_buffer;

    /* both */
    cl_mem fpga_buffer;

    struct {
        cl_mem buffer;
        cl_bus_address_amd addr;
    } gpu;

    struct {
        pcilib_kmem_handle_t *kmem;
        uint32_t *buffer;
        uintptr_t addr;
    } cpu;
} App;

#define UNICODE_CHECK_MARK      "o"
#define UNICODE_CROSS           "x"

#define KMEM_DEFAULT_FLAGS      PCILIB_KMEM_FLAG_HARDWARE | \
                                PCILIB_KMEM_FLAG_PERSISTENT | \
                                PCILIB_KMEM_FLAG_EXCLUSIVE

#define KMEM_USE_RING           PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 1)
#define KMEM_USE_DEFAULT        PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 2)

#define REG_RESET_DMA                   0x00
#define REG_DMA                         0x04
#define REG_NUM_PACKETS_PER_DESCRIPTOR  0x10
#define REG_PERF_COUNTER                0x28
#define REG_PACKET_LENGTH               0x0C
#define REG_DESCRIPTOR_ADDRESS          0x50
#define REG_UPDATE_ADDRESS              0x58
#define REG_UPDATE_THRESHOLD            0x60

#define REG_COUNTER                     0x9000
#define REG_VERSION                     0x9020
#define REG_CONTROL                     0x9040
#define REG_LATENCY                     0x9044
#define REG_NUM_ROWS                    0x9168
#define REG_NUM_FRAMES                  0x9170

#define REG_DBG_RQ_RST                  0x9344
#define REG_DBG_CQ_RST                  0x93A4

#define CMD_DMA_START           0x1
#define CMD_DMA_STOP            0x0
#define CMD_COUNTER_RESET       0xf0
#define CMD_COUNTER_START       0x1
#define CMD_COUNTER_STOP        0x0
#define CMD_LATENCY_RESET       0x0800000f
#define CMD_LATENCY_STOP        0xf0000000

#define WR32(addr, value) *(uint32_t *) (app->bar + (addr)) = (value);
#define RD32(addr) (*(uint32_t *) (app->bar + (addr)))
#define WR32_sleep(addr, value) *(uint32_t *) (app->bar + (addr)) = (value); usleep (100);

#define WR64(addr, value) *(uint64_t *) (app->bar + (addr)) = (value);
#define RD64(addr) (*(uint64_t *) (app->bar + (addr)))
#define WR64_sleep(addr, value) *(uint64_t *) (app->bar + (addr)) = (value); usleep (100);


static clEnqueueMakeBuffersResidentAMD_fn clEnqueueMakeBuffersResidentAMD = NULL;
static clEnqueueWaitSignalAMD_fn clEnqueueWaitSignalAMD = NULL;

static const size_t     TLP_SIZE            = 64;
static const size_t     FPGA_BUFFER_SIZE    = 1024 * 64;
static uint32_t         PAGE_SIZE           = 4096;
static uint32_t         NUM_PAGES           = 1;
static size_t           GPU_BUFFER_SIZE     = 4096;
static size_t           CPU_BUFFER_SIZE     = 4096;
static size_t           CHECK_BUFFER_SIZE   = 8;


/* declaration should actually come from a distributed header file */
const pcilib_board_info_t *pcilib_get_board_info (pcilib_t *);


static bool
init_pcilib (App *app)
{
    static const char *DEVICE = "/dev/fpga0";
    const pcilib_board_info_t *board;

    app->pci = pcilib_open (DEVICE, "pci");

    if (app->pci == NULL) {
        printf ("Could not open `%s'", DEVICE);
        return false;
    }

    app->bar = pcilib_map_bar (app->pci, PCILIB_BAR0);

    if (app->bar == NULL) {
        printf ("Unable to map BAR\n");
        pcilib_close (app->pci);
        return false;
    }

    board = pcilib_get_board_info (app->pci);
    app->bar_phys = board->bar_start[PCILIB_BAR0];
    app->board_gen = RD32 (0x18) & 0xF;

    app->kdesc = pcilib_alloc_kernel_memory (app->pci,
                            PCILIB_KMEM_TYPE_CONSISTENT, 1, 128,
                            4096, KMEM_USE_RING, KMEM_DEFAULT_FLAGS);

    app->kdesc_bus = pcilib_kmem_get_block_ba (app->pci, app->kdesc, 0);

    app->desc = (uint32_t *) pcilib_kmem_get_block_ua (app->pci, app->kdesc, 0);

    memset ((uint32_t *) app->desc, 0, 5 * sizeof (uint32_t));

    printf ("%-16s 0x%X\n", "Firmware", RD32 (REG_VERSION));

    return true;
}

static void
close_pcilib (App *app)
{
    pcilib_kmem_flags_t flags = PCILIB_KMEM_FLAG_HARDWARE | PCILIB_KMEM_FLAG_PERSISTENT | PCILIB_KMEM_FLAG_EXCLUSIVE;

    pcilib_free_kernel_memory (app->pci, app->kdesc, KMEM_DEFAULT_FLAGS);
    pcilib_free_kernel_memory (app->pci, app->cpu.kmem, KMEM_DEFAULT_FLAGS);

    pcilib_clean_kernel_memory (app->pci, KMEM_USE_DEFAULT, flags);

    pcilib_unmap_bar (app->pci, PCILIB_BAR0, (void *) app->bar);
    pcilib_close (app->pci);
}

static cl_int
create_fpga_buffer (App *app, size_t size)
{
    cl_mem_flags flags;
    cl_bus_address_amd addr;
    cl_int error;

    flags = CL_MEM_EXTERNAL_PHYSICAL_AMD;
    addr.surface_bus_address = (cl_ulong) app->bar_phys;
    addr.marker_bus_address = (cl_ulong) app->bar_phys;

    app->fpga_buffer = clCreateBuffer (app->context, flags, size, &addr, &error);
    return error;
}

static cl_int
create_gpu_buffer (App *app, size_t size)
{
    cl_mem_flags flags;
    cl_int error;
    char *data;

    data = malloc (size);
    memset (data, 42, size);
    flags = CL_MEM_BUS_ADDRESSABLE_AMD | CL_MEM_COPY_HOST_PTR;

    app->gpu.buffer = clCreateBuffer (app->context, flags, size, data, &error);

    if (error != CL_SUCCESS)
        return error;

    return clEnqueueMakeBuffersResidentAMD (app->queue, 1, &app->gpu.buffer, CL_TRUE, &app->gpu.addr, 0, NULL, NULL);
}

static cl_int
create_cpu_buffer (App *app, size_t size)
{
    pcilib_kmem_flags_t flags = PCILIB_KMEM_FLAG_HARDWARE | PCILIB_KMEM_FLAG_PERSISTENT | PCILIB_KMEM_FLAG_EXCLUSIVE;

    app->cpu.kmem = pcilib_alloc_kernel_memory (app->pci,
                     /*PCILIB_KMEM_TYPE_DMA_C2S_PAGE*/ PCILIB_KMEM_TYPE_CONSISTENT,
                     1, size, CPU_BUFFER_SIZE, KMEM_USE_DEFAULT, flags);

    app->cpu.addr = pcilib_kmem_get_block_ba(app->pci, app->cpu.kmem, 0);
    app->cpu.buffer = (uint32_t*) pcilib_kmem_get_block_ua(app->pci, app->cpu.kmem, 0);

    memset (app->cpu.buffer, 42, CPU_BUFFER_SIZE);

    return 0;
}

static bool
init_opencl (App *app)
{
    cl_int error;
    cl_platform_id platform;

    app->ocl = ocl_new_with_queues (0, CL_DEVICE_TYPE_GPU, CL_QUEUE_PROFILING_ENABLE);
    platform = ocl_get_platform (app->ocl);

    clEnqueueMakeBuffersResidentAMD = clGetExtensionFunctionAddressForPlatform (platform, "clEnqueueMakeBuffersResidentAMD");
    clEnqueueWaitSignalAMD = clGetExtensionFunctionAddressForPlatform (platform, "clEnqueueWaitSignalAMD");

    app->device = ocl_get_devices (app->ocl)[0];
    app->queue = ocl_get_cmd_queues (app->ocl)[0];
    app->context = ocl_get_context (app->ocl);

    app->program = ocl_create_program_from_file (app->ocl, "kernel.cl", NULL, &error);
    OCL_CHECK_ERROR (error);

    app->kernel = clCreateKernel (app->program, "wait_and_write", &error);
    OCL_CHECK_ERROR (error);

    app->check_buffer = clCreateBuffer (app->context, CL_MEM_WRITE_ONLY, CHECK_BUFFER_SIZE, NULL, &error);
    OCL_CHECK_ERROR (error);

    error |= create_fpga_buffer (app, FPGA_BUFFER_SIZE);
    OCL_CHECK_ERROR (error);

    error |= create_gpu_buffer (app, GPU_BUFFER_SIZE);
    OCL_CHECK_ERROR (error);

    error |= create_cpu_buffer (app, CPU_BUFFER_SIZE);
    OCL_CHECK_ERROR (error);

    return error != CL_SUCCESS ? false : true;
}

static void
close_opencl (App *app)
{
    OCL_CHECK_ERROR (clReleaseKernel (app->kernel));
    OCL_CHECK_ERROR (clReleaseProgram (app->program));
    OCL_CHECK_ERROR (clReleaseMemObject (app->fpga_buffer));
    OCL_CHECK_ERROR (clReleaseMemObject (app->check_buffer));
    ocl_free (app->ocl);
}

static void
debug_assert (const char *message, bool condition)
{
    printf ("%-16s ", message);

    if (condition)
        printf (UNICODE_CHECK_MARK"\n");
    else
        printf (UNICODE_CROSS"\n");
}

static void
debug_assert_cmp (const char *message, uint32_t value, uint32_t expected)
{
    printf ("%-16s ", message);

    if (value != expected)
        printf (UNICODE_CROSS" [%x (%i) != %x (%i)]\n", value, value, expected, expected);
    else
        printf (UNICODE_CHECK_MARK"\n");
}

#if 0
static void
check_value (App *app, const char *message, uint32_t addr, uint32_t expected)
{
    uint32_t value;

    value = RD32 (addr);
    debug_assert_cmp (message, value, expected);
}
#endif

static void
configure_dma (App *app)
{
    uint32_t value;

    WR32 (REG_RESET_DMA, 1);
    usleep (100000);
    WR32 (REG_RESET_DMA, 0);
    usleep (100000);

    value = RD32 (REG_RESET_DMA);
    debug_assert ("PCIe check", value == 335746816 || value == 335681280);

    WR32 (REG_NUM_PACKETS_PER_DESCRIPTOR, NUM_PAGES * PAGE_SIZE / (4 * TLP_SIZE));

    if (app->board_gen == 3) {
        WR32 (REG_PACKET_LENGTH, 0x80000 | TLP_SIZE);
    }
    else {
        WR32 (REG_PACKET_LENGTH, TLP_SIZE);
    }

    /* reset host side addr */
    app->desc[app->board_gen == 3 ? 2 : 4] = 0;
}

static void
configure_dma_descriptors (App *app)
{
    WR32 (REG_UPDATE_THRESHOLD, 0);
    WR64 (REG_UPDATE_ADDRESS, app->kdesc_bus);
    usleep (100000);
}

static void
setup_counter (App *app)
{
    WR32_sleep (REG_NUM_ROWS, 0);
    WR32_sleep (REG_NUM_FRAMES, 0);
    WR32_sleep (REG_CONTROL, CMD_LATENCY_RESET);
    WR32_sleep (REG_CONTROL, CMD_COUNTER_STOP);
    WR32_sleep (REG_COUNTER, CMD_COUNTER_RESET);
    WR32_sleep (REG_COUNTER, CMD_COUNTER_START);
}

static void
stop_dma (App *app)
{
    WR32_sleep (REG_DMA, 0);
}

static void
reset_dbg (App *app)
{
    // TX
    WR32_sleep (REG_DBG_RQ_RST, 1);
    WR32_sleep (REG_DBG_RQ_RST, 0);
    // RX
    WR32_sleep (REG_DBG_CQ_RST, 1);
    WR32_sleep (REG_DBG_CQ_RST, 0);
}

static double
compute_debug_latency (App *app)
{
    uint64_t start, start_high, start_low;
    uint64_t end, end_high, end_low;

    WR32_sleep (0x93A0, 0);
    start_high = RD32 (0x93B8);
    start_low = RD32 (0x93BC);

    WR32_sleep (0x93A0,1)
    end_high = RD32 (0x93B8);
    end_low = RD32 (0x93BC);

    start = (start_high << 32) | start_low;
    end = (end_high << 32) | end_low;

    return ((end - start) * 4) / 1000.0;
}

static double
elapsed_seconds (struct timespec *start, struct timespec *end)
{
    return (end->tv_sec + end->tv_nsec / 1000000000.0) - (start->tv_sec + start->tv_nsec / 1000000000.0);
}

static double
elapsed_gpu_seconds (cl_event event)
{
    cl_ulong start;
    cl_ulong end;

    OCL_CHECK_ERROR (clGetEventProfilingInfo (event, CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &start, NULL));
    OCL_CHECK_ERROR (clGetEventProfilingInfo (event, CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &end, NULL));

    return (end - start) / 1000.0 / 1000.0 / 1000.0;
}

static void
check_data_transfer (App *app, size_t check_size)
{
    uint32_t *data;

    data = malloc (check_size);
    memset (data, 0, check_size);

    OCL_CHECK_ERROR (clEnqueueReadBuffer (app->queue, app->gpu.buffer, CL_TRUE, 0, check_size, data, 0, NULL, NULL));

    for (uint32_t i = 0; i < check_size / 4; i++) {
        if (data[i] != i) {
            debug_assert_cmp ("FPGA->GPU write", data[i], i);
            goto finish_check_data_transfer;
        }
    }

    debug_assert ("FPGA->GPU write", true);

finish_check_data_transfer:
    free (data);
}

static void
check_data_transfer_cpu (App *app, size_t check_size)
{
    for (uint32_t i = 0; i < check_size / 4; i++) {
        if (app->cpu.buffer[i] != i) {
            debug_assert_cmp ("FPGA->CPU write", app->cpu.buffer[i], i);
            return;
        }
    }

    debug_assert ("FPGA->CPU write", true);
}

static void
fill_gpu_buffer (App *app)
{
    uint32_t pattern;
    cl_event event;

    pattern = 42;
    OCL_CHECK_ERROR (clEnqueueFillBuffer (app->queue, app->gpu.buffer, &pattern, sizeof (pattern), 0, GPU_BUFFER_SIZE / 4, 0, NULL, &event));
    OCL_CHECK_ERROR (clWaitForEvents (1, &event));
    OCL_CHECK_ERROR (clReleaseEvent (event));
}

static void
measure_fpga_to_gpu_latency_with_marker (App *app)
{
    uint32_t counter;
    struct timespec start;
    struct timespec end;
    double host_latency;
    double debug_latency;
    cl_event event;

    printf ("\n** FPGA to GPU latency [marker]\n\n");
    fill_gpu_buffer (app);

    configure_dma (app);

    WR32 (REG_UPDATE_THRESHOLD, 0);
    WR64_sleep (REG_UPDATE_ADDRESS, app->gpu.addr.marker_bus_address);

    setup_counter (app);

    WR64_sleep (REG_DESCRIPTOR_ADDRESS, app->gpu.addr.surface_bus_address);

    reset_dbg (app);
    OCL_CHECK_ERROR (clEnqueueWaitSignalAMD (app->queue, app->gpu.buffer, 0xd0dad0da, 0, NULL, &event));

    clock_gettime (CLOCK_MONOTONIC, &start);
    WR32 (REG_DMA, 1);

    OCL_CHECK_ERROR (clWaitForEvents (1, &event));

    clock_gettime (CLOCK_MONOTONIC, &end);
    stop_dma (app);
    OCL_CHECK_ERROR (clReleaseEvent (event));

    counter = RD32 (REG_PERF_COUNTER);
    host_latency = elapsed_seconds (&start, &end);
    debug_latency = compute_debug_latency (app);

    check_data_transfer (app, PAGE_SIZE);

    printf ("\n%-16s %f us\n", "Wall time", host_latency * 1000.0 * 1000.0);
    printf ("%-16s %f us\n", "FPGA [counter]", ((counter << 8) * 4) / 1000.0);
    printf ("%-16s %f us\n", "FPGA [debug]", debug_latency);
    printf ("%-16s %.2f MB/s\n", "Throughput", GPU_BUFFER_SIZE / host_latency / 1024. / 1024.);
}

static void
measure_fpga_to_gpu_latency_with_kernel (App *app)
{
    uint32_t counter;
    struct timespec start;
    struct timespec end;
    double host_latency;
    double debug_latency;
    cl_event event;
    uint32_t check[CHECK_BUFFER_SIZE / 4];
    size_t work_size = 1;

    printf ("\n** FPGA to GPU latency [kernel]\n\n");
    fill_gpu_buffer (app);

    configure_dma (app);
    configure_dma_descriptors (app);
    setup_counter (app);

    WR64_sleep (REG_DESCRIPTOR_ADDRESS, app->gpu.addr.surface_bus_address);

    reset_dbg (app);

    OCL_CHECK_ERROR (clSetKernelArg (app->kernel, 0, sizeof (cl_mem), &app->gpu.buffer));
    OCL_CHECK_ERROR (clSetKernelArg (app->kernel, 1, sizeof (cl_mem), &app->fpga_buffer));
    OCL_CHECK_ERROR (clSetKernelArg (app->kernel, 2, sizeof (cl_mem), &app->check_buffer));

    OCL_CHECK_ERROR (clEnqueueNDRangeKernel (app->queue, app->kernel, 1,
                                             NULL, &work_size, NULL,
                                             0, NULL, &event));

    WR32 (REG_DMA, 1);
    clock_gettime (CLOCK_MONOTONIC, &start);

    clWaitForEvents (1, &event);

    clock_gettime (CLOCK_MONOTONIC, &end);
    OCL_CHECK_ERROR (clReleaseEvent (event));

    counter = RD32 (REG_PERF_COUNTER);
    host_latency = elapsed_seconds (&start, &end);
    debug_latency = compute_debug_latency (app);

    check_data_transfer (app, PAGE_SIZE);

    OCL_CHECK_ERROR (clEnqueueReadBuffer (app->queue, app->check_buffer, CL_TRUE,
                                          0, CHECK_BUFFER_SIZE, check, 0, NULL, NULL));

    debug_assert_cmp ("Data check", check[0], 1);

    printf ("\n%-16s %i\n", "Kernel count", check[1]);
    printf ("%-16s %f us\n", "Wall time", host_latency * 1000.0 * 1000.0);
    printf ("%-16s %f us\n", "FPGA [counter]", ((counter << 8) * 4) / 1000.0);
    printf ("%-16s %f us\n", "FPGA [debug]", debug_latency);
    printf ("%-16s %.2f MB/s\n", "Throughput", GPU_BUFFER_SIZE / host_latency / 1024. / 1024.);
}


static void
measure_fpga_to_gpu_latency_with_cpu (App *app)
{
    uint32_t hardware_ptr;
    uint32_t counter;
    unsigned flag_index;
    struct timespec start;
    struct timespec end;
    double host_latency;
    double debug_latency;

    printf ("\n** FPGA to GPU latency [CPU]\n\n");
    fill_gpu_buffer (app);

    hardware_ptr = 0;
    flag_index = app->board_gen == 3 ? 2 : 4;

    configure_dma (app);
    configure_dma_descriptors (app);
    setup_counter (app);

    WR64_sleep (REG_DESCRIPTOR_ADDRESS, app->gpu.addr.surface_bus_address);

    reset_dbg (app);

    clock_gettime (CLOCK_MONOTONIC, &start);
    WR32 (REG_DMA, 1);

    do {
        hardware_ptr = app->desc[flag_index];
    }
    while (hardware_ptr != (app->gpu.addr.surface_bus_address & 0xFFFFFFFF));

    clock_gettime (CLOCK_MONOTONIC, &end);
    stop_dma (app);

    counter = RD32 (REG_PERF_COUNTER);
    host_latency = elapsed_seconds (&start, &end);
    debug_latency = compute_debug_latency (app);

    check_data_transfer (app, PAGE_SIZE);

    printf ("\n%-16s %f us\n", "Wall time", host_latency * 1000.0 * 1000.0);
    printf ("%-16s %f us\n", "FPGA [counter]", ((counter << 8) * 4) / 1000.0);
    printf ("%-16s %f us\n", "FPGA [debug]", debug_latency);
    printf ("%-16s %.2f MB/s\n", "Throughput", CPU_BUFFER_SIZE / host_latency / 1024. / 1024.);
}

static void
measure_fpga_to_cpu_latency (App *app)
{
    uint32_t hardware_ptr;
    uint32_t counter;
    unsigned flag_index;
    struct timespec start;
    struct timespec end;
    double host_latency;
    double debug_latency;

    printf ("\n** FPGA to CPU latency\n\n");

    hardware_ptr = 0;
    flag_index = app->board_gen == 3 ? 2 : 4;

    configure_dma (app);
    configure_dma_descriptors (app);
    setup_counter (app);

    WR64_sleep (REG_DESCRIPTOR_ADDRESS, app->cpu.addr);

    reset_dbg (app);

    clock_gettime (CLOCK_MONOTONIC, &start);
    WR32 (REG_DMA, 1);

    do {
        hardware_ptr = app->desc[flag_index];
    }
    while (hardware_ptr != app->cpu.addr);

    clock_gettime (CLOCK_MONOTONIC, &end);
    stop_dma (app);

    // Data back to FPGA
    memcpy(app->bar + 0x9400, app->cpu.buffer, CPU_BUFFER_SIZE);

    counter = RD32 (REG_PERF_COUNTER);
    host_latency = elapsed_seconds (&start, &end);
    debug_latency = compute_debug_latency (app);

    check_data_transfer_cpu (app, PAGE_SIZE);

    printf ("\n%-16s %f us\n", "Wall time", host_latency * 1000.0 * 1000.0);
    printf ("%-16s %f us\n", "FPGA [counter]", ((counter << 8) * 4) / 1000.0);
    printf ("%-16s %f us\n", "FPGA [debug]", debug_latency);
    printf ("%-16s %.2f MB/s\n", "Throughput", CPU_BUFFER_SIZE / host_latency / 1024. / 1024.);
}

static void
measure_fpga_to_gpu_throughput (App *app)
{
    uint32_t current_ptr;
    uint32_t hardware_ptr;
    unsigned flag_index;
    size_t transferred;
    cl_ulong addr;
    double elapsed;
    struct timespec start;
    struct timespec end;

    printf ("\n** FPGA to GPU throughput\n\n");

    configure_dma (app);
    configure_dma_descriptors (app);
    setup_counter (app);

    current_ptr = 0;
    hardware_ptr = 0;
    transferred = 0;
    addr = app->gpu.addr.surface_bus_address;
    flag_index = app->board_gen == 3 ? 2 : 4;

    WR64_sleep (REG_DESCRIPTOR_ADDRESS, addr);

    clock_gettime (CLOCK_MONOTONIC, &start);
    WR32 (REG_DMA, 1);

    do {
        do {
            hardware_ptr = app->desc[flag_index];
        }
        while (hardware_ptr == current_ptr);    /* it should work to check against `addr` but it never does */

        addr += NUM_PAGES * PAGE_SIZE;
        transferred += NUM_PAGES * PAGE_SIZE;
        current_ptr = hardware_ptr;
        WR64 (REG_DESCRIPTOR_ADDRESS, addr);
    }
    while (transferred < GPU_BUFFER_SIZE);

    clock_gettime (CLOCK_MONOTONIC, &end);

    stop_dma (app);
    check_data_transfer (app, GPU_BUFFER_SIZE);

    elapsed = elapsed_seconds (&start, &end);

    printf ("\n%-16s %.2f MB/s [%3.5f us]\n", "Host", GPU_BUFFER_SIZE / elapsed / 1024. / 1024., elapsed * 1000 * 1000);
}

static void
measure_gpu_to_fpga_latency (App *app)
{
    cl_event event;
    cl_mem src_buffer;
    cl_int err;
    uint32_t pattern;
    struct timespec start;
    struct timespec end;

    printf ("\n** GPU  to FPGA latency\n\n");

    src_buffer = clCreateBuffer (app->context, CL_MEM_READ_ONLY, PAGE_SIZE, NULL, &err);
    OCL_CHECK_ERROR (err);

    pattern = 0xdeadf00d;
    OCL_CHECK_ERROR (clEnqueueFillBuffer (app->queue, src_buffer, &pattern, sizeof (pattern), 0, PAGE_SIZE, 0, NULL, &event));
    OCL_CHECK_ERROR (clWaitForEvents (1, &event));
    OCL_CHECK_ERROR (clReleaseEvent (event));

    clock_gettime (CLOCK_MONOTONIC, &start);

        OCL_CHECK_ERROR (clEnqueueCopyBuffer (app->queue, src_buffer, app->fpga_buffer, 0, 0x9400, PAGE_SIZE, 0, NULL, &event));
        OCL_CHECK_ERROR (clWaitForEvents (1, &event));

    clock_gettime (CLOCK_MONOTONIC, &end);

    printf ("%-16s %f us\n", "Wall time", elapsed_seconds (&start, &end) * 1000 * 1000);
    printf ("%-16s %f us\n", "GPU time", elapsed_gpu_seconds (event) * 1000 * 1000);

    OCL_CHECK_ERROR (clReleaseEvent (event));
    OCL_CHECK_ERROR (clReleaseMemObject (src_buffer));
}

int
main (int argc, char const* argv[])
{
    App app;

    if (argc > 1) {
        PAGE_SIZE = GPU_BUFFER_SIZE = atoi (argv[1]);
    }

    if (argc > 2) {
        NUM_PAGES = atoi (argv[2]);
    }

    printf ("** Parameters\n\n");
    printf ("%-16s %u\n", "Pages", NUM_PAGES);
    printf ("%-16s %zu B\n", "Buffer size", GPU_BUFFER_SIZE);

    if (!init_pcilib (&app))
        return 1;

    if (!init_opencl (&app))
        return 1;

    measure_fpga_to_gpu_latency_with_marker (&app);
    measure_fpga_to_gpu_latency_with_kernel (&app);
    measure_fpga_to_gpu_latency_with_cpu (&app);
    measure_fpga_to_cpu_latency (&app);
    measure_gpu_to_fpga_latency (&app);
    measure_fpga_to_gpu_throughput (&app);

    close_opencl (&app);
    close_pcilib (&app);
    return 0;
}