Selaa lähdekoodia

Initial commit

mathiasb 6 vuotta sitten
commit
545aa821e0

+ 67 - 0
CMakeLists.txt

@@ -0,0 +1,67 @@
+# File to generate the makefile.
+# Uses src, include and build folders.
+
+project(GPUFirstComm)
+
+cmake_minimum_required(VERSION 2.6)
+find_package(CUDA REQUIRED)
+
+include_directories(include)
+
+
+set(CUDA_NVCC_FLAGS
+  ${CUDA_NVCC_FLAGS};
+  -Wno-deprecated-gpu-targets;
+  -gencode arch=compute_60,code=sm_60;
+  -gencode arch=compute_35,code=sm_35
+  )#;-rdc=true)
+
+cuda_add_executable(floats_gpu-fpga
+  src/floats_gpu-fpga.cu
+  src/common.cu
+  src/kernels.cu
+  )
+
+cuda_add_executable(gpu-fpga
+  src/gpu-fpga.cu
+  src/common.cu
+  src/kernels.cu
+  )
+
+cuda_add_executable(matrix_gpu-fpga
+  src/matrix_gpu-fpga.cu
+  src/common.cu
+  src/kernels.cu
+  )
+
+cuda_add_executable(two_steps_dma
+  src/two_steps_dma.cu
+  src/common.cu
+  src/kernels.cu
+  )
+
+cuda_add_executable(cpu-fpga
+  src/cpu-fpga.cu
+  src/common.cu
+  src/kernels.cu
+  )
+
+cuda_add_executable(multi-gpu
+  src/multi-gpu.cu
+  src/common.cu
+  src/kernels.cu
+  )
+
+cuda_add_executable(loaded
+  src/loaded.cu
+  src/common.cu
+  src/kernels.cu
+  )
+
+target_link_libraries(floats_gpu-fpga cuda pcilib gdrapi)
+target_link_libraries(gpu-fpga cuda pcilib gdrapi)
+target_link_libraries(matrix_gpu-fpga cuda pcilib gdrapi)
+target_link_libraries(two_steps_dma cuda pcilib gdrapi)
+target_link_libraries(cpu-fpga cuda pcilib gdrapi)
+target_link_libraries(multi-gpu cuda pcilib gdrapi)
+target_link_libraries(loaded cuda pcilib gdrapi pthread)

+ 16 - 0
build/cpu-fpga.sh

@@ -0,0 +1,16 @@
+#!/bin/bash
+
+ITERATION=$1
+
+echo 4096 > cpu-fpga.csv
+for i in `seq 1 $ITERATION`;
+do
+    echo $i
+   ./cpu-fpga 4096 
+   echo , >> cpu-fpga.csv
+done
+
+echo \ >> cpu-fpga.csv
+
+cat cpu-fpga.csv | tr -d \\n > cpu-fpga.export.csv
+sed -i 's/4096/'4096\\n'/' cpu-fpga.export.csv

+ 25 - 0
build/gpu-fpga.sh

@@ -0,0 +1,25 @@
+#!/bin/bash
+
+ITERATION=$1
+
+echo P100 > gpu-fpga.csv
+# for i in `seq 1 $ITERATION`;
+# do
+#     echo $i
+#     CUDA_VISIBLE_DEVICES=0 ./gpu-fpga 4096 
+#     echo , >> gpu-fpga.csv
+# done
+
+echo \ >> gpu-fpga.csv
+
+echo K40 >> gpu-fpga.csv
+for i in `seq 1 $ITERATION`;
+do
+   echo $i
+   CUDA_VISIBLE_DEVICES=0 ./gpu-fpga 4096
+   echo , >> gpu-fpga.csv
+done
+
+cat gpu-fpga.csv | tr -d \\n > gpu-fpga.export.csv
+sed -i 's/P100/'P100\\n'/' gpu-fpga.export.csv
+sed -i 's/K40/'\\nK40\\n'/' gpu-fpga.export.csv

+ 25 - 0
build/loaded.sh

@@ -0,0 +1,25 @@
+#!/bin/bash
+
+ITERATION=$1
+
+echo P100 > loaded.csv
+for i in `seq 1 $ITERATION`;
+do
+    echo $i
+    CUDA_VISIBLE_DEVICES=0 ./loaded 4096 
+    echo , >> loaded.csv
+done
+
+echo \ >> loaded.csv
+
+echo K40 >> loaded.csv
+for i in `seq 1 $ITERATION`;
+do
+   echo $i
+   CUDA_VISIBLE_DEVICES=1 ./loaded 4096
+   echo , >> loaded.csv
+done
+
+cat loaded.csv | tr -d \\n > loaded.export.csv
+sed -i 's/P100/'P100\\n'/' loaded.export.csv
+sed -i 's/K40/'\\nK40\\n'/' loaded.export.csv

+ 14 - 0
build/multi-gpu.sh

@@ -0,0 +1,14 @@
+#!/bin/bash
+
+ITERATION=$1
+
+echo P100+K40 > multi-gpu.csv
+for i in `seq 1 $ITERATION`;
+do
+    echo $i
+    ./multi-gpu 4096
+    echo , >> multi-gpu.csv
+done
+
+cat multi-gpu.csv | tr -d \\n > multi-gpu.export.csv
+sed -i 's/K40/'K40\\n'/' multi-gpu.export.csv

+ 9 - 0
build/reload_cmake.sh

@@ -0,0 +1,9 @@
+#!/bin/bash
+
+make clean
+rm -rf CMakeFiles
+rm -f CMakeCache.txt
+rm -f cmake_install.cmake
+rm -f Makefile
+
+CC=clang CXX=clang cmake ..

+ 25 - 0
build/two_steps_dma.sh

@@ -0,0 +1,25 @@
+#!/bin/bash
+
+ITERATION=$1
+
+echo P100 > two_steps_dma.csv
+# for i in `seq 1 $ITERATION`;
+# do
+#     echo $i
+#     CUDA_VISIBLE_DEVICES=0 ./two_steps_dma 4096
+#     echo , >> two_steps_dma.csv
+# done
+
+echo \ >> two_steps_dma.csv
+
+echo K40 >> two_steps_dma.csv
+for i in `seq 1 $ITERATION`;
+do
+    echo $i
+    CUDA_VISIBLE_DEVICES=0 ./two_steps_dma 4096
+    echo , >> two_steps_dma.csv
+done
+
+cat two_steps_dma.csv | tr -d \\n > two_steps_dma.export.csv
+sed -i 's/P100/'P100\\n'/' two_steps_dma.export.csv
+sed -i 's/K40/'\\nK40\\n'/' two_steps_dma.export.csv

+ 41 - 0
include/common.h

@@ -0,0 +1,41 @@
+#ifndef _COMMON_H_
+#define _COMMON_H_
+
+#include "cuda.h"
+#include "cuda_runtime_api.h"
+
+typedef struct matrix_t{
+    float* elements;
+    uint rows;
+    uint columns;
+    uint stride;
+} matrix;
+
+#define BLOCK_SIZE 16
+#define ASSERT_FAIL 0
+#define ASSERT_SUCCESS 1
+#define MATRIX_ROW_SIZE BLOCK_SIZE*16*1
+#define MATRIX_VALUE 2
+
+#define assert_cuda( err_id ) __assert_cuda(err_id,__FILE__,__LINE__)
+#define assert_cu( err_id ) __assert_cu(err_id,__FILE__,__LINE__)
+#define assert_gdr( err_id ) __assert_gdr(err_id,__FILE__,__LINE__)
+
+
+void __assert_cuda(cudaError_t err_id, const char* file, int line); /* for runtime api*/
+void __assert_cu(CUresult res_id, const char* file, int line); /* for driver api */
+void __assert_gdr(int gdr_id, const char* file, int line);
+void init_to_send(const void* dataPtr, size_t size, size_t nmemb);
+bool check_array(float* array, float value, size_t size);
+void deviceInformation(int device);
+void cpu_fill_array(float* array, float value, size_t size);
+void cpu_fill_array_random(float* array, size_t size);
+float cpu_average(float* array, size_t size);
+float cpu_dispersion(float* array, float average, size_t size);
+matrix identity_matrix(size_t row_size);
+void check_identity_matrix(matrix M);
+void check_matrix(matrix M, int value);
+void mult_matrix(matrix A, matrix B, matrix R);
+void fill_matrix_random(matrix M);
+
+#endif

+ 119 - 0
include/gdrapi.h

@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in 
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __GDRAPI_H__
+#define __GDRAPI_H__
+
+#include <inttypes.h>
+#include <stddef.h>
+
+#define GPU_PAGE_SHIFT   16
+#define GPU_PAGE_SIZE    ((unsigned long)1 << GPU_PAGE_SHIFT)
+#define GPU_PAGE_OFFSET  (GPU_PAGE_SIZE-1)
+#define GPU_PAGE_MASK    (~GPU_PAGE_OFFSET)
+
+/*
+ * GDRCopy, a low-latency GPU memory copy library (and a kernel-mode
+ * driver) based on NVIDIA GPUDirect RDMA technology.
+ *
+ * supported environment variables:
+ *
+ * - GDRCOPY_ENABLE_LOGGING, if defined logging is enabled, default is
+ *   disabled.
+ *
+ * - GDRCOPY_LOG_LEVEL, overrides log threshold, default is to print errors
+ *   only.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Initialize the library, e.g. by opening a connection to the kernel-mode
+// driver. Returns a handle to the library state object.
+struct gdr;
+typedef struct gdr *gdr_t;
+gdr_t gdr_open();
+
+// Destroy library state object, e.g. it closes the connection to kernel-mode
+// driver.
+//
+// Note that although BAR mappings of GPU memory are destroyed, user-space
+// mappings are not. Therefore user code is responsible for calling
+// gdr_unmap on all mappings before calling gdr_close.
+int gdr_close(gdr_t g);
+
+// Map device memory buffer on GPU BAR1, returning a handle.
+// Memory is still not accessible to user-space.
+typedef uint32_t gdr_mh_t;
+int gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle);
+
+// Unmap the handle. 
+//
+// If there exists a corresponding user-space mapping, gdr_unmap should be
+// called before this one.
+int gdr_unpin_buffer(gdr_t g, gdr_mh_t handle);
+
+// flag is set when the kernel callback (relative to the
+// nvidia_p2p_get_pages) gets invoked, e.g. cuMemFree() before
+// gdr_unpin_buffer.
+int gdr_get_callback_flag(gdr_t g, gdr_mh_t handle, int *flag);
+
+// After pinning, info struct contains details of the mapped area.  
+//
+// Note that both info->va and info->mapped_size might be different from
+// the original address passed to gdr_pin_buffer due to alignment performed
+// by the kernel-mode driver.
+struct gdr_info {
+    uint64_t va;
+    uint64_t mapped_size;
+    uint32_t page_size;
+    uint64_t tm_cycles;
+    uint32_t cycles_per_ms;
+    uint64_t bus_addr; 
+};
+typedef struct gdr_info gdr_info_t;
+int gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info);
+
+// create a user-space mapping for the BAR1 info, length is bar1->size
+// above.
+//
+// WARNING: the BAR physical address will be aligned to the page size
+// before being mapped in user-space, so the pointer returned might be
+// affected by an offset. gdr_get_info can be used to calculate that
+// offset.
+int gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size);
+
+// get rid of a user-space mapping.
+// First invoke gdr_unmap() then gdr_unpin_buffer().
+int gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size);
+
+// gpubar_ptr is a user-space virtual address, i.e. one returned by gdr_map()
+int gdr_copy_to_bar(void  *gpubar_ptr, const void *cpumem_ptr, size_t size);
+int gdr_copy_from_bar(void *cpumem_ptr, const void *gpubar_ptr, size_t size);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __GDRAPI_H__

+ 13 - 0
include/gdrconfig.h

@@ -0,0 +1,13 @@
+#pragma once
+
+#if defined __GNUC__
+#if defined(__powerpc__)
+#define GDRAPI_POWER
+#elif defined(__i386__) || defined(__x86_64__) || defined(__X86__)
+#define GDRAPI_X86
+#else
+#error "architecture is not supported"
+#endif // arch
+#else
+#error "compiler not supported"
+#endif // __GNUC__

+ 86 - 0
include/gdrdrv.h

@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in 
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __GDR_DRV_H__
+#define __GDR_DRV_H__
+
+#define GDRDRV_IOCTL                 0xDA
+
+typedef __u32 gdr_hnd_t;
+#define GDR_HANDLE_MASK ((1UL<<32)-1)
+
+//-----------
+
+struct GDRDRV_IOC_PIN_BUFFER_PARAMS
+{
+    // in
+    __u64 addr;
+    __u64 size;
+    __u64 p2p_token;
+    __u32 va_space;
+    // out
+    gdr_hnd_t handle;
+};
+
+#define GDRDRV_IOC_PIN_BUFFER _IOWR(GDRDRV_IOCTL, 1, struct GDRDRV_IOC_PIN_BUFFER_PARAMS)
+
+//-----------
+
+struct GDRDRV_IOC_UNPIN_BUFFER_PARAMS
+{
+    // in
+    gdr_hnd_t handle;
+};
+
+#define GDRDRV_IOC_UNPIN_BUFFER _IOWR(GDRDRV_IOCTL, 2, struct GDRDRV_IOC_UNPIN_BUFFER_PARAMS *)
+
+//-----------
+
+struct GDRDRV_IOC_GET_CB_FLAG_PARAMS
+{
+    // in
+    gdr_hnd_t handle;
+    // out
+    __u32 flag;
+};
+
+#define GDRDRV_IOC_GET_CB_FLAG _IOWR(GDRDRV_IOCTL, 3, struct GDRDRV_IOC_GET_CB_FLAG_PARAMS *)
+
+//-----------
+
+struct GDRDRV_IOC_GET_INFO_PARAMS
+{
+    // in
+    gdr_hnd_t handle;
+    // out
+    __u64 va;
+    __u64 mapped_size;
+    __u32 page_size;
+    __u32 tsc_khz;
+    __u64 tm_cycles;
+};
+
+#define GDRDRV_IOC_GET_INFO _IOWR(GDRDRV_IOCTL, 4, struct GDRDRV_IOC_GET_INFO_PARAMS *)
+
+//-----------
+
+#endif // __GDR_DRV_H__

+ 32 - 0
include/ipedma.h

@@ -0,0 +1,32 @@
+#define REG_RESET_DMA                   0x00
+#define REG_DMA                         0x04
+#define REG_NUM_PACKETS_PER_DESCRIPTOR  0x10
+#define REG_PERF_COUNTER		0x20
+//#define REG_PERF_COUNTER                0x28
+#define REG_PACKET_LENGTH               0x0C
+#define REG_DESCRIPTOR_ADDRESS          0x50
+#define REG_UPDATE_ADDRESS              0x58
+#define REG_UPDATE_THRESHOLD            0x60
+#define REG_UPDATE_COUNTER		0x70
+#define REG_TIMER_THRESHOLD          0x64
+
+#define REG_INTERCONNECT		0x9048
+#define REG_COUNTER                     0x9000
+
+
+#define DESCRIPTOR_OFFSET		256
+
+#define REG_CONTROL                  0x9040
+#define CONTROL_ENABLE_READ          0x00000400
+#define CONTROL_ENABLE_MULTI_READ    0x00000800
+#define CONTROL_SOURCE_RX_FIFO       0x00010000
+
+
+#define WR32(addr, value) *(volatile uint32_t *) (((char*)(pciVa)) + (addr)) = (value);
+#define RD32(addr) (*(volatile uint32_t *) (((char*)(pciVa)) + (addr)))
+#define WR32_sleep(addr, value) *(volatile uint32_t *) (((char*)(pciVa)) + (addr)) = (value); usleep (100);
+
+#define WR64(addr, value) *(volatile uint64_t *) (((char*)(pciVa)) + (addr)) = (value);
+#define RD64(addr) (*(volatile uint64_t *) (((char*)(pciVa)) + (addr)))
+#define WR64_sleep(addr, value) *(uint64_t *) (((char*)(pciVa)) + (addr)) = (value); usleep (100);
+

+ 24 - 0
include/kernels.h

@@ -0,0 +1,24 @@
+#include "cuda.h"
+#include "cuda_runtime_api.h"
+#include "gdrapi.h"
+#include "common.h"
+
+__device__ void add_two_device(CUdeviceptr number);
+__global__ void add_three_global(CUdeviceptr number);
+__global__ void add_one_global(CUdeviceptr number);
+__global__ void fill_array(int* array, int value, size_t size);
+__global__ void fill_float_array(float* array, float value, size_t size);
+extern __shared__ float _temp[];
+__global__ void get_reduction(float* array, float* output, size_t size, size_t stride);
+__global__ void get_average_reduction(float* array, float* average, size_t size, size_t stride, size_t divider);
+__global__ void get_dispersion_reduction(float* array, float* average, float* output, size_t size, size_t stride, size_t divider);
+void get_average(float* array, float* average, size_t size,dim3 blocks_per_grid, dim3 threads_per_block);
+void get_dispersion(float* array, float* average, float* dispersion, size_t size, dim3 blocks_per_grid, dim3 threads_per_block);
+__global__ void fill_matrix(int* M,int value);
+__global__ void add_matrix(int* A,int* B,int* C);
+__global__ void fill_matrix2(int* M,int value,size_t flat_size);
+__global__ void add_matrix2(int* A,int* B,int* C,size_t flat_size);
+__global__ void add_matrix_mod(int* B,int* C);
+__global__ void kern_identity_matrix(matrix A);
+__global__ void kern_mult_matrix_naive(matrix A, matrix B, matrix R);
+__global__ void kern_mult_matrix_shared(matrix A, matrix B, matrix R);

+ 11 - 0
launch.sh

@@ -0,0 +1,11 @@
+#!/bin/bash
+echo "Resetting the FPGA"
+/usr/local/bin/pci -w 0 1
+/usr/local/bin/pci -w 0 0
+/usr/local/bin/pci -w 9040 F
+/usr/local/bin/pci -w 9040 0
+sleep 1
+echo "Putting the data on the FPGA. WARNING, hardcoded size"
+#ddrio -v -i /home/mathiasb/sources/gpuFirstComm/to_send -s 0
+#ddrio -v -o /home/mathiasb/sources/gpuFirstComm/written -s 5
+ddrio -v -i /home/mathiasb/sources/benchmarking/to_send -s 0

+ 107 - 0
saves/array_operate.cu

@@ -0,0 +1,107 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "common.h"
+#include <unistd.h>
+#include <sys/time.h>
+#include <math.h>
+#include "kernels.h"
+
+#define ARRAY_SIZE 224*128*16*2
+#define VALUE 2.
+
+int main(int argc, char* argv[])
+{
+    assert_cuda( cudaSetDevice(1) );
+    cudaDeviceProp prop;
+    assert_cuda( cudaGetDeviceProperties(&prop,0) );
+    deviceInformation(0);
+    printf("%s\n\n",prop.name);
+
+    cudaError_t err;
+    float input[ARRAY_SIZE];
+    dim3 blocks_per_grid(224,1,1);
+    dim3 threads_per_block(128,1,1);
+    float* dev_array;
+
+    cudaEvent_t start,stop;
+    float ms;
+    assert_cuda( cudaEventCreate(&start) );
+    assert_cuda( cudaEventCreate(&stop) );
+
+    assert_cuda( cudaMalloc((void**)&dev_array,ARRAY_SIZE*sizeof(float)) );
+
+    assert_cuda( cudaEventRecord(start) );
+    fill_float_array<<< blocks_per_grid,threads_per_block >>>(dev_array,VALUE,ARRAY_SIZE);
+    err = cudaGetLastError();
+    printf("%s: %s\n",cudaGetErrorName(err),cudaGetErrorString(err));
+
+    assert_cuda( cudaMemcpy(input,dev_array,ARRAY_SIZE*sizeof(float),cudaMemcpyDeviceToHost) );
+
+    float average=0.;
+    float* dev_average;
+    assert_cuda( cudaMalloc((void**)&dev_average,sizeof(float)) );
+    /* blocks_per_grid.x=16*14; */
+    /* threads_per_block.x=128; */
+
+    assert_cuda( cudaMemcpy(dev_array,input,ARRAY_SIZE*sizeof(float),cudaMemcpyHostToDevice) );
+
+    for(int k=0;k<ITERATION;k++)
+    {
+	assert_cuda( cudaEventRecord(start) );
+	get_average(dev_array,dev_average,ARRAY_SIZE,blocks_per_grid,threads_per_block);
+	err = cudaGetLastError();
+	printf("%s: %s\n",cudaGetErrorName(err),cudaGetErrorString(err));
+	assert_cuda( cudaDeviceSynchronize() );
+	assert_cuda( cudaEventRecord(stop) );
+	assert_cuda( cudaEventSynchronize(stop) );
+	assert_cuda( cudaEventElapsedTime(&ms,start,stop) );
+	assert_cuda( cudaMemcpy(&average,dev_average,sizeof(float),cudaMemcpyDeviceToHost) );
+	printf("avg_gpu = %f\n",average);
+	printf("Elapsed time: %f ms\n",ms);
+    }
+
+
+    float dispersion;
+    float* dev_dispersion;
+    assert_cuda( cudaMalloc((void**)&dev_dispersion,sizeof(float)) );
+
+    for(int k=0;k<ITERATION;k++)
+    {
+	assert_cuda( cudaEventRecord(start) );
+	get_dispersion(dev_array,dev_average,dev_dispersion,ARRAY_SIZE,blocks_per_grid,threads_per_block);
+	err = cudaGetLastError();
+	printf("%s: %s\n",cudaGetErrorName(err),cudaGetErrorString(err));
+	assert_cuda( cudaDeviceSynchronize() );
+	assert_cuda( cudaEventRecord(stop) );
+	assert_cuda( cudaEventSynchronize(stop) );
+	assert_cuda( cudaEventElapsedTime(&ms,start,stop) );
+	assert_cuda( cudaMemcpy(&average,dev_average,sizeof(float),cudaMemcpyDeviceToHost) );
+	assert_cuda( cudaMemcpy(&dispersion,dev_dispersion,sizeof(float),cudaMemcpyDeviceToHost) );
+	printf("avg_gpu = %f\n",average);
+	printf("disp_gpu = %f\n",dispersion);
+	printf("Elapsed time: %f ms\n",ms);
+    }
+
+
+    struct timeval t1,t2;
+    float time;
+
+    for(int k=0;k<ITERATION;k++)
+    {
+	gettimeofday(&t1,0);
+	average = cpu_average(input,ARRAY_SIZE);
+//	dispersion = cpu_dispersion(input,average,ARRAY_SIZE);
+	gettimeofday(&t2,0);
+	time = t2.tv_usec - t1.tv_usec;
+	printf("avg_cpu = %f\n",average);
+	printf("disp_cpu = %f\n",dispersion);
+	printf("Elapsed time: %f us\n",time);
+    }
+
+    assert_cuda( cudaEventDestroy(start) );
+    assert_cuda( cudaEventDestroy(stop) );
+    assert_cuda( cudaFree(dev_average) );
+    assert_cuda( cudaFree(dev_dispersion) );
+    assert_cuda( cudaFree(dev_array) );
+    exit( EXIT_SUCCESS );
+}

+ 124 - 0
saves/gdrapi.h.save

@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in 
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __GDRAPI_H__
+#define __GDRAPI_H__
+
+#include <stdint.h> // for standard [u]intX_t types
+#include <stddef.h>
+
+#define GDR_API_MAJOR_VERSION    1
+#define GDR_API_MINOR_VERSION    2
+#define GDR_API_VERSION          ((GDR_API_MAJOR_VERSION << 16) | GDR_API_MINOR_VERSION)
+
+
+#define GPU_PAGE_SHIFT   16
+#define GPU_PAGE_SIZE    (1UL << GPU_PAGE_SHIFT)
+#define GPU_PAGE_OFFSET  (GPU_PAGE_SIZE-1)
+#define GPU_PAGE_MASK    (~GPU_PAGE_OFFSET)
+
+/*
+ * GDRCopy, a low-latency GPU memory copy library (and a kernel-mode
+ * driver) based on NVIDIA GPUDirect RDMA technology.
+ *
+ * supported environment variables:
+ *
+ * - GDRCOPY_ENABLE_LOGGING, if defined logging is enabled, default is
+ *   disabled.
+ *
+ * - GDRCOPY_LOG_LEVEL, overrides log threshold, default is to print errors
+ *   only.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct gdr;
+typedef struct gdr *gdr_t;
+
+// Initialize the library, e.g. by opening a connection to the kernel-mode
+// driver. Returns a handle to the library state object.
+gdr_t gdr_open();
+
+// Destroy library state object, e.g. it closes the connection to kernel-mode
+// driver.
+//
+// Note that although BAR mappings of GPU memory are destroyed, user-space
+// mappings are not. Therefore user code is responsible for calling
+// gdr_unmap on all mappings before calling gdr_close.
+int gdr_close(gdr_t g);
+
+// Map device memory buffer on GPU BAR1, returning a handle.
+// Memory is still not accessible to user-space.
+typedef uint32_t gdr_mh_t;
+int gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle);
+
+// Unmap the handle. 
+//
+// If there exists a corresponding user-space mapping, gdr_unmap should be
+// called before this one.
+int gdr_unpin_buffer(gdr_t g, gdr_mh_t handle);
+
+// flag is set when the kernel callback (relative to the
+// nvidia_p2p_get_pages) gets invoked, e.g. cuMemFree() before
+// gdr_unpin_buffer.
+int gdr_get_callback_flag(gdr_t g, gdr_mh_t handle, int *flag);
+
+// After pinning, info struct contains details of the mapped area.  
+//
+// Note that both info->va and info->mapped_size might be different from
+// the original address passed to gdr_pin_buffer due to alignment performed
+// by the kernel-mode driver.
+struct gdr_info {
+    uint64_t va;
+    uint64_t mapped_size;
+    uint32_t page_size;
+    uint64_t tm_cycles;
+    uint32_t cycles_per_ms;
+};
+typedef struct gdr_info gdr_info_t;
+int gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info);
+
+// create a user-space mapping for the BAR1 info, length is bar1->size
+// above.
+//
+// WARNING: the BAR physical address will be aligned to the page size
+// before being mapped in user-space, so the pointer returned might be
+// affected by an offset. gdr_get_info can be used to calculate that
+// offset.
+int gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size);
+
+// get rid of a user-space mapping.
+// First invoke gdr_unmap() then gdr_unpin_buffer().
+int gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size);
+
+// gpubar_ptr is a user-space virtual address, i.e. one returned by gdr_map()
+int gdr_copy_to_bar(void  *gpubar_ptr, const void *cpumem_ptr, size_t size);
+int gdr_copy_from_bar(void *cpumem_ptr, const void *gpubar_ptr, size_t size);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __GDRAPI_H__

+ 178 - 0
saves/kernels.save.cu

@@ -0,0 +1,178 @@
+/* This file contains the kernels i.e. the functions to be executed on the GPU */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "kernels.h"
+#include "common.h"
+
+__device__
+void add_two_device(CUdeviceptr number)
+{
+    (* (int*) number)+=2;
+}
+
+__global__
+void add_three_global(CUdeviceptr number)
+{
+    (* (int*) number)++;
+    add_two_device(number);
+}
+
+__global__
+void add_one_global(CUdeviceptr number)
+{
+    (* (int*) number)++;
+}
+
+__global__
+void fill_array(int* array, int value, size_t size)
+{
+    uint x = (blockIdx.x*blockDim.x)+threadIdx.x;
+    uint step = blockDim.x*gridDim.x;
+    for(int i=0;x+i<size;i+=step)
+	array[x+i] = value;
+}
+
+__global__
+void fill_float_array(float* array, float value, size_t size)
+{
+    uint x = (blockIdx.x*blockDim.x)+threadIdx.x;
+    uint step = blockDim.x*gridDim.x;
+    for(int i=0;x+i<size;i+=step)
+	array[x+i] = value;
+}
+
+
+extern __shared__ float _temp[];
+__global__ 
+void get_reduction(float* array, float* output, size_t size, size_t stride)
+{
+    float* temp = (float*)_temp;
+    uint x = (blockIdx.x*blockDim.x)+threadIdx.x;
+    if(x<size)
+	temp[threadIdx.x]=array[x];
+    __syncthreads();
+    for(uint s=1;s<blockDim.x;s*=2)
+    {
+	x = threadIdx.x*2*s;
+	if(x+s < blockDim.x)
+	    temp[x]+=temp[x+s];
+	__syncthreads();
+    }
+    if(threadIdx.x==0)
+    	output[blockIdx.x+stride*gridDim.x]=temp[0];
+}
+__global__ 
+void get_average_reduction(float* array, float* average, size_t size, size_t stride, size_t divider)
+{
+    float* temp = (float*)_temp;
+    uint x = (blockIdx.x*blockDim.x)+threadIdx.x;
+    if(x<size)
+	temp[threadIdx.x]=array[x];
+    __syncthreads();
+    for(uint s=1;s<blockDim.x;s*=2)
+    {
+	x = threadIdx.x*2*s;
+	if(x+s < blockDim.x)
+	    temp[x]+=temp[x+s];
+	__syncthreads();
+    }
+    if(threadIdx.x==0)
+    	*average=temp[0]/divider;
+}
+__global__ 
+void get_dispersion_reduction(float* array, float* average, float* output, size_t size, size_t stride, size_t divider)
+{
+    float* temp = (float*)_temp;
+    uint x = (blockIdx.x*blockDim.x)+threadIdx.x;
+    if(x<size)
+	temp[threadIdx.x]=(array[x]-*average) * (array[x]-*average);
+    __syncthreads();
+    for(uint s=1;s<blockDim.x;s*=2)
+    {
+	x = threadIdx.x*2*s;
+	if(x+s < blockDim.x)
+	    temp[x]+=temp[x+s];
+	__syncthreads();
+    }
+    if(threadIdx.x==0)
+    	output[blockIdx.x+stride*gridDim.x]=temp[0]/divider;
+}
+
+void get_average(float* array, float* average, size_t size,dim3 blocks_per_grid, dim3 threads_per_block)
+{
+    float *output,*extend;
+    size_t stride = size/(blocks_per_grid.x*threads_per_block.x);
+    size_t output_size = stride*blocks_per_grid.x;
+    size_t extend_size = stride;
+    assert_cuda( cudaMalloc((void**)&output,output_size*sizeof(float)) );
+    assert_cuda( cudaMalloc((void**)&extend,extend_size*sizeof(float)) );
+    stride=0;
+    while(stride*blocks_per_grid.x*threads_per_block.x < size)
+    {
+	get_reduction
+	    <<< blocks_per_grid,threads_per_block,threads_per_block.x*sizeof(float)  >>>
+	    (array+stride*blocks_per_grid.x*threads_per_block.x,
+	     output,
+	     blocks_per_grid.x*threads_per_block.x,
+	     stride);
+	stride++;
+    }
+    assert_cuda( cudaDeviceSynchronize() );
+    get_reduction
+	<<< extend_size,blocks_per_grid,blocks_per_grid.x*sizeof(float)  >>>
+	(output,
+	 extend,
+	 output_size,
+	 0);
+    assert_cuda( cudaDeviceSynchronize() );
+    get_average_reduction
+	<<< 1,extend_size,extend_size*sizeof(float)  >>>
+	(extend,
+	 average,
+	 extend_size,//blocks_per_grid.x,
+	 0,
+	 size);
+    assert_cuda( cudaFree(output) );
+    assert_cuda( cudaFree(extend) );
+}
+
+void get_dispersion(float* array, float* average, float* dispersion, size_t size, dim3 blocks_per_grid, dim3 threads_per_block)
+{
+    get_average(array,average,size,blocks_per_grid,threads_per_block);
+    float *output,*extend;
+    size_t stride = size/(blocks_per_grid.x*threads_per_block.x);
+    size_t output_size = stride*blocks_per_grid.x;
+    size_t extend_size = stride;
+    assert_cuda( cudaMalloc((void**)&output,output_size*sizeof(float)) );
+    assert_cuda( cudaMalloc((void**)&extend,extend_size*sizeof(float)) );
+    stride=0;
+    while(stride*blocks_per_grid.x*threads_per_block.x < size)
+    {
+	get_dispersion_reduction
+	    <<< blocks_per_grid,threads_per_block,threads_per_block.x*sizeof(float)  >>>
+	    (array+stride*blocks_per_grid.x*threads_per_block.x,
+	     average,
+	     output,
+	     blocks_per_grid.x*threads_per_block.x,
+	     stride,
+	     size);
+	stride++;
+    }
+    assert_cuda( cudaDeviceSynchronize() );
+    get_reduction
+	<<< extend_size,blocks_per_grid,blocks_per_grid.x*sizeof(float)  >>>
+	(output,
+	 extend,
+	 output_size,
+	 0);
+    assert_cuda( cudaDeviceSynchronize() );
+    get_reduction
+      	<<< 1,extend_size,blocks_per_grid.x*sizeof(float)  >>>
+    	(extend,
+    	 dispersion,
+    	 extend_size,//blocks_per_grid.x,
+    	 0);
+    assert_cuda( cudaFree(output) );
+    assert_cuda( cudaFree(extend) );
+}

+ 15 - 0
saves/kernels.save.h

@@ -0,0 +1,15 @@
+#include "cuda.h"
+#include "cuda_runtime_api.h"
+#include "gdrapi.h"
+
+__device__ void add_two_device(CUdeviceptr number);
+__global__ void add_three_global(CUdeviceptr number);
+__global__ void add_one_global(CUdeviceptr number);
+__global__ void fill_array(int* array, int value, size_t size);
+__global__ void fill_float_array(float* array, float value, size_t size);
+extern __shared__ float _temp[];
+__global__ void get_reduction(float* array, float* output, size_t size, size_t stride);
+__global__ void get_average_reduction(float* array, float* average, size_t size, size_t stride, size_t divider);
+__global__ void get_dispersion_reduction(float* array, float* average, float* output, size_t size, size_t stride, size_t divider);
+void get_average(float* array, float* average, size_t size,dim3 blocks_per_grid, dim3 threads_per_block);
+void get_dispersion(float* array, float* average, float* dispersion, size_t size, dim3 blocks_per_grid, dim3 threads_per_block);

+ 41 - 0
saves/mult_latency_test.sh

@@ -0,0 +1,41 @@
+#!/bin/bash
+
+ITERATION=$1
+
+declare -a NB_BYTES=('4096' '40960' '409600');
+declare -a NB_TRANSFER=('1' '10' '100');
+
+echo P100 > multiple_test.csv
+
+for j in `seq 0 3`;
+do
+    echo NB_BYTES,${NB_BYTES[$j]}, >> multiple_test.csv
+    for i in `seq 1 $ITERATION`;
+    do
+	CUDA_VISIBLE_DEVICES=0 ./multiple_test ${NB_BYTES[$j]} ${NB_TRANSFER[$j]}  
+    	echo , >> multiple_test.csv
+    done
+done
+
+echo \ >> multiple_test.csv
+echo K40 >> multiple_test.csv
+
+for j in `seq 0 3`;
+do
+    echo NB_BYTES,${NB_BYTES[$j]}, >> multiple_test.csv
+    for i in `seq 1 $ITERATION`;
+    do
+	CUDA_VISIBLE_DEVICES=1 ./multiple_test ${NB_BYTES[$j]} ${NB_TRANSFER[$j]}  
+    	echo , >> multiple_test.csv
+    done
+done
+
+cat multiple_test.csv | tr -d \\n > multiple_test.export.csv
+# sed -i 's/P100/'P100\\n'/' multiple_test.export.csv
+ sed -i 's/K40/'\\n\\nK40'/' multiple_test.export.csv
+
+for k in `seq 0 3`;
+do
+    echo $k
+    sed -i "s/NB_BYTES,${NB_BYTES[$k]},/\\nNB_BYTES,${NB_BYTES[$k]}\\n/g" multiple_test.export.csv
+done

+ 275 - 0
saves/multiple_test.cu

@@ -0,0 +1,275 @@
+/* A single test aimed at being looped with a script. Handles a number of Bytes. */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "common.h"
+#include "kernels.h"
+#include "gdrapi.h"
+#include <pcilib.h>
+#include <pcilib/kmem.h>
+#include <pcilib/bar.h>
+#include "ipedma.h"
+#include <unistd.h>
+
+#define KMEM_DEFAULT_FLAGS      (pcilib_kmem_flags_t)(PCILIB_KMEM_FLAG_HARDWARE | PCILIB_KMEM_FLAG_PERSISTENT | PCILIB_KMEM_FLAG_EXCLUSIVE)
+
+#define KMEM_USE_RING           PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 1)
+#define KMEM_USE_DEFAULT        PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 2)
+#define GPU_NAME_LENGTH 30
+
+#define PCI_TRANSFER_SIZE 4096
+#define GPU_PAGE 65536
+#define DATA 0xBB
+
+
+/* argv[1] = number of bytes to be written */
+/* Entry point.  argv[argc-2] = payload size in bytes, argv[argc-1] = number of
+ * timed DMA descriptor writes.  Times FPGA->GPU DMA completion (GPUDirect via
+ * gdrcopy) and appends the accumulated latency to multiple_test.csv. */
+int main(int argc, char* argv[])
+{
+    FILE* fp = fopen("multiple_test.csv","a");
+    if( fp == NULL )
+    {
+	printf("Cannot open file multiple_test.csv\n");
+	exit( EXIT_FAILURE );
+    }
+
+    int nb_bytes = atoi(argv[argc -2]);
+    printf("nb_bytes = %d\n",nb_bytes);
+    int nb_transfer = atoi(argv[argc -1]);
+    printf("nb_transfer = %d\n",nb_transfer);
+    
+    /* Stage the payload in the file consumed by launch.sh, then start the
+       FPGA-side data generator. */
+    unsigned char* data=(unsigned char*)calloc(nb_bytes,sizeof(*data));
+    memset(data,DATA,nb_bytes);
+    init_to_send(data,sizeof(char),PCI_TRANSFER_SIZE);
+    system("/home/mathiasb/sources/benchmarking/launch.sh");	
+
+    /* Initialisation of the APIs */
+    assert_cu( cuInit(0) );
+    gdr_t g = gdr_open();
+    if( g==NULL)
+    {
+	printf("Could not open gdr\n");
+	exit( EXIT_FAILURE );
+    }
+    /* Manage NVIDIA GPU */
+    printf("\nInitialisation of the GPU\n");
+    CUdevice GPU;
+    CUdevprop GPUProp;
+    assert_cu( cuDeviceGet(&GPU,0) );
+    assert_cu( cuDeviceGetProperties(&GPUProp,GPU) );
+    char gpu_name[GPU_NAME_LENGTH] = {0};
+    assert_cu (cuDeviceGetName (gpu_name, GPU_NAME_LENGTH, GPU));
+    printf("GPU: %s\n", gpu_name);    
+    /* Check context */
+    CUcontext cuCtx;
+    assert_cu( cuCtxCreate(&cuCtx,CU_CTX_MAP_HOST|CU_CTX_SCHED_AUTO,GPU) );
+    assert_cu( cuCtxSetCurrent(cuCtx) );
+    
+    /* Allocate memory on the device, pin and map */
+    /* SYNC_MEMOPS keeps memory operations on these buffers synchronous so
+       the host-side polling below observes the DMA writes. */
+    uint8_t flagValueToSet = 1;
+    printf("\nMemory mapping with the GPU for pages\n");
+    CUdeviceptr gpuPagePtr;
+    assert_cu( cuMemAlloc(&gpuPagePtr,PCI_TRANSFER_SIZE) );
+    assert_cu( cuPointerSetAttribute(&flagValueToSet,CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,gpuPagePtr) );
+    gdr_mh_t GPUMemHandlePage;
+    assert_gdr( gdr_pin_buffer(g,gpuPagePtr,PCI_TRANSFER_SIZE,0,0,&GPUMemHandlePage) );
+    void* gpuPageVa;
+    assert_gdr( gdr_map(g,GPUMemHandlePage,&gpuPageVa,PCI_TRANSFER_SIZE) );
+    gdr_info_t pageInfo;
+    assert_gdr( gdr_get_info(g,GPUMemHandlePage,&pageInfo) );
+    printf("Memory mapping with the GPU for descriptors\n");
+    CUdeviceptr gpuDescPtr;
+    assert_cu( cuMemAlloc(&gpuDescPtr,GPU_PAGE) );       
+    assert_cu( cuPointerSetAttribute(&flagValueToSet,CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,gpuDescPtr) );
+    gdr_mh_t GPUMemHandleDesc;
+    assert_gdr( gdr_pin_buffer(g,gpuDescPtr,GPU_PAGE,0,0,&GPUMemHandleDesc) );
+    void* gpuDescVa;
+    assert_gdr( gdr_map(g,GPUMemHandleDesc,&gpuDescVa,GPU_PAGE) );
+    gdr_info_t descInfo;
+    assert_gdr( gdr_get_info(g,GPUMemHandleDesc,&descInfo) );
+
+    /* PCI */
+    printf("\nSetting up the PCI\n");
+    pcilib_t* pciCtx;
+    char* pciVa;
+    pciCtx = pcilib_open("/dev/fpga0",PCILIB_MODEL_DETECT);
+    if( pciCtx == NULL )
+    {
+	printf("Cannot open a context for pci\n");
+	exit( EXIT_FAILURE );
+    }
+    pciVa = pcilib_resolve_bar_address(pciCtx,0, 0);
+    if( pciVa == NULL )
+    {
+	printf("Cannot resolve PCI physical adress to virtual\n");
+	exit( EXIT_FAILURE );
+    }
+    /* Expose BAR0 to the GPU; the WR32/RD32 register macros go through pciVa. */
+    CUdeviceptr dBAR;
+    assert_cu( cuMemHostRegister((void*)pciVa,128,CU_MEMHOSTREGISTER_IOMEMORY) );
+    assert_cu( cuMemHostGetDevicePointer(&dBAR,(void*)pciVa, 0) );
+    
+    /* Config PCI for Pages*/
+    pcilib_kmem_handle_t* pciHandlePage;
+    pciHandlePage = pcilib_alloc_kernel_memory(pciCtx, PCILIB_KMEM_TYPE_DMA_C2S_PAGE, 1, ((PCI_TRANSFER_SIZE%4096)?(4096 * (1 + PCI_TRANSFER_SIZE/4096)):PCI_TRANSFER_SIZE), 4096, KMEM_USE_DEFAULT, KMEM_DEFAULT_FLAGS);
+    if( pciHandlePage == NULL )
+    {
+	printf("Cannot allocate  PCI kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    volatile void* pciMemPtrPage;
+    uintptr_t pciBusPage;
+    pciMemPtrPage = (uint64_t*) pcilib_kmem_get_block_ua(pciCtx,pciHandlePage,0);
+    if( pciMemPtrPage == NULL )
+    {
+	printf("Cannot get PCI pointer to kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    pciBusPage = pcilib_kmem_get_block_ba(pciCtx,pciHandlePage,0);
+    if( pciBusPage == 0 )
+    {
+	printf("Cannot get PCI Bus address on kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    /* Config PCI for Desc */
+    pcilib_kmem_handle_t* pciHandleDesc;
+    pciHandleDesc = pcilib_alloc_kernel_memory(pciCtx,PCILIB_KMEM_TYPE_CONSISTENT, 1, 128, 4096,KMEM_USE_RING, KMEM_DEFAULT_FLAGS);
+    if( pciHandleDesc == NULL )
+    {
+	printf("Cannot allocate  PCI kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    volatile void* pciMemPtrDesc;
+    uintptr_t pciBusDesc;
+    pciMemPtrDesc = (uint64_t*) pcilib_kmem_get_block_ua(pciCtx,pciHandleDesc,0);
+    if( pciMemPtrDesc == NULL )
+    {
+	printf("Cannot get PCI pointer to kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    pciBusDesc = pcilib_kmem_get_block_ba(pciCtx,pciHandleDesc,0);
+    if( pciBusDesc == 0 )
+    {
+	printf("Cannot get PCI Bus address on kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+
+    double simple_write_meas1;
+    double simple_write_meas2;
+    double start_meas;
+    double meas_result;
+    unsigned char* getBack=(unsigned char*)calloc(4096,sizeof(*getBack));
+    meas_result = 0.;
+    /* Hardware-written descriptor slot: bus address follows two 32-bit words
+       at DESCRIPTOR_OFFSET in the descriptor page. */
+    volatile uint64_t *hwaddr = (uint64_t*)((char*)pciMemPtrDesc + DESCRIPTOR_OFFSET + 2 * sizeof(uint32_t));
+
+    /* for(int j=0;j<4096;j++) */
+    /* 	printf("%hhx",data[j]); */
+    /* printf("\n"); */
+    /* memcpy(getBack,(const void*)gpuPageVa,nb_bytes); */
+    /* for(int j=0;j<nb_bytes;j++) */
+    /* 	printf("%hhx",getBack[j]); */
+    /* printf("\n"); */
+	
+    printf("\nWorking on the FPGA\n");
+    /* One full DMA reset + reprogram + timed descriptor write per transfer;
+       latencies are summed into meas_result. */
+    for(int k=0;k<nb_transfer;k++)
+    {
+	WR32(REG_RESET_DMA, 1);
+	usleep(100000);
+	WR32(REG_RESET_DMA, 0);
+	usleep(100000);
+	WR32_sleep(REG_PERF_COUNTER,0);
+	WR32_sleep(REG_NUM_PACKETS_PER_DESCRIPTOR,16); //16);
+	WR32_sleep(REG_PACKET_LENGTH,0x80000 | 64); // added flag
+	WR32_sleep(REG_TIMER_THRESHOLD, 0x1);
+	WR32_sleep(REG_UPDATE_THRESHOLD, 0x1);
+	/* Completion updates are DMA-written into the GPU descriptor page. */
+	WR64_sleep(REG_UPDATE_COUNTER,descInfo.bus_addr);
+	WR64_sleep(REG_UPDATE_ADDRESS,pciBusDesc+DESCRIPTOR_OFFSET);
+
+	WR32_sleep(REG_CONTROL,CONTROL_ENABLE_READ|CONTROL_SOURCE_RX_FIFO);
+	WR32_sleep(REG_DMA,1);
+
+	WR32_sleep(REG_INTERCONNECT, 0x232); //0x262);
+	WR32_sleep(REG_COUNTER,0x1);
+	/* FPGA performance counter at 0x14 ticks in 4 ns units -> microseconds. */
+	start_meas = 4. *RD32 (0x14)/ 1000;
+	*(int*)gpuDescVa=0;
+	simple_write_meas1 = 4. *RD32 (0x14)/ 1000;
+	/* NOTE(review): for k>0 this targets bus addresses beyond the
+	   PCI_TRANSFER_SIZE bytes pinned for gpuPagePtr — confirm the pinned
+	   region is meant to cover nb_transfer pages. */
+	WR64(REG_DESCRIPTOR_ADDRESS,pageInfo.bus_addr+k*PCI_TRANSFER_SIZE);
+	/* NOTE(review): the ';' terminates the busy-wait, so meas2 is sampled
+	   once after completion; the sibling tests sample inside the loop —
+	   confirm which is intended. */
+	while(!*(int*)gpuDescVa);
+	    simple_write_meas2 = 4. *RD32 (0x14)/ 1000;
+	meas_result+=simple_write_meas2-simple_write_meas1;
+
+	usleep(1000);
+	assert_cuda( cudaMemcpy((void*)gpuPageVa,(const void*)gpuPagePtr,4096,cudaMemcpyDeviceToHost) );
+	/* memcpy(getBack,(const void*)pciMemPtrDesc,4096); */
+	memcpy(getBack,(const void*)gpuPageVa,4096);
+	/* for(int j=0;j<4096;j++) */
+	/* { */
+	/*     printf("%hhx",getBack[j]); */
+	/*     if( getBack[j]!=DATA ) */
+	/*     { */
+	/* 	printf("Last at %d\n",j); */
+	/* 	goto exit_failure; */
+	/*     } */
+	/* } */
+    }
+    printf("\n");
+    printf("number of descriptor: %d\n",*(int*)gpuDescVa);
+    printf("start_meas = %lf\n",start_meas);
+    printf("hwaddr = %lx\ngpuPagePtr = %llx\n",*hwaddr,pageInfo.bus_addr);
+
+    fprintf(fp,"%lf",meas_result);
+    
+
+    /* Close everything */
+    printf("\nClosing the connections\n");
+    free(getBack);
+    WR32(REG_COUNTER, 0);
+    WR32(REG_DMA, 0);
+    WR32(REG_RESET_DMA, 1);
+    usleep (100000);
+    WR32(REG_RESET_DMA, 0);
+    usleep (100000);
+
+    pcilib_free_kernel_memory(pciCtx,pciHandleDesc,PCILIB_KMEM_FLAG_FORCE);
+    pcilib_free_kernel_memory(pciCtx,pciHandlePage,PCILIB_KMEM_FLAG_FORCE);
+    assert_cu( cuMemHostUnregister((void*) pciVa) );
+    pcilib_close(pciCtx);
+    assert_gdr( gdr_unmap(g,GPUMemHandlePage,gpuPageVa,PCI_TRANSFER_SIZE) );
+    assert_gdr( gdr_unpin_buffer(g,GPUMemHandlePage) );
+    assert_gdr( gdr_unmap(g,GPUMemHandleDesc,gpuDescVa,GPU_PAGE) );
+    assert_gdr( gdr_unpin_buffer(g,GPUMemHandleDesc) );
+    assert_gdr( gdr_close(g) );
+    assert_cu( cuMemFree(gpuPagePtr) );
+    assert_cu( cuMemFree(gpuDescPtr) );
+    assert_cu( cuCtxDestroy(cuCtx) );
+    
+    fclose(fp);
+    
+    printf("All Cleared\n");
+    
+    exit(EXIT_SUCCESS);
+
+/* NOTE(review): only reachable from the commented-out validation above. */
+exit_failure:
+    printf("\nClosing the connections\n");
+    free(getBack);
+    WR32(REG_COUNTER, 0);
+    WR32(REG_DMA, 0);
+    WR32(REG_RESET_DMA, 1);
+    usleep (100000);
+    WR32(REG_RESET_DMA, 0);
+    usleep (100000);
+    pcilib_free_kernel_memory(pciCtx,pciHandleDesc,PCILIB_KMEM_FLAG_FORCE);
+    pcilib_free_kernel_memory(pciCtx,pciHandlePage,PCILIB_KMEM_FLAG_FORCE);
+    assert_cu( cuMemHostUnregister((void*) pciVa) );
+    pcilib_close(pciCtx);
+    assert_gdr( gdr_unmap(g,GPUMemHandlePage,gpuPageVa,PCI_TRANSFER_SIZE) );
+    assert_gdr( gdr_unpin_buffer(g,GPUMemHandlePage) );
+    assert_gdr( gdr_unmap(g,GPUMemHandleDesc,gpuDescVa,GPU_PAGE) );
+    assert_gdr( gdr_unpin_buffer(g,GPUMemHandleDesc) );
+    assert_gdr( gdr_close(g) );
+    assert_cu( cuMemFree(gpuPagePtr) );
+    assert_cu( cuMemFree(gpuDescPtr) );
+    assert_cu( cuCtxDestroy(cuCtx) );
+    fclose(fp);
+    printf("All Cleared\n");
+    exit(EXIT_FAILURE);
+
+}

+ 270 - 0
saves/test2.save.cu

@@ -0,0 +1,270 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "common.h"
+#include "kernels.h"
+#include "gdrapi.h"
+#include <pcilib.h>
+#include <pcilib/kmem.h>
+#include <pcilib/bar.h>
+#include "ipedma.h"
+#include <unistd.h>
+
+#define KMEM_DEFAULT_FLAGS      (pcilib_kmem_flags_t)(PCILIB_KMEM_FLAG_HARDWARE | PCILIB_KMEM_FLAG_PERSISTENT | PCILIB_KMEM_FLAG_EXCLUSIVE)
+
+#define KMEM_USE_RING           PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 1)
+#define KMEM_USE_DEFAULT        PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 2)
+#define GPU_NAME_LENGTH 30
+
+#define PCI_TRANSFER_SIZE 8192
+#define GPU_PAGE 65536
+#define DATA 0xbb
+
+
+/* argv[1] = number of bytes to be written */
+/* Entry point.  argv[argc-1] = payload size in bytes.  Times one FPGA->host
+ * DMA transfer into pcilib kernel memory and appends the latency to
+ * unit_test.csv.  The GPU buffers are set up but only used by the
+ * commented-out validation path. */
+int main(int argc, char* argv[])
+{
+    FILE* fp = fopen("unit_test.csv","a");
+    if( fp == NULL )
+    {
+	printf("Cannot open file test1.csv\n");
+	exit( EXIT_FAILURE );
+    }
+
+    int nb_bytes = atoi(argv[argc -1]);
+    printf("nb_bytes = %d\n",nb_bytes);
+    /* NOTE(review): nb_transfer is computed but never used below
+       (REG_NUM_PACKETS_PER_DESCRIPTOR is hard-coded to 32). */
+    int nb_transfer = (nb_bytes % PCI_TRANSFER_SIZE) ? nb_bytes/PCI_TRANSFER_SIZE +1:nb_bytes / PCI_TRANSFER_SIZE;
+
+    /* Stage the payload for launch.sh, zero-padding up to 8192 bytes. */
+    unsigned char* data=(unsigned char*)calloc(8192,sizeof(*data));
+    memset(data,DATA,nb_bytes); memset(data+nb_bytes,0x00,8192-nb_bytes);
+    init_to_send(data,sizeof(char),nb_bytes);
+    system("/home/mathiasb/sources/benchmarking/launch.sh");	
+
+    /* Initialisation of the APIs */
+    assert_cu( cuInit(0) );
+    gdr_t g = gdr_open();
+    if( g==NULL)
+    {
+	printf("Could not open gdr\n");
+	exit( EXIT_FAILURE );
+    }
+    /* Manage NVIDIA GPU */
+    printf("\nInitialisation of the GPU\n");
+    CUdevice GPU;
+    CUdevprop GPUProp;
+    assert_cu( cuDeviceGet(&GPU,0) );
+    assert_cu( cuDeviceGetProperties(&GPUProp,GPU) );
+    char gpu_name[GPU_NAME_LENGTH] = {0};
+    assert_cu (cuDeviceGetName (gpu_name, GPU_NAME_LENGTH, GPU));
+    printf("GPU: %s\n", gpu_name);    
+    /* Check context */
+    CUcontext cuCtx;
+    assert_cu( cuCtxCreate(&cuCtx,CU_CTX_MAP_HOST|CU_CTX_SCHED_AUTO,GPU) );
+    assert_cu( cuCtxSetCurrent(cuCtx) );
+    
+    /* Allocate memory on the device, pin and map */
+    uint8_t flagValueToSet = 1;
+    printf("\nMemory mapping with the GPU for pages\n");
+    CUdeviceptr gpuPagePtr;
+    assert_cu( cuMemAlloc(&gpuPagePtr,PCI_TRANSFER_SIZE) );
+    assert_cu( cuPointerSetAttribute(&flagValueToSet,CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,gpuPagePtr) );
+    gdr_mh_t GPUMemHandlePage;
+    assert_gdr( gdr_pin_buffer(g,gpuPagePtr,PCI_TRANSFER_SIZE,0,0,&GPUMemHandlePage) );
+    void* gpuPageVa;
+    assert_gdr( gdr_map(g,GPUMemHandlePage,&gpuPageVa,PCI_TRANSFER_SIZE) );
+    gdr_info_t pageInfo;
+    assert_gdr( gdr_get_info(g,GPUMemHandlePage,&pageInfo) );
+    printf("Memory mapping with the GPU for descriptors\n");
+    CUdeviceptr gpuDescPtr;
+    assert_cu( cuMemAlloc(&gpuDescPtr,GPU_PAGE) );       
+    assert_cu( cuPointerSetAttribute(&flagValueToSet,CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,gpuDescPtr) );
+    gdr_mh_t GPUMemHandleDesc;
+    assert_gdr( gdr_pin_buffer(g,gpuDescPtr,GPU_PAGE,0,0,&GPUMemHandleDesc) );
+    void* gpuDescVa;
+    assert_gdr( gdr_map(g,GPUMemHandleDesc,&gpuDescVa,GPU_PAGE) );
+    gdr_info_t descInfo;
+    assert_gdr( gdr_get_info(g,GPUMemHandleDesc,&descInfo) );
+
+    /* PCI */
+    printf("\nSetting up the PCI\n");
+    pcilib_t* pciCtx;
+    char* pciVa;
+    pciCtx = pcilib_open("/dev/fpga0",PCILIB_MODEL_DETECT);
+    if( pciCtx == NULL )
+    {
+	printf("Cannot open a context for pci\n");
+	exit( EXIT_FAILURE );
+    }
+    pciVa = pcilib_resolve_bar_address(pciCtx,0, 0);
+    if( pciVa == NULL )
+    {
+	printf("Cannot resolve PCI physical adress to virtual\n");
+	exit( EXIT_FAILURE );
+    }
+    CUdeviceptr dBAR;
+    assert_cu( cuMemHostRegister((void*)pciVa,128,CU_MEMHOSTREGISTER_IOMEMORY) );
+    assert_cu( cuMemHostGetDevicePointer(&dBAR,(void*)pciVa, 0) );
+    
+    /* Config PCI for Pages*/
+    pcilib_kmem_handle_t* pciHandlePage;
+    pciHandlePage = pcilib_alloc_kernel_memory(pciCtx, PCILIB_KMEM_TYPE_DMA_C2S_PAGE, 1, ((PCI_TRANSFER_SIZE%4096)?(4096 * (1 + PCI_TRANSFER_SIZE/4096)):PCI_TRANSFER_SIZE), 4096, KMEM_USE_DEFAULT, KMEM_DEFAULT_FLAGS);
+    if( pciHandlePage == NULL )
+    {
+	printf("Cannot allocate  PCI kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    volatile void* pciMemPtrPage;
+    uintptr_t pciBusPage;
+    pciMemPtrPage = (uint64_t*) pcilib_kmem_get_block_ua(pciCtx,pciHandlePage,0);
+    if( pciMemPtrPage == NULL )
+    {
+	printf("Cannot get PCI pointer to kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    pciBusPage = pcilib_kmem_get_block_ba(pciCtx,pciHandlePage,0);
+    if( pciBusPage == 0 )
+    {
+	printf("Cannot get PCI Bus address on kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    /* Config PCI for Desc */
+    pcilib_kmem_handle_t* pciHandleDesc;
+    pciHandleDesc = pcilib_alloc_kernel_memory(pciCtx,PCILIB_KMEM_TYPE_CONSISTENT, 1, 128, 4096,KMEM_USE_RING, KMEM_DEFAULT_FLAGS);
+    if( pciHandleDesc == NULL )
+    {
+	printf("Cannot allocate  PCI kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    volatile void* pciMemPtrDesc;
+    uintptr_t pciBusDesc;
+    pciMemPtrDesc = (uint64_t*) pcilib_kmem_get_block_ua(pciCtx,pciHandleDesc,0);
+    if( pciMemPtrDesc == NULL )
+    {
+	printf("Cannot get PCI pointer to kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    pciBusDesc = pcilib_kmem_get_block_ba(pciCtx,pciHandleDesc,0);
+    if( pciBusDesc == 0 )
+    {
+	printf("Cannot get PCI Bus address on kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+
+    double simple_write_meas1;
+    double simple_write_meas2;
+    double start_meas;
+    double meas_result;
+    unsigned char* getBack=(unsigned char*)calloc(8192,sizeof(*getBack));
+
+    volatile uint64_t *hwaddr = (uint64_t*)((char*)pciMemPtrDesc + DESCRIPTOR_OFFSET + 2 * sizeof(uint32_t));
+
+    printf("\nWorking on the FPGA\n");
+    WR32(REG_RESET_DMA, 1);
+    usleep(100000);
+    WR32(REG_RESET_DMA, 0);
+    usleep(100000);
+    WR32_sleep(REG_PERF_COUNTER,0);
+    WR32_sleep(REG_NUM_PACKETS_PER_DESCRIPTOR,32); //16);
+    WR32_sleep(REG_PACKET_LENGTH,0x80000 | 64); // added flag
+    WR32_sleep(REG_TIMER_THRESHOLD, 0x1);
+    WR32_sleep(REG_UPDATE_THRESHOLD, 0x1);
+    WR64_sleep(REG_UPDATE_COUNTER,pciBusDesc);
+    WR64_sleep(REG_UPDATE_ADDRESS,descInfo.bus_addr+DESCRIPTOR_OFFSET);
+
+    WR32_sleep(REG_CONTROL,CONTROL_ENABLE_READ|CONTROL_SOURCE_RX_FIFO);
+    WR32_sleep(REG_DMA,1);
+
+    WR32_sleep(REG_INTERCONNECT, 0x232); //0x262);
+    WR32_sleep(REG_COUNTER,0x1);
+    /* FPGA performance counter at 0x14 ticks in 4 ns units -> microseconds. */
+    start_meas = 4. *RD32 (0x14)/ 1000;
+    *(int*)pciMemPtrDesc=0;
+    simple_write_meas1 = 4. *RD32 (0x14)/ 1000;
+    /* Fix: the original passed the undeclared identifier 'pciPageInfo'.
+       The page descriptor must carry the bus address of the host kernel
+       page buffer, as done in src/cpu-fpga.cu (the update counter above is
+       likewise the host-side pciBusDesc). */
+    WR64(REG_DESCRIPTOR_ADDRESS,pciBusPage);
+    /* Busy-wait until the DMA engine writes the completion word. */
+    while(!*(int*)pciMemPtrDesc)
+	simple_write_meas2 = 4. *RD32 (0x14)/ 1000;
+    meas_result=simple_write_meas2-simple_write_meas1;
+
+    usleep(1000);
+
+    fprintf(fp,"%lf",meas_result);
+    
+
+    /* Close everything */
+    printf("\nClosing the connections\n");
+    free(getBack);
+    WR32(REG_COUNTER, 0);
+    WR32(REG_DMA, 0);
+    WR32(REG_RESET_DMA, 1);
+    usleep (100000);
+    WR32(REG_RESET_DMA, 0);
+    usleep (100000);
+
+    pcilib_free_kernel_memory(pciCtx,pciHandleDesc,PCILIB_KMEM_FLAG_FORCE);
+    pcilib_free_kernel_memory(pciCtx,pciHandlePage,PCILIB_KMEM_FLAG_FORCE);
+    assert_cu( cuMemHostUnregister((void*) pciVa) );
+    pcilib_close(pciCtx);
+    assert_gdr( gdr_unmap(g,GPUMemHandlePage,gpuPageVa,PCI_TRANSFER_SIZE) );
+    assert_gdr( gdr_unpin_buffer(g,GPUMemHandlePage) );
+    assert_gdr( gdr_unmap(g,GPUMemHandleDesc,gpuDescVa,GPU_PAGE) );
+    assert_gdr( gdr_unpin_buffer(g,GPUMemHandleDesc) );
+    assert_gdr( gdr_close(g) );
+    assert_cu( cuMemFree(gpuPagePtr) );
+    assert_cu( cuMemFree(gpuDescPtr) );
+    assert_cu( cuCtxDestroy(cuCtx) );
+    
+    fclose(fp);
+    
+    printf("All Cleared\n");
+    
+    exit(EXIT_SUCCESS);
+}

+ 195 - 0
src/common.cu

@@ -0,0 +1,195 @@
+/* This files contains useful fonctions like assertions */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "common.h"
+
/* Abort with a readable message when a CUDA runtime call fails.
 * err_id: status returned by the runtime API; file/line: call site
 * (normally spliced in by the assert_cuda macro). */
void __assert_cuda(cudaError_t err_id, const char* file, int line)
{
    if( err_id == cudaSuccess )
	return;
    printf("%s at %s:%i\n",cudaGetErrorString(err_id),file,line);
    exit(EXIT_FAILURE);
}
+
/* Abort with the numeric status code when a CUDA driver API call fails.
 * res_id: CUresult from the driver API; file/line: call site. */
void __assert_cu(CUresult res_id, const char* file, int line)
{
    if( res_id == CUDA_SUCCESS )
	return;
    printf("Error in driver api returned with code: %d at %s:%i\n",res_id,file,line);
    exit(EXIT_FAILURE);
}
+
/* Abort with the numeric status code when a gdrcopy call fails.
 * gdr_id: return value of the gdr API (0 == success); file/line: call site. */
void __assert_gdr(int gdr_id, const char* file, int line)
{
    if( gdr_id == 0 )
	return;
    printf("Error in gdr api returned with code: %d at %s:%i\n",gdr_id,file,line);
    exit(EXIT_FAILURE);
}
+
/* Serialize a payload into the fixed staging file later consumed by
 * launch.sh.  dataPtr: bytes to write; size/nmemb: fwrite-style element
 * size and count.  Exits the process on any I/O failure. */
void init_to_send(const void* dataPtr, size_t size, size_t nmemb)
{
    FILE* out = fopen("/home/mathiasb/sources/benchmarking/to_send","wb");
    if( out == NULL )
    {
	printf("Could not open to_send file. Exiting...\n");
	exit( EXIT_FAILURE );
    }
    size_t written = fwrite(dataPtr,size,nmemb,out);
    if( written == 0 )
    {
	printf("Could not write the items. Exiting...\n");
	exit( EXIT_FAILURE );
    }
    fclose(out);
}
+
/* Verify that every one of `size` floats equals `value`.
 * Prints the first mismatch and returns 0; returns 1 when all match
 * (including the empty array). */
bool check_array(float* array, float value, size_t size)
{
    size_t idx;
    for(idx = 0; idx < size; ++idx)
    {
	if( array[idx] == value )
	    continue;
	printf("Value error at %d: got %f, expected %f\n",(int)idx,array[idx],value);
	return 0;
    }
    printf("Array checked: no error\n");
    return 1;
}
+
/* Print a handful of capability attributes of the GPU.
 * Fix: the original allocated 5 ints but wrote 6 attributes (value+0..value+5,
 * a heap overflow) and printed only 5 of the 6 labels.
 * NOTE(review): every query targets device 0, not the `device` parameter —
 * confirm whether `device` should be forwarded. */
void deviceInformation(int device)
{
    enum { NB_QUERIES = 6 };
    int* value=(int*)malloc(NB_QUERIES*sizeof(int));
    const char * query[]={"Threads per block","Block dim X","Grid dim X","Clock rate","Multiprocessor","Threads per Multiprocessor"};
    assert_cuda( cudaDeviceGetAttribute(value+0,cudaDevAttrMaxThreadsPerBlock,0) );
    assert_cuda( cudaDeviceGetAttribute(value+1,cudaDevAttrMaxBlockDimX,0) );
    assert_cuda( cudaDeviceGetAttribute(value+2,cudaDevAttrMaxGridDimX,0) );
    assert_cuda( cudaDeviceGetAttribute(value+3,cudaDevAttrClockRate,0) );
    assert_cuda( cudaDeviceGetAttribute(value+4,cudaDevAttrMultiProcessorCount,0) );
    assert_cuda( cudaDeviceGetAttribute(value+5,cudaDevAttrMaxThreadsPerMultiProcessor,0) );
    for(int i=0;i<NB_QUERIES;i++)
	printf("%s: %d\n",query[i],value[i]);
    free(value);
}
+
/* Host-side fill: set every one of `size` elements of `array` to `value`. */
void cpu_fill_array(float* array, float value, size_t size)
{
    size_t idx = 0;
    while( idx < size )
	array[idx++] = value;
}
+
/* Fill `array` with pseudo-random floats in [0,1], reseeding rand() from
 * the wall clock on every call. */
void cpu_fill_array_random(float* array, size_t size)
{
    srand(time(NULL));
    for(size_t idx = 0; idx < size; ++idx)
	array[idx] = (float)rand() / (float)RAND_MAX;
}
+
/* Arithmetic mean of `size` floats, accumulated in single precision.
 * (size == 0 divides by zero, as in the original.) */
float cpu_average(float* array, size_t size)
{
    float total = 0.;
    size_t idx = 0;
    while( idx < size )
	total += array[idx++];
    total /= size;
    return total;
}
+
/* Population variance of `size` floats around the supplied `average`. */
float cpu_dispersion(float* array, float average, size_t size)
{
    float acc = 0.;
    for(size_t idx = 0; idx < size; ++idx)
	acc += (array[idx]-average)*(array[idx]-average);
    acc /= size;
    return acc;
}
+
/* Allocate and return a square identity matrix of dimension `row_size`.
 * Elements are stored so that entry (i,j) lives at i + j*rows (column-major
 * under this indexing).  Exits the process if the allocation fails. */
matrix identity_matrix(size_t row_size)
{
    matrix I ;
    I.rows = row_size;
    I.columns = row_size;
    I.stride = I.columns;
    I.elements = (float*)malloc(I.rows*I.columns*sizeof(float));
    if( I.elements==NULL ){
	printf("Cannot create Identity matrix\n");
	exit( EXIT_FAILURE );
    }
    for(int j=0;j<I.columns;j++){
	for(int i=0;i<I.rows;i++){
	    I.elements[i+j*I.rows] = (i==j) ? 1 : 0;
	}
    }
    return I;
}
+	
/* Dump M element by element ("%1.0f-" per entry, one printed row per i),
 * then report success.  Uses the i + j*rows layout and iterates j over
 * M.rows, so a square matrix is assumed. */
void check_identity_matrix(matrix M)
{
    int i, j;
    for(i=0;i<M.rows;i++){
	for(j=0;j<M.rows;j++)
	    printf("%1.0f-",M.elements[i+j*M.rows]);
	printf("\n");
    }
    printf("Matrix checked: no error\n");
}
+
/* Verify that every element of M (row-major: i*columns + j) equals `value`;
 * print the first mismatch and abort the process, otherwise report success. */
void check_matrix(matrix M, int value)
{
    for(int i=0;i<M.rows;i++){
	for(int j=0;j<M.columns;j++){
	    if(M.elements[i*M.columns+j]==value)
		continue;
	    printf("Error in matrix at [%d][%d]\n",i,j);
	    exit( EXIT_FAILURE );
	}
    }
    printf("Matrix checked: no error\n");
}
+
/* Naive row-major matrix product R = A * B (assumes A.columns == B.rows and
 * R is pre-allocated with R.rows == A.rows, R.columns == B.columns).
 * Fix: the accumulator was declared `int` although the elements are floats —
 * fractional contributions were truncated and large products (e.g. from
 * fill_matrix_random) could overflow signed int; accumulate in float. */
void mult_matrix(matrix A, matrix B, matrix R)
{
    float sum;
    for(int i=0;i<A.rows;i++){
        for(int j=0;j<B.columns;j++){
	    sum = 0;
            for(int k=0;k<A.columns;k++){
		sum += A.elements[i*A.columns+k] * B.elements[k*B.columns+j];
            }
	    R.elements[i*R.columns+j] = sum;
        }
    }
}
+
+
/* Fill M (row-major) with pseudo-random integer-valued entries, reseeding
 * rand() from the wall clock on every call. */
void fill_matrix_random(matrix M)
{
    srand(time(NULL));
    for(int i=0;i<M.rows;i++){
        for(int j=0;j<M.columns;j++){
	    M.elements[i*M.columns+j] = (int)rand();
	}
    }
}

+ 200 - 0
src/cpu-fpga.cu

@@ -0,0 +1,200 @@
+/* Tansfers data from FPGA to CPU */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "common.h"
+#include "kernels.h"
+#include "gdrapi.h"
+#include <pcilib.h>
+#include <pcilib/kmem.h>
+#include <pcilib/bar.h>
+#include "ipedma.h"
+#include <unistd.h>
+
+#define KMEM_DEFAULT_FLAGS      (pcilib_kmem_flags_t)(PCILIB_KMEM_FLAG_HARDWARE | PCILIB_KMEM_FLAG_PERSISTENT | PCILIB_KMEM_FLAG_EXCLUSIVE)
+
+#define KMEM_USE_RING           PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 1)
+#define KMEM_USE_DEFAULT        PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 2)
+#define GPU_NAME_LENGTH 30
+
+#define PCI_TRANSFER_SIZE 16384
+#define GPU_PAGE 65536
+#define DATA 0xbb
+
+
+/* argv[1] = number of bytes to be written */
+/* Entry point.  argv[argc-1] = payload size in bytes.  Times one FPGA->host
+ * DMA transfer into pcilib kernel memory (no GPU involved beyond opening and
+ * closing gdr) and appends the latency to cpu-fpga.csv. */
+int main(int argc, char* argv[])
+{
+    FILE* fp = fopen("cpu-fpga.csv","a");
+    if( fp == NULL )
+    {
+	printf("Cannot open file cpu-fpga.csv\n");
+	exit( EXIT_FAILURE );
+    }
+
+    int nb_bytes = atoi(argv[argc -1]);
+    printf("nb_bytes = %d\n",nb_bytes);
+    int nb_transfer = nb_bytes/(4*64); //each transfer deals 64 words of 4 bytes
+
+    /* Stage the payload for launch.sh, zero-padding up to 16384 bytes. */
+    unsigned char* data=(unsigned char*)calloc(16384,sizeof(*data));
+    memset(data,DATA,nb_bytes); memset(data+nb_bytes,0x00,16384-nb_bytes);
+    init_to_send(data,sizeof(char),nb_bytes);
+    system("/home/mathiasb/sources/benchmarking/launch.sh");	
+
+    /* Initialisation of the APIs */
+    /* NOTE(review): gdr is opened and closed but never used in this CPU-only
+       test — presumably kept for symmetry with the GPU variants; confirm. */
+    gdr_t g = gdr_open();
+    if( g==NULL)
+    {
+	printf("Could not open gdr\n");
+	exit( EXIT_FAILURE );
+    }
+
+    /* PCI */
+    printf("\nSetting up the PCI\n");
+    pcilib_t* pciCtx;
+    char* pciVa;
+    pciCtx = pcilib_open("/dev/fpga0",PCILIB_MODEL_DETECT);
+    if( pciCtx == NULL )
+    {
+	printf("Cannot open a context for pci\n");
+	exit( EXIT_FAILURE );
+    }
+    pciVa = pcilib_resolve_bar_address(pciCtx,0, 0);
+    if( pciVa == NULL )
+    {
+	printf("Cannot resolve PCI physical adress to virtual\n");
+	exit( EXIT_FAILURE );
+    }
+    
+    /* Config PCI for Pages*/
+    /* Page buffer: destination of the DMA payload, rounded up to whole
+       4096-byte pages. */
+    pcilib_kmem_handle_t* pciHandlePage;
+    pciHandlePage = pcilib_alloc_kernel_memory(pciCtx, PCILIB_KMEM_TYPE_DMA_C2S_PAGE, 1, ((PCI_TRANSFER_SIZE%4096)?(4096 * (1 + PCI_TRANSFER_SIZE/4096)):PCI_TRANSFER_SIZE), 4096, KMEM_USE_DEFAULT, KMEM_DEFAULT_FLAGS);
+    if( pciHandlePage == NULL )
+    {
+	printf("Cannot allocate  PCI kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    volatile void* pciMemPtrPage;
+    uintptr_t pciBusPage;
+    pciMemPtrPage = (uint64_t*) pcilib_kmem_get_block_ua(pciCtx,pciHandlePage,0);
+    if( pciMemPtrPage == NULL )
+    {
+	printf("Cannot get PCI pointer to kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    pciBusPage = pcilib_kmem_get_block_ba(pciCtx,pciHandlePage,0);
+    if( pciBusPage == 0 )
+    {
+	printf("Cannot get PCI Bus address on kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    /* Config PCI for Desc */
+    /* Descriptor buffer: the DMA engine writes its completion updates here;
+       the busy-wait below polls its first word. */
+    pcilib_kmem_handle_t* pciHandleDesc;
+    pciHandleDesc = pcilib_alloc_kernel_memory(pciCtx,PCILIB_KMEM_TYPE_CONSISTENT, 1, 128, 4096,KMEM_USE_RING, KMEM_DEFAULT_FLAGS);
+    if( pciHandleDesc == NULL )
+    {
+	printf("Cannot allocate  PCI kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    volatile void* pciMemPtrDesc;
+    uintptr_t pciBusDesc;
+    pciMemPtrDesc = (uint64_t*) pcilib_kmem_get_block_ua(pciCtx,pciHandleDesc,0);
+    if( pciMemPtrDesc == NULL )
+    {
+	printf("Cannot get PCI pointer to kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    pciBusDesc = pcilib_kmem_get_block_ba(pciCtx,pciHandleDesc,0);
+    if( pciBusDesc == 0 )
+    {
+	printf("Cannot get PCI Bus address on kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+
+    double simple_write_meas1;
+    double simple_write_meas2;
+    double start_meas;
+    double meas_result;
+    unsigned char* getBack=(unsigned char*)calloc(16384,sizeof(*getBack));
+
+    /* NOTE(review): hwaddr is computed but never read in this variant. */
+    volatile uint64_t *hwaddr = (uint64_t*)((char*)pciMemPtrDesc + DESCRIPTOR_OFFSET + 2 * sizeof(uint32_t));
+
+    /* for(int j=0;j<16384;j++) */
+    /* 	printf("%hhx",data[j]); */
+    /* printf("\n"); */
+    /* memcpy(getBack,(const void*)gpuPageVa,nb_bytes); */
+    /* for(int j=0;j<nb_bytes;j++) */
+    /* 	printf("%hhx",getBack[j]); */
+    /* printf("\n"); */
+	
+
+    printf("\nWorking on the FPGA\n");
+    WR32(REG_RESET_DMA, 1);
+    usleep(100000);
+    WR32(REG_RESET_DMA, 0);
+    usleep(100000);
+    WR32_sleep(REG_PERF_COUNTER,0);
+    WR32_sleep(REG_NUM_PACKETS_PER_DESCRIPTOR,nb_transfer); //16);
+    WR32_sleep(REG_PACKET_LENGTH,0x80000 | 64); // added flag
+    WR32_sleep(REG_TIMER_THRESHOLD, 0x1);
+    WR32_sleep(REG_UPDATE_THRESHOLD, 0x1);
+    WR64_sleep(REG_UPDATE_COUNTER,pciBusDesc)
+//    WR64_sleep(REG_UPDATE_ADDRESS,descInfo.bus_addr+DESCRIPTOR_OFFSET);
+
+    WR32_sleep(REG_CONTROL,CONTROL_ENABLE_READ|CONTROL_SOURCE_RX_FIFO);
+    WR32_sleep(REG_DMA,1);
+
+    WR32_sleep(REG_INTERCONNECT, 0x232); //0x262);
+    WR32_sleep(REG_COUNTER,0x1);
+    /* FPGA performance counter at 0x14 ticks in 4 ns units -> microseconds. */
+    start_meas = 4. *RD32 (0x14)/ 1000;
+    *(int*)pciMemPtrDesc=0;
+    simple_write_meas1 = 4. *RD32 (0x14)/ 1000;
+    /* Hand the host page to the DMA engine, then busy-wait for completion. */
+    WR64(REG_DESCRIPTOR_ADDRESS,pciBusPage);
+    while(!*(int*)pciMemPtrDesc)
+	simple_write_meas2 = 4. *RD32 (0x14)/ 1000;
+    meas_result=simple_write_meas2-simple_write_meas1;
+
+    usleep(1000);
+    /* assert_cuda( cudaMemcpy((void*)gpuPageVa,(const void*)gpuPagePtr,16384,cudaMemcpyDeviceToHost) ); */
+    /* /\* memcpy(getBack,(const void*)pciMemPtrDesc,16384); *\/ */
+    /* memcpy(getBack,(const void*)gpuPageVa,16384); */
+    /* for(int j=0;j<16384;j++) */
+    /* { */
+    /* 	if( getBack[j]!=DATA ) */
+    /* 	{ */
+    /* 	    printf("Last at %d\n",j); */
+    /* 	    goto exit_failure; */
+    /* 	} */
+    /* 	printf("%hhx",getBack[j]); */
+    /* } */
+    /* printf("\n"); */
+    /* printf("number of descriptor: %d\n",*(int*)gpuDescVa); */
+    /* printf("start_meas = %lf\n",start_meas); */
+    /* printf("hwaddr = %lx\ngpuPagePtr = %llx\n",*hwaddr,pageInfo.bus_addr); */
+
+    fprintf(fp,"%lf",meas_result);
+    
+
+    /* Close everything */
+    printf("\nClosing the connections\n");
+    free(getBack);
+    WR32(REG_COUNTER, 0);
+    WR32(REG_DMA, 0);
+    WR32(REG_RESET_DMA, 1);
+    usleep (100000);
+    WR32(REG_RESET_DMA, 0);
+    usleep (100000);
+
+    pcilib_free_kernel_memory(pciCtx,pciHandleDesc,PCILIB_KMEM_FLAG_FORCE);
+    pcilib_free_kernel_memory(pciCtx,pciHandlePage,PCILIB_KMEM_FLAG_FORCE);
+
+    pcilib_close(pciCtx);
+    assert_gdr( gdr_close(g) );
+    
+    fclose(fp);
+    
+    printf("All Cleared\n");
+    
+    exit(EXIT_SUCCESS);
+
+}

+ 226 - 0
src/floats_gpu-fpga.cu

@@ -0,0 +1,226 @@
+/* Test 1: copying MAX_SIZE floats from the FPGA to the GPU through DMA */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "common.h"
+#include "kernels.h"
+#include "gdrapi.h"
+#include <pcilib.h>
+#include <pcilib/kmem.h>
+#include <pcilib/bar.h>
+#include "ipedma.h"
+#include <unistd.h>
+
+#define KMEM_DEFAULT_FLAGS      (pcilib_kmem_flags_t)(PCILIB_KMEM_FLAG_HARDWARE | PCILIB_KMEM_FLAG_PERSISTENT | PCILIB_KMEM_FLAG_EXCLUSIVE)
+
+#define KMEM_USE_RING           PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 1)
+#define KMEM_USE_DEFAULT        PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 2)
+#define GPU_NAME_LENGTH 30
+#define PAGE_SIZE 1024
+
+
+
+#define MAX_SIZE 1024*1
+#define GPU_PAGE (MAX_SIZE/PAGE_SIZE)*65536
+
+#define VALUE 5.
+
+
+/* Benchmark entry point (Test 1): DMAs MAX_SIZE floats from the FPGA
+ * directly into GPU memory (GPUDirect RDMA via gdrcopy) and appends the
+ * measured transfer time to test1.csv. Setup failures exit() immediately
+ * without unwinding earlier allocations (acceptable for a benchmark). */
+int main(int argc, char* argv[])
+{
+    FILE* fp = fopen("test1.csv","a");
+    if( fp == NULL )
+    {
+	printf("Cannot open file test1.csv\n");
+	exit( EXIT_FAILURE );
+    }
+    
+    /* Initialisation of the APIs */
+    assert_cu( cuInit(0) );
+    gdr_t g = gdr_open();
+    if( g==NULL)
+    {
+	printf("Could not open gdr\n");
+	exit( EXIT_FAILURE );
+    }
+    /* Manage NVIDIA GPU */
+    printf("\nInitialisation of the GPU\n");
+    CUdevice GPU;
+    CUdevprop GPUProp;
+    assert_cu( cuDeviceGet(&GPU,0) );
+    assert_cu( cuDeviceGetProperties(&GPUProp,GPU) );
+    char gpu_name[GPU_NAME_LENGTH] = {0};
+    assert_cu (cuDeviceGetName (gpu_name, GPU_NAME_LENGTH, GPU));
+    printf("GPU: %s\n", gpu_name);    
+    /* Check context */
+    CUcontext cuCtx;
+    assert_cu( cuCtxCreate(&cuCtx,CU_CTX_MAP_HOST|CU_CTX_SCHED_AUTO,GPU) );
+    assert_cu( cuCtxSetCurrent(cuCtx) );
+    
+    /* Allocate memory on the device, pin and map.
+     * SYNC_MEMOPS forces synchronous memory operations on these allocations,
+     * which GPUDirect RDMA requires before pinning with gdrcopy. */
+    uint8_t flagValueToSet = 1;
+    printf("\nMemory mapping with the GPU for pages\n");
+    CUdeviceptr gpuPagePtr;
+    assert_cu( cuMemAlloc(&gpuPagePtr,MAX_SIZE*sizeof(float)) );
+    assert_cu( cuPointerSetAttribute(&flagValueToSet,CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,gpuPagePtr) );
+    gdr_mh_t GPUMemHandlePage;
+    assert_gdr( gdr_pin_buffer(g,gpuPagePtr,MAX_SIZE*sizeof(float),0,0,&GPUMemHandlePage) );
+    void* gpuPageVa;
+    assert_gdr( gdr_map(g,GPUMemHandlePage,&gpuPageVa,MAX_SIZE*sizeof(float)) );
+    gdr_info_t pageInfo;
+    assert_gdr( gdr_get_info(g,GPUMemHandlePage,&pageInfo) );
+    printf("Memory mapping with the GPU for descriptors\n");
+    CUdeviceptr gpuDescPtr;
+    assert_cu( cuMemAlloc(&gpuDescPtr,GPU_PAGE*sizeof(float)) );
+    assert_cu( cuPointerSetAttribute(&flagValueToSet,CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,gpuDescPtr) );
+    gdr_mh_t GPUMemHandleDesc;
+    assert_gdr( gdr_pin_buffer(g,gpuDescPtr,GPU_PAGE*sizeof(float),0,0,&GPUMemHandleDesc) );
+    void* gpuDescVa;
+    assert_gdr( gdr_map(g,GPUMemHandleDesc,&gpuDescVa,GPU_PAGE*sizeof(float)) );
+    gdr_info_t descInfo;
+    assert_gdr( gdr_get_info(g,GPUMemHandleDesc,&descInfo) );
+
+    /* PCI: open the FPGA device and map BAR0 into this process, then make
+     * the BAR accessible from device code via cuMemHostRegister. */
+    printf("\nSetting up the PCI\n");
+    pcilib_t* pciCtx;
+    char* pciVa;
+    pciCtx = pcilib_open("/dev/fpga0",PCILIB_MODEL_DETECT);
+    if( pciCtx == NULL )
+    {
+	printf("Cannot open a context for pci\n");
+	exit( EXIT_FAILURE );
+    }
+    pciVa = pcilib_resolve_bar_address(pciCtx,0, 0);
+    if( pciVa == NULL )
+    {
+	printf("Cannot resolve PCI physical adress to virtual\n");
+	exit( EXIT_FAILURE );
+    }
+    CUdeviceptr dBAR;
+    assert_cu( cuMemHostRegister((void*)pciVa,128,CU_MEMHOSTREGISTER_IOMEMORY) );
+    assert_cu( cuMemHostGetDevicePointer(&dBAR,(void*)pciVa, 0) );
+    
+    /* Config PCI for Pages: kernel DMA buffer, size rounded up to a 4 KiB
+     * multiple, 4 KiB aligned. */
+    pcilib_kmem_handle_t* pciHandlePage;
+    pciHandlePage = pcilib_alloc_kernel_memory(pciCtx, PCILIB_KMEM_TYPE_DMA_C2S_PAGE, 1, ((PAGE_SIZE%4096)?(4096 * (1 + PAGE_SIZE/4096)):PAGE_SIZE), 4096, KMEM_USE_DEFAULT, KMEM_DEFAULT_FLAGS);
+    if( pciHandlePage == NULL )
+    {
+	printf("Cannot allocate  PCI kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    volatile void* pciMemPtrPage;
+    uintptr_t pciBusPage;
+    pciMemPtrPage = (uint64_t*) pcilib_kmem_get_block_ua(pciCtx,pciHandlePage,0);
+    if( pciMemPtrPage == NULL )
+    {
+	printf("Cannot get PCI pointer to kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    pciBusPage = pcilib_kmem_get_block_ba(pciCtx,pciHandlePage,0);
+    if( pciBusPage == 0 )
+    {
+	printf("Cannot get PCI Bus address on kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    /* Config PCI for Desc: small consistent buffer the DMA engine updates. */
+    pcilib_kmem_handle_t* pciHandleDesc;
+    pciHandleDesc = pcilib_alloc_kernel_memory(pciCtx,PCILIB_KMEM_TYPE_CONSISTENT, 1, 128, 4096,KMEM_USE_RING, KMEM_DEFAULT_FLAGS);
+    if( pciHandleDesc == NULL )
+    {
+	printf("Cannot allocate  PCI kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    volatile void* pciMemPtrDesc;
+    uintptr_t pciBusDesc;
+    pciMemPtrDesc = (uint64_t*) pcilib_kmem_get_block_ua(pciCtx,pciHandleDesc,0);
+    if( pciMemPtrDesc == NULL )
+    {
+	printf("Cannot get PCI pointer to kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    pciBusDesc = pcilib_kmem_get_block_ba(pciCtx,pciHandleDesc,0);
+    if( pciBusDesc == 0 )
+    {
+	printf("Cannot get PCI Bus address on kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+
+
+
+    /* NOTE(review): `data` and `getback` are never freed on any path —
+     * a leak, though the process exits right after. */
+    float* data = (float*)malloc(MAX_SIZE*sizeof(float));
+    float* getback = (float*)malloc(MAX_SIZE*sizeof(float));
+    cpu_fill_array(data,VALUE,MAX_SIZE);
+    double meas_result;
+    double mult_write_meas1;
+    double mult_write_meas2;
+    if( !check_array(data,VALUE,MAX_SIZE) )
+	goto exit;
+
+    init_to_send(data,sizeof(float),MAX_SIZE);
+    
+    /* FPGA */
+    system("/home/mathiasb/sources/benchmarking/launch.sh");
+           
+    WR32(REG_RESET_DMA, 1);
+    usleep(100000);
+    WR32(REG_RESET_DMA, 0);
+    usleep(100000);
+    WR32_sleep(REG_PERF_COUNTER,0);
+    WR32_sleep(REG_NUM_PACKETS_PER_DESCRIPTOR,16);
+    WR32_sleep(REG_PACKET_LENGTH,0x80000 | 64);
+    WR32_sleep(REG_TIMER_THRESHOLD, 0x1);
+    WR32_sleep(REG_UPDATE_THRESHOLD, 0x1);
+    /* NOTE(review): this pairing (UPDATE_COUNTER <- descInfo.bus_addr,
+     * UPDATE_ADDRESS <- pciBusDesc+offset) is the reverse of gpu-fpga.cu,
+     * which writes pciBusDesc into REG_UPDATE_COUNTER — verify which
+     * orientation the DMA engine expects. */
+    WR64_sleep(REG_UPDATE_COUNTER,descInfo.bus_addr);
+    WR64_sleep(REG_UPDATE_ADDRESS,pciBusDesc+DESCRIPTOR_OFFSET);	
+    
+    WR32_sleep(REG_CONTROL,CONTROL_ENABLE_READ|CONTROL_SOURCE_RX_FIFO);
+    WR32(REG_DMA,1);
+    WR32(REG_INTERCONNECT, 0x232);
+    WR32(REG_COUNTER,0x1);
+    /* Timing: clear the completion word in GPU-mapped memory, sample the
+     * FPGA counter, start the transfer by publishing the page bus address,
+     * then busy-poll until the FPGA writes the completion word.
+     * 4.*RD32(0x14)/1000 presumably converts counter ticks to microseconds
+     * (4 ns per tick) — TODO confirm against the firmware. */
+    *(int*)gpuDescVa=0;
+    mult_write_meas1 = 4. *RD32 (0x14)/ 1000;
+    WR64(REG_DESCRIPTOR_ADDRESS,pageInfo.bus_addr);
+    while(*(int*)gpuDescVa == 0)
+	mult_write_meas2 = 4. *RD32 (0x14)/ 1000;
+    meas_result = mult_write_meas2 - mult_write_meas1;
+
+    for(int i=0;i<MAX_SIZE;i++){
+	printf("%f-",((float*)gpuPageVa)[i]);
+    }
+
+    if( !check_array((float*)gpuPageVa,VALUE,MAX_SIZE) )
+	goto exit;
+    printf("\n");
+    
+    printf("meas_result = %lf\n",meas_result);
+/* free(getBack); */
+
+    
+/* Close everything: tear down in reverse order of setup. */
+exit:
+    printf("\nClosing the connections\n");
+    WR32(REG_COUNTER, 0);
+    WR32(REG_DMA, 0);
+    WR32(REG_RESET_DMA, 1);
+    usleep (100000);
+    WR32(REG_RESET_DMA, 0);
+    usleep (100000);
+    
+    pcilib_free_kernel_memory(pciCtx,pciHandleDesc,PCILIB_KMEM_FLAG_FORCE);
+    pcilib_free_kernel_memory(pciCtx,pciHandlePage,PCILIB_KMEM_FLAG_FORCE);
+    assert_cu( cuMemHostUnregister((void*) pciVa) );
+    pcilib_close(pciCtx);
+    assert_gdr( gdr_unmap(g,GPUMemHandlePage,gpuPageVa,MAX_SIZE*sizeof(float)) );
+    assert_gdr( gdr_unpin_buffer(g,GPUMemHandlePage) );
+    assert_gdr( gdr_unmap(g,GPUMemHandleDesc,gpuDescVa,GPU_PAGE*sizeof(float)) );
+    assert_gdr( gdr_unpin_buffer(g,GPUMemHandleDesc) );
+    assert_gdr( gdr_close(g) );
+    assert_cu( cuMemFree(gpuPagePtr) );
+    assert_cu( cuMemFree(gpuDescPtr) );
+    assert_cu( cuCtxDestroy(cuCtx) );
+
+    fclose( fp );
+    printf("All Cleared\n");
+    
+    exit( EXIT_SUCCESS );
+}

+ 234 - 0
src/gpu-fpga.cu

@@ -0,0 +1,234 @@
+/* A single test aimed at being looped with a script. Handles the number of Bytes passed in argv[1]. */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "common.h"
+#include "kernels.h"
+#include "gdrapi.h"
+#include <pcilib.h>
+#include <pcilib/kmem.h>
+#include <pcilib/bar.h>
+#include "ipedma.h"
+#include <unistd.h>
+
+#define KMEM_DEFAULT_FLAGS      (pcilib_kmem_flags_t)(PCILIB_KMEM_FLAG_HARDWARE | PCILIB_KMEM_FLAG_PERSISTENT | PCILIB_KMEM_FLAG_EXCLUSIVE)
+
+#define KMEM_USE_RING           PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 1)
+#define KMEM_USE_DEFAULT        PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 2)
+#define GPU_NAME_LENGTH 30
+
+#define GPU_PAGE 65536
+#define DATA 0xa2
+
+
+/* Benchmark entry point: DMAs argv[argc-1] bytes from the FPGA into GPU
+ * memory (GPUDirect RDMA via gdrcopy), measures the transfer with the FPGA
+ * performance counter, and appends the result to gpu-fpga.csv.
+ * NOTE(review): argv[argc-1] is not validated; with no argument it parses
+ * the program name and nb_bytes becomes 0. */
+int main(int argc, char* argv[])
+{
+    FILE* fp = fopen("gpu-fpga.csv","a");
+    if( fp == NULL ){
+	printf("Cannot open file gpu-fpga.csv\n");
+	exit( EXIT_FAILURE );
+    }
+
+    int nb_bytes = atoi(argv[argc -1]);
+    printf("nb_bytes = %d\n",nb_bytes);
+    int nb_transfer = nb_bytes/(4*64); //each transfer deals 64 words of 4 bytes
+    
+    unsigned char* data=(unsigned char*)calloc(nb_bytes,sizeof(*data));
+    memset(data,DATA,nb_bytes);
+    init_to_send(data,sizeof(char),nb_bytes);
+    system("/home/mathiasb/sources/benchmarking/launch.sh");	
+
+    /* Initialisation of the APIs */
+    assert_cu( cuInit(0) );
+    gdr_t g = gdr_open();
+    if( g==NULL){
+	printf("Could not open gdr\n");
+	exit( EXIT_FAILURE );
+    }
+    /* Manage NVIDIA GPU */
+    printf("\nInitialisation of the GPU\n");
+    CUdevice GPU;
+    CUdevprop GPUProp;
+    assert_cu( cuDeviceGet(&GPU,0) );
+    assert_cu( cuDeviceGetProperties(&GPUProp,GPU) );
+    char gpu_name[GPU_NAME_LENGTH] = {0};
+    assert_cu (cuDeviceGetName (gpu_name, GPU_NAME_LENGTH, GPU));
+    printf("GPU: %s\n", gpu_name);    
+    /* Check context */
+    CUcontext cuCtx;
+    assert_cu( cuCtxCreate(&cuCtx,CU_CTX_MAP_HOST|CU_CTX_SCHED_AUTO,GPU) );
+    assert_cu( cuCtxSetCurrent(cuCtx) );
+    
+    /* Allocate memory on the device, pin and map.
+     * SYNC_MEMOPS is required before pinning for GPUDirect RDMA. */
+    uint8_t flagValueToSet = 1;
+    printf("\nMemory mapping with the GPU for pages\n");
+    CUdeviceptr gpuPagePtr;
+    assert_cu( cuMemAlloc(&gpuPagePtr,nb_bytes) );
+    assert_cu( cuPointerSetAttribute(&flagValueToSet,CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,gpuPagePtr) );
+    gdr_mh_t GPUMemHandlePage;
+    assert_gdr( gdr_pin_buffer(g,gpuPagePtr,nb_bytes,0,0,&GPUMemHandlePage) );
+    void* gpuPageVa;
+    assert_gdr( gdr_map(g,GPUMemHandlePage,&gpuPageVa,nb_bytes) );
+    gdr_info_t pageInfo;
+    assert_gdr( gdr_get_info(g,GPUMemHandlePage,&pageInfo) );
+    printf("Memory mapping with the GPU for descriptors\n");
+    CUdeviceptr gpuDescPtr;
+    assert_cu( cuMemAlloc(&gpuDescPtr,GPU_PAGE) );
+    assert_cu( cuPointerSetAttribute(&flagValueToSet,CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,gpuDescPtr) );
+    gdr_mh_t GPUMemHandleDesc;
+    assert_gdr( gdr_pin_buffer(g,gpuDescPtr,GPU_PAGE,0,0,&GPUMemHandleDesc) );
+    void* gpuDescVa;
+    assert_gdr( gdr_map(g,GPUMemHandleDesc,&gpuDescVa,GPU_PAGE) );
+    gdr_info_t descInfo;
+    assert_gdr( gdr_get_info(g,GPUMemHandleDesc,&descInfo) );
+
+    /* PCI: open the FPGA device and map BAR0 into this process. */
+    printf("\nSetting up the PCI\n");
+    pcilib_t* pciCtx;
+    char* pciVa;
+    pciCtx = pcilib_open("/dev/fpga0",PCILIB_MODEL_DETECT);
+    if( pciCtx == NULL ){
+	printf("Cannot open a context for pci\n");
+	exit( EXIT_FAILURE );
+    }
+    pciVa = pcilib_resolve_bar_address(pciCtx,0, 0);
+    if( pciVa == NULL ){
+	printf("Cannot resolve PCI physical adress to virtual\n");
+	exit( EXIT_FAILURE );
+    }
+    CUdeviceptr dBAR;
+    /* assert_cu( cuMemHostRegister((void*)pciVa,128,CU_MEMHOSTREGISTER_IOMEMORY) ); */
+    /* assert_cu( cuMemHostGetDevicePointer(&dBAR,(void*)pciVa, 0) ); */
+    
+    /* Config PCI for Pages: size rounded up to a 4 KiB multiple. */
+    pcilib_kmem_handle_t* pciHandlePage;
+    pciHandlePage = pcilib_alloc_kernel_memory(pciCtx, PCILIB_KMEM_TYPE_DMA_C2S_PAGE, 1, ((nb_bytes%4096)?(4096 * (1 + nb_bytes/4096)):nb_bytes), 4096, KMEM_USE_DEFAULT, KMEM_DEFAULT_FLAGS);
+    if( pciHandlePage == NULL ){
+	printf("Cannot allocate  PCI kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    volatile void* pciMemPtrPage;
+    uintptr_t pciBusPage;
+    pciMemPtrPage = (uint64_t*) pcilib_kmem_get_block_ua(pciCtx,pciHandlePage,0);
+    if( pciMemPtrPage == NULL ){
+	printf("Cannot get PCI pointer to kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    pciBusPage = pcilib_kmem_get_block_ba(pciCtx,pciHandlePage,0);
+    if( pciBusPage == 0 ){
+	printf("Cannot get PCI Bus address on kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    /* Config PCI for Desc: small consistent buffer the DMA engine updates. */
+    pcilib_kmem_handle_t* pciHandleDesc;
+    pciHandleDesc = pcilib_alloc_kernel_memory(pciCtx,PCILIB_KMEM_TYPE_CONSISTENT, 1, 128, 4096,KMEM_USE_RING, KMEM_DEFAULT_FLAGS);
+    if( pciHandleDesc == NULL ){
+	printf("Cannot allocate  PCI kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    volatile void* pciMemPtrDesc;
+    uintptr_t pciBusDesc;
+    pciMemPtrDesc = (uint64_t*) pcilib_kmem_get_block_ua(pciCtx,pciHandleDesc,0);
+    if( pciMemPtrDesc == NULL ){
+	printf("Cannot get PCI pointer to kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    pciBusDesc = pcilib_kmem_get_block_ba(pciCtx,pciHandleDesc,0);
+    if( pciBusDesc == 0 ){
+	printf("Cannot get PCI Bus address on kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+
+    double simple_write_meas1;
+    double simple_write_meas2;
+    double start_meas;
+    double meas_result;
+    unsigned char* getBack=(unsigned char*)calloc(nb_bytes,sizeof(*getBack));
+
+    /* NOTE(review): hwaddr and start_meas are only consumed by the
+     * commented-out debug prints below — dead in the current build. */
+    volatile uint64_t *hwaddr = (uint64_t*)((char*)gpuDescVa + DESCRIPTOR_OFFSET + 2 * sizeof(uint32_t));
+
+    /* for(int j=0;j<nb_bytes;j++) */
+    /* 	printf("%hhx",data[j]); */
+    /* printf("\n"); */
+    /* memcpy(getBack,(const void*)gpuPageVa,nb_bytes); */
+    /* for(int j=0;j<nb_bytes;j++) */
+    /* 	printf("%hhx",getBack[j]); */
+    /* printf("\n"); */
+	
+
+    printf("\nWorking on the FPGA\n");
+    WR32(REG_RESET_DMA, 1);
+    usleep(100000);
+    WR32(REG_RESET_DMA, 0);
+    usleep(100000);
+    WR32_sleep(REG_PERF_COUNTER,0);
+    WR32_sleep(REG_NUM_PACKETS_PER_DESCRIPTOR,nb_transfer); //16);
+WR32_sleep(REG_PACKET_LENGTH,0x80000 | 64); // added flag
+    WR32_sleep(REG_TIMER_THRESHOLD, 0x1);
+    WR32_sleep(REG_UPDATE_THRESHOLD, 0x1);
+    WR64_sleep(REG_UPDATE_COUNTER,pciBusDesc);
+    WR64_sleep(REG_UPDATE_ADDRESS,descInfo.bus_addr+DESCRIPTOR_OFFSET);
+
+    WR32_sleep(REG_CONTROL,CONTROL_ENABLE_READ|CONTROL_SOURCE_RX_FIFO);
+    WR32_sleep(REG_DMA,1);
+
+    WR32_sleep(REG_INTERCONNECT, 0x232); //0x262);
+    WR32_sleep(REG_COUNTER,0x1);
+    /* Timing: clear the completion word in the host descriptor buffer,
+     * sample the FPGA counter, publish the GPU page bus address to start the
+     * DMA, then busy-poll the completion word. 4.*RD32(0x14)/1000 presumably
+     * converts counter ticks to microseconds — TODO confirm. */
+    start_meas = 4. *RD32 (0x14)/ 1000;
+    *(int*)pciMemPtrDesc=0;
+    simple_write_meas1 = 4. *RD32 (0x14)/ 1000;
+    WR64(REG_DESCRIPTOR_ADDRESS,pageInfo.bus_addr);
+    while(!*(int*)pciMemPtrDesc)
+	simple_write_meas2 = 4. *RD32 (0x14)/ 1000;
+    meas_result=simple_write_meas2-simple_write_meas1;
+
+    usleep(1000);
+    
+    /* assert_cuda( cudaMemcpy((void*)gpuPageVa,(const void*)gpuPagePtr,nb_bytes,cudaMemcpyDeviceToHost) ); */
+    /* /\* memcpy(getBack,(const void*)pciMemPtrDesc,nb_bytes); *\/ */
+    /* memcpy(getBack,(const void*)gpuPageVa,nb_bytes); */
+    /* for(int j=0;j<nb_bytes;j++){ */
+    /* 	/\* if( getBack[j]!=DATA ){ *\/ */
+    /* 	/\*     printf("Last at %d\n",j); *\/ */
+    /* 	/\*     goto exit; *\/ */
+    /* 	/\* } *\/ */
+    /* 	printf("%hhx",getBack[j]); */
+    /* } */
+    /* printf("\n"); */
+    /* printf("number of descriptor: %d\n",*(int*)pciMemPtrDesc); */
+    /* printf("start_meas = %lf\n",start_meas); */
+    /* printf("hwaddr = %p\ngpuPagePtr = %p\n",*hwaddr,pageInfo.bus_addr); */
+
+    fprintf(fp,"%lf",meas_result);
+    
+
+    /* Close everything (the exit: label is currently unreferenced —
+     * execution simply falls through). */
+exit:
+    printf("\nClosing the connections\n");
+    free(getBack);
+    WR32(REG_COUNTER, 0);
+    WR32(REG_DMA, 0);
+    WR32(REG_RESET_DMA, 1);
+    usleep (100000);
+    WR32(REG_RESET_DMA, 0);
+    usleep (100000);
+
+    pcilib_free_kernel_memory(pciCtx,pciHandleDesc,PCILIB_KMEM_FLAG_FORCE);
+    pcilib_free_kernel_memory(pciCtx,pciHandlePage,PCILIB_KMEM_FLAG_FORCE);
+    /* assert_cu( cuMemHostUnregister((void*) pciVa) ); */
+    pcilib_close(pciCtx);
+    assert_gdr( gdr_unmap(g,GPUMemHandlePage,gpuPageVa,nb_bytes) );
+    assert_gdr( gdr_unpin_buffer(g,GPUMemHandlePage) );
+    assert_gdr( gdr_unmap(g,GPUMemHandleDesc,gpuDescVa,GPU_PAGE) );
+    assert_gdr( gdr_unpin_buffer(g,GPUMemHandleDesc) );
+    assert_gdr( gdr_close(g) );
+    assert_cu( cuMemFree(gpuPagePtr) );
+    assert_cu( cuMemFree(gpuDescPtr) );
+    assert_cu( cuCtxDestroy(cuCtx) );
+    
+    fclose(fp);
+    
+    printf("All Cleared\n");
+    
+    exit(EXIT_SUCCESS);
+}

+ 300 - 0
src/kernels.cu

@@ -0,0 +1,300 @@
+/* This file contains the kernels i.e. the functions to be executed on the GPU */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "kernels.h"
+
+
+
+
+/* Device helper: add two to the int whose device address is carried in
+ * `number` (a CUdeviceptr reinterpreted as int*). Not thread-safe: the
+ * read-modify-write is unsynchronized. */
+__device__
+void add_two_device(CUdeviceptr number)
+{
+    int* target = (int*)number;
+    *target = *target + 2;
+}
+
+
+/* Kernel: add three to *number — one directly, then two more through the
+ * add_two_device helper. Unsynchronized read-modify-write, so only
+ * meaningful for a single-thread launch. */
+__global__
+void add_three_global(CUdeviceptr number)
+{
+    *(int*)number += 1;
+    add_two_device(number);
+}
+
+
+/* Kernel: increment the int whose device address is carried in `number`.
+ * Unsynchronized read-modify-write — single-thread launches only. */
+__global__
+void add_one_global(CUdeviceptr number)
+{
+    *(int*)number += 1;
+}
+
+
+/* Kernel: write `value` into all `size` elements of `array`.
+ * Grid-stride loop, so any 1-D launch configuration covers the array. */
+__global__
+void fill_array(int* array, int value, size_t size)
+{
+    const uint first = (blockIdx.x*blockDim.x)+threadIdx.x;
+    const uint stride = blockDim.x*gridDim.x;
+    for(size_t idx=first; idx<size; idx+=stride)
+	array[idx] = value;
+}
+
+
+/* Kernel: write `value` into all `size` elements of the float `array`.
+ * Grid-stride loop, so any 1-D launch configuration covers the array. */
+__global__
+void fill_float_array(float* array, float value, size_t size)
+{
+    const uint first = (blockIdx.x*blockDim.x)+threadIdx.x;
+    const uint stride = blockDim.x*gridDim.x;
+    for(size_t idx=first; idx<size; idx+=stride)
+	array[idx] = value;
+}
+
+
+extern __shared__ float _temp[];
+
+
+/* Block-wise sum reduction: each block sums its blockDim.x-wide slice of
+ * `array` in dynamic shared memory (_temp) and writes the partial sum to
+ * output[blockIdx.x + stride*gridDim.x].
+ * Launch with blockDim.x*sizeof(float) bytes of dynamic shared memory. */
+__global__ 
+void get_reduction(float* array, float* output, size_t size, size_t stride)
+{
+    float* temp = (float*)_temp;
+    uint x = (blockIdx.x*blockDim.x)+threadIdx.x;
+    /* Bug fix: out-of-range threads previously left their shared slot
+     * uninitialized, so the tree below summed garbage whenever size was not
+     * a multiple of blockDim.x. Zero-fill the tail instead. */
+    temp[threadIdx.x] = (x < size) ? array[x] : 0.f;
+    __syncthreads();
+    /* Sequential-pair tree reduction; the barrier sits outside the divergent
+     * branch so every thread reaches it. */
+    for(uint s=1;s<blockDim.x;s*=2)
+    {
+	x = threadIdx.x*2*s;
+	if(x+s < blockDim.x)
+	    temp[x]+=temp[x+s];
+	__syncthreads();
+    }
+    if(threadIdx.x==0)
+    	output[blockIdx.x+stride*gridDim.x]=temp[0];
+}
+
+
+/* Final averaging stage: reduces `array` in shared memory like
+ * get_reduction, then thread 0 writes temp[0]/divider into *average.
+ * `stride` is accepted for signature symmetry with the other reduction
+ * kernels but is not used here.
+ * Launch with blockDim.x*sizeof(float) bytes of dynamic shared memory. */
+__global__ 
+void get_average_reduction(float* array, float* average, size_t size, size_t stride, size_t divider)
+{
+    float* temp = (float*)_temp;
+    uint x = (blockIdx.x*blockDim.x)+threadIdx.x;
+    /* Bug fix: zero-fill slots of out-of-range threads, which were
+     * previously left uninitialized and corrupted the sum. */
+    temp[threadIdx.x] = (x < size) ? array[x] : 0.f;
+    __syncthreads();
+    for(uint s=1;s<blockDim.x;s*=2)
+    {
+	x = threadIdx.x*2*s;
+	if(x+s < blockDim.x)
+	    temp[x]+=temp[x+s];
+	__syncthreads();
+    }
+    if(threadIdx.x==0)
+    	*average=temp[0]/divider;
+}
+
+
+/* Variance partial-sum stage: loads squared deviations from *average into
+ * shared memory, tree-reduces them, and writes the block's partial
+ * (already divided by `divider`) to output[blockIdx.x + stride*gridDim.x].
+ * Launch with blockDim.x*sizeof(float) bytes of dynamic shared memory. */
+__global__ 
+void get_dispersion_reduction(float* array, float* average, float* output, size_t size, size_t stride, size_t divider)
+{
+    float* temp = (float*)_temp;
+    uint x = (blockIdx.x*blockDim.x)+threadIdx.x;
+    /* Bug fix: zero-fill slots of out-of-range threads, which were
+     * previously left uninitialized and corrupted the sum. */
+    temp[threadIdx.x] = (x < size)
+	? (array[x]-*average) * (array[x]-*average)
+	: 0.f;
+    __syncthreads();
+    for(uint s=1;s<blockDim.x;s*=2)
+    {
+	x = threadIdx.x*2*s;
+	if(x+s < blockDim.x)
+	    temp[x]+=temp[x+s];
+	__syncthreads();
+    }
+    if(threadIdx.x==0)
+    	output[blockIdx.x+stride*gridDim.x]=temp[0]/divider;
+}
+
+
+/* Host helper: computes the mean of the `size` device floats in `array`
+ * into the device scalar *average via a three-stage reduction:
+ *   1) get_reduction over consecutive grid-sized slices of the input,
+ *   2) get_reduction collapsing the per-block partials,
+ *   3) get_average_reduction dividing the final sum by `size`.
+ * NOTE(review): `stride = size / (blocks_per_grid.x*threads_per_block.x)`
+ * truncates — this staging appears to assume size is an exact multiple of
+ * one grid's worth of threads; confirm with callers. */
+void get_average(float* array, float* average, size_t size,dim3 blocks_per_grid, dim3 threads_per_block)
+{
+    float *output,*extend;
+    size_t stride = size/(blocks_per_grid.x*threads_per_block.x);
+    size_t output_size = stride*blocks_per_grid.x;
+    size_t extend_size = stride;
+    assert_cuda( cudaMalloc((void**)&output,output_size*sizeof(float)) );
+    assert_cuda( cudaMalloc((void**)&extend,extend_size*sizeof(float)) );
+    stride=0;
+    /* Stage 1: one launch per grid-sized slice; `stride` selects the slice
+     * and the offset into `output`. */
+    while(stride*blocks_per_grid.x*threads_per_block.x < size)
+    {
+	get_reduction
+	    <<< blocks_per_grid,threads_per_block,threads_per_block.x*sizeof(float)  >>>
+	    (array+stride*blocks_per_grid.x*threads_per_block.x,
+	     output,
+	     blocks_per_grid.x*threads_per_block.x,
+	     stride);
+	stride++;
+    }
+    assert_cuda( cudaDeviceSynchronize() );
+    /* Stage 2: collapse the per-block partials (blocks_per_grid is passed as
+     * the block dimension here, so each block has blocks_per_grid.x threads). */
+    get_reduction
+	<<< extend_size,blocks_per_grid,blocks_per_grid.x*sizeof(float)  >>>
+	(output,
+	 extend,
+	 output_size,
+	 0);
+    assert_cuda( cudaDeviceSynchronize() );
+    /* Stage 3: single block reduces `extend` and divides by size. */
+    get_average_reduction
+	<<< 1,extend_size,extend_size*sizeof(float)  >>>
+	(extend,
+	 average,
+	 extend_size,//blocks_per_grid.x,
+	 0,
+	 size);
+    assert_cuda( cudaFree(output) );
+    assert_cuda( cudaFree(extend) );
+}
+
+
+/* Host helper: computes the variance of the `size` device floats in `array`
+ * into the device scalar *dispersion. First fills *average via
+ * get_average, then reduces squared deviations with the same three-stage
+ * scheme (each stage-1 partial is already divided by `size`).
+ * NOTE(review): shares get_average's assumption that size is an exact
+ * multiple of blocks_per_grid.x*threads_per_block.x — confirm with callers. */
+void get_dispersion(float* array, float* average, float* dispersion, size_t size, dim3 blocks_per_grid, dim3 threads_per_block)
+{
+    get_average(array,average,size,blocks_per_grid,threads_per_block);
+    float *output,*extend;
+    size_t stride = size/(blocks_per_grid.x*threads_per_block.x);
+    size_t output_size = stride*blocks_per_grid.x;
+    size_t extend_size = stride;
+    assert_cuda( cudaMalloc((void**)&output,output_size*sizeof(float)) );
+    assert_cuda( cudaMalloc((void**)&extend,extend_size*sizeof(float)) );
+    stride=0;
+    /* Stage 1: per-slice squared-deviation partials, pre-divided by size. */
+    while(stride*blocks_per_grid.x*threads_per_block.x < size)
+    {
+	get_dispersion_reduction
+	    <<< blocks_per_grid,threads_per_block,threads_per_block.x*sizeof(float)  >>>
+	    (array+stride*blocks_per_grid.x*threads_per_block.x,
+	     average,
+	     output,
+	     blocks_per_grid.x*threads_per_block.x,
+	     stride,
+	     size);
+	stride++;
+    }
+    assert_cuda( cudaDeviceSynchronize() );
+    /* Stage 2: collapse the per-block partials. */
+    get_reduction
+	<<< extend_size,blocks_per_grid,blocks_per_grid.x*sizeof(float)  >>>
+	(output,
+	 extend,
+	 output_size,
+	 0);
+    assert_cuda( cudaDeviceSynchronize() );
+    /* Stage 3: single block produces the final sum in *dispersion. */
+    get_reduction
+      	<<< 1,extend_size,blocks_per_grid.x*sizeof(float)  >>>
+    	(extend,
+    	 dispersion,
+    	 extend_size,//blocks_per_grid.x,
+    	 0);
+    assert_cuda( cudaFree(output) );
+    assert_cuda( cudaFree(extend) );
+}
+
+
+/* Kernel: fills the square matrix M with `value`.
+ * NOTE(review): both x and the y base scale by blockDim.x, and the row loop
+ * steps by blockDim.y — each block appears to cover a blockDim.x-by-
+ * blockDim.x tile using blockDim.y threads in y. This assumes blockDim.x is
+ * a multiple of blockDim.y and that the grid exactly tiles the matrix
+ * (there is no bounds check); confirm the intended launch configuration. */
+__global__
+void fill_matrix(int* M,int value)
+{
+    uint x = blockIdx.x*blockDim.x +threadIdx.x;
+    uint y = blockIdx.y*blockDim.x +threadIdx.y;
+    uint step = blockDim.x*gridDim.x;
+    for(int i =0;i<blockDim.x;i+=blockDim.y)
+	M[x+(y+i)*step] = value;
+}
+
+
+/* Kernel: element-wise C = A + B over a 2D launch, one thread per element.
+ * NOTE(review): the linear index uses blockDim.x*gridDim.y as the row
+ * pitch, which is only consistent when the launch is square
+ * (blockDim.x==blockDim.y and gridDim.x==gridDim.y); no bounds check —
+ * confirm against callers. */
+__global__
+void add_matrix(int* A,int* B,int* C)
+{
+    uint i =(blockIdx.x*blockDim.x)+threadIdx.x;
+    uint j =(blockIdx.y*blockDim.y)+threadIdx.y;
+    C[i*blockDim.x*gridDim.y+j] = A[i*blockDim.x*gridDim.y+j]+B[i*blockDim.x*gridDim.y+j];
+}
+
+
+/* Kernel: fill a flattened matrix of `flat_size` elements with `value`.
+ * One thread per element; threads past the end take the guard and exit. */
+__global__
+void fill_matrix2(int* M,int value,size_t flat_size)
+{
+    const uint idx = (blockIdx.x*blockDim.x)+threadIdx.x;
+    if( idx >= flat_size )
+	return;
+    M[idx] = value;
+}
+
+
+/* Kernel: element-wise sum of two flattened matrices, C = A + B.
+ * One thread per element; threads past the end take the guard and exit. */
+__global__
+void add_matrix2(int* A,int* B,int* C,size_t flat_size)
+{
+    const uint idx = (blockIdx.x*blockDim.x)+threadIdx.x;
+    if( idx >= flat_size )
+	return;
+    C[idx] = A[idx] + B[idx];
+}
+
+
+/* Kernel: in-place element-wise accumulate, C += B.
+ * Uses the same tiled indexing scheme as fill_matrix (the y base scales by
+ * blockDim.x and the row loop steps by blockDim.y), so the same launch
+ * assumptions apply: blockDim.x a multiple of blockDim.y and a grid that
+ * exactly tiles the matrices — there is no bounds check. */
+__global__
+void add_matrix_mod(int* B,int* C)
+{
+    uint x =(blockIdx.x*blockDim.x)+threadIdx.x;
+    uint y =(blockIdx.y*blockDim.x)+threadIdx.y;
+    uint step = blockDim.x*gridDim.x;
+    for(int i=0;i<blockDim.x;i+=blockDim.y)
+	C[x+(y+i)*step]+=B[x+(y+i)*step];
+}
+
+
+/* Kernel: write the identity pattern into I — 1 on the diagonal, 0
+ * elsewhere. One thread per element; threads outside the matrix exit via
+ * the bounds guard. */
+__global__ 
+void kern_identity_matrix(matrix I)
+{
+    const uint col = blockIdx.x*blockDim.x + threadIdx.x;
+    const uint row = blockIdx.y*blockDim.y + threadIdx.y;
+    if( col >= I.columns || row >= I.rows )
+	return;
+    I.elements[col + row*I.columns] = (col == row) ? 1 : 0;
+}
+
+
+/* Naive dense multiply: R = A * B, one thread per output element, row-major
+ * storage with `columns` as the row pitch. */
+__global__
+void kern_mult_matrix_naive(matrix A, matrix B, matrix R)
+{
+    int sum = 0;
+    uint col =(blockIdx.x*blockDim.x)+threadIdx.x;
+    uint row =(blockIdx.y*blockDim.y)+threadIdx.y;
+    /* Bug fix: the guard used `>`, so row==A.rows / col==B.columns slipped
+     * through and read/wrote one row/column out of bounds. */
+    if( row >= A.rows || col >= B.columns ) return;
+    /* Bug fix: the inner dimension of A*B is A.columns (== B.rows); the
+     * original iterated over R.rows, which is wrong for non-square inputs. */
+    for(int k=0;k<A.columns;k++)
+	sum += A.elements[row*A.columns+k] * B.elements[k*B.columns+col];
+    R.elements[row*R.columns+col] = sum;
+}
+
+
+/* Tiled dense multiply: R = A * B using BLOCK_SIZE x BLOCK_SIZE shared-
+ * memory tiles. Launch with blockDim = (BLOCK_SIZE, BLOCK_SIZE) and a grid
+ * covering R; the tile loads assume the dimensions are multiples of
+ * BLOCK_SIZE, as the callers in this project arrange.
+ * Bug fix: the original returned early for out-of-range threads, which made
+ * them skip __syncthreads() — a divergent barrier is undefined behaviour.
+ * Every thread now participates in the barriers and only the final store is
+ * guarded. A barrier is also required after consuming each tile, before the
+ * next iteration overwrites it. */
+__global__
+void kern_mult_matrix_shared(matrix A, matrix B, matrix R)
+{
+    int sum = 0;
+    uint col = (blockIdx.x*blockDim.x)+threadIdx.x;
+    uint row = (blockIdx.y*blockDim.y)+threadIdx.y;
+
+    for(int m=0;m<gridDim.x;m++){
+	/* Sub-matrix views onto the current tile of A (fixed block-row,
+	 * marching right) and B (marching down, fixed block-column). */
+	matrix subA;
+	matrix subB;
+	subA.stride = A.stride;
+	subB.stride = B.stride;
+	subA.elements = A.elements + subA.stride*BLOCK_SIZE*blockIdx.y + m*BLOCK_SIZE;
+	subB.elements = B.elements + subB.stride*BLOCK_SIZE*m + blockIdx.x*BLOCK_SIZE;
+	
+	__shared__ int shared_subA[BLOCK_SIZE][BLOCK_SIZE];
+	__shared__ int shared_subB[BLOCK_SIZE][BLOCK_SIZE];
+	
+	shared_subA[threadIdx.y][threadIdx.x] = subA.elements[threadIdx.y*subA.stride+threadIdx.x];
+	shared_subB[threadIdx.y][threadIdx.x] = subB.elements[threadIdx.y*subB.stride+threadIdx.x];
+	__syncthreads();
+	for(int k=0;k<BLOCK_SIZE;k++)
+	    sum += shared_subA[threadIdx.y][k] * shared_subB[k][threadIdx.x];
+	/* Wait until the whole block is done reading the tile before the
+	 * next iteration reloads it. */
+	__syncthreads();
+    }
+    if( row < R.rows && col < R.columns )
+	R.elements[row*R.stride+col] = sum;
+}

+ 305 - 0
src/loaded.cu

@@ -0,0 +1,305 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "common.h"
+#include "kernels.h"
+#include "gdrapi.h"
+#include <pcilib.h>
+#include <pcilib/kmem.h>
+#include <pcilib/bar.h>
+#include "ipedma.h"
+#include <unistd.h>
+#include <pthread.h>
+
+#define KMEM_DEFAULT_FLAGS      (pcilib_kmem_flags_t)(PCILIB_KMEM_FLAG_HARDWARE | PCILIB_KMEM_FLAG_PERSISTENT | PCILIB_KMEM_FLAG_EXCLUSIVE)
+
+#define KMEM_USE_RING           PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 1)
+#define KMEM_USE_DEFAULT        PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 2)
+#define GPU_NAME_LENGTH 30
+
+#define GPU_PAGE 65536
+#define DATA 0xa2
+
+int stop_process = 0;
+
+/* pthread body: keeps the CPU busy with repeated square matrix
+ * multiplications (MATRIX_ROW_SIZE x MATRIX_ROW_SIZE, random operands)
+ * until main() raises stop_process. Always returns NULL. */
+void* cpu_load_compute(void* arg)
+{
+    matrix lhs, rhs, prod;
+    lhs.rows = MATRIX_ROW_SIZE;
+    lhs.columns = MATRIX_ROW_SIZE;
+    lhs.stride = MATRIX_ROW_SIZE;
+    rhs.rows = MATRIX_ROW_SIZE;
+    rhs.columns = MATRIX_ROW_SIZE;
+    rhs.stride = MATRIX_ROW_SIZE;
+    prod.rows = MATRIX_ROW_SIZE;
+    prod.columns = MATRIX_ROW_SIZE;
+    prod.stride = MATRIX_ROW_SIZE;
+    const size_t bytes = MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(float);
+    lhs.elements = (float*)malloc(bytes);
+    rhs.elements = (float*)malloc(bytes);
+    prod.elements = (float*)malloc(bytes);
+
+    fill_matrix_random(lhs);
+    fill_matrix_random(rhs);
+
+    while(!stop_process)
+	mult_matrix(lhs, rhs, prod);
+
+    free(lhs.elements);
+    free(prod.elements);
+    free(rhs.elements);
+    return NULL;
+}
+
+/* pthread body: keeps the GPU busy by repeatedly launching the shared-
+ * memory matrix-multiplication kernel on identity operands until main()
+ * raises stop_process. Always returns NULL. */
+void* gpu_load_compute(void* arg)
+{
+    dim3 blocks_per_grid(MATRIX_ROW_SIZE/BLOCK_SIZE,MATRIX_ROW_SIZE/BLOCK_SIZE);
+    dim3 threads_per_block(BLOCK_SIZE,BLOCK_SIZE);
+    matrix dev_I,dev_matx,dev_result;
+
+    dev_I.rows = MATRIX_ROW_SIZE;
+    dev_I.columns = dev_I.rows;
+    dev_I.stride = dev_I.columns;
+    dev_matx.rows = MATRIX_ROW_SIZE;
+    dev_matx.columns = dev_matx.rows;
+    /* Bug fix: the original self-assigned dev_matx.rows here and never set
+     * dev_matx.stride, so the kernel indexed B with an uninitialized pitch. */
+    dev_matx.stride = dev_matx.columns;
+    dev_result.rows = MATRIX_ROW_SIZE;
+    dev_result.columns = dev_result.rows;
+    dev_result.stride = dev_result.columns;
+    assert_cuda( cudaMalloc((void**)&dev_I.elements,MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(float)) );
+    assert_cuda( cudaMalloc((void**)&dev_matx.elements,MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(float)) );
+    assert_cuda( cudaMalloc((void**)&dev_result.elements,MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(float)) );
+
+    kern_identity_matrix<<< blocks_per_grid,threads_per_block >>>(dev_I);
+    kern_identity_matrix<<< blocks_per_grid,threads_per_block >>>(dev_matx);
+
+    while(!stop_process){
+	kern_mult_matrix_shared<<< blocks_per_grid,threads_per_block >>>(dev_I,dev_matx,dev_result);
+    }
+    assert_cuda( cudaFree(dev_I.elements) );
+    assert_cuda( cudaFree(dev_result.elements) );
+    assert_cuda( cudaFree(dev_matx.elements) );
+    return NULL;
+}
+
+/* pthread body: generates host memory traffic by copying one large buffer
+ * into another until main() raises stop_process. Always returns NULL.
+ * NOTE(review): the source buffer is never initialised — harmless for a
+ * pure bandwidth load, but the copies read indeterminate bytes. */
+void* cpu_load_memory(void* arg)
+{
+    const size_t bytes = MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(char);
+    char* dst = (char*) malloc( bytes );
+    char* src = (char*) malloc( bytes );
+    while(!stop_process)
+	memcpy(dst, src, bytes);
+    free(dst);
+    free(src);
+    return NULL;
+}
+
+/* pthread body: generates device memory traffic with repeated device-to-
+ * device copies until main() raises stop_process. Always returns NULL. */
+void* gpu_load_memory(void* arg)
+{
+    const size_t bytes = MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(char);
+    char* dev_dst;
+    char* dev_src;
+    assert_cuda( cudaMalloc((void**)&dev_dst,bytes) );
+    assert_cuda( cudaMalloc((void**)&dev_src,bytes) );
+    while(!stop_process)
+	assert_cuda( cudaMemcpy(dev_dst,dev_src,bytes,cudaMemcpyDeviceToDevice) );
+    assert_cuda( cudaFree(dev_dst) );
+    assert_cuda( cudaFree(dev_src) );
+    return NULL;
+}
+
+
+/* Benchmark entry point: same FPGA->GPU DMA measurement as gpu-fpga.cu, but
+ * performed while four background pthreads saturate CPU compute, GPU
+ * compute, host memory bandwidth and device memory bandwidth — measuring
+ * the transfer under system load. Result is appended to loaded.csv.
+ * NOTE(review): argv[argc-1] is not validated; with no argument it parses
+ * the program name and nb_bytes becomes 0. */
+int main(int argc, char* argv[])
+{
+    FILE* fp = fopen("loaded.csv","a");
+    if( fp == NULL ){
+	printf("Cannot open file loaded.csv\n");
+	exit( EXIT_FAILURE );
+    }
+
+    int nb_bytes = atoi(argv[argc -1]);
+    printf("nb_bytes = %d\n",nb_bytes);
+    int nb_transfer = nb_bytes/(4*64); //each transfer deals 64 words of 4 bytes
+    
+    unsigned char* data=(unsigned char*)calloc(nb_bytes,sizeof(*data));
+    memset(data,DATA,nb_bytes);
+    init_to_send(data,sizeof(char),nb_bytes);
+    system("/home/mathiasb/sources/benchmarking/launch.sh");	
+
+    /* Initialisation of the APIs */
+    assert_cu( cuInit(0) );
+    gdr_t g = gdr_open();
+    if( g==NULL){
+	printf("Could not open gdr\n");
+	exit( EXIT_FAILURE );
+    }
+    /* Manage NVIDIA GPU */
+    printf("\nInitialisation of the GPU\n");
+    CUdevice GPU;
+    CUdevprop GPUProp;
+    assert_cu( cuDeviceGet(&GPU,0) );
+    assert_cu( cuDeviceGetProperties(&GPUProp,GPU) );
+    char gpu_name[GPU_NAME_LENGTH] = {0};
+    assert_cu (cuDeviceGetName (gpu_name, GPU_NAME_LENGTH, GPU));
+    printf("GPU: %s\n", gpu_name);    
+    /* Check context */
+    CUcontext cuCtx;
+    assert_cu( cuCtxCreate(&cuCtx,CU_CTX_MAP_HOST|CU_CTX_SCHED_AUTO,GPU) );
+    assert_cu( cuCtxSetCurrent(cuCtx) );
+    
+    /* Allocate memory on the device, pin and map.
+     * SYNC_MEMOPS is required before pinning for GPUDirect RDMA. */
+    uint8_t flagValueToSet = 1;
+    printf("\nMemory mapping with the GPU for pages\n");
+    CUdeviceptr gpuPagePtr;
+    assert_cu( cuMemAlloc(&gpuPagePtr,nb_bytes) );
+    assert_cu( cuPointerSetAttribute(&flagValueToSet,CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,gpuPagePtr) );
+    gdr_mh_t GPUMemHandlePage;
+    assert_gdr( gdr_pin_buffer(g,gpuPagePtr,nb_bytes,0,0,&GPUMemHandlePage) );
+    void* gpuPageVa;
+    assert_gdr( gdr_map(g,GPUMemHandlePage,&gpuPageVa,nb_bytes) );
+    gdr_info_t pageInfo;
+    assert_gdr( gdr_get_info(g,GPUMemHandlePage,&pageInfo) );
+    printf("Memory mapping with the GPU for descriptors\n");
+    CUdeviceptr gpuDescPtr;
+    assert_cu( cuMemAlloc(&gpuDescPtr,GPU_PAGE) );
+    assert_cu( cuPointerSetAttribute(&flagValueToSet,CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,gpuDescPtr) );
+    gdr_mh_t GPUMemHandleDesc;
+    assert_gdr( gdr_pin_buffer(g,gpuDescPtr,GPU_PAGE,0,0,&GPUMemHandleDesc) );
+    void* gpuDescVa;
+    assert_gdr( gdr_map(g,GPUMemHandleDesc,&gpuDescVa,GPU_PAGE) );
+    gdr_info_t descInfo;
+    assert_gdr( gdr_get_info(g,GPUMemHandleDesc,&descInfo) );
+
+    /* PCI: open the FPGA device and map BAR0 into this process. */
+    printf("\nSetting up the PCI\n");
+    pcilib_t* pciCtx;
+    char* pciVa;
+    pciCtx = pcilib_open("/dev/fpga0",PCILIB_MODEL_DETECT);
+    if( pciCtx == NULL ){
+	printf("Cannot open a context for pci\n");
+	exit( EXIT_FAILURE );
+    }
+    pciVa = pcilib_resolve_bar_address(pciCtx,0, 0);
+    if( pciVa == NULL ){
+	printf("Cannot resolve PCI physical adress to virtual\n");
+	exit( EXIT_FAILURE );
+    }
+    CUdeviceptr dBAR;
+    assert_cu( cuMemHostRegister((void*)pciVa,128,CU_MEMHOSTREGISTER_IOMEMORY) );
+    assert_cu( cuMemHostGetDevicePointer(&dBAR,(void*)pciVa, 0) );
+    
+    /* Config PCI for Pages: size rounded up to a 4 KiB multiple. */
+    pcilib_kmem_handle_t* pciHandlePage;
+    pciHandlePage = pcilib_alloc_kernel_memory(pciCtx, PCILIB_KMEM_TYPE_DMA_C2S_PAGE, 1, ((nb_bytes%4096)?(4096 * (1 + nb_bytes/4096)):nb_bytes), 4096, KMEM_USE_DEFAULT, KMEM_DEFAULT_FLAGS);
+    if( pciHandlePage == NULL ){
+	printf("Cannot allocate  PCI kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    volatile void* pciMemPtrPage;
+    uintptr_t pciBusPage;
+    pciMemPtrPage = (uint64_t*) pcilib_kmem_get_block_ua(pciCtx,pciHandlePage,0);
+    if( pciMemPtrPage == NULL ){
+	printf("Cannot get PCI pointer to kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    pciBusPage = pcilib_kmem_get_block_ba(pciCtx,pciHandlePage,0);
+    if( pciBusPage == 0 ){
+	printf("Cannot get PCI Bus address on kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    /* Config PCI for Desc: small consistent buffer the DMA engine updates. */
+    pcilib_kmem_handle_t* pciHandleDesc;
+    pciHandleDesc = pcilib_alloc_kernel_memory(pciCtx,PCILIB_KMEM_TYPE_CONSISTENT, 1, 128, 4096,KMEM_USE_RING, KMEM_DEFAULT_FLAGS);
+    if( pciHandleDesc == NULL ){
+	printf("Cannot allocate  PCI kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    volatile void* pciMemPtrDesc;
+    uintptr_t pciBusDesc;
+    pciMemPtrDesc = (uint64_t*) pcilib_kmem_get_block_ua(pciCtx,pciHandleDesc,0);
+    if( pciMemPtrDesc == NULL ){
+	printf("Cannot get PCI pointer to kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    pciBusDesc = pcilib_kmem_get_block_ba(pciCtx,pciHandleDesc,0);
+    if( pciBusDesc == 0 ){
+	printf("Cannot get PCI Bus address on kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+
+    double simple_write_meas1;
+    double simple_write_meas2;
+    double meas_result;
+    unsigned char* getBack=(unsigned char*)calloc(nb_bytes,sizeof(*getBack));
+
+    /* Spin up the four load generators before touching the DMA engine so the
+     * measurement below happens on a loaded system. */
+    pthread_t cpu_compute,cpu_mem,gpu_compute,gpu_mem;
+    pthread_create(&cpu_compute, NULL, cpu_load_compute, NULL);
+    pthread_create(&gpu_compute, NULL, gpu_load_compute, NULL);
+    pthread_create(&cpu_mem, NULL, cpu_load_memory, NULL);
+    pthread_create(&gpu_mem, NULL, gpu_load_memory, NULL);
+
+    printf("\nWorking on the FPGA\n");
+    WR32(REG_RESET_DMA, 1);
+    usleep(100000);
+    WR32(REG_RESET_DMA, 0);
+    usleep(100000);
+    WR32_sleep(REG_PERF_COUNTER,0);
+    WR32_sleep(REG_NUM_PACKETS_PER_DESCRIPTOR,nb_transfer); //16);
+WR32_sleep(REG_PACKET_LENGTH,0x80000 | 64); // added flag
+    WR32_sleep(REG_TIMER_THRESHOLD, 0x1);
+    WR32_sleep(REG_UPDATE_THRESHOLD, 0x1);
+    WR64_sleep(REG_UPDATE_COUNTER,pciBusDesc);
+    WR64_sleep(REG_UPDATE_ADDRESS,descInfo.bus_addr+DESCRIPTOR_OFFSET);
+
+    WR32_sleep(REG_CONTROL,CONTROL_ENABLE_READ|CONTROL_SOURCE_RX_FIFO);
+    WR32_sleep(REG_DMA,1);
+
+    WR32_sleep(REG_INTERCONNECT, 0x232); //0x262);
+    WR32_sleep(REG_COUNTER,0x1);
+    /* Timing: clear the completion word, sample the FPGA counter, start the
+     * DMA by publishing the GPU page bus address, busy-poll completion.
+     * 4.*RD32(0x14)/1000 presumably converts counter ticks to microseconds
+     * — TODO confirm against the firmware. */
+    *(int*)pciMemPtrDesc=0;
+    simple_write_meas1 = 4. *RD32 (0x14)/ 1000;
+    WR64(REG_DESCRIPTOR_ADDRESS,pageInfo.bus_addr);
+    while(!*(int*)pciMemPtrDesc)
+	simple_write_meas2 = 4. *RD32 (0x14)/ 1000;
+    meas_result=simple_write_meas2-simple_write_meas1;
+
+    usleep(1000);
+    
+    fprintf(fp,"%lf",meas_result);
+    
+
+    /* Close everything: stop the load threads first, then tear down the DMA
+     * engine and the mappings in reverse order of setup. */
+    printf("\nClosing the connections\n");
+
+    stop_process = 1;
+    pthread_join(cpu_compute,NULL);
+    pthread_join(gpu_compute,NULL);
+    pthread_join(cpu_mem,NULL);
+    pthread_join(gpu_mem,NULL);
+
+    free(getBack);
+    WR32(REG_COUNTER, 0);
+    WR32(REG_DMA, 0);
+    WR32(REG_RESET_DMA, 1);
+    usleep (100000);
+    WR32(REG_RESET_DMA, 0);
+    usleep (100000);
+
+    pcilib_free_kernel_memory(pciCtx,pciHandleDesc,PCILIB_KMEM_FLAG_FORCE);
+    pcilib_free_kernel_memory(pciCtx,pciHandlePage,PCILIB_KMEM_FLAG_FORCE);
+    assert_cu( cuMemHostUnregister((void*) pciVa) );
+    pcilib_close(pciCtx);
+    assert_gdr( gdr_unmap(g,GPUMemHandlePage,gpuPageVa,nb_bytes) );
+    assert_gdr( gdr_unpin_buffer(g,GPUMemHandlePage) );
+    assert_gdr( gdr_unmap(g,GPUMemHandleDesc,gpuDescVa,GPU_PAGE) );
+    assert_gdr( gdr_unpin_buffer(g,GPUMemHandleDesc) );
+    assert_gdr( gdr_close(g) );
+    assert_cu( cuMemFree(gpuPagePtr) );
+    assert_cu( cuMemFree(gpuDescPtr) );
+    assert_cu( cuCtxDestroy(cuCtx) );
+    
+    fclose(fp);
+    
+    printf("All Cleared\n");
+    
+    exit(EXIT_SUCCESS);
+}

+ 254 - 0
src/matrix_gpu-fpga.cu

@@ -0,0 +1,254 @@
+/* Square matrix with size multiple of BLOCK_SIZE */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "common.h"
+#include <unistd.h>
+#include <sys/time.h>
+#include <math.h>
+#include "kernels.h"
+#include "gdrapi.h"
+#include <pcilib.h>
+#include <pcilib/kmem.h>
+#include <pcilib/bar.h>
+#include "ipedma.h"
+
+#define KMEM_DEFAULT_FLAGS      (pcilib_kmem_flags_t)(PCILIB_KMEM_FLAG_HARDWARE | PCILIB_KMEM_FLAG_PERSISTENT | PCILIB_KMEM_FLAG_EXCLUSIVE)
+#define KMEM_USE_RING           PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 1)
+#define KMEM_USE_DEFAULT        PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 2)
+#define GPU_NAME_LENGTH 30
+#define PAGE_SIZE 1024
+#define MAX_SIZE 1024*1
+#define GPU_PAGE (MAX_SIZE/PAGE_SIZE)*65536
+#define VALUE 5.
+
+
+/*
+ * matrix_gpu-fpga: streams rows of an identity matrix through the FPGA DMA
+ * directly into GPU memory (GPUDirect RDMA via gdrcopy), then verifies the
+ * received matrix on the host.
+ *
+ * Flow: init CUDA + gdrcopy -> pin/map a GPU page buffer and a GPU descriptor
+ * buffer -> map the FPGA BAR0 -> allocate host kernel memory for the DMA ->
+ * build an identity matrix with a CUDA kernel -> push it through the FPGA
+ * back into the GPU page buffer -> check the result.
+ * Any setup failure exits with EXIT_FAILURE; returns EXIT_SUCCESS otherwise.
+ */
+int main(int argc, char* argv[])
+{
+    /* Initialisation of the APIs */
+    assert_cu( cuInit(0) );
+    gdr_t g = gdr_open();
+    if( g==NULL){
+	printf("Could not open gdr\n");
+	exit( EXIT_FAILURE );
+    }
+    
+    /* Manage NVIDIA GPU */
+    printf("\nInitialisation of the GPU\n");
+    CUdevice GPU;
+    CUdevprop GPUProp;
+    assert_cu( cuDeviceGet(&GPU,0) );
+    assert_cu( cuDeviceGetProperties(&GPUProp,GPU) );
+    char gpu_name[GPU_NAME_LENGTH] = {0};
+    assert_cu (cuDeviceGetName (gpu_name, GPU_NAME_LENGTH, GPU));
+    printf("GPU: %s\n", gpu_name);    
+    /* Check context */
+    CUcontext cuCtx;
+    assert_cu( cuCtxCreate(&cuCtx,CU_CTX_MAP_HOST|CU_CTX_SCHED_AUTO,GPU) );
+    assert_cu( cuCtxSetCurrent(cuCtx) );
+    
+    /* Allocate memory on the device, pin and map.
+     * SYNC_MEMOPS is set so third-party (RDMA) writes stay coherent with
+     * subsequent CUDA memory operations on these allocations. */
+    uint8_t flagValueToSet = 1;
+    printf("\nMemory mapping with the GPU for pages\n");
+    CUdeviceptr gpuPagePtr;
+    assert_cu( cuMemAlloc(&gpuPagePtr,MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(float)) );
+    assert_cu( cuPointerSetAttribute(&flagValueToSet,CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,gpuPagePtr) );
+    gdr_mh_t GPUMemHandlePage;
+    assert_gdr( gdr_pin_buffer(g,gpuPagePtr,MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(float),0,0,&GPUMemHandlePage) );
+    void* gpuPageVa;
+    assert_gdr( gdr_map(g,GPUMemHandlePage,&gpuPageVa,MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(float)) );
+    gdr_info_t pageInfo;
+    assert_gdr( gdr_get_info(g,GPUMemHandlePage,&pageInfo) );
+    printf("Memory mapping with the GPU for descriptors\n");
+    CUdeviceptr gpuDescPtr;
+    assert_cu( cuMemAlloc(&gpuDescPtr,GPU_PAGE*sizeof(float)) );       
+    assert_cu( cuPointerSetAttribute(&flagValueToSet,CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,gpuDescPtr) );
+    gdr_mh_t GPUMemHandleDesc;
+    assert_gdr( gdr_pin_buffer(g,gpuDescPtr,GPU_PAGE*sizeof(float),0,0,&GPUMemHandleDesc) );
+    void* gpuDescVa;
+    assert_gdr( gdr_map(g,GPUMemHandleDesc,&gpuDescVa,GPU_PAGE*sizeof(float)) );
+    gdr_info_t descInfo;
+    assert_gdr( gdr_get_info(g,GPUMemHandleDesc,&descInfo) );
+
+    /* PCI: open the FPGA device and map BAR0, then expose the BAR to CUDA */
+    printf("\nSetting up the PCI\n");
+    pcilib_t* pciCtx;
+    char* pciVa;
+    pciCtx = pcilib_open("/dev/fpga0",PCILIB_MODEL_DETECT);
+    if( pciCtx == NULL ){
+	printf("Cannot open a context for pci\n");
+	exit( EXIT_FAILURE );
+    }
+    pciVa = pcilib_resolve_bar_address(pciCtx,0, 0);
+    if( pciVa == NULL ){
+	printf("Cannot resolve PCI physical adress to virtual\n");
+	exit( EXIT_FAILURE );
+    }
+    CUdeviceptr dBAR;
+    assert_cu( cuMemHostRegister((void*)pciVa,128,CU_MEMHOSTREGISTER_IOMEMORY) );
+    assert_cu( cuMemHostGetDevicePointer(&dBAR,(void*)pciVa, 0) );
+    
+    /* Config PCI for Pages: host-side DMA page buffer, 4096-aligned */
+    pcilib_kmem_handle_t* pciHandlePage;
+    pciHandlePage = pcilib_alloc_kernel_memory(pciCtx, PCILIB_KMEM_TYPE_DMA_C2S_PAGE, 1, ((PAGE_SIZE%4096)?(4096 * (1 + PAGE_SIZE/4096)):PAGE_SIZE), 4096, KMEM_USE_DEFAULT, KMEM_DEFAULT_FLAGS);
+    if( pciHandlePage == NULL ){
+	printf("Cannot allocate  PCI kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    volatile void* pciMemPtrPage;
+    uintptr_t pciBusPage;
+    pciMemPtrPage = (uint64_t*) pcilib_kmem_get_block_ua(pciCtx,pciHandlePage,0);
+    if( pciMemPtrPage == NULL ){
+	printf("Cannot get PCI pointer to kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    pciBusPage = pcilib_kmem_get_block_ba(pciCtx,pciHandlePage,0);
+    if( pciBusPage == 0 ){
+	printf("Cannot get PCI Bus address on kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    /* Config PCI for Desc: small consistent buffer for descriptor updates */
+    pcilib_kmem_handle_t* pciHandleDesc;
+    pciHandleDesc = pcilib_alloc_kernel_memory(pciCtx,PCILIB_KMEM_TYPE_CONSISTENT, 1, 128, 4096,KMEM_USE_RING, KMEM_DEFAULT_FLAGS);
+    if( pciHandleDesc == NULL ){
+	printf("Cannot allocate  PCI kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    volatile void* pciMemPtrDesc;
+    uintptr_t pciBusDesc;
+    pciMemPtrDesc = (uint64_t*) pcilib_kmem_get_block_ua(pciCtx,pciHandleDesc,0);
+    if( pciMemPtrDesc == NULL ){
+	printf("Cannot get PCI pointer to kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    pciBusDesc = pcilib_kmem_get_block_ba(pciCtx,pciHandleDesc,0);
+    if( pciBusDesc == 0 ){
+	printf("Cannot get PCI Bus address on kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+
+ 
+    /* Host and device matrices; 2-D launch of BLOCK_SIZE x BLOCK_SIZE tiles */
+    cudaError_t err;
+    matrix I,matx,result;
+    dim3 blocks_per_grid(MATRIX_ROW_SIZE/BLOCK_SIZE,MATRIX_ROW_SIZE/BLOCK_SIZE);
+    dim3 threads_per_block(BLOCK_SIZE,BLOCK_SIZE);
+    matrix dev_I,dev_matx,dev_result;
+
+    cudaEvent_t start,stop;
+    float ms_shared,ms_naive;
+    struct timeval t1,t2;
+    double time;
+    assert_cuda( cudaEventCreate(&start) );
+    assert_cuda( cudaEventCreate(&stop) );
+
+    I.rows = MATRIX_ROW_SIZE;
+    I.columns = I.rows;
+    I.stride = I.columns;
+    matx.rows = MATRIX_ROW_SIZE;
+    matx.columns = matx.rows;
+    matx.stride = matx.columns;
+    result.rows = MATRIX_ROW_SIZE;
+    result.columns = result.rows;
+    result.stride = result.columns;
+    I.elements = (float*)malloc(I.rows*I.columns*sizeof(float));
+    /* matx.elements is backed by the gdr-mapped GPU buffer (assigned below);
+     * the previous malloc here was leaked as soon as the pointer was
+     * overwritten with gpuPageVa, so it has been removed. */
+    result.elements = (float*)malloc(result.rows*result.columns*sizeof(float));
+
+    dev_I.rows = I.rows;
+    dev_I.columns = dev_I.rows;
+    dev_I.stride = dev_I.columns;
+    dev_matx.rows = matx.rows;
+    dev_matx.columns = dev_matx.rows;
+    dev_matx.stride = dev_matx.columns; /* fix: was "dev_matx.rows = dev_matx.columns", leaving stride uninitialised */
+    dev_result.rows = result.rows;
+    dev_result.columns = dev_result.rows;
+    dev_result.stride = dev_result.columns;
+    assert_cuda( cudaMalloc((void**)&dev_I.elements,I.rows*I.columns*sizeof(float)) );
+    matx.elements = (float*)gpuPageVa;
+    dev_matx.elements = (float*)gpuPagePtr;
+    assert_cuda( cudaMalloc((void**)&dev_result.elements,result.rows*result.columns*sizeof(float)) );
+
+    /* Build the identity matrix on the GPU and copy it back as the reference */
+    kern_identity_matrix<<< blocks_per_grid,threads_per_block >>>(dev_I);
+    err = cudaGetLastError();
+    printf("%s: %s\n",cudaGetErrorName(err),cudaGetErrorString(err));
+    assert_cuda( cudaMemcpy(I.elements,dev_I.elements,I.rows*I.columns*sizeof(float),cudaMemcpyDeviceToHost) );
+    //check_identity_matrix(I);
+
+    
+    /* FPGA: for each iteration, stage a row block, reset the DMA engine,
+     * program the descriptor/update registers and hand the GPU page bus
+     * address to the FPGA, then wait for the descriptor update. */
+    for(int i=0;i<1;i++){//MATRIX_ROW_SIZE;i++){
+	printf("iteration %d/%d\n",i+1,MATRIX_ROW_SIZE);
+	init_to_send(I.elements+i*MATRIX_ROW_SIZE,sizeof(float),MATRIX_ROW_SIZE*MATRIX_ROW_SIZE);//MATRIX_ROW_SIZE);
+	system("/home/mathiasb/sources/benchmarking/launch.sh");
+	usleep(100000);
+	
+	WR32(REG_RESET_DMA, 1);
+	usleep(100000);
+	WR32(REG_RESET_DMA, 0);
+	usleep(100000);
+	WR32_sleep(REG_PERF_COUNTER,0);
+	WR32_sleep(REG_NUM_PACKETS_PER_DESCRIPTOR,64);
+	WR32_sleep(REG_PACKET_LENGTH,0x80000 | 64);
+	WR32_sleep(REG_TIMER_THRESHOLD, 0x1);
+	WR32_sleep(REG_UPDATE_THRESHOLD, 0x1);
+	WR64_sleep(REG_UPDATE_COUNTER,descInfo.bus_addr);
+	WR64_sleep(REG_UPDATE_ADDRESS,pciBusDesc+DESCRIPTOR_OFFSET);	
+        WR32_sleep(REG_CONTROL,CONTROL_ENABLE_READ|CONTROL_SOURCE_RX_FIFO);
+	WR32(REG_DMA,1);
+	WR32(REG_INTERCONNECT, 0x232);
+	WR32(REG_COUNTER,0x1);
+	*(volatile int*)gpuDescVa=0;
+	WR64(REG_DESCRIPTOR_ADDRESS,pageInfo.bus_addr+i*MATRIX_ROW_SIZE*sizeof(float));
+	/* Busy-wait until the FPGA writes the descriptor update into the
+	 * mapped GPU buffer.  The volatile cast is required: gpuDescVa is a
+	 * plain void* and without it the compiler may hoist the load out of
+	 * the loop, spinning forever on a cached value. */
+	while(*(volatile int*)gpuDescVa == 0);
+	usleep(100000);
+    }
+
+    /* Read back what the FPGA wrote into GPU memory and verify it */
+    assert_cuda( cudaMemcpy(I.elements,dev_matx.elements,I.rows*I.columns*sizeof(float),cudaMemcpyDeviceToHost) );
+    check_identity_matrix(I);
+
+   
+    /* assert_cuda( cudaEventRecord(start) ); */
+    /* kern_mult_matrix_shared<<< blocks_per_grid,threads_per_block >>>(dev_I,dev_matx,dev_result); */
+    /* err = cudaGetLastError(); */
+    /* printf("%s: %s\n",cudaGetErrorName(err),cudaGetErrorString(err)); */
+    /* assert_cuda( cudaDeviceSynchronize() ); */
+    /* assert_cuda( cudaEventRecord(stop) ); */
+    /* assert_cuda( cudaEventSynchronize(stop) ); */
+    /* assert_cuda( cudaEventElapsedTime(&ms_shared,start,stop) ); */
+    /* assert_cuda( cudaMemcpy(result.elements,dev_result.elements,result.rows*result.columns*sizeof(int),cudaMemcpyDeviceToHost) ); */
+    /* check_identity_matrix(result); */
+    /* //check_matrix(result, MATRIX_VALUE); */
+    /* printf("Time for GPU (shared): %f\n", ms_shared); */
+
+    
+exit:
+    /* Teardown: quiesce the DMA engine before releasing its buffers */
+    printf("\nClosing the connections\n");
+    WR32(REG_COUNTER, 0);
+    WR32(REG_DMA, 0);
+    WR32(REG_RESET_DMA, 1);
+    usleep (100000);
+    WR32(REG_RESET_DMA, 0);
+    usleep (100000);    
+    pcilib_free_kernel_memory(pciCtx,pciHandleDesc,PCILIB_KMEM_FLAG_FORCE);
+    pcilib_free_kernel_memory(pciCtx,pciHandlePage,PCILIB_KMEM_FLAG_FORCE);
+    assert_cu( cuMemHostUnregister((void*) pciVa) );
+    pcilib_close(pciCtx);
+    assert_gdr( gdr_unmap(g,GPUMemHandlePage,gpuPageVa,MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(float)) );
+    assert_gdr( gdr_unpin_buffer(g,GPUMemHandlePage) );
+    assert_gdr( gdr_unmap(g,GPUMemHandleDesc,gpuDescVa,GPU_PAGE*sizeof(float)) );
+    assert_gdr( gdr_unpin_buffer(g,GPUMemHandleDesc) );
+    assert_gdr( gdr_close(g) );
+    assert_cu( cuMemFree(gpuPagePtr) );
+    assert_cu( cuMemFree(gpuDescPtr) );
+    assert_cu( cuCtxDestroy(cuCtx) );
+    assert_cuda( cudaFree(dev_I.elements) );
+    assert_cuda( cudaFree(dev_result.elements) );
+    free(I.elements);
+    /* matx.elements aliases the gdr mapping (already unmapped above) — do not free() it */
+    free(result.elements);
+    assert_cuda( cudaEventDestroy(start) );
+    assert_cuda( cudaEventDestroy(stop) );
+
+    printf("All Cleared\n");
+    exit( EXIT_SUCCESS );
+}

+ 261 - 0
src/multi-gpu.cu

@@ -0,0 +1,261 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "common.h"
+#include "kernels.h"
+#include "gdrapi.h"
+#include <pcilib.h>
+#include <pcilib/kmem.h>
+#include <pcilib/bar.h>
+#include "ipedma.h"
+#include <unistd.h>
+
+#define KMEM_DEFAULT_FLAGS      (pcilib_kmem_flags_t)(PCILIB_KMEM_FLAG_HARDWARE | PCILIB_KMEM_FLAG_PERSISTENT | PCILIB_KMEM_FLAG_EXCLUSIVE)
+
+#define KMEM_USE_RING           PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 1)
+#define KMEM_USE_DEFAULT        PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 2)
+#define GPU_NAME_LENGTH 30
+
+#define GPU_PAGE 65536
+#define DATA 0xa2
+
+
+/*
+ * multi-gpu: a single FPGA DMA run that scatters pages to two GPUs.  A page
+ * buffer and a descriptor buffer are pinned and mapped (gdrcopy) on each GPU;
+ * the FPGA is handed both page bus addresses and the time between the start
+ * of the transfer and the second descriptor update is appended to
+ * multi-gpu.csv.
+ *
+ * argv[argc-1]: number of bytes to transfer to each GPU.
+ */
+int main(int argc, char* argv[])
+{
+    FILE* fp = fopen("multi-gpu.csv","a");
+    if( fp == NULL ){
+	printf("Cannot open file multi-gpu.csv\n");
+	exit( EXIT_FAILURE );
+    }
+
+    /* Guard against a missing size argument (atoi(argv[0]) would yield 0) */
+    if( argc < 2 ){
+	printf("Usage: %s <nb_bytes>\n",argv[0]);
+	exit( EXIT_FAILURE );
+    }
+    int nb_bytes = atoi(argv[argc -1]);
+    printf("nb_bytes = %d\n",nb_bytes);
+    int nb_transfer = nb_bytes/(4*64); //each transfer deals 64 words of 4 bytes
+    
+    /* unsigned char* data=(unsigned char*)calloc(nb_bytes,sizeof(*data)); */
+    /* memset(data,DATA,nb_bytes); */
+    /* init_to_send(data,sizeof(char),nb_bytes); */
+    /* system("/home/mathiasb/sources/benchmarking/launch.sh");	 */
+    
+    /* Initialisation of the APIs */
+    assert_cu( cuInit(0) );
+    gdr_t g = gdr_open();
+    if( g==NULL){
+	printf("Could not open gdr\n");
+	exit( EXIT_FAILURE );
+    }
+    /* Manage NVIDIA GPU: one context per device */
+    printf("\nInitialisation of the GPU\n");
+    CUdevice GPU0,GPU1;
+    CUdevprop GPUProp;
+    assert_cu( cuDeviceGet(&GPU0,0) );
+    assert_cu( cuDeviceGetProperties(&GPUProp,GPU0) );
+    char gpu_name[GPU_NAME_LENGTH] = {0};
+    assert_cu (cuDeviceGetName (gpu_name, GPU_NAME_LENGTH, GPU0));
+    printf("GPU0: %s\n", gpu_name);
+    assert_cu( cuDeviceGet(&GPU1,1) );
+    assert_cu( cuDeviceGetProperties(&GPUProp,GPU1) );
+    assert_cu (cuDeviceGetName (gpu_name, GPU_NAME_LENGTH, GPU1));
+    printf("GPU1: %s\n", gpu_name);
+    /* Check context: after the two cuCtxCreate calls cuCtx1 is current; the
+     * pop/push pairs below select which device the allocations land on. */
+    CUcontext cuCtx0,cuCtx1;
+    assert_cu( cuCtxCreate(&cuCtx0,CU_CTX_MAP_HOST|CU_CTX_SCHED_AUTO,GPU0) );
+    assert_cu( cuCtxCreate(&cuCtx1,CU_CTX_MAP_HOST|CU_CTX_SCHED_AUTO,GPU1) );
+    
+    /* Allocate memory on each device, pin and map.  SYNC_MEMOPS keeps RDMA
+     * writes coherent with later CUDA memory operations. */
+    uint8_t flagValueToSet = 1;
+
+    printf("\nMemory mapping with the GPU0 for pages\n");
+    assert_cu( cuCtxPopCurrent(&cuCtx1) ); //ctx0
+    CUdeviceptr gpuPagePtr0;
+    assert_cu( cuMemAlloc(&gpuPagePtr0,nb_bytes) );
+    assert_cu( cuPointerSetAttribute(&flagValueToSet,CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,gpuPagePtr0) );
+    gdr_mh_t GPUMemHandlePage0;
+    assert_gdr( gdr_pin_buffer(g,gpuPagePtr0,nb_bytes,0,0,&GPUMemHandlePage0) );
+    void* gpuPageVa0;
+    assert_gdr( gdr_map(g,GPUMemHandlePage0,&gpuPageVa0,nb_bytes) );
+    gdr_info_t pageInfo0;
+    assert_gdr( gdr_get_info(g,GPUMemHandlePage0,&pageInfo0) );
+    printf("Memory mapping with the GPU0 for descriptors\n");
+    CUdeviceptr gpuDescPtr0;
+    assert_cu( cuMemAlloc(&gpuDescPtr0,GPU_PAGE) );       
+    assert_cu( cuPointerSetAttribute(&flagValueToSet,CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,gpuDescPtr0) );
+    gdr_mh_t GPUMemHandleDesc0;
+    assert_gdr( gdr_pin_buffer(g,gpuDescPtr0,GPU_PAGE,0,0,&GPUMemHandleDesc0) );
+    void* gpuDescVa0;
+    assert_gdr( gdr_map(g,GPUMemHandleDesc0,&gpuDescVa0,GPU_PAGE) );
+    gdr_info_t descInfo0;
+    assert_gdr( gdr_get_info(g,GPUMemHandleDesc0,&descInfo0) );
+
+    printf("\nMemory mapping with the GPU1 for pages\n");
+    assert_cu( cuCtxPushCurrent(cuCtx1) ); //ctx1
+    CUdeviceptr gpuPagePtr1;
+    assert_cu( cuMemAlloc(&gpuPagePtr1,nb_bytes) );
+    assert_cu( cuPointerSetAttribute(&flagValueToSet,CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,gpuPagePtr1) );
+    gdr_mh_t GPUMemHandlePage1;
+    assert_gdr( gdr_pin_buffer(g,gpuPagePtr1,nb_bytes,0,0,&GPUMemHandlePage1) );
+    void* gpuPageVa1;
+    assert_gdr( gdr_map(g,GPUMemHandlePage1,&gpuPageVa1,nb_bytes) );
+    gdr_info_t pageInfo1;
+    assert_gdr( gdr_get_info(g,GPUMemHandlePage1,&pageInfo1) );
+    printf("Memory mapping with the GPU1 for descriptors\n");
+    CUdeviceptr gpuDescPtr1;
+    assert_cu( cuMemAlloc(&gpuDescPtr1,GPU_PAGE) );       
+    assert_cu( cuPointerSetAttribute(&flagValueToSet,CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,gpuDescPtr1) );
+    gdr_mh_t GPUMemHandleDesc1;
+    assert_gdr( gdr_pin_buffer(g,gpuDescPtr1,GPU_PAGE,0,0,&GPUMemHandleDesc1) );
+    void* gpuDescVa1;
+    assert_gdr( gdr_map(g,GPUMemHandleDesc1,&gpuDescVa1,GPU_PAGE) );
+    gdr_info_t descInfo1;
+    assert_gdr( gdr_get_info(g,GPUMemHandleDesc1,&descInfo1) );
+
+    
+    /* PCI: open the FPGA device, map BAR0 and expose it to CUDA */
+    printf("\nSetting up the PCI\n");
+    pcilib_t* pciCtx;
+    char* pciVa;
+    pciCtx = pcilib_open("/dev/fpga0",PCILIB_MODEL_DETECT);
+    if( pciCtx == NULL ){
+	printf("Cannot open a context for pci\n");
+	exit( EXIT_FAILURE );
+    }
+    pciVa = pcilib_resolve_bar_address(pciCtx,0, 0);
+    if( pciVa == NULL ){
+	printf("Cannot resolve PCI physical adress to virtual\n");
+	exit( EXIT_FAILURE );
+    }
+    CUdeviceptr dBAR;
+    assert_cu( cuMemHostRegister((void*)pciVa,128,CU_MEMHOSTREGISTER_IOMEMORY) );
+    assert_cu( cuMemHostGetDevicePointer(&dBAR,(void*)pciVa, 0) );
+    
+    /* Config PCI for Pages*/
+    pcilib_kmem_handle_t* pciHandlePage;
+    pciHandlePage = pcilib_alloc_kernel_memory(pciCtx, PCILIB_KMEM_TYPE_DMA_C2S_PAGE, 1, ((nb_bytes%4096)?(4096 * (1 + nb_bytes/4096)):nb_bytes), 4096, KMEM_USE_DEFAULT, KMEM_DEFAULT_FLAGS);
+    if( pciHandlePage == NULL ){
+	printf("Cannot allocate  PCI kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    volatile void* pciMemPtrPage;
+    uintptr_t pciBusPage;
+    pciMemPtrPage = (uint64_t*) pcilib_kmem_get_block_ua(pciCtx,pciHandlePage,0);
+    if( pciMemPtrPage == NULL ){
+	printf("Cannot get PCI pointer to kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    pciBusPage = pcilib_kmem_get_block_ba(pciCtx,pciHandlePage,0);
+    if( pciBusPage == 0 ){
+	printf("Cannot get PCI Bus address on kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    /* Config PCI for Desc */
+    pcilib_kmem_handle_t* pciHandleDesc;
+    pciHandleDesc = pcilib_alloc_kernel_memory(pciCtx,PCILIB_KMEM_TYPE_CONSISTENT, 1, 128, 4096,KMEM_USE_RING, KMEM_DEFAULT_FLAGS);
+    if( pciHandleDesc == NULL ){
+	printf("Cannot allocate  PCI kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    volatile void* pciMemPtrDesc;
+    uintptr_t pciBusDesc;
+    pciMemPtrDesc = (uint64_t*) pcilib_kmem_get_block_ua(pciCtx,pciHandleDesc,0);
+    if( pciMemPtrDesc == NULL ){
+	printf("Cannot get PCI pointer to kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    pciBusDesc = pcilib_kmem_get_block_ba(pciCtx,pciHandleDesc,0);
+    if( pciBusDesc == 0 ){
+	printf("Cannot get PCI Bus address on kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+
+    double simple_write_meas1;
+    double simple_write_meas2;
+    double meas_result;
+    unsigned char* getBack=(unsigned char*)calloc(nb_bytes,sizeof(*getBack));
+
+    /* for(int j=0;j<nb_bytes;j++) */
+    /* 	printf("%hhx",data[j]); */
+    /* printf("\n"); */
+    /* memcpy(getBack,(const void*)gpuPageVa,nb_bytes); */
+    /* for(int j=0;j<nb_bytes;j++) */
+    /* 	printf("%hhx",getBack[j]); */
+    /* printf("\n"); */
+	
+
+    /* Program the DMA engine, then hand it one descriptor per GPU page.
+     * The measurement window runs from just before the first descriptor is
+     * queued until the FPGA has reported both updates. */
+    printf("\nWorking on the FPGA\n");
+    WR32(REG_RESET_DMA, 1);
+    usleep(100000);
+    WR32(REG_RESET_DMA, 0);
+    usleep(100000);
+    WR32_sleep(REG_PERF_COUNTER,0);
+    WR32_sleep(REG_NUM_PACKETS_PER_DESCRIPTOR,nb_transfer); //16);
+    WR32_sleep(REG_PACKET_LENGTH,0x80000 | 64); // added flag
+    WR32_sleep(REG_TIMER_THRESHOLD, 0x1);
+    WR32_sleep(REG_UPDATE_THRESHOLD, 0x1);
+    WR64_sleep(REG_UPDATE_COUNTER,pciBusDesc);
+//    WR64_sleep(REG_UPDATE_ADDRESS,descInfo0.bus_addr+DESCRIPTOR_OFFSET);
+    WR32_sleep(REG_CONTROL,CONTROL_ENABLE_READ|CONTROL_SOURCE_RX_FIFO);
+    WR32_sleep(REG_DMA,1);
+    WR32_sleep(REG_INTERCONNECT, 0x232); //0x262);
+    WR32_sleep(REG_COUNTER,0x1);
+    /* pciMemPtrDesc is declared volatile; the casts must keep the qualifier
+     * or the compiler may cache the flag and spin forever */
+    *(volatile int*)pciMemPtrDesc=0;
+    simple_write_meas1 = 4. *RD32 (0x14)/ 1000;
+    WR64(REG_DESCRIPTOR_ADDRESS,pageInfo0.bus_addr);
+    WR64(REG_DESCRIPTOR_ADDRESS,pageInfo1.bus_addr);
+    while(*(volatile int*)pciMemPtrDesc != 2)
+	simple_write_meas2 = 4. *RD32 (0x14)/ 1000;
+    meas_result=simple_write_meas2-simple_write_meas1;
+    
+    fprintf(fp,"%lf",meas_result);
+    
+    /* assert_cu( cuCtxPopCurrent(&cuCtx1) ); //ctx0 */
+    /* memcpy(getBack,(const void*)gpuPageVa0,nb_bytes); */
+    /* for(int j=0;j<nb_bytes;j++){ */
+    /* 	printf("%hhx",getBack[j]); */
+    /* } */
+    /* printf("\n"); */
+    /* assert_cu( cuCtxPushCurrent(cuCtx1) ); //ctx1 */
+    /* memcpy(getBack,(const void*)gpuPageVa1,nb_bytes); */
+    /* for(int j=0;j<nb_bytes;j++){ */
+    /* 	printf("%hhx",getBack[j]); */
+    /* } */
+    /* printf("\n"); */
+    
+    usleep(1000);
+
+
+    /* Close everything.  Stop and reset the DMA engine before releasing the
+     * buffers it may still be writing to (matches the teardown sequence of
+     * the sibling benchmarks). */
+    printf("\nClosing the connections\n");
+    WR32(REG_COUNTER, 0);
+    WR32(REG_DMA, 0);
+    WR32(REG_RESET_DMA, 1);
+    usleep (100000);
+    WR32(REG_RESET_DMA, 0);
+    usleep (100000);
+    pcilib_free_kernel_memory(pciCtx,pciHandleDesc,PCILIB_KMEM_FLAG_FORCE);
+    pcilib_free_kernel_memory(pciCtx,pciHandlePage,PCILIB_KMEM_FLAG_FORCE);
+    assert_cu( cuMemHostUnregister((void*) pciVa) );
+    pcilib_close(pciCtx);
+    assert_gdr( gdr_unmap(g,GPUMemHandlePage0,gpuPageVa0,nb_bytes) );
+    assert_gdr( gdr_unpin_buffer(g,GPUMemHandlePage0) );
+    assert_gdr( gdr_unmap(g,GPUMemHandleDesc0,gpuDescVa0,GPU_PAGE) );
+    assert_gdr( gdr_unpin_buffer(g,GPUMemHandleDesc0) );
+    assert_gdr( gdr_unmap(g,GPUMemHandlePage1,gpuPageVa1,nb_bytes) );
+    assert_gdr( gdr_unpin_buffer(g,GPUMemHandlePage1) );
+    assert_gdr( gdr_unmap(g,GPUMemHandleDesc1,gpuDescVa1,GPU_PAGE) );
+    assert_gdr( gdr_unpin_buffer(g,GPUMemHandleDesc1) );
+    assert_gdr( gdr_close(g) );
+    assert_cu( cuCtxPopCurrent(&cuCtx1) ); //ctx0
+    assert_cu( cuMemFree(gpuPagePtr0) );
+    assert_cu( cuMemFree(gpuDescPtr0) );
+    assert_cu( cuCtxDestroy(cuCtx0) );
+    assert_cu( cuCtxPushCurrent(cuCtx1) ); //ctx1
+    assert_cu( cuMemFree(gpuPagePtr1) );
+    assert_cu( cuMemFree(gpuDescPtr1) );
+    assert_cu( cuCtxDestroy(cuCtx1) );
+
+    free(getBack); /* was leaked */
+    fclose(fp);
+	
+    printf("All Cleared\n");
+    
+    exit(EXIT_SUCCESS);
+}

+ 230 - 0
src/two_steps_dma.cu

@@ -0,0 +1,230 @@
+/* A single test aimed at being looped with a script. Handles the number of Bytes passed in argv[1]. Passes them to CPU from FPGA, then to GPU. */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "common.h"
+#include "kernels.h"
+#include "gdrapi.h"
+#include <pcilib.h>
+#include <pcilib/kmem.h>
+#include <pcilib/bar.h>
+#include "ipedma.h"
+#include <unistd.h>
+
+#define KMEM_DEFAULT_FLAGS      (pcilib_kmem_flags_t)(PCILIB_KMEM_FLAG_HARDWARE | PCILIB_KMEM_FLAG_PERSISTENT | PCILIB_KMEM_FLAG_EXCLUSIVE)
+
+#define KMEM_USE_RING           PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 1)
+#define KMEM_USE_DEFAULT        PCILIB_KMEM_USE(PCILIB_KMEM_USE_USER, 2)
+#define GPU_NAME_LENGTH 30
+
+#define GPU_PAGE 65536
+#define DATA 0xa2
+
+
+/*
+ * two_steps_dma: transfers argv[argc-1] bytes from the FPGA into CPU kernel
+ * memory via DMA, then copies them to the GPU — a two-step path to compare
+ * against the direct GPUDirect transfer.  The elapsed time (FPGA performance
+ * counter units, see RD32(0x14)) is appended to two_steps_dma.csv.
+ */
+int main(int argc, char* argv[])
+{
+    FILE* fp = fopen("two_steps_dma.csv","a");
+    if( fp == NULL ){
+	printf("Cannot open file two_steps_dma.csv\n");
+	exit( EXIT_FAILURE );
+    }
+
+    /* Guard against a missing size argument (atoi(argv[0]) would yield 0) */
+    if( argc < 2 ){
+	printf("Usage: %s <nb_bytes>\n",argv[0]);
+	exit( EXIT_FAILURE );
+    }
+    int nb_bytes = atoi(argv[argc -1]);
+    printf("nb_bytes = %d\n",nb_bytes);
+    int nb_transfer = nb_bytes/(4*64); //each transfer deals 64 words of 4 bytes
+    
+    /* Stage the test pattern for the FPGA sender */
+    unsigned char* data=(unsigned char*)calloc(nb_bytes,sizeof(*data));
+    memset(data,DATA,nb_bytes);
+    init_to_send(data,sizeof(char),nb_bytes);
+    system("/home/mathiasb/sources/benchmarking/launch.sh");	
+
+    /* Initialisation of the APIs */
+    assert_cu( cuInit(0) );
+    gdr_t g = gdr_open();
+    if( g==NULL){
+	printf("Could not open gdr\n");
+	exit( EXIT_FAILURE );
+    }
+    /* Manage NVIDIA GPU */
+    printf("\nInitialisation of the GPU\n");
+    CUdevice GPU;
+    CUdevprop GPUProp;
+    assert_cu( cuDeviceGet(&GPU,0) );
+    assert_cu( cuDeviceGetProperties(&GPUProp,GPU) );
+    char gpu_name[GPU_NAME_LENGTH] = {0};
+    assert_cu (cuDeviceGetName (gpu_name, GPU_NAME_LENGTH, GPU));
+    printf("GPU: %s\n", gpu_name);    
+    /* Check context */
+    CUcontext cuCtx;
+    assert_cu( cuCtxCreate(&cuCtx,CU_CTX_MAP_HOST|CU_CTX_SCHED_AUTO,GPU) );
+    assert_cu( cuCtxSetCurrent(cuCtx) );
+    
+    /* Allocate memory on the device, pin and map.  SYNC_MEMOPS keeps RDMA
+     * writes coherent with later CUDA memory operations. */
+    uint8_t flagValueToSet = 1;
+    printf("\nMemory mapping with the GPU for pages\n");
+    CUdeviceptr gpuPagePtr;
+    assert_cu( cuMemAlloc(&gpuPagePtr,nb_bytes) );
+    assert_cu( cuPointerSetAttribute(&flagValueToSet,CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,gpuPagePtr) );
+    gdr_mh_t GPUMemHandlePage;
+    assert_gdr( gdr_pin_buffer(g,gpuPagePtr,nb_bytes,0,0,&GPUMemHandlePage) );
+    void* gpuPageVa;
+    assert_gdr( gdr_map(g,GPUMemHandlePage,&gpuPageVa,nb_bytes) );
+    gdr_info_t pageInfo;
+    assert_gdr( gdr_get_info(g,GPUMemHandlePage,&pageInfo) );
+    printf("Memory mapping with the GPU for descriptors\n");
+    CUdeviceptr gpuDescPtr;
+    assert_cu( cuMemAlloc(&gpuDescPtr,GPU_PAGE) );       
+    assert_cu( cuPointerSetAttribute(&flagValueToSet,CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,gpuDescPtr) );
+    gdr_mh_t GPUMemHandleDesc;
+    assert_gdr( gdr_pin_buffer(g,gpuDescPtr,GPU_PAGE,0,0,&GPUMemHandleDesc) );
+    void* gpuDescVa;
+    assert_gdr( gdr_map(g,GPUMemHandleDesc,&gpuDescVa,GPU_PAGE) );
+    gdr_info_t descInfo;
+    assert_gdr( gdr_get_info(g,GPUMemHandleDesc,&descInfo) );
+
+    /* PCI: open the FPGA device, map BAR0 and expose it to CUDA */
+    printf("\nSetting up the PCI\n");
+    pcilib_t* pciCtx;
+    char* pciVa;
+    pciCtx = pcilib_open("/dev/fpga0",PCILIB_MODEL_DETECT);
+    if( pciCtx == NULL ){
+	printf("Cannot open a context for pci\n");
+	exit( EXIT_FAILURE );
+    }
+    pciVa = pcilib_resolve_bar_address(pciCtx,0, 0);
+    if( pciVa == NULL ){
+	printf("Cannot resolve PCI physical adress to virtual\n");
+	exit( EXIT_FAILURE );
+    }
+    CUdeviceptr dBAR;
+    assert_cu( cuMemHostRegister((void*)pciVa,128,CU_MEMHOSTREGISTER_IOMEMORY) );
+    assert_cu( cuMemHostGetDevicePointer(&dBAR,(void*)pciVa, 0) );
+    
+    /* Config PCI for Pages: host-side DMA landing buffer, 4096-aligned */
+    pcilib_kmem_handle_t* pciHandlePage;
+    pciHandlePage = pcilib_alloc_kernel_memory(pciCtx, PCILIB_KMEM_TYPE_DMA_C2S_PAGE, 1, ((nb_bytes%4096)?(4096 * (1 + nb_bytes/4096)):nb_bytes), 4096, KMEM_USE_DEFAULT, KMEM_DEFAULT_FLAGS);
+    if( pciHandlePage == NULL ){
+	printf("Cannot allocate  PCI kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    volatile void* pciMemPtrPage;
+    uintptr_t pciBusPage;
+    pciMemPtrPage = (uint64_t*) pcilib_kmem_get_block_ua(pciCtx,pciHandlePage,0);
+    if( pciMemPtrPage == NULL ){
+	printf("Cannot get PCI pointer to kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    pciBusPage = pcilib_kmem_get_block_ba(pciCtx,pciHandlePage,0);
+    if( pciBusPage == 0 ){
+	printf("Cannot get PCI Bus address on kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    /* Config PCI for Desc */
+    pcilib_kmem_handle_t* pciHandleDesc;
+    pciHandleDesc = pcilib_alloc_kernel_memory(pciCtx,PCILIB_KMEM_TYPE_CONSISTENT, 1, 128, 4096,KMEM_USE_RING, KMEM_DEFAULT_FLAGS);
+    if( pciHandleDesc == NULL ){
+	printf("Cannot allocate  PCI kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    volatile void* pciMemPtrDesc;
+    uintptr_t pciBusDesc;
+    pciMemPtrDesc = (uint64_t*) pcilib_kmem_get_block_ua(pciCtx,pciHandleDesc,0);
+    if( pciMemPtrDesc == NULL ){
+	printf("Cannot get PCI pointer to kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+    pciBusDesc = pcilib_kmem_get_block_ba(pciCtx,pciHandleDesc,0);
+    if( pciBusDesc == 0 ){
+	printf("Cannot get PCI Bus address on kernel memory\n");
+	exit( EXIT_FAILURE );
+    }
+
+    double simple_write_meas1;
+    double simple_write_meas2;
+    double meas_result;
+    unsigned char* getBack=(unsigned char*)calloc(nb_bytes,sizeof(*getBack));
+
+    /* Clear the GPU buffer so stale data cannot pass for a transfer */
+    assert_cuda( cudaMemset((void*)gpuPagePtr,0x00,nb_bytes) );
+    assert_cuda( cudaDeviceSynchronize() );
+    /* memcpy(getBack,(const void*)gpuPageVa,nb_bytes); */
+    /* for(int j=0;j<nb_bytes;j++){ */
+    /* 	printf("%hhx",getBack[j]); */
+    /* } */
+    /* printf("\n"); */
+    
+
+    /* Program the DMA engine; the measurement window covers step 1 (FPGA ->
+     * CPU kernel memory) plus step 2 (CPU -> GPU copy). */
+    printf("\nWorking on the FPGA\n");
+    WR32(REG_RESET_DMA, 1);
+    usleep(100000);
+    WR32(REG_RESET_DMA, 0);
+    usleep(100000);
+    WR32_sleep(REG_PERF_COUNTER,0);
+    WR32_sleep(REG_NUM_PACKETS_PER_DESCRIPTOR,nb_transfer); //16);
+    WR32_sleep(REG_PACKET_LENGTH,0x80000 | 64); // added flag
+    WR32_sleep(REG_TIMER_THRESHOLD, 0x1);
+    WR32_sleep(REG_UPDATE_THRESHOLD, 0x1);
+    WR64_sleep(REG_UPDATE_COUNTER,pciBusDesc);
+    WR64_sleep(REG_UPDATE_ADDRESS,descInfo.bus_addr+DESCRIPTOR_OFFSET);
+
+    WR32_sleep(REG_CONTROL,CONTROL_ENABLE_READ|CONTROL_SOURCE_RX_FIFO);
+    WR32_sleep(REG_DMA,1);
+
+    WR32_sleep(REG_INTERCONNECT, 0x232); //0x262);
+    WR32_sleep(REG_COUNTER,0x1);
+    /* pciMemPtrDesc is declared volatile; the casts must keep the qualifier
+     * or the compiler may cache the flag and spin forever */
+    *(volatile int*)pciMemPtrDesc=0;
+    simple_write_meas1 = 4. *RD32 (0x14)/ 1000;
+    WR64(REG_DESCRIPTOR_ADDRESS,pciBusPage);
+    while(!*(volatile int*)pciMemPtrDesc);
+    
+    /* Step 2: forward the received bytes to the GPU.  Synchronize before
+     * reading the counter — the async copy returns immediately, and timing
+     * the launch alone would leave the GPU leg out of the measurement. */
+    assert_cu( cuMemcpyHtoDAsync(gpuPagePtr,(const void*)pciMemPtrPage,nb_bytes,0) );
+    assert_cu( cuStreamSynchronize(0) );
+    simple_write_meas2 = 4. *RD32 (0x14)/ 1000;
+    meas_result=simple_write_meas2-simple_write_meas1;
+    /* printf("%hhx-%hhx\n",((char*)gpuPageVa)[nb_bytes-1],((char*)gpuPageVa)[0]); */
+
+    /* memcpy(getBack,(const void*)pciMemPtrPage,nb_bytes); */
+    /* for(int j=0;j<nb_bytes;j++){ */
+    /* 	printf("%hhx",((char*)pciMemPtrPage)[j]); */
+    /* } */
+    /* printf("\n"); */
+    /* printf("pciBusPage: %p\n",pciBusPage); */
+    /* printf("pageInfo.bus_addr: %p\n",pageInfo.bus_addr); */
+    /* printf("number of descriptor: %d\n",*(int*)pciMemPtrDesc); */
+
+    fprintf(fp,"%lf",meas_result);
+    
+
+    /* Close everything: quiesce the DMA engine, then release in reverse
+     * order of acquisition */
+    printf("\nClosing the connections\n");
+    free(getBack);
+    free(data); /* was leaked */
+    WR32(REG_COUNTER, 0);
+    WR32(REG_DMA, 0);
+    WR32(REG_RESET_DMA, 1);
+    usleep (100000);
+    WR32(REG_RESET_DMA, 0);
+    usleep (100000);
+
+    pcilib_free_kernel_memory(pciCtx,pciHandleDesc,PCILIB_KMEM_FLAG_FORCE);
+    pcilib_free_kernel_memory(pciCtx,pciHandlePage,PCILIB_KMEM_FLAG_FORCE);
+    assert_cu( cuMemHostUnregister((void*) pciVa) );
+    pcilib_close(pciCtx);
+    assert_gdr( gdr_unmap(g,GPUMemHandlePage,gpuPageVa,nb_bytes) );
+    assert_gdr( gdr_unpin_buffer(g,GPUMemHandlePage) );
+    assert_gdr( gdr_unmap(g,GPUMemHandleDesc,gpuDescVa,GPU_PAGE) );
+    assert_gdr( gdr_unpin_buffer(g,GPUMemHandleDesc) );
+    assert_gdr( gdr_close(g) );
+    assert_cu( cuMemFree(gpuPagePtr) );
+    assert_cu( cuMemFree(gpuDescPtr) );
+    assert_cu( cuCtxDestroy(cuCtx) );
+    
+    fclose(fp);
+    
+    printf("All Cleared\n");
+    
+    exit(EXIT_SUCCESS);
+}