6 år sedan · 7c019a1cbc
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,24 +16,12 @@ set(CUDA_NVCC_FLAGS
 
				   -gencode arch=compute_35,code=sm_35
			
 
				   )#;-rdc=true)
			
 
				 
			
 
				-cuda_add_executable(floats_gpu-fpga
			
 
				-  src/floats_gpu-fpga.cu
			
 
				-  src/common.cu
			
 
				-  src/kernels.cu
			
 
				-  )
			
 
				-
			
 
				 cuda_add_executable(gpu-fpga
			
 
				   src/gpu-fpga.cu
			
 
				   src/common.cu
			
 
				   src/kernels.cu
			
 
				   )
			
 
				 
			
 
				-cuda_add_executable(matrix_gpu-fpga
			
 
				-  src/matrix_gpu-fpga.cu
			
 
				-  src/common.cu
			
 
				-  src/kernels.cu
			
 
				-  )
			
 
				-
			
 
				 cuda_add_executable(two_steps_dma
			
 
				   src/two_steps_dma.cu
			
 
				   src/common.cu
			
@@ -58,9 +46,7 @@ cuda_add_executable(loaded
 
				   src/kernels.cu
			
 
				   )
			
 
				 
			
 
				-target_link_libraries(floats_gpu-fpga cuda pcilib gdrapi)
			
 
				 target_link_libraries(gpu-fpga cuda pcilib gdrapi)
			
 
				-target_link_libraries(matrix_gpu-fpga cuda pcilib gdrapi)
			
 
				 target_link_libraries(two_steps_dma cuda pcilib gdrapi)
			
 
				 target_link_libraries(cpu-fpga cuda pcilib gdrapi)
			
 
				 target_link_libraries(multi-gpu cuda pcilib gdrapi)
			
--- a/include/common.h
+++ b/include/common.h
@@ -4,6 +4,7 @@
 
				 #include "cuda.h"
			
 
				 #include "cuda_runtime_api.h"
			
 
				 
			
 
				+/* Used to stop pthreads when loading CPU and/or GPU */
			
 
				 extern int stop_process;
			
 
				 
			
 
				 typedef struct matrix_t{
			
@@ -13,9 +14,13 @@ typedef struct matrix_t{
 
				     uint stride;
			
 
				 } matrix;
			
 
				 
			
 
				+/* Number of threads in a block's dim */
			
 
				 #define BLOCK_SIZE 16
			
 
				+
			
 
				 #define ASSERT_FAIL 0
			
 
				 #define ASSERT_SUCCESS 1
			
 
				+
			
 
				+/* Width of a matrix and value used for checking */
			
 
				 #define MATRIX_ROW_SIZE BLOCK_SIZE*16*1
			
 
				 #define MATRIX_VALUE 2
			
 
				 
			
@@ -27,18 +32,43 @@ typedef struct matrix_t{
 
				 void __assert_cuda(cudaError_t err_id, const char* file, int line); /* for runtime api*/
			
 
				 void __assert_cu(CUresult res_id, const char* file, int line); /* for driver api */
			
 
				 void __assert_gdr(int gdr_id, const char* file, int line);
			
 
				+
			
 
				+/* Generates a binary to_send file used to transmit data to FPGA */
			
 
				 void init_to_send(const void* dataPtr, size_t size, size_t nmemb);
			
 
				+
			
 
				+/* Checks if array of size floats is filled with value */
			
 
				 bool check_array(float* array, float value, size_t size);
			
 
				+
			
 
				+/* Prints some information on device */
			
 
				 void deviceInformation(int device);
			
 
				+
			
 
				+/* Functions to fill array with size floats of defined or random value */
			
 
				 void cpu_fill_array(float* array, float value, size_t size);
			
 
				 void cpu_fill_array_random(float* array, size_t size);
			
 
				+
			
 
				+/* Uses CPU to compute average and standard deviation of array of size floats */
			
 
				 float cpu_average(float* array, size_t size);
			
 
				 float cpu_dispersion(float* array, float average, size_t size);
			
 
				+
			
 
				+/* Returns an identity matrix of size row_size*row_size */
			
 
				 matrix identity_matrix(size_t row_size);
			
 
				+
			
 
				+/* Asserts if M is identity matrix */
			
 
				 void check_identity_matrix(matrix M);
			
 
				+
			
 
				+/* Asserts if matrix is filled with value */
			
 
				 void check_matrix(matrix M, int value);
			
 
				+
			
 
				+/* Computes R=A*B with CPU */
			
 
				 void mult_matrix(matrix A, matrix B, matrix R);
			
 
				+
			
 
				+/* Fills M with random values with CPU */
			
 
				 void fill_matrix_random(matrix M);
			
 
				+
			
 
				+/* Functions to load the CPU/GPU with:
			
 
				+ * computation: loops a matrix multiplication
			
 
				+ * memory operations: loops memcpy
			
 
				+ */
			
 
				 void* cpu_load_compute(void* arg);
			
 
				 void* gpu_load_compute(void* arg);
			
 
				 void* cpu_load_memory(void* arg);
			
--- a/include/kernels.h
+++ b/include/kernels.h
@@ -3,22 +3,35 @@
 
				 #include "gdrapi.h"
			
 
				 #include "common.h"
			
 
				 
			
 
				-__device__ void add_two_device(CUdeviceptr number);
			
 
				-__global__ void add_three_global(CUdeviceptr number);
			
 
				-__global__ void add_one_global(CUdeviceptr number);
			
 
				+/* Kernels to fill array with size int/floats of value */
			
 
				 __global__ void fill_array(int* array, int value, size_t size);
			
 
				 __global__ void fill_float_array(float* array, float value, size_t size);
			
 
				+
			
 
				+/* Functions to compute average and deviation of an array.
			
 
				+ * Only get_average and get_reduction are to be used by main.
			
 
				+ * Works with square grids/blocks
			
 
				+ */
			
 
				 extern __shared__ float _temp[];
			
 
				 __global__ void get_reduction(float* array, float* output, size_t size, size_t stride);
			
 
				 __global__ void get_average_reduction(float* array, float* average, size_t size, size_t stride, size_t divider);
			
 
				 __global__ void get_dispersion_reduction(float* array, float* average, float* output, size_t size, size_t stride, size_t divider);
			
 
				 void get_average(float* array, float* average, size_t size,dim3 blocks_per_grid, dim3 threads_per_block);
			
 
				 void get_dispersion(float* array, float* average, float* dispersion, size_t size, dim3 blocks_per_grid, dim3 threads_per_block);
			
 
				+
			
 
				+/* Some kernels operating on matrix with various methods */
			
 
				 __global__ void fill_matrix(int* M,int value);
			
 
				 __global__ void add_matrix(int* A,int* B,int* C);
			
 
				 __global__ void fill_matrix2(int* M,int value,size_t flat_size);
			
 
				 __global__ void add_matrix2(int* A,int* B,int* C,size_t flat_size);
			
 
				 __global__ void add_matrix_mod(int* B,int* C);
			
 
				+
			
 
				+/* Fills A as an identity matrix */
			
 
				 __global__ void kern_identity_matrix(matrix A);
			
 
				+
			
 
				+/* An implementation of parallel matrix multiplication */
			
 
				 __global__ void kern_mult_matrix_naive(matrix A, matrix B, matrix R);
			
 
				+
			
 
				+/* A (partially) optmisation of parallel matrix multiplication.
			
 
				+ * Only works with square matrix of size multiple of BLOCK_SIZE
			
 
				+ */
			
 
				 __global__ void kern_mult_matrix_shared(matrix A, matrix B, matrix R);
			
--- a/saves/floats_gpu-fpga.cu
+++ b/saves/floats_gpu-fpga.cu
--- a/saves/matrix_gpu-fpga.cu
+++ b/saves/matrix_gpu-fpga.cu
@@ -135,9 +135,7 @@ int main(int argc, char* argv[])
 
				     matrix dev_I,dev_matx,dev_result;
			
 
				 
			
 
				     cudaEvent_t start,stop;
			
 
				-    float ms_shared,ms_naive;
			
 
				-    struct timeval t1,t2;
			
 
				-    double time;
			
 
				+    float ms_shared;
			
 
				     assert_cuda( cudaEventCreate(&start) );
			
 
				     assert_cuda( cudaEventCreate(&stop) );
			
 
				 
			
@@ -172,7 +170,9 @@ int main(int argc, char* argv[])
 
				     err = cudaGetLastError();
			
 
				     printf("%s: %s\n",cudaGetErrorName(err),cudaGetErrorString(err));
			
 
				     assert_cuda( cudaMemcpy(I.elements,dev_I.elements,I.rows*I.columns*sizeof(float),cudaMemcpyDeviceToHost) );
			
 
				-    //check_identity_matrix(I);
			
 
				+
			
 
				+    /* Control check */
			
 
				+    check_identity_matrix(I);
			
 
				 
			
 
				     
			
 
				     /* FPGA */
			
@@ -207,21 +207,20 @@ int main(int argc, char* argv[])
 
				     check_identity_matrix(I);
			
 
				 
			
 
				    
			
 
				-    /* assert_cuda( cudaEventRecord(start) ); */
			
 
				-    /* kern_mult_matrix_shared<<< blocks_per_grid,threads_per_block >>>(dev_I,dev_matx,dev_result); */
			
 
				-    /* err = cudaGetLastError(); */
			
 
				-    /* printf("%s: %s\n",cudaGetErrorName(err),cudaGetErrorString(err)); */
			
 
				-    /* assert_cuda( cudaDeviceSynchronize() ); */
			
 
				-    /* assert_cuda( cudaEventRecord(stop) ); */
			
 
				-    /* assert_cuda( cudaEventSynchronize(stop) ); */
			
 
				-    /* assert_cuda( cudaEventElapsedTime(&ms_shared,start,stop) ); */
			
 
				-    /* assert_cuda( cudaMemcpy(result.elements,dev_result.elements,result.rows*result.columns*sizeof(int),cudaMemcpyDeviceToHost) ); */
			
 
				-    /* check_identity_matrix(result); */
			
 
				-    /* //check_matrix(result, MATRIX_VALUE); */
			
 
				-    /* printf("Time for GPU (shared): %f\n", ms_shared); */
			
 
				+    assert_cuda( cudaEventRecord(start) );
			
 
				+    kern_mult_matrix_shared<<< blocks_per_grid,threads_per_block >>>(dev_I,dev_matx,dev_result);
			
 
				+    err = cudaGetLastError();
			
 
				+    printf("%s: %s\n",cudaGetErrorName(err),cudaGetErrorString(err));
			
 
				+    assert_cuda( cudaDeviceSynchronize() );
			
 
				+    assert_cuda( cudaEventRecord(stop) );
			
 
				+    assert_cuda( cudaEventSynchronize(stop) );
			
 
				+    assert_cuda( cudaEventElapsedTime(&ms_shared,start,stop) );
			
 
				+    assert_cuda( cudaMemcpy(result.elements,dev_result.elements,result.rows*result.columns*sizeof(int),cudaMemcpyDeviceToHost) );
			
 
				+    check_identity_matrix(result);
			
 
				+    //check_matrix(result, MATRIX_VALUE);
			
 
				+    printf("Time for GPU (shared): %f\n", ms_shared);
			
 
				 
			
 
				     
			
 
				-exit:
			
 
				     printf("\nClosing the connections\n");
			
 
				     WR32(REG_COUNTER, 0);
			
 
				     WR32(REG_DMA, 0);
			
--- a/src/common.cu
+++ b/src/common.cu
@@ -1,4 +1,4 @@
 
				-/* This files contains useful fonctions like assertions */
			
 
				+/* This files contains useful fonctions used by all the programs */
			
 
				 
			
 
				 #include <stdio.h>
			
 
				 #include <stdlib.h>
			
@@ -43,8 +43,6 @@ void init_to_send(const void* dataPtr, size_t size, size_t nmemb)
 
				 	exit( EXIT_FAILURE );
			
 
				     }
			
 
				     int errCheck;
			
 
				-    /* for(int i=0;i<nmemb;i++) */
			
 
				-    /* 	errCheck = fprintf(filePtr,"%f",dataPtr[i]); */
			
 
				     errCheck = fwrite(dataPtr,size,nmemb,filePtr);
			
 
				     if( errCheck == 0 )
			
 
				     {
			
@@ -144,13 +142,13 @@ void check_identity_matrix(matrix M)
 
				 {
			
 
				     for(int i=0;i<M.rows;i++){
			
 
				 	for(int j=0;j<M.rows;j++){
			
 
				-	    printf("%1.0f-",M.elements[i+j*M.rows]);
			
 
				-	    /* if( (i==j && M.elements[i+j*M.rows]!=1) || (i!=j && M.elements[i+j*M.rows]!=0) ){ */
			
 
				-	    /* 	printf("Error in identity matrix at [%d][%d]\n",i,j); */
			
 
				-	    /* 	    exit( EXIT_FAILURE ); */
			
 
				-	    /* 	} */
			
 
				+	    /* printf("%1.0f-",M.elements[i+j*M.rows]); */
			
 
				+	    if( (i==j && M.elements[i+j*M.rows]!=1) || (i!=j && M.elements[i+j*M.rows]!=0) ){
			
 
				+	    	printf("Error in identity matrix at [%d][%d]\n",i,j);
			
 
				+		exit( EXIT_FAILURE );
			
 
				+	    	}
			
 
				 	}
			
 
				-	printf("\n");
			
 
				+	/* printf("\n"); */
			
 
				     }
			
 
				     printf("Matrix checked: no error\n");
			
 
				 }
			
@@ -159,13 +157,13 @@ void check_matrix(matrix M, int value)
 
				 {
			
 
				     for(int i=0;i<M.rows;i++){
			
 
				 	for(int j=0;j<M.columns;j++){
			
 
				-//	    printf("%d",M.elements[i*M.columns+j]);
			
 
				+	    /* printf("%d",M.elements[i*M.columns+j]); */
			
 
				 	    if(M.elements[i*M.columns+j]!=value){
			
 
				 		printf("Error in matrix at [%d][%d]\n",i,j);
			
 
				 		exit( EXIT_FAILURE );
			
 
				 	    }
			
 
				 	}
			
 
				-//	printf("\n");
			
 
				+	/* printf("\n"); */
			
 
				     }
			
 
				     printf("Matrix checked: no error\n");
			
 
				 }
			
@@ -199,7 +197,6 @@ void fill_matrix_random(matrix M)
 
				 
			
 
				 void* cpu_load_compute(void* arg)
			
 
				 {
			
 
				-    /* Loops a matrix multiplication */
			
 
				     matrix I,matx,result;
			
 
				     I.rows = MATRIX_ROW_SIZE;
			
 
				     I.columns = I.rows;
			
@@ -228,7 +225,6 @@ void* cpu_load_compute(void* arg)
 
				 
			
 
				 void* gpu_load_compute(void* arg)
			
 
				 {
			
 
				-    /* Loops a kernel that multiplies matrix */
			
 
				     dim3 blocks_per_grid(MATRIX_ROW_SIZE/BLOCK_SIZE,MATRIX_ROW_SIZE/BLOCK_SIZE);
			
 
				     dim3 threads_per_block(BLOCK_SIZE,BLOCK_SIZE);
			
 
				     matrix dev_I,dev_matx,dev_result;
			
--- a/src/cpu-fpga.cu
+++ b/src/cpu-fpga.cu
@@ -1,4 +1,4 @@
 
				-/* Tansfers data from FPGA to CPU */
			
 
				+/* Transfers arv[1] data from FPGA to CPU */
			
 
				 
			
 
				 #include <stdio.h>
			
 
				 #include <stdlib.h>
			
@@ -22,10 +22,10 @@
 
				 #define GPU_PAGE 65536
			
 
				 #define DATA 0xbb
			
 
				 
			
 
				-//#define LOAD
			
 
				+/* To be used to load the cpu or not */
			
 
				+/* #define LOAD */
			
 
				 
			
 
				 
			
 
				-/* argv[1] = number of bytes to be written */
			
 
				 int main(int argc, char* argv[])
			
 
				 {
			
 
				     FILE* fp = fopen("cpu-fpga.csv","a");
			
@@ -35,10 +35,12 @@ int main(int argc, char* argv[])
 
				 	exit( EXIT_FAILURE );
			
 
				     }
			
 
				 
			
 
				+    /* each transfer deals 64 words of 4 bytes */
			
 
				     int nb_bytes = atoi(argv[argc -1]);
			
 
				     printf("nb_bytes = %d\n",nb_bytes);
			
 
				-    int nb_transfer = nb_bytes/(4*64); //each transfer deals 64 words of 4 bytes
			
 
				+    int nb_transfer = nb_bytes/(4*64);
			
 
				 
			
 
				+    /* to be used to send only bytes of a certain DATA value */
			
 
				     unsigned char* data=(unsigned char*)calloc(16384,sizeof(*data));
			
 
				     memset(data,DATA,nb_bytes); memset(data+nb_bytes,0x00,16384-nb_bytes);
			
 
				     init_to_send(data,sizeof(char),nb_bytes);
			
@@ -117,7 +119,6 @@ int main(int argc, char* argv[])
 
				     double simple_write_meas1;
			
 
				     double simple_write_meas2;
			
 
				     double meas_result;
			
 
				-    unsigned char* getBack=(unsigned char*)calloc(16384,sizeof(*getBack));
			
 
				 
			
 
				 #ifdef LOAD
			
 
				     pthread_t cpu_compute,cpu_mem;
			
@@ -136,7 +137,6 @@ int main(int argc, char* argv[])
 
				     WR32_sleep(REG_TIMER_THRESHOLD, 0x1);
			
 
				     WR32_sleep(REG_UPDATE_THRESHOLD, 0x1);
			
 
				     WR64_sleep(REG_UPDATE_COUNTER,pciBusDesc);
			
 
				-//    WR64_sleep(REG_UPDATE_ADDRESS,descInfo.bus_addr+DESCRIPTOR_OFFSET);
			
 
				 
			
 
				     WR32_sleep(REG_CONTROL,CONTROL_ENABLE_READ|CONTROL_SOURCE_RX_FIFO);
			
 
				     WR32_sleep(REG_DMA,1);
			
@@ -151,22 +151,13 @@ int main(int argc, char* argv[])
 
				     meas_result=simple_write_meas2-simple_write_meas1;
			
 
				 
			
 
				     usleep(1000);
			
 
				-    /* assert_cuda( cudaMemcpy((void*)gpuPageVa,(const void*)gpuPagePtr,16384,cudaMemcpyDeviceToHost) ); */
			
 
				-    /* /\* memcpy(getBack,(const void*)pciMemPtrDesc,16384); *\/ */
			
 
				-    /* memcpy(getBack,(const void*)gpuPageVa,16384); */
			
 
				-    /* for(int j=0;j<16384;j++) */
			
 
				-    /* { */
			
 
				-    /* 	if( getBack[j]!=DATA ) */
			
 
				-    /* 	{ */
			
 
				-    /* 	    printf("Last at %d\n",j); */
			
 
				-    /* 	    goto exit_failure; */
			
 
				-    /* 	} */
			
 
				-    /* 	printf("%hhx",getBack[j]); */
			
 
				+    
			
 
				+    /* A few information for tests */
			
 
				+    /* for(int j=0;j<nb_bytes;j++){ */
			
 
				+    /* 	printf("%hhx",((char*)pciMemPtrPage)[j]); */
			
 
				     /* } */
			
 
				     /* printf("\n"); */
			
 
				-    /* printf("number of descriptor: %d\n",*(int*)gpuDescVa); */
			
 
				-    /* printf("start_meas = %lf\n",start_meas); */
			
 
				-    /* printf("hwaddr = %lx\ngpuPagePtr = %llx\n",*hwaddr,pageInfo.bus_addr); */
			
 
				+    /* printf("number of descriptor: %d\n",*(int*)pciMemPtrDesc); */
			
 
				 
			
 
				     fprintf(fp,"%lf",meas_result);
			
 
				     
			
@@ -180,7 +171,6 @@ int main(int argc, char* argv[])
 
				     pthread_join(cpu_mem,NULL);
			
 
				 #endif
			
 
				     
			
 
				-    free(getBack);
			
 
				     WR32(REG_COUNTER, 0);
			
 
				     WR32(REG_DMA, 0);
			
 
				     WR32(REG_RESET_DMA, 1);
			
--- a/src/gpu-fpga.cu
+++ b/src/gpu-fpga.cu
@@ -1,4 +1,4 @@
 
				-/* A single test aimed at being looped with a script. Handles the number of Bytes passed in argv[1]. */
			
 
				+/* A single,basic test aimed at being looped with a script. Handles the number of Bytes passed in argv[1]. */
			
 
				 
			
 
				 #include <stdio.h>
			
 
				 #include <stdlib.h>
			
@@ -23,16 +23,19 @@
 
				 
			
 
				 int main(int argc, char* argv[])
			
 
				 {
			
 
				+    /* output file */
			
 
				     FILE* fp = fopen("gpu-fpga.csv","a");
			
 
				     if( fp == NULL ){
			
 
				 	printf("Cannot open file gpu-fpga.csv\n");
			
 
				 	exit( EXIT_FAILURE );
			
 
				     }
			
 
				 
			
 
				+    /* each transfer deals 64 words of 4 bytes */
			
 
				     int nb_bytes = atoi(argv[argc -1]);
			
 
				     printf("nb_bytes = %d\n",nb_bytes);
			
 
				-    int nb_transfer = nb_bytes/(4*64); //each transfer deals 64 words of 4 bytes
			
 
				-    
			
 
				+    int nb_transfer = nb_bytes/(4*64);
			
 
				+
			
 
				+    /* to be used to send only bytes of a certain DATA value */
			
 
				     unsigned char* data=(unsigned char*)calloc(nb_bytes,sizeof(*data));
			
 
				     memset(data,DATA,nb_bytes);
			
 
				     init_to_send(data,sizeof(char),nb_bytes);
			
@@ -97,8 +100,8 @@ int main(int argc, char* argv[])
 
				 	exit( EXIT_FAILURE );
			
 
				     }
			
 
				     CUdeviceptr dBAR;
			
 
				-    /* assert_cu( cuMemHostRegister((void*)pciVa,128,CU_MEMHOSTREGISTER_IOMEMORY) ); */
			
 
				-    /* assert_cu( cuMemHostGetDevicePointer(&dBAR,(void*)pciVa, 0) ); */
			
 
				+    assert_cu( cuMemHostRegister((void*)pciVa,128,CU_MEMHOSTREGISTER_IOMEMORY) );
			
 
				+    assert_cu( cuMemHostGetDevicePointer(&dBAR,(void*)pciVa, 0) );
			
 
				     
			
 
				     /* Config PCI for Pages*/
			
 
				     pcilib_kmem_handle_t* pciHandlePage;
			
@@ -141,20 +144,7 @@ int main(int argc, char* argv[])
 
				 
			
 
				     double simple_write_meas1;
			
 
				     double simple_write_meas2;
			
 
				-    double start_meas;
			
 
				     double meas_result;
			
 
				-    unsigned char* getBack=(unsigned char*)calloc(nb_bytes,sizeof(*getBack));
			
 
				-
			
 
				-    volatile uint64_t *hwaddr = (uint64_t*)((char*)gpuDescVa + DESCRIPTOR_OFFSET + 2 * sizeof(uint32_t));
			
 
				-
			
 
				-    /* for(int j=0;j<nb_bytes;j++) */
			
 
				-    /* 	printf("%hhx",data[j]); */
			
 
				-    /* printf("\n"); */
			
 
				-    /* memcpy(getBack,(const void*)gpuPageVa,nb_bytes); */
			
 
				-    /* for(int j=0;j<nb_bytes;j++) */
			
 
				-    /* 	printf("%hhx",getBack[j]); */
			
 
				-    /* printf("\n"); */
			
 
				-	
			
 
				 
			
 
				     printf("\nWorking on the FPGA\n");
			
 
				     WR32(REG_RESET_DMA, 1);
			
@@ -174,7 +164,6 @@ WR32_sleep(REG_PACKET_LENGTH,0x80000 | 64); // added flag
 
				 
			
 
				     WR32_sleep(REG_INTERCONNECT, 0x232); //0x262);
			
 
				     WR32_sleep(REG_COUNTER,0x1);
			
 
				-    start_meas = 4. *RD32 (0x14)/ 1000;
			
 
				     *(int*)pciMemPtrDesc=0;
			
 
				     simple_write_meas1 = 4. *RD32 (0x14)/ 1000;
			
 
				     WR64(REG_DESCRIPTOR_ADDRESS,pageInfo.bus_addr);
			
@@ -184,28 +173,18 @@ WR32_sleep(REG_PACKET_LENGTH,0x80000 | 64); // added flag
 
				 
			
 
				     usleep(1000);
			
 
				     
			
 
				-    /* assert_cuda( cudaMemcpy((void*)gpuPageVa,(const void*)gpuPagePtr,nb_bytes,cudaMemcpyDeviceToHost) ); */
			
 
				-    /* /\* memcpy(getBack,(const void*)pciMemPtrDesc,nb_bytes); *\/ */
			
 
				-    /* memcpy(getBack,(const void*)gpuPageVa,nb_bytes); */
			
 
				+    /* A few information for tests */
			
 
				     /* for(int j=0;j<nb_bytes;j++){ */
			
 
				-    /* 	/\* if( getBack[j]!=DATA ){ *\/ */
			
 
				-    /* 	/\*     printf("Last at %d\n",j); *\/ */
			
 
				-    /* 	/\*     goto exit; *\/ */
			
 
				-    /* 	/\* } *\/ */
			
 
				-    /* 	printf("%hhx",getBack[j]); */
			
 
				+    /* 	printf("%hhx",((char*)gpuPageVa)[j]); */
			
 
				     /* } */
			
 
				     /* printf("\n"); */
			
 
				     /* printf("number of descriptor: %d\n",*(int*)pciMemPtrDesc); */
			
 
				-    /* printf("start_meas = %lf\n",start_meas); */
			
 
				-    /* printf("hwaddr = %p\ngpuPagePtr = %p\n",*hwaddr,pageInfo.bus_addr); */
			
 
				 
			
 
				     fprintf(fp,"%lf",meas_result);
			
 
				     
			
 
				 
			
 
				     /* Close everything */
			
 
				-exit:
			
 
				     printf("\nClosing the connections\n");
			
 
				-    free(getBack);
			
 
				     WR32(REG_COUNTER, 0);
			
 
				     WR32(REG_DMA, 0);
			
 
				     WR32(REG_RESET_DMA, 1);
			
@@ -215,7 +194,7 @@ exit:
 
				 
			
 
				     pcilib_free_kernel_memory(pciCtx,pciHandleDesc,PCILIB_KMEM_FLAG_FORCE);
			
 
				     pcilib_free_kernel_memory(pciCtx,pciHandlePage,PCILIB_KMEM_FLAG_FORCE);
			
 
				-    /* assert_cu( cuMemHostUnregister((void*) pciVa) ); */
			
 
				+    assert_cu( cuMemHostUnregister((void*) pciVa) );
			
 
				     pcilib_close(pciCtx);
			
 
				     assert_gdr( gdr_unmap(g,GPUMemHandlePage,gpuPageVa,nb_bytes) );
			
 
				     assert_gdr( gdr_unpin_buffer(g,GPUMemHandlePage) );
			
--- a/src/kernels.cu
+++ b/src/kernels.cu
@@ -7,28 +7,6 @@
 
				 
			
 
				 
			
 
				 
			
 
				-__device__
			
 
				-void add_two_device(CUdeviceptr number)
			
 
				-{
			
 
				-    (* (int*) number)+=2;
			
 
				-}
			
 
				-
			
 
				-
			
 
				-__global__
			
 
				-void add_three_global(CUdeviceptr number)
			
 
				-{
			
 
				-    (* (int*) number)++;
			
 
				-    add_two_device(number);
			
 
				-}
			
 
				-
			
 
				-
			
 
				-__global__
			
 
				-void add_one_global(CUdeviceptr number)
			
 
				-{
			
 
				-    (* (int*) number)++;
			
 
				-}
			
 
				-
			
 
				-
			
 
				 __global__
			
 
				 void fill_array(int* array, int value, size_t size)
			
 
				 {
			
@@ -143,7 +121,7 @@ void get_average(float* array, float* average, size_t size,dim3 blocks_per_grid,
 
				 	<<< 1,extend_size,extend_size*sizeof(float)  >>>
			
 
				 	(extend,
			
 
				 	 average,
			
 
				-	 extend_size,//blocks_per_grid.x,
			
 
				+	 extend_size,
			
 
				 	 0,
			
 
				 	 size);
			
 
				     assert_cuda( cudaFree(output) );
			
@@ -185,7 +163,7 @@ void get_dispersion(float* array, float* average, float* dispersion, size_t size
 
				       	<<< 1,extend_size,blocks_per_grid.x*sizeof(float)  >>>
			
 
				     	(extend,
			
 
				     	 dispersion,
			
 
				-    	 extend_size,//blocks_per_grid.x,
			
 
				+    	 extend_size,
			
 
				     	 0);
			
 
				     assert_cuda( cudaFree(output) );
			
 
				     assert_cuda( cudaFree(extend) );
			
@@ -216,7 +194,6 @@ __global__
 
				 void fill_matrix2(int* M,int value,size_t flat_size)
			
 
				 {
			
 
				     uint i =(blockIdx.x*blockDim.x)+threadIdx.x;
			
 
				-    //   printf("i=%d",blockDim.x);
			
 
				     if( i<flat_size )
			
 
				 	M[i] = value;
			
 
				 }
			
@@ -247,9 +224,6 @@ void kern_identity_matrix(matrix I)
 
				 {
			
 
				     uint x = blockIdx.x*blockDim.x +threadIdx.x;
			
 
				     uint y = blockIdx.y*blockDim.y +threadIdx.y;
			
 
				-/*    uint step = blockDim.x*gridDim.x;
			
 
				-    for(int i =0;i<blockDim.x;i+=blockDim.y)
			
 
				-    I.elements[x+(y+i)*step] = (x==(y+i))? 1:0; */
			
 
				     if( x>=I.columns || y>=I.rows )
			
 
				 	return;
			
 
				     I.elements[x+y*I.columns] = (x==y)? 1:0;
			
--- a/src/multi-gpu.cu
+++ b/src/multi-gpu.cu
@@ -27,14 +27,16 @@ int main(int argc, char* argv[])
 
				 	exit( EXIT_FAILURE );
			
 
				     }
			
 
				 
			
 
				+    /* each transfer deals 64 words of 4 bytes */
			
 
				     int nb_bytes = atoi(argv[argc -1]);
			
 
				     printf("nb_bytes = %d\n",nb_bytes);
			
 
				-    int nb_transfer = nb_bytes/(4*64); //each transfer deals 64 words of 4 bytes
			
 
				+    int nb_transfer = nb_bytes/(4*64);
			
 
				     
			
 
				-    /* unsigned char* data=(unsigned char*)calloc(nb_bytes,sizeof(*data)); */
			
 
				-    /* memset(data,DATA,nb_bytes); */
			
 
				-    /* init_to_send(data,sizeof(char),nb_bytes); */
			
 
				-    /* system("/home/mathiasb/sources/benchmarking/launch.sh");	 */
			
 
				+    /* to be used to send only bytes of a certain DATA value */
			
 
				+    unsigned char* data=(unsigned char*)calloc(nb_bytes,sizeof(*data));
			
 
				+    memset(data,DATA,nb_bytes);
			
 
				+    init_to_send(data,sizeof(char),nb_bytes);
			
 
				+    system("/home/mathiasb/sources/benchmarking/launch.sh");	
			
 
				     
			
 
				     /* Initialisation of the APIs */
			
 
				     assert_cu( cuInit(0) );
			
@@ -168,16 +170,6 @@ int main(int argc, char* argv[])
 
				     double simple_write_meas1;
			
 
				     double simple_write_meas2;
			
 
				     double meas_result;
			
 
				-    unsigned char* getBack=(unsigned char*)calloc(nb_bytes,sizeof(*getBack));
			
 
				-
			
 
				-    /* for(int j=0;j<nb_bytes;j++) */
			
 
				-    /* 	printf("%hhx",data[j]); */
			
 
				-    /* printf("\n"); */
			
 
				-    /* memcpy(getBack,(const void*)gpuPageVa,nb_bytes); */
			
 
				-    /* for(int j=0;j<nb_bytes;j++) */
			
 
				-    /* 	printf("%hhx",getBack[j]); */
			
 
				-    /* printf("\n"); */
			
 
				-	
			
 
				 
			
 
				     printf("\nWorking on the FPGA\n");
			
 
				     WR32(REG_RESET_DMA, 1);
			
@@ -205,24 +197,17 @@ int main(int argc, char* argv[])
 
				     
			
 
				     fprintf(fp,"%lf",meas_result);
			
 
				     
			
 
				+    /* A few information for tests */
			
 
				     /* assert_cu( cuCtxPopCurrent(&cuCtx1) ); //ctx0 */
			
 
				-    /* memcpy(getBack,(const void*)gpuPageVa0,nb_bytes); */
			
 
				     /* for(int j=0;j<nb_bytes;j++){ */
			
 
				-    /* 	/\* if( getBack[j]!=DATA ){ *\/ */
			
 
				-    /* 	/\*     printf("Last at %d\n",j); *\/ */
			
 
				-    /* 	/\*     goto exit; *\/ */
			
 
				-    /* 	/\* } *\/ */
			
 
				-    /* 	printf("%hhx",getBack[j]); */
			
 
				+    /* 	printf("%hhx",((char*)gpuPageVa0)[j]); */
			
 
				     /* } */
			
 
				     /* printf("\n"); */
			
 
				+    /* printf("number of descriptor: %d\n",*(int*)pciMemPtrDesc); */
			
 
				+
			
 
				     /* assert_cu( cuCtxPushCurrent(cuCtx1) ); //ctx1 */
			
 
				-    /* memcpy(getBack,(const void*)gpuPageVa1,nb_bytes); */
			
 
				     /* for(int j=0;j<nb_bytes;j++){ */
			
 
				-    /* 	/\* if( getBack[j]!=DATA ){ *\/ */
			
 
				-    /* 	/\*     printf("Last at %d\n",j); *\/ */
			
 
				-    /* 	/\*     goto exit; *\/ */
			
 
				-    /* 	/\* } *\/ */
			
 
				-    /* 	printf("%hhx",getBack[j]); */
			
 
				+    /* 	printf("%hhx",((char*)gpuPageVa1)[j]); */
			
 
				     /* } */
			
 
				     /* printf("\n"); */
			
 
				     
			
--- a/src/two_steps_dma.cu
+++ b/src/two_steps_dma.cu
@@ -29,10 +29,12 @@ int main(int argc, char* argv[])
 
				 	exit( EXIT_FAILURE );
			
 
				     }
			
 
				 
			
 
				+    /* each transfer deals 64 words of 4 bytes */
			
 
				     int nb_bytes = atoi(argv[argc -1]);
			
 
				     printf("nb_bytes = %d\n",nb_bytes);
			
 
				-    int nb_transfer = nb_bytes/(4*64); //each transfer deals 64 words of 4 bytes
			
 
				-    
			
 
				+    int nb_transfer = nb_bytes/(4*64);
			
 
				+
			
 
				+    /* to be used to send only bytes of a certain DATA value */
			
 
				     unsigned char* data=(unsigned char*)calloc(nb_bytes,sizeof(*data));
			
 
				     memset(data,DATA,nb_bytes);
			
 
				     init_to_send(data,sizeof(char),nb_bytes);
			
@@ -142,16 +144,9 @@ int main(int argc, char* argv[])
 
				     double simple_write_meas1;
			
 
				     double simple_write_meas2;
			
 
				     double meas_result;
			
 
				-    unsigned char* getBack=(unsigned char*)calloc(nb_bytes,sizeof(*getBack));
			
 
				 
			
 
				     assert_cuda( cudaMemset((void*)gpuPagePtr,0xbb,nb_bytes) );
			
 
				-    assert_cuda( cudaDeviceSynchronize() );
			
 
				-    /* memcpy(getBack,(const void*)gpuPageVa,nb_bytes); */
			
 
				-    /* for(int j=0;j<nb_bytes;j++){ */
			
 
				-    /* 	printf("%hhx",getBack[j]); */
			
 
				-    /* } */
			
 
				-    /* printf("\n"); */
			
 
				-    
			
 
				+    assert_cuda( cudaDeviceSynchronize() );    
			
 
				 
			
 
				     printf("\nWorking on the FPGA\n");
			
 
				     WR32(REG_RESET_DMA, 1);
			
@@ -177,32 +172,23 @@ int main(int argc, char* argv[])
 
				     while(!*(int*)pciMemPtrDesc);
			
 
				     
			
 
				     assert_cu( cuMemcpyHtoDAsync(gpuPagePtr,(const void*)pciMemPtrPage,nb_bytes,0) );
			
 
				+    /* Waiting loop until memory changes. Can be dangerous. */
			
 
				     while( ((unsigned char*)gpuPageVa)[nb_bytes-1] == 0xbb );
			
 
				     simple_write_meas2 = 4. *RD32 (0x14)/ 1000;
			
 
				     meas_result=simple_write_meas2-simple_write_meas1;
			
 
				-    /* printf("%hhx-%hhx\n",((char*)gpuPageVa)[nb_bytes-1],((char*)gpuPageVa)[0]); */
			
 
				 
			
 
				-    /* memcpy(getBack,(const void*)pciMemPtrPage,nb_bytes); */
			
 
				+    /* A few information for tests */
			
 
				     /* for(int j=0;j<nb_bytes;j++){ */
			
 
				-    /* 	/\* if( getBack[j]!=DATA ){ *\/ */
			
 
				-    /* 	/\*     printf("Last at %d\n",j); *\/ */
			
 
				-    /* 	/\*     goto exit; *\/ */
			
 
				-    /* 	/\* } *\/ */
			
 
				-    /* 	printf("%hhx-",((char*)pciMemPtrPage)[j]); */
			
 
				+    /* 	printf("%hhx",((char*)gpuPageVa)[j]); */
			
 
				     /* } */
			
 
				     /* printf("\n"); */
			
 
				-    /* printf("pciBusPage: %p\n",pciBusPage); */
			
 
				-    /* printf("pageInfo.bus_addr: %p\n",pageInfo.bus_addr); */
			
 
				     /* printf("number of descriptor: %d\n",*(int*)pciMemPtrDesc); */
			
 
				-    /* printf("start_meas = %lf\n",start_meas); */
			
 
				-    /* printf("hwaddr = %lx\ngpuPagePtr = %llx\n",*hwaddr,pageInfo.bus_addr); */
			
 
				-
			
 
				+    
			
 
				     fprintf(fp,"%lf",meas_result);
			
 
				     
			
 
				 
			
 
				     /* Close everything */
			
 
				     printf("\nClosing the connections\n");
			
 
				-    free(getBack);
			
 
				     WR32(REG_COUNTER, 0);
			
 
				     WR32(REG_DMA, 0);
			
 
				     WR32(REG_RESET_DMA, 1);