Bläddra i källkod

Code cleaned and commented

mathiasb 6 år sedan
förälder
incheckning
7c019a1cbc
11 ändrade filer med 116 tillägg och 178 borttagningar
  1. 0 14
      CMakeLists.txt
  2. 30 0
      include/common.h
  3. 16 3
      include/kernels.h
  4. 0 0
      saves/floats_gpu-fpga.cu
  5. 16 17
      saves/matrix_gpu-fpga.cu
  6. 9 13
      src/common.cu
  7. 11 21
      src/cpu-fpga.cu
  8. 11 32
      src/gpu-fpga.cu
  9. 2 28
      src/kernels.cu
  10. 12 27
      src/multi-gpu.cu
  11. 9 23
      src/two_steps_dma.cu

+ 0 - 14
CMakeLists.txt

@@ -16,24 +16,12 @@ set(CUDA_NVCC_FLAGS
   -gencode arch=compute_35,code=sm_35
   )#;-rdc=true)
 
-cuda_add_executable(floats_gpu-fpga
-  src/floats_gpu-fpga.cu
-  src/common.cu
-  src/kernels.cu
-  )
-
 cuda_add_executable(gpu-fpga
   src/gpu-fpga.cu
   src/common.cu
   src/kernels.cu
   )
 
-cuda_add_executable(matrix_gpu-fpga
-  src/matrix_gpu-fpga.cu
-  src/common.cu
-  src/kernels.cu
-  )
-
 cuda_add_executable(two_steps_dma
   src/two_steps_dma.cu
   src/common.cu
@@ -58,9 +46,7 @@ cuda_add_executable(loaded
   src/kernels.cu
   )
 
-target_link_libraries(floats_gpu-fpga cuda pcilib gdrapi)
 target_link_libraries(gpu-fpga cuda pcilib gdrapi)
-target_link_libraries(matrix_gpu-fpga cuda pcilib gdrapi)
 target_link_libraries(two_steps_dma cuda pcilib gdrapi)
 target_link_libraries(cpu-fpga cuda pcilib gdrapi)
 target_link_libraries(multi-gpu cuda pcilib gdrapi)

+ 30 - 0
include/common.h

@@ -4,6 +4,7 @@
 #include "cuda.h"
 #include "cuda_runtime_api.h"
 
+/* Used to stop pthreads when loading CPU and/or GPU */
 extern int stop_process;
 
 typedef struct matrix_t{
@@ -13,9 +14,13 @@ typedef struct matrix_t{
     uint stride;
 } matrix;
 
+/* Number of threads in a block's dim */
 #define BLOCK_SIZE 16
+
 #define ASSERT_FAIL 0
 #define ASSERT_SUCCESS 1
+
+/* Width of a matrix and value used for checking */
 #define MATRIX_ROW_SIZE BLOCK_SIZE*16*1
 #define MATRIX_VALUE 2
 
@@ -27,18 +32,43 @@ typedef struct matrix_t{
 void __assert_cuda(cudaError_t err_id, const char* file, int line); /* for runtime api*/
 void __assert_cu(CUresult res_id, const char* file, int line); /* for driver api */
 void __assert_gdr(int gdr_id, const char* file, int line);
+
+/* Generates a binary to_send file used to transmit data to FPGA */
 void init_to_send(const void* dataPtr, size_t size, size_t nmemb);
+
+/* Checks if array of size floats is filled with value */
 bool check_array(float* array, float value, size_t size);
+
+/* Prints some information on device */
 void deviceInformation(int device);
+
+/* Functions to fill array with size floats of defined or random value */
 void cpu_fill_array(float* array, float value, size_t size);
 void cpu_fill_array_random(float* array, size_t size);
+
+/* Uses CPU to compute average and standard deviation of array of size floats */
 float cpu_average(float* array, size_t size);
 float cpu_dispersion(float* array, float average, size_t size);
+
+/* Returns an identity matrix of size row_size*row_size */
 matrix identity_matrix(size_t row_size);
+
+/* Asserts if M is identity matrix */
 void check_identity_matrix(matrix M);
+
+/* Asserts if matrix is filled with value */
 void check_matrix(matrix M, int value);
+
+/* Computes R=A*B with CPU */
 void mult_matrix(matrix A, matrix B, matrix R);
+
+/* Fills M with random values with CPU */
 void fill_matrix_random(matrix M);
+
+/* Functions to load the CPU/GPU with:
+ * computation: loops a matrix multiplication
+ * memory operations: loops memcpy
+ */
 void* cpu_load_compute(void* arg);
 void* gpu_load_compute(void* arg);
 void* cpu_load_memory(void* arg);

+ 16 - 3
include/kernels.h

@@ -3,22 +3,35 @@
 #include "gdrapi.h"
 #include "common.h"
 
-__device__ void add_two_device(CUdeviceptr number);
-__global__ void add_three_global(CUdeviceptr number);
-__global__ void add_one_global(CUdeviceptr number);
+/* Kernels to fill array with size int/floats of value */
 __global__ void fill_array(int* array, int value, size_t size);
 __global__ void fill_float_array(float* array, float value, size_t size);
+
+/* Functions to compute average and deviation of an array.
+ * Only get_average and get_reduction are to be used by main.
+ * Works with square grids/blocks
+ */
 extern __shared__ float _temp[];
 __global__ void get_reduction(float* array, float* output, size_t size, size_t stride);
 __global__ void get_average_reduction(float* array, float* average, size_t size, size_t stride, size_t divider);
 __global__ void get_dispersion_reduction(float* array, float* average, float* output, size_t size, size_t stride, size_t divider);
 void get_average(float* array, float* average, size_t size,dim3 blocks_per_grid, dim3 threads_per_block);
 void get_dispersion(float* array, float* average, float* dispersion, size_t size, dim3 blocks_per_grid, dim3 threads_per_block);
+
+/* Some kernels operating on matrix with various methods */
 __global__ void fill_matrix(int* M,int value);
 __global__ void add_matrix(int* A,int* B,int* C);
 __global__ void fill_matrix2(int* M,int value,size_t flat_size);
 __global__ void add_matrix2(int* A,int* B,int* C,size_t flat_size);
 __global__ void add_matrix_mod(int* B,int* C);
+
+/* Fills A as an identity matrix */
 __global__ void kern_identity_matrix(matrix A);
+
+/* An implementation of parallel matrix multiplication */
 __global__ void kern_mult_matrix_naive(matrix A, matrix B, matrix R);
+
+/* A (partially) optmisation of parallel matrix multiplication.
+ * Only works with square matrix of size multiple of BLOCK_SIZE
+ */
 __global__ void kern_mult_matrix_shared(matrix A, matrix B, matrix R);

+ 0 - 0
src/floats_gpu-fpga.cu → saves/floats_gpu-fpga.cu


+ 16 - 17
src/matrix_gpu-fpga.cu → saves/matrix_gpu-fpga.cu

@@ -135,9 +135,7 @@ int main(int argc, char* argv[])
     matrix dev_I,dev_matx,dev_result;
 
     cudaEvent_t start,stop;
-    float ms_shared,ms_naive;
-    struct timeval t1,t2;
-    double time;
+    float ms_shared;
     assert_cuda( cudaEventCreate(&start) );
     assert_cuda( cudaEventCreate(&stop) );
 
@@ -172,7 +170,9 @@ int main(int argc, char* argv[])
     err = cudaGetLastError();
     printf("%s: %s\n",cudaGetErrorName(err),cudaGetErrorString(err));
     assert_cuda( cudaMemcpy(I.elements,dev_I.elements,I.rows*I.columns*sizeof(float),cudaMemcpyDeviceToHost) );
-    //check_identity_matrix(I);
+
+    /* Control check */
+    check_identity_matrix(I);
 
     
     /* FPGA */
@@ -207,21 +207,20 @@ int main(int argc, char* argv[])
     check_identity_matrix(I);
 
    
-    /* assert_cuda( cudaEventRecord(start) ); */
-    /* kern_mult_matrix_shared<<< blocks_per_grid,threads_per_block >>>(dev_I,dev_matx,dev_result); */
-    /* err = cudaGetLastError(); */
-    /* printf("%s: %s\n",cudaGetErrorName(err),cudaGetErrorString(err)); */
-    /* assert_cuda( cudaDeviceSynchronize() ); */
-    /* assert_cuda( cudaEventRecord(stop) ); */
-    /* assert_cuda( cudaEventSynchronize(stop) ); */
-    /* assert_cuda( cudaEventElapsedTime(&ms_shared,start,stop) ); */
-    /* assert_cuda( cudaMemcpy(result.elements,dev_result.elements,result.rows*result.columns*sizeof(int),cudaMemcpyDeviceToHost) ); */
-    /* check_identity_matrix(result); */
-    /* //check_matrix(result, MATRIX_VALUE); */
-    /* printf("Time for GPU (shared): %f\n", ms_shared); */
+    assert_cuda( cudaEventRecord(start) );
+    kern_mult_matrix_shared<<< blocks_per_grid,threads_per_block >>>(dev_I,dev_matx,dev_result);
+    err = cudaGetLastError();
+    printf("%s: %s\n",cudaGetErrorName(err),cudaGetErrorString(err));
+    assert_cuda( cudaDeviceSynchronize() );
+    assert_cuda( cudaEventRecord(stop) );
+    assert_cuda( cudaEventSynchronize(stop) );
+    assert_cuda( cudaEventElapsedTime(&ms_shared,start,stop) );
+    assert_cuda( cudaMemcpy(result.elements,dev_result.elements,result.rows*result.columns*sizeof(int),cudaMemcpyDeviceToHost) );
+    check_identity_matrix(result);
+    //check_matrix(result, MATRIX_VALUE);
+    printf("Time for GPU (shared): %f\n", ms_shared);
 
     
-exit:
     printf("\nClosing the connections\n");
     WR32(REG_COUNTER, 0);
     WR32(REG_DMA, 0);

+ 9 - 13
src/common.cu

@@ -1,4 +1,4 @@
-/* This files contains useful fonctions like assertions */
+/* This files contains useful fonctions used by all the programs */
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -43,8 +43,6 @@ void init_to_send(const void* dataPtr, size_t size, size_t nmemb)
 	exit( EXIT_FAILURE );
     }
     int errCheck;
-    /* for(int i=0;i<nmemb;i++) */
-    /* 	errCheck = fprintf(filePtr,"%f",dataPtr[i]); */
     errCheck = fwrite(dataPtr,size,nmemb,filePtr);
     if( errCheck == 0 )
     {
@@ -144,13 +142,13 @@ void check_identity_matrix(matrix M)
 {
     for(int i=0;i<M.rows;i++){
 	for(int j=0;j<M.rows;j++){
-	    printf("%1.0f-",M.elements[i+j*M.rows]);
-	    /* if( (i==j && M.elements[i+j*M.rows]!=1) || (i!=j && M.elements[i+j*M.rows]!=0) ){ */
-	    /* 	printf("Error in identity matrix at [%d][%d]\n",i,j); */
-	    /* 	    exit( EXIT_FAILURE ); */
-	    /* 	} */
+	    /* printf("%1.0f-",M.elements[i+j*M.rows]); */
+	    if( (i==j && M.elements[i+j*M.rows]!=1) || (i!=j && M.elements[i+j*M.rows]!=0) ){
+	    	printf("Error in identity matrix at [%d][%d]\n",i,j);
+		exit( EXIT_FAILURE );
+	    	}
 	}
-	printf("\n");
+	/* printf("\n"); */
     }
     printf("Matrix checked: no error\n");
 }
@@ -159,13 +157,13 @@ void check_matrix(matrix M, int value)
 {
     for(int i=0;i<M.rows;i++){
 	for(int j=0;j<M.columns;j++){
-//	    printf("%d",M.elements[i*M.columns+j]);
+	    /* printf("%d",M.elements[i*M.columns+j]); */
 	    if(M.elements[i*M.columns+j]!=value){
 		printf("Error in matrix at [%d][%d]\n",i,j);
 		exit( EXIT_FAILURE );
 	    }
 	}
-//	printf("\n");
+	/* printf("\n"); */
     }
     printf("Matrix checked: no error\n");
 }
@@ -199,7 +197,6 @@ void fill_matrix_random(matrix M)
 
 void* cpu_load_compute(void* arg)
 {
-    /* Loops a matrix multiplication */
     matrix I,matx,result;
     I.rows = MATRIX_ROW_SIZE;
     I.columns = I.rows;
@@ -228,7 +225,6 @@ void* cpu_load_compute(void* arg)
 
 void* gpu_load_compute(void* arg)
 {
-    /* Loops a kernel that multiplies matrix */
     dim3 blocks_per_grid(MATRIX_ROW_SIZE/BLOCK_SIZE,MATRIX_ROW_SIZE/BLOCK_SIZE);
     dim3 threads_per_block(BLOCK_SIZE,BLOCK_SIZE);
     matrix dev_I,dev_matx,dev_result;

+ 11 - 21
src/cpu-fpga.cu

@@ -1,4 +1,4 @@
-/* Tansfers data from FPGA to CPU */
+/* Transfers arv[1] data from FPGA to CPU */
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -22,10 +22,10 @@
 #define GPU_PAGE 65536
 #define DATA 0xbb
 
-//#define LOAD
+/* To be used to load the cpu or not */
+/* #define LOAD */
 
 
-/* argv[1] = number of bytes to be written */
 int main(int argc, char* argv[])
 {
     FILE* fp = fopen("cpu-fpga.csv","a");
@@ -35,10 +35,12 @@ int main(int argc, char* argv[])
 	exit( EXIT_FAILURE );
     }
 
+    /* each transfer deals 64 words of 4 bytes */
     int nb_bytes = atoi(argv[argc -1]);
     printf("nb_bytes = %d\n",nb_bytes);
-    int nb_transfer = nb_bytes/(4*64); //each transfer deals 64 words of 4 bytes
+    int nb_transfer = nb_bytes/(4*64);
 
+    /* to be used to send only bytes of a certain DATA value */
     unsigned char* data=(unsigned char*)calloc(16384,sizeof(*data));
     memset(data,DATA,nb_bytes); memset(data+nb_bytes,0x00,16384-nb_bytes);
     init_to_send(data,sizeof(char),nb_bytes);
@@ -117,7 +119,6 @@ int main(int argc, char* argv[])
     double simple_write_meas1;
     double simple_write_meas2;
     double meas_result;
-    unsigned char* getBack=(unsigned char*)calloc(16384,sizeof(*getBack));
 
 #ifdef LOAD
     pthread_t cpu_compute,cpu_mem;
@@ -136,7 +137,6 @@ int main(int argc, char* argv[])
     WR32_sleep(REG_TIMER_THRESHOLD, 0x1);
     WR32_sleep(REG_UPDATE_THRESHOLD, 0x1);
     WR64_sleep(REG_UPDATE_COUNTER,pciBusDesc);
-//    WR64_sleep(REG_UPDATE_ADDRESS,descInfo.bus_addr+DESCRIPTOR_OFFSET);
 
     WR32_sleep(REG_CONTROL,CONTROL_ENABLE_READ|CONTROL_SOURCE_RX_FIFO);
     WR32_sleep(REG_DMA,1);
@@ -151,22 +151,13 @@ int main(int argc, char* argv[])
     meas_result=simple_write_meas2-simple_write_meas1;
 
     usleep(1000);
-    /* assert_cuda( cudaMemcpy((void*)gpuPageVa,(const void*)gpuPagePtr,16384,cudaMemcpyDeviceToHost) ); */
-    /* /\* memcpy(getBack,(const void*)pciMemPtrDesc,16384); *\/ */
-    /* memcpy(getBack,(const void*)gpuPageVa,16384); */
-    /* for(int j=0;j<16384;j++) */
-    /* { */
-    /* 	if( getBack[j]!=DATA ) */
-    /* 	{ */
-    /* 	    printf("Last at %d\n",j); */
-    /* 	    goto exit_failure; */
-    /* 	} */
-    /* 	printf("%hhx",getBack[j]); */
+    
+    /* A few information for tests */
+    /* for(int j=0;j<nb_bytes;j++){ */
+    /* 	printf("%hhx",((char*)pciMemPtrPage)[j]); */
     /* } */
     /* printf("\n"); */
-    /* printf("number of descriptor: %d\n",*(int*)gpuDescVa); */
-    /* printf("start_meas = %lf\n",start_meas); */
-    /* printf("hwaddr = %lx\ngpuPagePtr = %llx\n",*hwaddr,pageInfo.bus_addr); */
+    /* printf("number of descriptor: %d\n",*(int*)pciMemPtrDesc); */
 
     fprintf(fp,"%lf",meas_result);
     
@@ -180,7 +171,6 @@ int main(int argc, char* argv[])
     pthread_join(cpu_mem,NULL);
 #endif
     
-    free(getBack);
     WR32(REG_COUNTER, 0);
     WR32(REG_DMA, 0);
     WR32(REG_RESET_DMA, 1);

+ 11 - 32
src/gpu-fpga.cu

@@ -1,4 +1,4 @@
-/* A single test aimed at being looped with a script. Handles the number of Bytes passed in argv[1]. */
+/* A single,basic test aimed at being looped with a script. Handles the number of Bytes passed in argv[1]. */
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -23,16 +23,19 @@
 
 int main(int argc, char* argv[])
 {
+    /* output file */
     FILE* fp = fopen("gpu-fpga.csv","a");
     if( fp == NULL ){
 	printf("Cannot open file gpu-fpga.csv\n");
 	exit( EXIT_FAILURE );
     }
 
+    /* each transfer deals 64 words of 4 bytes */
     int nb_bytes = atoi(argv[argc -1]);
     printf("nb_bytes = %d\n",nb_bytes);
-    int nb_transfer = nb_bytes/(4*64); //each transfer deals 64 words of 4 bytes
-    
+    int nb_transfer = nb_bytes/(4*64);
+
+    /* to be used to send only bytes of a certain DATA value */
     unsigned char* data=(unsigned char*)calloc(nb_bytes,sizeof(*data));
     memset(data,DATA,nb_bytes);
     init_to_send(data,sizeof(char),nb_bytes);
@@ -97,8 +100,8 @@ int main(int argc, char* argv[])
 	exit( EXIT_FAILURE );
     }
     CUdeviceptr dBAR;
-    /* assert_cu( cuMemHostRegister((void*)pciVa,128,CU_MEMHOSTREGISTER_IOMEMORY) ); */
-    /* assert_cu( cuMemHostGetDevicePointer(&dBAR,(void*)pciVa, 0) ); */
+    assert_cu( cuMemHostRegister((void*)pciVa,128,CU_MEMHOSTREGISTER_IOMEMORY) );
+    assert_cu( cuMemHostGetDevicePointer(&dBAR,(void*)pciVa, 0) );
     
     /* Config PCI for Pages*/
     pcilib_kmem_handle_t* pciHandlePage;
@@ -141,20 +144,7 @@ int main(int argc, char* argv[])
 
     double simple_write_meas1;
     double simple_write_meas2;
-    double start_meas;
     double meas_result;
-    unsigned char* getBack=(unsigned char*)calloc(nb_bytes,sizeof(*getBack));
-
-    volatile uint64_t *hwaddr = (uint64_t*)((char*)gpuDescVa + DESCRIPTOR_OFFSET + 2 * sizeof(uint32_t));
-
-    /* for(int j=0;j<nb_bytes;j++) */
-    /* 	printf("%hhx",data[j]); */
-    /* printf("\n"); */
-    /* memcpy(getBack,(const void*)gpuPageVa,nb_bytes); */
-    /* for(int j=0;j<nb_bytes;j++) */
-    /* 	printf("%hhx",getBack[j]); */
-    /* printf("\n"); */
-	
 
     printf("\nWorking on the FPGA\n");
     WR32(REG_RESET_DMA, 1);
@@ -174,7 +164,6 @@ WR32_sleep(REG_PACKET_LENGTH,0x80000 | 64); // added flag
 
     WR32_sleep(REG_INTERCONNECT, 0x232); //0x262);
     WR32_sleep(REG_COUNTER,0x1);
-    start_meas = 4. *RD32 (0x14)/ 1000;
     *(int*)pciMemPtrDesc=0;
     simple_write_meas1 = 4. *RD32 (0x14)/ 1000;
     WR64(REG_DESCRIPTOR_ADDRESS,pageInfo.bus_addr);
@@ -184,28 +173,18 @@ WR32_sleep(REG_PACKET_LENGTH,0x80000 | 64); // added flag
 
     usleep(1000);
     
-    /* assert_cuda( cudaMemcpy((void*)gpuPageVa,(const void*)gpuPagePtr,nb_bytes,cudaMemcpyDeviceToHost) ); */
-    /* /\* memcpy(getBack,(const void*)pciMemPtrDesc,nb_bytes); *\/ */
-    /* memcpy(getBack,(const void*)gpuPageVa,nb_bytes); */
+    /* A few information for tests */
     /* for(int j=0;j<nb_bytes;j++){ */
-    /* 	/\* if( getBack[j]!=DATA ){ *\/ */
-    /* 	/\*     printf("Last at %d\n",j); *\/ */
-    /* 	/\*     goto exit; *\/ */
-    /* 	/\* } *\/ */
-    /* 	printf("%hhx",getBack[j]); */
+    /* 	printf("%hhx",((char*)gpuPageVa)[j]); */
     /* } */
     /* printf("\n"); */
     /* printf("number of descriptor: %d\n",*(int*)pciMemPtrDesc); */
-    /* printf("start_meas = %lf\n",start_meas); */
-    /* printf("hwaddr = %p\ngpuPagePtr = %p\n",*hwaddr,pageInfo.bus_addr); */
 
     fprintf(fp,"%lf",meas_result);
     
 
     /* Close everything */
-exit:
     printf("\nClosing the connections\n");
-    free(getBack);
     WR32(REG_COUNTER, 0);
     WR32(REG_DMA, 0);
     WR32(REG_RESET_DMA, 1);
@@ -215,7 +194,7 @@ exit:
 
     pcilib_free_kernel_memory(pciCtx,pciHandleDesc,PCILIB_KMEM_FLAG_FORCE);
     pcilib_free_kernel_memory(pciCtx,pciHandlePage,PCILIB_KMEM_FLAG_FORCE);
-    /* assert_cu( cuMemHostUnregister((void*) pciVa) ); */
+    assert_cu( cuMemHostUnregister((void*) pciVa) );
     pcilib_close(pciCtx);
     assert_gdr( gdr_unmap(g,GPUMemHandlePage,gpuPageVa,nb_bytes) );
     assert_gdr( gdr_unpin_buffer(g,GPUMemHandlePage) );

+ 2 - 28
src/kernels.cu

@@ -7,28 +7,6 @@
 
 
 
-__device__
-void add_two_device(CUdeviceptr number)
-{
-    (* (int*) number)+=2;
-}
-
-
-__global__
-void add_three_global(CUdeviceptr number)
-{
-    (* (int*) number)++;
-    add_two_device(number);
-}
-
-
-__global__
-void add_one_global(CUdeviceptr number)
-{
-    (* (int*) number)++;
-}
-
-
 __global__
 void fill_array(int* array, int value, size_t size)
 {
@@ -143,7 +121,7 @@ void get_average(float* array, float* average, size_t size,dim3 blocks_per_grid,
 	<<< 1,extend_size,extend_size*sizeof(float)  >>>
 	(extend,
 	 average,
-	 extend_size,//blocks_per_grid.x,
+	 extend_size,
 	 0,
 	 size);
     assert_cuda( cudaFree(output) );
@@ -185,7 +163,7 @@ void get_dispersion(float* array, float* average, float* dispersion, size_t size
       	<<< 1,extend_size,blocks_per_grid.x*sizeof(float)  >>>
     	(extend,
     	 dispersion,
-    	 extend_size,//blocks_per_grid.x,
+    	 extend_size,
     	 0);
     assert_cuda( cudaFree(output) );
     assert_cuda( cudaFree(extend) );
@@ -216,7 +194,6 @@ __global__
 void fill_matrix2(int* M,int value,size_t flat_size)
 {
     uint i =(blockIdx.x*blockDim.x)+threadIdx.x;
-    //   printf("i=%d",blockDim.x);
     if( i<flat_size )
 	M[i] = value;
 }
@@ -247,9 +224,6 @@ void kern_identity_matrix(matrix I)
 {
     uint x = blockIdx.x*blockDim.x +threadIdx.x;
     uint y = blockIdx.y*blockDim.y +threadIdx.y;
-/*    uint step = blockDim.x*gridDim.x;
-    for(int i =0;i<blockDim.x;i+=blockDim.y)
-    I.elements[x+(y+i)*step] = (x==(y+i))? 1:0; */
     if( x>=I.columns || y>=I.rows )
 	return;
     I.elements[x+y*I.columns] = (x==y)? 1:0;

+ 12 - 27
src/multi-gpu.cu

@@ -27,14 +27,16 @@ int main(int argc, char* argv[])
 	exit( EXIT_FAILURE );
     }
 
+    /* each transfer deals 64 words of 4 bytes */
     int nb_bytes = atoi(argv[argc -1]);
     printf("nb_bytes = %d\n",nb_bytes);
-    int nb_transfer = nb_bytes/(4*64); //each transfer deals 64 words of 4 bytes
+    int nb_transfer = nb_bytes/(4*64);
     
-    /* unsigned char* data=(unsigned char*)calloc(nb_bytes,sizeof(*data)); */
-    /* memset(data,DATA,nb_bytes); */
-    /* init_to_send(data,sizeof(char),nb_bytes); */
-    /* system("/home/mathiasb/sources/benchmarking/launch.sh");	 */
+    /* to be used to send only bytes of a certain DATA value */
+    unsigned char* data=(unsigned char*)calloc(nb_bytes,sizeof(*data));
+    memset(data,DATA,nb_bytes);
+    init_to_send(data,sizeof(char),nb_bytes);
+    system("/home/mathiasb/sources/benchmarking/launch.sh");	
     
     /* Initialisation of the APIs */
     assert_cu( cuInit(0) );
@@ -168,16 +170,6 @@ int main(int argc, char* argv[])
     double simple_write_meas1;
     double simple_write_meas2;
     double meas_result;
-    unsigned char* getBack=(unsigned char*)calloc(nb_bytes,sizeof(*getBack));
-
-    /* for(int j=0;j<nb_bytes;j++) */
-    /* 	printf("%hhx",data[j]); */
-    /* printf("\n"); */
-    /* memcpy(getBack,(const void*)gpuPageVa,nb_bytes); */
-    /* for(int j=0;j<nb_bytes;j++) */
-    /* 	printf("%hhx",getBack[j]); */
-    /* printf("\n"); */
-	
 
     printf("\nWorking on the FPGA\n");
     WR32(REG_RESET_DMA, 1);
@@ -205,24 +197,17 @@ int main(int argc, char* argv[])
     
     fprintf(fp,"%lf",meas_result);
     
+    /* A few information for tests */
     /* assert_cu( cuCtxPopCurrent(&cuCtx1) ); //ctx0 */
-    /* memcpy(getBack,(const void*)gpuPageVa0,nb_bytes); */
     /* for(int j=0;j<nb_bytes;j++){ */
-    /* 	/\* if( getBack[j]!=DATA ){ *\/ */
-    /* 	/\*     printf("Last at %d\n",j); *\/ */
-    /* 	/\*     goto exit; *\/ */
-    /* 	/\* } *\/ */
-    /* 	printf("%hhx",getBack[j]); */
+    /* 	printf("%hhx",((char*)gpuPageVa0)[j]); */
     /* } */
     /* printf("\n"); */
+    /* printf("number of descriptor: %d\n",*(int*)pciMemPtrDesc); */
+
     /* assert_cu( cuCtxPushCurrent(cuCtx1) ); //ctx1 */
-    /* memcpy(getBack,(const void*)gpuPageVa1,nb_bytes); */
     /* for(int j=0;j<nb_bytes;j++){ */
-    /* 	/\* if( getBack[j]!=DATA ){ *\/ */
-    /* 	/\*     printf("Last at %d\n",j); *\/ */
-    /* 	/\*     goto exit; *\/ */
-    /* 	/\* } *\/ */
-    /* 	printf("%hhx",getBack[j]); */
+    /* 	printf("%hhx",((char*)gpuPageVa1)[j]); */
     /* } */
     /* printf("\n"); */
     

+ 9 - 23
src/two_steps_dma.cu

@@ -29,10 +29,12 @@ int main(int argc, char* argv[])
 	exit( EXIT_FAILURE );
     }
 
+    /* each transfer deals 64 words of 4 bytes */
     int nb_bytes = atoi(argv[argc -1]);
     printf("nb_bytes = %d\n",nb_bytes);
-    int nb_transfer = nb_bytes/(4*64); //each transfer deals 64 words of 4 bytes
-    
+    int nb_transfer = nb_bytes/(4*64);
+
+    /* to be used to send only bytes of a certain DATA value */
     unsigned char* data=(unsigned char*)calloc(nb_bytes,sizeof(*data));
     memset(data,DATA,nb_bytes);
     init_to_send(data,sizeof(char),nb_bytes);
@@ -142,16 +144,9 @@ int main(int argc, char* argv[])
     double simple_write_meas1;
     double simple_write_meas2;
     double meas_result;
-    unsigned char* getBack=(unsigned char*)calloc(nb_bytes,sizeof(*getBack));
 
     assert_cuda( cudaMemset((void*)gpuPagePtr,0xbb,nb_bytes) );
-    assert_cuda( cudaDeviceSynchronize() );
-    /* memcpy(getBack,(const void*)gpuPageVa,nb_bytes); */
-    /* for(int j=0;j<nb_bytes;j++){ */
-    /* 	printf("%hhx",getBack[j]); */
-    /* } */
-    /* printf("\n"); */
-    
+    assert_cuda( cudaDeviceSynchronize() );    
 
     printf("\nWorking on the FPGA\n");
     WR32(REG_RESET_DMA, 1);
@@ -177,32 +172,23 @@ int main(int argc, char* argv[])
     while(!*(int*)pciMemPtrDesc);
     
     assert_cu( cuMemcpyHtoDAsync(gpuPagePtr,(const void*)pciMemPtrPage,nb_bytes,0) );
+    /* Waiting loop until memory changes. Can be dangerous. */
     while( ((unsigned char*)gpuPageVa)[nb_bytes-1] == 0xbb );
     simple_write_meas2 = 4. *RD32 (0x14)/ 1000;
     meas_result=simple_write_meas2-simple_write_meas1;
-    /* printf("%hhx-%hhx\n",((char*)gpuPageVa)[nb_bytes-1],((char*)gpuPageVa)[0]); */
 
-    /* memcpy(getBack,(const void*)pciMemPtrPage,nb_bytes); */
+    /* A few information for tests */
     /* for(int j=0;j<nb_bytes;j++){ */
-    /* 	/\* if( getBack[j]!=DATA ){ *\/ */
-    /* 	/\*     printf("Last at %d\n",j); *\/ */
-    /* 	/\*     goto exit; *\/ */
-    /* 	/\* } *\/ */
-    /* 	printf("%hhx-",((char*)pciMemPtrPage)[j]); */
+    /* 	printf("%hhx",((char*)gpuPageVa)[j]); */
     /* } */
     /* printf("\n"); */
-    /* printf("pciBusPage: %p\n",pciBusPage); */
-    /* printf("pageInfo.bus_addr: %p\n",pageInfo.bus_addr); */
     /* printf("number of descriptor: %d\n",*(int*)pciMemPtrDesc); */
-    /* printf("start_meas = %lf\n",start_meas); */
-    /* printf("hwaddr = %lx\ngpuPagePtr = %llx\n",*hwaddr,pageInfo.bus_addr); */
-
+    
     fprintf(fp,"%lf",meas_result);
     
 
     /* Close everything */
     printf("\nClosing the connections\n");
-    free(getBack);
     WR32(REG_COUNTER, 0);
     WR32(REG_DMA, 0);
     WR32(REG_RESET_DMA, 1);