Browse source code

Add gitignore. Move load functions to common.cu

mathiasb 6 years ago
parent
commit
6c7b020edf
8 changed files with 155 additions and 122 deletions
  1. .gitignore (+20, -0)
  2. build/gpu-fpga.sh (+7, -7)
  3. build/two_steps_dma.sh (+7, -7)
  4. include/common.h (+6, -0)
  5. src/common.cu (+90, -0)
  6. src/cpu-fpga.cu (+16, -13)
  7. src/loaded.cu (+6, -93)
  8. src/two_steps_dma.cu (+3, -2)
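
The load helpers moved into src/common.cu are meant to be driven from the benchmark programs through POSIX threads, the way cpu-fpga.cu and loaded.cu do in the hunks below. A minimal, hypothetical usage sketch (the main() here is not part of the commit; it assumes the file is built with nvcc, with include/common.h on the include path and src/common.cu linked in):

    #include <pthread.h>
    #include "common.h"   /* declares stop_process and the *_load_* helpers */

    int main(void)
    {
        pthread_t cpu_compute, gpu_mem;

        /* Start background load on the CPU and on the GPU copy engine. */
        pthread_create(&cpu_compute, NULL, cpu_load_compute, NULL);
        pthread_create(&gpu_mem, NULL, gpu_load_memory, NULL);

        /* ... run the DMA measurement under load here ... */

        /* Signal the load loops to stop and wait for the threads. */
        stop_process = 1;
        pthread_join(cpu_compute, NULL);
        pthread_join(gpu_mem, NULL);
        return 0;
    }

Each helper spins until the shared stop_process flag is set, so the calling benchmark controls exactly when the background load starts and ends.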

+ 20 - 0
.gitignore

@@ -0,0 +1,20 @@
+CMakeCache.txt
+CMakeFiles/
+cmake_install.cmake
+cpu-fpga.csv
+cpu-fpga.export.csv
+gpu-fpga.csv
+gpu-fpga.export.csv
+loaded.csv
+loaded.export.csv
+multi-gpu.csv
+multi-gpu.export.csv
+two_steps_dma.csv
+two_steps_dma.export.csv
+Makefile
+cpu-fpga
+gpu-fpga
+loaded
+multi-gpu
+two_steps_dma
+to_send

+ 7 - 7
build/gpu-fpga.sh

@@ -3,12 +3,12 @@
 ITERATION=$1
 
 echo P100 > gpu-fpga.csv
-# for i in `seq 1 $ITERATION`;
-# do
-#     echo $i
-#     CUDA_VISIBLE_DEVICES=0 ./gpu-fpga 4096 
-#     echo , >> gpu-fpga.csv
-# done
+for i in `seq 1 $ITERATION`;
+do
+    echo $i
+    CUDA_VISIBLE_DEVICES=0 ./gpu-fpga 4096 
+    echo , >> gpu-fpga.csv
+done
 
 echo \ >> gpu-fpga.csv
 
@@ -16,7 +16,7 @@ echo K40 >> gpu-fpga.csv
 for i in `seq 1 $ITERATION`;
 do
    echo $i
-   CUDA_VISIBLE_DEVICES=0 ./gpu-fpga 4096
+   CUDA_VISIBLE_DEVICES=1 ./gpu-fpga 4096
    echo , >> gpu-fpga.csv
 done
 

+ 7 - 7
build/two_steps_dma.sh

@@ -3,12 +3,12 @@
 ITERATION=$1
 
 echo P100 > two_steps_dma.csv
-# for i in `seq 1 $ITERATION`;
-# do
-#     echo $i
-#     CUDA_VISIBLE_DEVICES=0 ./two_steps_dma 4096
-#     echo , >> two_steps_dma.csv
-# done
+for i in `seq 1 $ITERATION`;
+do
+    echo $i
+    CUDA_VISIBLE_DEVICES=0 ./two_steps_dma 4096
+    echo , >> two_steps_dma.csv
+done
 
 echo \ >> two_steps_dma.csv
 
@@ -16,7 +16,7 @@ echo K40 >> two_steps_dma.csv
 for i in `seq 1 $ITERATION`;
 do
     echo $i
-    CUDA_VISIBLE_DEVICES=0 ./two_steps_dma 4096
+    CUDA_VISIBLE_DEVICES=1 ./two_steps_dma 4096
     echo , >> two_steps_dma.csv
 done
 

+ 6 - 0
include/common.h

@@ -4,6 +4,8 @@
 #include "cuda.h"
 #include "cuda_runtime_api.h"
 
+extern int stop_process;
+
 typedef struct matrix_t{
     float* elements;
     uint rows;
@@ -37,5 +39,9 @@ void check_identity_matrix(matrix M);
 void check_matrix(matrix M, int value);
 void mult_matrix(matrix A, matrix B, matrix R);
 void fill_matrix_random(matrix M);
+void* cpu_load_compute(void* arg);
+void* gpu_load_compute(void* arg);
+void* cpu_load_memory(void* arg);
+void* gpu_load_memory(void* arg);
 
 #endif

+ 90 - 0
src/common.cu

@@ -3,6 +3,9 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include "common.h"
+#include "kernels.h"
+
+int stop_process = 0;
 
 void __assert_cuda(cudaError_t err_id, const char* file, int line)
 {
@@ -193,3 +196,90 @@ void fill_matrix_random(matrix M)
 	}
     }
 }
+
+void* cpu_load_compute(void* arg)
+{
+    /* Loops a matrix multiplication */
+    matrix I,matx,result;
+    I.rows = MATRIX_ROW_SIZE;
+    I.columns = I.rows;
+    I.stride = I.columns;
+    matx.rows = MATRIX_ROW_SIZE;
+    matx.columns = matx.rows;
+    matx.stride = matx.columns;
+    result.rows = MATRIX_ROW_SIZE;
+    result.columns = result.rows;
+    result.stride = result.columns;
+    I.elements = (float*)malloc(I.rows*I.columns*sizeof(float));
+    matx.elements = (float*)malloc(matx.rows*matx.columns*sizeof(float));
+    result.elements = (float*)malloc(result.rows*result.columns*sizeof(float));
+
+    fill_matrix_random(I);
+    fill_matrix_random(matx);
+    
+    while(!stop_process){
+	mult_matrix(I, matx, result);
+    }
+    free(I.elements);
+    free(result.elements);
+    free(matx.elements);
+    return NULL;
+}
+
+void* gpu_load_compute(void* arg)
+{
+    /* Loops a kernel that multiplies matrix */
+    dim3 blocks_per_grid(MATRIX_ROW_SIZE/BLOCK_SIZE,MATRIX_ROW_SIZE/BLOCK_SIZE);
+    dim3 threads_per_block(BLOCK_SIZE,BLOCK_SIZE);
+    matrix dev_I,dev_matx,dev_result;
+
+    dev_I.rows = MATRIX_ROW_SIZE;
+    dev_I.columns = dev_I.rows;
+    dev_I.stride = dev_I.columns;
+    dev_matx.rows = MATRIX_ROW_SIZE;
+    dev_matx.columns = dev_matx.rows;
+    dev_matx.stride = dev_matx.columns;
+    dev_result.rows = MATRIX_ROW_SIZE;
+    dev_result.columns = dev_result.rows;
+    dev_result.stride = dev_result.columns;
+    assert_cuda( cudaMalloc((void**)&dev_I.elements,MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(float)) );
+    assert_cuda( cudaMalloc((void**)&dev_matx.elements,MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(float)) );
+    assert_cuda( cudaMalloc((void**)&dev_result.elements,MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(float)) );
+
+    kern_identity_matrix<<< blocks_per_grid,threads_per_block >>>(dev_I);
+    kern_identity_matrix<<< blocks_per_grid,threads_per_block >>>(dev_matx);
+
+    while(!stop_process){
+	kern_mult_matrix_shared<<< blocks_per_grid,threads_per_block >>>(dev_I,dev_matx,dev_result);
+    }
+    assert_cuda( cudaFree(dev_I.elements) );
+    assert_cuda( cudaFree(dev_result.elements) );
+    assert_cuda( cudaFree(dev_matx.elements) );
+    return NULL;
+}
+
+void* cpu_load_memory(void* arg)
+{
+    char* foo = (char*) malloc( MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(char) );
+    char* bar = (char*) malloc( MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(char) );
+    while(!stop_process){
+	memcpy(foo, bar, MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(char));
+    }
+    free(foo);
+    free(bar);
+    return NULL;
+}
+
+void* gpu_load_memory(void* arg)
+{
+    char *dev_foo,*dev_bar;
+    assert_cuda( cudaMalloc((void**)&dev_foo,MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(char)) );
+    assert_cuda( cudaMalloc((void**)&dev_bar,MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(char)) );
+    while(!stop_process){
+	assert_cuda( cudaMemcpy(dev_foo,dev_bar,MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(char),cudaMemcpyDeviceToDevice) );
+    }
+    assert_cuda( cudaFree(dev_foo) );
+    assert_cuda( cudaFree(dev_bar) );
+    return NULL;
+}
+

+ 16 - 13
src/cpu-fpga.cu

@@ -10,6 +10,7 @@
 #include <pcilib/bar.h>
 #include "ipedma.h"
 #include <unistd.h>
+#include <pthread.h>
 
 #define KMEM_DEFAULT_FLAGS      (pcilib_kmem_flags_t)(PCILIB_KMEM_FLAG_HARDWARE | PCILIB_KMEM_FLAG_PERSISTENT | PCILIB_KMEM_FLAG_EXCLUSIVE)
 
@@ -21,6 +22,8 @@
 #define GPU_PAGE 65536
 #define DATA 0xbb
 
+//#define LOAD
+
 
 /* argv[1] = number of bytes to be written */
 int main(int argc, char* argv[])
@@ -113,21 +116,15 @@ int main(int argc, char* argv[])
 
     double simple_write_meas1;
     double simple_write_meas2;
-    double start_meas;
     double meas_result;
     unsigned char* getBack=(unsigned char*)calloc(16384,sizeof(*getBack));
 
-    volatile uint64_t *hwaddr = (uint64_t*)((char*)pciMemPtrDesc + DESCRIPTOR_OFFSET + 2 * sizeof(uint32_t));
-
-    /* for(int j=0;j<16384;j++) */
-    /* 	printf("%hhx",data[j]); */
-    /* printf("\n"); */
-    /* memcpy(getBack,(const void*)gpuPageVa,nb_bytes); */
-    /* for(int j=0;j<nb_bytes;j++) */
-    /* 	printf("%hhx",getBack[j]); */
-    /* printf("\n"); */
-	
-
+#ifdef LOAD
+    pthread_t cpu_compute,cpu_mem;
+    pthread_create(&cpu_compute, NULL, cpu_load_compute, NULL);
+    pthread_create(&cpu_mem, NULL, cpu_load_memory, NULL);    
+#endif
+    
     printf("\nWorking on the FPGA\n");
     WR32(REG_RESET_DMA, 1);
     usleep(100000);
@@ -146,7 +143,6 @@ int main(int argc, char* argv[])
 
     WR32_sleep(REG_INTERCONNECT, 0x232); //0x262);
     WR32_sleep(REG_COUNTER,0x1);
-    start_meas = 4. *RD32 (0x14)/ 1000;
     *(int*)pciMemPtrDesc=0;
     simple_write_meas1 = 4. *RD32 (0x14)/ 1000;
     WR64(REG_DESCRIPTOR_ADDRESS,pciBusPage);
@@ -177,6 +173,13 @@ int main(int argc, char* argv[])
 
     /* Close everything */
     printf("\nClosing the connections\n");
+
+#ifdef LOAD
+    stop_process = 1;
+    pthread_join(cpu_compute,NULL);
+    pthread_join(cpu_mem,NULL);
+#endif
+    
     free(getBack);
     WR32(REG_COUNTER, 0);
     WR32(REG_DMA, 0);

+ 6 - 93
src/loaded.cu

@@ -19,94 +19,6 @@
 #define GPU_PAGE 65536
 #define DATA 0xa2
 
-int stop_process = 0;
-
-void* cpu_load_compute(void* arg)
-{
-    /* Loops a matrix multiplication */
-    matrix I,matx,result;
-    I.rows = MATRIX_ROW_SIZE;
-    I.columns = I.rows;
-    I.stride = I.columns;
-    matx.rows = MATRIX_ROW_SIZE;
-    matx.columns = matx.rows;
-    matx.stride = matx.columns;
-    result.rows = MATRIX_ROW_SIZE;
-    result.columns = result.rows;
-    result.stride = result.columns;
-    I.elements = (float*)malloc(I.rows*I.columns*sizeof(float));
-    matx.elements = (float*)malloc(matx.rows*matx.columns*sizeof(float));
-    result.elements = (float*)malloc(result.rows*result.columns*sizeof(float));
-
-    fill_matrix_random(I);
-    fill_matrix_random(matx);
-    
-    while(!stop_process){
-	mult_matrix(I, matx, result);
-    }
-    free(I.elements);
-    free(result.elements);
-    free(matx.elements);
-    return NULL;
-}
-
-void* gpu_load_compute(void* arg)
-{
-    /* Loops a kernel that multiplies matrix */
-    dim3 blocks_per_grid(MATRIX_ROW_SIZE/BLOCK_SIZE,MATRIX_ROW_SIZE/BLOCK_SIZE);
-    dim3 threads_per_block(BLOCK_SIZE,BLOCK_SIZE);
-    matrix dev_I,dev_matx,dev_result;
-
-    dev_I.rows = MATRIX_ROW_SIZE;
-    dev_I.columns = dev_I.rows;
-    dev_I.stride = dev_I.columns;
-    dev_matx.rows = MATRIX_ROW_SIZE;
-    dev_matx.columns = dev_matx.rows;
-    dev_matx.rows = dev_matx.columns;
-    dev_result.rows = MATRIX_ROW_SIZE;
-    dev_result.columns = dev_result.rows;
-    dev_result.stride = dev_result.columns;
-    assert_cuda( cudaMalloc((void**)&dev_I.elements,MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(float)) );
-    assert_cuda( cudaMalloc((void**)&dev_matx.elements,MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(float)) );
-    assert_cuda( cudaMalloc((void**)&dev_result.elements,MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(float)) );
-
-    kern_identity_matrix<<< blocks_per_grid,threads_per_block >>>(dev_I);
-    kern_identity_matrix<<< blocks_per_grid,threads_per_block >>>(dev_matx);
-
-    while(!stop_process){
-	kern_mult_matrix_shared<<< blocks_per_grid,threads_per_block >>>(dev_I,dev_matx,dev_result);
-    }
-    assert_cuda( cudaFree(dev_I.elements) );
-    assert_cuda( cudaFree(dev_result.elements) );
-    assert_cuda( cudaFree(dev_matx.elements) );
-    return NULL;
-}
-
-void* cpu_load_memory(void* arg)
-{
-    char* foo = (char*) malloc( MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(char) );
-    char* bar = (char*) malloc( MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(char) );
-    while(!stop_process){
-	memcpy(foo, bar, MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(char));
-    }
-    free(foo);
-    free(bar);
-    return NULL;
-}
-
-void* gpu_load_memory(void* arg)
-{
-    char *dev_foo,*dev_bar;
-    assert_cuda( cudaMalloc((void**)&dev_foo,MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(char)) );
-    assert_cuda( cudaMalloc((void**)&dev_bar,MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(char)) );
-    while(!stop_process){
-	assert_cuda( cudaMemcpy(dev_foo,dev_bar,MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(char),cudaMemcpyDeviceToDevice) );
-    }
-    assert_cuda( cudaFree(dev_foo) );
-    assert_cuda( cudaFree(dev_bar) );
-    return NULL;
-}
-
 
 int main(int argc, char* argv[])
 {
@@ -232,11 +144,12 @@ int main(int argc, char* argv[])
     unsigned char* getBack=(unsigned char*)calloc(nb_bytes,sizeof(*getBack));
 
     pthread_t cpu_compute,cpu_mem,gpu_compute,gpu_mem;
+
     pthread_create(&cpu_compute, NULL, cpu_load_compute, NULL);
-    pthread_create(&gpu_compute, NULL, gpu_load_compute, NULL);
-    pthread_create(&cpu_mem, NULL, cpu_load_memory, NULL);
+    pthread_create(&cpu_mem, NULL, cpu_load_memory, NULL);    
     pthread_create(&gpu_mem, NULL, gpu_load_memory, NULL);
-
+    pthread_create(&gpu_compute, NULL, gpu_load_compute, NULL);
+    
     printf("\nWorking on the FPGA\n");
     WR32(REG_RESET_DMA, 1);
     usleep(100000);
@@ -272,10 +185,10 @@ WR32_sleep(REG_PACKET_LENGTH,0x80000 | 64); // added flag
 
     stop_process = 1;
     pthread_join(cpu_compute,NULL);
-    pthread_join(gpu_compute,NULL);
     pthread_join(cpu_mem,NULL);
     pthread_join(gpu_mem,NULL);
-
+    pthread_join(gpu_compute,NULL);
+    
     free(getBack);
     WR32(REG_COUNTER, 0);
     WR32(REG_DMA, 0);

+ 3 - 2
src/two_steps_dma.cu

@@ -144,7 +144,7 @@ int main(int argc, char* argv[])
     double meas_result;
     unsigned char* getBack=(unsigned char*)calloc(nb_bytes,sizeof(*getBack));
 
-    assert_cuda( cudaMemset((void*)gpuPagePtr,0x00,nb_bytes) );
+    assert_cuda( cudaMemset((void*)gpuPagePtr,0xbb,nb_bytes) );
     assert_cuda( cudaDeviceSynchronize() );
     /* memcpy(getBack,(const void*)gpuPageVa,nb_bytes); */
     /* for(int j=0;j<nb_bytes;j++){ */
@@ -177,6 +177,7 @@ int main(int argc, char* argv[])
     while(!*(int*)pciMemPtrDesc);
     
     assert_cu( cuMemcpyHtoDAsync(gpuPagePtr,(const void*)pciMemPtrPage,nb_bytes,0) );
+    while( ((unsigned char*)gpuPageVa)[nb_bytes-1] == 0xbb );
     simple_write_meas2 = 4. *RD32 (0x14)/ 1000;
     meas_result=simple_write_meas2-simple_write_meas1;
     /* printf("%hhx-%hhx\n",((char*)gpuPageVa)[nb_bytes-1],((char*)gpuPageVa)[0]); */
@@ -187,7 +188,7 @@ int main(int argc, char* argv[])
     /* 	/\*     printf("Last at %d\n",j); *\/ */
     /* 	/\*     goto exit; *\/ */
     /* 	/\* } *\/ */
-    /* 	printf("%hhx",((char*)pciMemPtrPage)[j]); */
+    /* 	printf("%hhx-",((char*)pciMemPtrPage)[j]); */
     /* } */
     /* printf("\n"); */
     /* printf("pciBusPage: %p\n",pciBusPage); */