|
@@ -19,94 +19,6 @@
|
|
|
#define GPU_PAGE 65536
|
|
|
#define DATA 0xa2
|
|
|
|
|
|
-int stop_process = 0;
|
|
|
-
|
|
|
/*
 * CPU compute-load thread body.
 *
 * Allocates three square MATRIX_ROW_SIZE x MATRIX_ROW_SIZE float matrices,
 * fills the two operands with fill_matrix_random(), then calls
 * mult_matrix() in a loop until the global stop_process flag becomes
 * non-zero.
 *
 * arg: unused.
 * Returns: always NULL.
 *
 * If any buffer cannot be allocated, a message is written to stderr and
 * the thread returns without generating load.
 *
 * NOTE(review): stop_process is a plain int polled without any
 * synchronization; confirm the compiler cannot hoist the read out of the
 * loop, or switch the flag to an atomic type where it is declared.
 */
void* cpu_load_compute(void* arg)
{
    matrix I, matx, result;

    (void)arg;

    /* Every matrix is square, with stride set equal to its column count. */
    I.rows = MATRIX_ROW_SIZE;
    I.columns = I.rows;
    I.stride = I.columns;

    matx.rows = MATRIX_ROW_SIZE;
    matx.columns = matx.rows;
    matx.stride = matx.columns;

    result.rows = MATRIX_ROW_SIZE;
    result.columns = result.rows;
    result.stride = result.columns;

    I.elements = (float*)malloc(I.rows*I.columns*sizeof(float));
    matx.elements = (float*)malloc(matx.rows*matx.columns*sizeof(float));
    result.elements = (float*)malloc(result.rows*result.columns*sizeof(float));

    if (I.elements == NULL || matx.elements == NULL || result.elements == NULL) {
        /* Do not hand NULL buffers to the matrix helpers. */
        fprintf(stderr, "cpu_load_compute: out of memory\n");
    } else {
        fill_matrix_random(I);
        fill_matrix_random(matx);

        /* The product is recomputed into the same result buffer each pass. */
        while (!stop_process) {
            mult_matrix(I, matx, result);
        }
    }

    /* free(NULL) is a no-op, so this is safe on the failure path too. */
    free(I.elements);
    free(result.elements);
    free(matx.elements);
    return NULL;
}
|
|
|
-
|
|
|
/*
 * GPU compute-load thread body.
 *
 * Allocates three MATRIX_ROW_SIZE x MATRIX_ROW_SIZE float matrices with
 * cudaMalloc(), initialises the two operands with kern_identity_matrix,
 * then launches kern_mult_matrix_shared in a loop until the global
 * stop_process flag becomes non-zero.
 *
 * arg: unused.
 * Returns: always NULL.
 *
 * Every CUDA runtime call is wrapped in assert_cuda(); the kernel launches
 * themselves are not checked.
 *
 * NOTE(review): the grid size uses integer division, so this presumably
 * expects MATRIX_ROW_SIZE to be a multiple of BLOCK_SIZE; confirm.
 * NOTE(review): stop_process is a plain int polled without any
 * synchronization; confirm that is acceptable for the toolchain in use.
 */
void* gpu_load_compute(void* arg)
{
    dim3 blocks_per_grid(MATRIX_ROW_SIZE/BLOCK_SIZE, MATRIX_ROW_SIZE/BLOCK_SIZE);
    dim3 threads_per_block(BLOCK_SIZE, BLOCK_SIZE);
    matrix dev_I, dev_matx, dev_result;

    (void)arg;

    /* Every matrix is square, with stride set equal to its column count. */
    dev_I.rows = MATRIX_ROW_SIZE;
    dev_I.columns = dev_I.rows;
    dev_I.stride = dev_I.columns;

    dev_matx.rows = MATRIX_ROW_SIZE;
    dev_matx.columns = dev_matx.rows;
    /* The structs are passed by value to the kernels, so stride must be
     * initialised here like the other two matrices. */
    dev_matx.stride = dev_matx.columns;

    dev_result.rows = MATRIX_ROW_SIZE;
    dev_result.columns = dev_result.rows;
    dev_result.stride = dev_result.columns;

    assert_cuda( cudaMalloc((void**)&dev_I.elements, MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(float)) );
    assert_cuda( cudaMalloc((void**)&dev_matx.elements, MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(float)) );
    assert_cuda( cudaMalloc((void**)&dev_result.elements, MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(float)) );

    kern_identity_matrix<<< blocks_per_grid, threads_per_block >>>(dev_I);
    kern_identity_matrix<<< blocks_per_grid, threads_per_block >>>(dev_matx);

    /* Keep issuing the multiply kernel until asked to stop. */
    while (!stop_process) {
        kern_mult_matrix_shared<<< blocks_per_grid, threads_per_block >>>(dev_I, dev_matx, dev_result);
    }

    assert_cuda( cudaFree(dev_I.elements) );
    assert_cuda( cudaFree(dev_result.elements) );
    assert_cuda( cudaFree(dev_matx.elements) );
    return NULL;
}
|
|
|
-
|
|
|
/*
 * CPU memory-load thread body.
 *
 * Allocates two host buffers of MATRIX_ROW_SIZE*MATRIX_ROW_SIZE bytes and
 * memcpy()s one into the other in a loop until the global stop_process
 * flag becomes non-zero.
 *
 * arg: unused.
 * Returns: always NULL.
 *
 * If either buffer cannot be allocated, a message is written to stderr and
 * the thread returns without generating load.
 *
 * NOTE(review): the source buffer is never initialised; presumably only
 * the copy traffic matters, not the data. Confirm.
 * NOTE(review): stop_process is a plain int polled without any
 * synchronization; confirm the compiler cannot hoist the read out of the
 * loop, or switch the flag to an atomic type where it is declared.
 */
void* cpu_load_memory(void* arg)
{
    const size_t nbytes = MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(char);
    char* foo = (char*) malloc(nbytes);
    char* bar = (char*) malloc(nbytes);

    (void)arg;

    if (foo == NULL || bar == NULL) {
        /* memcpy() on a NULL pointer is undefined behaviour. */
        fprintf(stderr, "cpu_load_memory: out of memory\n");
    } else {
        while (!stop_process) {
            memcpy(foo, bar, nbytes);
        }
    }

    /* free(NULL) is a no-op, so this is safe on the failure path too. */
    free(foo);
    free(bar);
    return NULL;
}
|
|
|
-
|
|
|
/*
 * GPU memory-load thread body.
 *
 * Allocates two device buffers of MATRIX_ROW_SIZE*MATRIX_ROW_SIZE bytes
 * with cudaMalloc() and issues device-to-device cudaMemcpy() calls between
 * them until the global stop_process flag becomes non-zero, then releases
 * both buffers.
 *
 * arg: unused.
 * Returns: always NULL.
 *
 * Every CUDA runtime call is wrapped in assert_cuda().
 */
void* gpu_load_memory(void* arg)
{
    const size_t copy_bytes = MATRIX_ROW_SIZE*MATRIX_ROW_SIZE*sizeof(char);
    char *dev_dst = NULL;
    char *dev_src = NULL;

    assert_cuda( cudaMalloc((void**)&dev_dst, copy_bytes) );
    assert_cuda( cudaMalloc((void**)&dev_src, copy_bytes) );

    /* Copy src -> dst on the device for as long as the flag stays clear. */
    while (stop_process == 0) {
        assert_cuda( cudaMemcpy(dev_dst, dev_src, copy_bytes, cudaMemcpyDeviceToDevice) );
    }

    assert_cuda( cudaFree(dev_dst) );
    assert_cuda( cudaFree(dev_src) );
    return NULL;
}
|
|
|
-
|
|
|
|
|
|
int main(int argc, char* argv[])
|
|
|
{
|
|
@@ -232,11 +144,12 @@ int main(int argc, char* argv[])
|
|
|
unsigned char* getBack=(unsigned char*)calloc(nb_bytes,sizeof(*getBack));
|
|
|
|
|
|
pthread_t cpu_compute,cpu_mem,gpu_compute,gpu_mem;
|
|
|
+
|
|
|
pthread_create(&cpu_compute, NULL, cpu_load_compute, NULL);
|
|
|
- pthread_create(&gpu_compute, NULL, gpu_load_compute, NULL);
|
|
|
- pthread_create(&cpu_mem, NULL, cpu_load_memory, NULL);
|
|
|
+ pthread_create(&cpu_mem, NULL, cpu_load_memory, NULL);
|
|
|
pthread_create(&gpu_mem, NULL, gpu_load_memory, NULL);
|
|
|
-
|
|
|
+ pthread_create(&gpu_compute, NULL, gpu_load_compute, NULL);
|
|
|
+
|
|
|
printf("\nWorking on the FPGA\n");
|
|
|
WR32(REG_RESET_DMA, 1);
|
|
|
usleep(100000);
|
|
@@ -272,10 +185,10 @@ WR32_sleep(REG_PACKET_LENGTH,0x80000 | 64); // added flag
|
|
|
|
|
|
stop_process = 1;
|
|
|
pthread_join(cpu_compute,NULL);
|
|
|
- pthread_join(gpu_compute,NULL);
|
|
|
pthread_join(cpu_mem,NULL);
|
|
|
pthread_join(gpu_mem,NULL);
|
|
|
-
|
|
|
+ pthread_join(gpu_compute,NULL);
|
|
|
+
|
|
|
free(getBack);
|
|
|
WR32(REG_COUNTER, 0);
|
|
|
WR32(REG_DMA, 0);
|