8 years ago · 5d5fc8f86b
--- a/check.c
+++ b/check.c
@@ -542,9 +542,11 @@ measure_fpga_to_gpu_latency_with_kernel (App *app)
 
				     struct timespec end;
			
 
				     double host_latency;
			
 
				     double debug_latency;
			
 
				+    cl_event signal_event;
			
 
				     cl_event event;
			
 
				     uint32_t check[CHECK_BUFFER_SIZE / 4];
			
 
				     size_t work_size = 1;
			
 
				+    cl_ulong times[4];
			
 
				 
			
 
				     printf ("\n** FPGA to GPU latency [kernel]\n\n");
			
 
				     fill_gpu_buffer (app);
			
@@ -553,6 +555,9 @@ measure_fpga_to_gpu_latency_with_kernel (App *app)
 
				     configure_dma_descriptors (app);
			
 
				     setup_counter (app);
			
 
				 
			
 
				+    WR32_sleep (HF_REG_CONTROL, HF_CONTROL_ENABLE_READ | HF_CONTROL_ENABLE_MULTI_READ);
			
 
				+    WR64_sleep (HF_REG_PERF_COUNTER_FEEDBACK_1, 0);
			
 
				+    WR64_sleep (HF_REG_DESCRIPTOR_AMD_SIGNAL, app->gpu.addr.marker_bus_address);
			
 
				     WR64_sleep (HF_REG_DESCRIPTOR_ADDRESS, app->gpu.addr.surface_bus_address);
			
 
				 
			
 
				     reset_dbg (app);
			
@@ -561,17 +566,25 @@ measure_fpga_to_gpu_latency_with_kernel (App *app)
 
				     OCL_CHECK_ERROR (clSetKernelArg (app->kernel, 1, sizeof (cl_mem), &app->fpga_buffer));
			
 
				     OCL_CHECK_ERROR (clSetKernelArg (app->kernel, 2, sizeof (cl_mem), &app->check_buffer));
			
 
				 
			
 
				-    OCL_CHECK_ERROR (clEnqueueNDRangeKernel (app->queue, app->kernel, 1,
			
 
				-                                             NULL, &work_size, NULL,
			
 
				-                                             0, NULL, &event));
			
 
				+    WR32_sleep (HF_REG_DMA, 0);
			
 
				 
			
 
				-    WR32 (HF_REG_DMA, 1);
			
 
				-    clock_gettime (CLOCK_MONOTONIC, &start);
			
 
				+    OCL_CHECK_ERROR (clEnqueueWaitSignalAMD (app->queue, app->gpu.buffer, 1, 0, NULL, &signal_event));
			
 
				 
			
 
				-    clWaitForEvents (1, &event);
			
 
				+    /* OCL_CHECK_ERROR (clEnqueueNDRangeKernel (app->queue, app->kernel, 1, */
			
 
				+    /*                                          NULL, &work_size, NULL, */
			
 
				+    /*                                          1, &signal_event, &event)); */
			
 
				+    usleep (100);
			
 
				+    WR32_sleep (HF_REG_DMA, 1);
			
 
				 
			
 
				+    clock_gettime (CLOCK_MONOTONIC, &start);
			
 
				+    OCL_CHECK_ERROR (clWaitForEvents (1, &signal_event));
			
 
				     clock_gettime (CLOCK_MONOTONIC, &end);
			
 
				-    OCL_CHECK_ERROR (clReleaseEvent (event));
			
 
				+
			
 
				+    WR32 ( HF_REG_DMA, 0);
			
 
				+    WR32 ( 0x20, 1);
			
 
				+    WR32 ( 0x24, 1);
			
 
				+
			
 
				+    /* OCL_CHECK_ERROR (clReleaseEvent (event)); */
			
 
				 
			
 
				     counter = RD32 (HF_REG_PERF_COUNTER);
			
 
				     host_latency = elapsed_seconds (&start, &end);
			
@@ -579,19 +592,31 @@ measure_fpga_to_gpu_latency_with_kernel (App *app)
 
				 
			
 
				     check_data_transfer (app, PAGE_SIZE);
			
 
				 
			
 
				+    printf ("DMA running: %i\n", RD32 (HF_REG_DMA) & 0x1);
			
 
				+
			
 
				     OCL_CHECK_ERROR (clEnqueueReadBuffer (app->queue, app->check_buffer, CL_TRUE,
			
 
				                                           0, CHECK_BUFFER_SIZE, check, 0, NULL, NULL));
			
 
				 
			
 
				+    WR32 (HF_REG_DMA, 0);
			
 
				+
			
 
				     debug_assert_cmp ("Data check", check[0], 1);
			
 
				+    WR32_sleep (HF_REG_CONTROL, HF_CONTROL_ENABLE_READ);
			
 
				+
			
 
				+    printf ("%-16s %f us\n", "FPGA latency", RD32 (0x20) * 4 / 1000.0);
			
 
				+
			
 
				+    ocl_get_event_times (signal_event, &times[0], &times[1], &times[2], &times[3]);
			
 
				 
			
 
				-    printf ("\n%-16s %i\n", "Kernel count", check[1]);
			
 
				+    printf ("submit -> start = %f, queue -> start = %f, run_time=%lu\n",
			
 
				+            (times[0] - times[3]) / 1000.0, (times[0] - times[2]) / 1000.0, times[1] - times[0]);
			
 
				+    /* printf ("\n%-16s %i\n", "Kernel count", check[1]); */
			
 
				     printf ("%-16s %f us\n", "Wall time", host_latency * 1000.0 * 1000.0);
			
 
				-    printf ("%-16s %f us\n", "FPGA [counter]", ((counter << 8) * 4) / 1000.0);
			
 
				-    printf ("%-16s %f us\n", "FPGA [debug]", debug_latency);
			
 
				-    printf ("%-16s %.2f MB/s\n", "Throughput", GPU_BUFFER_SIZE / host_latency / 1024. / 1024.);
			
 
				+    /* printf ("%-16s %f us\n", "FPGA [counter]", ((counter << 8) * 4) / 1000.0); */
			
 
				+    /* printf ("%-16s %f us\n", "FPGA [debug]", debug_latency); */
			
 
				+    /* printf ("%-16s %.2f MB/s\n", "Throughput", GPU_BUFFER_SIZE / host_latency / 1024. / 1024.); */
			
 
				 }
			
 
				 
			
 
				 
			
 
				+
			
 
				 static void
			
 
				 measure_fpga_to_gpu_latency_with_cpu (App *app)
			
 
				 {