
Add latency histogram for 1 KB packet

Matthias Vogelgesang, 8 years ago
commit 8ae8aadf31
2 changed files with 27 additions and 13 deletions:
  1. Makefile (+5, -1)
  2. paper.tex (+22, -12)

Makefile (+5, -1)

@@ -2,7 +2,8 @@ FIGURES=figures/intra-copy.pdf \
 		figures/opencl-setup.pdf \
 		figures/transf.pdf \
 		figures/throughput.pdf \
-		figures/latency.pdf
+		figures/latency.pdf \
+		figures/latency-hist.pdf
 
 .PHONY: clean figures
 
@@ -18,6 +19,9 @@ figures/intra-copy.pdf:
 figures/latency.pdf: data/ipedirectgma/ipedirectgma.cpu.txt data/ipedirectgma/ipedirectgma.gpu.txt data/ipedirectgma/plot.py
 	@cd data/ipedirectgma && python plot.py && cp latency.pdf ../../figures
 
+figures/latency-hist.pdf: data/ipedirectgma/ipedirectgma.1024.cpu.txt data/ipedirectgma/ipedirectgma.1024.gpu.txt data/ipedirectgma/latency-hist.py
+	@cd data/ipedirectgma && python latency-hist.py && cp latency-hist.pdf ../../figures
+
 figures/throughput.pdf: data/throughput.cpu data/throughput.gpu data/throughput.py
 	@cd data && python throughput.py && cp throughput.pdf ../figures
 

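The new figures/latency-hist.pdf rule calls data/ipedirectgma/latency-hist.py, whose contents are not part of this diff. A minimal sketch of such a plotting script, assuming each .txt file holds one one-way latency value in microseconds per line (only the file names are taken from the rule above; everything else is illustrative):

# latency-hist.py -- illustrative sketch, not the script referenced above.
# Assumes one latency value (in microseconds) per line in each input file.
import numpy as np
import matplotlib.pyplot as plt

cpu = np.loadtxt('ipedirectgma.1024.cpu.txt')
gpu = np.loadtxt('ipedirectgma.1024.gpu.txt')

# Shared bin edges so both histograms are directly comparable.
bins = np.linspace(min(cpu.min(), gpu.min()), max(cpu.max(), gpu.max()), 80)

fig, ax = plt.subplots(figsize=(4.0, 2.5))
ax.hist(cpu, bins=bins, alpha=0.6, label='Main memory')
ax.hist(gpu, bins=bins, alpha=0.6, label='GPU memory')
ax.set_xlabel('One-way latency (µs)')
ax.set_ylabel('Frequency')
ax.legend()
fig.tight_layout()
fig.savefig('latency-hist.pdf')

Because the recipe changes into data/ipedirectgma before running the script, the PDF is written next to the data and then copied into figures/.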
paper.tex (+22, -12)

@@ -309,24 +309,34 @@ three sizes and an increasing block size. At a block size of about 384 KB, the
 throughput surpasses the maximum possible PCIe bandwidth, thus making a double
 buffering strategy a viable solution for very large data transfers.
 
+\begin{figure}
+  \includegraphics[width=\textwidth]{figures/latency-hist}
+  \caption{%
+    Latency distribution for a single 1024 B packet transferred from the FPGA
+    to GPU memory and to main memory.
+  }
+  \label{fig:latency-distribution}
+\end{figure}
+
 For HEP experiments, low latencies are necessary to react within a reasonable
 time frame. To measure the latency caused by the communication overhead, we
 followed this protocol: 1) The host issues continuous data transfers of a
 4 KB buffer, initialized with a fixed value, to the FPGA using the
 \texttt{cl\-Enqueue\-Copy\-Buffer} call. 2) When the FPGA receives data in its
 input FIFO, it moves the data directly to the output FIFO, which feeds the outgoing DMA
-engine thus pushing back the data to the GPU. 3) At some point, the host
-enables generation of data different from initial value which also starts an
-internal FPGA counter with 4 ns resolution. 4) When the generated data is
-received again at the FPGA, the counter is stopped. 5) The host program reads
-out the counter values and computes the round-trip latency. The distribution of
-10000 measurements of the one-way latency is shown in \figref{fig:latency}. The
-GPU latency has a mean value of 84.38 \textmu s and a standard variation of
-6.34 \textmu s. This is 9.73 \% slower than the CPU latency of 76.89 \textmu s
-that was measured using the same driver and measuring procedure. The
-non-Gaussian distribution with two distinct peaks indicates a systemic influence
-that we cannot control and is most likely caused by the non-deterministic
-run-time behaviour of the operating system scheduler.
+engine, thus pushing the data back to the GPU. 3) At some point, the host
+enables generation of data that differs from the initial value, which also starts
+an internal FPGA counter with 4 ns resolution. 4) When the generated data is
+received again at the FPGA, the counter is stopped. 5) The host program reads out
+the counter values and computes the round-trip latency. The distribution of 10000
+one-way latency measurements is shown in \figref{fig:latency-distribution}.
+The latency to GPU memory has a mean value of 84.38 \textmu s and a standard
+deviation of 6.34 \textmu s. This is 9.73 \% higher than the mean latency of
+76.89 \textmu s to main memory, measured with the same driver and measurement
+procedure. The non-Gaussian distribution with two distinct peaks indicates a
+systematic influence that we cannot control and that is most likely caused by
+the non-deterministic run-time behaviour of the operating system scheduler.
+
 
 \section{Conclusion and outlook}
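For reference, the values quoted in the paragraph above (mean, standard deviation and the relative difference between GPU-memory and main-memory latency) can be recomputed from the same data files used for the histogram. A minimal sketch, again assuming one latency value in microseconds per line; the paths are relative to the repository root and purely illustrative:

import numpy as np

cpu = np.loadtxt('data/ipedirectgma/ipedirectgma.1024.cpu.txt')
gpu = np.loadtxt('data/ipedirectgma/ipedirectgma.1024.gpu.txt')

# Mean and standard deviation of the one-way latency in microseconds.
print('Main memory: {:.2f} +/- {:.2f} us'.format(cpu.mean(), cpu.std()))
print('GPU memory:  {:.2f} +/- {:.2f} us'.format(gpu.mean(), gpu.std()))

# Relative increase of the GPU-memory latency over the main-memory latency.
print('{:.2f} % higher'.format(100.0 * (gpu.mean() / cpu.mean() - 1.0)))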