
Add latency histogram for 1 KB packet

Matthias Vogelgesang, 8 years ago
parent
commit
8ae8aadf31
2 changed files with 27 additions and 13 deletions
  1. Makefile (+5 -1)
  2. paper.tex (+22 -12)

Makefile (+5 -1)

@@ -2,7 +2,8 @@ FIGURES=figures/intra-copy.pdf \
 		figures/opencl-setup.pdf \
 		figures/transf.pdf \
 		figures/throughput.pdf \
-		figures/latency.pdf
+		figures/latency.pdf \
+		figures/latency-hist.pdf
 
 .PHONY: clean figures
 
@@ -18,6 +19,9 @@ figures/intra-copy.pdf:
 figures/latency.pdf: data/ipedirectgma/ipedirectgma.cpu.txt data/ipedirectgma/ipedirectgma.gpu.txt data/ipedirectgma/plot.py
 	@cd data/ipedirectgma && python plot.py && cp latency.pdf ../../figures
 
+figures/latency-hist.pdf: data/ipedirectgma/ipedirectgma.1024.cpu.txt data/ipedirectgma/ipedirectgma.1024.gpu.txt data/ipedirectgma/latency-hist.py
+	@cd data/ipedirectgma && python latency-hist.py && cp latency-hist.pdf ../../figures
+
 figures/throughput.pdf: data/throughput.cpu data/throughput.gpu data/throughput.py
 	@cd data && python throughput.py && cp throughput.pdf ../figures
 

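The new rule assumes a companion plotting script, data/ipedirectgma/latency-hist.py, which is referenced by the Makefile but not included in this diff. A minimal sketch of what such a script could look like, assuming the two 1024 B data files hold one one-way latency value per line in microseconds; the file names come from the rule above, everything else is an assumption:

# Hypothetical sketch of data/ipedirectgma/latency-hist.py; the actual script
# is only referenced by the Makefile rule, not shown in this commit.
import numpy as np
import matplotlib.pyplot as plt

# Assumed format: one latency value (in microseconds) per line.
cpu = np.loadtxt('ipedirectgma.1024.cpu.txt')
gpu = np.loadtxt('ipedirectgma.1024.gpu.txt')

# Shared bin edges so both distributions are directly comparable.
bins = np.linspace(min(cpu.min(), gpu.min()), max(cpu.max(), gpu.max()), 100)

plt.hist(cpu, bins=bins, alpha=0.6, label='main memory')
plt.hist(gpu, bins=bins, alpha=0.6, label='GPU memory')
plt.xlabel('Latency (us)')
plt.ylabel('Frequency')
plt.legend()
plt.savefig('latency-hist.pdf')

The rule then copies the resulting latency-hist.pdf into figures/, matching the pattern used by the existing latency.pdf and throughput.pdf targets.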
paper.tex (+22 -12)

@@ -309,24 +309,34 @@ three sizes and an increasing block size. At a block size of about 384 KB, the
 throughput surpasses the maximum possible PCIe bandwidth, thus making a double
 buffering strategy a viable solution for very large data transfers.
 
+\begin{figure}
+  \includegraphics[width=\textwidth]{figures/latency-hist}
+  \caption{%
+    Latency distribution for a single 1024 B packet transferred from FPGA to
+    GPU memory and to main memory.
+  }
+  \label{fig:latency-distribution}
+\end{figure}
+
 For HEP experiments, low latencies are necessary to react in a reasonable time
 frame. In order to measure the latency caused by the communication overhead we
 conducted the following protocol: 1) the host issues continuous data transfers
 of a 4 KB buffer that is initialized with a fixed value to the FPGA using the
 \texttt{cl\-Enqueue\-Copy\-Buffer} call. 2) when the FPGA receives data in its
 input FIFO it moves it directly to the output FIFO which feeds the outgoing DMA
-engine thus pushing back the data to the GPU. 3) At some point, the host
-enables generation of data different from initial value which also starts an
-internal FPGA counter with 4 ns resolution. 4) When the generated data is
-received again at the FPGA, the counter is stopped. 5) The host program reads
-out the counter values and computes the round-trip latency. The distribution of
-10000 measurements of the one-way latency is shown in \figref{fig:latency}. The
-GPU latency has a mean value of 84.38 \textmu s and a standard variation of
-6.34 \textmu s. This is 9.73 \% slower than the CPU latency of 76.89 \textmu s
-that was measured using the same driver and measuring procedure. The
-non-Gaussian distribution with two distinct peaks indicates a systemic influence
-that we cannot control and is most likely caused by the non-deterministic
-run-time behaviour of the operating system scheduler.
+engine thus pushing back the data to the GPU. 3) At some point, the host enables
+generation of data different from the initial value, which also starts an
+internal FPGA counter with 4 ns resolution. 4) When the generated data is
+received again at the FPGA, the counter is stopped. 5) The host program reads
+out the counter values and computes the round-trip latency. The distribution of
+10000 measurements of the one-way latency is shown in \figref{fig:latency-distribution}.
+[\textbf{REWRITE THIS PART}] The GPU latency has a mean value of 84.38 \textmu s
+and a standard deviation of 6.34 \textmu s. This is 9.73 \% slower than the CPU
+latency of 76.89 \textmu s that was measured using the same driver and measuring
+procedure. The non-Gaussian distribution with two distinct peaks indicates a
+systemic influence that we cannot control and is most likely caused by the
+non-deterministic run-time behaviour of the operating system scheduler.
+
 
 \section{Conclusion and outlook}
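For reference, step 5 of the protocol above reduces to scaling the FPGA counter readings by the 4 ns tick, halving the round trip, and summarizing the 10000 samples. A small sketch of that post-processing with placeholder data rather than real measurements; only the 4 ns resolution and the two quoted mean latencies come from the text, all names and values here are illustrative:

import numpy as np

TICK_NS = 4  # FPGA counter resolution stated in the protocol description


def one_way_latency_us(counters):
    """Convert raw round-trip counter readings (4 ns ticks) to one-way latency in microseconds."""
    return np.asarray(counters) * TICK_NS / 2.0 / 1e3


# Placeholder counter readings standing in for the 10000 values read out by the host.
counters = np.random.randint(38000, 43000, size=10000)
lat = one_way_latency_us(counters)
print(f'mean {lat.mean():.2f} us, std {lat.std():.2f} us')

# The relative slowdown quoted in the text follows from the two mean values:
gpu_mean, cpu_mean = 84.38, 76.89
print(f'{(gpu_mean - cpu_mean) / cpu_mean * 100:.2f} % slower')  # about 9.7 %, matching the 9.73 % in the paper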