@@ -111,19 +111,16 @@ links~\cite{nieto2015high}. Their system (as limited by the interconnect)
achieves an average throughput of 870 MB/s with 1 KB block transfers.

In order to achieve the best performance in terms of latency and bandwidth, we
-developed a high-performance DMA engine based on Xilinx's PCIe Gen3 Core.
-
-To process the data, we encapsulated the DMA setup and memory mapping in a
-plugin for our scalable GPU processing framework~\cite{vogelgesang2012ufo}.
-This framework allows for an easy construction of streamed data processing on
-heterogeneous multi-GPU systems. The framework is based on OpenCL, and
-integration with NVIDIA's CUDA functions for GPUDirect technology is not
-possible.
-
-We therefore integrated direct FPGA-to-GPU communication into our processing
-pipeline using AMD's DirectGMA technology. In this paper we report the
-performance of our DMA engine for FPGA-to-CPU communication and the first
-preliminary results with DirectGMA technology.
+developed a high-performance DMA engine based on Xilinx's PCIe Gen3 Core. To
+process the data, we encapsulated the DMA setup and memory mapping in a plugin
+for our scalable GPU processing framework~\cite{vogelgesang2012ufo}. This
+framework allows for easy construction of streamed data processing on
+heterogeneous multi-GPU systems. Since the framework is based on OpenCL,
+integration with NVIDIA's CUDA functions for GPUDirect technology is not
+possible. We therefore integrated direct FPGA-to-GPU communication into our
+processing pipeline using AMD's DirectGMA technology. In this paper we report
+the performance of our DMA engine for FPGA-to-CPU communication and some
+preliminary measurements of DirectGMA performance in low-latency applications.
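+
+As a minimal illustration of this mechanism, the following sketch shows how a
+GPU buffer can be exposed to the FPGA with the OpenCL
+\texttt{cl\_amd\_bus\_addressable\_memory} extension underlying DirectGMA. It
+is not our actual plugin code: platform, context and queue setup are assumed,
+error handling is omitted, and \texttt{fpga\_set\_target()} is a hypothetical
+stand-in for programming the DMA engine with the returned bus address.
+
+\begin{verbatim}
+#include <CL/cl.h>
+#include <CL/cl_ext.h> /* AMD extension declarations: cl_bus_address_amd,
+                          CL_MEM_BUS_ADDRESSABLE_AMD */
+
+/* Hypothetical driver helper: point the FPGA DMA engine at a bus address. */
+extern void fpga_set_target(cl_ulong bus_address);
+
+typedef cl_int (*MakeResidentFn)(cl_command_queue, cl_uint, cl_mem *,
+                                 cl_bool, cl_bus_address_amd *, cl_uint,
+                                 const cl_event *, cl_event *);
+
+static cl_mem expose_gpu_buffer(cl_platform_id platform, cl_context ctx,
+                                cl_command_queue queue, size_t size)
+{
+    cl_int err;
+
+    /* Allocate GPU memory that can be mapped onto the PCIe bus. */
+    cl_mem buf = clCreateBuffer(ctx, CL_MEM_BUS_ADDRESSABLE_AMD,
+                                size, NULL, &err);
+
+    /* Resolve the extension entry point at run time. */
+    MakeResidentFn make_resident = (MakeResidentFn)
+        clGetExtensionFunctionAddressForPlatform(
+            platform, "clEnqueueMakeBuffersResidentAMD");
+
+    /* Pin the buffer and obtain its physical bus address. */
+    cl_bus_address_amd addr;
+    make_resident(queue, 1, &buf, CL_TRUE, &addr, 0, NULL, NULL);
+
+    /* The FPGA can now write directly into GPU memory. */
+    fpga_set_target(addr.surface_bus_address);
+    return buf;
+}
+\end{verbatim}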

\section{Architecture}

@@ -254,41 +251,55 @@ E5-1630 at 3.7 GHz, Intel C612 chipset running openSUSE 13.1 with Linux
-3.0 x8 slots. In case of FPGA-to-CPU data transfers, the software
+3.0 x8 slots. In the case of FPGA-to-CPU data transfers, the software
implementation is the one described in~\cite{rota2015dma}.

+\subsection{Throughput}
+
\begin{figure}
- \centering
- \begin{subfigure}[b]{.49\textwidth}
- \centering
- \includegraphics[width=\textwidth]{figures/throughput}
- \caption{%
- DMA data transfer throughput.
- }
- \label{fig:throughput}
- \end{subfigure}
- \begin{subfigure}[b]{.49\textwidth}
- \includegraphics[width=\textwidth]{figures/latency}
- \caption{%
- Latency distribution.
- % for a single 4 KB packet transferred
- % from FPGA-to-CPU and FPGA-to-GPU.
- }
- \label{fig:latency}
- \end{subfigure}
+ \includegraphics[width=\textwidth]{figures/throughput}
\caption{%
Measured results for data transfers from FPGA to main memory
(CPU) and from FPGA to the global GPU memory (GPU).
- }
+}
+\label{fig:throughput}
\end{figure}
-The measured results for the pure data throughput is shown in
+The measured results for the pure data throughput are shown in
\figref{fig:throughput} for transfers from the FPGA to the system's main
-memory as well as to the global memory as explained in \ref{sec:host}. As one
-can see, in both cases the write performance is primarily limited by the PCIe
-bus. Higher payloads make up for the constant overhead thus increasing the net
-bandwidth. Up until 2 MB data transfer size, the throughput to the GPU is
-approaching slowly 100 MB/s. From there on, the throughput increases up to 6.4
-GB/s when PCIe bus saturation sets in at about 1 GB data size. The CPU
-throughput saturates earlier at about 30 MB but the maximum throughput is
-limited to about 6 GB/s losing about 6\% write performance.
+memory as well as to the global memory as explained in Section~\ref{sec:host}.
+
+In the case of FPGA-to-GPU data transfers, the double-buffering solution was
+used. As one can see, in both cases the write performance is primarily limited
+by the PCIe bus. For transfer sizes up to 2 MB, the throughput to the GPU
+slowly approaches 100 MB/s. From there on, the throughput increases up to
+6.4 GB/s, with PCIe bus saturation setting in at a data size of about 1 GB.
+The CPU throughput saturates earlier, but its maximum throughput is 6.6 GB/s.
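+
+A minimal sketch of this double-buffering scheme is shown below. It assumes
+two bus-addressable buffers set up as in the previous listing;
+\texttt{fpga\_fill()}, \texttt{fpga\_wait\_done()} and \texttt{gpu\_process()}
+are hypothetical stand-ins for starting an FPGA transfer, waiting for its
+completion and running the OpenCL processing kernels on the received block.
+
+\begin{verbatim}
+#include <stddef.h>
+#include <CL/cl.h>
+
+extern void fpga_fill(cl_mem buf);      /* hypothetical: start DMA write  */
+extern void fpga_wait_done(cl_mem buf); /* hypothetical: await completion */
+extern void gpu_process(cl_mem buf);    /* hypothetical: process a block  */
+
+/* While the FPGA fills one buffer, the GPU processes the other. */
+static void stream_blocks(cl_mem buf[2], size_t n_blocks)
+{
+    if (n_blocks == 0)
+        return;
+
+    fpga_fill(buf[0]);                    /* start the first transfer */
+
+    for (size_t i = 0; i < n_blocks; i++) {
+        cl_mem cur = buf[i % 2];
+
+        fpga_wait_done(cur);              /* block i has arrived */
+        if (i + 1 < n_blocks)
+            fpga_fill(buf[(i + 1) % 2]);  /* overlap the next transfer */
+        gpu_process(cur);                 /* with processing of block i */
+    }
+}
+\end{verbatim}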

% We repeated the FPGA-to-GPU measurements on a low-end Supermicro X7SPA-HF-D525
% system based on an Intel Atom CPU. The results showed no significant difference

@@ -325,6 +336,8 @@ latency.
% three sizes and an increasing block size.

+\subsection{Latency}
+
\begin{figure}
\includegraphics[width=\textwidth]{figures/latency-hist}
\caption{%
|