@@ -30,7 +30,22 @@
\section{Introduction}
-Citation~\cite{lonardo2015nanet}
+GPU computing has become a major driving force for high-performance computing
+due to its unprecedented degree of parallelism and favorable cost-benefit
+ratio. GPU acceleration has found its way into numerous applications, ranging
+from simulation to image processing.
+
+The main challenge for latency-sensitive applications is the data transfer from
+the front-end electronics to the GPU's internal memory. In a typical setup,
+data is routed through system main memory, thus limiting both latency and
+overall throughput by additional PCI Express (PCIe) bus transfers. Lonardo
+et~al.\ lifted this limitation with their NaNet design, an FPGA-based PCIe
+network interface card with GPUdirect integration~\cite{lonardo2015nanet}. Due
+to its design, the bandwidth saturates at 120 MB/s for 1472-byte UDP datagrams.
+In order to fully saturate the PCIe bus bandwidth\footnote{Net bandwidth of
+6.? GB/s for PCIe 3.0 x16.}, we propose a complete hardware-software stack
+architecture based on our own DMA design and the integration of AMD's
+DirectGMA technology into our processing pipeline.
\section{Architecture}
@@ -39,14 +54,14 @@ Citation~\cite{lonardo2015nanet}
\label{sec:host}
On the host side, AMD's DirectGMA technology, an implementation of the
-bus-addressable memory extension for OpenCL 1.1+, is used to prepare GPU buffers
-for writing data by FPGA as well as mapping the remote FPGA device for writing
-signals. To write into the GPU, the physical bus address of the GPU buffer is
-determined with a call to \texttt{clEnqueueMakeBuffersResidentAMD}. The address
-is written to an FPGA register and updated for each successful transfer of one
-or more pages of data. Due to hardware restrictions the largest possible GPU
-buffer sizes are about 95 MB. Larger transfers are achieved with a double
-buffering mechanism.
+bus-addressable memory extension for OpenCL 1.1 and later, is used to prepare
+GPU buffers that the FPGA writes into, as well as to map the remote FPGA
+device for writing signals. To write into the GPU, the physical bus address of
+the GPU buffer is determined with a call to
+\texttt{clEnqueueMakeBuffersResidentAMD}. The address is written to an FPGA
+register and updated for each successful transfer of one or more pages of
+data. Due to hardware restrictions, the largest possible GPU buffer size is
+about 95 MB. Larger transfers are achieved with a double buffering mechanism.
To process the data, we encapsulated the DMA setup and memory mapping in a
plugin for our scalable GPU processing framework~\cite{vogelgesang2012ufo}. This