@@ -14,15 +14,19 @@

\title{A high-throughput readout architecture based on PCI-Express Gen3 and DirectGMA technology}

-\author{M.~Vogelgesang$^a$,
+\author{
L.~Rota$^a$,
+ M.~Vogelgesang$^a$,
N.~Zilio$^a$,
M.~Caselle$^a$,
+ S.~Chilingaryan$^a$,
L.E.~Ardila Perez$^a$,
+ M.~Balzer$^a$,
M.~Weber$^a$\\
- \llap{$^a$}Institute for Data Processing and Electronics,\\
+ \llap{$^a$}All authors: Institute for Data Processing and Electronics,\\
Karlsruhe Institute of Technology (KIT),\\
- Herrmann-von-Helmholtz-Platz 1, Karlsruhe, Germany
+ Herrmann-von-Helmholtz-Platz 1, Karlsruhe, Germany \\
+ E-mail: \email{lorenzo.rota@kit.edu}
}

\abstract{%
@@ -41,6 +45,7 @@
trigger systems.
}

+\keywords{AMD DirectGMA; FPGA; Readout architecture}

\begin{document}

@@ -50,52 +55,66 @@
\fi


-\section{Motivation}
+\section{Introduction}

GPU computing has become the main driving force for high performance computing
due to an unprecedented parallelism and a low cost-benefit factor. GPU
acceleration has found its way into numerous applications, ranging from
simulation to image processing. Recent years have also seen an increasing
-interest in GPU-based systems for HEP applications, which require a combination
-of high data rates, high computational power and low latency (\emph{e.g.}
+interest in GPU-based systems for HEP experiments (\emph{e.g.}
ATLAS~\cite{atlas_gpu}, ALICE~\cite{alice_gpu}, Mu3e~\cite{mu3e_gpu},
-PANDA~\cite{panda_gpu}). Moreover, the volumes of data produced in recent photon
+PANDA~\cite{panda_gpu}). In a typical HEP scenario,
+data is acquired by one or more read-out boards and then
+transmitted in short bursts or in a continuous streaming mode to a computation stage.
+With expected data rates of several GB/s, the data transmission link between the
+read-out boards and the host system may partially limit the overall system
+performance. In particular, latency becomes the most stringent specification if
+time-deterministic feedback is required, \emph{e.g.} in Low/High-level trigger
+systems. Moreover, the volumes of data produced in recent photon
science facilities have become comparable to those traditionally associated with
HEP.

-In HEP experiments data is acquired by one or more read-out boards and then
-transmitted to GPUs in short bursts or in a continuous streaming mode. With
-expected data rates of several GB/s, the data transmission link between the
-read-out boards and the host system may partially limit the overall system
-performance. In particular, latency becomes the most stringent specification if
-a time-deterministic feedback is required, \emph{e.g.} Low/High-level Triggers.
+In order to achieve the best performance in terms of latency and bandwidth,
+data transfers are handled by a dedicated DMA controller, at the cost of higher
+system complexity.

To address these problems we propose a complete hardware/software stack
-architecture based on our own DMA design, and integration
-of AMD's DirectGMA technology into our processing pipeline. In our solution,
-PCI-express (PCIe) has been chosen as a data link between FPGA boards and the
-host computer. Due to its high bandwidth and modularity, PCIe quickly became the
-commercial standard for connecting high-throughput peripherals such as GPUs or
-solid state disks. Moreover, optical PCIe networks have been demonstrated
-a decade ago~\cite{optical_pcie}, opening the possibility of using PCIe
-as a communication bus over long distances. In particular, in HEP DAQ systems,
-optical links are preferred over electrical ones because of their superior
-radiation hardness, lower power consumption and higher density.
-
-Lonardo et~al.\ lifted this limitation with their NaNet design, an FPGA-based
-PCIe network interface card with NVIDIA's GPUDirect
-integration~\cite{lonardo2015nanet}. Due to its design, the bandwidth saturates
-at 120 MB/s for a 1472 byte large UDP datagram.
-Nieto et~al.\ presented a system that moves data from an FPGA to a GPU using
-GPUDirect and a PXIexpress data link that makes use of four PCIe 1.0 links
-\cite{nieto2015high}. Their system (as limited by the interconnect) achieves an
-average throughput of 870 MB/s with 1 KB block transfers.
-
-\section{Architecture}
-
-DMA data transfers are handled by dedicated hardware, which compared with
-Programmed Input Output (PIO) access, offer lower latency and higher throughput
-at the cost of higher system complexity.
+architecture based on our own DMA engine, and integration
+of AMD's DirectGMA technology into our processing pipeline.
+
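+As an illustration, the listing below sketches one possible host-side sequence
+for exposing a GPU buffer to an FPGA DMA engine with DirectGMA, through the
+\texttt{cl\_amd\_bus\_addressable\_memory} OpenCL extension. The OpenCL context,
+command queue, platform and buffer size are assumed to be set up already; the
+FPGA register interface (\texttt{fpga\_write\_register}, \texttt{DMA\_DST\_ADDR},
+\texttt{DMA\_START}) is a placeholder rather than an actual driver API.
+\begin{verbatim}
+/* Sketch: expose a GPU buffer to an FPGA DMA engine via DirectGMA
+ * (cl_amd_bus_addressable_memory). Error handling omitted;
+ * fpga_write_register() and its register names are placeholders. */
+#include <CL/cl.h>
+#include <CL/cl_ext.h>
+
+typedef cl_int (*resident_fn)(cl_command_queue, cl_uint, cl_mem *,
+                              cl_bool, cl_bus_address_amd *,
+                              cl_uint, const cl_event *, cl_event *);
+
+cl_int err;
+cl_bus_address_amd addr;
+cl_mem buf = clCreateBuffer(context, CL_MEM_BUS_ADDRESSABLE_AMD,
+                            size, NULL, &err);
+
+/* Pin the buffer and query its PCIe bus address. */
+resident_fn make_resident = (resident_fn)
+    clGetExtensionFunctionAddressForPlatform(platform,
+        "clEnqueueMakeBuffersResidentAMD");
+make_resident(queue, 1, &buf, CL_TRUE, &addr, 0, NULL, NULL);
+
+/* Program the FPGA DMA engine with the GPU bus address and start it. */
+fpga_write_register(DMA_DST_ADDR, addr.surface_bus_address);
+fpga_write_register(DMA_START, 1);
+\end{verbatim}
+Once the DMA engine knows this bus address, it writes payload data directly
+into GPU memory across PCIe, without passing through the host's main memory.
+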
+\section{Background}
+
+Several solutions for direct FPGA/GPU communication are reported in the literature.
+All of these are based on NVIDIA's GPUDirect technology.
+
+The first implementation was realized by Bittner and Ruf with the Speedy
+PCIe Core~\cite{bittner}. In their design, the GPU acts as bus master and reads
+data from the FPGA during FPGA-to-GPU transfers. This solution limits the
+reported bandwidth and latency to 514 MB/s and 40~$\mu$s, respectively.
+
+Lonardo et~al.\ achieved lower latencies with their NaNet design, an FPGA-based
+PCIe network interface card~\cite{lonardo2015nanet}.
+The GbE link limits the latency performance of the system to a few tens of $\mu$s.
+If only the FPGA-to-GPU latency is considered, the measured values span between
+1~$\mu$s and 6~$\mu$s, depending on the datagram size. Due to its design,
+the bandwidth saturates at 120 MB/s.
+
+Nieto et~al.\ presented a system based on a PXIexpress data link that makes use
+of four PCIe 1.0 links~\cite{nieto2015high}.
+Their system (as limited by the interconnect) achieves an average throughput of
+870 MB/s with 1 KB block transfers.
+
+A higher throughput has been achieved with the FPGA\textsuperscript{2} framework
+by Thoma et~al.~\cite{thoma}: 2454 MB/s using an 8x Gen2.0 data link.
+
+\section{Basic Concepts}
+
+In our solution, PCI-express (PCIe) has been chosen as a direct data link between
+FPGA boards and the host computer. Due to its high bandwidth and modularity, PCIe
+quickly became the commercial standard for connecting high-throughput peripherals
+such as GPUs or solid state disks. Moreover, optical PCIe networks were demonstrated
+a decade ago~\cite{optical_pcie}, opening the possibility of using PCIe as a
+communication bus over long distances.
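+
+As a rough reference (assuming an x8 Gen3 endpoint and accounting only for the
+128b/130b line encoding, not for packet and flow-control overhead), the
+theoretical net throughput of such a link in each direction is
+\[
+  8~\mathrm{lanes} \times 8~\mathrm{GT/s} \times \frac{128}{130}
+  \approx 63~\mathrm{Gbit/s} \approx 7.9~\mathrm{GB/s}.
+\]
+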
\begin{figure}[t]
\centering
@@ -120,7 +139,6 @@ is reduced and total throughput increased. Moreover, the CPU and main system
memory are relieved from processing because they are not directly involved in
the data transfer anymore.

-
\subsection{DMA engine implementation on the FPGA}
We have developed a DMA architecture that minimizes resource utilization while
|