\documentclass{JINST}

\usepackage{lineno}
\usepackage{ifthen}

\newboolean{draft}
\setboolean{draft}{true}

\title{A high-throughput readout architecture based on PCIe Gen3 and DirectGMA technology}

\author{N.~Zilio$^b$, M.~Weber$^a$\\
  \llap{$^a$}Institute for Data Processing and Electronics,\\
  Karlsruhe Institute of Technology (KIT),\\
  Hermann-von-Helmholtz-Platz 1, Karlsruhe, Germany\\
  \llap{$^b$}Somewhere in France
}

\abstract{Abstract}

\begin{document}

\ifdraft
  \setpagewiselinenumbers
  \linenumbers
\fi

\section{Introduction}

Citation~\cite{lonardo2015nanet}

\section{Architecture}

\subsection{Host interface}

On the host side, AMD's DirectGMA technology, an implementation of the bus-addressable memory extension for OpenCL~1.1 and later, is used to prepare GPU buffers into which the FPGA writes its data and to map the remote FPGA device so that signals can be written to it. To write into GPU memory, the physical bus address of a GPU buffer is determined with a call to \texttt{clEnqueueMakeBuffersResidentAMD}. This address is written to an FPGA register and is updated after each successful transfer of one or more pages of data. Due to hardware restrictions, the largest possible GPU buffer size is about 95~MB. Larger transfers are achieved with a double-buffering mechanism.
% MV: we should measure intra-GPU data transfers

\section{Results}

\begin{figure}
  \includegraphics[width=\textwidth]{figures/intra-copy.png}
  \caption{Throughput in MB/s for intra-GPU data transfers of smaller blocks (4~KB -- 24~MB) into a larger destination buffer (32~MB -- 128~MB). The lower performance for smaller block sizes is caused by the larger number of transfers required to fill the destination buffer. The throughput has been estimated using the host-side wall-clock time. On-GPU data transfer is about twice as fast.}
  \label{fig:intra-copy}
\end{figure}

\section{Conclusion}

\acknowledgments

UFO? KSETA?

\bibliographystyle{JHEP}
\bibliography{literature}

\end{document}
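To make the host-side buffer preparation described in the host-interface subsection more concrete, the listing below gives a minimal sketch, assuming the \texttt{cl\_amd\_bus\_addressable\_memory} extension of the AMD OpenCL runtime. The extension tokens are repeated under an include guard only to keep the listing self-contained, the buffer size and function name are illustrative, and the FPGA register write as well as the double-buffering logic are only indicated by comments; this is not the exact code used in the readout system.

\begin{verbatim}
#include <CL/cl.h>
#include <CL/cl_ext.h>

/* Tokens of the cl_amd_bus_addressable_memory extension; they are normally
 * provided by the vendor's cl_ext.h and repeated here only as a fallback. */
#ifndef CL_MEM_BUS_ADDRESSABLE_AMD
#define CL_MEM_BUS_ADDRESSABLE_AMD (1 << 30)
typedef struct _cl_bus_address_amd {
    cl_ulong surface_bus_address;   /* physical bus address of the buffer */
    cl_ulong marker_bus_address;    /* physical bus address of the marker */
} cl_bus_address_amd;
#endif

/* Extension entry point, resolved at run time. */
typedef cl_int (*clEnqueueMakeBuffersResidentAMD_fn)(
    cl_command_queue, cl_uint, cl_mem *, cl_bool,
    cl_bus_address_amd *, cl_uint, const cl_event *, cl_event *);

#define GPU_BUFFER_SIZE (64UL << 20)    /* stays below the ~95 MB limit */

/* Creates one bus-addressable GPU buffer, pins it and returns its physical
 * bus address; the caller writes this address into the FPGA's DMA register
 * (device specific, not shown) and updates it after each transfer. */
static cl_mem create_fpga_target_buffer(cl_platform_id platform,
                                        cl_context context,
                                        cl_command_queue queue,
                                        cl_ulong *bus_address)
{
    cl_int err;
    cl_bus_address_amd addr = {0, 0};

    clEnqueueMakeBuffersResidentAMD_fn make_resident =
        (clEnqueueMakeBuffersResidentAMD_fn)
        clGetExtensionFunctionAddressForPlatform(
            platform, "clEnqueueMakeBuffersResidentAMD");
    if (make_resident == NULL)
        return NULL;                    /* extension not available */

    /* Allocate a GPU buffer that can be exposed on the PCIe bus. */
    cl_mem buffer = clCreateBuffer(context, CL_MEM_BUS_ADDRESSABLE_AMD,
                                   GPU_BUFFER_SIZE, NULL, &err);
    if (err != CL_SUCCESS)
        return NULL;

    /* Pin the buffer and query its physical bus address; the blocking call
     * returns only once the address is valid. */
    err = make_resident(queue, 1, &buffer, CL_TRUE, &addr, 0, NULL, NULL);
    if (err != CL_SUCCESS) {
        clReleaseMemObject(buffer);
        return NULL;
    }

    *bus_address = addr.surface_bus_address;
    return buffer;
}
\end{verbatim}

Resolving the entry point with \texttt{clGetExtensionFunctionAddressForPlatform} avoids a hard link-time dependency on the vendor library's extension symbols.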
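The intra-GPU copy throughput shown in Fig.~\ref{fig:intra-copy} can be estimated with a host-timed loop of \texttt{clEnqueueCopyBuffer} calls. The sketch below is a minimal version of such a measurement, assuming a monotonic POSIX clock for the host-side wall-clock time, with illustrative function names and without error handling; it is not necessarily the exact benchmark used for the figure.

\begin{verbatim}
#include <time.h>
#include <CL/cl.h>

/* Host-side wall-clock difference in seconds. */
static double elapsed_s(struct timespec a, struct timespec b)
{
    return (double)(b.tv_sec - a.tv_sec) + (b.tv_nsec - a.tv_nsec) * 1e-9;
}

/* Copies block_size-byte blocks from `src` into consecutive offsets of the
 * larger buffer `dst` (both resident on the GPU) and returns the throughput
 * in MB/s, timed on the host.  Error handling is omitted. */
static double intra_gpu_copy_throughput(cl_command_queue queue,
                                        cl_mem src, size_t block_size,
                                        cl_mem dst, size_t dst_size)
{
    struct timespec start, stop;
    size_t offset;

    clock_gettime(CLOCK_MONOTONIC, &start);
    for (offset = 0; offset + block_size <= dst_size; offset += block_size)
        clEnqueueCopyBuffer(queue, src, dst, 0, offset, block_size,
                            0, NULL, NULL);
    clFinish(queue);                    /* wait for all queued copies */
    clock_gettime(CLOCK_MONOTONIC, &stop);

    /* `offset` is the total number of bytes copied into `dst`. */
    return (offset / (1024.0 * 1024.0)) / elapsed_s(start, stop);
}
\end{verbatim}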