@@ -1,5 +1,6 @@
\documentclass{JINST}

+\usepackage[utf8]{inputenc}
\usepackage{lineno}

\usepackage{ifthen}
@@ -160,7 +161,10 @@ To signal events to the FPGA (4), the control registers can be mapped into the
GPU's address space passing a special AMD-specific flag and passing the physical
BAR address of the FPGA configuration memory to the \texttt{cl\-Create\-Buffer}
function. From the GPU, this memory is seen transparently and as regular GPU
-memory and can be written accordingly (3).
+memory and can be written accordingly (3). Individual write accesses are issued
+as PIO commands; however, using the \texttt{cl\-Enqueue\-Copy\-Buffer} function
+call, it is also possible to write entire memory regions to the FPGA in a DMA
+fashion. In this case, the GPU acts as bus master and pushes the data to the FPGA.
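+
+As an illustration, the following OpenCL host-code sketch outlines the mapping
+and the DMA-style copy described above. It assumes AMD's
+\texttt{cl\_amd\_bus\_addressable\_memory} extension and its
+\texttt{CL\_MEM\_EXTERNAL\_PHYSICAL\_AMD} flag (the AMD-specific flag referred
+to above); the function and variable names are placeholders, not our actual
+implementation.
+
+\begin{verbatim}
+#include <CL/cl.h>
+#include <CL/cl_ext.h>  /* cl_amd_bus_addressable_memory definitions */
+
+/* Sketch, not our production code: expose the FPGA BAR to the GPU and push
+ * one GPU buffer to it. All arguments are placeholders that a real program
+ * has to set up (context, queue, source buffer, BAR address, region size). */
+static cl_int push_to_fpga(cl_context ctx, cl_command_queue queue,
+                           cl_mem gpu_buf, cl_ulong fpga_bar_addr,
+                           size_t region_size)
+{
+    cl_bus_address_amd bar = {0};
+    bar.surface_bus_address = fpga_bar_addr; /* physical BAR address (FPGA) */
+
+    cl_int err;
+    cl_mem fpga_mem = clCreateBuffer(ctx, CL_MEM_EXTERNAL_PHYSICAL_AMD,
+                                     region_size, &bar, &err);
+
+    /* Make the external buffer resident on the GPU before first use. */
+    clEnqueueMigrateMemObjects(queue, 1, &fpga_mem, 0, 0, NULL, NULL);
+
+    /* DMA write (3): the GPU acts as bus master and pushes the whole region;
+     * individual stores from a kernel are issued as PIO instead. */
+    err = clEnqueueCopyBuffer(queue, gpu_buf, fpga_mem, 0, 0, region_size,
+                              0, NULL, NULL);
+    clFinish(queue);
+    clReleaseMemObject(fpga_mem);
+    return err;
+}
+\end{verbatim}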

\begin{figure}
\centering
@@ -215,40 +219,50 @@ strategy a viable solution for very large data transfers.
\label{fig:intra-copy}
\end{figure}

+
\subsection{Throughput}

+A high throughput is desired for applications in which the FPGA outputs large
+amounts of data and timing is not an issue. This includes fast, high-resolution
+photon detectors as used in synchrotron facilities.
+
+\figref{fig:throughput} shows the memory write throughput for a GPU and the CPU.
+For both system and GPU memory, the write performance is primarily limited by
+the PCIe bus. Higher payloads introduce less overhead, thus increasing the net
+bandwidth. Up to a transfer size of 2 MB, the performance is almost the same;
+after that, the GPU transfer shows a slightly better slope. Data transfers larger
+than 1 GB saturate the PCIe bus. \textbf{LR: We should measure the slope for
+different page sizes; I expect the saturation point to change for different
+page sizes.}
+
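+As a rough, order-of-magnitude illustration (not a measurement), the net
+bandwidth can be related to the TLP payload size $P$ and the per-packet header
+and framing overhead $H$ of a few tens of bytes by
+\begin{equation}
+  B_\mathrm{net} \approx \frac{P}{P + H} \, B_\mathrm{link},
+\end{equation}
+where $B_\mathrm{link}$ is the raw link bandwidth; the per-transfer setup
+overhead is amortized analogously as the transfer size grows.
+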
+\begin{figure}
+ \centering
+ \includegraphics[width=0.6\textwidth]{figures/through_plot}
+ \caption{
+ Throughput of regular CPU and of our GPU DMA data transfers for up to 50 GB
+ of data.
+ }
+ \label{fig:throughput}
+\end{figure}

\subsection{Latency}

%% Change the specs for the small crate
-For FPGA-to-GPU transfers, we also repeated the measurements using a low-end system
-based on XXX and Intel Nano XXXX. The results does not show any significant difference
-compared to the previous setup, making it a more cost-effective solution.
+% MV: we never did anything in that regard
+% For FPGA-to-GPU transfers, we also repeated the measurements using a low-end system
+% based on XXX and Intel Nano XXXX. The results does not show any significant difference
+% compared to the previous setup, making it a more cost-effective solution.

\begin{figure}
\includegraphics[width=\textwidth]{figures/latency-michele}
\caption{%
- FILL ME
+ Relative frequency of measured latencies for a single 4 KB packet transferred
+ from the GPU to the FPGA.
}
\label{fig:intra-copy}
\end{figure}

-\begin{figure}
- \centering
- \includegraphics[width=0.6\textwidth]{figures/through_plot}
- \caption{
- Writing from the FPGA to either system or GPU memory is primarily limited by
- the PCIe bus. Higher payloads introduce less overhead, thus increasing the net bandwidth.
- Up until 2 MB transfer size, the performance is almost the
- same, after that the GPU transfer shows a slightly better slope. Data
- transfers larger than 1 GB saturate the PCIe bus.
- \bf{LR: We should measure the slope for different page sizes, I expect the saturation point
- to change for different page sizes}
- }
- \label{fig:throughput}
-\end{figure}
-
%% Latency here? What do we do?
%% We should add an histogram with 1000+ measurements of the latency to see if it's time-deterministic
%% Also: add a plot of latency vs different data sizes transmitted (from FPGA)
@@ -265,13 +279,16 @@ compared to the previous setup, making it a more cost-effective solution.

\section{Conclusion}
-%% Added here
-We developed a complete hardware and software solution that enable direct DMA transfers
-between FPGA-based readout boards and GPU computing clusters. The net throughput is mainly
-limited by the PCIe bus, reaching 6.7 GB/s for a 256 B payload. By writing directly into GPU
-memory instead of routing data through system main memory, latency is reduced by a factor of 2.
-The solution here proposed allows high performance GPU computing thanks to the support of the
-framework. Integration with different DAQ systems and custom algorithms is therefore immediate.
+
+We developed a complete hardware and software solution that enables DMA
+transfers between FPGA-based readout boards and GPU computing clusters. The net
+throughput is primarily limited by the PCIe bus, reaching 6.x GB/s for a 256 B
+payload. By writing directly into GPU memory instead of routing data through
+system main memory, the overall latency is reduced by a factor of 2. Moreover,
+the solution proposed here enables high-performance GPU computing thanks to the
+integration of the DMA transfer setup into our streamed computing framework.
+Integration with different DAQ systems and custom algorithms is therefore
+straightforward.

\subsection{Outlook}