@@ -162,6 +162,15 @@ friendly interfaces with the custom logic with an input bandwidth of 7.45
GB/s. The user logic and the DMA engine are configured by the host through PIO
registers.
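
+To make the register interface concrete, the sketch below shows how a host
+program might drive such PIO registers through a memory-mapped BAR. It is a
+minimal sketch assuming a Linux host that exposes the BAR through sysfs; the
+register offsets, PCI device address, and control bit are hypothetical
+placeholders, not the actual register map of our design.
+\begin{verbatim}
+/* Sketch: configure the DMA engine via memory-mapped PIO registers.
+ * All offsets and the PCI device address are hypothetical examples. */
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#define REG_DMA_CTRL 0x0000  /* hypothetical control register        */
+#define REG_BUF_ADDR 0x0010  /* hypothetical buffer-address register */
+
+int main(void)
+{
+    int fd = open("/sys/bus/pci/devices/0000:03:00.0/resource0",
+                  O_RDWR | O_SYNC);
+    if (fd < 0) { perror("open"); return 1; }
+    volatile uint32_t *bar = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
+                                  MAP_SHARED, fd, 0);
+    if (bar == MAP_FAILED) { perror("mmap"); return 1; }
+    /* Publish the physical address of one host buffer, then start the engine. */
+    bar[REG_BUF_ADDR / 4] = 0x12345000u;  /* example bus address    */
+    bar[REG_DMA_CTRL / 4] = 1u;           /* hypothetical start bit */
+    munmap((void *)bar, 4096);
+    close(fd);
+    return 0;
+}
+\end{verbatim}
+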
+\begin{figure}[t]
+  \centering
+  \includegraphics[width=0.5\textwidth]{figures/fpga-arch}
+  \caption{%
+    Block diagram of the FPGA architecture.
+  }
+  \label{fig:fpga-arch}
+\end{figure}
+
The physical addresses of the host's memory buffers are stored in an internal
memory and are dynamically updated by the driver or user, allowing highly
efficient zero-copy data transfers. The maximum size associated with each
@@ -248,15 +257,38 @@ Python.

\section{Results}

-We carried out performance measurements on a machine with an Intel Xeon
-E5-1630 at 3.7 GHz, Intel C612 chipset running openSUSE 13.1 with Linux
-3.11.10. The Xilinx VC709 evaluation board was plugged into one of the PCIe
-3.0 x8 slots. In case of FPGA-to-CPU data transfers, the software
-implementation is the one described in~\cite{rota2015dma}.
+We carried out performance measurements on two different setups, described in
+Table~\ref{table:setups}. In Setup 2, a low-end Supermicro X7SPA-HF-D525
+system was connected to a Netstor NA255A external PCIe enclosure. In both
+cases, a Xilinx VC709 evaluation board was plugged into a PCIe 3.0 x8 slot.
+In the case of FPGA-to-CPU data transfers, the software implementation is the
+one described in~\cite{rota2015dma}.

-\subsection{Throughput}
+\begin{table}[b]
+\centering
+\caption{Hardware used for throughput and latency measurements}
+\label{table:setups}
+\begin{tabular}{@{}lll@{}}
+  \toprule
+Component & Setup 1 & Setup 2 \\
+  \midrule
+CPU & Intel Xeon E5-1630 at 3.7 GHz & Intel Atom D525 \\
+Chipset & Intel C612 & Intel ICH9R Express \\
+GPU & AMD FirePro W9100 & AMD FirePro W9100 \\
+PCIe link (FPGA--system memory) & x8 Gen3 & x4 Gen1 \\
+PCIe link (FPGA--GPU) & x8 Gen3 & x8 Gen3 \\
+  \bottomrule
+\end{tabular}
+\end{table}
+
+\subsection{Throughput}
+
+% We repeated the FPGA-to-GPU measurements on a low-end Supermicro X7SPA-HF-D525
+% system based on an Intel Atom CPU. The results showed no significant difference
+% compared to the previous setup. Depending on the application and computing
+% requirements, this result makes smaller acquisition systems a cost-effective
+% alternative to larger workstations.

\begin{figure}
  \includegraphics[width=\textwidth]{figures/throughput}
@@ -267,31 +299,6 @@ implementation is the one described in~\cite{rota2015dma}.
  \label{fig:throughput}
\end{figure}

-% \begin{figure}
-% \centering
-% \begin{subfigure}[b]{.49\textwidth}
-% \centering
-% \includegraphics[width=\textwidth]{figures/throughput}
-% \caption{%
-% DMA data transfer throughput.
-% }
-% \label{fig:throughput}
-% \end{subfigure}
-% \begin{subfigure}[b]{.49\textwidth}
-% \includegraphics[width=\textwidth]{figures/latency}
-% \caption{%
-% Latency distribution.
-% % for a single 4 KB packet transferred
-% % from FPGA-to-CPU and FPGA-to-GPU.
-% }
-% \label{fig:latency}
-% \end{subfigure}
-% \caption{%
-% Measured throuhput for data transfers from FPGA to main memory
-% (CPU) and from FPGA to the global GPU memory (GPU).
-% }
-% \end{figure}
-
The measured results for the pure data throughput are shown in
\figref{fig:throughput} for transfers from the FPGA to the system's main
memory as well as to the GPU's global memory, as explained in \ref{sec:host}.
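+
+The throughput numbers were obtained by timing complete DMA transfers. A
+minimal timing harness in the spirit of our measurements is sketched below;
+\texttt{dma\_read} is a stand-in stub for the actual driver call, an
+assumption for illustration rather than the interface of~\cite{rota2015dma}.
+\begin{verbatim}
+/* Sketch: measure transfer throughput by timing one large read.
+ * dma_read() is a placeholder for the real FPGA-to-host DMA call. */
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+
+#define SIZE (64UL * 1024 * 1024)  /* 64 MB transfer */
+static unsigned char buf[SIZE];
+
+static size_t dma_read(void *dst, size_t n)
+{
+    memset(dst, 0xAB, n);  /* placeholder for the actual DMA transfer */
+    return n;
+}
+
+int main(void)
+{
+    struct timespec t0, t1;
+    clock_gettime(CLOCK_MONOTONIC, &t0);
+    size_t n = dma_read(buf, SIZE);
+    clock_gettime(CLOCK_MONOTONIC, &t1);
+    double s = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
+    printf("%.2f GB/s\n", n / s / 1e9);
+    return 0;
+}
+\end{verbatim}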
@@ -304,11 +311,7 @@ is approaching slowly 100 MB/s. From there on, the throughput increases up to
6.4 GB/s when PCIe bus saturation sets in at a data size of about 1 GB. The CPU
throughput saturates earlier, but the maximum throughput is 6.6 GB/s.
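+
+These figures are close to the practical limit of the link. As a rough
+cross-check, the raw capacity of a PCIe 3.0 x8 link with its 128b/130b
+encoding is
+\[
+  8~\mathrm{lanes} \times 8~\mathrm{GT/s} \times \frac{128}{130} \times
+  \frac{1}{8}~\mathrm{B/bit} \approx 7.9~\mathrm{GB/s},
+\]
+of which TLP headers and flow-control traffic consume a fraction that depends
+on the maximum payload size, so the measured 6.4--6.6 GB/s is close to what
+the bus can deliver.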

-% We repeated the FPGA-to-GPU measurements on a low-end Supermicro X7SPA-HF-D525
-% system based on an Intel Atom CPU. The results showed no significant difference
-% compared to the previous setup. Depending on the application and computing
-% requirements, this result makes smaller acquisition system a cost-effective
-% alternative to larger workstations.
+

% \begin{figure}
% \includegraphics[width=\textwidth]{figures/intra-copy}
@@ -340,14 +343,20 @@ latency.

\subsection{Latency}
-
-\begin{figure}
- \includegraphics[width=\textwidth]{figures/latency-hist}
- \caption{%
- Latency distribution for a single 1024 B packet transferred from FPGA to
- GPU memory and to main memory.
- }
- \label{fig:latency-distribution}
+\begin{figure}[t]
+  \centering
+  \begin{subfigure}[b]{.8\textwidth}
+    \centering
+    \includegraphics[width=\textwidth]{figures/latency}
+    \caption{Latency as a function of transfer size.}
+    \label{fig:latency_vs_size}
+  \end{subfigure}
+  \begin{subfigure}[b]{.8\textwidth}
+    \centering
+    \includegraphics[width=\textwidth]{figures/latency-hist}
+    \caption{Latency distribution.}
+    \label{fig:latency_hist}
+  \end{subfigure}
+  \caption{%
+    Latency of DMA data transfers from FPGA to main memory (CPU) and to GPU
+    memory (GPU).
+  }
+  \label{fig:latency}
\end{figure}

For HEP experiments, low latencies are necessary to react in a reasonable time
|