3 Commits 54925c291e ... 60f5751469

Author SHA1 Message Date
  Lorenzo 60f5751469 Added placeholder for fpga-arch figure, minor changes plots 8 years ago
  Lorenzo 9401d11247 Merge branch 'master' of git.ipe.kit.edu:ufo/twepp2015 8 years ago
  Lorenzo 8b1626010f Added table for setups 8 years ago
7 changed files with 68 additions and 57 deletions
  1. Makefile (+2 -1)
  2. data/ipedirectgma/latency-hist.py (+6 -3)
  3. data/latency-hist.py (+2 -2)
  4. data/throughput.cpu (+1 -2)
  5. data/throughput.gpu (+0 -3)
  6. data/throughput.py (+4 -2)
  7. paper.tex (+53 -44)

+ 2 - 1
Makefile

@@ -3,7 +3,8 @@ FIGURES=figures/intra-copy.pdf \
 		figures/transf.pdf \
 		figures/throughput.pdf \
 		figures/latency.pdf \
-		figures/latency-hist.pdf
+		figures/latency-hist.pdf \
+		figures/fpga-arch.pdf
 
 .PHONY: clean figures
 

+ 6 - 3
data/ipedirectgma/latency-hist.py

@@ -12,14 +12,17 @@ gpu_data = gpu_data[gpu_data < 4.5]
 
 plt.rc('font', **dict(family='serif'))
 
-plt.figure(figsize=(6, 4))
+plt.figure(figsize=(8, 3))
+
+cpu_weights = np.ones_like(cpu_data)/float(len(cpu_data))
+gpu_weights = np.ones_like(gpu_data)/float(len(gpu_data))
 
 # divide by 2 for one-way latency
 # plt.ylim(0.1, 10000)
 # plt.hist(gpu_data, bins=200, label='GPU', log=True)
 # plt.hist(cpu_data, bins=200, label='CPU', log=True)
-plt.hist(gpu_data, bins=100, color='#3b5b92', label='GPU')
-plt.hist(cpu_data, bins=100, color='#d54d4d', label='CPU')
+plt.hist(gpu_data, weights=gpu_weights, bins=50, color='#3b5b92', label='GPU', linewidth=0)
+plt.hist(cpu_data, weights=cpu_weights, bins=50, color='#d54d4d', label='CPU', linewidth=0)
 # plt.semilogy()
 
 plt.xlabel(u'Latency in \u00b5s')

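The weights introduced above normalize each histogram: weighting every sample by 1/N makes the bin heights sum to one, so the CPU and GPU distributions stay comparable even when the two runs contain different numbers of samples. A minimal, self-contained sketch of the same technique (the data below is a random placeholder, not measured latencies):

import numpy as np
import matplotlib.pyplot as plt

# Placeholder samples standing in for measured latencies in microseconds.
np.random.seed(0)
cpu_data = np.random.normal(2.6, 0.3, size=20000)
gpu_data = np.random.normal(2.1, 0.2, size=5000)
gpu_data = gpu_data[gpu_data < 4.5]   # same outlier cut as in the script

# Weighting every sample by 1/N makes the bin heights sum to 1,
# so the histograms are comparable despite different sample counts.
cpu_weights = np.ones_like(cpu_data) / float(len(cpu_data))
gpu_weights = np.ones_like(gpu_data) / float(len(gpu_data))

plt.figure(figsize=(8, 3))
plt.hist(gpu_data, weights=gpu_weights, bins=50, color='#3b5b92', label='GPU', linewidth=0)
plt.hist(cpu_data, weights=cpu_weights, bins=50, color='#d54d4d', label='CPU', linewidth=0)
plt.xlabel(u'Latency in \u00b5s')
plt.ylabel('Fraction of samples')
plt.legend()
plt.savefig('latency-hist.pdf', dpi=300, bbox_inches='tight')
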
+ 2 - 2
data/latency-hist.py

@@ -17,8 +17,8 @@ plt.rc('font', **dict(family='serif'))
 plt.figure(figsize=(4, 3))
 
 # divide by 2 for one-way latency
-plt.hist(gpu_data / 2, bins=100, normed=False, color='#3b5b92', label='GPU')
-plt.hist(cpu_data / 2, bins=100, normed=False, color='#d54d4d', label='CPU')
+plt.hist(gpu_data / 2, bins=100, normed=True, color='#3b5b92', label='GPU', linewidth=0)
+plt.hist(cpu_data / 2, bins=100, normed=True, color='#d54d4d', label='CPU', linewidth=0)
 
 plt.xlabel(u'Latency in \u00b5s')
 plt.ylabel('Frequency')

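Switching to normed=True rescales the bin heights so that the histogram integrates to one (a probability density) instead of showing raw counts. Later matplotlib releases deprecate the normed keyword in favour of density=True; a minimal sketch of the equivalent call, using placeholder data rather than the measured samples:

import numpy as np
import matplotlib.pyplot as plt

# Placeholder round-trip latencies in microseconds, not measured values.
np.random.seed(1)
samples = np.random.normal(5.0, 0.5, size=10000)

plt.figure(figsize=(4, 3))
# density=True (formerly normed=True) scales the bins so the area under the
# histogram is 1; dividing by 2 gives the one-way latency, as in the script.
plt.hist(samples / 2, bins=100, density=True, color='#3b5b92', label='GPU', linewidth=0)
plt.xlabel(u'Latency in \u00b5s')
plt.ylabel('Probability density')
plt.legend()
plt.savefig('latency-hist-density.pdf', dpi=300, bbox_inches='tight')
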
+ 1 - 2
data/throughput.cpu

@@ -17,8 +17,7 @@
 1048000000 		6472		
 2097000000 		6528		
 4194000000 		6561		
-
-
+8388000000 		6581		
 
 
 

+ 0 - 3
data/throughput.gpu

@@ -18,6 +18,3 @@
 2147483648        6386.3333333333
 4294967296        6408
 8589934592        6393.8333333333
-17179869184       6370.6666666667
-34359738368       6372.1666666667
-68719476736       6372.3333333333

+ 4 - 2
data/throughput.py

@@ -6,12 +6,14 @@ cpu_data = np.loadtxt('throughput.cpu')
 
 plt.rc('font', **dict(family='serif'))
 
-plt.figure(figsize=(8, 1))
+plt.figure(figsize=(8, 3))
 
 plt.semilogx(gpu_data[:,0], gpu_data[:,1], '*-', color='#3b5b92', label='GPU')
 plt.semilogx(cpu_data[:,0], cpu_data[:,1], 'o-', color='#d54d4d', label='CPU')
+plt.xticks([1e4,1e6,1e8,1e10])
+plt.yticks([0,2000,4000,6000,8000])
 
-plt.xlabel(u'Data size in B')
+plt.xlabel('Data size in B')
 plt.ylabel('Throughput in MB/s')
 plt.legend(loc='lower right')
 plt.savefig('throughput.pdf', dpi=300, bbox_inches='tight')

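After this change the script uses a logarithmic x axis with explicit tick positions, since the transfer sizes span several orders of magnitude. A sketch of the complete plotting code, with the unchanged opening lines reconstructed from the hunk context (the gpu_data load is assumed to mirror the cpu_data one):

import numpy as np
import matplotlib.pyplot as plt

# Two columns per file: transfer size in bytes, throughput in MB/s.
cpu_data = np.loadtxt('throughput.cpu')
gpu_data = np.loadtxt('throughput.gpu')   # assumed symmetric to the CPU load

plt.rc('font', **dict(family='serif'))
plt.figure(figsize=(8, 3))

# Log x axis: sizes range from kilobytes to gigabytes.
plt.semilogx(gpu_data[:, 0], gpu_data[:, 1], '*-', color='#3b5b92', label='GPU')
plt.semilogx(cpu_data[:, 0], cpu_data[:, 1], 'o-', color='#d54d4d', label='CPU')

# Explicit tick positions keep both curves on a common, uncluttered grid.
plt.xticks([1e4, 1e6, 1e8, 1e10])
plt.yticks([0, 2000, 4000, 6000, 8000])

plt.xlabel('Data size in B')
plt.ylabel('Throughput in MB/s')
plt.legend(loc='lower right')
plt.savefig('throughput.pdf', dpi=300, bbox_inches='tight')
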
+ 53 - 44
paper.tex

@@ -162,6 +162,15 @@ friendly interfaces with the custom logic with an input bandwidth of 7.45
 GB/s. The user logic and the DMA engine are configured by the host through PIO
 registers.
 
+\begin{figure}[t]
+  \centering
+  \includegraphics[width=0.5\textwidth]{figures/fpga-arch}
+  \caption{%
+    FPGA AAA
+  }
+  \label{fig:fpga-arch}
+\end{figure}
+
 The physical addresses of the host's memory buffers are stored into an internal
 memory and are dynamically updated by the driver or user, allowing highly
 efficient zero-copy data transfers. The maximum size associated with each
@@ -248,15 +257,38 @@ Python.
 
 \section{Results}
 
-We carried out performance measurements on a machine with an Intel Xeon
-E5-1630 at 3.7 GHz, Intel C612 chipset running openSUSE 13.1 with Linux
-3.11.10. The Xilinx VC709 evaluation board was plugged into one of the PCIe
-3.0 x8 slots. In case of FPGA-to-CPU data transfers, the software
-implementation is the one  described in~\cite{rota2015dma}.
+We carried out performance measurements on two different setups, described in
+table~\ref{table:setups}. In Setup 2, a low-end Supermicro X7SPA-HF-D525
+system was connected to a Netstor NA255A external PCIe enclosure. In both
+cases, a Xilinx VC709 evaluation board was plugged into a PCIe 3.0 x8 slot.
+In the case of FPGA-to-CPU data transfers, the software implementation is the
+one described in~\cite{rota2015dma}.
 
-\subsection{Throughput}
+\begin{table}[b]
+\centering
+\caption{Hardware used for throughput and latency measurements}
+\label{table:setups}
+\begin{tabular}{@{}lll@{}}
+  \toprule
+Component & Setup 1 & Setup 2 \\
+  \midrule
+CPU           & Intel Xeon E5-1630 at 3.7 GHz  & Intel Atom D525   \\
+Chipset       & Intel C612                     & Intel ICH9R Express   \\
+GPU           & AMD FirePro W9100              & AMD FirePro W9100   \\
+PCIe link (FPGA-System memory)    & x8 Gen3                        & x4 Gen1     \\
+PCIe link (FPGA-GPU)    & x8 Gen3                        & x8 Gen3     \\
+  \bottomrule
+\end{tabular}
+\end{table}
 
 
+\subsection{Throughput}
+
+% We repeated the FPGA-to-GPU measurements on a low-end Supermicro X7SPA-HF-D525
+% system based on an Intel Atom CPU. The results showed no significant difference
+% compared to the previous setup. Depending on the application and computing
+% requirements, this result makes smaller acquisition system a cost-effective
+% alternative to larger workstations.
 
 \begin{figure}
   \includegraphics[width=\textwidth]{figures/throughput}
@@ -267,31 +299,6 @@ implementation is the one  described in~\cite{rota2015dma}.
 \label{fig:throughput}
 \end{figure}
 
-% \begin{figure}
-%   \centering
-%   \begin{subfigure}[b]{.49\textwidth}
-%     \centering
-%     \includegraphics[width=\textwidth]{figures/throughput}
-%     \caption{%
-%       DMA data transfer throughput.
-%     }
-%     \label{fig:throughput}
-%   \end{subfigure}
-%   \begin{subfigure}[b]{.49\textwidth}
-%     \includegraphics[width=\textwidth]{figures/latency}
-%     \caption{%
-%       Latency distribution.
-%       % for a single 4 KB packet transferred
-%       % from FPGA-to-CPU and FPGA-to-GPU.
-%     }
-%     \label{fig:latency}
-%   \end{subfigure}
-%   \caption{%
-%     Measured throuhput for data transfers from FPGA to main memory
-%     (CPU) and from FPGA to the global GPU memory (GPU). 
-%   }
-% \end{figure}
-
 The measured results for the pure data throughput are shown in
 \figref{fig:throughput} for transfers from the FPGA to the system's main
 memory as well as to the global GPU memory, as explained in \ref{sec:host}.
@@ -304,11 +311,7 @@ is approaching slowly 100 MB/s. From there on, the throughput increases up to
 6.4 GB/s when PCIe bus saturation sets in at about 1 GB data size. The CPU
 throughput saturates earlier but reaches a higher maximum of 6.6 GB/s.
 
-% We repeated the FPGA-to-GPU measurements on a low-end Supermicro X7SPA-HF-D525
-% system based on an Intel Atom CPU. The results showed no significant difference
-% compared to the previous setup. Depending on the application and computing
-% requirements, this result makes smaller acquisition system a cost-effective
-% alternative to larger workstations.
+
 
 % \begin{figure}
 %   \includegraphics[width=\textwidth]{figures/intra-copy}
@@ -340,14 +343,20 @@ latency.
 
 
 \subsection{Latency}
-
-\begin{figure}
-  \includegraphics[width=\textwidth]{figures/latency-hist}
-  \caption{%
-    Latency distribution for a single 1024 B packet transferred from FPGA to
-    GPU memory and to main memory.
-  }
-  \label{fig:latency-distribution}
+\begin{figure}[t]
+  \centering
+  \begin{subfigure}[b]{.8\textwidth}
+    \centering
+    \includegraphics[width=\textwidth]{figures/latency}
+    \caption{Latency vs. data size.}
+    \label{fig:latency_vs_size}
+  \end{subfigure}
+  \begin{subfigure}[b]{.8\textwidth}
+    \includegraphics[width=\textwidth]{figures/latency-hist}
+    \caption{Latency distribution.}
+    \label{fig:latency_hist}
+  \end{subfigure}
+  \caption{Latency of data transfers from the FPGA to GPU memory and to main memory.}
+  \label{fig:latency}
 \end{figure}
 
 For HEP experiments, low latencies are necessary to react in a reasonable time