
3 Commits 54925c291e ... 60f5751469

Author SHA1 Message Date
  Lorenzo 60f5751469 Added placeholder for fpga-arch figure, minor changes plots 8 years ago
  Lorenzo 9401d11247 Merge branch 'master' of git.ipe.kit.edu:ufo/twepp2015 8 years ago
  Lorenzo 8b1626010f Added table for setups 8 years ago
7 changed files with 68 additions and 57 deletions
  1. Makefile (+2 -1)
  2. data/ipedirectgma/latency-hist.py (+6 -3)
  3. data/latency-hist.py (+2 -2)
  4. data/throughput.cpu (+1 -2)
  5. data/throughput.gpu (+0 -3)
  6. data/throughput.py (+4 -2)
  7. paper.tex (+53 -44)

+ 2 - 1
Makefile

@@ -3,7 +3,8 @@ FIGURES=figures/intra-copy.pdf \
 		figures/transf.pdf \
 		figures/throughput.pdf \
 		figures/latency.pdf \
-		figures/latency-hist.pdf
+		figures/latency-hist.pdf \
+		figures/fpga-arch.pdf
 
 .PHONY: clean figures
 

+ 6 - 3
data/ipedirectgma/latency-hist.py

@@ -12,14 +12,17 @@ gpu_data = gpu_data[gpu_data < 4.5]
 
 plt.rc('font', **dict(family='serif'))
 
-plt.figure(figsize=(6, 4))
+plt.figure(figsize=(8, 3))
+
+cpu_weights = np.ones_like(cpu_data)/float(len(cpu_data))
+gpu_weights = np.ones_like(gpu_data)/float(len(gpu_data))
 
 # divide by 2 for one-way latency
 # plt.ylim(0.1, 10000)
 # plt.hist(gpu_data, bins=200, label='GPU', log=True)
 # plt.hist(cpu_data, bins=200, label='CPU', log=True)
-plt.hist(gpu_data, bins=100, color='#3b5b92', label='GPU')
-plt.hist(cpu_data, bins=100, color='#d54d4d', label='CPU')
+plt.hist(gpu_data, weights=gpu_weights, bins=50, color='#3b5b92', label='GPU', linewidth=0)
+plt.hist(cpu_data, weights=cpu_weights, bins=50, color='#d54d4d', label='CPU', linewidth=0)
 # plt.semilogy()
 
 plt.xlabel(u'Latency in \u00b5s')
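
The change above replaces raw counts with per-sample weights: giving every sample a weight of 1/N makes the bar heights sum to 1, so each bar reads directly as the fraction of measurements falling into that bin. A minimal, self-contained sketch of the same technique (synthetic data stands in for the measured latency files; names and output file are illustrative):

# hedged sketch: weight-normalised latency histograms, not the repository script
import numpy as np
import matplotlib.pyplot as plt

gpu_data = np.random.normal(2.0, 0.1, 10000)  # placeholder GPU latencies in µs
cpu_data = np.random.normal(2.5, 0.2, 10000)  # placeholder CPU latencies in µs

# one weight of 1/N per sample -> bar heights sum to 1 (fractions, not counts)
gpu_weights = np.ones_like(gpu_data) / float(len(gpu_data))
cpu_weights = np.ones_like(cpu_data) / float(len(cpu_data))

plt.hist(gpu_data, weights=gpu_weights, bins=50, color='#3b5b92', label='GPU', linewidth=0)
plt.hist(cpu_data, weights=cpu_weights, bins=50, color='#d54d4d', label='CPU', linewidth=0)
plt.xlabel(u'Latency in \u00b5s')
plt.ylabel('Fraction of samples')
plt.legend()
plt.savefig('latency-hist-sketch.pdf', dpi=300, bbox_inches='tight')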

+ 2 - 2
data/latency-hist.py

@@ -17,8 +17,8 @@ plt.rc('font', **dict(family='serif'))
 plt.figure(figsize=(4, 3))
 
 # divide by 2 for one-way latency
-plt.hist(gpu_data / 2, bins=100, normed=False, color='#3b5b92', label='GPU')
-plt.hist(cpu_data / 2, bins=100, normed=False, color='#d54d4d', label='CPU')
+plt.hist(gpu_data / 2, bins=100, normed=True, color='#3b5b92', label='GPU', linewidth=0)
+plt.hist(cpu_data / 2, bins=100, normed=True, color='#d54d4d', label='CPU', linewidth=0)
 
 plt.xlabel(u'Latency in \u00b5s')
 plt.ylabel('Frequency')
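
Note that normed=True is a different normalisation from the 1/N weights used in data/ipedirectgma/latency-hist.py: it scales the histogram to a probability density, so the bar areas (not the heights) sum to 1. The normed keyword was later deprecated and removed from matplotlib; on current releases the equivalent call uses density=True. A hedged sketch with synthetic data:

# hedged sketch: density-normalised histogram on recent matplotlib versions
import numpy as np
import matplotlib.pyplot as plt

gpu_data = np.random.normal(4.0, 0.2, 10000)  # placeholder round-trip latencies in µs
cpu_data = np.random.normal(5.0, 0.4, 10000)

# divide by 2 for one-way latency; density=True replaces the removed normed=True
plt.hist(gpu_data / 2, bins=100, density=True, color='#3b5b92', label='GPU', linewidth=0)
plt.hist(cpu_data / 2, bins=100, density=True, color='#d54d4d', label='CPU', linewidth=0)
plt.xlabel(u'Latency in \u00b5s')
plt.ylabel('Probability density')
plt.legend()
plt.savefig('latency-hist-density-sketch.pdf', dpi=300, bbox_inches='tight')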

+ 1 - 2
data/throughput.cpu

@@ -17,8 +17,7 @@
 1048000000 		6472		
 2097000000 		6528		
 4194000000 		6561		
-
-
+8388000000 		6581		
 
 
 

+ 0 - 3
data/throughput.gpu

@@ -18,6 +18,3 @@
 2147483648        6386.3333333333
 4294967296        6408
 8589934592        6393.8333333333
-17179869184       6370.6666666667
-34359738368       6372.1666666667
-68719476736       6372.3333333333

+ 4 - 2
data/throughput.py

@@ -6,12 +6,14 @@ cpu_data = np.loadtxt('throughput.cpu')
 
 plt.rc('font', **dict(family='serif'))
 
-plt.figure(figsize=(8, 1))
+plt.figure(figsize=(8, 3))
 
 plt.semilogx(gpu_data[:,0], gpu_data[:,1], '*-', color='#3b5b92', label='GPU')
 plt.semilogx(cpu_data[:,0], cpu_data[:,1], 'o-', color='#d54d4d', label='CPU')
+plt.xticks([1e4,1e6,1e8,1e10])
+plt.yticks([0,2000,4000,6000,8000])
 
-plt.xlabel(u'Data size in B')
+plt.xlabel('Data size in B')
 plt.ylabel('Throughput in MB/s')
 plt.legend(loc='lower right')
 plt.savefig('throughput.pdf', dpi=300, bbox_inches='tight')

+ 53 - 44
paper.tex

@@ -162,6 +162,15 @@ friendly interfaces with the custom logic with an input bandwidth of 7.45
 GB/s. The user logic and the DMA engine are configured by the host through PIO
 registers.
 
+\begin{figure}[t]
+  \centering
+  \includegraphics[width=0.5\textwidth]{figures/fpga-arch}
+  \caption{%
+    FPGA AAA
+  }
+  \label{fig:fpga-arch}
+\end{figure}
+
 The physical addresses of the host's memory buffers are stored into an internal
 memory and are dynamically updated by the driver or user, allowing highly
 efficient zero-copy data transfers. The maximum size associated with each
@@ -248,15 +257,38 @@ Python.
 
 \section{Results}
 
-We carried out performance measurements on a machine with an Intel Xeon
-E5-1630 at 3.7 GHz, Intel C612 chipset running openSUSE 13.1 with Linux
-3.11.10. The Xilinx VC709 evaluation board was plugged into one of the PCIe
-3.0 x8 slots. In case of FPGA-to-CPU data transfers, the software
-implementation is the one  described in~\cite{rota2015dma}.
+We carried out performance measurements on two different setups, described in
+table~\ref{table:setups}. In Setup 2, a low-end Supermicro X7SPA-HF-D525
+system was connected to a Netstor NA255A external PCIe enclosure. In both
+cases, a Xilinx VC709 evaluation board was plugged into a PCIe 3.0 x8 slot.
+In the case of FPGA-to-CPU data transfers, the software implementation is the
+one described in~\cite{rota2015dma}.
 
-\subsection{Throughput}
+\begin{table}[b]
+\centering
+\caption{Hardware used for throughput and latency measurements}
+\label{table:setups}
+\begin{tabular}{@{}lll@{}}
+  \toprule
+Component & Setup 1 & Setup 2 \\
+  \midrule
+CPU           & Intel Xeon E5-1630 at 3.7 GHz  & Intel Atom D525   \\
+Chipset       & Intel C612                     & Intel ICH9R Express   \\
+GPU           & AMD FirePro W9100              & AMD FirePro W9100   \\
+PCIe link (FPGA-System memory)    & x8 Gen3                        & x4 Gen1     \\
+PCIe link (FPGA-GPU)    & x8 Gen3                        & x8 Gen3     \\
+  \bottomrule
+\end{tabular}
+\end{table}
 
 
+\subsection{Throughput}
+
+% We repeated the FPGA-to-GPU measurements on a low-end Supermicro X7SPA-HF-D525
+% system based on an Intel Atom CPU. The results showed no significant difference
+% compared to the previous setup. Depending on the application and computing
+% requirements, this result makes smaller acquisition system a cost-effective
+% alternative to larger workstations.
 
 \begin{figure}
   \includegraphics[width=\textwidth]{figures/throughput}
@@ -267,31 +299,6 @@ implementation is the one  described in~\cite{rota2015dma}.
 \label{fig:throughput}
 \end{figure}
 
-% \begin{figure}
-%   \centering
-%   \begin{subfigure}[b]{.49\textwidth}
-%     \centering
-%     \includegraphics[width=\textwidth]{figures/throughput}
-%     \caption{%
-%       DMA data transfer throughput.
-%     }
-%     \label{fig:throughput}
-%   \end{subfigure}
-%   \begin{subfigure}[b]{.49\textwidth}
-%     \includegraphics[width=\textwidth]{figures/latency}
-%     \caption{%
-%       Latency distribution.
-%       % for a single 4 KB packet transferred
-%       % from FPGA-to-CPU and FPGA-to-GPU.
-%     }
-%     \label{fig:latency}
-%   \end{subfigure}
-%   \caption{%
-%     Measured throuhput for data transfers from FPGA to main memory
-%     (CPU) and from FPGA to the global GPU memory (GPU). 
-%   }
-% \end{figure}
-
 The measured results for the pure data throughput is shown in
 \figref{fig:throughput} for transfers from the FPGA to the system's main
 memory as well as to the global memory as explained in \ref{sec:host}.
@@ -304,11 +311,7 @@ is approaching slowly 100 MB/s. From there on, the throughput increases up to
 6.4 GB/s when PCIe bus saturation sets in at about 1 GB data size. The CPU
 throughput saturates earlier but the maximum throughput is 6.6 GB/s.
 
-% We repeated the FPGA-to-GPU measurements on a low-end Supermicro X7SPA-HF-D525
-% system based on an Intel Atom CPU. The results showed no significant difference
-% compared to the previous setup. Depending on the application and computing
-% requirements, this result makes smaller acquisition system a cost-effective
-% alternative to larger workstations.
+
 
 % \begin{figure}
 %   \includegraphics[width=\textwidth]{figures/intra-copy}
@@ -340,14 +343,20 @@ latency.
 
 
 \subsection{Latency}
-
-\begin{figure}
-  \includegraphics[width=\textwidth]{figures/latency-hist}
-  \caption{%
-    Latency distribution for a single 1024 B packet transferred from FPGA to
-    GPU memory and to main memory.
-  }
-  \label{fig:latency-distribution}
+\begin{figure}[t]
+  \centering
+  \begin{subfigure}[b]{.8\textwidth}
+    \centering
+    \includegraphics[width=\textwidth]{figures/latency}
+    \caption{Latency }
+    \label{fig:latency_vs_size}
+  \end{subfigure}
+  \begin{subfigure}[b]{.8\textwidth}
+    \includegraphics[width=\textwidth]{figures/latency-hist}
+    \caption{Latency distribution.}
+    \label{fig:latency_hist}
+  \end{subfigure}
+  \label{fig:latency}
 \end{figure}
 
 For HEP experiments, low latencies are necessary to react in a reasonable time