
3 Commits 54925c291e ... 60f5751469

Author SHA1 Message Date
  Lorenzo 60f5751469 Added placeholder for fpga-arch figure, minor changes plots 8 years ago
  Lorenzo 9401d11247 Merge branch 'master' of git.ipe.kit.edu:ufo/twepp2015 8 years ago
  Lorenzo 8b1626010f Added table for setups 8 years ago
7 changed files with 68 additions and 57 deletions
  1. Makefile (+2 -1)
  2. data/ipedirectgma/latency-hist.py (+6 -3)
  3. data/latency-hist.py (+2 -2)
  4. data/throughput.cpu (+1 -2)
  5. data/throughput.gpu (+0 -3)
  6. data/throughput.py (+4 -2)
  7. paper.tex (+53 -44)

+ 2 - 1
Makefile

@@ -3,7 +3,8 @@ FIGURES=figures/intra-copy.pdf \
 		figures/transf.pdf \
 		figures/throughput.pdf \
 		figures/latency.pdf \
-		figures/latency-hist.pdf
+		figures/latency-hist.pdf \
+		figures/fpga-arch.pdf
 
 .PHONY: clean figures
 

+ 6 - 3
data/ipedirectgma/latency-hist.py

@@ -12,14 +12,17 @@ gpu_data = gpu_data[gpu_data < 4.5]
 
 plt.rc('font', **dict(family='serif'))
 
-plt.figure(figsize=(6, 4))
+plt.figure(figsize=(8, 3))
+
+cpu_weights = np.ones_like(cpu_data)/float(len(cpu_data))
+gpu_weights = np.ones_like(gpu_data)/float(len(gpu_data))
 
 # divide by 2 for one-way latency
 # plt.ylim(0.1, 10000)
 # plt.hist(gpu_data, bins=200, label='GPU', log=True)
 # plt.hist(cpu_data, bins=200, label='CPU', log=True)
-plt.hist(gpu_data, bins=100, color='#3b5b92', label='GPU')
-plt.hist(cpu_data, bins=100, color='#d54d4d', label='CPU')
+plt.hist(gpu_data, weights=gpu_weights, bins=50, color='#3b5b92', label='GPU', linewidth=0)
+plt.hist(cpu_data, weights=cpu_weights, bins=50, color='#d54d4d', label='CPU', linewidth=0)
 # plt.semilogy()
 
 plt.xlabel(u'Latency in \u00b5s')
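
The change above replaces raw counts with per-sample weights: giving every sample a weight of 1/N makes the bar heights sum to 1, so each bar reads directly as the fraction of measurements falling into that bin. A minimal, self-contained sketch of the same technique (synthetic data stands in for the measured latency files; names and output file are illustrative):

# hedged sketch: weight-normalised latency histograms, not the repository script
import numpy as np
import matplotlib.pyplot as plt

gpu_data = np.random.normal(2.0, 0.1, 10000)  # placeholder GPU latencies in µs
cpu_data = np.random.normal(2.5, 0.2, 10000)  # placeholder CPU latencies in µs

# one weight of 1/N per sample -> bar heights sum to 1 (fractions, not counts)
gpu_weights = np.ones_like(gpu_data) / float(len(gpu_data))
cpu_weights = np.ones_like(cpu_data) / float(len(cpu_data))

plt.hist(gpu_data, weights=gpu_weights, bins=50, color='#3b5b92', label='GPU', linewidth=0)
plt.hist(cpu_data, weights=cpu_weights, bins=50, color='#d54d4d', label='CPU', linewidth=0)
plt.xlabel(u'Latency in \u00b5s')
plt.ylabel('Fraction of samples')
plt.legend()
plt.savefig('latency-hist-sketch.pdf', dpi=300, bbox_inches='tight')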

+ 2 - 2
data/latency-hist.py

@@ -17,8 +17,8 @@ plt.rc('font', **dict(family='serif'))
 plt.figure(figsize=(4, 3))
 
 # divide by 2 for one-way latency
-plt.hist(gpu_data / 2, bins=100, normed=False, color='#3b5b92', label='GPU')
-plt.hist(cpu_data / 2, bins=100, normed=False, color='#d54d4d', label='CPU')
+plt.hist(gpu_data / 2, bins=100, normed=True, color='#3b5b92', label='GPU', linewidth=0)
+plt.hist(cpu_data / 2, bins=100, normed=True, color='#d54d4d', label='CPU', linewidth=0)
 
 plt.xlabel(u'Latency in \u00b5s')
 plt.ylabel('Frequency')
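
Note that normed=True is a different normalisation from the 1/N weights used in data/ipedirectgma/latency-hist.py: it scales the histogram to a probability density, so the bar areas (not the heights) sum to 1. The normed keyword was later deprecated and removed from matplotlib; on current releases the equivalent call uses density=True. A hedged sketch with synthetic data:

# hedged sketch: density-normalised histogram on recent matplotlib versions
import numpy as np
import matplotlib.pyplot as plt

gpu_data = np.random.normal(4.0, 0.2, 10000)  # placeholder round-trip latencies in µs
cpu_data = np.random.normal(5.0, 0.4, 10000)

# divide by 2 for one-way latency; density=True replaces the removed normed=True
plt.hist(gpu_data / 2, bins=100, density=True, color='#3b5b92', label='GPU', linewidth=0)
plt.hist(cpu_data / 2, bins=100, density=True, color='#d54d4d', label='CPU', linewidth=0)
plt.xlabel(u'Latency in \u00b5s')
plt.ylabel('Probability density')
plt.legend()
plt.savefig('latency-hist-density-sketch.pdf', dpi=300, bbox_inches='tight')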

+ 1 - 2
data/throughput.cpu

@@ -17,8 +17,7 @@
 1048000000 		6472		
 2097000000 		6528		
 4194000000 		6561		
-
-
+8388000000 		6581		
 
 
 

+ 0 - 3
data/throughput.gpu

@@ -18,6 +18,3 @@
 2147483648        6386.3333333333
 4294967296        6408
 8589934592        6393.8333333333
-17179869184       6370.6666666667
-34359738368       6372.1666666667
-68719476736       6372.3333333333

+ 4 - 2
data/throughput.py

@@ -6,12 +6,14 @@ cpu_data = np.loadtxt('throughput.cpu')
 
 plt.rc('font', **dict(family='serif'))
 
-plt.figure(figsize=(8, 1))
+plt.figure(figsize=(8, 3))
 
 plt.semilogx(gpu_data[:,0], gpu_data[:,1], '*-', color='#3b5b92', label='GPU')
 plt.semilogx(cpu_data[:,0], cpu_data[:,1], 'o-', color='#d54d4d', label='CPU')
+plt.xticks([1e4,1e6,1e8,1e10])
+plt.yticks([0,2000,4000,6000,8000])
 
-plt.xlabel(u'Data size in B')
+plt.xlabel('Data size in B')
 plt.ylabel('Throughput in MB/s')
 plt.legend(loc='lower right')
 plt.savefig('throughput.pdf', dpi=300, bbox_inches='tight')

+ 53 - 44
paper.tex

@@ -162,6 +162,15 @@ friendly interfaces with the custom logic with an input bandwidth of 7.45
 GB/s. The user logic and the DMA engine are configured by the host through PIO
 registers.
 
+\begin{figure}[t]
+  \centering
+  \includegraphics[width=0.5\textwidth]{figures/fpga-arch}
+  \caption{%
+    FPGA AAA
+  }
+  \label{fig:fpga-arch}
+\end{figure}
+
 The physical addresses of the host's memory buffers are stored into an internal
 memory and are dynamically updated by the driver or user, allowing highly
 efficient zero-copy data transfers. The maximum size associated with each
@@ -248,15 +257,38 @@ Python.
 
 \section{Results}
 
-We carried out performance measurements on a machine with an Intel Xeon
-E5-1630 at 3.7 GHz, Intel C612 chipset running openSUSE 13.1 with Linux
-3.11.10. The Xilinx VC709 evaluation board was plugged into one of the PCIe
-3.0 x8 slots. In case of FPGA-to-CPU data transfers, the software
-implementation is the one  described in~\cite{rota2015dma}.
+We carried out performance measurements on two different setups, described in
+table~\ref{table:setups}. In Setup 2, a low-end Supermicro X7SPA-HF-D525
+system was connected to a Netstor NA255A external PCIe enclosure. In both
+cases, a Xilinx VC709 evaluation board was plugged into a PCIe 3.0 x8 slot.
+In the case of FPGA-to-CPU data transfers, the software implementation is the
+one described in~\cite{rota2015dma}.
 
-\subsection{Throughput}
+\begin{table}[b]
+\centering
+\caption{Hardware used for throughput and latency measurements}
+\label{table:setups}
+\begin{tabular}{@{}lll@{}}
+  \toprule
+Component & Setup 1 & Setup 2 \\
+  \midrule
+CPU           & Intel Xeon E5-1630 at 3.7 GHz  & Intel Atom D525   \\
+Chipset       & Intel C612                     & Intel ICH9R Express   \\
+GPU           & AMD FirePro W9100              & AMD FirePro W9100   \\
+PCIe link (FPGA-System memory)    & x8 Gen3                        & x4 Gen1     \\
+PCIe link (FPGA-GPU)    & x8 Gen3                        & x8 Gen3     \\
+  \bottomrule
+\end{tabular}
+\end{table}
 
 
+\subsection{Throughput}
+
+% We repeated the FPGA-to-GPU measurements on a low-end Supermicro X7SPA-HF-D525
+% system based on an Intel Atom CPU. The results showed no significant difference
+% compared to the previous setup. Depending on the application and computing
+% requirements, this result makes smaller acquisition system a cost-effective
+% alternative to larger workstations.
 
 \begin{figure}
   \includegraphics[width=\textwidth]{figures/throughput}
@@ -267,31 +299,6 @@ implementation is the one  described in~\cite{rota2015dma}.
 \label{fig:throughput}
 \end{figure}
 
-% \begin{figure}
-%   \centering
-%   \begin{subfigure}[b]{.49\textwidth}
-%     \centering
-%     \includegraphics[width=\textwidth]{figures/throughput}
-%     \caption{%
-%       DMA data transfer throughput.
-%     }
-%     \label{fig:throughput}
-%   \end{subfigure}
-%   \begin{subfigure}[b]{.49\textwidth}
-%     \includegraphics[width=\textwidth]{figures/latency}
-%     \caption{%
-%       Latency distribution.
-%       % for a single 4 KB packet transferred
-%       % from FPGA-to-CPU and FPGA-to-GPU.
-%     }
-%     \label{fig:latency}
-%   \end{subfigure}
-%   \caption{%
-%     Measured throuhput for data transfers from FPGA to main memory
-%     (CPU) and from FPGA to the global GPU memory (GPU). 
-%   }
-% \end{figure}
-
 The measured results for the pure data throughput is shown in
 \figref{fig:throughput} for transfers from the FPGA to the system's main
 memory as well as to the global memory as explained in \ref{sec:host}.
@@ -304,11 +311,7 @@ is approaching slowly 100 MB/s. From there on, the throughput increases up to
 6.4 GB/s when PCIe bus saturation sets in at about 1 GB data size. The CPU
 throughput saturates earlier but the maximum throughput is 6.6 GB/s.
 
-% We repeated the FPGA-to-GPU measurements on a low-end Supermicro X7SPA-HF-D525
-% system based on an Intel Atom CPU. The results showed no significant difference
-% compared to the previous setup. Depending on the application and computing
-% requirements, this result makes smaller acquisition system a cost-effective
-% alternative to larger workstations.
+
 
 % \begin{figure}
 %   \includegraphics[width=\textwidth]{figures/intra-copy}
@@ -340,14 +343,20 @@ latency.
 
 
 \subsection{Latency}
-
-\begin{figure}
-  \includegraphics[width=\textwidth]{figures/latency-hist}
-  \caption{%
-    Latency distribution for a single 1024 B packet transferred from FPGA to
-    GPU memory and to main memory.
-  }
-  \label{fig:latency-distribution}
+\begin{figure}[t]
+  \centering
+  \begin{subfigure}[b]{.8\textwidth}
+    \centering
+    \includegraphics[width=\textwidth]{figures/latency}
+    \caption{Latency }
+    \label{fig:latency_vs_size}
+  \end{subfigure}
+  \begin{subfigure}[b]{.8\textwidth}
+    \includegraphics[width=\textwidth]{figures/latency-hist}
+    \caption{Latency distribution.}
+    \label{fig:latency_hist}
+  \end{subfigure}
+  \label{fig:latency}
 \end{figure}
 
 For HEP experiments, low latencies are necessary to react in a reasonable time