8 years ago · c31762c733
--- a/data/ipedirectgma/latency-hist.py
+++ b/data/ipedirectgma/latency-hist.py
@@ -7,12 +7,12 @@ g = np.loadtxt('ipedirectgma.1024.gpu.txt')
 
				 cpu_data = c[:,1]
			
 
				 gpu_data = g[:,1]
			
 
				 
			
 
				-cpu_data = cpu_data[cpu_data < 4.5]
			
 
				-gpu_data = gpu_data[gpu_data < 4.5]
			
 
				+cpu_data = cpu_data[cpu_data < 5]
			
 
				+gpu_data = gpu_data[gpu_data < 5]
			
 
				 
			
 
				 plt.rc('font', **dict(family='serif'))
			
 
				 
			
 
				-plt.figure(figsize=(8, 3))
			
 
				+plt.figure(figsize=(4, 4.2))
			
 
				 
			
 
				 cpu_weights = np.ones_like(cpu_data)/float(len(cpu_data))
			
 
				 gpu_weights = np.ones_like(gpu_data)/float(len(gpu_data))
			
@@ -23,9 +23,11 @@ gpu_weights = np.ones_like(gpu_data)/float(len(gpu_data))
 
				 # plt.hist(cpu_data, bins=200, label='CPU', log=True)
			
 
				 plt.hist(gpu_data, weights=gpu_weights, bins=50, color='#3b5b92', label='GPU', linewidth=0)
			
 
				 plt.hist(cpu_data, weights=cpu_weights, bins=50, color='#d54d4d', label='CPU', linewidth=0)
			
 
				+plt.xticks([2.0, 3.0, 4.0, 5.0])
			
 
				+plt.yticks([0,0.25,0.5])
			
 
				 # plt.semilogy()
			
 
				 
			
 
				 plt.xlabel(u'Latency in \u00b5s')
			
 
				 plt.ylabel('Frequency')
			
 
				-plt.legend(loc='upper right')
			
 
				+plt.legend(loc='upper right',frameon=False)
			
 
				 plt.savefig('latency-hist.pdf', dpi=300, bbox_inches='tight')
			
--- a/data/ipedirectgma/plot.py
+++ b/data/ipedirectgma/plot.py
@@ -31,16 +31,17 @@ for i in range(128, 4096 + 128, 128):
 
				 
			
 
				 
			
 
				 plt.rc('font', **dict(family='serif'))
			
 
				-plt.figure(figsize=(10, 4))
			
 
				+plt.figure(figsize=(4, 4))
			
 
				 plt.xlabel('Packet size in (B)')
			
 
				 plt.ylabel('Latency (us)')
			
 
				 # plt.errorbar(xs, yscm, yerr=yscs, label='CPU')
			
 
				 # plt.errorbar(xs, ysgm, ls='dotted', yerr=ysgs, label='GPU')
			
 
				-plt.plot(xs, yscm, 'o-', markersize=4, label='Main memory')
			
 
				-plt.plot(xs, ysgm, 'x-', label='GPU memory')
			
 
				+plt.plot(xs, yscm, 'o-', markersize=4, label='Main memory', color='#d54d4d')
			
 
				+plt.plot(xs, ysgm, 'x-', label='GPU memory', color='#3b5b92')
			
 
				 plt.xticks([128, 1024, 2048, 2048+1024, 4096])
			
 
				-plt.xlim(128, 4096)
			
 
				-plt.legend(loc='upper left')
			
 
				+plt.yticks([2,4,6,8])
			
 
				+plt.xlim(0, 4200)
			
 
				+plt.legend(loc='upper left', frameon=False)
			
 
				 plt.savefig('latency.pdf', dpi=300, bbox_inches='tight')
			
 
				 
			
 
				 A = np.vstack([xs, np.ones(len(xs))]).T
			
--- a/data/throughput.py
+++ b/data/throughput.py
@@ -6,7 +6,7 @@ cpu_data = np.loadtxt('throughput.cpu')
 
				 
			
 
				 plt.rc('font', **dict(family='serif'))
			
 
				 
			
 
				-plt.figure(figsize=(8, 3))
			
 
				+plt.figure(figsize=(8, 4))
			
 
				 
			
 
				 plt.semilogx(gpu_data[:,0], gpu_data[:,1], '*-', color='#3b5b92', label='GPU')
			
 
				 plt.semilogx(cpu_data[:,0], cpu_data[:,1], 'o-', color='#d54d4d', label='CPU')
			
@@ -15,5 +15,5 @@ plt.yticks([0,2000,4000,6000,8000])
 
				 
			
 
				 plt.xlabel('Data size in B')
			
 
				 plt.ylabel('Throughput in MB/s')
			
 
				-plt.legend(loc='lower right')
			
 
				+plt.legend(loc='lower right',frameon=False)
			
 
				 plt.savefig('throughput.pdf', dpi=300, bbox_inches='tight')
			
--- a/paper.tex
+++ b/paper.tex
@@ -7,6 +7,7 @@
 
				 \usepackage{subcaption}
			
 
				 \usepackage{textcomp}
			
 
				 \usepackage{booktabs}
			
 
				+\usepackage{floatrow}
			
 
				 
			
 
				 \newboolean{draft}
			
 
				 \setboolean{draft}{true}
			
@@ -164,7 +165,7 @@ registers.
 
				 
			
 
				 \begin{figure}[t]
			
 
				   \centering
			
 
				-  \includegraphics[width=0.5\textwidth]{figures/fpga-arch}
			
 
				+  \includegraphics[width=0.75\textwidth]{figures/fpga-arch}
			
 
				   \caption{%
			
 
				     FPGA AAA
			
 
				   }
			
@@ -174,24 +175,7 @@ registers.
 
				 The physical addresses of the host's memory buffers are stored into an internal
			
 
				 memory and are dynamically updated by the driver or user, allowing highly
			
 
				 efficient zero-copy data transfers. The maximum size associated with each
			
 
				-address is 2 GB. The resource utilization
			
 
				-on a Virtex 7 device is reported in \ref{table:utilization}.
			
 
				-
			
 
				-\begin{table}[]
			
 
				-\centering
			
 
				-\caption{Resource utilization on a Virtex7 device X240VT}
			
 
				-\label{table:utilization}
			
 
				-\begin{tabular}{@{}llll@{}}
			
 
				-  \toprule
			
 
				-Resource & Utilization & Available & Utilization \% \\
			
 
				-  \midrule
			
 
				-LUT      & 5331        & 433200    & 1.23           \\
			
 
				-LUTRAM   & 56          & 174200    & 0.03           \\
			
 
				-FF       & 5437        & 866400    & 0.63           \\
			
 
				-BRAM     & 20.50       & 1470      & 1.39           \\
			
 
				-  \bottomrule
			
 
				-\end{tabular}
			
 
				-\end{table}
			
 
				+address is 2 GB. 
			
 
				 
			
 
				 \subsection{OpenCL management on host side}
			
 
				 \label{sec:host}
			
@@ -224,7 +208,7 @@ Using the \texttt{cl\-Enqueue\-Copy\-Buffer} function call it is possible to
 
				 write entire memory regions in DMA fashion to the FPGA. In this case, the GPU
			
 
				 acts as bus master and pushes data to the FPGA.
			
 
				 
			
 
				-\begin{figure}
			
 
				+\begin{figure}[t]
			
 
				   \centering
			
 
				   \includegraphics[width=0.75\textwidth]{figures/opencl-setup}
			
 
				   \caption{The FPGA writes to GPU memory by mapping the physical address of a
			
@@ -264,10 +248,32 @@ cases, a Xilinx VC709 evaluation board was plugged into a PCIe 3.0 x8 slots.
 
				 In case of FPGA-to-CPU data transfers, the software implementation is the one
			
 
				 described in~\cite{rota2015dma}.
			
 
				 
			
 
				+The resource utilization
			
 
				+on a Virtex 7 device is reported in Table~\ref{table:utilization}.
			
 
				+
			
 
				+\begin{table}[]
			
 
				+\centering
			
 
				+\caption{Resource utilization on a Virtex7 device X240VT}
			
 
				+\label{table:utilization}
			
 
				+\tabcolsep=0.11cm
			
 
				+\small
			
 
				+\begin{tabular}{@{}llll@{}}
			
 
				+  \toprule
			
 
				+Resource & Utilization & Available & Utilization \% \\
			
 
				+  \midrule
			
 
				+LUT      & 5331        & 433200    & 1.23           \\
			
 
				+LUTRAM   & 56          & 174200    & 0.03           \\
			
 
				+FF       & 5437        & 866400    & 0.63           \\
			
 
				+BRAM     & 20.50       & 1470      & 1.39           \\
			
 
				+  \bottomrule
			
 
				+\end{tabular}
			
 
				+\end{table}
			
 
				+
			
 
				 \begin{table}[b]
			
 
				 \centering
			
 
				 \caption{Hardware used for throughput and latency measurements}
			
 
				 \label{table:setups}
			
 
				+\tabcolsep=0.11cm
			
 
				 \begin{tabular}{@{}llll@{}}
			
 
				   \toprule
			
 
				 Component & Setup 1 & Setup 2 \\
			
@@ -279,6 +285,7 @@ PCIe link (FPGA-System memory)    & x8 Gen3                        & x4 Gen1
 
				 PCIe link (FPGA-GPU)    & x8 Gen3                        & x8 Gen3     \\
			
 
				   \bottomrule
			
 
				 \end{tabular}
			
 
				+
			
 
				 \end{table}
			
 
				 
			
 
				 
			
@@ -290,8 +297,8 @@ PCIe link (FPGA-GPU)    & x8 Gen3                        & x8 Gen3     \\
 
				 % requirements, this result makes smaller acquisition system a cost-effective
			
 
				 % alternative to larger workstations.
			
 
				 
			
 
				-\begin{figure}
			
 
				-  \includegraphics[width=\textwidth]{figures/throughput}
			
 
				+\begin{figure}[t]
			
 
				+  \includegraphics[width=0.85\textwidth]{figures/throughput}
			
 
				   \caption{%
			
 
				     Measured results for data transfers from FPGA to main memory
			
 
				     (CPU) and from FPGA to the global GPU memory (GPU).
			
@@ -345,13 +352,13 @@ latency.
 
				 \subsection{Latency}
			
 
				 \begin{figure}[t]
			
 
				   \centering
			
 
				-  \begin{subfigure}[b]{.8\textwidth}
			
 
				+  \begin{subfigure}[b]{.45\textwidth}
			
 
				     \centering
			
 
				     \includegraphics[width=\textwidth]{figures/latency}
			
 
				     \caption{Latency}
			
 
				     \label{fig:latency_vs_size}
			
 
				   \end{subfigure}
			
 
				-  \begin{subfigure}[b]{.8\textwidth}
			
 
				+  \begin{subfigure}[b]{.45\textwidth}
			
 
				     \includegraphics[width=\textwidth]{figures/latency-hist}
			
 
				     \caption{Latency distribution.}
			
 
				     \label{fig:latency_hist}