|
@@ -7,6 +7,7 @@
|
|
|
\usepackage{subcaption}
|
|
|
\usepackage{textcomp}
|
|
|
\usepackage{booktabs}
|
|
|
+\usepackage{floatrow}
|
|
|
|
|
|
\newboolean{draft}
|
|
|
\setboolean{draft}{true}
|
|
@@ -164,7 +165,7 @@ registers.
|
|
|
|
|
|
\begin{figure}[t]
|
|
|
\centering
|
|
|
- \includegraphics[width=0.5\textwidth]{figures/fpga-arch}
|
|
|
+ \includegraphics[width=0.75\textwidth]{figures/fpga-arch}
|
|
|
\caption{%
|
|
|
FPGA AAA
|
|
|
}
|
|
@@ -174,24 +175,7 @@ registers.
|
|
|
The physical addresses of the host's memory buffers are stored into an internal
|
|
|
memory and are dynamically updated by the driver or user, allowing highly
|
|
|
efficient zero-copy data transfers. The maximum size associated with each
|
|
|
-address is 2 GB. The resource utilization
|
|
|
-on a Virtex 7 device is reported in \ref{table:utilization}.
|
|
|
-
|
|
|
-\begin{table}[]
|
|
|
-\centering
|
|
|
-\caption{Resource utilization on a Virtex7 device X240VT}
|
|
|
-\label{table:utilization}
|
|
|
-\begin{tabular}{@{}llll@{}}
|
|
|
- \toprule
|
|
|
-Resource & Utilization & Available & Utilization \% \\
|
|
|
- \midrule
|
|
|
-LUT & 5331 & 433200 & 1.23 \\
|
|
|
-LUTRAM & 56 & 174200 & 0.03 \\
|
|
|
-FF & 5437 & 866400 & 0.63 \\
|
|
|
-BRAM & 20.50 & 1470 & 1.39 \\
|
|
|
- \bottomrule
|
|
|
-\end{tabular}
|
|
|
-\end{table}
|
|
|
+address is 2 GB.
|
|
|
|
|
|
\subsection{OpenCL management on host side}
|
|
|
\label{sec:host}
|
|
@@ -224,7 +208,7 @@ Using the \texttt{cl\-Enqueue\-Copy\-Buffer} function call it is possible to
|
|
|
write entire memory regions in DMA fashion to the FPGA. In this case, the GPU
|
|
|
acts as bus master and pushes data to the FPGA.
|
|
|
|
|
|
-\begin{figure}
|
|
|
+\begin{figure}[t]
|
|
|
\centering
|
|
|
\includegraphics[width=0.75\textwidth]{figures/opencl-setup}
|
|
|
\caption{The FPGA writes to GPU memory by mapping the physical address of a
|
|
@@ -264,10 +248,32 @@ cases, a Xilinx VC709 evaluation board was plugged into a PCIe 3.0 x8 slots.
|
|
|
In case of FPGA-to-CPU data transfers, the software implementation is the one
|
|
|
described in~\cite{rota2015dma}.
|
|
|
|
|
|
+The resource utilization
|
|
|
+on a Virtex-7 device is reported in Table~\ref{table:utilization}.
|
|
|
+
|
|
|
+\begin{table}
|
|
|
+\centering
|
|
|
+\caption{Resource utilization on a Virtex-7 device X240VT}
|
|
|
+\label{table:utilization}
|
|
|
+\tabcolsep=0.11cm
|
|
|
+\small
|
|
|
+\begin{tabular}{@{}llll@{}}
|
|
|
+ \toprule
|
|
|
+Resource & Utilization & Available & Utilization \% \\
|
|
|
+ \midrule
|
|
|
+LUT & 5331 & 433200 & 1.23 \\
|
|
|
+LUTRAM & 56 & 174200 & 0.03 \\
|
|
|
+FF & 5437 & 866400 & 0.63 \\
|
|
|
+BRAM & 20.50 & 1470 & 1.39 \\
|
|
|
+ \bottomrule
|
|
|
+\end{tabular}
|
|
|
+\end{table}
|
|
|
+
|
|
|
\begin{table}[b]
|
|
|
\centering
|
|
|
\caption{Hardware used for throughput and latency measurements}
|
|
|
\label{table:setups}
|
|
|
+\tabcolsep=0.11cm
|
|
|
\begin{tabular}{@{}llll@{}}
|
|
|
\toprule
|
|
|
Component & Setup 1 & Setup 2 \\
|
|
@@ -279,6 +285,7 @@ PCIe link (FPGA-System memory) & x8 Gen3 & x4 Gen1
|
|
|
PCIe link (FPGA-GPU) & x8 Gen3 & x8 Gen3 \\
|
|
|
\bottomrule
|
|
|
\end{tabular}
|
|
|
+
|
|
|
\end{table}
|
|
|
|
|
|
|
|
@@ -290,8 +297,8 @@ PCIe link (FPGA-GPU) & x8 Gen3 & x8 Gen3 \\
|
|
|
% requirements, this result makes smaller acquisition system a cost-effective
|
|
|
% alternative to larger workstations.
|
|
|
|
|
|
-\begin{figure}
|
|
|
- \includegraphics[width=\textwidth]{figures/throughput}
|
|
|
+\begin{figure}[t]
|
|
|
+ \centering\includegraphics[width=0.85\textwidth]{figures/throughput}
|
|
|
\caption{%
|
|
|
Measured results for data transfers from FPGA to main memory
|
|
|
(CPU) and from FPGA to the global GPU memory (GPU).
|
|
@@ -345,13 +352,13 @@ latency.
|
|
|
\subsection{Latency}
|
|
|
\begin{figure}[t]
|
|
|
\centering
|
|
|
- \begin{subfigure}[b]{.8\textwidth}
|
|
|
+ \begin{subfigure}[b]{.45\textwidth}
|
|
|
\centering
|
|
|
\includegraphics[width=\textwidth]{figures/latency}
|
|
|
\caption{Latency }
|
|
|
\label{fig:latency_vs_size}
|
|
|
\end{subfigure}
|
|
|
- \begin{subfigure}[b]{.8\textwidth}
|
|
|
+ \begin{subfigure}[b]{.45\textwidth}
|
|
|
\includegraphics[width=\textwidth]{figures/latency-hist}
|
|
|
\caption{Latency distribution.}
|
|
|
\label{fig:latency_hist}
|