Ver código fonte

Initial commit with a working simple app

mathiasb 7 anos atrás
commit
4bd7af2650
14 arquivos alterados com 1334 adições e 0 exclusões
  1. 26 0
      CMakeLists.txt
  2. 6 0
      build/reload_cmake.sh
  3. 15 0
      include/common.h
  4. 124 0
      include/gdrapi.h
  5. 13 0
      include/gdrconfig.h
  6. 86 0
      include/gdrdrv.h
  7. 6 0
      include/kernels.h
  8. 32 0
      src/common.cu
  9. 399 0
      src/gdrapi.c
  10. 24 0
      src/kernels.cu
  11. 83 0
      src/main.cu
  12. 198 0
      src/memcpy_avx.c
  13. 188 0
      src/memcpy_sse.c
  14. 134 0
      src/memcpy_sse41.c

+ 26 - 0
CMakeLists.txt

@@ -0,0 +1,26 @@
# File to generate the makefile.
# Uses src, include and build folders.

# cmake_minimum_required() must be the first command so that policies are
# established before project() configures the languages.
cmake_minimum_required(VERSION 2.6)
project(GPUFirstComm)

find_package(CUDA REQUIRED)

include_directories(include)

#link_directories(${CUDA_LIBRARY_DIRS})

set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS})
# Build the plain-C SIMD memcpy files with clang and the SSE/AVX flags
# they require (their bodies are guarded by __SSE__/__AVX__).
set(CMAKE_C_COMPILER "/usr/bin/clang")
set(CMAKE_C_FLAGS "-msse -msse4.1 -mavx")

cuda_add_executable(gpufirstcomm
  src/main.cu
  src/common.cu
  src/gdrapi.c
  src/memcpy_avx.c
  src/memcpy_sse41.c
  src/memcpy_sse.c
  src/kernels.cu)

target_link_libraries(gpufirstcomm cuda)

+ 6 - 0
build/reload_cmake.sh

@@ -0,0 +1,6 @@
#!/bin/bash
# Used to reload the CMakeLists.txt file with clang as a compiler instead of gcc.
# Must be run from inside the build/ directory: it wipes the current
# directory and re-runs cmake against the parent source tree.
# NOTE: the shebang must be the very first line of the file, otherwise the
# kernel ignores it and the script runs under the caller's shell.

set -e

# Refuse to run outside a directory named "build" so that "rm -rf ./*"
# cannot accidentally delete the source tree.
if [ "$(basename "$PWD")" != "build" ]; then
    echo "error: run this script from inside the build/ directory" >&2
    exit 1
fi

rm -rf ./*
CC=clang CXX=clang cmake ..

+ 15 - 0
include/common.h

@@ -0,0 +1,15 @@
#ifndef _COMMON_H_
#define _COMMON_H_

#include "cuda.h"
#include "cuda_runtime_api.h"

/* Generic success/failure codes for assertion-style checks. */
#define ASSERT_FAIL 0
#define ASSERT_SUCCESS 1
/* Buffer length used when querying a device name with cuDeviceGetName(). */
#define GPU_NAME_LENGTH 30

/* Error-checking helpers (defined in src/common.cu): each prints a
 * diagnostic and terminates the process when the status is not success. */
void assert_cuda(cudaError_t err_id); /* for runtime api*/
void assert_cu(CUresult res_id); /* for driver api */
void assert_gdr(int gdr_id);

#endif

+ 124 - 0
include/gdrapi.h

@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in 
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __GDRAPI_H__
+#define __GDRAPI_H__
+
+#include <stdint.h> // for standard [u]intX_t types
+#include <stddef.h>
+
+#define GDR_API_MAJOR_VERSION    1
+#define GDR_API_MINOR_VERSION    2
+#define GDR_API_VERSION          ((GDR_API_MAJOR_VERSION << 16) | GDR_API_MINOR_VERSION)
+
+
+#define GPU_PAGE_SHIFT   16
+#define GPU_PAGE_SIZE    (1UL << GPU_PAGE_SHIFT)
+#define GPU_PAGE_OFFSET  (GPU_PAGE_SIZE-1)
+#define GPU_PAGE_MASK    (~GPU_PAGE_OFFSET)
+
+/*
+ * GDRCopy, a low-latency GPU memory copy library (and a kernel-mode
+ * driver) based on NVIDIA GPUDirect RDMA technology.
+ *
+ * supported environment variables:
+ *
+ * - GDRCOPY_ENABLE_LOGGING, if defined logging is enabled, default is
+ *   disabled.
+ *
+ * - GDRCOPY_LOG_LEVEL, overrides log threshold, default is to print errors
+ *   only.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct gdr;
+typedef struct gdr *gdr_t;
+
+// Initialize the library, e.g. by opening a connection to the kernel-mode
+// driver. Returns an handle to the library state object.
+gdr_t gdr_open();
+
+// Destroy library state object, e.g. it closes the connection to kernel-mode
+// driver.
+//
+// Note that although BAR mappings of GPU memory are destroyed, user-space
+// mappings are not. User code is therefore responsible for calling
+// gdr_unmap on all mappings before calling gdr_close.
+int gdr_close(gdr_t g);
+
+// Map device memory buffer on GPU BAR1, returning an handle.
+// Memory is still not accessible to user-space.
+typedef uint32_t gdr_mh_t;
+int gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle);
+
+// Unmap the handle. 
+//
+// If there exists a corresponding user-space mapping, gdr_unmap should be
+// called before this one.
+int gdr_unpin_buffer(gdr_t g, gdr_mh_t handle);
+
+// flag is set when the kernel callback (relative to the
+// nvidia_p2p_get_pages) gets invoked, e.g. cuMemFree() before
+// gdr_unpin_buffer.
+int gdr_get_callback_flag(gdr_t g, gdr_mh_t handle, int *flag);
+
+// After pinning, info struct contains details of the mapped area.  
+//
+// Note that both info->va and info->mapped_size might be different from
+// the original address passed to gdr_pin_buffer due to aligning happening
+// in the kernel-mode driver
+struct gdr_info {
+    uint64_t va;
+    uint64_t mapped_size;
+    uint32_t page_size;
+    uint64_t tm_cycles;
+    uint32_t cycles_per_ms;
+};
+typedef struct gdr_info gdr_info_t;
+int gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info);
+
+// create a user-space mapping for the BAR1 info, length is bar1->size
+// above.
+//
+// WARNING: the BAR physical address will be aligned to the page size
+// before being mapped in user-space, so the pointer returned might be
+// affected by an offset. gdr_get_info can be used to calculate that
+// offset.
+int gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size);
+
+// get rid of a user-space mapping.
+// First invoke gdr_unmap() then gdr_unpin_buffer().
+int gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size);
+
+// gpubar_ptr is a user-space virtual address, i.e. one returned by gdr_map()
+int gdr_copy_to_bar(void  *gpubar_ptr, const void *cpumem_ptr, size_t size);
+int gdr_copy_from_bar(void *cpumem_ptr, const void *gpubar_ptr, size_t size);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __GDRAPI_H__

+ 13 - 0
include/gdrconfig.h

@@ -0,0 +1,13 @@
#pragma once

/* Maps the compiler's target macros onto the GDRAPI_<ARCH> switches used
 * by gdrapi.c to pick SIMD copy implementations. GCC/Clang only. */
#if defined __GNUC__
#if defined(__powerpc__)
#define GDRAPI_POWER
#elif defined(__i386__) || defined(__x86_64__) || defined(__X86__)
#define GDRAPI_X86
#else
#error "architecture is not supported"
#endif // arch
#else
#error "compiler not supported"
#endif // __GNUC__

+ 86 - 0
include/gdrdrv.h

@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in 
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
#ifndef __GDR_DRV_H__
#define __GDR_DRV_H__

/* ioctl "magic" number shared with the gdrdrv kernel module. */
#define GDRDRV_IOCTL                 0xDA

/* Driver-side handle for a pinned buffer. Must have the same size as the
 * user-space gdr_mh_t (statically asserted in gdrapi.c). */
typedef __u32 gdr_hnd_t;
#define GDR_HANDLE_MASK ((1UL<<32)-1)

//-----------

struct GDRDRV_IOC_PIN_BUFFER_PARAMS
{
    // in
    __u64 addr;
    __u64 size;
    __u64 p2p_token;
    __u32 va_space;
    // out
    gdr_hnd_t handle;
};

#define GDRDRV_IOC_PIN_BUFFER _IOWR(GDRDRV_IOCTL, 1, struct GDRDRV_IOC_PIN_BUFFER_PARAMS)

//-----------

struct GDRDRV_IOC_UNPIN_BUFFER_PARAMS
{
    // in
    gdr_hnd_t handle;
};

/* NOTE(review): the three ioctls below encode a POINTER type in _IOWR, so
 * the size baked into the request number is sizeof(ptr), not the struct
 * size (unlike GDRDRV_IOC_PIN_BUFFER above). This must match the kernel
 * module's definitions exactly -- confirm before changing either side. */
#define GDRDRV_IOC_UNPIN_BUFFER _IOWR(GDRDRV_IOCTL, 2, struct GDRDRV_IOC_UNPIN_BUFFER_PARAMS *)

//-----------

struct GDRDRV_IOC_GET_CB_FLAG_PARAMS
{
    // in
    gdr_hnd_t handle;
    // out
    __u32 flag;
};

#define GDRDRV_IOC_GET_CB_FLAG _IOWR(GDRDRV_IOCTL, 3, struct GDRDRV_IOC_GET_CB_FLAG_PARAMS *)

//-----------

struct GDRDRV_IOC_GET_INFO_PARAMS
{
    // in
    gdr_hnd_t handle;
    // out
    __u64 va;
    __u64 mapped_size;
    __u32 page_size;
    __u32 tsc_khz;
    __u64 tm_cycles;
};

#define GDRDRV_IOC_GET_INFO _IOWR(GDRDRV_IOCTL, 4, struct GDRDRV_IOC_GET_INFO_PARAMS *)

//-----------

#endif // __GDR_DRV_H__

+ 6 - 0
include/kernels.h

@@ -0,0 +1,6 @@
+#include "cuda.h"
+#include "cuda_runtime_api.h"
+
+__device__ void add_two_device(CUdeviceptr number);
+__global__ void add_three_global(CUdeviceptr number);
+__global__ void add_one_global(void* number);

+ 32 - 0
src/common.cu

@@ -0,0 +1,32 @@
+/* This file contains utility functions such as assertion helpers */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "common.h"
+
/* Checks a CUDA runtime API status; on failure prints the human-readable
 * error string and terminates the process. */
void assert_cuda(cudaError_t err_id)
{
    if( err_id != cudaSuccess )
    {
        /* report on stderr so the message survives stdout redirection */
        fprintf(stderr, "%s\n", cudaGetErrorString(err_id));
        exit(EXIT_FAILURE);
    }
}
+
/* Checks a CUDA driver API status; on failure prints the numeric result
 * code and terminates the process. */
void assert_cu(CUresult res_id)
{
    if( res_id != CUDA_SUCCESS )
    {
        /* report on stderr so the message survives stdout redirection */
        fprintf(stderr, "Error in driver api returned with code: %d\n", res_id);
        exit(EXIT_FAILURE);
    }
}
+
/* Checks a gdrcopy API status (0 == success); on failure prints the code
 * and terminates the process. */
void assert_gdr(int gdr_id)
{
    if( gdr_id != 0 )
    {
        /* report on stderr so the message survives stdout redirection */
        fprintf(stderr, "Error in gdr api returned with code: %d\n", gdr_id);
        exit(EXIT_FAILURE);
    }
}

+ 399 - 0
src/gdrapi.c

@@ -0,0 +1,399 @@
+/*
+ * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in 
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <netdb.h>
+#include <malloc.h>
+#include <getopt.h>
+#include <arpa/inet.h>
+#include <sys/ioctl.h>
+#include <time.h>
+#include <asm/types.h>
+
+#include "gdrapi.h"
+#include "gdrdrv.h"
+#include "gdrconfig.h"
+
+// based on post at http://stackoverflow.com/questions/3385515/static-assert-in-c
+#define STATIC_ASSERT(COND,MSG) typedef char static_assertion_##MSG[(!!(COND))*2-1]
+// token pasting madness
+#define COMPILE_TIME_ASSERT3(X,L) STATIC_ASSERT(X,static_assertion_at_line_##L)
+#define COMPILE_TIME_ASSERT2(X,L) COMPILE_TIME_ASSERT3(X,L)
+#define COMPILE_TIME_ASSERT(X)    COMPILE_TIME_ASSERT2(X,__LINE__)
+
+// hint: use page_size = sysconf(_SC_PAGESIZE) instead
+#ifdef GDRAPI_POWER
+#define PAGE_SHIFT 16
+#else // catching all 4KB page size platforms here
+#define PAGE_SHIFT 12
+#endif
+#define PAGE_SIZE  (1UL << PAGE_SHIFT)
+#define PAGE_MASK  (~(PAGE_SIZE-1))
+
+// logging/tracing
+
+enum gdrcopy_msg_level {
+    GDRCOPY_MSG_DEBUG = 1,
+    GDRCOPY_MSG_INFO,
+    GDRCOPY_MSG_WARN,
+    GDRCOPY_MSG_ERROR
+};
+
+static int gdr_msg_level = GDRCOPY_MSG_ERROR;
+static int gdr_enable_logging = -1;
+
+static void gdr_msg(enum gdrcopy_msg_level lvl, const char* fmt, ...)
+{
+    if (-1 == gdr_enable_logging) {
+        const char *env = getenv("GDRCOPY_ENABLE_LOGGING");
+        if (env)
+            gdr_enable_logging = 1;
+        else
+            gdr_enable_logging = 0;
+
+        env = getenv("GDRCOPY_LOG_LEVEL");
+        if (env)
+            gdr_msg_level = atoi(env);
+    }
+    if (gdr_enable_logging) {
+        if (lvl >= gdr_msg_level) {
+            va_list ap;
+            va_start(ap, fmt);
+            vfprintf(stderr, fmt, ap);
+        }
+    }
+}
+
+#define gdr_dbg(FMT, ARGS...)  gdr_msg(GDRCOPY_MSG_DEBUG, "DBG:  " FMT, ## ARGS)
+#define gdr_dbgc(C, FMT, ARGS...)  do { static int gdr_dbg_cnt=(C); if (gdr_dbg_cnt) { gdr_dbg(FMT, ## ARGS); --gdr_dbg_cnt; }} while (0)
+#define gdr_info(FMT, ARGS...) gdr_msg(GDRCOPY_MSG_INFO,  "INFO: " FMT, ## ARGS)
+#define gdr_warn(FMT, ARGS...) gdr_msg(GDRCOPY_MSG_WARN,  "WARN: " FMT, ## ARGS)
+#define gdr_err(FMT, ARGS...)  gdr_msg(GDRCOPY_MSG_ERROR, "ERR:  " FMT, ## ARGS)
+
+// check GDR HaNDle size
+
+COMPILE_TIME_ASSERT(sizeof(gdr_hnd_t)==sizeof(gdr_mh_t));
+
+
+
+struct gdr {
+    int fd;
+};
+
+gdr_t gdr_open()
+{
+    gdr_t g = NULL;
+    const char *gdrinode = "/dev/gdrdrv";
+
+    g = calloc(1, sizeof(*g));
+    if (!g) {
+        gdr_err("error while allocating memory\n");
+        return NULL;
+    }
+
+    int fd = open(gdrinode, O_RDWR);
+    if (-1 == fd ) {
+        int ret = errno;
+        gdr_err("error opening driver (errno=%d/%s)\n", ret, strerror(ret));
+        free(g);
+        return NULL;
+    }
+
+    g->fd = fd;
+
+    return g;
+}
+
+int gdr_close(gdr_t g)
+{
+    int ret = 0;
+    int retcode = close(g->fd);
+    if (-1 == retcode) {
+        ret = errno;
+        gdr_err("error closing driver (errno=%d/%s)\n", ret, strerror(ret));
+    }
+    g->fd = 0;
+    free(g);
+    return ret;
+}
+
+int gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle)
+{
+    int ret = 0;
+    int retcode;
+
+    struct GDRDRV_IOC_PIN_BUFFER_PARAMS params;
+    params.addr = addr;
+    params.size = size;
+    params.p2p_token = p2p_token;
+    params.va_space = va_space;
+    params.handle = 0;
+
+    retcode = ioctl(g->fd, GDRDRV_IOC_PIN_BUFFER, &params);
+    if (0 != retcode) {
+        ret = errno;
+        gdr_err("ioctl error (errno=%d)\n", ret);
+    }
+    *handle = params.handle;
+
+    return ret;
+}
+
+int gdr_unpin_buffer(gdr_t g, gdr_mh_t handle)
+{
+    int ret = 0;
+    int retcode;
+
+    struct GDRDRV_IOC_UNPIN_BUFFER_PARAMS params;
+    params.handle = handle;
+
+    retcode = ioctl(g->fd, GDRDRV_IOC_UNPIN_BUFFER, &params);
+    if (0 != retcode) {
+        ret = errno;
+        gdr_err("ioctl error (errno=%d)\n", ret);
+    }
+
+    return ret;
+}
+
+int gdr_get_callback_flag(gdr_t g, gdr_mh_t handle, int *flag)
+{
+    int ret = 0;
+    int retcode;
+
+    struct GDRDRV_IOC_GET_CB_FLAG_PARAMS params;
+    params.handle = handle;
+
+    retcode = ioctl(g->fd, GDRDRV_IOC_GET_CB_FLAG, &params);
+    if (0 != retcode) {
+        ret = errno;
+        gdr_err("ioctl error (errno=%d)\n", ret);
+    } else
+        *flag = params.flag;
+
+    return ret;
+}
+
+int gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info)
+{
+    int ret = 0;
+    int retcode;
+
+    struct GDRDRV_IOC_GET_INFO_PARAMS params;
+    params.handle = handle;
+
+    retcode = ioctl(g->fd, GDRDRV_IOC_GET_INFO, &params);
+    if (0 != retcode) {
+        ret = errno;
+        gdr_err("ioctl error (errno=%d)\n", ret);
+    } else {
+        info->va          = params.va;
+        info->mapped_size = params.mapped_size;
+        info->page_size   = params.page_size;
+        info->tm_cycles   = params.tm_cycles;
+        info->cycles_per_ms = params.tsc_khz;
+    }
+    return ret;
+}
+
+int gdr_map(gdr_t g, gdr_mh_t handle, void **ptr_va, size_t size)
+{
+    int ret = 0;
+    gdr_info_t info = {0,};
+
+    ret = gdr_get_info(g, handle, &info);
+    if (ret) {
+        return ret;
+    }
+    size_t rounded_size = (size + PAGE_SIZE - 1) & PAGE_MASK;
+    off_t magic_off = (off_t)handle << PAGE_SHIFT;
+    void *mmio;
+
+    mmio = mmap(NULL, rounded_size, PROT_READ|PROT_WRITE, MAP_SHARED, g->fd, magic_off);
+    if (mmio == MAP_FAILED) {
+        int __errno = errno;
+        mmio = NULL;
+        gdr_err("can't mmap BAR, error=%s(%d) rounded_size=%zu offset=%llx handle=%x\n",
+                strerror(__errno), __errno, rounded_size, (long long unsigned)magic_off, handle);
+        ret = __errno;
+    }
+
+    *ptr_va = mmio;
+
+    return ret;
+}
+
+int gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size)
+{
+    int ret = 0;
+    int retcode = 0;
+    size_t rounded_size = (size + PAGE_SIZE - 1) & PAGE_MASK;
+
+    retcode = munmap(va, rounded_size);
+    if (-1 == retcode) {
+        int __errno = errno;
+        gdr_err("can't unmap BAR, error=%s(%d) rounded_size=%zu\n",
+                strerror(__errno), __errno, rounded_size);
+        ret = __errno;
+    }
+
+    return ret;
+}
+
#ifdef GDRAPI_X86
#include <cpuid.h>

// prepare for AVX2 implementation
#ifndef bit_AVX2
/* Extended Features (%eax == 7) */
/* %ebx */
#define bit_AVX2 (1 << 5)
#endif

#include <immintrin.h>

/* SIMD copy routines implemented in src/memcpy_{avx,sse,sse41}.c. */
extern int memcpy_uncached_store_avx(void *dest, const void *src, size_t n_bytes);
extern int memcpy_cached_store_avx(void *dest, const void *src, size_t n_bytes);
extern int memcpy_uncached_store_sse(void *dest, const void *src, size_t n_bytes);
extern int memcpy_cached_store_sse(void *dest, const void *src, size_t n_bytes);
extern int memcpy_uncached_load_sse41(void *dest, const void *src, size_t n_bytes);
#else // GDRAPI_X86
/* Non-x86 stubs: never selected because the has_* flags below stay 0 on
 * those platforms; the non-zero return marks them as unimplemented. */
static int memcpy_uncached_store_avx(void *dest, const void *src, size_t n_bytes)  { return 1; }
static int memcpy_cached_store_avx(void *dest, const void *src, size_t n_bytes)  { return 1; }
static int memcpy_uncached_store_sse(void *dest, const void *src, size_t n_bytes)    { return 1; }
static int memcpy_cached_store_sse(void *dest, const void *src, size_t n_bytes)    { return 1; }
static int memcpy_uncached_load_sse41(void *dest, const void *src, size_t n_bytes) { return 1; }
#endif // GDRAPI_X86

/* Lazily initialized CPU capability flags (see gdr_init_cpu_flags). */
static int first_time = 1;
static int has_sse = 0;
static int has_sse2 = 0;
static int has_sse4_1 = 0;
static int has_avx = 0;
static int has_avx2 = 0;
+
/* Detects the host CPU's SIMD capabilities via CPUID and caches them in
 * the has_* globals; clears first_time so detection runs only once. */
static void gdr_init_cpu_flags()
{
#ifdef GDRAPI_X86
    /* CPUID leaf 1: SSE4.1/AVX feature bits in ECX, SSE/SSE2 in EDX. */
    unsigned int info_type = 0x00000001;
    unsigned int ax, bx, cx, dx;
    if (__get_cpuid(info_type, &ax, &bx, &cx, &dx) == 1) {
       has_sse4_1 = ((cx & bit_SSE41) != 0);
       has_avx    = ((cx & bit_AVX)    != 0);
       has_sse    = ((dx & bit_SSE)    != 0);
       has_sse2   = ((dx & bit_SSE2)   != 0);
       gdr_dbg("sse4_1=%d avx=%d sse=%d sse2=%d\n", has_sse4_1, has_avx, has_sse, has_sse2);
    }
#ifdef bit_AVX2
    /* CPUID leaf 7: AVX2 bit is in EBX.
     * NOTE(review): leaf 7 is subleaf-indexed and __get_cpuid does not
     * zero ECX -- consider __get_cpuid_count(7, 0, ...) instead; confirm
     * against the toolchain's <cpuid.h>. */
    info_type = 0x7;
    if (__get_cpuid(info_type, &ax, &bx, &cx, &dx) == 1) {
        has_avx2 = bx & bit_AVX2;
    }
#endif // bit_AVX2
#endif // GDRAPI_X86

#ifdef GDRAPI_POWER
    // detect and enable Altivec/SMX support
#endif

    first_time = 0;
}
+
+// note: more than one implementation may be compiled in
+
+
+int gdr_copy_to_bar(void *bar_ptr, const void *h_ptr, size_t size)
+{
+    if (first_time) {
+        gdr_init_cpu_flags();
+    }
+
+    do {
+        // pick the most performing implementation compatible with the platform we are running on
+        if (has_avx) {
+            gdr_dbgc(1, "using AVX implementation of gdr_copy_to_bar\n");
+            memcpy_uncached_store_avx(bar_ptr, h_ptr, size);
+            break;
+        }
+        if (has_sse) {
+            gdr_dbgc(1, "using SSE implementation of gdr_copy_to_bar\n");
+            memcpy_uncached_store_sse(bar_ptr, h_ptr, size);
+            break;
+        }
+        // fall through
+        gdr_dbgc(1, "using plain implementation of gdr_copy_to_bar\n");
+        memcpy(bar_ptr, h_ptr, size);
+    } while (0);
+
+    return 0;
+}
+
+int gdr_copy_from_bar(void *h_ptr, const void *bar_ptr, size_t size)
+{
+    if (first_time) {
+        gdr_init_cpu_flags();
+    }
+
+    do {
+        // pick the most performing implementation compatible with the platform we are running on
+        if (has_sse4_1) {
+            gdr_dbgc(1, "using SSE4_1 implementation of gdr_copy_from_bar\n");
+            memcpy_uncached_load_sse41(h_ptr, bar_ptr, size);
+            break;
+        }
+        if (has_avx) {
+            gdr_dbgc(1, "using AVX implementation of gdr_copy_from_bar\n");
+            memcpy_cached_store_avx(h_ptr, bar_ptr, size);
+            break;
+        }
+        if (has_sse) {
+            gdr_dbgc(1, "using SSE implementation of gdr_copy_from_bar\n");
+            memcpy_cached_store_sse(h_ptr, bar_ptr, size);
+            break;
+        }
+        // fall through
+        gdr_dbgc(1, "using plain implementation of gdr_copy_from_bar\n");
+        memcpy(h_ptr, bar_ptr, size);
+    } while (0);
+
+    return 0;
+}
+
+/*
+ * Local variables:
+ *  c-indent-level: 4
+ *  c-basic-offset: 4
+ *  tab-width: 4
+ *  indent-tabs-mode: nil
+ * End:
+ */

+ 24 - 0
src/kernels.cu

@@ -0,0 +1,24 @@
+/* This file contains the kernels i.e. the functions to be executed on the GPU */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "kernels.h"
+
/* Device-side helper: adds 2 to the int stored at the given device address. */
__device__
void add_two_device(CUdeviceptr number)
{
    (* (int*) number)+=2;
}
+
/* Kernel: adds 3 in total to the pointed-to int (1 here, then 2 via
 * add_two_device). Takes a driver-API CUdeviceptr. */
__global__
void add_three_global(CUdeviceptr number)
{
    (* (int*) number)++;
    add_two_device(number);
}
+
/* Kernel: increments the int behind a raw device pointer. */
__global__
void add_one_global(void* number)
{
    (* (int*) number)++;
}

+ 83 - 0
src/main.cu

@@ -0,0 +1,83 @@
+/* The main program */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "common.h"
+/*#include "cuda.h"*/
+#include "kernels.h"
+#include "gdrapi.h"
+
/*
 * Demo entry point: enumerates GPUs, selects device 0, pins a chunk of
 * its memory with gdrcopy so the CPU can address it through BAR1, then
 * runs a small kernel through the regular CUDA path as a sanity check.
 */
int main()
{
    /* Initialisation of the APIs */
    assert_cu( cuInit(0) );
    gdr_t g = gdr_open();

    /* First check if a NVIDIA GPU is on the system and see which to use.
     * For the time being, use number 0. */
    int i;
    int countGPU;
    CUdevice GPU;
    char nameGPU[GPU_NAME_LENGTH];
    CUdevprop GPUProp;

    assert_cuda( cudaGetDeviceCount(&countGPU) );
    for(i=0; i<countGPU; i++)
    {
        assert_cu( cuDeviceGet(&GPU,i) );
        assert_cu( cuDeviceGetName(nameGPU,GPU_NAME_LENGTH,GPU) );
        printf("GPU %d is %s\n",i,nameGPU);
    }

    assert_cuda( cudaSetDevice(0) );
    assert_cu( cuDeviceGet(&GPU,0) );
    assert_cu( cuDeviceGetProperties(&GPUProp,GPU) );

    /* Check context: print the current device before and after creating
     * an explicit driver-API context. */
    assert_cu( cuCtxGetDevice(&GPU) );
    printf("Device for this context: %d\n",GPU);
    CUcontext ctx;
    assert_cu( cuCtxCreate(&ctx,0,GPU) );
    assert_cu( cuCtxGetDevice(&GPU) );
    printf("Device for this context: %d\n",GPU);

    /* Allocate memory on the device, pin and map.
     * sharedMemPerBlock is just a convenient nonzero size here. */
    CUdeviceptr dptr;
    assert_cu( cuMemAlloc(&dptr,(size_t) GPUProp.sharedMemPerBlock) );
    gdr_mh_t GPUMemHandle;
    assert_gdr( gdr_pin_buffer(g,dptr,(size_t) GPUProp.sharedMemPerBlock,0,0,&GPUMemHandle) );
    void* va;
    assert_gdr( gdr_map(g,GPUMemHandle,&va,(size_t) GPUProp.sharedMemPerBlock) );

    /* The mapping is page-aligned, so recover dptr's offset inside it. */
    gdr_info_t GPUInfo;
    int offset;
    assert_gdr( gdr_get_info(g,GPUMemHandle,&GPUInfo) );
    offset = (GPUInfo.va > dptr) ? GPUInfo.va - dptr : dptr - GPUInfo.va;
    uint32_t *buf_ptr = (uint32_t *)((char *)va + offset);
    (void)buf_ptr; /* direct BAR access is not exercised yet */

    printf("All set\n");

    /* At this point the GPU's mem is mapped to a CPU buffer to enable DMA */

    int set, get;
    printf("Use the nvidia api\n");
    set = 4242;
    get = 0;
    printf("set = %d\nget = %d\n",set,get);
    assert_cu( cuMemcpyHtoD(dptr,&set,sizeof(set)) );
    add_three_global<<< 1,1 >>>(dptr);
    assert_cu( cuMemcpyDtoH(&get,dptr,sizeof(get)) );
    printf("set = %d\nget = %d\n",set,get);

    /* Close everything: unmap and unpin before freeing the GPU buffer,
     * and destroy the context we created (previously leaked). */
    assert_gdr( gdr_unmap(g,GPUMemHandle,va,(size_t) GPUProp.sharedMemPerBlock) );
    assert_gdr( gdr_unpin_buffer(g,GPUMemHandle) );
    assert_gdr( gdr_close(g) );
    assert_cu( cuMemFree(dptr) );
    assert_cu( cuCtxDestroy(ctx) );

    printf("All Cleared\n");

    exit(EXIT_SUCCESS);
}

+ 198 - 0
src/memcpy_avx.c

@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in 
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <immintrin.h>
+
+#ifndef min
+#define min(A,B) ((A)<(B)?(A):(B))
+#endif
+
/*
 * Copies n_bytes from src to dest using AVX non-temporal (streaming)
 * stores, which bypass the cache -- suited to write-combined BAR memory.
 * Head bytes are copied with memcpy until dest is 32-byte aligned, the
 * bulk moves 32 bytes at a time (unrolled), and the tail is finished
 * with memcpy. Returns 0. Compiled only with -mavx (see #error below).
 */
int memcpy_uncached_store_avx(void *dest, const void *src, size_t n_bytes)
{
    int ret = 0;
#ifdef __AVX__
    char *d = (char*)dest;
    uintptr_t d_int = (uintptr_t)d;
    const char *s = (const char *)src;
    uintptr_t s_int = (uintptr_t)s;
    size_t n = n_bytes;

    // align dest to 256-bits
    if (d_int & 0x1f) {
        size_t nh = min(0x20 - (d_int & 0x1f), n);
        memcpy(d, s, nh);
        d += nh; d_int += nh;
        s += nh; s_int += nh;
        n -= nh;
    }

    if (s_int & 0x1f) { // src is not aligned to 256-bits
        __m256d r0,r1,r2,r3;
        // unroll 4: unaligned loads, streaming stores
        while (n >= 4*sizeof(__m256d)) {
            r0 = _mm256_loadu_pd((double *)(s+0*sizeof(__m256d)));
            r1 = _mm256_loadu_pd((double *)(s+1*sizeof(__m256d)));
            r2 = _mm256_loadu_pd((double *)(s+2*sizeof(__m256d)));
            r3 = _mm256_loadu_pd((double *)(s+3*sizeof(__m256d)));
            _mm256_stream_pd((double *)(d+0*sizeof(__m256d)), r0);
            _mm256_stream_pd((double *)(d+1*sizeof(__m256d)), r1);
            _mm256_stream_pd((double *)(d+2*sizeof(__m256d)), r2);
            _mm256_stream_pd((double *)(d+3*sizeof(__m256d)), r3);
            s += 4*sizeof(__m256d);
            d += 4*sizeof(__m256d);
            n -= 4*sizeof(__m256d);
        }
        while (n >= sizeof(__m256d)) {
            r0 = _mm256_loadu_pd((double *)(s));
            _mm256_stream_pd((double *)(d), r0);
            s += sizeof(__m256d);
            d += sizeof(__m256d);
            n -= sizeof(__m256d);
        }
    } else { // or it IS aligned
        __m256d r0,r1,r2,r3,r4,r5,r6,r7;
        // unroll 8: aligned loads, streaming stores
        while (n >= 8*sizeof(__m256d)) {
            r0 = _mm256_load_pd((double *)(s+0*sizeof(__m256d)));
            r1 = _mm256_load_pd((double *)(s+1*sizeof(__m256d)));
            r2 = _mm256_load_pd((double *)(s+2*sizeof(__m256d)));
            r3 = _mm256_load_pd((double *)(s+3*sizeof(__m256d)));
            r4 = _mm256_load_pd((double *)(s+4*sizeof(__m256d)));
            r5 = _mm256_load_pd((double *)(s+5*sizeof(__m256d)));
            r6 = _mm256_load_pd((double *)(s+6*sizeof(__m256d)));
            r7 = _mm256_load_pd((double *)(s+7*sizeof(__m256d)));
            _mm256_stream_pd((double *)(d+0*sizeof(__m256d)), r0);
            _mm256_stream_pd((double *)(d+1*sizeof(__m256d)), r1);
            _mm256_stream_pd((double *)(d+2*sizeof(__m256d)), r2);
            _mm256_stream_pd((double *)(d+3*sizeof(__m256d)), r3);
            _mm256_stream_pd((double *)(d+4*sizeof(__m256d)), r4);
            _mm256_stream_pd((double *)(d+5*sizeof(__m256d)), r5);
            _mm256_stream_pd((double *)(d+6*sizeof(__m256d)), r6);
            _mm256_stream_pd((double *)(d+7*sizeof(__m256d)), r7);
            s += 8*sizeof(__m256d);
            d += 8*sizeof(__m256d);
            n -= 8*sizeof(__m256d);
        }
        while (n >= sizeof(__m256d)) {
            r0 = _mm256_load_pd((double *)(s));
            _mm256_stream_pd((double *)(d), r0);
            s += sizeof(__m256d);
            d += sizeof(__m256d);
            n -= sizeof(__m256d);
        }
    }
    /* fence orders the non-temporal stores before any later writes */
    _mm_sfence();
    if (n)
        memcpy(d, s, n);
#else
#error "this file should be compiled with -mavx"
#endif
    return ret;
}
+
/*
 * Copies n_bytes from src to dest with AVX loads and ordinary (cached)
 * stores -- used for the BAR-to-host direction where the destination is
 * normal cacheable memory. Same head/bulk/tail structure as the
 * uncached variant, but no sfence is needed because the stores are not
 * non-temporal. Returns 0. Compiled only with -mavx.
 */
int memcpy_cached_store_avx(void *dest, const void *src, size_t n_bytes)
{
    int ret = 0;
#ifdef __AVX__
    char *d = (char*)dest;
    uintptr_t d_int = (uintptr_t)d;
    const char *s = (const char *)src;
    uintptr_t s_int = (uintptr_t)s;
    size_t n = n_bytes;

    // align dest to 256-bits
    if (d_int & 0x1f) {
        size_t nh = min(0x20 - (d_int & 0x1f), n);
        memcpy(d, s, nh);
        d += nh; d_int += nh;
        s += nh; s_int += nh;
        n -= nh;
    }

    if (s_int & 0x1f) { // src is not aligned to 256-bits
        __m256d r0,r1,r2,r3;
        // unroll 4: unaligned loads, cached stores
        while (n >= 4*sizeof(__m256d)) {
            r0 = _mm256_loadu_pd((double *)(s+0*sizeof(__m256d)));
            r1 = _mm256_loadu_pd((double *)(s+1*sizeof(__m256d)));
            r2 = _mm256_loadu_pd((double *)(s+2*sizeof(__m256d)));
            r3 = _mm256_loadu_pd((double *)(s+3*sizeof(__m256d)));
            _mm256_store_pd((double *)(d+0*sizeof(__m256d)), r0);
            _mm256_store_pd((double *)(d+1*sizeof(__m256d)), r1);
            _mm256_store_pd((double *)(d+2*sizeof(__m256d)), r2);
            _mm256_store_pd((double *)(d+3*sizeof(__m256d)), r3);
            s += 4*sizeof(__m256d);
            d += 4*sizeof(__m256d);
            n -= 4*sizeof(__m256d);
        }
        while (n >= sizeof(__m256d)) {
            r0 = _mm256_loadu_pd((double *)(s));
            _mm256_store_pd((double *)(d), r0);
            s += sizeof(__m256d);
            d += sizeof(__m256d);
            n -= sizeof(__m256d);
        }
    } else { // or it IS aligned
        __m256d r0,r1,r2,r3;
        // unroll 4: aligned loads, cached stores
        while (n >= 4*sizeof(__m256d)) {
            r0 = _mm256_load_pd((double *)(s+0*sizeof(__m256d)));
            r1 = _mm256_load_pd((double *)(s+1*sizeof(__m256d)));
            r2 = _mm256_load_pd((double *)(s+2*sizeof(__m256d)));
            r3 = _mm256_load_pd((double *)(s+3*sizeof(__m256d)));
            _mm256_store_pd((double *)(d+0*sizeof(__m256d)), r0);
            _mm256_store_pd((double *)(d+1*sizeof(__m256d)), r1);
            _mm256_store_pd((double *)(d+2*sizeof(__m256d)), r2);
            _mm256_store_pd((double *)(d+3*sizeof(__m256d)), r3);
            s += 4*sizeof(__m256d);
            d += 4*sizeof(__m256d);
            n -= 4*sizeof(__m256d);
        }
        while (n >= sizeof(__m256d)) {
            r0 = _mm256_load_pd((double *)(s));
            _mm256_store_pd((double *)(d), r0);
            s += sizeof(__m256d);
            d += sizeof(__m256d);
            n -= sizeof(__m256d);
        }
    }
    if (n)
        memcpy(d, s, n);
#else
#error "this file should be compiled with -mavx"
#endif
    return ret;
}
+
+// add variant for _mm_stream_load_si256() / VMOVNTDQA
+
+/*
+ * Local variables:
+ *  c-indent-level: 4
+ *  c-basic-offset: 4
+ *  tab-width: 4
+ *  indent-tabs-mode: nil
+ * End:
+ */

+ 188 - 0
src/memcpy_sse.c

@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in 
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <immintrin.h>
+
+#ifndef min
+#define min(A,B) ((A)<(B)?(A):(B))
+#endif
+
int memcpy_uncached_store_sse(void *dest, const void *src, size_t n_bytes)
{
    int ret = 0;
#ifdef __SSE__
    /*
     * Copy n_bytes from src to dest using MOVNTPS (non-temporal stores),
     * writing the destination without polluting the cache.  The unaligned
     * head and the sub-16-byte tail fall back to plain memcpy.
     * Always returns 0.
     */
    char *dst = (char *)dest;
    const char *sp = (const char *)src;
    size_t left = n_bytes;

    /* Scalar-copy until dst sits on a 16-byte boundary, as required by
     * _mm_stream_ps. */
    if ((uintptr_t)dst & 0xf) {
        size_t head = 0x10 - ((uintptr_t)dst & 0x0f);
        if (head > left)
            head = left;
        memcpy(dst, sp, head);
        dst += head;
        sp += head;
        left -= head;
    }

    if (((uintptr_t)sp & 0xf) == 0) {
        /* Source is 16-byte aligned: aligned loads, 4-wide unrolled. */
        while (left >= 4 * sizeof(__m128)) {
            __m128 v0 = _mm_load_ps((const float *)(sp + 0 * sizeof(__m128)));
            __m128 v1 = _mm_load_ps((const float *)(sp + 1 * sizeof(__m128)));
            __m128 v2 = _mm_load_ps((const float *)(sp + 2 * sizeof(__m128)));
            __m128 v3 = _mm_load_ps((const float *)(sp + 3 * sizeof(__m128)));
            _mm_stream_ps((float *)(dst + 0 * sizeof(__m128)), v0);
            _mm_stream_ps((float *)(dst + 1 * sizeof(__m128)), v1);
            _mm_stream_ps((float *)(dst + 2 * sizeof(__m128)), v2);
            _mm_stream_ps((float *)(dst + 3 * sizeof(__m128)), v3);
            sp += 4 * sizeof(__m128);
            dst += 4 * sizeof(__m128);
            left -= 4 * sizeof(__m128);
        }
        while (left >= sizeof(__m128)) {
            _mm_stream_ps((float *)dst, _mm_load_ps((const float *)sp));
            sp += sizeof(__m128);
            dst += sizeof(__m128);
            left -= sizeof(__m128);
        }
    } else {
        /* Source is misaligned: unaligned loads, 4-wide unrolled. */
        while (left >= 4 * sizeof(__m128)) {
            __m128 v0 = _mm_loadu_ps((const float *)(sp + 0 * sizeof(__m128)));
            __m128 v1 = _mm_loadu_ps((const float *)(sp + 1 * sizeof(__m128)));
            __m128 v2 = _mm_loadu_ps((const float *)(sp + 2 * sizeof(__m128)));
            __m128 v3 = _mm_loadu_ps((const float *)(sp + 3 * sizeof(__m128)));
            _mm_stream_ps((float *)(dst + 0 * sizeof(__m128)), v0);
            _mm_stream_ps((float *)(dst + 1 * sizeof(__m128)), v1);
            _mm_stream_ps((float *)(dst + 2 * sizeof(__m128)), v2);
            _mm_stream_ps((float *)(dst + 3 * sizeof(__m128)), v3);
            sp += 4 * sizeof(__m128);
            dst += 4 * sizeof(__m128);
            left -= 4 * sizeof(__m128);
        }
        while (left >= sizeof(__m128)) {
            _mm_stream_ps((float *)dst, _mm_loadu_ps((const float *)sp));
            sp += sizeof(__m128);
            dst += sizeof(__m128);
            left -= sizeof(__m128);
        }
    }
    _mm_sfence(); /* order the non-temporal stores before returning */
    if (left)
        memcpy(dst, sp, left); /* tail shorter than one vector */
#else
#error "this file should be compiled with -msse"
#endif
    return ret;
}
+
int memcpy_cached_store_sse(void *dest, const void *src, size_t n_bytes)
{
    int ret = 0;
#ifdef __SSE__
    /*
     * Copy n_bytes from src to dest using regular (cached) SSE stores.
     * The unaligned head and the sub-16-byte tail fall back to memcpy.
     * No sfence is needed here: the stores are ordinary write-back stores.
     * Always returns 0.
     */
    char *dst = (char *)dest;
    const char *sp = (const char *)src;
    size_t left = n_bytes;

    /* Scalar-copy until dst sits on a 16-byte boundary, as required by
     * _mm_store_ps. */
    if ((uintptr_t)dst & 0xf) {
        size_t head = 0x10 - ((uintptr_t)dst & 0x0f);
        if (head > left)
            head = left;
        memcpy(dst, sp, head);
        dst += head;
        sp += head;
        left -= head;
    }

    if (((uintptr_t)sp & 0xf) == 0) {
        /* Source is 16-byte aligned: aligned loads, 4-wide unrolled. */
        while (left >= 4 * sizeof(__m128)) {
            __m128 v0 = _mm_load_ps((const float *)(sp + 0 * sizeof(__m128)));
            __m128 v1 = _mm_load_ps((const float *)(sp + 1 * sizeof(__m128)));
            __m128 v2 = _mm_load_ps((const float *)(sp + 2 * sizeof(__m128)));
            __m128 v3 = _mm_load_ps((const float *)(sp + 3 * sizeof(__m128)));
            _mm_store_ps((float *)(dst + 0 * sizeof(__m128)), v0);
            _mm_store_ps((float *)(dst + 1 * sizeof(__m128)), v1);
            _mm_store_ps((float *)(dst + 2 * sizeof(__m128)), v2);
            _mm_store_ps((float *)(dst + 3 * sizeof(__m128)), v3);
            sp += 4 * sizeof(__m128);
            dst += 4 * sizeof(__m128);
            left -= 4 * sizeof(__m128);
        }
        while (left >= sizeof(__m128)) {
            _mm_store_ps((float *)dst, _mm_load_ps((const float *)sp));
            sp += sizeof(__m128);
            dst += sizeof(__m128);
            left -= sizeof(__m128);
        }
    } else {
        /* Source is misaligned: unaligned loads, 4-wide unrolled. */
        while (left >= 4 * sizeof(__m128)) {
            __m128 v0 = _mm_loadu_ps((const float *)(sp + 0 * sizeof(__m128)));
            __m128 v1 = _mm_loadu_ps((const float *)(sp + 1 * sizeof(__m128)));
            __m128 v2 = _mm_loadu_ps((const float *)(sp + 2 * sizeof(__m128)));
            __m128 v3 = _mm_loadu_ps((const float *)(sp + 3 * sizeof(__m128)));
            _mm_store_ps((float *)(dst + 0 * sizeof(__m128)), v0);
            _mm_store_ps((float *)(dst + 1 * sizeof(__m128)), v1);
            _mm_store_ps((float *)(dst + 2 * sizeof(__m128)), v2);
            _mm_store_ps((float *)(dst + 3 * sizeof(__m128)), v3);
            sp += 4 * sizeof(__m128);
            dst += 4 * sizeof(__m128);
            left -= 4 * sizeof(__m128);
        }
        while (left >= sizeof(__m128)) {
            _mm_store_ps((float *)dst, _mm_loadu_ps((const float *)sp));
            sp += sizeof(__m128);
            dst += sizeof(__m128);
            left -= sizeof(__m128);
        }
    }
    if (left)
        memcpy(dst, sp, left); /* tail shorter than one vector */
#else
#error "this file should be compiled with -msse"
#endif
    return ret;
}
+
+/*
+ * Local variables:
+ *  c-indent-level: 4
+ *  c-basic-offset: 4
+ *  tab-width: 4
+ *  indent-tabs-mode: nil
+ * End:
+ */

+ 134 - 0
src/memcpy_sse41.c

@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in 
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <immintrin.h>
+
+#ifndef min
+#define min(A,B) ((A)<(B)?(A):(B))
+#endif
+
+// implementation of copy from BAR using MOVNTDQA 
+// suggested by Nicholas Wilt <nwilt@amazon.com>
+
+int memcpy_uncached_load_sse41(void *dest, const void *src, size_t n_bytes)
+{
+    int ret = 0;
+#ifdef __SSE4_1__
+    char *d = (char*)dest;
+    uintptr_t d_int = (uintptr_t)d;
+    const char *s = (const char *)src;
+    uintptr_t s_int = (uintptr_t)s;
+    size_t n = n_bytes;
+
+    // align src to 128-bits
+    if (s_int & 0xf) {
+        size_t nh = min(0x10 - (s_int & 0x0f), n);
+        memcpy(d, s, nh);
+        d += nh; d_int += nh;
+        s += nh; s_int += nh;
+        n -= nh;
+    }
+
+    if (d_int & 0xf) { // dest is not aligned to 128-bits
+        __m128i r0,r1,r2,r3,r4,r5,r6,r7;
+        // unroll 8
+        while (n >= 8*sizeof(__m128i)) {
+            r0 = _mm_stream_load_si128 ((__m128i *)(s+0*sizeof(__m128i)));
+            r1 = _mm_stream_load_si128 ((__m128i *)(s+1*sizeof(__m128i)));
+            r2 = _mm_stream_load_si128 ((__m128i *)(s+2*sizeof(__m128i)));
+            r3 = _mm_stream_load_si128 ((__m128i *)(s+3*sizeof(__m128i)));
+            r4 = _mm_stream_load_si128 ((__m128i *)(s+4*sizeof(__m128i)));
+            r5 = _mm_stream_load_si128 ((__m128i *)(s+5*sizeof(__m128i)));
+            r6 = _mm_stream_load_si128 ((__m128i *)(s+6*sizeof(__m128i)));
+            r7 = _mm_stream_load_si128 ((__m128i *)(s+7*sizeof(__m128i)));
+            _mm_storeu_si128((__m128i *)(d+0*sizeof(__m128i)), r0);
+            _mm_storeu_si128((__m128i *)(d+1*sizeof(__m128i)), r1);
+            _mm_storeu_si128((__m128i *)(d+2*sizeof(__m128i)), r2);
+            _mm_storeu_si128((__m128i *)(d+3*sizeof(__m128i)), r3);
+            _mm_storeu_si128((__m128i *)(d+4*sizeof(__m128i)), r4);
+            _mm_storeu_si128((__m128i *)(d+5*sizeof(__m128i)), r5);
+            _mm_storeu_si128((__m128i *)(d+6*sizeof(__m128i)), r6);
+            _mm_storeu_si128((__m128i *)(d+7*sizeof(__m128i)), r7);
+            s += 8*sizeof(__m128i);
+            d += 8*sizeof(__m128i);
+            n -= 8*sizeof(__m128i);
+        }
+        while (n >= sizeof(__m128i)) {
+            r0 = _mm_stream_load_si128 ((__m128i *)(s+0*sizeof(__m128i)));
+            _mm_storeu_si128((__m128i *)(d+0*sizeof(__m128i)), r0);
+            s += sizeof(__m128i);
+            d += sizeof(__m128i);
+            n -= sizeof(__m128i);
+        }
+    } else { // or it IS aligned
+        __m128i r0,r1,r2,r3,r4,r5,r6,r7;
+        // unroll 8
+        while (n >= 4*sizeof(__m128i)) {
+            r0 = _mm_stream_load_si128 ((__m128i *)(s+0*sizeof(__m128i)));
+            r1 = _mm_stream_load_si128 ((__m128i *)(s+1*sizeof(__m128i)));
+            r2 = _mm_stream_load_si128 ((__m128i *)(s+2*sizeof(__m128i)));
+            r3 = _mm_stream_load_si128 ((__m128i *)(s+3*sizeof(__m128i)));
+            r4 = _mm_stream_load_si128 ((__m128i *)(s+4*sizeof(__m128i)));
+            r5 = _mm_stream_load_si128 ((__m128i *)(s+5*sizeof(__m128i)));
+            r6 = _mm_stream_load_si128 ((__m128i *)(s+6*sizeof(__m128i)));
+            r7 = _mm_stream_load_si128 ((__m128i *)(s+7*sizeof(__m128i)));
+            _mm_stream_si128((__m128i *)(d+0*sizeof(__m128i)), r0);
+            _mm_stream_si128((__m128i *)(d+1*sizeof(__m128i)), r1);
+            _mm_stream_si128((__m128i *)(d+2*sizeof(__m128i)), r2);
+            _mm_stream_si128((__m128i *)(d+3*sizeof(__m128i)), r3);
+            _mm_stream_si128((__m128i *)(d+4*sizeof(__m128i)), r4);
+            _mm_stream_si128((__m128i *)(d+5*sizeof(__m128i)), r5);
+            _mm_stream_si128((__m128i *)(d+6*sizeof(__m128i)), r6);
+            _mm_stream_si128((__m128i *)(d+7*sizeof(__m128i)), r7);
+            s += 8*sizeof(__m128i);
+            d += 8*sizeof(__m128i);
+            n -= 8*sizeof(__m128i);
+        }
+        while (n >= sizeof(__m128i)) {
+            r0 = _mm_stream_load_si128 ((__m128i *)(s+0*sizeof(__m128i)));
+            _mm_stream_si128((__m128i *)(d+0*sizeof(__m128i)), r0);
+            s += sizeof(__m128i);
+            d += sizeof(__m128i);
+            n -= sizeof(__m128i);
+        }
+    }
+    _mm_sfence();
+    if (n)
+        memcpy(d, s, n);
+#else
+#error "this file should be compiled with -msse4.1"
+#endif
+    return ret;
+}
+
+
+/*
+ * Local variables:
+ *  c-indent-level: 4
+ *  c-basic-offset: 4
+ *  tab-width: 4
+ *  indent-tabs-mode: nil
+ * End:
+ */