Ver código fonte

Initial commit with a working simple app

mathiasb 7 anos atrás
commit
4bd7af2650
14 arquivos alterados com 1334 adições e 0 exclusões
  1. 26 0
      CMakeLists.txt
  2. 6 0
      build/reload_cmake.sh
  3. 15 0
      include/common.h
  4. 124 0
      include/gdrapi.h
  5. 13 0
      include/gdrconfig.h
  6. 86 0
      include/gdrdrv.h
  7. 6 0
      include/kernels.h
  8. 32 0
      src/common.cu
  9. 399 0
      src/gdrapi.c
  10. 24 0
      src/kernels.cu
  11. 83 0
      src/main.cu
  12. 198 0
      src/memcpy_avx.c
  13. 188 0
      src/memcpy_sse.c
  14. 134 0
      src/memcpy_sse41.c

+ 26 - 0
CMakeLists.txt

@@ -0,0 +1,26 @@
# File to generate the makefile.
# Uses src, include and build folders.

# cmake_minimum_required() must be the first command so that policies are
# established before project() configures the languages.
cmake_minimum_required(VERSION 2.6)
project(GPUFirstComm)

find_package(CUDA REQUIRED)

include_directories(include)

#link_directories(${CUDA_LIBRARY_DIRS})

set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS})
# Build the plain-C SIMD memcpy files with clang and the SSE/AVX flags
# they require (their bodies are guarded by __SSE__/__AVX__).
set(CMAKE_C_COMPILER "/usr/bin/clang")
set(CMAKE_C_FLAGS "-msse -msse4.1 -mavx")

cuda_add_executable(gpufirstcomm
  src/main.cu
  src/common.cu
  src/gdrapi.c
  src/memcpy_avx.c
  src/memcpy_sse41.c
  src/memcpy_sse.c
  src/kernels.cu)

target_link_libraries(gpufirstcomm cuda)

+ 6 - 0
build/reload_cmake.sh

@@ -0,0 +1,6 @@
#!/bin/bash
# Used to reload the CMakeLists.txt file with clang as a compiler instead of gcc.
# Must be run from inside the build/ directory: it wipes the current
# directory and re-runs cmake against the parent source tree.
# NOTE: the shebang must be the very first line of the file, otherwise the
# kernel ignores it and the script runs under the caller's shell.

set -e

# Refuse to run outside a directory named "build" so that "rm -rf ./*"
# cannot accidentally delete the source tree.
if [ "$(basename "$PWD")" != "build" ]; then
    echo "error: run this script from inside the build/ directory" >&2
    exit 1
fi

rm -rf ./*
CC=clang CXX=clang cmake ..

+ 15 - 0
include/common.h

@@ -0,0 +1,15 @@
#ifndef _COMMON_H_
#define _COMMON_H_

#include "cuda.h"
#include "cuda_runtime_api.h"

/* Generic success/failure codes for assertion-style checks. */
#define ASSERT_FAIL 0
#define ASSERT_SUCCESS 1
/* Buffer length used when querying a device name with cuDeviceGetName(). */
#define GPU_NAME_LENGTH 30

/* Error-checking helpers (defined in src/common.cu): each prints a
 * diagnostic and terminates the process when the status is not success. */
void assert_cuda(cudaError_t err_id); /* for runtime api*/
void assert_cu(CUresult res_id); /* for driver api */
void assert_gdr(int gdr_id);

#endif

+ 124 - 0
include/gdrapi.h

@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in 
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __GDRAPI_H__
+#define __GDRAPI_H__
+
+#include <stdint.h> // for standard [u]intX_t types
+#include <stddef.h>
+
+#define GDR_API_MAJOR_VERSION    1
+#define GDR_API_MINOR_VERSION    2
+#define GDR_API_VERSION          ((GDR_API_MAJOR_VERSION << 16) | GDR_API_MINOR_VERSION)
+
+
+#define GPU_PAGE_SHIFT   16
+#define GPU_PAGE_SIZE    (1UL << GPU_PAGE_SHIFT)
+#define GPU_PAGE_OFFSET  (GPU_PAGE_SIZE-1)
+#define GPU_PAGE_MASK    (~GPU_PAGE_OFFSET)
+
+/*
+ * GDRCopy, a low-latency GPU memory copy library (and a kernel-mode
+ * driver) based on NVIDIA GPUDirect RDMA technology.
+ *
+ * supported environment variables:
+ *
+ * - GDRCOPY_ENABLE_LOGGING, if defined logging is enabled, default is
+ *   disabled.
+ *
+ * - GDRCOPY_LOG_LEVEL, overrides log threshold, default is to print errors
+ *   only.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct gdr;
+typedef struct gdr *gdr_t;
+
+// Initialize the library, e.g. by opening a connection to the kernel-mode
+// driver. Returns an handle to the library state object.
+gdr_t gdr_open();
+
+// Destroy library state object, e.g. it closes the connection to kernel-mode
+// driver.
+//
+// Note that although BAR mappings of GPU memory are destroyed, user-space
+// mappings are not. User code is therefore responsible for calling
+// gdr_unmap on all mappings before calling gdr_close.
+int gdr_close(gdr_t g);
+
+// Map device memory buffer on GPU BAR1, returning an handle.
+// Memory is still not accessible to user-space.
+typedef uint32_t gdr_mh_t;
+int gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle);
+
+// Unmap the handle. 
+//
+// If there exists a corresponding user-space mapping, gdr_unmap should be
+// called before this one.
+int gdr_unpin_buffer(gdr_t g, gdr_mh_t handle);
+
+// flag is set when the kernel callback (relative to the
+// nvidia_p2p_get_pages) gets invoked, e.g. cuMemFree() before
+// gdr_unpin_buffer.
+int gdr_get_callback_flag(gdr_t g, gdr_mh_t handle, int *flag);
+
+// After pinning, info struct contains details of the mapped area.  
+//
+// Note that both info->va and info->mapped_size might be different from
+// the original address passed to gdr_pin_buffer due to aligning happening
+// in the kernel-mode driver
+struct gdr_info {
+    uint64_t va;
+    uint64_t mapped_size;
+    uint32_t page_size;
+    uint64_t tm_cycles;
+    uint32_t cycles_per_ms;
+};
+typedef struct gdr_info gdr_info_t;
+int gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info);
+
+// create a user-space mapping for the BAR1 info, length is bar1->size
+// above.
+//
+// WARNING: the BAR physical address will be aligned to the page size
+// before being mapped in user-space, so the pointer returned might be
+// affected by an offset. gdr_get_info can be used to calculate that
+// offset.
+int gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size);
+
+// get rid of a user-space mapping.
+// First invoke gdr_unmap() then gdr_unpin_buffer().
+int gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size);
+
+// gpubar_ptr is a user-space virtual address, i.e. one returned by gdr_map()
+int gdr_copy_to_bar(void  *gpubar_ptr, const void *cpumem_ptr, size_t size);
+int gdr_copy_from_bar(void *cpumem_ptr, const void *gpubar_ptr, size_t size);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __GDRAPI_H__

+ 13 - 0
include/gdrconfig.h

@@ -0,0 +1,13 @@
#pragma once

/* Maps the compiler's target macros onto the GDRAPI_<ARCH> switches used
 * by gdrapi.c to pick SIMD copy implementations. GCC/Clang only. */
#if defined __GNUC__
#if defined(__powerpc__)
#define GDRAPI_POWER
#elif defined(__i386__) || defined(__x86_64__) || defined(__X86__)
#define GDRAPI_X86
#else
#error "architecture is not supported"
#endif // arch
#else
#error "compiler not supported"
#endif // __GNUC__

+ 86 - 0
include/gdrdrv.h

@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in 
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
#ifndef __GDR_DRV_H__
#define __GDR_DRV_H__

/* ioctl "magic" number shared with the gdrdrv kernel module. */
#define GDRDRV_IOCTL                 0xDA

/* Driver-side handle for a pinned buffer. Must have the same size as the
 * user-space gdr_mh_t (statically asserted in gdrapi.c). */
typedef __u32 gdr_hnd_t;
#define GDR_HANDLE_MASK ((1UL<<32)-1)

//-----------

struct GDRDRV_IOC_PIN_BUFFER_PARAMS
{
    // in
    __u64 addr;
    __u64 size;
    __u64 p2p_token;
    __u32 va_space;
    // out
    gdr_hnd_t handle;
};

#define GDRDRV_IOC_PIN_BUFFER _IOWR(GDRDRV_IOCTL, 1, struct GDRDRV_IOC_PIN_BUFFER_PARAMS)

//-----------

struct GDRDRV_IOC_UNPIN_BUFFER_PARAMS
{
    // in
    gdr_hnd_t handle;
};

/* NOTE(review): the three ioctls below encode a POINTER type in _IOWR, so
 * the size baked into the request number is sizeof(ptr), not the struct
 * size (unlike GDRDRV_IOC_PIN_BUFFER above). This must match the kernel
 * module's definitions exactly -- confirm before changing either side. */
#define GDRDRV_IOC_UNPIN_BUFFER _IOWR(GDRDRV_IOCTL, 2, struct GDRDRV_IOC_UNPIN_BUFFER_PARAMS *)

//-----------

struct GDRDRV_IOC_GET_CB_FLAG_PARAMS
{
    // in
    gdr_hnd_t handle;
    // out
    __u32 flag;
};

#define GDRDRV_IOC_GET_CB_FLAG _IOWR(GDRDRV_IOCTL, 3, struct GDRDRV_IOC_GET_CB_FLAG_PARAMS *)

//-----------

struct GDRDRV_IOC_GET_INFO_PARAMS
{
    // in
    gdr_hnd_t handle;
    // out
    __u64 va;
    __u64 mapped_size;
    __u32 page_size;
    __u32 tsc_khz;
    __u64 tm_cycles;
};

#define GDRDRV_IOC_GET_INFO _IOWR(GDRDRV_IOCTL, 4, struct GDRDRV_IOC_GET_INFO_PARAMS *)

//-----------

#endif // __GDR_DRV_H__

+ 6 - 0
include/kernels.h

@@ -0,0 +1,6 @@
+#include "cuda.h"
+#include "cuda_runtime_api.h"
+
+__device__ void add_two_device(CUdeviceptr number);
+__global__ void add_three_global(CUdeviceptr number);
+__global__ void add_one_global(void* number);

+ 32 - 0
src/common.cu

@@ -0,0 +1,32 @@
+/* This file contains utility functions such as assertion helpers */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "common.h"
+
/* Checks a CUDA runtime API status; on failure prints the human-readable
 * error string and terminates the process. */
void assert_cuda(cudaError_t err_id)
{
    if( err_id != cudaSuccess )
    {
        /* report on stderr so the message survives stdout redirection */
        fprintf(stderr, "%s\n", cudaGetErrorString(err_id));
        exit(EXIT_FAILURE);
    }
}
+
/* Checks a CUDA driver API status; on failure prints the numeric result
 * code and terminates the process. */
void assert_cu(CUresult res_id)
{
    if( res_id != CUDA_SUCCESS )
    {
        /* report on stderr so the message survives stdout redirection */
        fprintf(stderr, "Error in driver api returned with code: %d\n", res_id);
        exit(EXIT_FAILURE);
    }
}
+
/* Checks a gdrcopy API status (0 == success); on failure prints the code
 * and terminates the process. */
void assert_gdr(int gdr_id)
{
    if( gdr_id != 0 )
    {
        /* report on stderr so the message survives stdout redirection */
        fprintf(stderr, "Error in gdr api returned with code: %d\n", gdr_id);
        exit(EXIT_FAILURE);
    }
}

+ 399 - 0
src/gdrapi.c

@@ -0,0 +1,399 @@
+/*
+ * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in 
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <netdb.h>
+#include <malloc.h>
+#include <getopt.h>
+#include <arpa/inet.h>
+#include <sys/ioctl.h>
+#include <time.h>
+#include <asm/types.h>
+
+#include "gdrapi.h"
+#include "gdrdrv.h"
+#include "gdrconfig.h"
+
+// based on post at http://stackoverflow.com/questions/3385515/static-assert-in-c
+#define STATIC_ASSERT(COND,MSG) typedef char static_assertion_##MSG[(!!(COND))*2-1]
+// token pasting madness
+#define COMPILE_TIME_ASSERT3(X,L) STATIC_ASSERT(X,static_assertion_at_line_##L)
+#define COMPILE_TIME_ASSERT2(X,L) COMPILE_TIME_ASSERT3(X,L)
+#define COMPILE_TIME_ASSERT(X)    COMPILE_TIME_ASSERT2(X,__LINE__)
+
+// hint: use page_size = sysconf(_SC_PAGESIZE) instead
+#ifdef GDRAPI_POWER
+#define PAGE_SHIFT 16
+#else // catching all 4KB page size platforms here
+#define PAGE_SHIFT 12
+#endif
+#define PAGE_SIZE  (1UL << PAGE_SHIFT)
+#define PAGE_MASK  (~(PAGE_SIZE-1))
+
+// logging/tracing
+
+enum gdrcopy_msg_level {
+    GDRCOPY_MSG_DEBUG = 1,
+    GDRCOPY_MSG_INFO,
+    GDRCOPY_MSG_WARN,
+    GDRCOPY_MSG_ERROR
+};
+
+static int gdr_msg_level = GDRCOPY_MSG_ERROR;
+static int gdr_enable_logging = -1;
+
+static void gdr_msg(enum gdrcopy_msg_level lvl, const char* fmt, ...)
+{
+    if (-1 == gdr_enable_logging) {
+        const char *env = getenv("GDRCOPY_ENABLE_LOGGING");
+        if (env)
+            gdr_enable_logging = 1;
+        else
+            gdr_enable_logging = 0;
+
+        env = getenv("GDRCOPY_LOG_LEVEL");
+        if (env)
+            gdr_msg_level = atoi(env);
+    }
+    if (gdr_enable_logging) {
+        if (lvl >= gdr_msg_level) {
+            va_list ap;
+            va_start(ap, fmt);
+            vfprintf(stderr, fmt, ap);
+        }
+    }
+}
+
+#define gdr_dbg(FMT, ARGS...)  gdr_msg(GDRCOPY_MSG_DEBUG, "DBG:  " FMT, ## ARGS)
+#define gdr_dbgc(C, FMT, ARGS...)  do { static int gdr_dbg_cnt=(C); if (gdr_dbg_cnt) { gdr_dbg(FMT, ## ARGS); --gdr_dbg_cnt; }} while (0)
+#define gdr_info(FMT, ARGS...) gdr_msg(GDRCOPY_MSG_INFO,  "INFO: " FMT, ## ARGS)
+#define gdr_warn(FMT, ARGS...) gdr_msg(GDRCOPY_MSG_WARN,  "WARN: " FMT, ## ARGS)
+#define gdr_err(FMT, ARGS...)  gdr_msg(GDRCOPY_MSG_ERROR, "ERR:  " FMT, ## ARGS)
+
+// check GDR HaNDle size
+
+COMPILE_TIME_ASSERT(sizeof(gdr_hnd_t)==sizeof(gdr_mh_t));
+
+
+
+struct gdr {
+    int fd;
+};
+
+gdr_t gdr_open()
+{
+    gdr_t g = NULL;
+    const char *gdrinode = "/dev/gdrdrv";
+
+    g = calloc(1, sizeof(*g));
+    if (!g) {
+        gdr_err("error while allocating memory\n");
+        return NULL;
+    }
+
+    int fd = open(gdrinode, O_RDWR);
+    if (-1 == fd ) {
+        int ret = errno;
+        gdr_err("error opening driver (errno=%d/%s)\n", ret, strerror(ret));
+        free(g);
+        return NULL;
+    }
+
+    g->fd = fd;
+
+    return g;
+}
+
+int gdr_close(gdr_t g)
+{
+    int ret = 0;
+    int retcode = close(g->fd);
+    if (-1 == retcode) {
+        ret = errno;
+        gdr_err("error closing driver (errno=%d/%s)\n", ret, strerror(ret));
+    }
+    g->fd = 0;
+    free(g);
+    return ret;
+}
+
+int gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle)
+{
+    int ret = 0;
+    int retcode;
+
+    struct GDRDRV_IOC_PIN_BUFFER_PARAMS params;
+    params.addr = addr;
+    params.size = size;
+    params.p2p_token = p2p_token;
+    params.va_space = va_space;
+    params.handle = 0;
+
+    retcode = ioctl(g->fd, GDRDRV_IOC_PIN_BUFFER, &params);
+    if (0 != retcode) {
+        ret = errno;
+        gdr_err("ioctl error (errno=%d)\n", ret);
+    }
+    *handle = params.handle;
+
+    return ret;
+}
+
+int gdr_unpin_buffer(gdr_t g, gdr_mh_t handle)
+{
+    int ret = 0;
+    int retcode;
+
+    struct GDRDRV_IOC_UNPIN_BUFFER_PARAMS params;
+    params.handle = handle;
+
+    retcode = ioctl(g->fd, GDRDRV_IOC_UNPIN_BUFFER, &params);
+    if (0 != retcode) {
+        ret = errno;
+        gdr_err("ioctl error (errno=%d)\n", ret);
+    }
+
+    return ret;
+}
+
+int gdr_get_callback_flag(gdr_t g, gdr_mh_t handle, int *flag)
+{
+    int ret = 0;
+    int retcode;
+
+    struct GDRDRV_IOC_GET_CB_FLAG_PARAMS params;
+    params.handle = handle;
+
+    retcode = ioctl(g->fd, GDRDRV_IOC_GET_CB_FLAG, &params);
+    if (0 != retcode) {
+        ret = errno;
+        gdr_err("ioctl error (errno=%d)\n", ret);
+    } else
+        *flag = params.flag;
+
+    return ret;
+}
+
+int gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info)
+{
+    int ret = 0;
+    int retcode;
+
+    struct GDRDRV_IOC_GET_INFO_PARAMS params;
+    params.handle = handle;
+
+    retcode = ioctl(g->fd, GDRDRV_IOC_GET_INFO, &params);
+    if (0 != retcode) {
+        ret = errno;
+        gdr_err("ioctl error (errno=%d)\n", ret);
+    } else {
+        info->va          = params.va;
+        info->mapped_size = params.mapped_size;
+        info->page_size   = params.page_size;
+        info->tm_cycles   = params.tm_cycles;
+        info->cycles_per_ms = params.tsc_khz;
+    }
+    return ret;
+}
+
+int gdr_map(gdr_t g, gdr_mh_t handle, void **ptr_va, size_t size)
+{
+    int ret = 0;
+    gdr_info_t info = {0,};
+
+    ret = gdr_get_info(g, handle, &info);
+    if (ret) {
+        return ret;
+    }
+    size_t rounded_size = (size + PAGE_SIZE - 1) & PAGE_MASK;
+    off_t magic_off = (off_t)handle << PAGE_SHIFT;
+    void *mmio;
+
+    mmio = mmap(NULL, rounded_size, PROT_READ|PROT_WRITE, MAP_SHARED, g->fd, magic_off);
+    if (mmio == MAP_FAILED) {
+        int __errno = errno;
+        mmio = NULL;
+        gdr_err("can't mmap BAR, error=%s(%d) rounded_size=%zu offset=%llx handle=%x\n",
+                strerror(__errno), __errno, rounded_size, (long long unsigned)magic_off, handle);
+        ret = __errno;
+    }
+
+    *ptr_va = mmio;
+
+    return ret;
+}
+
+int gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size)
+{
+    int ret = 0;
+    int retcode = 0;
+    size_t rounded_size = (size + PAGE_SIZE - 1) & PAGE_MASK;
+
+    retcode = munmap(va, rounded_size);
+    if (-1 == retcode) {
+        int __errno = errno;
+        gdr_err("can't unmap BAR, error=%s(%d) rounded_size=%zu\n",
+                strerror(__errno), __errno, rounded_size);
+        ret = __errno;
+    }
+
+    return ret;
+}
+
#ifdef GDRAPI_X86
#include <cpuid.h>

// prepare for AVX2 implementation
#ifndef bit_AVX2
/* Extended Features (%eax == 7) */
/* %ebx */
#define bit_AVX2 (1 << 5)
#endif

#include <immintrin.h>

/* SIMD copy routines implemented in src/memcpy_{avx,sse,sse41}.c. */
extern int memcpy_uncached_store_avx(void *dest, const void *src, size_t n_bytes);
extern int memcpy_cached_store_avx(void *dest, const void *src, size_t n_bytes);
extern int memcpy_uncached_store_sse(void *dest, const void *src, size_t n_bytes);
extern int memcpy_cached_store_sse(void *dest, const void *src, size_t n_bytes);
extern int memcpy_uncached_load_sse41(void *dest, const void *src, size_t n_bytes);
#else // GDRAPI_X86
/* Non-x86 stubs: never selected because the has_* flags below stay 0 on
 * those platforms; the non-zero return marks them as unimplemented. */
static int memcpy_uncached_store_avx(void *dest, const void *src, size_t n_bytes)  { return 1; }
static int memcpy_cached_store_avx(void *dest, const void *src, size_t n_bytes)  { return 1; }
static int memcpy_uncached_store_sse(void *dest, const void *src, size_t n_bytes)    { return 1; }
static int memcpy_cached_store_sse(void *dest, const void *src, size_t n_bytes)    { return 1; }
static int memcpy_uncached_load_sse41(void *dest, const void *src, size_t n_bytes) { return 1; }
#endif // GDRAPI_X86

/* Lazily initialized CPU capability flags (see gdr_init_cpu_flags). */
static int first_time = 1;
static int has_sse = 0;
static int has_sse2 = 0;
static int has_sse4_1 = 0;
static int has_avx = 0;
static int has_avx2 = 0;
+
/* Detects the host CPU's SIMD capabilities via CPUID and caches them in
 * the has_* globals; clears first_time so detection runs only once. */
static void gdr_init_cpu_flags()
{
#ifdef GDRAPI_X86
    /* CPUID leaf 1: SSE4.1/AVX feature bits in ECX, SSE/SSE2 in EDX. */
    unsigned int info_type = 0x00000001;
    unsigned int ax, bx, cx, dx;
    if (__get_cpuid(info_type, &ax, &bx, &cx, &dx) == 1) {
       has_sse4_1 = ((cx & bit_SSE41) != 0);
       has_avx    = ((cx & bit_AVX)    != 0);
       has_sse    = ((dx & bit_SSE)    != 0);
       has_sse2   = ((dx & bit_SSE2)   != 0);
       gdr_dbg("sse4_1=%d avx=%d sse=%d sse2=%d\n", has_sse4_1, has_avx, has_sse, has_sse2);
    }
#ifdef bit_AVX2
    /* CPUID leaf 7: AVX2 bit is in EBX.
     * NOTE(review): leaf 7 is subleaf-indexed and __get_cpuid does not
     * zero ECX -- consider __get_cpuid_count(7, 0, ...) instead; confirm
     * against the toolchain's <cpuid.h>. */
    info_type = 0x7;
    if (__get_cpuid(info_type, &ax, &bx, &cx, &dx) == 1) {
        has_avx2 = bx & bit_AVX2;
    }
#endif // bit_AVX2
#endif // GDRAPI_X86

#ifdef GDRAPI_POWER
    // detect and enable Altivec/SMX support
#endif

    first_time = 0;
}
+
+// note: more than one implementation may be compiled in
+
+
+int gdr_copy_to_bar(void *bar_ptr, const void *h_ptr, size_t size)
+{
+    if (first_time) {
+        gdr_init_cpu_flags();
+    }
+
+    do {
+        // pick the most performing implementation compatible with the platform we are running on
+        if (has_avx) {
+            gdr_dbgc(1, "using AVX implementation of gdr_copy_to_bar\n");
+            memcpy_uncached_store_avx(bar_ptr, h_ptr, size);
+            break;
+        }
+        if (has_sse) {
+            gdr_dbgc(1, "using SSE implementation of gdr_copy_to_bar\n");
+            memcpy_uncached_store_sse(bar_ptr, h_ptr, size);
+            break;
+        }
+        // fall through
+        gdr_dbgc(1, "using plain implementation of gdr_copy_to_bar\n");
+        memcpy(bar_ptr, h_ptr, size);
+    } while (0);
+
+    return 0;
+}
+
+int gdr_copy_from_bar(void *h_ptr, const void *bar_ptr, size_t size)
+{
+    if (first_time) {
+        gdr_init_cpu_flags();
+    }
+
+    do {
+        // pick the most performing implementation compatible with the platform we are running on
+        if (has_sse4_1) {
+            gdr_dbgc(1, "using SSE4_1 implementation of gdr_copy_from_bar\n");
+            memcpy_uncached_load_sse41(h_ptr, bar_ptr, size);
+            break;
+        }
+        if (has_avx) {
+            gdr_dbgc(1, "using AVX implementation of gdr_copy_from_bar\n");
+            memcpy_cached_store_avx(h_ptr, bar_ptr, size);
+            break;
+        }
+        if (has_sse) {
+            gdr_dbgc(1, "using SSE implementation of gdr_copy_from_bar\n");
+            memcpy_cached_store_sse(h_ptr, bar_ptr, size);
+            break;
+        }
+        // fall through
+        gdr_dbgc(1, "using plain implementation of gdr_copy_from_bar\n");
+        memcpy(h_ptr, bar_ptr, size);
+    } while (0);
+
+    return 0;
+}
+
+/*
+ * Local variables:
+ *  c-indent-level: 4
+ *  c-basic-offset: 4
+ *  tab-width: 4
+ *  indent-tabs-mode: nil
+ * End:
+ */

+ 24 - 0
src/kernels.cu

@@ -0,0 +1,24 @@
+/* This file contains the kernels i.e. the functions to be executed on the GPU */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "kernels.h"
+
/* Device-side helper: adds 2 to the int stored at the given device address. */
__device__
void add_two_device(CUdeviceptr number)
{
    (* (int*) number)+=2;
}
+
/* Kernel: adds 3 in total to the pointed-to int (1 here, then 2 via
 * add_two_device). Takes a driver-API CUdeviceptr. */
__global__
void add_three_global(CUdeviceptr number)
{
    (* (int*) number)++;
    add_two_device(number);
}
+
/* Kernel: increments the int behind a raw device pointer. */
__global__
void add_one_global(void* number)
{
    (* (int*) number)++;
}

+ 83 - 0
src/main.cu

@@ -0,0 +1,83 @@
+/* The main program */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "common.h"
+/*#include "cuda.h"*/
+#include "kernels.h"
+#include "gdrapi.h"
+
/*
 * Demo entry point: enumerates GPUs, selects device 0, pins a chunk of
 * its memory with gdrcopy so the CPU can address it through BAR1, then
 * runs a small kernel through the regular CUDA path as a sanity check.
 */
int main()
{
    /* Initialisation of the APIs */
    assert_cu( cuInit(0) );
    gdr_t g = gdr_open();

    /* First check if a NVIDIA GPU is on the system and see which to use.
     * For the time being, use number 0. */
    int i;
    int countGPU;
    CUdevice GPU;
    char nameGPU[GPU_NAME_LENGTH];
    CUdevprop GPUProp;

    assert_cuda( cudaGetDeviceCount(&countGPU) );
    for(i=0; i<countGPU; i++)
    {
        assert_cu( cuDeviceGet(&GPU,i) );
        assert_cu( cuDeviceGetName(nameGPU,GPU_NAME_LENGTH,GPU) );
        printf("GPU %d is %s\n",i,nameGPU);
    }

    assert_cuda( cudaSetDevice(0) );
    assert_cu( cuDeviceGet(&GPU,0) );
    assert_cu( cuDeviceGetProperties(&GPUProp,GPU) );

    /* Check context: print the current device before and after creating
     * an explicit driver-API context. */
    assert_cu( cuCtxGetDevice(&GPU) );
    printf("Device for this context: %d\n",GPU);
    CUcontext ctx;
    assert_cu( cuCtxCreate(&ctx,0,GPU) );
    assert_cu( cuCtxGetDevice(&GPU) );
    printf("Device for this context: %d\n",GPU);

    /* Allocate memory on the device, pin and map.
     * sharedMemPerBlock is just a convenient nonzero size here. */
    CUdeviceptr dptr;
    assert_cu( cuMemAlloc(&dptr,(size_t) GPUProp.sharedMemPerBlock) );
    gdr_mh_t GPUMemHandle;
    assert_gdr( gdr_pin_buffer(g,dptr,(size_t) GPUProp.sharedMemPerBlock,0,0,&GPUMemHandle) );
    void* va;
    assert_gdr( gdr_map(g,GPUMemHandle,&va,(size_t) GPUProp.sharedMemPerBlock) );

    /* The mapping is page-aligned, so recover dptr's offset inside it. */
    gdr_info_t GPUInfo;
    int offset;
    assert_gdr( gdr_get_info(g,GPUMemHandle,&GPUInfo) );
    offset = (GPUInfo.va > dptr) ? GPUInfo.va - dptr : dptr - GPUInfo.va;
    uint32_t *buf_ptr = (uint32_t *)((char *)va + offset);
    (void)buf_ptr; /* direct BAR access is not exercised yet */

    printf("All set\n");

    /* At this point the GPU's mem is mapped to a CPU buffer to enable DMA */

    int set, get;
    printf("Use the nvidia api\n");
    set = 4242;
    get = 0;
    printf("set = %d\nget = %d\n",set,get);
    assert_cu( cuMemcpyHtoD(dptr,&set,sizeof(set)) );
    add_three_global<<< 1,1 >>>(dptr);
    assert_cu( cuMemcpyDtoH(&get,dptr,sizeof(get)) );
    printf("set = %d\nget = %d\n",set,get);

    /* Close everything: unmap and unpin before freeing the GPU buffer,
     * and destroy the context we created (previously leaked). */
    assert_gdr( gdr_unmap(g,GPUMemHandle,va,(size_t) GPUProp.sharedMemPerBlock) );
    assert_gdr( gdr_unpin_buffer(g,GPUMemHandle) );
    assert_gdr( gdr_close(g) );
    assert_cu( cuMemFree(dptr) );
    assert_cu( cuCtxDestroy(ctx) );

    printf("All Cleared\n");

    exit(EXIT_SUCCESS);
}

+ 198 - 0
src/memcpy_avx.c

@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in 
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <immintrin.h>
+
+#ifndef min
+#define min(A,B) ((A)<(B)?(A):(B))
+#endif
+
/*
 * Copies n_bytes from src to dest using AVX non-temporal (streaming)
 * stores, which bypass the cache -- suited to write-combined BAR memory.
 * Head bytes are copied with memcpy until dest is 32-byte aligned, the
 * bulk moves 32 bytes at a time (unrolled), and the tail is finished
 * with memcpy. Returns 0. Compiled only with -mavx (see #error below).
 */
int memcpy_uncached_store_avx(void *dest, const void *src, size_t n_bytes)
{
    int ret = 0;
#ifdef __AVX__
    char *d = (char*)dest;
    uintptr_t d_int = (uintptr_t)d;
    const char *s = (const char *)src;
    uintptr_t s_int = (uintptr_t)s;
    size_t n = n_bytes;

    // align dest to 256-bits
    if (d_int & 0x1f) {
        size_t nh = min(0x20 - (d_int & 0x1f), n);
        memcpy(d, s, nh);
        d += nh; d_int += nh;
        s += nh; s_int += nh;
        n -= nh;
    }

    if (s_int & 0x1f) { // src is not aligned to 256-bits
        __m256d r0,r1,r2,r3;
        // unroll 4: unaligned loads, streaming stores
        while (n >= 4*sizeof(__m256d)) {
            r0 = _mm256_loadu_pd((double *)(s+0*sizeof(__m256d)));
            r1 = _mm256_loadu_pd((double *)(s+1*sizeof(__m256d)));
            r2 = _mm256_loadu_pd((double *)(s+2*sizeof(__m256d)));
            r3 = _mm256_loadu_pd((double *)(s+3*sizeof(__m256d)));
            _mm256_stream_pd((double *)(d+0*sizeof(__m256d)), r0);
            _mm256_stream_pd((double *)(d+1*sizeof(__m256d)), r1);
            _mm256_stream_pd((double *)(d+2*sizeof(__m256d)), r2);
            _mm256_stream_pd((double *)(d+3*sizeof(__m256d)), r3);
            s += 4*sizeof(__m256d);
            d += 4*sizeof(__m256d);
            n -= 4*sizeof(__m256d);
        }
        while (n >= sizeof(__m256d)) {
            r0 = _mm256_loadu_pd((double *)(s));
            _mm256_stream_pd((double *)(d), r0);
            s += sizeof(__m256d);
            d += sizeof(__m256d);
            n -= sizeof(__m256d);
        }
    } else { // or it IS aligned
        __m256d r0,r1,r2,r3,r4,r5,r6,r7;
        // unroll 8: aligned loads, streaming stores
        while (n >= 8*sizeof(__m256d)) {
            r0 = _mm256_load_pd((double *)(s+0*sizeof(__m256d)));
            r1 = _mm256_load_pd((double *)(s+1*sizeof(__m256d)));
            r2 = _mm256_load_pd((double *)(s+2*sizeof(__m256d)));
            r3 = _mm256_load_pd((double *)(s+3*sizeof(__m256d)));
            r4 = _mm256_load_pd((double *)(s+4*sizeof(__m256d)));
            r5 = _mm256_load_pd((double *)(s+5*sizeof(__m256d)));
            r6 = _mm256_load_pd((double *)(s+6*sizeof(__m256d)));
            r7 = _mm256_load_pd((double *)(s+7*sizeof(__m256d)));
            _mm256_stream_pd((double *)(d+0*sizeof(__m256d)), r0);
            _mm256_stream_pd((double *)(d+1*sizeof(__m256d)), r1);
            _mm256_stream_pd((double *)(d+2*sizeof(__m256d)), r2);
            _mm256_stream_pd((double *)(d+3*sizeof(__m256d)), r3);
            _mm256_stream_pd((double *)(d+4*sizeof(__m256d)), r4);
            _mm256_stream_pd((double *)(d+5*sizeof(__m256d)), r5);
            _mm256_stream_pd((double *)(d+6*sizeof(__m256d)), r6);
            _mm256_stream_pd((double *)(d+7*sizeof(__m256d)), r7);
            s += 8*sizeof(__m256d);
            d += 8*sizeof(__m256d);
            n -= 8*sizeof(__m256d);
        }
        while (n >= sizeof(__m256d)) {
            r0 = _mm256_load_pd((double *)(s));
            _mm256_stream_pd((double *)(d), r0);
            s += sizeof(__m256d);
            d += sizeof(__m256d);
            n -= sizeof(__m256d);
        }
    }
    /* fence orders the non-temporal stores before any later writes */
    _mm_sfence();
    if (n)
        memcpy(d, s, n);
#else
#error "this file should be compiled with -mavx"
#endif
    return ret;
}
+
/*
 * Copies n_bytes from src to dest with AVX loads and ordinary (cached)
 * stores -- used for the BAR-to-host direction where the destination is
 * normal cacheable memory. Same head/bulk/tail structure as the
 * uncached variant, but no sfence is needed because the stores are not
 * non-temporal. Returns 0. Compiled only with -mavx.
 */
int memcpy_cached_store_avx(void *dest, const void *src, size_t n_bytes)
{
    int ret = 0;
#ifdef __AVX__
    char *d = (char*)dest;
    uintptr_t d_int = (uintptr_t)d;
    const char *s = (const char *)src;
    uintptr_t s_int = (uintptr_t)s;
    size_t n = n_bytes;

    // align dest to 256-bits
    if (d_int & 0x1f) {
        size_t nh = min(0x20 - (d_int & 0x1f), n);
        memcpy(d, s, nh);
        d += nh; d_int += nh;
        s += nh; s_int += nh;
        n -= nh;
    }

    if (s_int & 0x1f) { // src is not aligned to 256-bits
        __m256d r0,r1,r2,r3;
        // unroll 4: unaligned loads, cached stores
        while (n >= 4*sizeof(__m256d)) {
            r0 = _mm256_loadu_pd((double *)(s+0*sizeof(__m256d)));
            r1 = _mm256_loadu_pd((double *)(s+1*sizeof(__m256d)));
            r2 = _mm256_loadu_pd((double *)(s+2*sizeof(__m256d)));
            r3 = _mm256_loadu_pd((double *)(s+3*sizeof(__m256d)));
            _mm256_store_pd((double *)(d+0*sizeof(__m256d)), r0);
            _mm256_store_pd((double *)(d+1*sizeof(__m256d)), r1);
            _mm256_store_pd((double *)(d+2*sizeof(__m256d)), r2);
            _mm256_store_pd((double *)(d+3*sizeof(__m256d)), r3);
            s += 4*sizeof(__m256d);
            d += 4*sizeof(__m256d);
            n -= 4*sizeof(__m256d);
        }
        while (n >= sizeof(__m256d)) {
            r0 = _mm256_loadu_pd((double *)(s));
            _mm256_store_pd((double *)(d), r0);
            s += sizeof(__m256d);
            d += sizeof(__m256d);
            n -= sizeof(__m256d);
        }
    } else { // or it IS aligned
        __m256d r0,r1,r2,r3;
        // unroll 4: aligned loads, cached stores
        while (n >= 4*sizeof(__m256d)) {
            r0 = _mm256_load_pd((double *)(s+0*sizeof(__m256d)));
            r1 = _mm256_load_pd((double *)(s+1*sizeof(__m256d)));
            r2 = _mm256_load_pd((double *)(s+2*sizeof(__m256d)));
            r3 = _mm256_load_pd((double *)(s+3*sizeof(__m256d)));
            _mm256_store_pd((double *)(d+0*sizeof(__m256d)), r0);
            _mm256_store_pd((double *)(d+1*sizeof(__m256d)), r1);
            _mm256_store_pd((double *)(d+2*sizeof(__m256d)), r2);
            _mm256_store_pd((double *)(d+3*sizeof(__m256d)), r3);
            s += 4*sizeof(__m256d);
            d += 4*sizeof(__m256d);
            n -= 4*sizeof(__m256d);
        }
        while (n >= sizeof(__m256d)) {
            r0 = _mm256_load_pd((double *)(s));
            _mm256_store_pd((double *)(d), r0);
            s += sizeof(__m256d);
            d += sizeof(__m256d);
            n -= sizeof(__m256d);
        }
    }
    if (n)
        memcpy(d, s, n);
#else
#error "this file should be compiled with -mavx"
#endif
    return ret;
}
+
+// add variant for _mm_stream_load_si256() / VMOVNTDQA
+
+/*
+ * Local variables:
+ *  c-indent-level: 4
+ *  c-basic-offset: 4
+ *  tab-width: 4
+ *  indent-tabs-mode: nil
+ * End:
+ */

+ 188 - 0
src/memcpy_sse.c

@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in 
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <immintrin.h>
+
+#ifndef min
+#define min(A,B) ((A)<(B)?(A):(B))
+#endif
+
int memcpy_uncached_store_sse(void *dest, const void *src, size_t n_bytes)
{
    int ret = 0;
#ifdef __SSE__
    /*
     * Copy n_bytes from src to dest using MOVNTPS (non-temporal stores),
     * writing the destination without polluting the cache.  The unaligned
     * head and the sub-16-byte tail fall back to plain memcpy.
     * Always returns 0.
     */
    char *dst = (char *)dest;
    const char *sp = (const char *)src;
    size_t left = n_bytes;

    /* Scalar-copy until dst sits on a 16-byte boundary, as required by
     * _mm_stream_ps. */
    if ((uintptr_t)dst & 0xf) {
        size_t head = 0x10 - ((uintptr_t)dst & 0x0f);
        if (head > left)
            head = left;
        memcpy(dst, sp, head);
        dst += head;
        sp += head;
        left -= head;
    }

    if (((uintptr_t)sp & 0xf) == 0) {
        /* Source is 16-byte aligned: aligned loads, 4-wide unrolled. */
        while (left >= 4 * sizeof(__m128)) {
            __m128 v0 = _mm_load_ps((const float *)(sp + 0 * sizeof(__m128)));
            __m128 v1 = _mm_load_ps((const float *)(sp + 1 * sizeof(__m128)));
            __m128 v2 = _mm_load_ps((const float *)(sp + 2 * sizeof(__m128)));
            __m128 v3 = _mm_load_ps((const float *)(sp + 3 * sizeof(__m128)));
            _mm_stream_ps((float *)(dst + 0 * sizeof(__m128)), v0);
            _mm_stream_ps((float *)(dst + 1 * sizeof(__m128)), v1);
            _mm_stream_ps((float *)(dst + 2 * sizeof(__m128)), v2);
            _mm_stream_ps((float *)(dst + 3 * sizeof(__m128)), v3);
            sp += 4 * sizeof(__m128);
            dst += 4 * sizeof(__m128);
            left -= 4 * sizeof(__m128);
        }
        while (left >= sizeof(__m128)) {
            _mm_stream_ps((float *)dst, _mm_load_ps((const float *)sp));
            sp += sizeof(__m128);
            dst += sizeof(__m128);
            left -= sizeof(__m128);
        }
    } else {
        /* Source is misaligned: unaligned loads, 4-wide unrolled. */
        while (left >= 4 * sizeof(__m128)) {
            __m128 v0 = _mm_loadu_ps((const float *)(sp + 0 * sizeof(__m128)));
            __m128 v1 = _mm_loadu_ps((const float *)(sp + 1 * sizeof(__m128)));
            __m128 v2 = _mm_loadu_ps((const float *)(sp + 2 * sizeof(__m128)));
            __m128 v3 = _mm_loadu_ps((const float *)(sp + 3 * sizeof(__m128)));
            _mm_stream_ps((float *)(dst + 0 * sizeof(__m128)), v0);
            _mm_stream_ps((float *)(dst + 1 * sizeof(__m128)), v1);
            _mm_stream_ps((float *)(dst + 2 * sizeof(__m128)), v2);
            _mm_stream_ps((float *)(dst + 3 * sizeof(__m128)), v3);
            sp += 4 * sizeof(__m128);
            dst += 4 * sizeof(__m128);
            left -= 4 * sizeof(__m128);
        }
        while (left >= sizeof(__m128)) {
            _mm_stream_ps((float *)dst, _mm_loadu_ps((const float *)sp));
            sp += sizeof(__m128);
            dst += sizeof(__m128);
            left -= sizeof(__m128);
        }
    }
    _mm_sfence(); /* order the non-temporal stores before returning */
    if (left)
        memcpy(dst, sp, left); /* tail shorter than one vector */
#else
#error "this file should be compiled with -msse"
#endif
    return ret;
}
+
int memcpy_cached_store_sse(void *dest, const void *src, size_t n_bytes)
{
    int ret = 0;
#ifdef __SSE__
    /*
     * Copy n_bytes from src to dest using regular (cached) SSE stores.
     * The unaligned head and the sub-16-byte tail fall back to memcpy.
     * No sfence is needed here: the stores are ordinary write-back stores.
     * Always returns 0.
     */
    char *dst = (char *)dest;
    const char *sp = (const char *)src;
    size_t left = n_bytes;

    /* Scalar-copy until dst sits on a 16-byte boundary, as required by
     * _mm_store_ps. */
    if ((uintptr_t)dst & 0xf) {
        size_t head = 0x10 - ((uintptr_t)dst & 0x0f);
        if (head > left)
            head = left;
        memcpy(dst, sp, head);
        dst += head;
        sp += head;
        left -= head;
    }

    if (((uintptr_t)sp & 0xf) == 0) {
        /* Source is 16-byte aligned: aligned loads, 4-wide unrolled. */
        while (left >= 4 * sizeof(__m128)) {
            __m128 v0 = _mm_load_ps((const float *)(sp + 0 * sizeof(__m128)));
            __m128 v1 = _mm_load_ps((const float *)(sp + 1 * sizeof(__m128)));
            __m128 v2 = _mm_load_ps((const float *)(sp + 2 * sizeof(__m128)));
            __m128 v3 = _mm_load_ps((const float *)(sp + 3 * sizeof(__m128)));
            _mm_store_ps((float *)(dst + 0 * sizeof(__m128)), v0);
            _mm_store_ps((float *)(dst + 1 * sizeof(__m128)), v1);
            _mm_store_ps((float *)(dst + 2 * sizeof(__m128)), v2);
            _mm_store_ps((float *)(dst + 3 * sizeof(__m128)), v3);
            sp += 4 * sizeof(__m128);
            dst += 4 * sizeof(__m128);
            left -= 4 * sizeof(__m128);
        }
        while (left >= sizeof(__m128)) {
            _mm_store_ps((float *)dst, _mm_load_ps((const float *)sp));
            sp += sizeof(__m128);
            dst += sizeof(__m128);
            left -= sizeof(__m128);
        }
    } else {
        /* Source is misaligned: unaligned loads, 4-wide unrolled. */
        while (left >= 4 * sizeof(__m128)) {
            __m128 v0 = _mm_loadu_ps((const float *)(sp + 0 * sizeof(__m128)));
            __m128 v1 = _mm_loadu_ps((const float *)(sp + 1 * sizeof(__m128)));
            __m128 v2 = _mm_loadu_ps((const float *)(sp + 2 * sizeof(__m128)));
            __m128 v3 = _mm_loadu_ps((const float *)(sp + 3 * sizeof(__m128)));
            _mm_store_ps((float *)(dst + 0 * sizeof(__m128)), v0);
            _mm_store_ps((float *)(dst + 1 * sizeof(__m128)), v1);
            _mm_store_ps((float *)(dst + 2 * sizeof(__m128)), v2);
            _mm_store_ps((float *)(dst + 3 * sizeof(__m128)), v3);
            sp += 4 * sizeof(__m128);
            dst += 4 * sizeof(__m128);
            left -= 4 * sizeof(__m128);
        }
        while (left >= sizeof(__m128)) {
            _mm_store_ps((float *)dst, _mm_loadu_ps((const float *)sp));
            sp += sizeof(__m128);
            dst += sizeof(__m128);
            left -= sizeof(__m128);
        }
    }
    if (left)
        memcpy(dst, sp, left); /* tail shorter than one vector */
#else
#error "this file should be compiled with -msse"
#endif
    return ret;
}
+
+/*
+ * Local variables:
+ *  c-indent-level: 4
+ *  c-basic-offset: 4
+ *  tab-width: 4
+ *  indent-tabs-mode: nil
+ * End:
+ */

+ 134 - 0
src/memcpy_sse41.c

@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in 
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <immintrin.h>
+
+#ifndef min
+#define min(A,B) ((A)<(B)?(A):(B))
+#endif
+
+// implementation of copy from BAR using MOVNTDQA 
+// suggested by Nicholas Wilt <nwilt@amazon.com>
+
+int memcpy_uncached_load_sse41(void *dest, const void *src, size_t n_bytes)
+{
+    int ret = 0;
+#ifdef __SSE4_1__
+    char *d = (char*)dest;
+    uintptr_t d_int = (uintptr_t)d;
+    const char *s = (const char *)src;
+    uintptr_t s_int = (uintptr_t)s;
+    size_t n = n_bytes;
+
+    // align src to 128-bits
+    if (s_int & 0xf) {
+        size_t nh = min(0x10 - (s_int & 0x0f), n);
+        memcpy(d, s, nh);
+        d += nh; d_int += nh;
+        s += nh; s_int += nh;
+        n -= nh;
+    }
+
+    if (d_int & 0xf) { // dest is not aligned to 128-bits
+        __m128i r0,r1,r2,r3,r4,r5,r6,r7;
+        // unroll 8
+        while (n >= 8*sizeof(__m128i)) {
+            r0 = _mm_stream_load_si128 ((__m128i *)(s+0*sizeof(__m128i)));
+            r1 = _mm_stream_load_si128 ((__m128i *)(s+1*sizeof(__m128i)));
+            r2 = _mm_stream_load_si128 ((__m128i *)(s+2*sizeof(__m128i)));
+            r3 = _mm_stream_load_si128 ((__m128i *)(s+3*sizeof(__m128i)));
+            r4 = _mm_stream_load_si128 ((__m128i *)(s+4*sizeof(__m128i)));
+            r5 = _mm_stream_load_si128 ((__m128i *)(s+5*sizeof(__m128i)));
+            r6 = _mm_stream_load_si128 ((__m128i *)(s+6*sizeof(__m128i)));
+            r7 = _mm_stream_load_si128 ((__m128i *)(s+7*sizeof(__m128i)));
+            _mm_storeu_si128((__m128i *)(d+0*sizeof(__m128i)), r0);
+            _mm_storeu_si128((__m128i *)(d+1*sizeof(__m128i)), r1);
+            _mm_storeu_si128((__m128i *)(d+2*sizeof(__m128i)), r2);
+            _mm_storeu_si128((__m128i *)(d+3*sizeof(__m128i)), r3);
+            _mm_storeu_si128((__m128i *)(d+4*sizeof(__m128i)), r4);
+            _mm_storeu_si128((__m128i *)(d+5*sizeof(__m128i)), r5);
+            _mm_storeu_si128((__m128i *)(d+6*sizeof(__m128i)), r6);
+            _mm_storeu_si128((__m128i *)(d+7*sizeof(__m128i)), r7);
+            s += 8*sizeof(__m128i);
+            d += 8*sizeof(__m128i);
+            n -= 8*sizeof(__m128i);
+        }
+        while (n >= sizeof(__m128i)) {
+            r0 = _mm_stream_load_si128 ((__m128i *)(s+0*sizeof(__m128i)));
+            _mm_storeu_si128((__m128i *)(d+0*sizeof(__m128i)), r0);
+            s += sizeof(__m128i);
+            d += sizeof(__m128i);
+            n -= sizeof(__m128i);
+        }
+    } else { // or it IS aligned
+        __m128i r0,r1,r2,r3,r4,r5,r6,r7;
+        // unroll 8
+        while (n >= 4*sizeof(__m128i)) {
+            r0 = _mm_stream_load_si128 ((__m128i *)(s+0*sizeof(__m128i)));
+            r1 = _mm_stream_load_si128 ((__m128i *)(s+1*sizeof(__m128i)));
+            r2 = _mm_stream_load_si128 ((__m128i *)(s+2*sizeof(__m128i)));
+            r3 = _mm_stream_load_si128 ((__m128i *)(s+3*sizeof(__m128i)));
+            r4 = _mm_stream_load_si128 ((__m128i *)(s+4*sizeof(__m128i)));
+            r5 = _mm_stream_load_si128 ((__m128i *)(s+5*sizeof(__m128i)));
+            r6 = _mm_stream_load_si128 ((__m128i *)(s+6*sizeof(__m128i)));
+            r7 = _mm_stream_load_si128 ((__m128i *)(s+7*sizeof(__m128i)));
+            _mm_stream_si128((__m128i *)(d+0*sizeof(__m128i)), r0);
+            _mm_stream_si128((__m128i *)(d+1*sizeof(__m128i)), r1);
+            _mm_stream_si128((__m128i *)(d+2*sizeof(__m128i)), r2);
+            _mm_stream_si128((__m128i *)(d+3*sizeof(__m128i)), r3);
+            _mm_stream_si128((__m128i *)(d+4*sizeof(__m128i)), r4);
+            _mm_stream_si128((__m128i *)(d+5*sizeof(__m128i)), r5);
+            _mm_stream_si128((__m128i *)(d+6*sizeof(__m128i)), r6);
+            _mm_stream_si128((__m128i *)(d+7*sizeof(__m128i)), r7);
+            s += 8*sizeof(__m128i);
+            d += 8*sizeof(__m128i);
+            n -= 8*sizeof(__m128i);
+        }
+        while (n >= sizeof(__m128i)) {
+            r0 = _mm_stream_load_si128 ((__m128i *)(s+0*sizeof(__m128i)));
+            _mm_stream_si128((__m128i *)(d+0*sizeof(__m128i)), r0);
+            s += sizeof(__m128i);
+            d += sizeof(__m128i);
+            n -= sizeof(__m128i);
+        }
+    }
+    _mm_sfence();
+    if (n)
+        memcpy(d, s, n);
+#else
+#error "this file should be compiled with -msse4.1"
+#endif
+    return ret;
+}
+
+
+/*
+ * Local variables:
+ *  c-indent-level: 4
+ *  c-basic-offset: 4
+ *  tab-width: 4
+ *  indent-tabs-mode: nil
+ * End:
+ */