Browse Source

Fix AVX memory copy

Suren A. Chilingaryan 8 years ago
parent
commit
2bda41263f
2 changed files with 40 additions and 40 deletions
  1. 39 39
      pcilib/pagecpy.c
  2. 1 1
      pcilib/pagecpy.h

+ 39 - 39
pcilib/pagecpy.c

@@ -87,7 +87,7 @@ void *memcpy128(void * dst, void const * src, size_t len) {
 } 
 */
 
-void pcilib_memcpy4k_avx(void *dst, void *src, size_t size) {
+void pcilib_memcpy4k_avx(void *dst, const void *src, size_t size) {
     size_t sse_size = (size / 512);
 
     __asm__ __volatile__ (
@@ -96,57 +96,57 @@ void pcilib_memcpy4k_avx(void *dst, void *src, size_t size) {
 
             "1:					\n\t"
 
-            "vmovdqa 	   (%0,%%rax), %%ymm0	\n\t"
-            "vmovdqa 	 32(%0,%%rax), %%ymm1	\n\t"
-            "vmovdqa 	 64(%0,%%rax), %%ymm2	\n\t"
-            "vmovdqa 	 96(%0,%%rax), %%ymm3	\n\t"
-            "vmovdqa 	128(%0,%%rax), %%ymm4	\n\t"
-            "vmovdqa 	160(%0,%%rax), %%ymm5	\n\t"
-            "vmovdqa 	192(%0,%%rax), %%ymm6	\n\t"
-            "vmovdqa 	224(%0,%%rax), %%ymm7	\n\t"
-
-            "vmovdqa 	256(%0,%%rax), %%ymm8	\n\t"
-            "vmovdqa 	288(%0,%%rax), %%ymm9	\n\t"
-            "vmovdqa 	320(%0,%%rax), %%ymm10	\n\t"
-            "vmovdqa 	352(%0,%%rax), %%ymm11	\n\t"
-            "vmovdqa 	384(%0,%%rax), %%ymm12	\n\t"
-            "vmovdqa 	416(%0,%%rax), %%ymm13	\n\t"
-            "vmovdqa 	448(%0,%%rax), %%ymm14	\n\t"
-            "vmovdqa 	480(%0,%%rax), %%ymm15	\n\t"
-
-            "vmovntps	%%ymm0,    (%1,%%rax)	\n\t"
-            "vmovntps	%%ymm1,  32(%1,%%rax)	\n\t"
-            "vmovntps	%%ymm2,  64(%1,%%rax)	\n\t"
-            "vmovntps	%%ymm3,  96(%1,%%rax)	\n\t"
-            "vmovntps	%%ymm4, 128(%1,%%rax)	\n\t"
-            "vmovntps	%%ymm5, 160(%1,%%rax)	\n\t"
-            "vmovntps	%%ymm6, 192(%1,%%rax)	\n\t"
-            "vmovntps	%%ymm7, 224(%1,%%rax)	\n\t"
-
-            "vmovntps	%%ymm8,  256(%1,%%rax)	\n\t"
-            "vmovntps	%%ymm9,  288(%1,%%rax)	\n\t"
-            "vmovntps	%%ymm10, 320(%1,%%rax)	\n\t"
-            "vmovntps	%%ymm11, 352(%1,%%rax)	\n\t"
-            "vmovntps	%%ymm12, 384(%1,%%rax)	\n\t"
-            "vmovntps	%%ymm13, 416(%1,%%rax)	\n\t"
-            "vmovntps	%%ymm14, 448(%1,%%rax)	\n\t"
-            "vmovntps	%%ymm15, 480(%1,%%rax)	\n\t"
+            "vmovdqa 	   (%1,%%rax), %%ymm0	\n\t"
+            "vmovdqa 	 32(%1,%%rax), %%ymm1	\n\t"
+            "vmovdqa 	 64(%1,%%rax), %%ymm2	\n\t"
+            "vmovdqa 	 96(%1,%%rax), %%ymm3	\n\t"
+            "vmovdqa 	128(%1,%%rax), %%ymm4	\n\t"
+            "vmovdqa 	160(%1,%%rax), %%ymm5	\n\t"
+            "vmovdqa 	192(%1,%%rax), %%ymm6	\n\t"
+            "vmovdqa 	224(%1,%%rax), %%ymm7	\n\t"
+
+            "vmovdqa 	256(%1,%%rax), %%ymm8	\n\t"
+            "vmovdqa 	288(%1,%%rax), %%ymm9	\n\t"
+            "vmovdqa 	320(%1,%%rax), %%ymm10	\n\t"
+            "vmovdqa 	352(%1,%%rax), %%ymm11	\n\t"
+            "vmovdqa 	384(%1,%%rax), %%ymm12	\n\t"
+            "vmovdqa 	416(%1,%%rax), %%ymm13	\n\t"
+            "vmovdqa 	448(%1,%%rax), %%ymm14	\n\t"
+            "vmovdqa 	480(%1,%%rax), %%ymm15	\n\t"
+
+            "vmovdqa	%%ymm0,    (%0,%%rax)	\n\t"
+            "vmovdqa	%%ymm1,  32(%0,%%rax)	\n\t"
+            "vmovntps	%%ymm2,  64(%0,%%rax)	\n\t"
+            "vmovntps	%%ymm3,  96(%0,%%rax)	\n\t"
+            "vmovntps	%%ymm4, 128(%0,%%rax)	\n\t"
+            "vmovntps	%%ymm5, 160(%0,%%rax)	\n\t"
+            "vmovntps	%%ymm6, 192(%0,%%rax)	\n\t"
+            "vmovntps	%%ymm7, 224(%0,%%rax)	\n\t"
+
+            "vmovntps	%%ymm8,  256(%0,%%rax)	\n\t"
+            "vmovntps	%%ymm9,  288(%0,%%rax)	\n\t"
+            "vmovntps	%%ymm10, 320(%0,%%rax)	\n\t"
+            "vmovntps	%%ymm11, 352(%0,%%rax)	\n\t"
+            "vmovntps	%%ymm12, 384(%0,%%rax)	\n\t"
+            "vmovntps	%%ymm13, 416(%0,%%rax)	\n\t"
+            "vmovntps	%%ymm14, 448(%0,%%rax)	\n\t"
+            "vmovntps	%%ymm15, 480(%0,%%rax)	\n\t"
 
             "add	$512, %%rax		\n\t"
             "dec	%2			\n\t"
             "jnz 	1b			\n\t"
             "pop 	%2			\n\t"
 
-            "sfence"
+            "mfence"
     :
     : "p" (dst), "p" (src), "r" (sse_size)
     : "%rax"
         );
 }
 
-void pcilib_pagecpy(void *dst, void *src, size_t size) {
+void pcilib_pagecpy(void *dst, const void *src, size_t size) {
     int gen = pcilib_get_cpu_gen();
-    if ((gen > 3)&&(size%4096==0)&&((uintptr_t)dst%32==0)&&((uintptr_t)src%32==0)) {
+    if ((gen > 3)&&((size%4096)==0)&&(((uintptr_t)dst%32)==0)&&(((uintptr_t)src%32)==0)) {
 	pcilib_memcpy4k_avx(dst, src, size);
     } else
 	memcpy(dst, src, size);

+ 1 - 1
pcilib/pagecpy.h

@@ -20,7 +20,7 @@ extern "C" {
  * @param[in] size - size of memory region in bytes.
  * @return - none (the function is declared `void`)
  */
-void pcilib_pagecpy(void *dst, void *src, size_t size);
+void pcilib_pagecpy(void *dst, const void *src, size_t size);
 
 #ifdef __cplusplus
 }