pagecpy.c 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153
  1. #define _POSIX_C_SOURCE 200112L
  2. #define _GNU_SOURCE
  3. #include <stdio.h>
  4. #include <string.h>
  5. #include <unistd.h>
  6. #include <stdint.h>
  7. #include <assert.h>
  8. #include <ctype.h>
  9. #include <time.h>
  10. #include <sched.h>
  11. #include <arpa/inet.h>
  12. #include <sys/time.h>
  13. #include "cpu.h"
  14. #include "pci.h"
  15. #include "tools.h"
  16. #include "error.h"
  17. /*
  18. void *memcpy128(void * dst, void const * src, size_t len) {
  19. long pos = - (len>>2);
  20. char * plDst = (char *) dst - 4 * pos;
  21. char const * plSrc = (char const *) src - 4 * pos;
  22. if (pos) {
  23. __asm__ __volatile__ (
  24. "1: \n\t"
  25. "mov (%0,%2,4), %%edi \n\t"
  26. "mov %%edi, (%1,%2,4) \n\t"
  27. "inc %2 \n\t"
  28. "jnz 1b \n\t"
  29. :
  30. : "r" (plSrc), "r" (plDst), "r" (pos)
  31. : "%edi"
  32. );
  33. }
  34. long pos = - ((len>>4)<<4);
  35. char * plDst = (char *) dst - pos;
  36. char const * plSrc = (char const *) src - pos;
  37. if (pos) {
  38. __asm__ __volatile__ (
  39. "1: \n\t"
  40. // "movdqa (%0,%2), %%xmm0 \n\t"
  41. "mov (%0,%2), %%esi \n\t"
  42. "movd %%esi, %%xmm0 \n\t"
  43. "mov 4(%0,%2), %%esi \n\t"
  44. "movd %%esi, %%xmm1 \n\t"
  45. "mov 8(%0,%2), %%esi \n\t"
  46. "movd %%esi, %%xmm2 \n\t"
  47. "mov 12(%0,%2), %%esi \n\t"
  48. "movd %%esi, %%xmm3 \n\t"
  49. "pslldq $4, %%xmm1 \n\t"
  50. "por %%xmm1, %%xmm0 \n\t"
  51. "pslldq $8, %%xmm2 \n\t"
  52. "por %%xmm2, %%xmm0 \n\t"
  53. "pslldq $12, %%xmm3 \n\t"
  54. "por %%xmm3, %%xmm0 \n\t"
  55. "movntdq %%xmm0, (%1,%2) \n\t"
  56. "add $16, %2 \n\t"
  57. "jnz 1b \n\t"
  58. :
  59. : "r" (plSrc), "r" (plDst), "r" (pos)
  60. : "%rsi"
  61. );
  62. }
  63. len &= 0x3;
  64. char * pcDst = (char *) plDst;
  65. char const * pcSrc = (char const *) plSrc;
  66. while (len--) {
  67. *pcDst++ = *pcSrc++;
  68. }
  69. return (dst);
  70. }
  71. */
  72. void pcilib_memcpy4k_avx(void *dst, const void *src, size_t size) {
  73. size_t sse_size = (size / 512);
  74. __asm__ __volatile__ (
  75. "push %2 \n\t"
  76. "mov $0, %%rax \n\t"
  77. "1: \n\t"
  78. "vmovdqa (%1,%%rax), %%ymm0 \n\t"
  79. "vmovdqa 32(%1,%%rax), %%ymm1 \n\t"
  80. "vmovdqa 64(%1,%%rax), %%ymm2 \n\t"
  81. "vmovdqa 96(%1,%%rax), %%ymm3 \n\t"
  82. "vmovdqa 128(%1,%%rax), %%ymm4 \n\t"
  83. "vmovdqa 160(%1,%%rax), %%ymm5 \n\t"
  84. "vmovdqa 192(%1,%%rax), %%ymm6 \n\t"
  85. "vmovdqa 224(%1,%%rax), %%ymm7 \n\t"
  86. "vmovdqa 256(%1,%%rax), %%ymm8 \n\t"
  87. "vmovdqa 288(%1,%%rax), %%ymm9 \n\t"
  88. "vmovdqa 320(%1,%%rax), %%ymm10 \n\t"
  89. "vmovdqa 352(%1,%%rax), %%ymm11 \n\t"
  90. "vmovdqa 384(%1,%%rax), %%ymm12 \n\t"
  91. "vmovdqa 416(%1,%%rax), %%ymm13 \n\t"
  92. "vmovdqa 448(%1,%%rax), %%ymm14 \n\t"
  93. "vmovdqa 480(%1,%%rax), %%ymm15 \n\t"
  94. "vmovdqa %%ymm0, (%0,%%rax) \n\t"
  95. "vmovdqa %%ymm1, 32(%0,%%rax) \n\t"
  96. "vmovntps %%ymm2, 64(%0,%%rax) \n\t"
  97. "vmovntps %%ymm3, 96(%0,%%rax) \n\t"
  98. "vmovntps %%ymm4, 128(%0,%%rax) \n\t"
  99. "vmovntps %%ymm5, 160(%0,%%rax) \n\t"
  100. "vmovntps %%ymm6, 192(%0,%%rax) \n\t"
  101. "vmovntps %%ymm7, 224(%0,%%rax) \n\t"
  102. "vmovntps %%ymm8, 256(%0,%%rax) \n\t"
  103. "vmovntps %%ymm9, 288(%0,%%rax) \n\t"
  104. "vmovntps %%ymm10, 320(%0,%%rax) \n\t"
  105. "vmovntps %%ymm11, 352(%0,%%rax) \n\t"
  106. "vmovntps %%ymm12, 384(%0,%%rax) \n\t"
  107. "vmovntps %%ymm13, 416(%0,%%rax) \n\t"
  108. "vmovntps %%ymm14, 448(%0,%%rax) \n\t"
  109. "vmovntps %%ymm15, 480(%0,%%rax) \n\t"
  110. "add $512, %%rax \n\t"
  111. "dec %2 \n\t"
  112. "jnz 1b \n\t"
  113. "pop %2 \n\t"
  114. "mfence"
  115. :
  116. : "p" (dst), "p" (src), "r" (sse_size)
  117. : "%rax"
  118. );
  119. }
  120. void pcilib_pagecpy(void *dst, const void *src, size_t size) {
  121. int gen = pcilib_get_cpu_gen();
  122. if ((gen > 3)&&((size%4096)==0)&&(((uintptr_t)dst%32)==0)&&(((uintptr_t)src%32)==0)) {
  123. pcilib_memcpy4k_avx(dst, src, size);
  124. } else
  125. memcpy(dst, src, size);
  126. }