123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153 |
- #define _POSIX_C_SOURCE 200112L
- #define _GNU_SOURCE
- #include <stdio.h>
- #include <string.h>
- #include <unistd.h>
- #include <stdint.h>
- #include <assert.h>
- #include <ctype.h>
- #include <time.h>
- #include <sched.h>
- #include <arpa/inet.h>
- #include <sys/time.h>
- #include "cpu.h"
- #include "pci.h"
- #include "tools.h"
- #include "error.h"
- /*
- void *memcpy128(void * dst, void const * src, size_t len) {
- long pos = - (len>>2);
- char * plDst = (char *) dst - 4 * pos;
- char const * plSrc = (char const *) src - 4 * pos;
- if (pos) {
- __asm__ __volatile__ (
- "1: \n\t"
- "mov (%0,%2,4), %%edi \n\t"
- "mov %%edi, (%1,%2,4) \n\t"
- "inc %2 \n\t"
- "jnz 1b \n\t"
- :
- : "r" (plSrc), "r" (plDst), "r" (pos)
- : "%edi"
- );
- }
- long pos = - ((len>>4)<<4);
- char * plDst = (char *) dst - pos;
- char const * plSrc = (char const *) src - pos;
- if (pos) {
- __asm__ __volatile__ (
- "1: \n\t"
- // "movdqa (%0,%2), %%xmm0 \n\t"
- "mov (%0,%2), %%esi \n\t"
- "movd %%esi, %%xmm0 \n\t"
- "mov 4(%0,%2), %%esi \n\t"
- "movd %%esi, %%xmm1 \n\t"
- "mov 8(%0,%2), %%esi \n\t"
- "movd %%esi, %%xmm2 \n\t"
- "mov 12(%0,%2), %%esi \n\t"
- "movd %%esi, %%xmm3 \n\t"
- "pslldq $4, %%xmm1 \n\t"
- "por %%xmm1, %%xmm0 \n\t"
- "pslldq $8, %%xmm2 \n\t"
- "por %%xmm2, %%xmm0 \n\t"
- "pslldq $12, %%xmm3 \n\t"
- "por %%xmm3, %%xmm0 \n\t"
-
- "movntdq %%xmm0, (%1,%2) \n\t"
- "add $16, %2 \n\t"
- "jnz 1b \n\t"
- :
- : "r" (plSrc), "r" (plDst), "r" (pos)
- : "%rsi"
- );
- }
- len &= 0x3;
- char * pcDst = (char *) plDst;
- char const * pcSrc = (char const *) plSrc;
- while (len--) {
- *pcDst++ = *pcSrc++;
- }
- return (dst);
- }
- */
/**
 * Copy `size` bytes from `src` to `dst` using 256-bit AVX loads and stores,
 * 512 bytes (16 ymm registers) per loop iteration.
 *
 * Requirements (not checked here):
 *  - the CPU must support AVX;
 *  - `dst` and `src` must be 32-byte aligned, because the aligned forms
 *    vmovdqa / vmovntps are used;
 *  - the regions must not overlap.
 *
 * Any remainder beyond the last full 512-byte block is copied with memcpy(),
 * so sizes that are not a multiple of 512 (including 0) are handled safely.
 *
 * @param dst   destination buffer (32-byte aligned)
 * @param src   source buffer (32-byte aligned)
 * @param size  number of bytes to copy
 */
void pcilib_memcpy4k_avx(void *dst, const void *src, size_t size) {
#if defined(__x86_64__)
    size_t blocks = size / 512;   /* number of full 512-byte iterations */
    size_t offset = 0;            /* byte offset, advanced by the asm loop */

    /* The loop below is do-while shaped: it must not be entered with a zero
     * count, otherwise the counter wraps and it runs past both buffers. */
    if (blocks > 0) {
        __asm__ __volatile__ (
            "1: \n\t"
            "vmovdqa (%[src],%[off]), %%ymm0 \n\t"
            "vmovdqa 32(%[src],%[off]), %%ymm1 \n\t"
            "vmovdqa 64(%[src],%[off]), %%ymm2 \n\t"
            "vmovdqa 96(%[src],%[off]), %%ymm3 \n\t"
            "vmovdqa 128(%[src],%[off]), %%ymm4 \n\t"
            "vmovdqa 160(%[src],%[off]), %%ymm5 \n\t"
            "vmovdqa 192(%[src],%[off]), %%ymm6 \n\t"
            "vmovdqa 224(%[src],%[off]), %%ymm7 \n\t"
            "vmovdqa 256(%[src],%[off]), %%ymm8 \n\t"
            "vmovdqa 288(%[src],%[off]), %%ymm9 \n\t"
            "vmovdqa 320(%[src],%[off]), %%ymm10 \n\t"
            "vmovdqa 352(%[src],%[off]), %%ymm11 \n\t"
            "vmovdqa 384(%[src],%[off]), %%ymm12 \n\t"
            "vmovdqa 416(%[src],%[off]), %%ymm13 \n\t"
            "vmovdqa 448(%[src],%[off]), %%ymm14 \n\t"
            "vmovdqa 480(%[src],%[off]), %%ymm15 \n\t"
            /* NOTE(review): the first 64 bytes of every block are written
             * with regular stores and the rest with non-temporal stores.
             * It is unclear whether this mix is intentional; it is kept
             * exactly as in the original. */
            "vmovdqa %%ymm0, (%[dst],%[off]) \n\t"
            "vmovdqa %%ymm1, 32(%[dst],%[off]) \n\t"
            "vmovntps %%ymm2, 64(%[dst],%[off]) \n\t"
            "vmovntps %%ymm3, 96(%[dst],%[off]) \n\t"
            "vmovntps %%ymm4, 128(%[dst],%[off]) \n\t"
            "vmovntps %%ymm5, 160(%[dst],%[off]) \n\t"
            "vmovntps %%ymm6, 192(%[dst],%[off]) \n\t"
            "vmovntps %%ymm7, 224(%[dst],%[off]) \n\t"
            "vmovntps %%ymm8, 256(%[dst],%[off]) \n\t"
            "vmovntps %%ymm9, 288(%[dst],%[off]) \n\t"
            "vmovntps %%ymm10, 320(%[dst],%[off]) \n\t"
            "vmovntps %%ymm11, 352(%[dst],%[off]) \n\t"
            "vmovntps %%ymm12, 384(%[dst],%[off]) \n\t"
            "vmovntps %%ymm13, 416(%[dst],%[off]) \n\t"
            "vmovntps %%ymm14, 448(%[dst],%[off]) \n\t"
            "vmovntps %%ymm15, 480(%[dst],%[off]) \n\t"
            "add $512, %[off] \n\t"
            "dec %[cnt] \n\t"
            "jnz 1b \n\t"
            /* Make the non-temporal stores globally visible before returning. */
            "mfence \n\t"
            /* Clear the upper ymm halves so later SSE code does not pay an
             * AVX/SSE transition penalty. */
            "vzeroupper"
            /* Counter and offset are modified by the loop, so they are
             * read-write operands; early-clobber keeps them from sharing a
             * register with dst/src. No push/pop is used: touching the stack
             * inside inline asm would corrupt the x86-64 red zone. */
            : [off] "+&r" (offset), [cnt] "+&r" (blocks)
            : [dst] "r" (dst), [src] "r" (src)
            : "memory", "cc",
              "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7",
              "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15"
        );
    }

    /* Copy whatever is left after the last full 512-byte block. */
    if (offset < size) {
        memcpy((char *)dst + offset, (const char *)src + offset, size - offset);
    }
#else
    /* The assembly above is x86-64 only; other targets use the C library. */
    memcpy(dst, src, size);
#endif
}
/**
 * Copy `size` bytes from `src` to `dst`, choosing between the AVX block copy
 * and the C library memcpy().
 *
 * The AVX routine is used only when all of the following hold:
 *  - pcilib_get_cpu_gen() reports a generation above 3 (presumably the
 *    generations with AVX support; confirm against pcilib_get_cpu_gen);
 *  - `size` is a non-zero multiple of 4096;
 *  - both `dst` and `src` are 32-byte aligned.
 * Every other case, including `size == 0`, goes through memcpy().
 *
 * @param dst   destination buffer
 * @param src   source buffer
 * @param size  number of bytes to copy
 */
void pcilib_pagecpy(void *dst, const void *src, size_t size) {
    int gen = pcilib_get_cpu_gen();
    /* size == 0 also satisfies (size % 4096) == 0, but the AVX loop always
     * executes at least one iteration, so it must never be given 0 bytes. */
    int whole_pages = (size > 0) && ((size % 4096) == 0);
    int aligned = (((uintptr_t)dst % 32) == 0) && (((uintptr_t)src % 32) == 0);

    if ((gen > 3) && whole_pages && aligned) {
        pcilib_memcpy4k_avx(dst, src, size);
    } else {
        memcpy(dst, src, size);
    }
}
|