SSE

説明

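GCC の拡張インラインアセンブラで SSE2 / SSE4.1 の 128bit パック整数演算を行うマクロ集. 各マクロはメモリ上のオペランドを movdqu で xmm レジスタに読み込み, 演算結果を movdqu でメモリに書き戻す.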

使い方

よく使いそうなもの:

そうではないもの:

コード(省スペース)

#define SSE2_M128I_ADD32(c,a,b) asm volatile("movdqu %1, %%xmm0;" "movdqu %2, %%xmm1;" "paddd %%xmm1, %%xmm0;" "movdqu %%xmm0, %0;" :"=m"(c) :"m"(a), "m"(b) :);
#define SSE2_M128I_ADD64(c,a,b) asm volatile("movdqu %1, %%xmm0;" "movdqu %2, %%xmm1;" "paddq %%xmm1, %%xmm0;" "movdqu %%xmm0, %0;" :"=m"(c) :"m"(a), "m"(b) :);
#define SSE2_M128I_SUB32(c,a,b) asm volatile("movdqu %1, %%xmm0;" "movdqu %2, %%xmm1;" "psubd %%xmm1, %%xmm0;" "movdqu %%xmm0, %0;" :"=m"(c) :"m"(a), "m"(b) :);

#define SSE2_M128U_MUL32(c,a,b) asm volatile("movdqu %1, %%xmm0;" "movdqu %2, %%xmm1;" "pmuludq %%xmm1, %%xmm0;" "movdqu %%xmm0, %0;" :"=m"(c) :"m"(a), "m"(b) :);

#define SSE4_M128I_MIN32(c,a,b) asm volatile("movdqu %1, %%xmm0;" "movdqu %2, %%xmm1;" "pminsd %%xmm1, %%xmm0;" "movdqu %%xmm0, %0;" :"=m"(c) :"m"(a), "m"(b) :);
#define SSE4_M128I_MAX32(c,a,b) asm volatile("movdqu %1, %%xmm0;" "movdqu %2, %%xmm1;" "pmaxsd %%xmm1, %%xmm0;" "movdqu %%xmm0, %0;" :"=m"(c) :"m"(a), "m"(b) :);
#define SSE4_M128U_MIN32(c,a,b) asm volatile("movdqu %1, %%xmm0;" "movdqu %2, %%xmm1;" "pminud %%xmm1, %%xmm0;" "movdqu %%xmm0, %0;" :"=m"(c) :"m"(a), "m"(b) :);
#define SSE4_M128U_MAX32(c,a,b) asm volatile("movdqu %1, %%xmm0;" "movdqu %2, %%xmm1;" "pmaxud %%xmm1, %%xmm0;" "movdqu %%xmm0, %0;" :"=m"(c) :"m"(a), "m"(b) :);

// d = min(a, b+c)
#define SSE4_M128I_MIN_NOP_ADD32(d,a,b,c) asm volatile("movdqu %1, %%xmm0;" "movdqu %2, %%xmm1;" "movdqu %3, %%xmm2;" "paddd %%xmm2, %%xmm1;" "pminsd %%xmm1, %%xmm0;" "movdqu %%xmm0, %0;" :"=m"(d) :"m"(a), "m"(b), "m"(c) :);
 
// d = max(a, b+c)
#define SSE4_M128I_MAX_NOP_ADD32(d,a,b,c) asm volatile("movdqu %1, %%xmm0;" "movdqu %2, %%xmm1;" "movdqu %3, %%xmm2;" "paddd %%xmm2, %%xmm1;" "pmaxsd %%xmm1, %%xmm0;" "movdqu %%xmm0, %0;" :"=m"(d) :"m"(a), "m"(b), "m"(c) :);
 
#define SSE2_M128I_EQ32(c,a,b) asm volatile("movdqu %1, %%xmm0;" "movdqu %2, %%xmm1;" "pcmpeqd %%xmm1, %%xmm0;" "movdqu %%xmm0, %0;" :"=m"(c) :"m"(a), "m"(b) :);
#define SSE2_M128I_NEQ32(c,a,b) asm volatile("movdqu %1, %%xmm0;" "movdqu %2, %%xmm1;" "pcmpeqd %%xmm1, %%xmm0;" "pcmpeqd %%xmm1, %%xmm1;" "pxor %%xmm1, %%xmm0;" "movdqu %%xmm0, %0;" :"=m"(c) :"m"(a), "m"(b) :);
#define SSE2_M128I_GT32(c,a,b) asm volatile("movdqu %1, %%xmm0;" "movdqu %2, %%xmm1;" "pcmpgtd %%xmm1, %%xmm0;" "movdqu %%xmm0, %0;" :"=m"(c) :"m"(a), "m"(b) :);
#define SSE2_M128I_LT32(c,a,b) asm volatile("movdqu %1, %%xmm0;" "movdqa %%xmm0, %%xmm1;" "movdqu %2, %%xmm2;" "pcmpgtd %%xmm2, %%xmm0;" "pcmpeqd %%xmm2, %%xmm1;" "por %%xmm1, %%xmm0;" "pcmpeqd %%xmm2, %%xmm2;" "pxor %%xmm2, %%xmm0;" "movdqu %%xmm0, %0;" :"=m"(c) :"m"(a), "m"(b) :);
#define SSE2_M128I_GE32(c,a,b) asm volatile("movdqu %1, %%xmm0;" "movdqa %%xmm0, %%xmm1;" "movdqu %2, %%xmm2;" "pcmpgtd %%xmm2, %%xmm0;" "pcmpeqd %%xmm2, %%xmm1;" "por %%xmm1, %%xmm0;" "movdqu %%xmm0, %0;" :"=m"(c) :"m"(a), "m"(b) :);

#define SSE2_M128I_OR(c,a,b) asm volatile("movdqu %1, %%xmm0;" "movdqu %2, %%xmm1;" "por %%xmm1, %%xmm0;" "movdqu %%xmm0, %0;" :"=m"(c) :"m"(a), "m"(b) :);
#define SSE2_M128I_AND(c,a,b) asm volatile("movdqu %1, %%xmm0;" "movdqu %2, %%xmm1;" "pand %%xmm1, %%xmm0;" "movdqu %%xmm0, %0;" :"=m"(c) :"m"(a), "m"(b) :);

#define SSE2_M128I_SHIFT_R4(c,a) asm volatile("movdqu %1, %%xmm0;" "psrldq $4, %%xmm0;" "movdqu %%xmm0, %0;" :"=m"(c) :"m"(a) :);

#define SSE2_M128I_SHUFFLE32_0020(c,a) asm volatile("movdqu %1, %%xmm0;" "pshufd $8, %%xmm0, %%xmm0;" "movdqu %%xmm0, %0;" :"=m"(c) :"m"(a) :);
#define SSE2_M128I_SHUFFLE32_0031(c,a) asm volatile("movdqu %1, %%xmm0;" "pshufd $13, %%xmm0, %%xmm0;" "movdqu %%xmm0, %0;" :"=m"(c) :"m"(a) :);
#define SSE2_M128I_UNPACKLO32(c,a,b) asm volatile("movdqu %1, %%xmm0;" "movdqu %2, %%xmm1;" "punpckldq %%xmm1, %%xmm0;" "movdqu %%xmm0, %0;" :"=m"(c) :"m"(a), "m"(b) :);

#define SSE2_M128U_MULLO32(c,a,b) asm volatile("movdqu %1, %%xmm0;" "movdqu %2, %%xmm1;" "movdqa %%xmm0, %%xmm2;" "movdqa %%xmm1, %%xmm3;" "psrldq $4, %%xmm2;" "psrldq $4, %%xmm3;" "pmuludq %%xmm1, %%xmm0;" "pmuludq %%xmm3, %%xmm2;" "pshufd $8, %%xmm0, %%xmm0;" "pshufd $8, %%xmm2, %%xmm2;" "punpckldq %%xmm2, %%xmm0;" "movdqu %%xmm0, %0;" :"=m"(c) :"m"(a), "m"(b) :);
#define SSE4_M128U_MULLO32(c,a,b) asm volatile("movdqu %1, %%xmm0;" "movdqu %2, %%xmm1;" "pmulld %%xmm1, %%xmm0;" "movdqu %%xmm0, %0;" :"=m"(c) :"m"(a), "m"(b) :);
#define SSE2_M128U_MULHI32(c,a,b) asm volatile("movdqu %1, %%xmm0;" "movdqu %2, %%xmm1;" "movdqa %%xmm0, %%xmm2;" "movdqa %%xmm1, %%xmm3;" "psrldq $4, %%xmm2;" "psrldq $4, %%xmm3;" "pmuludq %%xmm1, %%xmm0;" "pmuludq %%xmm3, %%xmm2;" "pshufd $13, %%xmm0, %%xmm0;" "pshufd $13, %%xmm2, %%xmm2;" "punpckldq %%xmm2, %%xmm0;" "movdqu %%xmm0, %0;" :"=m"(c) :"m"(a), "m"(b) :);

#define SSE2_M128U_ADDMOD32(c,a,b,md) asm volatile("movdqu %1, %%xmm0;" "movdqu %2, %%xmm1;" "paddd %%xmm1, %%xmm0;" "movdqu %3, %%xmm1;" "movdqa %%xmm1, %%xmm2;" "movdqa %%xmm0, %%xmm3;" "pcmpeqd %%xmm0, %%xmm2;" "pcmpgtd %%xmm1, %%xmm3;" "por %%xmm3, %%xmm2;" "pxor %%xmm3, %%xmm3;" "pcmpgtd %%xmm0, %%xmm3;" "por %%xmm3, %%xmm2;" "pand %%xmm2, %%xmm1;" "psubd %%xmm1, %%xmm0;" "movdqu %%xmm0, %0;" :"=m"(c) :"m"(a), "m"(b), "m"(md) :);
#define SSE2_M128U_SUBMOD32(c,a,b,md) asm volatile("movdqu %1, %%xmm0;" "movdqu %2, %%xmm1;" "psubd %%xmm1, %%xmm0;" "movdqu %3, %%xmm1;" "pxor %%xmm2, %%xmm2;" "pcmpgtd %%xmm0, %%xmm2;" "pand %%xmm2, %%xmm1;" "paddd %%xmm1, %%xmm0;" "movdqu %%xmm0, %0;" :"=m"(c) :"m"(a), "m"(b), "m"(md) :);

#define SSE4_M128U_ADDMOD32(c,a,b,md) asm volatile("movdqu %1, %%xmm0;" "movdqu %2, %%xmm1;" "paddd %%xmm1, %%xmm0;" "movdqa %%xmm0, %%xmm1;" "movdqu %3, %%xmm2;" "psubd %%xmm2, %%xmm1;" "pminud %%xmm1, %%xmm0;" "movdqu %%xmm0, %0;" :"=m"(c) :"m"(a), "m"(b), "m"(md) :);
#define SSE4_M128U_SUBMOD32(c,a,b,md) asm volatile("movdqu %1, %%xmm0;" "movdqu %2, %%xmm1;" "psubd %%xmm1, %%xmm0;" "movdqa %%xmm0, %%xmm1;" "movdqu %3, %%xmm2;" "paddd %%xmm2, %%xmm1;" "pminud %%xmm1, %%xmm0;" "movdqu %%xmm0, %0;" :"=m"(c) :"m"(a), "m"(b), "m"(md) :);
#define SSE4_M128U_MULMOD32M(c,a,b,md,mdninv) asm volatile("movdqu %1, %%xmm0;" "movdqu %2, %%xmm2;" "movdqa %%xmm0, %%xmm1;" "pmuludq %%xmm2, %%xmm0;" "psrldq $4, %%xmm1;" "psrldq $4, %%xmm2;" "pmuldq %%xmm2, %%xmm1;" "pshufd $8, %%xmm0, %%xmm2;" "pshufd $8, %%xmm1, %%xmm3;" "punpckldq %%xmm3, %%xmm2;" "pmulld %4, %%xmm2;" "movdqu %3, %%xmm4;" "movdqa %%xmm2, %%xmm3;" "pmuludq %%xmm4, %%xmm2;" "psrldq $4, %%xmm3;" "pmuludq %%xmm4, %%xmm3;" "paddq %%xmm0, %%xmm2;" "paddq %%xmm1, %%xmm3;" "pshufd $13, %%xmm2, %%xmm2;" "pshufd $13, %%xmm3, %%xmm3;" "punpckldq %%xmm3, %%xmm2;" "movdqa %%xmm2, %%xmm0;" "psubd %%xmm4, %%xmm2;" "pminud %%xmm2, %%xmm0;" "movdqu %%xmm0, %0;" :"=m"(c) :"m"(a), "m"(b), "m"(md), "m"(mdninv) :);

// c = a+b, d = a-b (mod md)
#define SSE4_M128U_ADDSUBMOD32(c,d,a,b,md) asm volatile("movdqu %2, %%xmm0;" "movdqu %3, %%xmm2;" "movdqa %%xmm0, %%xmm1;" "paddd %%xmm2, %%xmm0;" "psubd %%xmm2, %%xmm1;" "movdqu %4, %%xmm3;" "movdqa %%xmm0, %%xmm2;" "psubd %%xmm3, %%xmm2;" "paddd %%xmm1, %%xmm3;" "pminud %%xmm2, %%xmm0;" "pminud %%xmm3, %%xmm1;" "movdqu %%xmm0, %0;" "movdqu %%xmm1, %1;" :"=m"(c), "=m"(d) :"m"(a), "m"(b), "m"(md) :);
// d = a + (b&c)
#define SSE4_M128U_ADDANDMOD32(d,a,b,c,md) asm volatile("movdqu %1, %%xmm0;" "movdqu %2, %%xmm1;" "movdqu %3, %%xmm2;" "pand %%xmm2, %%xmm1;" "paddd %%xmm1, %%xmm0;" "movdqa %%xmm0, %%xmm1;" "movdqu %4, %%xmm2;" "psubd %%xmm2, %%xmm1;" "pminud %%xmm1, %%xmm0;" "movdqu %%xmm0, %0;" :"=m"(d) :"m"(a), "m"(b), "m"(c), "m"(md) :);
// d = a + b*c
#define SSE4_M128U_ADDMULMOD32M(d,a,b,c,md,mdninv) asm volatile("movdqu %2, %%xmm0;" "movdqu %3, %%xmm2;" "movdqa %%xmm0, %%xmm1;" "pmuludq %%xmm2, %%xmm0;" "psrldq $4, %%xmm1;" "psrldq $4, %%xmm2;" "pmuldq %%xmm2, %%xmm1;" "pshufd $8, %%xmm0, %%xmm2;" "pshufd $8, %%xmm1, %%xmm3;" "punpckldq %%xmm3, %%xmm2;" "pmulld %5, %%xmm2;" "movdqu %4, %%xmm4;" "movdqa %%xmm2, %%xmm3;" "pmuludq %%xmm4, %%xmm2;" "psrldq $4, %%xmm3;" "pmuludq %%xmm4, %%xmm3;" "paddq %%xmm0, %%xmm2;" "paddq %%xmm1, %%xmm3;" "pshufd $13, %%xmm2, %%xmm2;" "pshufd $13, %%xmm3, %%xmm3;" "punpckldq %%xmm3, %%xmm2;" "movdqa %%xmm2, %%xmm0;" "psubd %%xmm4, %%xmm2;" "pminud %%xmm2, %%xmm0;" "movdqu %1, %%xmm1;" "paddd %%xmm1, %%xmm0;" "movdqa %%xmm0, %%xmm1;" "psubd %%xmm4, %%xmm1;" "pminud %%xmm1, %%xmm0;" "movdqu %%xmm0, %0;" :"=m"(d) :"m"(a), "m"(b), "m"(c), "m"(md), "m"(mdninv) :);

#define SSE4_M128U_CHAR2INT(c,a) asm volatile("movdqu %1, %%xmm0;" "pmovzxbd %%xmm0, %%xmm1;" "movdqu %%xmm1, %0;" :"=m"(c) :"m"(a) :);

コード

hoge


Current time: 2017年09月22日15時09分16秒
Last modified: 2015年12月12日01時20分10秒 (by laycrs)
Tags: no_tags
トップページに戻る

Logged in as: unknown user (not login)

ログイン: