/* Execute the CPUID instruction with EAX = info_eax and ECX = info_ecx and
 * store the resulting EAX/EBX/ECX/EDX into cpu_info[0..3].
 * MSVC 32-bit inline assembly; the compiler tracks the registers used. */
static void cmd_cpuid(uint32_t info_eax,uint32_t info_ecx,uint32_t* cpu_info) {
__asm {
mov eax, info_eax      // CPUID leaf
mov ecx, info_ecx      // CPUID sub-leaf
mov edi, cpu_info      // EDI -> output array
cpuid
mov [edi], eax         // cpu_info[0] = EAX
mov [edi + 4], ebx     // cpu_info[1] = EBX
mov [edi + 8], ecx     // cpu_info[2] = ECX
mov [edi + 12],edx     // cpu_info[3] = EDX
}
}
// Detect supported x86 SIMD instruction sets via CPUID.
// Note: AVX/AVX2 detection is deliberately not implemented — it would also
// require checking OS support (XSAVE state), not just the CPU feature bits.
// Returns a bitmask of kCpuHas* flags; kCpuHasX86 is always set.
int cpu_detect(void) {
uint32_t leaf0[4] = { 0, 0, 0, 0 };
uint32_t leaf1[4] = { 0, 0, 0, 0 };
uint32_t leaf7[4] = { 0, 0, 0, 0 };
int caps = kCpuHasX86;
cmd_cpuid(0, 0, leaf0);
cmd_cpuid(1, 0, leaf1);
if (leaf0[0] >= 7) {
    cmd_cpuid(7, 0, leaf7);                     // structured extended features
}
if (leaf1[3] & 0x04000000) caps |= kCpuHasSSE2;   // leaf 1 EDX bit 26
if (leaf1[2] & 0x00000200) caps |= kCpuHasSSSE3;  // leaf 1 ECX bit 9
if (leaf1[2] & 0x00080000) caps |= kCpuHasSSE41;  // leaf 1 ECX bit 19
if (leaf1[2] & 0x00100000) caps |= kCpuHasSSE42;  // leaf 1 ECX bit 20
if (leaf1[2] & 0x00001000) caps |= kCpuHasFMA3;   // leaf 1 ECX bit 12
if (leaf7[1] & 0x00000200) caps |= kCpuHasERMS;   // leaf 7 EBX bit 9
return caps;
}
// One row of planar YUV -> packed 24-bit BGR, 15-bit fixed-point
// coefficients. Chroma is sampled at half the horizontal rate (u/v index
// x/2), so two adjacent pixels share one U/V sample.
void yuv_to_bgr_row_c(uint8_t *ybuf,uint8_t *ubuf,uint8_t *vbuf,uint8_t *bgr,int width)
{
uint8_t *out = bgr;
for (int i = 0; i < width; ++i, out += 3) {
    const int luma = (ybuf[i] - 16) * 38142;
    const int cb   = ubuf[i >> 1] - 128;
    const int cr   = vbuf[i >> 1] - 128;
    out[2] = pix_clip((luma + 58753 * cr) >> 15);              // R
    out[1] = pix_clip((luma - 6980 * cb - 17465 * cr) >> 15);  // G
    out[0] = pix_clip((luma + 69206 * cb) >> 15);              // B
}
}
// One row of planar YUV -> packed 32-bit BGRA, 15-bit fixed-point
// coefficients; chroma shared between adjacent pixel pairs (u/v index x/2).
// FIX: the alpha byte was previously left uninitialized here, while both
// SIMD row converters store 0 in it — so the same frame could differ
// depending on which path the dispatcher picked. Write 0 to match.
void yuv_to_bgra_row_c(uint8_t *ybuf,uint8_t *ubuf,uint8_t *vbuf,uint8_t *bgr,int width)
{
for(int x=0; x<width; x++) {
    int y = (ybuf[x] - 16) * 38142;
    int u = ubuf[x/2] - 128;
    int v = vbuf[x/2] - 128;
    bgr[x*4]   = pix_clip((y + 69206*u)>>15);            // B
    bgr[x*4+1] = pix_clip((y - 6980*u - 17465*v)>>15);   // G
    bgr[x*4+2] = pix_clip((y + 58753*v)>>15);            // R
    bgr[x*4+3] = 0;                                      // A: matches SIMD paths
}
}
// Byte layout of a 32-bit BGRA pixel: B at offset 0, G at 1, R at 2,
// alpha at 3; C_BPP4 = bytes per pixel.
#define R_INDEX 2
#define G_INDEX 1
#define B_INDEX 0
#define C_BPP4 4
// One packed BGRA row -> 8-bit luma (Y), 15-bit fixed-point weights.
// 524288 == 16<<15, i.e. the studio-swing +16 offset folded into the sum;
// the result is clamped to [16,235].
void bgra_to_y_row_c(uint8_t *src,uint8_t *ybuf,int width) {
const uint8_t *px = src;
for (int i = 0; i < width; ++i, px += 4) {
    int luma = (px[2]*5983 + px[1]*20127 + px[0]*2032 + 524288) >> 15;  // R,G,B
    if (luma < 16) luma = 16;
    if (luma > 235) luma = 235;
    ybuf[i] = (uint8_t)luma;
}
}
// Packed BGRA -> one row of 4:2:0 chroma (U/V), averaging each 2x2 pixel
// block across this row and the row `stride` bytes below. 15-bit fixed
// point; 4194304 == 128<<15 folds in the +128 chroma bias. One U and one V
// byte are written per two input pixels; output clamped to [16,240].
// Assumes `width` is even — TODO confirm at call sites.
// FIX: the clamps compared an int against the double literal 240.0, forcing
// a pointless int->double conversion; use integer 240 (behavior unchanged).
void bgra_to_uv_row_c(uint8_t *src,uint8_t *ubuf,uint8_t *vbuf,int stride,int width) {
for (int x = 0, o = 0; x < width; x += 2, o++) {
    int uu = 0, vv = 0;
    /* accumulate the four samples of the 2x2 block:
       k&1 selects the right-hand pixel, k>>1 selects the lower row */
    for (int k = 0; k < 4; k++) {
        const uint8_t *p = src + (x + (k & 1)) * 4 + (k >> 1) * stride;
        int b = p[0], g = p[1], r = p[2];
        uu += ((-3298*r - 11094*g + 14392*b) + 4194304) >> 15;
        vv += (( 14392*r - 13073*g -  1320*b) + 4194304) >> 15;
    }
    uu /= 4;
    vv /= 4;
    if (uu < 16) uu = 16; else if (uu > 240) uu = 240;
    if (vv < 16) vv = 16; else if (vv > 240) vv = 240;
    ubuf[o] = (uint8_t)uu;
    vbuf[o] = (uint8_t)vv;
}
}
// SSE4.1 row converter: planar YUV -> packed BGRA, 4 pixels per iteration,
// same 15-bit fixed-point coefficients as yuv_to_bgra_row_c. Alpha bytes
// are stored as 0 (B,G,R,0 per pixel).
// NOTE(review): every movdqu loads 16 bytes although only 4 Y / 2 U / 2 V
// bytes are consumed per iteration, and width is assumed to be a positive
// multiple of 4 — confirm the buffers are padded and width aligned.
void yuv_to_bgra_row_sse41(uint8_t *ybuf,uint8_t *ubuf,uint8_t *vbuf,uint8_t *bgra,int width)
{
__asm {
mov edi, bgra
mov esi, ybuf
mov edx, ubuf
mov ecx, vbuf
mov eax, width          // eax = remaining pixel count
mov ebx, 16
movd xmm7,ebx
pshufd xmm7,xmm7,0x00   // xmm7 = 4 x dword 16   (luma offset)
mov ebx, 128
movd xmm6,ebx
pshufd xmm6,xmm6,0x00   // xmm6 = 4 x dword 128  (chroma bias)
loop_convert:
pxor xmm4,xmm4          // zero register for widening
movdqu xmm0, [esi]      // Y bytes
movdqu xmm1, [edx]      // U bytes
movdqu xmm2, [ecx]      // V bytes
movdqa xmm3, xmm1
movdqa xmm5, xmm2
punpcklbw xmm1, xmm3    // duplicate each U byte: u0,u0,u1,u1,...
punpcklbw xmm2, xmm5    // duplicate each V byte: v0,v0,v1,v1,...
punpcklbw xmm0, xmm4    // bytes -> words
punpcklbw xmm1, xmm4
punpcklbw xmm2, xmm4
punpcklwd xmm0, xmm4    // words -> dwords (4 pixel lanes)
punpcklwd xmm1, xmm4
punpcklwd xmm2, xmm4
psubd xmm0, xmm7        // y = Y - 16
psubd xmm1, xmm6        // u = U - 128
psubd xmm2, xmm6        // v = V - 128
mov ebx, 38142
movd xmm5, ebx
pshufd xmm5, xmm5,0x00
pmulld xmm0, xmm5       // y *= 38142 (1.164 in Q15)
movdqu xmm3, xmm2
mov ebx, 58753
movd xmm4, ebx
pshufd xmm4, xmm4,0x00
pmulld xmm3, xmm4       // v*58753
paddd xmm3, xmm0        // xmm3 = R (Q15)
movdqu xmm4, xmm1
mov ebx, 69206
movd xmm5, ebx
pshufd xmm5, xmm5,0x00
pmulld xmm4, xmm5       // u*69206
paddd xmm4, xmm0        // xmm4 = B (Q15)
mov ebx, 6980
movd xmm5, ebx
pshufd xmm5, xmm5,0x00
pmulld xmm1, xmm5       // u*6980
psubd xmm0, xmm1
mov ebx, 17465
movd xmm5, ebx
pshufd xmm5, xmm5,0x00
pmulld xmm2, xmm5       // v*17465
psubd xmm0, xmm2        // xmm0 = G = y - 6980*u - 17465*v (Q15)
psrad xmm3, 15          // drop the Q15 fraction
psrad xmm0, 15
psrad xmm4, 15
pxor xmm1, xmm1
pxor xmm2, xmm2
packusdw xmm3, xmm1     // dwords -> words with unsigned saturation
packusdw xmm0, xmm2
packusdw xmm4, xmm1
packuswb xmm3, xmm2 //     words -> bytes: R clamped to [0,255]
packuswb xmm0, xmm1 //     G
packuswb xmm4, xmm2 //     B
punpcklbw xmm4, xmm0    // interleave B,G
punpcklbw xmm3, xmm2    // interleave R,0 (alpha = 0)
punpcklwd xmm4, xmm3    // B,G,R,0 per pixel
movdqu [edi],xmm4       // store 4 BGRA pixels (16 bytes)
add edi,16
add esi,4
add edx,2
add ecx,2
sub eax,4
jg loop_convert
emms                    // NOTE(review): no MMX registers used; presumably defensive
}
}
// SSE2 row converter: planar YUV -> packed BGRA, 8 pixels per iteration.
// Works in 16-bit lanes with 5-bit fixed-point coefficients (37,57,68,7,17
// = the C path's Q15 constants rounded to Q5), so this path has lower
// precision than yuv_to_bgra_row_c / the SSE4.1 path. Alpha bytes are 0.
// NOTE(review): loads 16 bytes per movdqu but consumes 8 Y / 4 U / 4 V per
// iteration, and width is assumed to be a positive multiple of 8 — confirm
// buffer padding at the call sites.
void yuv_to_bgra_row_sse2(uint8_t *ybuf,uint8_t *ubuf,uint8_t *vbuf,uint8_t *bgra,int width)
{
__asm {
mov ebx, 0x00100010
movd xmm7,ebx
pshufd xmm7,xmm7,0x00   // xmm7 = 8 x word 16  (luma offset)
mov ecx, 0x00800080
movd xmm6,ecx
pshufd xmm6,xmm6,0x00   // xmm6 = 8 x word 128 (chroma bias)
mov edi, bgra
mov esi, ybuf
mov edx, ubuf
mov ecx, vbuf
mov eax, width
loop_convert:
pxor xmm4, xmm4
movdqu xmm0, [esi]
movdqu xmm1, [edx]
movdqu xmm2, [ecx]
punpcklbw xmm0, xmm4    // Y bytes -> words
punpcklbw xmm1, xmm1    // duplicate U bytes: per-pixel chroma
punpcklbw xmm2, xmm2    // duplicate V bytes
punpcklbw xmm1, xmm4    // -> words
punpcklbw xmm2, xmm4
psubw xmm0, xmm7        // y = Y - 16
psubw xmm1, xmm6        // u = U - 128
psubw xmm2, xmm6        // v = V - 128
mov ebx, 0x00250025
movd xmm5, ebx
pshufd xmm5, xmm5,0x00
pmullw xmm0, xmm5       // y *= 37 (1.164 in Q5)
movdqa xmm3, xmm2
mov ebx, 0x00390039
movd xmm4, ebx
pshufd xmm4, xmm4,0x00
pmullw xmm3, xmm4       // v*57
paddw xmm3, xmm0        // R = y + 57*v
movdqa xmm4, xmm1
mov ebx, 0x00440044
movd xmm5, ebx
pshufd xmm5, xmm5,0x00
pmullw xmm4, xmm5       // u*68
paddw xmm4, xmm0        // B = y + 68*u
mov ebx, 0x00070007
movd xmm5, ebx
pshufd xmm5, xmm5,0x00
pmullw xmm1, xmm5       // u*7
psubw xmm0, xmm1
mov ebx, 0x00110011
movd xmm5, ebx
pshufd xmm5, xmm5,0x00
pmullw xmm2, xmm5       // v*17
psubw xmm0, xmm2        // G = y - 7*u - 17*v
psraw xmm3, 5           // drop the Q5 fraction
psraw xmm0, 5
psraw xmm4, 5
pxor xmm5, xmm5
packuswb xmm3, xmm5     // words -> bytes, clamps to [0,255]
packuswb xmm0, xmm5
packuswb xmm4, xmm5
punpcklbw xmm4, xmm0    // interleave B,G
punpcklbw xmm3, xmm5    // interleave R,0 (alpha = 0)
movhlps xmm0, xmm4      // split off pixels 4..7
movhlps xmm1, xmm3
punpcklwd xmm4, xmm3    // pixels 0..3: B,G,R,0
punpcklwd xmm0, xmm1    // pixels 4..7
movdqu [edi], xmm4
movdqu [edi+16],xmm0    // 8 BGRA pixels = 32 bytes
add edi,32
add esi,8
add edx,4
add ecx,4
sub eax,8
jg loop_convert
emms                    // NOTE(review): no MMX registers used; presumably defensive
}
}
// pshufb control mask: compacts the low three bytes (B,G,R) of four 4-byte
// BGRA pixels into 12 contiguous BGR bytes; the last four lanes replicate byte 0.
IMGALIGN32 const char krgb24_index[16]={0,1,2,4,5,6,8,9,10,12,13,14,0,0,0,0};
// SSSE3 row converter: planar YUV -> packed 24-bit BGR, 8 pixels per
// iteration. Same Q5 (5-bit) fixed-point math as yuv_to_bgra_row_sse2,
// then pshufb (krgb24_index) strips the alpha lane to produce BGR triples.
// NOTE(review): mixes saturating (psubsw/paddsw) and wrapping (paddw) word
// ops; with these coefficients the values stay inside int16 range, so the
// two behave identically here.
// NOTE(review): the second store writes 16 bytes at [edi+12] while edi only
// advances by 24, so 4 bytes beyond the 24 output bytes are written each
// iteration (overwritten by the next one, but spilling past the row end on
// the last) — confirm the destination has slack. width is assumed to be a
// positive multiple of 8.
void yuv_to_bgr_row_ssse3(uint8_t *ybuf,uint8_t *ubuf,uint8_t *vbuf,uint8_t *bgr,int width)
{
__asm {
mov ebx, 0x00100010
movd xmm7,ebx
pshufd xmm7,xmm7,0x00   // xmm7 = 8 x word 16  (luma offset)
mov ecx, 0x00800080
movd xmm6,ecx
pshufd xmm6,xmm6,0x00   // xmm6 = 8 x word 128 (chroma bias)
mov edi, bgr
mov esi, ybuf
mov edx, ubuf
mov ecx, vbuf
mov eax, width
loop_convert:
pxor xmm4, xmm4
movdqu xmm0, [esi]
movdqu xmm1, [edx]
movdqu xmm2, [ecx]
punpcklbw xmm0, xmm4    // Y bytes -> words
punpcklbw xmm1, xmm1    // duplicate U bytes
punpcklbw xmm2, xmm2    // duplicate V bytes
punpcklbw xmm1, xmm4
punpcklbw xmm2, xmm4
psubsw xmm0,xmm7        // y = Y - 16
psubsw xmm1,xmm6        // u = U - 128
psubsw xmm2,xmm6        // v = V - 128
mov ebx, 0x00250025
movd xmm5, ebx
pshufd xmm5, xmm5,0x00
pmullw xmm0, xmm5       // y *= 37 (1.164 in Q5)
movdqa xmm3, xmm2
mov ebx, 0x00390039
movd xmm4, ebx
pshufd xmm4, xmm4,0x00
pmullw xmm3, xmm4       // v*57
paddw xmm3, xmm0        // R = y + 57*v
movdqa xmm4, xmm1
mov ebx, 0x00440044
movd xmm5, ebx
pshufd xmm5, xmm5,0x00
pmullw xmm4, xmm5       // u*68
paddsw xmm4, xmm0       // B = y + 68*u
mov ebx, 0x00070007
movd xmm5, ebx
pshufd xmm5, xmm5,0x00
pmullw xmm1, xmm5       // u*7
psubsw xmm0, xmm1
mov ebx, 0x00110011
movd xmm5, ebx
pshufd xmm5, xmm5,0x00
pmullw xmm2, xmm5       // v*17
psubsw xmm0, xmm2       // G = y - 7*u - 17*v
psraw xmm3, 5           // drop the Q5 fraction
psraw xmm0, 5
psraw xmm4, 5
pxor xmm5, xmm5
packuswb xmm3, xmm5     // clamp to [0,255]
packuswb xmm0, xmm5
packuswb xmm4, xmm5
punpcklbw xmm4, xmm0    // interleave B,G
punpcklbw xmm3, xmm5    // interleave R,0
movhlps xmm0, xmm4      // split off pixels 4..7
movhlps xmm1, xmm3
punpcklwd xmm4, xmm3    // pixels 0..3: B,G,R,0
punpcklwd xmm0, xmm1    // pixels 4..7
movdqa xmm2, krgb24_index
pshufb xmm4, xmm2       // drop alpha: 4 x BGR in low 12 bytes
pshufb xmm0, xmm2
movdqu [edi], xmm4
movdqu [edi+12],xmm0    // overlapping 16-byte store (see header note)
add edi,24
add esi,8
add edx,4
add ecx,4
sub eax,8
jg loop_convert
emms                    // NOTE(review): no MMX registers used; presumably defensive
}
}
// SSE4.1 row converter: planar YUV -> packed 24-bit BGR, 4 pixels per
// iteration. Identical Q15 math to yuv_to_bgra_row_sse41; the final pshufb
// (krgb24_index) compacts the four B,G,R,0 pixels into 12 BGR bytes.
// NOTE(review): the store writes 16 bytes while edi advances by 12, so 4
// bytes past the 12 output bytes are written each iteration (overwritten by
// the next, but spilling past the row end on the last) — confirm slack.
// Loads also read 16 bytes per movdqu though only 4 Y / 2 U / 2 V are used;
// width is assumed to be a positive multiple of 4.
void yuv_to_bgr_row_sse41(uint8_t *ybuf,uint8_t *ubuf,uint8_t *vbuf,uint8_t *bgr,int width)
{
__asm {
mov edi, bgr
mov esi, ybuf
mov edx, ubuf
mov ecx, vbuf
mov eax, width
mov ebx, 16
movd xmm7,ebx
pshufd xmm7,xmm7,0x00   // xmm7 = 4 x dword 16  (luma offset)
mov ebx, 128
movd xmm6,ebx
pshufd xmm6,xmm6,0x00   // xmm6 = 4 x dword 128 (chroma bias)
loop_convert:
pxor xmm4,xmm4
movdqu xmm0, [esi]      // Y bytes
movdqu xmm1, [edx]      // U bytes
movdqu xmm2, [ecx]      // V bytes
movdqa xmm3, xmm1
movdqa xmm5, xmm2
punpcklbw xmm1, xmm3    // duplicate each U byte
punpcklbw xmm2, xmm5    // duplicate each V byte
punpcklbw xmm0, xmm4    // bytes -> words
punpcklbw xmm1, xmm4
punpcklbw xmm2, xmm4
punpcklwd xmm0, xmm4    // words -> dwords
punpcklwd xmm1, xmm4
punpcklwd xmm2, xmm4
psubd xmm0, xmm7        // y = Y - 16
psubd xmm1, xmm6        // u = U - 128
psubd xmm2, xmm6        // v = V - 128
mov ebx, 38142
movd xmm5, ebx
pshufd xmm5, xmm5,0x00
pmulld xmm0, xmm5       // y *= 38142 (1.164 in Q15)
movdqu xmm3, xmm2
mov ebx, 58753
movd xmm4, ebx
pshufd xmm4, xmm4,0x00
pmulld xmm3, xmm4       // v*58753
paddd xmm3, xmm0        // R
movdqu xmm4, xmm1
mov ebx, 69206
movd xmm5, ebx
pshufd xmm5, xmm5,0x00
pmulld xmm4, xmm5       // u*69206
paddd xmm4, xmm0        // B
mov ebx, 6980
movd xmm5, ebx
pshufd xmm5, xmm5,0x00
pmulld xmm1, xmm5       // u*6980
psubd xmm0, xmm1
mov ebx, 17465
movd xmm5, ebx
pshufd xmm5, xmm5,0x00
pmulld xmm2, xmm5       // v*17465
psubd xmm0, xmm2        // G = y - 6980*u - 17465*v
psrad xmm3, 15          // drop the Q15 fraction
psrad xmm0, 15
psrad xmm4, 15
pxor xmm1, xmm1
pxor xmm2, xmm2
packusdw xmm3, xmm1     // dwords -> words, saturating
packusdw xmm0, xmm2
packusdw xmm4, xmm1
packuswb xmm3, xmm2 //     words -> bytes: R clamped to [0,255]
packuswb xmm0, xmm1 //     G
packuswb xmm4, xmm2 //     B
punpcklbw xmm4, xmm0    // interleave B,G
punpcklbw xmm3, xmm2    // interleave R,0
punpcklwd xmm4, xmm3    // B,G,R,0 per pixel
movdqa xmm0, krgb24_index
pshufb xmm4, xmm0       // drop alpha: 4 x BGR in low 12 bytes
movdqu [edi],xmm4       // 16-byte store, 12 bytes consumed (see note)
add edi,12
add esi,4
add edx,2
add ecx,2
sub eax,4
jg loop_convert
emms                    // NOTE(review): no MMX registers used; presumably defensive
}
}
// pmaddwd word weights for luma in B,G,R,A lane order (Q15: ~0.062, 0.614,
// 0.183 — appear to be BT.709 studio-swing weights, TODO confirm),
// duplicated for two pixels per register.
IMGALIGN32 const short krgba2y_index[8]={2032,20127,5983,0, 2032,20127,5983,0};
// pmaddubsw byte weights for luma, B,G,R,A order in Q7 (8,79,23,0),
// replicated for the four pixels of one register.
IMGALIGN32 const char kj_rgba_index[16]={8,79,23,0,8,79,23,0,8,79,23,0,8,79,23,0};
// SSE4.1: packed BGRA row -> 8-bit luma, 16 pixels (64 input bytes) per
// iteration. Two builds: __PIX_15BITS__ uses word weights + pmaddwd in Q15,
// otherwise byte weights + pmaddubsw in Q7. The additive constants
// (524288 == 16<<15, 0x0800 == 16<<7) fold in the +16 studio-swing offset.
// NOTE(review): width is assumed to be a positive multiple of 16 — confirm.
void bgra_to_y_row_sse41(uint8_t *src,uint8_t *ybuf,int width) {
#ifdef __PIX_15BITS__
__asm {
mov esi, src
mov edi, ybuf
mov eax, width
movdqa xmm7, krgba2y_index   // word weights B,G,R,0 (Q15)
mov ecx, 524288
movd xmm5, ecx
pshufd xmm6, xmm5,0x00       // xmm6 = 4 x dword (16<<15)
pxor xmm4, xmm4              // zero, for byte->word widening
loop_convert:
movdqu xmm0, [esi]           // pixels 0..3
movdqu xmm2, [esi+16]        // pixels 4..7
movhlps xmm1, xmm0
movhlps xmm3, xmm2
punpcklbw xmm0, xmm4         // widen to words
punpcklbw xmm1, xmm4
punpcklbw xmm2, xmm4
punpcklbw xmm3, xmm4
pmaddwd xmm0, xmm7           // per pixel: (B*2032+G*20127),(R*5983+0)
pmaddwd xmm1, xmm7
pmaddwd xmm2, xmm7
pmaddwd xmm3, xmm7
phaddd xmm0, xmm1            // finish the dot product: one dword per pixel
phaddd xmm2, xmm3
paddd xmm0, xmm6             // + (16<<15)
paddd xmm2, xmm6
psrad xmm0, 15
psrad xmm2, 15
packusdw xmm0, xmm2          // dwords -> words, saturating
packuswb xmm0, xmm4          // words -> bytes: pixels 0..7
//next 16
movdqu xmm1, [esi+32]        // pixels 8..11
movdqu xmm3, [esi+48]        // pixels 12..15
movhlps xmm2, xmm1
punpcklbw xmm1, xmm4
punpcklbw xmm2, xmm4
pmaddwd xmm1, xmm7
pmaddwd xmm2, xmm7
phaddd xmm1, xmm2
movhlps xmm2, xmm3
punpcklbw xmm3, xmm4
punpcklbw xmm2, xmm4
pmaddwd xmm3, xmm7
pmaddwd xmm2, xmm7
phaddd xmm3, xmm2
paddd xmm1, xmm6
paddd xmm3, xmm6
psrld xmm1, 15               // logical shift; values are non-negative here,
psrld xmm3, 15               // so this matches the psrad used above
packusdw xmm1, xmm3
packuswb xmm1, xmm4          // pixels 8..15
movlhps xmm0, xmm1           // combine 16 luma bytes
movdqu [edi],xmm0
add edi, 16
add esi, 64
sub eax, 16
jg loop_convert
emms                         // NOTE(review): no MMX used; presumably defensive
}
#else
__asm {
mov esi, src
mov edi, ybuf
mov eax, width
movdqa xmm7, kj_rgba_index   // byte weights B,G,R,0 (Q7)
mov ecx, 0x08000800
movd xmm5, ecx
pshufd xmm6, xmm5,0x00       // xmm6 = 8 x word (16<<7)
pxor xmm4, xmm4
loop_convert:
movdqu xmm0, [esi]           // pixels 0..3
movdqu xmm1, [esi+16]        // pixels 4..7
movdqu xmm2, [esi+32]        // pixels 8..11
movdqu xmm3, [esi+48]        // pixels 12..15
pmaddubsw xmm0, xmm7         // per pixel: word pair (8B+79G),(23R+0)
pmaddubsw xmm1, xmm7
pmaddubsw xmm2, xmm7
pmaddubsw xmm3, xmm7
phaddw xmm0, xmm1            // finish the dot product: one word per pixel
phaddw xmm2, xmm3
paddw xmm0, xmm6             // + (16<<7)
paddw xmm2, xmm6
psraw xmm0, 7
psraw xmm2, 7
packuswb xmm0, xmm4          // clamp to bytes
packuswb xmm2, xmm4
movlhps xmm0, xmm2
movdqu [edi],xmm0            // 16 luma bytes
add edi, 16
add esi, 64
sub eax, 16
jg loop_convert
emms                         // NOTE(review): no MMX used; presumably defensive
}
#endif //
}
//IMGALIGN32 const short krgba2u_index[16]={540,-347,-103,0, 540,-347,-103,0};
//IMGALIGN32 const short krgba2v_index[16]={-41,-409,450,0, -41,-409,450,0};
// pmaddwd word weights for U and V in B,G,R,A lane order (Q15), two pixels
// per register. 14392/32768 ~= 0.439 — appear to match BT.709 chroma
// weights; TODO confirm.
IMGALIGN32 const short krgba2u_index[8]={14392,-11094,-3298,0, 14392,-11094,-3298,0};
IMGALIGN32 const short krgba2v_index[8]={-1320,-13073,14392,0, -1320,-13073,14392,0};
// pmaddubsw byte weights (Q7) for U and V, B,G,R,A order, replicated for
// four pixels per register.
IMGALIGN32 const char kua_rgba_index[16]={56,-43,-13,0, 56,-43,-13,0, 56,-43,-13,0, 56,-43,-13,0};
IMGALIGN32 const char kva_rgba_index[16]={-5,-51,56,0,-5,-51,56,0,-5,-51,56,0,-5,-51,56,0};
// SSE4.1: packed BGRA -> one row of 4:2:0 chroma, averaging 2x2 pixel
// blocks across this row and the one `stride` bytes below. Processes 4
// pixels per iteration and stores 2 U and 2 V bytes. The bias constants
// fold in the +128 chroma offset (4194304 == 128<<15, 0x4000 == 128<<7).
// NOTE(review): unlike bgra_to_uv_row_c there is no explicit [16,240]
// clamp; only packuswb's [0,255] saturation applies — with these weights
// the results appear to stay inside [16,240], but confirm.
// NOTE(review): width is assumed to be a positive multiple of 4.
void bgra_to_uv_row_sse41(uint8_t *src,uint8_t *ubuf,uint8_t *vbuf,int stride,int width) {
#ifdef __PIX_15BITS__
__asm {
mov eax, 4194304
movd xmm4, eax
pshufd xmm5, xmm4, 0x00      // xmm5 = 4 x dword (128<<15)
movdqa xmm7, krgba2u_index   // U word weights (Q15)
movdqa xmm6, krgba2v_index   // V word weights (Q15)
mov esi, src
mov edi, ubuf
mov edx, vbuf
mov eax, width
mov ecx, stride
pxor xmm4, xmm4              // zero, for widening
loop_convert:
movdqu xmm0, [esi]           // row 0: pixels 0..3
movdqu xmm1, [esi+ecx]       // row 1 (next image row)
movhlps xmm2, xmm0
movhlps xmm3, xmm1
punpcklbw xmm0, xmm4         // widen to words
punpcklbw xmm1, xmm4
punpcklbw xmm2, xmm4
punpcklbw xmm3, xmm4
pmaddwd xmm0, xmm7           // U dot-product halves per pixel
pmaddwd xmm1, xmm7
pmaddwd xmm2, xmm7
pmaddwd xmm3, xmm7
phaddd xmm0, xmm1            // one U dword per pixel (rows 0 and 1)
phaddd xmm2, xmm3
paddd xmm0, xmm5             // + (128<<15)
paddd xmm2, xmm5
psrad xmm0, 15
psrad xmm2, 15
packssdw xmm0, xmm2          // 8 words: both rows, 4 pixels
phaddw xmm0, xmm4            // sum horizontal pairs...
phaddw xmm0, xmm4            // ...twice: each word = sum of its 2x2 block
psraw xmm0, 2                // /4 average
packuswb xmm0, xmm4
movd ebx, xmm0
mov [edi], bx                // store 2 U bytes
//v
movdqu xmm0, [esi]           // reload and repeat with V weights
movdqu xmm1, [esi+ecx]
movhlps xmm2, xmm0
movhlps xmm3, xmm1
punpcklbw xmm0, xmm4
punpcklbw xmm1, xmm4
punpcklbw xmm2, xmm4
punpcklbw xmm3, xmm4
pmaddwd xmm0, xmm6
pmaddwd xmm1, xmm6
pmaddwd xmm2, xmm6
pmaddwd xmm3, xmm6
phaddd xmm0, xmm1
phaddd xmm2, xmm3
paddd xmm0, xmm5
paddd xmm2, xmm5
psrad xmm0, 15
psrad xmm2, 15
packssdw xmm0, xmm2
phaddw xmm0, xmm4
phaddw xmm0, xmm4
psraw xmm0, 2
packuswb xmm0, xmm4
movd ebx, xmm0
mov [edx], bx                // store 2 V bytes
add esi, 16
add edi, 2
add edx, 2
sub eax, 4
jg loop_convert
emms                         // NOTE(review): no MMX used; presumably defensive
}
#else
__asm {
mov eax, 0x40004000
movd xmm4, eax
pshufd xmm5, xmm4, 0x00      // xmm5 = 8 x word (128<<7)
movdqa xmm7, kua_rgba_index  // U byte weights (Q7)
movdqa xmm6, kva_rgba_index  // V byte weights (Q7)
mov esi, src
mov edi, ubuf
mov edx, vbuf
mov eax, width
mov ecx, stride
pxor xmm4, xmm4
loop_convert:
movdqu xmm0, [esi]           // row 0: pixels 0..3
movdqu xmm1, [esi+ecx]       // row 1
movdqa xmm2, xmm0
movdqa xmm3, xmm1
pmaddubsw xmm0, xmm7         // row 0 U halves
pmaddubsw xmm1, xmm7         // row 1 U halves
pmaddubsw xmm2, xmm6         // row 0 V halves
pmaddubsw xmm3, xmm6         // row 1 V halves
phaddsw xmm0, xmm2           // low 4 words: U(row0), high 4: V(row0)
phaddsw xmm1, xmm3           // same for row 1
paddsw xmm0, xmm5            // + (128<<7)
paddsw xmm1, xmm5
psraw xmm0, 7
psraw xmm1, 7
paddsw xmm0, xmm1            // sum the two rows
phaddsw xmm0, xmm4           // pair sums: u0+u1,u2+u3,v0+v1,v2+v3
psraw xmm0, 2                // /4 average of each 2x2 block
packuswb xmm0, xmm4          // bytes: U01,U23,V01,V23,...
movd ebx, xmm0
mov [edi],bx                 // low 16 bits: 2 U bytes
shr ebx, 16
mov [edx],bx                 // next 16 bits: 2 V bytes
add esi, 16
add edi, 2
add edx, 2
sub eax, 4
jg loop_convert
emms                         // NOTE(review): no MMX used; presumably defensive
}
#endif //
}
// Function-pointer types for the per-row conversion kernels, used by the
// frame-level converters below to dispatch on g_cpu_caps at runtime.
typedef void (*fn_yuv_to_bgr_row)(uint8_t *ybuf,uint8_t *ubuf,uint8_t *vbuf,uint8_t *bgr,int width);
typedef void (*fn_bgra_to_y_row)(uint8_t *src,uint8_t *ybuf,int width);
typedef void (*fn_bgra_to_uv_row)(uint8_t *src,uint8_t *ubuf,uint8_t *vbuf,int stride,int width);
void yv12_to_bgra(uint8_t *ybuf,int ystride,
uint8_t *ubuf,int ustride,
uint8_t *vbuf,int vstride,
uint8_t *bgra, int stride,
int width, int height)
{
int iheight = abs(height);
int istride = stride;
if( height < 0 ) {
istride = -stride;
bgra = bgra + (iheight-1)*stride;
}
fn_yuv_to_bgr_row func = yuv_to_bgra_row_c;
if( g_cpu_caps & kCpuHasSSE2 ) func = yuv_to_bgra_row_sse2;
if( g_cpu_caps & kCpuHasSSE41 ) func = yuv_to_bgra_row_sse41;
for(int y=0; y<iheight-1; y+=2) {
func(ybuf,ubuf,vbuf,bgra,width);
func(ybuf+ystride,ubuf,vbuf,bgra+istride,width);
bgra += (2*istride);
ybuf += 2*ystride;
ubuf += ustride;
vbuf += vstride;
}
if( iheight & 1 ) {
func(ybuf,ubuf,vbuf,bgra,width);
}
}
void yv12_to_bgr(uint8_t *ybuf,int ystride,
uint8_t *ubuf,int ustride,
uint8_t *vbuf,int vstride,
uint8_t *bgr, int stride,
int width, int height)
{
int iheight = abs(height);
int istride = stride;
if( height < 0 ) {
istride = -stride;
bgr = bgr + (iheight-1)*stride;
}
fn_yuv_to_bgr_row func = yuv_to_bgr_row_c;
if( g_cpu_caps & kCpuHasSSSE3 ) func = yuv_to_bgr_row_ssse3;
if( g_cpu_caps & kCpuHasSSE41 ) func = yuv_to_bgr_row_sse41;
for(int y=0; y<iheight-1; y+=2) {
func(ybuf,ubuf,vbuf,bgr,width);
func(ybuf+ystride,ubuf,vbuf,bgr+istride,width);
bgr += (2*istride);
ybuf += 2*ystride;
ubuf += ustride;
vbuf += vstride;
}
if( iheight & 1 ) {
func(ybuf,ubuf,vbuf,bgr,width);
}
}
// Packed BGRA -> planar YV12. Negative `height` reads the source bottom-up
// (vertical flip). Rows are processed in pairs, producing one chroma row
// per two luma rows; the kernels are dispatched once from g_cpu_caps.
// FIX: the odd-last-row chroma call passed width/2, but func_uv's last
// parameter is a pixel count (the paired-row call above passes `width`),
// so only the left half of the final chroma row was ever written. Pass
// `width`; stride 0 makes func_uv average the last row with itself.
void bgra_to_yv12(uint8_t *src, int stride,
                  uint8_t *ybuf, int ystride,
                  uint8_t *ubuf, int ustride,
                  uint8_t *vbuf, int vstride,
                  int width, int height)
{
int iheight = abs(height);
int istride = height<0?-stride:stride;
if( height < 0 ) src = src + (iheight-1)*stride;   // start at bottom row
fn_bgra_to_y_row func_y = bgra_to_y_row_c;
fn_bgra_to_uv_row func_uv = bgra_to_uv_row_c;
if( g_cpu_caps & kCpuHasSSE41 ) {
    func_y = bgra_to_y_row_sse41;
    func_uv = bgra_to_uv_row_sse41;
}
for(int y=0; y<iheight-1; y+=2) {
    func_y(src,ybuf,width);
    func_y(src+istride,ybuf+ystride,width);
    func_uv(src,ubuf,vbuf,istride,width);
    src += (istride*2);
    ybuf += (ystride*2);
    ubuf += ustride;
    vbuf += vstride;
}
if( iheight & 1 ) {
    func_uv(src,ubuf,vbuf,0,width);   // was width/2: converted only half the row
    func_y(src,ybuf,width);
}
}
// One-time initialization: populate the global capability bitmask that the
// frame converters use for runtime kernel dispatch.
void init_cpu_convert() {
g_cpu_caps = cpu_detect();
}