/********************************************************

Miscellaneous MMX-accelerated routines
Copyright 2000 Eugene Kuznetsov (divx@euro.ru)

*********************************************************/

#include "default.h"
#include "mmx.h"
#include "cpuinfo.h"
//#include <iostream>  //cout error message
//using namespace std;

#if !defined WIN32 && defined i386
/*
 * Scale a 16-bit-per-pixel image from src_w x src_h to dst_w x dst_h by
 * picking/repeating source pixels (no filtering), using i386 inline asm.
 *
 *   dest, src     destination / source pixel buffers
 *   dst_w, dst_h  destination size in pixels
 *   src_w, src_h  source size in pixels
 *   xdim          if non-zero, (xdim - dst_w) is added to dest after every
 *                 row; presumably the destination pitch in pixels -- confirm
 *                 against the caller
 *
 * Two code paths:
 *   - x_maj>0 || y_maj>0 : per-row asm loop with the vertical stepping done
 *     in C (this is the case when src_w/dst_w or src_h/dst_h is at least 1).
 *   - otherwise          : a single asm block that repeats source pixels
 *     horizontally and duplicates finished rows vertically (marked untested
 *     by the #warning below).
 *
 * NOTE(review): dst_w == 0 or dst_h == 0 divides by zero in the setup below.
 * NOTE(review): both asm blocks use fixed label names (m0, for_i, ...), change
 * %esp while "m" operands are still referenced, and do not declare "memory"
 * or register clobbers -- verify against the compiler flags in use before
 * relying on this code.
 */
static void zoom_16_bpp(uint16_t* dest, const uint16_t* src, int dst_w, int dst_h, int src_w, int src_h, int xdim)
{
    // Fractional (remainder) part of the horizontal / vertical source step.
    int x_min = (src_w%dst_w);
    int y_min = src_h%dst_h;
    int x_accum = 0;
    int y_accum = 0;
    // src2 tracks the start of the current source row; src walks inside it.
    const uint16_t *src2=src;
    int dest_delta = (xdim!=0 ? xdim-dst_w : 0);

    // Whole part of the horizontal step, pre-multiplied by 2 because the asm
    // adds it to a byte address.
    int x_maj = src_w/dst_w*2;
    // Whole part of the vertical step, expressed in source pixels.
    int y_maj = (src_h/dst_h)*src_w;
    if ((x_maj>0) || (y_maj>0))
    {
	for(int i=0; i<dst_h; i++)
	{

	    // j is the column counter; the asm decrements it in memory (%8).
	    int j = dst_w;
	    // for(int j=0; j<dst_w; j++)
	    // Operands: %0/%3 = src (eax), %1/%4 = dest (ecx),
	    // %2/%5 = x_accum (edx), %6 = x_min, %7 = dst_w, %8 = j, %9 = x_maj.
	    // ebx is used as scratch and saved/restored by hand.
	    __asm__ __volatile__
		("pushl %%ebx            \n\t"
		 "m0:                    \n\t"
		 //   *(dest+j)=*src;
		 "movw   (%%eax),%%bx    \n\t"
		 "movw   %%bx, (%%ecx)   \n\t"
		 //   dest++;
		 "addl   $2, %%ecx       \n\t"
		 //   src+=x_maj;   (x_maj is already in bytes)
		 "addl   %9, %%eax       \n\t"
		 //   x_accum-=x_min;
		 "subl   %6, %%edx       \n\t"
		 //   if the subtraction borrowed {
		 "jnc    m1              \n\t"
		 //      x_accum+=dst_w;
		 "addl   %7, %%edx       \n\t"
		 //         src++;
		 "addl   $2,%%eax        \n\t"
		 "m1:                            \n\t"
		 //   } ; j--; loop while j != 0
		 "decl   %8              \n\t"
		 "jnz    m0              \n\t"
		 "popl   %%ebx           \n\t"
		 : "=a"(src), "=c"(dest), "=d"(x_accum)           // output
		 : "a"(src),"c"(dest), "d"(x_accum), "m"(x_min), "m"(dst_w), "m"(j), "m"(x_maj)
		);

	    // Skip the rest of the destination row (in pixels).
	    dest+=dest_delta;
	    // Advance the source row start by the whole step, plus one extra
	    // row whenever the remainder accumulator overflows dst_h.
	    src2+=y_maj;
	    y_accum+=y_min;

	    if(y_accum>=dst_h)
	    {
		y_accum-=dst_h;
		src2+=src_w;
	    }

	    src=src2;
	}
    }
    else
	//     cout << "FIXME error - Tried to use ASM code which is not shlib friendly" << endl;

	// has to be fixed - causes reference .rel.text
	// objdump --headers --private-headers -T libaviplay.so | less
	// if this output doesn't contain .rel.text then it's OK!
#if 1
#warning untested
    {
	// NOTE(review): dest2 and i are not referenced by the asm below; the
	// asm keeps its scratch state in 16 bytes reserved on the stack instead
	// (see the commented-out .data variables).
	int dest2=0;
	int i=0;
	// Operands: %0 = dest (edi), %1 = src (esi), %2 = dst_w, %3 = dst_h,
	// %4 = src_w, %5 = src_h, %6 = x_min (eax), %7 = y_min, %8 = dest_delta.
	// Stack scratch after "subl $16": (%esp) = remaining source rows,
	// 4(%esp) = dest at the start of the current row,
	// 8(%esp) = vertical accumulator, 12(%esp) = horizontal accumulator.
	__asm__ __volatile__
	    (
//	     ".data                          \n\t"
//	     "x_accum: .long  0              \n\t"
//	     "y_accum: .long  0              \n\t"
//	     "dest2:   .long  0              \n\t"
//	     "i:       .long  0              \n\t"
//	     ".text                          \n\t"
	     "pushal                         \n\t"
	     "subl $16, %%esp		     \n\t"	
	     "movl  %5, %%ebx                \n\t"   // 8(%%esp) = src_h (operand %5; the old comment said dest_h)
	     "movl  %%ebx, 8(%%esp)          \n\t"
	     "movl  %4, %%ebx                \n\t"   // 12(%%esp) = src_w (operand %4; the old comment said dest_w)
	     "movl  %%ebx, 12(%%esp)         \n\t"
	     "movl  %5, %%ebx                \n\t"   // i = src_h

	     "movl  %%ebx, (%%esp)           \n\t"

	     "for_i:                         \n\t"
	     "movl  %0, 4(%%esp)             \n\t"   // 4(%%esp) = dest

	     "movl  12(%%esp), %%edx         \n\t"   // edx = horizontal accumulator

	     "movl  %4, %%ecx                \n\t"   // j = src_w
	     "for_j:                         \n\t"

	     "movw  (%1), %%bx               \n\t"   // bx = *src

	     "x_dest_copy_again:             \n\t"
	     "movw   %%bx, (%0)              \n\t"   // *dest = *src
	     "addl   $2, %0                  \n\t"   // dest++;
	     "subl   %6, %%edx               \n\t"   // accumulator (edx) -= x_min
	     "jnc    x_dest_copy_again       \n\t"   // repeat the pixel until the subtraction borrows

	     "no_xdestcopy:                  \n\t"
	     "addl   %2, %%edx               \n\t"   // accumulator (edx) += dst_w
	     "addl   $2, %1                  \n\t"   // src++;
	     "decl   %%ecx                   \n\t"   // j--;
	     "jnz    for_j                   \n\t"   // while (j>0) Next

	     "y:                             \n\t"
	     "movl   %%edx, 12(%%esp)        \n\t"   // save horizontal accumulator

	     // NOTE(review): this adds dest_delta to a byte address, whereas
	     // the C path above adds it to a uint16_t pointer -- confirm units.
	     "addl   %8, %0                  \n\t"   // dest += dest_delta

	     "movl   8(%%esp), %%edx         \n\t"   // edx = vertical accumulator
	     "push   %1                      \n\t"   // save src; stack offsets below are shifted by 4 until the pop

	     "y_dest_copy_again:             \n\t"
	     "subl   %7, %%edx               \n\t"   // vertical accumulator (edx) -= y_min
	     "jc     y_no_cpy_again          \n\t"   // stop duplicating once the subtraction borrows
	     "movl   8(%%esp),%%esi          \n\t"   // after the push this slot is the saved row-start dest
	     "movl   %2, %%ecx               \n\t"   // ecx = dst_w pixels ...
	     "shrl   $1,%%ecx                \n\t"   // ... = dst_w/2 dwords
	     "cld                            \n\t"
	     "rep; movsl                     \n\t"   // duplicate the finished row into the next one
	     "addl   %8, %0                  \n\t"   // dest += dest_delta

	     "jmp    y_dest_copy_again       \n\t"

	     "y_no_cpy_again:                \n\t"
	     "addl   %3, %%edx               \n\t"   // vertical accumulator (edx) += dst_h
	     "movl   %%edx, 12(%%esp)         \n\t"   // after the push this slot is the vertical accumulator

	     "pop    %1                      \n\t"   // get src

	     "decl   (%%esp)                 \n\t"   // one source row done
	     "jnz    for_i                   \n\t"
	     "addl   $16, %%esp		     \n\t"

	     "popal                          \n\t"
	     :
	     : "D"(dest), "S"(src), "m"(dst_w), "m"(dst_h), "m"(src_w), "m"(src_h), "a"(x_min), "m"(y_min), "m"(dest_delta)
	    );
    }
#else
    {
	// slow dumb implementation which doesn't work
	uint16_t* pdest = dest;
	for (int i = 0; i < src_w && i < dst_h; i++)
	{
	    for (int j = 0; j < src_h; j++)
		pdest[j] = *src++;
	    pdest += dst_w;
	}
    }
#endif
}


/*
 * Scale a packed 24-bit-per-pixel image from src_w x src_h to dst_w x dst_h
 * by picking/repeating source pixels (no filtering).
 *
 *   dest, src     destination / source buffers; only addressed as bytes here
 *   dst_w, dst_h  destination size in pixels; nothing is done if either
 *                 is <= 0
 *   src_w, src_h  source size in pixels
 *   xdim          if non-zero, 3*(xdim - dst_w) bytes are skipped after every
 *                 destination row
 *
 * Exactly three bytes are moved per pixel.  Copying a whole int per pixel
 * would write one byte past the last destination pixel (and into the row
 * padding) and read one byte past the last source pixel.
 */
static void zoom_24_bpp(int* dest, const int* src, int dst_w, int dst_h, int src_w, int src_h, int xdim)
{
    // Nothing to draw; this also keeps the divisions below well defined.
    if (dst_w <= 0 || dst_h <= 0)
	return;

    // Whole and remainder parts of the source step per destination pixel/row.
    const int x_maj = src_w/dst_w;
    const int x_min = src_w%dst_w;
    const int y_maj = (src_h/dst_h)*src_w;
    const int y_min = src_h%dst_h;
    const int dest_delta = (xdim!=0 ? 3*(xdim-dst_w) : 0);
    int x_accum = 0;
    int y_accum = 0;

    unsigned char* out = (unsigned char*)dest;
    // Start of the current source row.
    const unsigned char* row = (const unsigned char*)src;

    for (int i=0; i<dst_h; i++)
    {
	const unsigned char* in = row;

	for (int j=0; j<dst_w; j++)
	{
	    out[0]=in[0];
	    out[1]=in[1];
	    out[2]=in[2];
	    out+=3;

	    in+=x_maj*3;
	    x_accum+=x_min;
	    if (x_accum>=dst_w)
	    {
		// Remainder overflowed: take one extra source pixel.
		x_accum-=dst_w;
		in+=3;
	    }
	}

	out+=dest_delta;

	row+=y_maj*3;
	y_accum+=y_min;
	if (y_accum>=dst_h)
	{
	    // Remainder overflowed: take one extra source row.
	    y_accum-=dst_h;
	    row+=3*src_w;
	}
    }
}

/*
 * Scale a 32-bit-per-pixel image from src_w x src_h to dst_w x dst_h by
 * picking/repeating source pixels (no filtering).
 *
 *   dest, src     destination / source pixel buffers (one int per pixel)
 *   dst_w, dst_h  destination size in pixels; nothing is done if either
 *                 is <= 0
 *   src_w, src_h  source size in pixels
 *   xdim          if non-zero, (xdim - dst_w) pixels are skipped after every
 *                 destination row
 */
static void zoom_32_bpp(int* dest, const int* src, int dst_w, int dst_h, int src_w, int src_h, int xdim)
{
    // Nothing to draw; this also keeps the divisions below well defined.
    if (dst_w <= 0 || dst_h <= 0)
	return;

    // Whole and remainder parts of the source step per destination pixel/row.
    const int x_maj = src_w/dst_w;
    const int x_min = src_w%dst_w;
    const int y_maj = (src_h/dst_h)*src_w;
    const int y_min = src_h%dst_h;
    const int dest_delta = (xdim!=0 ? xdim-dst_w : 0);
    int x_accum = 0;
    int y_accum = 0;

    // Start of the current source row.
    const int* row = src;

    for (int i=0; i<dst_h; i++)
    {
	const int* in = row;

	for (int j=0; j<dst_w; j++)
	{
	    *dest++=*in;

	    in+=x_maj;
	    x_accum+=x_min;
	    if (x_accum>=dst_w)
	    {
		// Remainder overflowed: take one extra source pixel.
		x_accum-=dst_w;
		in++;
	    }
	}

	dest+=dest_delta;

	row+=y_maj;
	y_accum+=y_min;
	if (y_accum>=dst_h)
	{
	    // Remainder overflowed: take one extra source row.
	    y_accum-=dst_h;
	    row+=src_w;
	}
    }
}

// Public entry point: forwards to the scaler matching the pixel depth.
// 15 and 16 bpp share the 16-bit scaler; 24 and 32 bpp reinterpret the
// buffers as int*.  Any other depth is silently ignored.
void zoom(uint16_t* dest, const uint16_t* src, int dst_w, int dst_h, int src_w, int src_h, int bpp, int xdim)
{
    if (bpp == 15 || bpp == 16)
    {
	zoom_16_bpp(dest, src, dst_w, dst_h, src_w, src_h, xdim);
    }
    else if (bpp == 24)
    {
	zoom_24_bpp((int*)dest, (const int*)src, dst_w, dst_h, src_w, src_h, xdim);
    }
    else if (bpp == 32)
    {
	zoom_32_bpp((int*)dest, (const int*)src, dst_w, dst_h, src_w, src_h, xdim);
    }
}

#endif // WIN32

// Plain C++ conversion of w x |h| pixels from 555 to 565 layout: every bit
// selected by 0xFFE0 is added to itself, which moves the upper fields up by
// one bit and leaves the low five bits alone.
// A negative h means the source rows are read last-to-first, so the output
// is vertically flipped; dest is always written top-down.
static void v555to565_nommx(uint16_t* dest, const uint16_t* src, int w, int h)
{
    const bool bottom_up = (h < 0);
    const int rows = bottom_up ? -h : h;

    // First source row to read: the last one when flipping.
    const uint16_t* line = bottom_up ? src + w*(rows-1) : src;

    for (int y = 0; y < rows; ++y)
    {
	for (int x = 0; x < w; ++x)
	{
	    const uint16_t px = line[x];
	    dest[x] = (uint16_t)(px + (px & 0xFFE0));
	}
	dest += w;

	if (bottom_up)
	    line -= w;
	else
	    line += w;
    }
}

// Plain C++ 2:1 downscale of a w x h 16-bit image: keeps the first pixel of
// every horizontal pair on every other row and writes (w/2) x (h/2) pixels
// contiguously to dest.
// The output size and the source row stride (2*w pixels per output row) are
// computed from w and h directly, so odd dimensions neither write extra
// pixels nor shift the following rows.
static void zoom_2_16_nommx(uint16_t* dest, const uint16_t* src, int w, int h)
{
    const int half_w = w/2;
    const int half_h = h/2;

    for (int i=0; i<half_h; i++)
    {
	const uint16_t* in = src;
	for (int j=0; j<half_w; j++)
	{
	    *dest++=*in;
	    in+=2;
	}
	// Skip the sampled row and the dropped row below it.
	src+=2*w;
    }
}

// Plain C++ 2:1 downscale of a w x h 16-bit image combined with the 555 ->
// 565 conversion (bits selected by 0xFFE0 are added to themselves).  Keeps
// the first pixel of every horizontal pair on every other row and writes
// (w/2) x (h/2) pixels contiguously to dest.
// The source advances exactly 2*w pixels per output row, so an odd w does
// not shift the following rows.
static void zoom_2_16_to565_nommx(uint16_t *dest, const uint16_t *src, int w, int h)
{
    const int half_w = w/2;
    const int half_h = h/2;

    for (int i=0; i<half_h; i++)
    {
	const uint16_t* in = src;
	for (int j=0; j<half_w; j++)
	{
	    uint16_t q=*in;
	    q+=(q&0xFFE0);
	    *dest++=q;
	    in+=2;
	}
	// Skip the sampled row and the dropped row below it.
	src+=2*w;
    }
}
#if !defined WIN32 && defined i386
/*********************************************

WARNING

All MMX code assumes that dest scanline sizes
are a multiple of 8 bytes.

*********************************************/
// MMX version of the 555 -> 565 conversion for w x |h| pixels.
// A negative h means the source rows are read last-to-first (vertical flip);
// dest is always written top-down.
// Each pass of the inner asm loop converts 8 bytes (four pixels) and the loop
// is bottom-tested, so a row is processed in whole groups of four pixels and
// at least one group is always handled.
// NOTE(review): the asm advances %eax although src is declared as an input
// only, pushes onto the stack while "m" operands are still referenced, and
// declares no "memory" clobber -- confirm this is safe with the compiler
// flags in use.
static void v555to565_mmx(uint16_t* dest, const uint16_t* src, int w, int h)
{
    bool flip=(h<0);
    if(flip)
    {
	h=-h;
	// Start reading at the last source row.
	src+=w*(h-1);
    }
    // Mask replicated over four 16-bit lanes; the bits it selects are added to
    // the pixel, i.e. doubled.
    int64_t line=0xFFe0FFe0FFe0FFe0LL;
    // Save area for the x87 state; MMX reuses the FPU registers, so the state
    // is saved here and restored by frstor at the end.
    char qw[200];
    __asm__ __volatile__
	(
	 "fsave (%0)\n\t"
	 "emms\n\t"
	 :
	 :"r"(&qw));
    //    dest+=w*(h-1);
    for(int i=0; i<h; i++)
    {
	// Operands: %0 = src (eax), %1 = dest (ecx), %2 = 2*w (row size in
	// bytes), %3 = line (mask).
	__asm__ __volatile__
	    (
	     "pushl 	%%ecx		\n\t"
	     "pushl 	%%edx		\n\t"

	     "movq       %3,     %%mm2   \n\t"   // mm2 = mask
	     "movl       %2,     %%edx   \n\t"
	     "addl       %%eax,  %%edx   \n\t"   // edx = end of the source row
	     //loop:
	     "pos0:\n\t"

	     "movq       (%%eax),%%mm0   \n\t"   // load four pixels
	     "movq       %%mm0,  %%mm1   \n\t"
	     "pand       %%mm2,  %%mm1   \n\t"   // keep the bits selected by the mask
	     "paddw      %%mm1,  %%mm0   \n\t"   // pixel += (pixel & 0xFFE0)
	     "movq       %%mm0,  (%%ecx) \n\t"   // store four pixels
	     "addl       $8,     %%eax   \n\t"
	     "addl       $8,     %%ecx   \n\t"
	     "cmpl       %%eax,  %%edx   \n\t"
	     "ja 	pos0		\n\t"           // loop while end > current source address

	     "popl 	%%edx		\n\t"
	     "popl 	%%ecx		\n\t"
	     :
	     :"a"(src), "c"(dest), "m"(2*w), "m"(line)
	    );
	// The asm does not update the C variables, so step both rows here.
	if(flip)
	    src-=w;
	else
	    src+=w;
	dest+=w;
    }
    __asm__ __volatile__ ("frstor (%0)\n\t": :"r"(&qw));
}
// MMX 2:1 downscale of a w x h 16-bit image combined with the 555 -> 565
// conversion.  For each of the h/2 output rows one source row is read and the
// next one skipped; within a row every second pixel is dropped.
// Each pass of the inner asm loop consumes 16 source bytes (eight pixels) and
// stores 8 bytes (four pixels); the loop is bottom-tested.
// NOTE(review): packssdw packs with signed saturation, so a kept pixel with
// bit 15 set would be clamped to 0x7FFF -- presumably 555 input never sets
// that bit; confirm.
// NOTE(review): the asm advances %eax although src is declared as an input
// only, pushes onto the stack while "m" operands are still referenced, and
// declares no "memory" clobber.
static void zoom_2_16_to565_mmx(uint16_t *dest, const uint16_t *src, int w, int h)
{
    // Mask selecting the bits that get doubled in the 555 -> 565 step.
    int64_t line=0xFFe0FFe0FFe0FFe0LL;
    // Mask keeping the low 16-bit word of every 32-bit pair.
    int64_t line2=0x0000FFFF0000FFFFLL;
    // Save area for the x87 state, restored by frstor at the end.
    char qw[200];
    __asm__ __volatile__
	(
	 "fsave (%0)\n\t"
	 "emms\n\t"
	 :
	 :"r"(&qw)
	);//who knows what this crazy compiler decides to store in FPU stack?

    //    dest+=w/2*(h/2-1);
    for(int i=0; i<h/2; i++)
    {
	// Operands: %0 = src (eax), %1 = dest (ecx), %2 = line, %3 = line2,
	// %4 = 2*w (source row size in bytes).
	__asm__ __volatile__
	    (
	     "pushl 	%%ecx		\n\t"
	     "pushl 	%%edx		\n\t"

	     "movq 	%2, 	%%mm2	\n\t"
	     "movq 	%3, 	%%mm3	\n\t"
	     "movl   %4,     %%edx   \n\t"
	     "addl   %%eax,  %%edx   \n\t"   // edx = end of the source row

	     "pos1:			\n\t"
	     //Load 8 subsequent pixels into mm0 and mm1.
	     //Drop each second one by pand.
	     "movq   (%%eax),%%mm0   \n\t"
	     "addl 	$8,     %%eax   \n\t"
	     "pand 	%%mm3, 	%%mm0	\n\t"
	     "movq   (%%eax),%%mm1   \n\t"
	     "addl   $8,     %%eax	\n\t"
	     "pand 	%%mm3, 	%%mm1	\n\t"
	     //Pack 4 remaining pixels into mm0.
	     "packssdw %%mm1,%%mm0   \n\t"
	     //Convert 555 -> 565.
	     "movq 	%%mm0,  %%mm1   \n\t"
	     "pand   %%mm2,  %%mm1   \n\t"
	     "paddw  %%mm1, 	%%mm0 	\n\t"
	     //Store the result.
	     "movq   %%mm0, (%%ecx) 	\n\t"

	     "addl 	$8,	%%ecx	\n\t"
	     "cmpl   %%eax,  %%edx   \n\t"
	     "ja 	pos1		\n\t"         // loop while end > current source address

	     "popl 	%%edx		\n\t"
	     "popl 	%%ecx		\n\t"
	     :
	     :"a"(src), "c"(dest), "m"(line), "m"(line2), "m"(2*w)
	    );
	// Two source rows per output row; output rows are w/2 pixels wide.
	src+=2*w;
	dest+=w/2;
    }
    __asm__ __volatile__ ("frstor (%0)\n\t": :"r"(&qw));
}

// MMX 2:1 downscale of a w x h 16-bit image without format conversion.  For
// each of the h/2 output rows one source row is read and the next one
// skipped; within a row every second pixel is dropped.
// Each pass of the inner asm loop consumes 16 source bytes (eight pixels) and
// stores 8 bytes (four pixels); the loop is bottom-tested.
// NOTE(review): packssdw packs with signed saturation, so a kept pixel with
// bit 15 set is clamped to 0x7FFF -- check whether callers pass pixel formats
// that use bit 15.
// NOTE(review): the asm advances %eax although src is declared as an input
// only, pushes onto the stack while "m" operands are still referenced, and
// declares no "memory" clobber.
static void zoom_2_16_mmx(uint16_t *dest, const uint16_t *src, int w, int h)
{
    // Mask keeping the low 16-bit word of every 32-bit pair.
    int64_t line2=0x0000FFFF0000FFFFLL;
    // Save area for the x87 state, restored by frstor at the end.
    char qw[200];
    __asm__ __volatile__
	(
	 "fsave (%0)\n\t"
	 "emms\n\t"
	 :
	 :"r"(&qw)
	);

    for(int i=0; i<h/2; i++)
    {
	// Operands: %0 = src (eax), %1 = dest (ecx), %2 = line2,
	// %3 = 2*w (source row size in bytes).
	__asm__ __volatile__
	    (
	     "pushl 	%%ecx		\n\t"
	     "pushl 	%%edx		\n\t"

	     "movq 	%2, 	%%mm3	\n\t"
	     "movl   %3,     %%edx   \n\t"
	     "addl   %%eax,  %%edx   \n\t"   // edx = end of the source row

	     "pos2:			\n\t"
	     //Load 8 subsequent pixels into mm0 and mm1.
	     //Drop each second one by pand.
	     "movq   (%%eax),%%mm0   \n\t"
	     "addl 	$8,     %%eax   \n\t"
	     "pand 	%%mm3, 	%%mm0	\n\t"
	     "movq   (%%eax),%%mm1   \n\t"
	     "addl   $8,     %%eax	\n\t"
	     "pand 	%%mm3, 	%%mm1	\n\t"
	     //Pack 4 remaining pixels into mm0.
	     "packssdw %%mm1,%%mm0   \n\t"
	     //Store the result.
	     "movq   %%mm0, (%%ecx) 	\n\t"

	     "addl 	$8,	%%ecx	\n\t"
	     "cmpl   %%eax,  %%edx   \n\t"
	     "ja 	pos2		\n\t"         // loop while end > current source address

	     "popl 	%%edx		\n\t"
	     "popl 	%%ecx		\n\t"
	     :
	     :"a"(src), "c"(dest),  "m"(line2), "m"(2*w)
	    );
	// Two source rows per output row; output rows are w/2 pixels wide.
	src+=2*w;
	dest+=w/2;
    }
    __asm__ __volatile__ ("frstor (%0)\n\t": :"r"(&qw));
}


// MMX 2:1 downscale of a w x h 32-bit image.  For each of the h/2 output rows
// one source row is read and the next one skipped.
// Each pass of the inner asm loop consumes 16 source bytes (four pixels) and
// stores 8 bytes (two pixels); the loop is bottom-tested.
// NOTE(review): punpckhdq keeps the HIGH doubleword of each quadword, i.e.
// the second pixel of every pair, whereas zoom_2_32_nommx keeps the first --
// confirm which is intended.
// NOTE(review): the asm advances %eax although src is declared as an input
// only, pushes onto the stack while the "m" operand is still referenced, and
// declares no "memory" clobber.
void zoom_2_32_mmx(uint32_t *dest, const uint32_t *src, int w, int h)
{
    // NOTE(review): line2 is not referenced by the asm below.
    int64_t line2=0x00000000FFFFFFFFLL;
    // Save area for the x87 state, restored by frstor at the end.
    char qw[200];
    __asm__ __volatile__
	(
	 "fsave (%0)\n\t"
	 "emms\n\t"
	 :
	 :"r"(&qw)
	);

    //    dest+=w/2*(h/2-1);
    for(int i=0; i<h/2; i++)
    {
	// Operands: %0 = src (eax), %1 = dest (ecx),
	// %2 = 4*w (source row size in bytes).
	__asm__ __volatile__
	    (
	     "pushl 	%%ecx		\n\t"
	     "pushl 	%%edx		\n\t"

	     "movl   %2,     %%edx   \n\t"
	     "addl   %%eax,  %%edx   \n\t"   // edx = end of the source row

	     "pos3:			\n\t"
	     //Load 4 subsequent pixels into mm0 and mm1.
	     "movq   (%%eax),%%mm0   \n\t"
	     "addl 	$8,     %%eax   \n\t"
	     "movq   (%%eax),%%mm1   \n\t"
	     "addl   $8,     %%eax	\n\t"
	     //Put 2 pixels into mm0.
	     //this should work, but I'm too lazy to check it.
	     "punpckhdq %%mm1,%%mm0   \n\t"
	     //Store the result.
	     "movq   %%mm0, (%%ecx) 	\n\t"

	     "addl 	$8,	%%ecx	\n\t"
	     "cmpl   %%eax,  %%edx   \n\t"
	     "ja 	pos3		\n\t"         // loop while end > current source address

	     "popl 	%%edx		\n\t"
	     "popl 	%%ecx		\n\t"
	     :
	     :"a"(src), "c"(dest), "m"(4*w)
	    );
	// Two source rows per output row; output rows are w/2 pixels wide.
	src+=2*w;
	dest+=w/2;
    }
    __asm__ __volatile__ ("frstor (%0)\n\t": :"r"(&qw));
}

// Plain C++ 2:1 downscale of a w x h 32-bit image: keeps the first pixel of
// every horizontal pair on every other row and writes (w/2) x (h/2) pixels
// contiguously to dest.
// The source advances exactly 2*w pixels per output row, so an odd w does
// not shift the following rows.
void zoom_2_32_nommx(uint32_t *dest, const uint32_t *src, int w, int h)
{
    const int half_w = w/2;
    const int half_h = h/2;

    for (int i=0; i<half_h; i++)
    {
	const uint32_t* in = src;
	for (int j=0; j<half_w; j++)
	{
	    *dest++=*in;
	    in+=2;
	}
	// Skip the sampled row and the dropped row below it.
	src+=2*w;
    }
}

// Initial target of the v555to565 pointer: rebinds the pointer to the MMX or
// plain implementation according to freq.HaveMMX(), then forwards this call
// through the rebound pointer.
static void v555to565_stub(uint16_t* dest, const uint16_t* src, int w, int h)
{
    v555to565 = freq.HaveMMX() ? v555to565_mmx : v555to565_nommx;
    v555to565(dest, src, w, h);
}

// Initial target of the zoom_2_16 pointer: rebinds the pointer to the MMX or
// plain implementation according to freq.HaveMMX(), then forwards this call
// through the rebound pointer.
static void zoom_2_16_stub(uint16_t* dest, const uint16_t* src, int w, int h)
{
    zoom_2_16 = freq.HaveMMX() ? zoom_2_16_mmx : zoom_2_16_nommx;
    zoom_2_16(dest, src, w, h);
}

// Initial target of the zoom_2_16_to565 pointer: rebinds the pointer to the
// MMX or plain implementation according to freq.HaveMMX(), then forwards this
// call through the rebound pointer.
static void zoom_2_16_to565_stub(uint16_t *dest, const uint16_t *src, int w, int h)
{
    zoom_2_16_to565 = freq.HaveMMX() ? zoom_2_16_to565_mmx : zoom_2_16_to565_nommx;
    zoom_2_16_to565(dest, src, w, h);
}

// Initial target of the zoom_2_32 pointer: rebinds the pointer to the MMX or
// plain implementation according to freq.HaveMMX(), then forwards this call
// through the rebound pointer.
static void zoom_2_32_stub(uint32_t *dest, const uint32_t *src, int w, int h)
{
    zoom_2_32 = freq.HaveMMX() ? zoom_2_32_mmx : zoom_2_32_nommx;
    zoom_2_32(dest, src, w, h);
}

// Public dispatch pointers.  Each one initially points at its *_stub, which on
// the first call replaces the pointer with the MMX or plain implementation
// (chosen by freq.HaveMMX()) and forwards the call.
void (*v555to565)(uint16_t*, const uint16_t*, int, int)=v555to565_stub;
void (*zoom_2_16)(uint16_t*, const uint16_t*, int, int)=zoom_2_16_stub;
void (*zoom_2_16_to565)(uint16_t *, const uint16_t *, int,int)=zoom_2_16_to565_stub;
void (*zoom_2_32)(uint32_t *, const uint32_t *, int, int)=zoom_2_32_stub;

#else

// Build without the i386 inline-asm path: bind v555to565 directly to the plain
// implementation.
// NOTE(review): zoom_2_16, zoom_2_16_to565 and zoom_2_32 are not defined in
// this branch of the visible source -- confirm nothing references them here.
void (*v555to565)(uint16_t*, const uint16_t*, int, int)=v555to565_nommx;
// Placeholder for builds without the i386 inline-asm path: accepts the same
// arguments as the real zoom() but does nothing, so dest is left untouched.
// The #warning below flags this at compile time.
void zoom(uint16_t* dest, const uint16_t* src, int dst_w, int dst_h, int src_w, int src_h, int bpp, int xdim)
{
#warning ZOOM has to be written
}
#endif
