/*====================================================================

filename:     trx_ppc_rec_fpu_ps_opcodes_x87.cpp
project:      GCemu
created:      2004-6-18
mail:		  duddie@walla.com

Copyright (c) 2005 Duddie & Tratax

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

====================================================================*/
/*
 *	Tratax PowerPC recompiler
 *
 *  x87 FPU Opcode implementations for floating point and paired single
 *
 *  2004-8-19 split off from main opcodes
 *
 * Note:
 *
 * Very important:
 * Only EAX, ECX, EDX can be freely used. any other register needs to be preserved
 * because the register cache is using EBX, ESI, EDI, EBP
 *
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "cpu/trx_ppc_cpu.h"
#include "config.h"
#include "trx_ppc_rec.h"
#include "asm_x86.h"
#include "trx_ppc_rec_fpu_ps_opcodes_x87.h"
#include "ppc_disasm.h"
#include "debug/tracers.h"

/*
 * Stub for the 32-bit GX FIFO write entry point.
 *
 * This implementation is not expected to be reached: it logs an error
 * and terminates the process. The argument is ignored.
 * Exits with a failure status so the abnormal termination is visible
 * to whatever launched the emulator.
 */
void gx_write_fifo32(uint32 data)
{
	(void)data; // stub: the value is never consumed
	syslog_error(CPU, "CPU called gx_write_fifo32()... thats bad :)\n");
	exit(EXIT_FAILURE);
}

/*
 * Stub for the 16-bit GX FIFO write entry point.
 *
 * This implementation is not expected to be reached: it logs an error
 * and terminates the process. The argument is ignored.
 * Exits with a failure status so the abnormal termination is visible
 * to whatever launched the emulator.
 */
void gx_write_fifo16(uint16 data)
{
	(void)data; // stub: the value is never consumed
	syslog_error(CPU, "CPU called gx_write_fifo16()... thats bad :)\n");
	exit(EXIT_FAILURE);
}

/*
 * Stub for the 8-bit GX FIFO write entry point.
 *
 * This implementation is not expected to be reached: it logs an error
 * and terminates the process. The argument is ignored.
 * Exits with a failure status so the abnormal termination is visible
 * to whatever launched the emulator.
 */
void gx_write_fifo8(uint8 data)
{
	(void)data; // stub: the value is never consumed
	syslog_error(CPU, "CPU called gx_write_fifo8()... thats bad :)\n");
	exit(EXIT_FAILURE);
}



// MSVC warning 4311 (pointer truncation on cast) is silenced because this
// file casts object addresses to uint32 for use as x86 memory operands.
#pragma warning (disable:4311)

// Masks for the top four bits of a 32-bit word, named after the
// LT / GT / EQ / SO condition flags.
// NOTE(review): (1<<31) shifts into the sign bit of a signed int; if these
// macros are used from C code an unsigned literal (1u<<31) would be safer - confirm.
#define CR0_LT (1<<31)
#define CR0_GT (1<<30)
#define CR0_EQ (1<<29)
#define CR0_SO (1<<28)

// Constants defined in another translation unit.
// cr_gt / cr_lt / cr_eq / cr_so are referenced by address from the generated
// compare code (as CMOVcc memory operands in fcmpu / fcmpo).
extern uint32 zero, one;
extern uint32 cr_gt, cr_lt, cr_eq, cr_so;

// Nibble-clear masks: entry k has every bit set except the four bits
// 4k .. 4k+3. ANDing a 32-bit word with entry k wipes that 4-bit field
// while leaving the other seven fields untouched.
static uint32 trx_ppc_cmp_and_mask[8] = 
{
	~(0xfu <<  0),	// 0xfffffff0
	~(0xfu <<  4),	// 0xffffff0f
	~(0xfu <<  8),	// 0xfffff0ff
	~(0xfu << 12),	// 0xffff0fff
	~(0xfu << 16),	// 0xfff0ffff
	~(0xfu << 20),	// 0xff0fffff
	~(0xfu << 24),	// 0xf0ffffff
	~(0xfu << 28)	// 0x0fffffff
};

/*
 * Report an unrecognised Gekko opcode and terminate.
 *
 * Prints the 5-bit secondary opcode field of trxCPUrec.opcode, then a
 * line with the current PC, the raw opcode and its disassembly.
 * Does not return; exits with a failure status.
 */
static void trx_ppc_gekko_ill(void)
{
	char opStr[16], parmStr[32];
	uint32 target;

	// cast: the masked value is a uint32 but the format expects int
	printf("GEKKO UNKNOWN OPCODE: %d\n", (int)((trxCPUrec.opcode >> 1) & 0x1f));

	GekkoDisassemble(opStr, parmStr, trxCPUrec.opcode, trxCPUrec.pc, &target);
	// format straight into the output instead of staging the text in a
	// fixed-size buffer with sprintf; the printed text is unchanged
	printf("%.8X  %.8X  %-10s %s\n",
	       (unsigned)trxCPUrec.pc, (unsigned)trxCPUrec.opcode, opStr, parmStr);
	exit(EXIT_FAILURE);
}

void trx_ppc_gen_gekko(void)
{
	switch((trxCPUrec.opcode >> 1) & 0x1f)
	{
        case 0:
			switch((trxCPUrec.opcode >> 6) & 3)
			{
				case 0: trx_ppc_gen_ps_cmpu0(); break;
				case 1: trx_ppc_gen_ps_cmpo0(); break;
				case 2: trx_ppc_gen_ps_cmpu1(); break;
				case 3: trx_ppc_gen_ps_cmpo1(); break;
				default:
					printf("cmp unhandled!: %d\n",(trxCPUrec.opcode >> 6) & 3);
					exit(0);
					break;
			}
			break;
        case 6:  
			if(trxCPUrec.opcode & 0x40) trx_ppc_gen_psq_lux();
	        else trx_ppc_gen_psq_lx();   
	        break;
        case 7:
			if(trxCPUrec.opcode & 0x40) trx_ppc_gen_psq_stux();
            else trx_ppc_gen_psq_stx();
            break;
        case 8:
          switch((trxCPUrec.opcode >> 6) & 0x1f)
          {
            case 1:	trx_ppc_gen_ps_neg(); break;
	        case 2:	trx_ppc_gen_ps_mr(); break;
	        case 4:	trx_ppc_gen_ps_nabs(); break;
            case 8:	trx_ppc_gen_ps_abs(); break;
            default:trx_ppc_gekko_ill(); break;
          }
          break;
        case 10: trx_ppc_gen_ps_sum0();	break;
        case 11: trx_ppc_gen_ps_sum1();	break;
        case 12: trx_ppc_gen_ps_muls0();break;
        case 13: trx_ppc_gen_ps_muls1();break;
        case 14: trx_ppc_gen_ps_madds0();break;
        case 15: trx_ppc_gen_ps_madds1();break;
        case 16:
		  switch((trxCPUrec.opcode >> 6) & 0x1f)
          {
            case 16: trx_ppc_gen_ps_merge00(); break;
            case 17: trx_ppc_gen_ps_merge01(); break;
            case 18: trx_ppc_gen_ps_merge10(); break;
            case 19: trx_ppc_gen_ps_merge11(); break;
            default: trx_ppc_gekko_ill(); break;
          }
          break;
        case 18: trx_ppc_gen_ps_div(); break;
        case 20: trx_ppc_gen_ps_sub(); break;
        case 21: trx_ppc_gen_ps_add(); break;
		case 22: trx_ppc_gen_dcbz_l(); break;
        case 23: trx_ppc_gen_ps_sel(); break;
		case 24: trx_ppc_gen_ps_res(); break;
        case 25: trx_ppc_gen_ps_mul(); break;
        case 26: trx_ppc_gen_ps_rsqrte(); break;
        case 28: trx_ppc_gen_ps_msub(); break;
        case 29: trx_ppc_gen_ps_madd(); break;
        case 30: trx_ppc_gen_ps_nmsub(); break;
        case 31: trx_ppc_gen_ps_nmadd(); break;
        default: trx_ppc_gekko_ill();break;
      }
}

// main opcode 59
void trx_ppc_gen_group59()
{
	uint32 ext = ((trxCPUrec.opcode>>1)&0x3ff);

	switch (ext & 0x1f) 
	{
		case 18: trx_ppc_gen_fdivsx(); break;
		case 20: trx_ppc_gen_fsubsx(); break;
		case 21: trx_ppc_gen_faddsx(); break;
//		case 22: ppc_alt_gen_fsqrtsx(); return;
		case 24: trx_ppc_gen_fresx(); return;
		case 25: trx_ppc_gen_fmulsx(); break;
		case 28: trx_ppc_gen_fmsubsx(); return;
		case 29: trx_ppc_gen_fmaddsx(); break;
		case 30: trx_ppc_gen_fnmsubsx(); break;
		case 31: trx_ppc_gen_fnmaddsx(); break;
		default:
			printf("[trxCPUrec] unhandled op59: %d\n", ext & 0x1f);
			exit(0);
			break;
	}
}
// main opcode 63, floating point instructions
void trx_ppc_gen_group63()
{
	uint32 ext = ((trxCPUrec.opcode>>1)&0x3ff);
	if (ext & 16) 
	{
		switch (ext & 0x1f) 
		{
		case 18: trx_ppc_gen_fdivx(); return;
		case 20: trx_ppc_gen_fsubx(); return;
		case 21: trx_ppc_gen_faddx(); return;
		//case 22: ppc_alt_fsqrtx(); return;
		case 23: trx_ppc_gen_fselx(); return;
		case 25: trx_ppc_gen_fmulx(); return;
		case 26: trx_ppc_gen_frsqrtex(); return;
		case 28: trx_ppc_gen_fmsubx(); return;
		case 29: trx_ppc_gen_fmaddx(); return;
		case 30: trx_ppc_gen_fnmsubx(); return;
		case 31: trx_ppc_gen_fnmaddx(); return;
		}
		printf("[trxCPUrec] unhandled op63: %d\n", ext & 0x1f);
		exit(0);
	} else {
		switch (ext) 
		{
		case 0: trx_ppc_gen_fcmpu(); return;
		case 12: trx_ppc_gen_frspx(); return;
		case 14: trx_ppc_gen_fctiwx(); return;
		case 15: trx_ppc_gen_fctiwzx(); return;
		//--
		case 32: trx_ppc_gen_fcmpo(); return;
		case 38: trx_ppc_gen_mtfsb1x(); return;
		case 40: trx_ppc_gen_fnegx(); return;
		case 64: trx_ppc_gen_mcrfs(); return;
		case 70: trx_ppc_gen_mtfsb0x(); return;
		case 72: trx_ppc_gen_fmrx(); return;
		//case 134: ppc_opc_mtfsfix(); return;
		case 136: trx_ppc_gen_fnabsx(); return;
		case 264: trx_ppc_gen_fabsx(); return;
		case 583: trx_ppc_gen_mffsx(); return;
		case 711: trx_ppc_gen_mtfsfx(); return;
		}
		printf("[trxCPUrec] unhandled op63: %d\n", ext);
		exit(0);
	}
}


//==============================================================================
// Floating point control opcodes
//

// used
void trx_ppc_gen_mtfsb1x()
{
	int crb, n1, n2;
	
	crb = (trxCPUrec.opcode >> 21)& 0x1f;
	n1 = (trxCPUrec.opcode >> 16)& 0x1f;
	n2 = (trxCPUrec.opcode >> 11)& 0x1f;

	if (crb != 1 && crb != 2) 
	{
//		trxCPUrec.fpscr |= 1<<(31-crb);
		gen_asm(MOV_RM, EAX, (uint32)&trxCPUrec.fpscr);
		gen_asm(OR_RI32, EAX, (1<<(31-crb)));
		gen_asm(MOV_MR, (uint32)&trxCPUrec.fpscr, EAX);
	}
}

// used
void trx_ppc_gen_mtfsb0x()
{
	int crb, n1, n2;
	
	crb = (trxCPUrec.opcode >> 21)& 0x1f;
	n1 = (trxCPUrec.opcode >> 16)& 0x1f;
	n2 = (trxCPUrec.opcode >> 11)& 0x1f;

	if (crb != 1 && crb != 2) 
	{
		//trxCPUrec.fpscr &= ~(1<<(31-crbD));
		gen_asm(MOV_RM, EAX, (uint32)&trxCPUrec.fpscr);
		gen_asm(AND_RI32, EAX, ~(1<<(31-crb)));
		gen_asm(MOV_MR, (uint32)&trxCPUrec.fpscr, EAX);
	}
}

void trx_ppc_gen_mffsx()
{
	uint32 rD;

	rD = (trxCPUrec.opcode >> 21)& 0x1f;
	
	// trxCPUrec.fpr[rD] = trxCPUrec.fpscr;
	gen_asm(MOV_RM, EAX, (uint32)&trxCPUrec.fpscr);
	gen_asm(MOV_MR, (uint32)&trxCPUrec.fpr[rD], EAX);
	// kill low order bits
	gen_asm(XOR_RR, EAX, EAX);
	gen_asm(MOV_MR, (uint32)(&trxCPUrec.fpr[rD])+4, EAX);
}

void trx_ppc_gen_mtfsfx()
{
	uint32 rB,fm, FM;

	fm = ((trxCPUrec.opcode)>>17)&0xff;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	FM = ((fm&0x80)?0xf0000000:0)|((fm&0x40)?0x0f000000:0)|((fm&0x20)?0x00f00000:0)|((fm&0x10)?0x000f0000:0)|
	     ((fm&0x08)?0x0000f000:0)|((fm&0x04)?0x00000f00:0)|((fm&0x02)?0x000000f0:0)|((fm&0x01)?0x0000000f:0);

	//trxCPUrec.fpscr = (trxCPUrec.fpr[rB] & FM) | (trxCPUrec.fpscr & ~FM);
	gen_asm(MOV_RM, EAX, (uint32)&trxCPUrec.fpscr);
	gen_asm(AND_RI32, EAX, ~FM);
	gen_asm(MOV_RM, ECX, (uint32)&trxCPUrec.fpr[rB]);
	gen_asm(AND_RI32, ECX, FM);
	gen_asm(OR_RR, EAX, ECX);
	gen_asm(MOV_MR, (uint32)&trxCPUrec.fpscr, EAX);
}

// unknown,unverified FPSCR bits not calculated correctly anyway!
void trx_ppc_gen_mcrfs()
{
	uint32 crD, crS, src_mask, dst_mask;
	crD = (trxCPUrec.opcode >> 23)& 0x7;
	crS = (trxCPUrec.opcode >> 18)& 0x7;
	crD = 7-crD;
	crS = 7-crS;
	
	dst_mask = trx_ppc_cmp_and_mask[crD];
	src_mask = trx_ppc_cmp_and_mask[crS];
	
	//trxCPUrec.cr &= trx_ppc_cmp_and_mask[crD];
	gen_asm(MOV_RM, ECX, (uint32)&trxCPUrec.cr);
	gen_asm(AND_RI32, ECX, dst_mask);
	//c = (trxCPUrec.fpscr >> (crS*4))&0xf;
	gen_asm(MOV_RM, EAX, (uint32)&trxCPUrec.fpscr);
	gen_asm(SHR_RI8, EAX, crS*4);
	gen_asm(AND_RI32, EAX, 0xf);
	//trxCPUrec.cr |= c<<(crD*4);
	gen_asm(SHL_RI8, EAX, crD*4);
	gen_asm(OR_RR, ECX, EAX);
	gen_asm(MOV_MR, (uint32)&trxCPUrec.cr, ECX);
	//trxCPUrec.fpscr &= trx_ppc_cmp_and_mask[crS];
	gen_asm(MOV_RM, EAX, (uint32)&trxCPUrec.fpscr);
	gen_asm(AND_RI32, EAX, src_mask);
	gen_asm(MOV_MR, (uint32)&trxCPUrec.fpscr, EAX);
}

//==============================================================================
// Floating point load/store opcodes
//
// File-local scratch variables. The generated code refers to them by
// absolute address, so they must have static storage.
static float scale_value;
// NOTE(review): 0xffffffff is converted by value to float (about 4.29e9);
// it is not stored as a bit pattern - confirm this is the intent.
static float scale_watch=0xffffffff;
static double double_store;
// 32-bit staging slot: emitted code moves raw single-precision bits between
// EAX and the x87 stack through this variable (lfs*, stfs*, frsp).
static float float_store;
static uint32 int32_store;
// used
void trx_ppc_gen_lfs()
{
	uint32 rD, rA;
	sint16 imm;

	// FPU exceptions check already generated outside
	rD = (trxCPUrec.opcode >> 21)& 0x1f;
	rA = (trxCPUrec.opcode >> 16)& 0x1f;
	imm = trxCPUrec.opcode & 0xffff;

	if(rA == 0)
	{
	//	printf("[Tratax recompiler] lfs constant access at %8.8x\n", trxCPUrec.pc);
		//EA = 0 + imm;
		gen_asm(MOV_RI32, ECX, imm);
	}
	else
	{
		if(regc_is_constant(rA))
		{
//			printf("[Tratax recompiler] lfs constant access at %8.8x\n", trxCPUrec.pc);
		}
		//EA = trxCPUrec.gpr[rA] + imm;
		regc_load(ECX, rA);
		gen_asm(ADD_RI32, ECX, imm);
	}	
	//r = mem_read32(EA);
	gen_asm(CALL_M, (uint32)p_rec_mem_read32);
	gen_asm(MOV_MR, (uint32)&float_store, EAX);
	gen_asm(FLD32_M, (uint32)&float_store);
	// here is actual storing
	if(trxCPUrec.spr[PPC_HID2] & HID2_PSE)
	{
		//trx_rec_ps1_double[rD] = f;
		gen_asm(FST64_M, (uint32)&trx_rec_ps1_double[rD]);
	}
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);
}

// used
void trx_ppc_gen_lfsx()
{
	uint32 rD, rA, rB;

	// FPU exceptions check already generated outside
	rD = (trxCPUrec.opcode >> 21)& 0x1f;
	rA = (trxCPUrec.opcode >> 16)& 0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	if(rA == 0)
	{
		if(regc_is_constant(rB))
		{
//			printf("[Tratax recompiler] lfsx constant access at %8.8x\n", trxCPUrec.pc);
		}
		//EA = trxCPUrec.gpr[rB];
		regc_load(ECX, rB);
	}
	else
	{
		if(regc_is_constant(rA) && regc_is_constant(rB))
		{
//			printf("[Tratax recompiler] lfsx constant access at %8.8x\n", trxCPUrec.pc);
		}
		regc_load(EAX, rA);
		regc_load(ECX, rB);
		//EA = trxCPUrec.gpr[rA] + trxCPUrec.gpr[rB];
		gen_asm(ADD_RR, ECX, EAX);
	}
	gen_asm(CALL_M, (uint32)p_rec_mem_read32);
	gen_asm(MOV_MR, (uint32)&float_store, EAX);
	gen_asm(FLD32_M, (uint32)&float_store);
	if(trxCPUrec.spr[PPC_HID2] & HID2_PSE)
	{
		//trx_rec_ps1_double[rD] = f;
		gen_asm(FST64_M, (uint32)&trx_rec_ps1_double[rD]);
	}
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);
}

// used
void trx_ppc_gen_lfsux()
{
	uint32 rD, rA, rB;

	// FPU exceptions check already generated outside
	rD = (trxCPUrec.opcode >> 21)& 0x1f;
	rA = (trxCPUrec.opcode >> 16)& 0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	if(regc_is_constant(rA) && regc_is_constant(rB))
	{
//		printf("[Tratax recompiler] lfsux constant access at %8.8x\n", trxCPUrec.pc);
	}
	regc_load(EAX, rA);
	regc_load(ECX, rB);
	//EA = trxCPUrec.gpr[rA] + trxCPUrec.gpr[rB];
	gen_asm(ADD_RR, ECX, EAX);
	regc_store(ECX, rA);
	//	trxCPUrec.gpr[rD] = mem_read32(EA);	
	gen_asm(CALL_M, (uint32)p_rec_mem_read32);
	gen_asm(MOV_MR, (uint32)&float_store, EAX);
	gen_asm(FLD32_M, (uint32)&float_store);
	if(trxCPUrec.spr[PPC_HID2] & HID2_PSE)
	{
		//trx_rec_ps1_double[rD] = f;
		gen_asm(FST64_M, (uint32)&trx_rec_ps1_double[rD]);
	}
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);
}

// used
void trx_ppc_gen_lfsu()
{
	uint32 rD, rA;
	sint16 imm;

	// FPU exceptions check already generated outside
	rD = (trxCPUrec.opcode >> 21)& 0x1f;
	rA = (trxCPUrec.opcode >> 16)& 0x1f;
	imm = trxCPUrec.opcode & 0xffff;

	if(regc_is_constant(rA))
	{
//		printf("[Tratax recompiler] lfsu constant access at %8.8x\n", trxCPUrec.pc);
	}
	//EA = trxCPUrec.gpr[rA] + imm;
	regc_load(ECX, rA);
	gen_asm(ADD_RI32, ECX, imm);
	regc_store(ECX, rA);
	//r = mem_read32(EA);
	gen_asm(CALL_M, (uint32)p_rec_mem_read32);
	gen_asm(MOV_MR, (uint32)&float_store, EAX);
	gen_asm(FLD32_M, (uint32)&float_store);
	if(trxCPUrec.spr[PPC_HID2] & HID2_PSE)
	{
		//trx_rec_ps1_double[rD] = f;
		gen_asm(FST64_M, (uint32)&trx_rec_ps1_double[rD]);
	}
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);
}

// used
void trx_ppc_gen_lfd()
{
	uint32 rD, rA;
	sint16 imm;

	rD = (trxCPUrec.opcode >> 21)& 0x1f;
	rA = (trxCPUrec.opcode >> 16)& 0x1f;
	imm = trxCPUrec.opcode & 0xffff;

	if(rA == 0)
	{
//		printf("[Tratax recompiler] lfd constant access at %8.8x\n", trxCPUrec.pc);
		//EA = 0 + imm;
		gen_asm(MOV_RI32, ECX, imm);
	}
	else
	{
		if(regc_is_constant(rA))
		{
//			printf("[Tratax recompiler] lfd constant access at %8.8x\n", trxCPUrec.pc);
		}
		//EA = trxCPUrec.gpr[rA] + imm;
		regc_load(ECX, rA);
		gen_asm(ADD_RI32, ECX, imm);
	}	
	gen_asm(PUSH_R, ECX);
	gen_asm(CALL_M, (uint32)p_rec_mem_read32);
	gen_asm(POP_R, ECX);
	gen_asm(MOV_MR, (uint32)&trx_rec_ps0_int[rD]+4, EAX);
	gen_asm(ADD_RI32, ECX, 4);
	gen_asm(CALL_M, (uint32)p_rec_mem_read32);
	gen_asm(MOV_MR, (uint32)&trx_rec_ps0_int[rD], EAX);	
}

// used
void trx_ppc_gen_lfdu()
{
	uint32 rD, rA;
	sint16 imm;

	rD = (trxCPUrec.opcode >> 21)& 0x1f;
	rA = (trxCPUrec.opcode >> 16)& 0x1f;
	imm = trxCPUrec.opcode & 0xffff;

	if(regc_is_constant(rA))
	{
//		printf("[Tratax recompiler] lfdu constant access at %8.8x\n", trxCPUrec.pc);
	}
	//EA = trxCPUrec.gpr[rA] + imm;
	regc_load(ECX, rA);
	gen_asm(ADD_RI32, ECX, imm);
	regc_store(ECX, rA);
	gen_asm(PUSH_R, ECX);
	gen_asm(CALL_M, (uint32)p_rec_mem_read32);
	gen_asm(POP_R, ECX);
	gen_asm(MOV_MR, (uint32)&trx_rec_ps0_int[rD]+4, EAX);
	gen_asm(ADD_RI32, ECX, 4);
	gen_asm(CALL_M, (uint32)p_rec_mem_read32);
	gen_asm(MOV_MR, (uint32)&trx_rec_ps0_int[rD], EAX);	
}

// used
void trx_ppc_gen_lfdx()
{
	uint32 rD, rA, rB;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	if(rA == 0)
	{
		if(regc_is_constant(rB))
		{
//			printf("[Tratax recompiler] lfdx constant access at %8.8x\n", trxCPUrec.pc);
		}
		//EA = trxCPUrec.gpr[rB];
		regc_load(ECX, rB);
	}
	else
	{
		if(regc_is_constant(rA) && regc_is_constant(rB))
		{
//			printf("[Tratax recompiler] lfdx constant access at %8.8x\n", trxCPUrec.pc);
		}
		regc_load(EAX, rA);
		regc_load(ECX, rB);
		//EA = trxCPUrec.gpr[rA] + trxCPUrec.gpr[rB];
		gen_asm(ADD_RR, ECX, EAX);
	}
	gen_asm(PUSH_R, ECX);
	gen_asm(CALL_M, (uint32)p_rec_mem_read32);
	gen_asm(POP_R, ECX);
	gen_asm(MOV_MR, (uint32)&trx_rec_ps0_int[rD]+4, EAX);
	gen_asm(ADD_RI32, ECX, 4);
	gen_asm(CALL_M, (uint32)p_rec_mem_read32);
	gen_asm(MOV_MR, (uint32)&trx_rec_ps0_int[rD], EAX);	
}

// used
void trx_ppc_gen_lfdux()
{
	uint32 rD, rA, rB;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	if(regc_is_constant(rA) && regc_is_constant(rB))
	{
//		printf("[Tratax recompiler] lfdux constant access at %8.8x\n", trxCPUrec.pc);
	}
	regc_load(EAX, rA);
	regc_load(ECX, rB);
	//EA = trxCPUrec.gpr[rA] + trxCPUrec.gpr[rB];
	gen_asm(ADD_RR, ECX, EAX);
	regc_store(ECX, rA);
	gen_asm(PUSH_R, ECX);
	gen_asm(CALL_M, (uint32)p_rec_mem_read32);
	gen_asm(POP_R, ECX);
	gen_asm(MOV_MR, (uint32)&trx_rec_ps0_int[rD]+4, EAX);
	gen_asm(ADD_RI32, ECX, 4);
	gen_asm(CALL_M, (uint32)p_rec_mem_read32);
	gen_asm(MOV_MR, (uint32)&trx_rec_ps0_int[rD], EAX);	
}

// used
void trx_ppc_gen_stfd()
{
	uint32 rS, rA;
	sint16 imm;
	
	rS = (trxCPUrec.opcode >> 21)& 0x1f;
	rA = (trxCPUrec.opcode >> 16)& 0x1f;
	imm = trxCPUrec.opcode & 0xffff;

	if(rA == 0)
	{
//		printf("[Tratax recompiler] stfd constant access at %8.8x\n", trxCPUrec.pc);
		//EA = 0 + imm;
		gen_asm(MOV_RI32, ECX, imm);
	}
	else
	{
		if(regc_is_constant(rA))
		{
//			printf("[Tratax recompiler] stfd constant access at %8.8x\n", trxCPUrec.pc);
		}
		//EA = trxCPUrec.gpr[rA] + imm;
		regc_load(ECX, rA);
		gen_asm(ADD_RI32, ECX, imm);
	}	
	gen_asm(MOV_RM, EAX, (uint32)&trx_rec_ps0_int[rS]+4);
	gen_asm(PUSH_R, ECX);
	gen_asm(CALL_M, (uint32)p_rec_mem_write32);
	gen_asm(POP_R, ECX);
	gen_asm(MOV_RM, EAX, (uint32)&trx_rec_ps0_int[rS]);
	gen_asm(ADD_RI32, ECX, 4);
	gen_asm(CALL_M, (uint32)p_rec_mem_write32);
}

// used
void trx_ppc_gen_stfdu()
{
	uint32 rS, rA;
	sint16 imm;
	
	rS = (trxCPUrec.opcode >> 21)& 0x1f;
	rA = (trxCPUrec.opcode >> 16)& 0x1f;
	imm = trxCPUrec.opcode & 0xffff;

	if(regc_is_constant(rA))
	{
//		printf("[Tratax recompiler] stfdu constant access at %8.8x\n", trxCPUrec.pc);
	}
	//EA = trxCPUrec.gpr[rA] + imm;
	regc_load(ECX, rA);
	gen_asm(ADD_RI32, ECX, imm);
	regc_store(ECX, rA);

	gen_asm(MOV_RM, EAX, (uint32)&trx_rec_ps0_int[rS]+4);
	gen_asm(PUSH_R, ECX);
	gen_asm(CALL_M, (uint32)p_rec_mem_write32);
	gen_asm(POP_R, ECX);	

	gen_asm(MOV_RM, EAX, (uint32)&trx_rec_ps0_int[rS]);
	gen_asm(ADD_RI32, ECX, 4);
	gen_asm(CALL_M, (uint32)p_rec_mem_write32);
}

// used
void trx_ppc_gen_stfdx()
{
	uint32 rS, rA, rB;
	
	rS = (trxCPUrec.opcode >> 21)& 0x1f;
	rA = (trxCPUrec.opcode >> 16)& 0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	if(rA == 0)
	{
		if(regc_is_constant(rB))
		{
//			printf("[Tratax recompiler] stfdx constant access at %8.8x\n", trxCPUrec.pc);
		}
		//EA = trxCPUrec.gpr[rB];
		regc_load(ECX, rB);
	}
	else
	{
		if(regc_is_constant(rA) && regc_is_constant(rB))
		{
//			printf("[Tratax recompiler] stfdx constant access at %8.8x\n", trxCPUrec.pc);
		}
		regc_load(EAX, rA);
		regc_load(ECX, rB);
		//EA = trxCPUrec.gpr[rA] + trxCPUrec.gpr[rB];
		gen_asm(ADD_RR, ECX, EAX);
	}
	gen_asm(MOV_RM, EAX, (uint32)&trx_rec_ps0_int[rS]+4);
	gen_asm(PUSH_R, ECX);
	gen_asm(CALL_M, (uint32)p_rec_mem_write32);
	gen_asm(POP_R, ECX);	

	gen_asm(MOV_RM, EAX, (uint32)&trx_rec_ps0_int[rS]);
	gen_asm(ADD_RI32, ECX, 4);
	gen_asm(CALL_M, (uint32)p_rec_mem_write32);
}

// used
void trx_ppc_gen_stfdux()
{
	uint32 rS, rA, rB;
	
	rS = (trxCPUrec.opcode >> 21)& 0x1f;
	rA = (trxCPUrec.opcode >> 16)& 0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	if(regc_is_constant(rA) && regc_is_constant(rB))
	{
//		printf("[Tratax recompiler] stfdux constant access at %8.8x\n", trxCPUrec.pc);
	}
	regc_load(EAX, rA);
	regc_load(ECX, rB);
	//EA = trxCPUrec.gpr[rA] + trxCPUrec.gpr[rB];
	gen_asm(ADD_RR, ECX, EAX);
	regc_store(ECX, rA);

	gen_asm(MOV_RM, EAX, (uint32)&trx_rec_ps0_int[rS]+4);
	gen_asm(PUSH_R, ECX);
	gen_asm(CALL_M, (uint32)p_rec_mem_write32);
	gen_asm(POP_R, ECX);	

	gen_asm(MOV_RM, EAX, (uint32)&trx_rec_ps0_int[rS]);
	gen_asm(ADD_RI32, ECX, 4);
	gen_asm(CALL_M, (uint32)p_rec_mem_write32);
}

// used
void trx_ppc_gen_stfs()
{
	uint32 rS, rA;
	sint16 imm;
	
	rS = (trxCPUrec.opcode >> 21)& 0x1f;
	rA = (trxCPUrec.opcode >> 16)& 0x1f;
	imm = trxCPUrec.opcode & 0xffff;

	if(rA == 0)
	{
//		printf("[Tratax recompiler] stfs constant access at %8.8x\n", trxCPUrec.pc);
		//EA = 0 + imm;
		gen_asm(MOV_RI32, ECX, imm);
	}
	else
	{
		if(regc_is_constant(rA))
		{
//			printf("[Tratax recompiler] stfs constant access at %8.8x\n", trxCPUrec.pc);
		}
		//EA = trxCPUrec.gpr[rA] + imm;
		regc_load(ECX, rA);
		gen_asm(ADD_RI32, ECX, imm);
	}	
	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rS]);
	gen_asm(FSTP32_M, (uint32)&float_store);
	gen_asm(MOV_RM, EAX, (uint32)&float_store);
	gen_asm(CALL_M, (uint32)p_rec_mem_write32);
}

// used
void trx_ppc_gen_stfsx()
{
	uint32 rS, rA, rB;
	
	rS = (trxCPUrec.opcode >> 21)& 0x1f;
	rA = (trxCPUrec.opcode >> 16)& 0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	if(rA == 0)
	{
		if(regc_is_constant(rB))
		{
//			printf("[Tratax recompiler] stfsx constant access at %8.8x\n", trxCPUrec.pc);
		}
		//EA = trxCPUrec.gpr[rB];
		regc_load(ECX, rB);
	}
	else
	{
		if(regc_is_constant(rA) && regc_is_constant(rB))
		{
//			printf("[Tratax recompiler] stfsx constant access at %8.8x\n", trxCPUrec.pc);
		}
		regc_load(EAX, rA);
		regc_load(ECX, rB);
		//EA = trxCPUrec.gpr[rA] + trxCPUrec.gpr[rB];
		gen_asm(ADD_RR, ECX, EAX);
	}
	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rS]);
	gen_asm(FSTP32_M, (uint32)&float_store);
	gen_asm(MOV_RM, EAX, (uint32)&float_store);
	gen_asm(CALL_M, (uint32)p_rec_mem_write32);
}

// used
void trx_ppc_gen_stfsux()
{
	uint32 rS, rA, rB;
	
	rS = (trxCPUrec.opcode >> 21)& 0x1f;
	rA = (trxCPUrec.opcode >> 16)& 0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	if(regc_is_constant(rA) && regc_is_constant(rB))
	{
//		printf("[Tratax recompiler] stfsux constant access at %8.8x\n", trxCPUrec.pc);
	}
	regc_load(EAX, rA);
	regc_load(ECX, rB);
	//EA = trxCPUrec.gpr[rA] + trxCPUrec.gpr[rB];
	gen_asm(ADD_RR, ECX, EAX);
	regc_store(ECX, rA);
	// store
	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rS]);
	gen_asm(FSTP32_M, (uint32)&float_store);
	gen_asm(MOV_RM, EAX, (uint32)&float_store);
	gen_asm(CALL_M, (uint32)p_rec_mem_write32);
}

// used
void trx_ppc_gen_stfsu()
{
	uint32 rS, rA;
	sint16 imm;
	
	rS = (trxCPUrec.opcode >> 21)& 0x1f;
	rA = (trxCPUrec.opcode >> 16)& 0x1f;
	imm = trxCPUrec.opcode & 0xffff;

	if(regc_is_constant(rA))
	{
//		printf("[Tratax recompiler] stfsu constant access at %8.8x\n", trxCPUrec.pc);
	}
	//EA = trxCPUrec.gpr[rA] + imm;
	regc_load(ECX, rA);
	gen_asm(ADD_RI32, ECX, imm);
	regc_store(ECX, rA);
	// store
	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rS]);
	gen_asm(FSTP32_M, (uint32)&float_store);
	gen_asm(MOV_RM, EAX, (uint32)&float_store);
	gen_asm(CALL_M, (uint32)p_rec_mem_write32);
}

// used
void trx_ppc_gen_stfiwx()
{
	uint32 rS, rA, rB;

	rS = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	if(rA == 0)
	{
		if(regc_is_constant(rB))
		{
//			printf("[Tratax recompiler] stfiwx constant access at %8.8x\n", trxCPUrec.pc);
		}
		//EA = trxCPUrec.gpr[rB];
		regc_load(ECX, rB);
	}
	else
	{
		if(regc_is_constant(rA) && regc_is_constant(rB))
		{
//			printf("[Tratax recompiler] stfiwx constant access at %8.8x\n", trxCPUrec.pc);
		}
		regc_load(EAX, rA);
		regc_load(ECX, rB);
		//EA = trxCPUrec.gpr[rA] + trxCPUrec.gpr[rB];
		gen_asm(ADD_RR, ECX, EAX);
	}
	gen_asm(MOV_RM, EAX, (uint32)&trxCPUrec.fpr[rS]);
	gen_asm(CALL_M, (uint32)p_rec_mem_write32);
}

//==============================================================================
// Floating point arithmetic opcodes
//

// tested
void trx_ppc_gen_fdivsx()
{
	uint32 rD, rA, rB;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
		
/*
	f = (trx_rec_ps0_double[rA] / trx_rec_ps0_double[rB]);
	trx_rec_ps0_double[rD] = f;
	if(trxCPUrec.spr[PPC_HID2] & HID2_PSE)
	{
		trx_rec_ps1_double[rD] = f;
	}
*/
	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rA]);
	gen_asm(FDIV64_M, (uint32)&trx_rec_ps0_double[rB]);
	if(trxCPUrec.spr[PPC_HID2] & HID2_PSE)
	{
		gen_asm(FST64_M, (uint32)&trx_rec_ps1_double[rD]);
	}
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);		
}

// tested
void trx_ppc_gen_fdivx()
{
	uint32 rD, rA, rB;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
		
/*
	trx_rec_ps0_double[rD] = (trx_rec_ps0_double[rA] / trx_rec_ps0_double[rB]);
*/
	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rA]);
	gen_asm(FDIV64_M, (uint32)&trx_rec_ps0_double[rB]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);		
}

// tested
void trx_ppc_gen_fsubsx()
{
	uint32 rD, rA, rB;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
/*		
	f = (trx_rec_ps0_double[rA] - trx_rec_ps0_double[rB]);
	trx_rec_ps0_double[rD] = f;

	if(trxCPUrec.spr[PPC_HID2] & HID2_PSE)
	{
		trx_rec_ps1_double[rD] = f;
	}
*/
	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rA]);
	gen_asm(FSUB64_M, (uint32)&trx_rec_ps0_double[rB]);
	if(trxCPUrec.spr[PPC_HID2] & HID2_PSE)
	{
		gen_asm(FST64_M, (uint32)&trx_rec_ps1_double[rD]);
	}
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);		
}

// tested
void trx_ppc_gen_fmrx()
{
	uint32 rD, rB;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	//trx_rec_ps0_int[rD] = trx_rec_ps0_int[rB]; // maybe copy as doubles is faster ?
	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_int[rB]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_int[rD]);
/*
	frd = &trx_rec_ps0_double[rD];
	frb = &trx_rec_ps0_double[rB];
	_asm
	{
		mov edx, dword ptr frb
		fld qword ptr [edx]	; // load double
		mov edx, dword ptr frd
		fstp qword ptr [edx]; // store as double 
	};
*/
}
// used
void trx_ppc_gen_fmulx()
{
	uint32 rD, rA, rC;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;

	//trx_rec_ps0_double[rD] = trx_rec_ps0_double[rA] * trx_rec_ps0_double[rC];	
	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rA]);
	gen_asm(FMUL64_M, (uint32)&trx_rec_ps0_double[rC]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);		
}
// used
void trx_ppc_gen_fsubx()
{
	uint32 rD, rA, rB;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	//trx_rec_ps0_double[rD] = trx_rec_ps0_double[rA] - trx_rec_ps0_double[rB];	
	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rA]);
	gen_asm(FSUB64_M, (uint32)&trx_rec_ps0_double[rB]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);		
}

// tested
void trx_ppc_gen_fmulsx()
{
	uint32 rD, rA, rC;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;

/*
	f = (trx_rec_ps0_double[rA] * trx_rec_ps0_double[rC]);
	trx_rec_ps0_double[rD] = f;

	if(gCPU.hid[2] & HID2_PSE)
	{
		trx_rec_ps1_double[rD] = f;
	}
*/
	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rA]);
	gen_asm(FMUL64_M, (uint32)&trx_rec_ps0_double[rC]);
	if(trxCPUrec.spr[PPC_HID2] & HID2_PSE)
	{
		gen_asm(FST64_M, (uint32)&trx_rec_ps1_double[rD]);
	}
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);		
}

// tested
void trx_ppc_gen_fnegx()
{
	uint32 rD, rB;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
		
//	trx_rec_ps0_int[rD] = trx_rec_ps0_int[rB] ^ FPU_SIGN_BIT;

	gen_asm(MOV_RM, EAX, (uint32)&trx_rec_ps0_int[rB]);
	gen_asm(MOV_RM, ECX, (uint32)&trx_rec_ps0_int[rB]+4);
	gen_asm(XOR_RI32, ECX, 0x80000000);
	gen_asm(MOV_MR, (uint32)&trx_rec_ps0_int[rD], EAX);
	gen_asm(MOV_MR, (uint32)&trx_rec_ps0_int[rD]+4, ECX);

//	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_int[rB],0);
//	gen_asm(FCHS, 0, 0);
//	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_int[rD],0);
}
// used
void trx_ppc_gen_frspx()
{
	uint32 rD, rB;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	//float f;
	//f = trx_rec_ps0_double[rB];
	//trx_rec_ps0_double[rD] = f;

	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rB]);
	gen_asm(FSTP32_M, (uint32)&float_store);
	gen_asm(FLD32_M, (uint32)&float_store);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);		
}

// tested
void trx_ppc_gen_fabsx()
{
	uint32 rD, rB;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
		
	//trx_rec_ps0_int[rD] = trx_rec_ps0_int[rB] & ~FPU_SIGN_BIT;
	gen_asm(MOV_RM, EAX, (uint32)&trx_rec_ps0_int[rB]);
	gen_asm(MOV_RM, ECX, (uint32)&trx_rec_ps0_int[rB]+4);
	gen_asm(AND_RI32, ECX, ~0x80000000);
	gen_asm(MOV_MR, (uint32)&trx_rec_ps0_int[rD], EAX);
	gen_asm(MOV_MR, (uint32)&trx_rec_ps0_int[rD]+4, ECX);
}

// tested
void trx_ppc_gen_fnabsx()
{
	uint32 rD, rB;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
		
	//trx_rec_ps0_int[rD] = trx_rec_ps0_int[rB] | FPU_SIGN_BIT;
	gen_asm(MOV_RM, EAX, (uint32)&trx_rec_ps0_int[rB]);
	gen_asm(MOV_RM, ECX, (uint32)&trx_rec_ps0_int[rB]+4);
	gen_asm(OR_RI32, ECX, 0x80000000);
	gen_asm(MOV_MR, (uint32)&trx_rec_ps0_int[rD], EAX);
	gen_asm(MOV_MR, (uint32)&trx_rec_ps0_int[rD]+4, ECX);
}

// tested
void trx_ppc_gen_fresx()
{
	uint32 rD, rB;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

/*
	fpu_as_double[frD] = 1.0f / fpu_as_double[frB];

	if(gCPU.hid[2] & HID2_PSE)
	{
		ps1_double[frD] = f;
	}
*/
	gen_asm(FLD1);
	gen_asm(FDIV64_M, (uint32)&trx_rec_ps0_double[rB]);
	if(trxCPUrec.spr[PPC_HID2] & HID2_PSE)
	{
		gen_asm(FST64_M, (uint32)&trx_rec_ps1_double[rD]);
	}	
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);
}

// tested
void trx_ppc_gen_faddsx()
{
	uint32 rD, rA, rB;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
/*		
	f = (trx_rec_ps0_double[rA] + trx_rec_ps0_double[rB]);
	trx_rec_ps0_double[rD] = f;

	if(gCPU.hid[2] & HID2_PSE)
	{
		trx_rec_ps1_double[rD] = f;
	}
*/
	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rA]);
	gen_asm(FADD64_M, (uint32)&trx_rec_ps0_double[rB]);
	if(trxCPUrec.spr[PPC_HID2] & HID2_PSE)
	{
		gen_asm(FST64_M, (uint32)&trx_rec_ps1_double[rD]);
	}
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);		
}

// tested
void trx_ppc_gen_faddx()
{
	uint32 rD, rA, rB;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
/*		
	trx_rec_ps0_double[rD] = (trx_rec_ps0_double[rA] + trx_rec_ps0_double[rB]);
*/
	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rA]);
	gen_asm(FADD64_M, (uint32)&trx_rec_ps0_double[rB]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);		
}

// tested
void trx_ppc_gen_frsqrtex()
{
	uint32 rD, rB;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

//	trx_rec_ps0_double[rD] = 1.0f / sqrt(trx_rec_ps0_double[rB]);
	gen_asm(FLD1);
	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rB]);
	gen_asm(FSQRT);
	gen_asm(FDIV);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);
}

//==============================================================================
// Floating point comparison opcodes
//

// used
void trx_ppc_gen_fcmpu()
{
	uint32 cr, rA, rB;
	uint32 shift, mask;

	cr = (trxCPUrec.opcode >> 23)& 0x7;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	cr = 7-cr;
	shift = 28-(cr*4); 
	mask = trx_ppc_cmp_and_mask[cr];

	gen_asm(FLD64_M, (uint32)&trxCPUrec.fpr[rB]);
	gen_asm(FLD64_M, (uint32)&trxCPUrec.fpr[rA]);
	gen_asm(XOR_RR, ECX, ECX);
	gen_asm(FUCOMIP, 1);
	gen_asm(FPOP); 
	gen_asm(CMOVA_M, EAX, (uint32)&cr_gt);
	gen_asm(CMOVB_M, EAX, (uint32)&cr_lt);
	gen_asm(CMOVE_M, EAX, (uint32)&cr_eq);
	gen_asm(CMOVP_M, EAX, (uint32)&cr_so);
	gen_asm(CMOVP_M, ECX, (uint32)&fcmpu_nan_flags);
	//if(rA == NaN || rB == Nan)
	//{
	//	trxCPUrec.fpscr |= FPSCR_VXSNAN;
	//}
	//trxCPUrec.fpscr &= ~0x1f000;
	//trxCPUrec.fpscr |= (cmp << 12); -> thats when the bits are in 0-3 now they are in 28-31
	//so we should do >>28 <<12 which is >>16
	gen_asm(MOV_RM, EDX, (uint32)&trxCPUrec.fpscr);
	gen_asm(OR_RR, EDX, ECX); // apply flags if needed
	gen_asm(AND_RI32, EDX, ~0x1f000);
	gen_asm(MOV_RR, ECX, EAX);
	gen_asm(SHR_RI8, ECX, 16);
	gen_asm(OR_RR, ECX, EDX);
	gen_asm(MOV_MR, (uint32)&trxCPUrec.fpscr, ECX);
	// store in designated flags
	gen_asm(SHR_RI8, EAX, shift);
	gen_asm(MOV_RM, ECX, (uint32)&trxCPUrec.cr);
	gen_asm(AND_RI32, ECX, mask);
	gen_asm(OR_RR, ECX, EAX);
	gen_asm(MOV_MR, (uint32)&trxCPUrec.cr, ECX);
}

// used
void trx_ppc_gen_fcmpo()
{
	uint32 cr, rA, rB;
	uint32 shift, mask;

	cr = (trxCPUrec.opcode >> 23)& 0x7;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	cr = 7-cr;
	shift = 28-(cr*4); 
	mask = trx_ppc_cmp_and_mask[cr];

	gen_asm(FLD64_M, (uint32)&trxCPUrec.fpr[rB]);
	gen_asm(FLD64_M, (uint32)&trxCPUrec.fpr[rA]);
	gen_asm(XOR_RR, ECX, ECX);
	gen_asm(FUCOMIP, 1);
	gen_asm(FPOP);
	gen_asm(CMOVA_M, EAX, (uint32)&cr_gt);
	gen_asm(CMOVB_M, EAX, (uint32)&cr_lt);
	gen_asm(CMOVE_M, EAX, (uint32)&cr_eq);
	gen_asm(CMOVP_M, EAX, (uint32)&cr_so);
	gen_asm(CMOVP_M, ECX, (uint32)&fcmpo_nan_flags);
	//if(rA == NaN || rB == Nan)
	//{
	//	trxCPUrec.fpscr |= FPSCR_VXSNAN;
	//	trxCPUrec.fpscr |= FPSCR_VXVC;
	//}
	//trxCPUrec.fpscr &= ~0x1f000;
	//trxCPUrec.fpscr |= (cmp << 12); -> thats when the bits are in 0-3 now they are in 28-31
	//so we should do >>28 <<12 which is >>16
	gen_asm(MOV_RM, EDX, (uint32)&trxCPUrec.fpscr);
	gen_asm(OR_RR, EDX, ECX); // apply flags if needed
	gen_asm(AND_RI32, EDX, ~0x1f000);
	gen_asm(MOV_RR, ECX, EAX);
	gen_asm(SHR_RI8, ECX, 16);
	gen_asm(OR_RR, ECX, EDX);
	gen_asm(MOV_MR, (uint32)&trxCPUrec.fpscr, ECX);
	// store in designated flags
	gen_asm(SHR_RI8, EAX, shift);
	gen_asm(MOV_RM, ECX, (uint32)&trxCPUrec.cr);
	gen_asm(AND_RI32, ECX, mask);
	gen_asm(OR_RR, ECX, EAX);
	gen_asm(MOV_MR, (uint32)&trxCPUrec.cr, ECX);
}

// tested
void trx_ppc_gen_fselx()
{
	uint32 rD, rA, rB, rC;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;

//	if (A.type == ppc_fpr_NaN || trx_rec_ps0_double[rA] < 0.0f) 
//		trx_rec_ps0_double[rD] = trx_rec_ps0_double[rB];
//	else 
//		trx_rec_ps0_double[rD] = trx_rec_ps0_double[rC];
	gen_asm(FLD64_M, (uint32)&trxCPUrec.fpr[rC]);
	gen_asm(FLD64_M, (uint32)&trxCPUrec.fpr[rB]);
	gen_asm(FLDZ);
	gen_asm(FLD64_M, (uint32)&trxCPUrec.fpr[rA]);
	gen_asm(FUCOMIP, 1); // cmp a, 0
	gen_asm(FCMOVB_M, 1); // < 0
	gen_asm(FCMOVU_M, 1); // NaN
	gen_asm(FCMOVNB_M, 2); // >= 0
	gen_asm(FSTP64_M, (uint32)&trxCPUrec.fpr[rD]);
	gen_asm(FPOP);
	gen_asm(FPOP);
}

//==============================================================================
// Floating point conversion and rounding opcodes
//

void trx_ppc_gen_fctiwzx()
{
	uint32 rD, rB;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	gen_asm(FLDCW_M, (uint32)&fpucontrol_roundzero);
	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rB]);
	gen_asm(FISTP32_M, (uint32)&trxCPUrec.fpr[rD]);
//	gen_asm(FISTP64_M, (uint32)&trxCPUrec.fpr[rD]);   used to be
	gen_asm(FLDCW_M, (uint32)&fpucontrol_default);
}

void trx_ppc_gen_fctiwx()
{
	uint32 rD, rB;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rB]);
	gen_asm(FISTP32_M, (uint32)&trxCPUrec.fpr[rD]);
//	gen_asm(FISTP64_M, (uint32)&trxCPUrec.fpr[rD]);  used to be
}

//==============================================================================
// Floating point multiply-add opcodes
//
// tested
void trx_ppc_gen_fmsubsx()
{
	uint32 rD, rA, rB, rC;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;

/*
	double f;

	f = ((trx_rec_ps0_double[rA] * trx_rec_ps0_double[rC]) - trx_rec_ps0_double[rB]);
	trx_rec_ps0_double[rD] = f;

	if(gCPU.hid[2] & HID2_PSE)
	{
		trx_rec_ps1_double[rD] = f;
	}
*/
	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rA]);
	gen_asm(FMUL64_M, (uint32)&trx_rec_ps0_double[rC]);
	gen_asm(FSUB64_M, (uint32)&trx_rec_ps0_double[rB]);
	if(trxCPUrec.spr[PPC_HID2] & HID2_PSE)
	{
		gen_asm(FST64_M, (uint32)&trx_rec_ps1_double[rD]);
	}
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);		
}

// tested
void trx_ppc_gen_fnmsubsx()
{
	uint32 rD, rA, rB, rC;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;

/*
	double f;

	f = -((trx_rec_ps0_double[rA] * trx_rec_ps0_double[rC]) - trx_rec_ps0_double[rB]);
	trx_rec_ps0_double[rD] = f;

	if(gCPU.hid[2] & HID2_PSE)
	{
		trx_rec_ps1_double[rD] = f;
	}
*/
	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rA]);
	gen_asm(FMUL64_M, (uint32)&trx_rec_ps0_double[rC]);
	gen_asm(FSUB64_M, (uint32)&trx_rec_ps0_double[rB]);
	gen_asm(FCHS);
	if(trxCPUrec.spr[PPC_HID2] & HID2_PSE)
	{
		gen_asm(FST64_M, (uint32)&trx_rec_ps1_double[rD]);
	}
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);		
}

// tested
void trx_ppc_gen_fnmsubx()
{
	uint32 rD, rA, rB, rC;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;

/*
	trx_rec_ps0_double[rD] = -((trx_rec_ps0_double[rA] * trx_rec_ps0_double[rC]) - trx_rec_ps0_double[rB]);
*/
	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rA]);
	gen_asm(FMUL64_M, (uint32)&trx_rec_ps0_double[rC]);
	gen_asm(FSUB64_M, (uint32)&trx_rec_ps0_double[rB]);
	gen_asm(FCHS);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);		
}

// tested
void trx_ppc_gen_fmsubx()
{
	uint32 rD, rA, rB, rC;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;

/*
	trx_rec_ps0_double[rD] = (trx_rec_ps0_double[rA] * trx_rec_ps0_double[rC]) - trx_rec_ps0_double[rB];
*/
	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rA]);
	gen_asm(FMUL64_M, (uint32)&trx_rec_ps0_double[rC]);
	gen_asm(FSUB64_M, (uint32)&trx_rec_ps0_double[rB]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);		
}

// tested
void trx_ppc_gen_fmaddsx()
{
	uint32 rD, rA, rB, rC;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;

/*
	double f;

	f = (trx_rec_ps0_double[rA] * trx_rec_ps0_double[rC]) + trx_rec_ps0_double[rB];
	trx_rec_ps0_double[rD] = f;

	if(gCPU.hid[2] & HID2_PSE)
	{
		trx_rec_ps1_double[rD] = f;
	}
*/
	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rA]);
	gen_asm(FMUL64_M, (uint32)&trx_rec_ps0_double[rC]);
	gen_asm(FADD64_M, (uint32)&trx_rec_ps0_double[rB]);
	if(trxCPUrec.spr[PPC_HID2] & HID2_PSE)
	{
		gen_asm(FST64_M, (uint32)&trx_rec_ps1_double[rD]);
	}
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);		

}

// used
void trx_ppc_gen_fnmaddsx()
{
	uint32 rD, rA, rB, rC;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;

/*
	double f;

	f = -((trx_rec_ps0_double[rA] * trx_rec_ps0_double[rC]) + trx_rec_ps0_double[rB]);
	trx_rec_ps0_double[rD] = f;

	if(gCPU.hid[2] & HID2_PSE)
	{
		trx_rec_ps1_double[rD] = f;
	}
*/
	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rA]);
	gen_asm(FMUL64_M, (uint32)&trx_rec_ps0_double[rC]);
	gen_asm(FADD64_M, (uint32)&trx_rec_ps0_double[rB]);
	gen_asm(FCHS);
	if(trxCPUrec.spr[PPC_HID2] & HID2_PSE)
	{
		gen_asm(FST64_M, (uint32)&trx_rec_ps1_double[rD]);
	}
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);		
}

// tested
void trx_ppc_gen_fmaddx()
{
	uint32 rD, rA, rB, rC;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;

/*
	trx_rec_ps0_double[rD] = (trx_rec_ps0_double[rA] * trx_rec_ps0_double[rC]) + trx_rec_ps0_double[rB];
*/
	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rA]);
	gen_asm(FMUL64_M, (uint32)&trx_rec_ps0_double[rC]);
	gen_asm(FADD64_M, (uint32)&trx_rec_ps0_double[rB]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);
}

// used
void trx_ppc_gen_fnmaddx()
{
	uint32 rD, rA, rB, rC;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;

/*
	trx_rec_ps0_double[rD] = -((trx_rec_ps0_double[rA] * trx_rec_ps0_double[rC]) + trx_rec_ps0_double[rB]);
*/
	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rA]);
	gen_asm(FMUL64_M, (uint32)&trx_rec_ps0_double[rC]);
	gen_asm(FADD64_M, (uint32)&trx_rec_ps0_double[rB]);
	gen_asm(FCHS);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);		
}

//==============================================================================
// Paired Single Load and Store Instructions
//
// Dequantization factors, indexed by the raw 6-bit GQR load-scale field.
// The generated psq_l* code uses (scale << 2) as a byte offset into this table,
// so it must stay an array of 64 32-bit floats.
// Entries 0..31 hold 2^-index; entries 32..63 hold 2^32 down to 2^1
// (i.e. the field treated as a signed value -32..-1).
// 2^31 does not fit in a signed int: (1 << 31) overflows (in practice it yields
// INT_MIN and therefore a negative factor), so those two entries use 1ULL.
static const float dq_factor[] =
{
	1.0/(1 <<  0),
	1.0/(1 <<  1),
	1.0/(1 <<  2),
	1.0/(1 <<  3),
	1.0/(1 <<  4),
	1.0/(1 <<  5),
	1.0/(1 <<  6),
	1.0/(1 <<  7),
	1.0/(1 <<  8),
	1.0/(1 <<  9),
	1.0/(1 << 10),
	1.0/(1 << 11),
	1.0/(1 << 12),
	1.0/(1 << 13),
	1.0/(1 << 14),
	1.0/(1 << 15),
	1.0/(1 << 16),
	1.0/(1 << 17),
	1.0/(1 << 18),
	1.0/(1 << 19),
	1.0/(1 << 20),
	1.0/(1 << 21),
	1.0/(1 << 22),
	1.0/(1 << 23),
	1.0/(1 << 24),
	1.0/(1 << 25),
	1.0/(1 << 26),
	1.0/(1 << 27),
	1.0/(1 << 28),
	1.0/(1 << 29),
	1.0/(1 << 30),
	1.0/(1ULL << 31),	// unsigned: 1 << 31 would overflow int

	(1ULL << 32),
	(1ULL << 31),		// unsigned: 1 << 31 would overflow int
	(1 << 30),
	(1 << 29),
	(1 << 28),
	(1 << 27),
	(1 << 26),
	(1 << 25),
	(1 << 24),
	(1 << 23),
	(1 << 22),
	(1 << 21),
	(1 << 20),
	(1 << 19),
	(1 << 18),
	(1 << 17),
	(1 << 16),
	(1 << 15),
	(1 << 14),
	(1 << 13),
	(1 << 12),
	(1 << 11),
	(1 << 10),
	(1 <<  9),
	(1 <<  8),
	(1 <<  7),
	(1 <<  6),
	(1 <<  5),
	(1 <<  4),
	(1 <<  3),
	(1 <<  2),
	(1 <<  1),
};

// Quantized-load helpers (naked: no compiler-generated prologue/epilogue).
// ECX is assumed to contain EA, scale_value contains scale value
// returns (scaled) double on stack and ECX incremented by size
// load unscaled float: reads 4 bytes and pushes them on the x87 stack as a
// single-precision value; scale_value is not applied by this helper.
__declspec (naked) void qload_type0(void)
{
	_asm
	{
		push ecx // keep EA across the call
		call rec_mem_read32 // NOTE(review): presumably reads from the address in ECX - confirm
		pop ecx
		add ecx, 4 // advance EA past the 4 bytes just read
		mov dword ptr float_store, eax // the value read is taken from EAX
		fld dword ptr float_store // reinterpret the raw bits as a float
		ret
	}
}
// load scaled unsigned byte
// In:  ECX = EA, scale_value = 32-bit float scale factor
// Out: ST(0) = zero-extended byte * scale_value, ECX = EA + 1
__declspec (naked) void qload_type4(void)
{
	_asm
	{
		push ecx // keep EA across the call
		call rec_mem_read8 // NOTE(review): presumably reads from the address in ECX - confirm
		pop ecx
		add ecx, 1 // advance EA past the byte
		movzx eax, al // unsigned: zero-extend the byte returned in AL
		mov dword ptr [int32_store], eax
		fild dword ptr [int32_store] // integer -> floating point
		fmul dword ptr [scale_value] // apply the dequantization scale
		ret
	}
}
// load scaled unsigned word
// In:  ECX = EA, scale_value = 32-bit float scale factor
// Out: ST(0) = zero-extended 16-bit value * scale_value, ECX = EA + 2
__declspec (naked) void qload_type5(void)
{
	_asm
	{
		push ecx // keep EA across the call
		call rec_mem_read16 // NOTE(review): presumably reads from the address in ECX - confirm
		pop ecx
		add ecx, 2 // advance EA past the 16-bit value
		movzx eax, ax // unsigned: zero-extend the value returned in AX
		mov dword ptr int32_store, eax
		fild dword ptr int32_store // integer -> floating point
		fmul dword ptr scale_value // apply the dequantization scale
		ret
	}
}
// load scaled signed byte
// In:  ECX = EA, scale_value = 32-bit float scale factor
// Out: ST(0) = sign-extended byte * scale_value, ECX = EA + 1
__declspec (naked) void qload_type6(void)
{
	_asm
	{
		push ecx // keep EA across the call
		call rec_mem_read8 // NOTE(review): presumably reads from the address in ECX - confirm
		pop ecx
		add ecx, 1 // advance EA past the byte
		movsx eax, al // signed: sign-extend the byte returned in AL
		mov dword ptr int32_store, eax
		fild dword ptr int32_store // integer -> floating point
		fmul dword ptr scale_value // apply the dequantization scale
		ret
	}
}
// load scaled signed word
// In:  ECX = EA, scale_value = 32-bit float scale factor
// Out: ST(0) = sign-extended 16-bit value * scale_value, ECX = EA + 2
__declspec (naked) void qload_type7(void)
{
	_asm
	{
		push ecx // keep EA across the call
		call rec_mem_read16 // NOTE(review): presumably reads from the address in ECX - confirm
		pop ecx
		add ecx, 2 // advance EA past the 16-bit value
		movsx eax, ax // signed: sign-extend the value returned in AX
		mov dword ptr int32_store, eax
		fild dword ptr int32_store // integer -> floating point
		fmul dword ptr scale_value // apply the dequantization scale
		ret
	}
}

// Quantization factors, indexed by the raw 6-bit GQR store-scale field.
// The generated psq_st* code uses (scale << 2) as a byte offset into this table,
// so it must stay an array of 64 32-bit floats.
// Entries 0..31 hold 2^index; entries 32..63 hold 2^-32 up to 2^-1
// (i.e. the field treated as a signed value -32..-1).
// 2^31 does not fit in a signed int: (1 << 31) overflows (in practice it yields
// INT_MIN and therefore a negative factor), so those two entries use 1ULL.
static const float q_factor[] =
{
	(1 <<  0),
	(1 <<  1),
	(1 <<  2),
	(1 <<  3),
	(1 <<  4),
	(1 <<  5),
	(1 <<  6),
	(1 <<  7),
	(1 <<  8),
	(1 <<  9),

	(1 << 10),
	(1 << 11),
	(1 << 12),
	(1 << 13),
	(1 << 14),
	(1 << 15),
	(1 << 16),
	(1 << 17),
	(1 << 18),
	(1 << 19),

	(1 << 20),
	(1 << 21),
	(1 << 22),
	(1 << 23),
	(1 << 24),
	(1 << 25),
	(1 << 26),
	(1 << 27),
	(1 << 28),
	(1 << 29),
	(1 << 30),
	(1ULL << 31),		// unsigned: 1 << 31 would overflow int

	1.0/(1ULL << 32),
	1.0/(1ULL << 31),	// unsigned: 1 << 31 would overflow int
	1.0/(1 << 30),

	1.0/(1 << 29),
	1.0/(1 << 28),
	1.0/(1 << 27),
	1.0/(1 << 26),
	1.0/(1 << 25),
	1.0/(1 << 24),
	1.0/(1 << 23),
	1.0/(1 << 22),
	1.0/(1 << 21),
	1.0/(1 << 20),

	1.0/(1 << 19),
	1.0/(1 << 18),
	1.0/(1 << 17),
	1.0/(1 << 16),
	1.0/(1 << 15),
	1.0/(1 << 14),
	1.0/(1 << 13),
	1.0/(1 << 12),
	1.0/(1 << 11),
	1.0/(1 << 10),

	1.0/(1 <<  9),
	1.0/(1 <<  8),
	1.0/(1 <<  7),
	1.0/(1 <<  6),
	1.0/(1 <<  5),
	1.0/(1 <<  4),
	1.0/(1 <<  3),
	1.0/(1 <<  2),
	1.0/(1 <<  1),
};

// Quantized-store helpers (naked: no compiler-generated prologue/epilogue).
// ECX is assumed to contain EA, scale_value contains scale value
// ST(0) is assumed to contain double to be stored
// returns ECX incremented by size
// -- rounding is done towards zero for integers --
// store unscaled float: pops ST(0) as a single-precision value and writes its
// 4 raw bytes; scale_value is not applied by this helper.
__declspec (naked) void qstore_type0(void)
{
	_asm
	{
		fstp dword ptr float_store // pop ST(0), narrowing to single precision
		mov eax, dword ptr float_store // raw float bits go out in EAX
		push ecx // keep EA across the call
		call rec_mem_write32 // NOTE(review): presumably writes EAX to the address in ECX - confirm
		pop ecx
		add ecx, 4 // advance EA past the 4 bytes written
		ret
	}
}
// store scaled unsigned byte
// In:  ECX = EA, ST(0) = value (popped), scale_value = 32-bit float scale factor
// Out: ECX = EA + 1
// The scaled value is converted to a 32-bit integer under fpucontrol_roundzero
// and clamped to 0..255 before being written.
__declspec (naked) void qstore_type4(void)
{
	_asm
	{
		fmul dword ptr scale_value // apply the quantization scale
		fldcw fpucontrol_roundzero // switch control word for the conversion
		fistp int32_store // pop ST(0) as an integer
		fldcw fpucontrol_default // restore the default control word
		mov eax, dword ptr int32_store
		// clip it in range
		mov edx, 0
		cmp eax, edx
		cmovl eax, edx // clip to 0 if below 0
		mov edx, 255
		cmp eax, edx
		cmovg eax, edx // clip to 255 if above 255
		// store
		push ecx // keep EA across the call
		call rec_mem_write8 // NOTE(review): presumably writes AL to the address in ECX - confirm
		pop ecx
		add ecx, 1 // advance EA past the byte written
		ret
	}
}
// store scaled unsigned word
// In:  ECX = EA, ST(0) = value (popped), scale_value = 32-bit float scale factor
// Out: ECX = EA + 2
// The scaled value is converted to a 32-bit integer under fpucontrol_roundzero
// and clamped to 0..65535 before being written.
__declspec (naked) void qstore_type5(void)
{
	_asm
	{
		fmul dword ptr scale_value // apply the quantization scale
		fldcw fpucontrol_roundzero // switch control word for the conversion
		fistp int32_store // pop ST(0) as an integer
		fldcw fpucontrol_default // restore the default control word
		mov eax, dword ptr int32_store
		// clip it in range
		mov edx, 0
		cmp eax, edx
		cmovl eax, edx // clip to 0 if below 0
		mov edx, 65535
		cmp eax, edx
		cmovg eax, edx // clip to 65535 if above 65535
		// store
		push ecx // keep EA across the call
		call rec_mem_write16 // NOTE(review): presumably writes AX to the address in ECX - confirm
		pop ecx
		add ecx, 2 // advance EA past the 16-bit value written
		ret
	}
}
// store scaled signed byte
// In:  ECX = EA, ST(0) = value (popped), scale_value = 32-bit float scale factor
// Out: ECX = EA + 1
// The scaled value is converted to a 32-bit integer under fpucontrol_roundzero
// and clamped to -128..127 before being written.
__declspec (naked) void qstore_type6(void)
{
	_asm
	{
		fmul dword ptr scale_value // apply the quantization scale
		fldcw fpucontrol_roundzero // switch control word for the conversion
		fistp int32_store // pop ST(0) as an integer
		fldcw fpucontrol_default // restore the default control word
		mov eax, dword ptr int32_store
		// clip it in range
		mov edx, -128
		cmp eax, edx
		cmovl eax, edx // clip to -128 if below -128
		mov edx, 127
		cmp eax, edx
		cmovg eax, edx // clip to 127 if above 127
		// store
		push ecx // keep EA across the call
		call rec_mem_write8 // NOTE(review): presumably writes AL to the address in ECX - confirm
		pop ecx
		add ecx, 1 // advance EA past the byte written
		ret
	}
}
// store scaled signed word
// In:  ECX = EA, ST(0) = value (popped), scale_value = 32-bit float scale factor
// Out: ECX = EA + 2
// The scaled value is converted to a 32-bit integer under fpucontrol_roundzero
// and clamped to -32768..32767 before being written.
__declspec (naked) void qstore_type7(void)
{
	_asm
	{
		fmul dword ptr scale_value // apply the quantization scale
		fldcw fpucontrol_roundzero // switch control word for the conversion
		fistp int32_store // pop ST(0) as an integer
		fldcw fpucontrol_default // restore the default control word
		mov eax, dword ptr int32_store
		// clip it in range
		mov edx, -32768
		cmp eax, edx
		cmovl eax, edx // clip to -32768 if below -32768
		mov edx, 32767
		cmp eax, edx
		cmovg eax, edx // clip to 32767 if above 32767
		// store
		push ecx // keep EA across the call
		call rec_mem_write16 // NOTE(review): presumably writes AX to the address in ECX - confirm
		pop ecx
		add ecx, 2 // advance EA past the 16-bit value written
		ret
	}
}

// GX FIFO store, unscaled float: pops ST(0) as a single-precision value and
// passes its raw bits to gx_write_fifo32 as a stack argument.
// ECX (EA) is neither read nor advanced by this helper.
__declspec (naked) void qstore_gxfifo_type0(void)
{
	_asm
	{
		fstp dword ptr float_store // pop ST(0), narrowing to single precision
		mov eax, dword ptr float_store
		push eax // argument for gx_write_fifo32
		call gx_write_fifo32
		add esp, 4 // remove the argument again
		ret
	}
}
// store scaled unsigned byte (GX FIFO variant)
// In:  ST(0) = value (popped), scale_value = 32-bit float scale factor
// The scaled value is converted to a 32-bit integer under fpucontrol_roundzero,
// clamped to 0..255 and passed to gx_write_fifo8 as a stack argument.
// ECX (EA) is neither read nor advanced by this helper.
__declspec (naked) void qstore_gxfifo_type4(void)
{
	_asm
	{
		fmul dword ptr scale_value // apply the quantization scale
		fldcw fpucontrol_roundzero // switch control word for the conversion
		fistp int32_store // pop ST(0) as an integer
		fldcw fpucontrol_default // restore the default control word
		mov eax, dword ptr int32_store
		// clip it in range
		mov edx, 0
		cmp eax, edx
		cmovl eax, edx // clip to 0 if below 0
		mov edx, 255
		cmp eax, edx
		cmovg eax, edx // clip to 255 if above 255
		// store
		push eax // argument for gx_write_fifo8
		call gx_write_fifo8
		add esp, 4 // remove the argument again
		ret
	}
}
// store scaled unsigned word (GX FIFO variant)
// In:  ST(0) = value (popped), scale_value = 32-bit float scale factor
// The scaled value is converted to a 32-bit integer under fpucontrol_roundzero,
// clamped to 0..65535 and passed to gx_write_fifo16 as a stack argument.
// ECX (EA) is neither read nor advanced by this helper.
__declspec (naked) void qstore_gxfifo_type5(void)
{
	_asm
	{
		fmul dword ptr scale_value // apply the quantization scale
		fldcw fpucontrol_roundzero // switch control word for the conversion
		fistp int32_store // pop ST(0) as an integer
		fldcw fpucontrol_default // restore the default control word
		mov eax, dword ptr int32_store
		// clip it in range
		mov edx, 0
		cmp eax, edx
		cmovl eax, edx // clip to 0 if below 0
		mov edx, 65535
		cmp eax, edx
		cmovg eax, edx // clip to 65535 if above 65535
		// store
		push eax // argument for gx_write_fifo16
		call gx_write_fifo16
		add esp, 4 // remove the argument again
		ret
	}
}
// store scaled signed byte (GX FIFO variant)
// In:  ST(0) = value (popped), scale_value = 32-bit float scale factor
// The scaled value is converted to a 32-bit integer under fpucontrol_roundzero,
// clamped to -128..127 and passed to gx_write_fifo8 as a stack argument.
// ECX (EA) is neither read nor advanced by this helper.
__declspec (naked) void qstore_gxfifo_type6(void)
{
	_asm
	{
		fmul dword ptr scale_value // apply the quantization scale
		fldcw fpucontrol_roundzero // switch control word for the conversion
		fistp int32_store // pop ST(0) as an integer
		fldcw fpucontrol_default // restore the default control word
		mov eax, dword ptr int32_store
		// clip it in range
		mov edx, -128
		cmp eax, edx
		cmovl eax, edx // clip to -128 if below -128
		mov edx, 127
		cmp eax, edx
		cmovg eax, edx // clip to 127 if above 127
		// store
		push eax // argument for gx_write_fifo8
		call gx_write_fifo8
		add esp, 4 // remove the argument again
		ret
	}
}
// store scaled signed word (GX FIFO variant)
// In:  ST(0) = value (popped), scale_value = 32-bit float scale factor
// The scaled value is converted to a 32-bit integer under fpucontrol_roundzero,
// clamped to -32768..32767 and passed to gx_write_fifo16 as a stack argument.
// ECX (EA) is neither read nor advanced by this helper.
__declspec (naked) void qstore_gxfifo_type7(void)
{
	_asm
	{
		fmul dword ptr scale_value // apply the quantization scale
		fldcw fpucontrol_roundzero // switch control word for the conversion
		fistp int32_store // pop ST(0) as an integer
		fldcw fpucontrol_default // restore the default control word
		mov eax, dword ptr int32_store
		// clip it in range
		mov edx, -32768
		cmp eax, edx
		cmovl eax, edx // clip to -32768 if below -32768
		mov edx, 32767
		cmp eax, edx
		cmovg eax, edx // clip to 32767 if above 32767
		// store
		push eax // argument for gx_write_fifo16
		call gx_write_fifo16
		add esp, 4 // remove the argument again
		ret
	}
}

// =================================================
// SELFCHECK ROUTINES
// Same instruction sequences as the regular quantized-load helpers, but memory
// is accessed through the rec_slave_mem_* routines.
// ECX is assumed to contain EA, scale_value contains scale value
// returns (scaled) double on stack and ECX incremented by size
// load unscaled float (scale_value is not applied by this helper)
__declspec (naked) void self_qload_type0(void)
{
	_asm
	{
		push ecx // keep EA across the call
		call rec_slave_mem_read32 // NOTE(review): presumably reads from the address in ECX - confirm
		pop ecx
		add ecx, 4 // advance EA past the 4 bytes just read
		mov dword ptr float_store, eax // the value read is taken from EAX
		fld dword ptr float_store // reinterpret the raw bits as a float
		ret
	}
}
// load scaled unsigned byte (selfcheck variant, reads via rec_slave_mem_read8)
// In:  ECX = EA, scale_value = 32-bit float scale factor
// Out: ST(0) = zero-extended byte * scale_value, ECX = EA + 1
__declspec (naked) void self_qload_type4(void)
{
	_asm
	{
		push ecx // keep EA across the call
		call rec_slave_mem_read8 // NOTE(review): presumably reads from the address in ECX - confirm
		pop ecx
		add ecx, 1 // advance EA past the byte
		movzx eax, al // unsigned: zero-extend the byte returned in AL
		mov dword ptr [int32_store], eax
		fild dword ptr [int32_store] // integer -> floating point
		fmul dword ptr [scale_value] // apply the dequantization scale
		ret
	}
}
// load scaled unsigned word (selfcheck variant, reads via rec_slave_mem_read16)
// In:  ECX = EA, scale_value = 32-bit float scale factor
// Out: ST(0) = zero-extended 16-bit value * scale_value, ECX = EA + 2
__declspec (naked) void self_qload_type5(void)
{
	_asm
	{
		push ecx // keep EA across the call
		call rec_slave_mem_read16 // NOTE(review): presumably reads from the address in ECX - confirm
		pop ecx
		add ecx, 2 // advance EA past the 16-bit value
		movzx eax, ax // unsigned: zero-extend the value returned in AX
		mov dword ptr int32_store, eax
		fild dword ptr int32_store // integer -> floating point
		fmul dword ptr scale_value // apply the dequantization scale
		ret
	}
}
// load scaled signed byte (selfcheck variant, reads via rec_slave_mem_read8)
// In:  ECX = EA, scale_value = 32-bit float scale factor
// Out: ST(0) = sign-extended byte * scale_value, ECX = EA + 1
__declspec (naked) void self_qload_type6(void)
{
	_asm
	{
		push ecx // keep EA across the call
		call rec_slave_mem_read8 // NOTE(review): presumably reads from the address in ECX - confirm
		pop ecx
		add ecx, 1 // advance EA past the byte
		movsx eax, al // signed: sign-extend the byte returned in AL
		mov dword ptr int32_store, eax
		fild dword ptr int32_store // integer -> floating point
		fmul dword ptr scale_value // apply the dequantization scale
		ret
	}
}
// load scaled signed word (selfcheck variant, reads via rec_slave_mem_read16)
// In:  ECX = EA, scale_value = 32-bit float scale factor
// Out: ST(0) = sign-extended 16-bit value * scale_value, ECX = EA + 2
__declspec (naked) void self_qload_type7(void)
{
	_asm
	{
		push ecx // keep EA across the call
		call rec_slave_mem_read16 // NOTE(review): presumably reads from the address in ECX - confirm
		pop ecx
		add ecx, 2 // advance EA past the 16-bit value
		movsx eax, ax // signed: sign-extend the value returned in AX
		mov dword ptr int32_store, eax
		fild dword ptr int32_store // integer -> floating point
		fmul dword ptr scale_value // apply the dequantization scale
		ret
	}
}
// Selfcheck quantized-store helpers: same instruction sequences as the regular
// store helpers, but memory is written through the rec_slave_mem_* routines.
// ECX is assumed to contain EA, scale_value contains scale value
// ST(0) is assumed to contain double to be stored
// returns ECX incremented by size
// -- rounding is done towards zero for integers --
// store unscaled float (scale_value is not applied by this helper)
__declspec (naked) void self_qstore_type0(void)
{
	_asm
	{
		fstp dword ptr float_store // pop ST(0), narrowing to single precision
		mov eax, dword ptr float_store // raw float bits go out in EAX
		push ecx // keep EA across the call
		call rec_slave_mem_write32 // NOTE(review): presumably writes EAX to the address in ECX - confirm
		pop ecx
		add ecx, 4 // advance EA past the 4 bytes written
		ret
	}
}
// store scaled unsigned byte (selfcheck variant, writes via rec_slave_mem_write8)
// In:  ECX = EA, ST(0) = value (popped), scale_value = 32-bit float scale factor
// Out: ECX = EA + 1
// The scaled value is converted to a 32-bit integer under fpucontrol_roundzero
// and clamped to 0..255 before being written.
__declspec (naked) void self_qstore_type4(void)
{
	_asm
	{
		fmul dword ptr scale_value // apply the quantization scale
		fldcw fpucontrol_roundzero // switch control word for the conversion
		fistp int32_store // pop ST(0) as an integer
		fldcw fpucontrol_default // restore the default control word
		mov eax, dword ptr int32_store
		// clip it in range
		mov edx, 0
		cmp eax, edx
		cmovl eax, edx // clip to 0 if below 0
		mov edx, 255
		cmp eax, edx
		cmovg eax, edx // clip to 255 if above 255
		// store
		push ecx // keep EA across the call
		call rec_slave_mem_write8 // NOTE(review): presumably writes AL to the address in ECX - confirm
		pop ecx
		add ecx, 1 // advance EA past the byte written
		ret
	}
}
// store scaled unsigned word (selfcheck variant, writes via rec_slave_mem_write16)
// In:  ECX = EA, ST(0) = value (popped), scale_value = 32-bit float scale factor
// Out: ECX = EA + 2
// The scaled value is converted to a 32-bit integer under fpucontrol_roundzero
// and clamped to 0..65535 before being written.
__declspec (naked) void self_qstore_type5(void)
{
	_asm
	{
		fmul dword ptr scale_value // apply the quantization scale
		fldcw fpucontrol_roundzero // switch control word for the conversion
		fistp int32_store // pop ST(0) as an integer
		fldcw fpucontrol_default // restore the default control word
		mov eax, dword ptr int32_store
		// clip it in range
		mov edx, 0
		cmp eax, edx
		cmovl eax, edx // clip to 0 if below 0
		mov edx, 65535
		cmp eax, edx
		cmovg eax, edx // clip to 65535 if above 65535
		// store
		push ecx // keep EA across the call
		call rec_slave_mem_write16 // NOTE(review): presumably writes AX to the address in ECX - confirm
		pop ecx
		add ecx, 2 // advance EA past the 16-bit value written
		ret
	}
}
// store scaled signed byte (selfcheck variant, writes via rec_slave_mem_write8)
// In:  ECX = EA, ST(0) = value (popped), scale_value = 32-bit float scale factor
// Out: ECX = EA + 1
// The scaled value is converted to a 32-bit integer under fpucontrol_roundzero
// and clamped to -128..127 before being written.
__declspec (naked) void self_qstore_type6(void)
{
	_asm
	{
		fmul dword ptr scale_value // apply the quantization scale
		fldcw fpucontrol_roundzero // switch control word for the conversion
		fistp int32_store // pop ST(0) as an integer
		fldcw fpucontrol_default // restore the default control word
		mov eax, dword ptr int32_store
		// clip it in range
		mov edx, -128
		cmp eax, edx
		cmovl eax, edx // clip to -128 if below -128
		mov edx, 127
		cmp eax, edx
		cmovg eax, edx // clip to 127 if above 127
		// store
		push ecx // keep EA across the call
		call rec_slave_mem_write8 // NOTE(review): presumably writes AL to the address in ECX - confirm
		pop ecx
		add ecx, 1 // advance EA past the byte written
		ret
	}
}
// store scaled signed word (selfcheck variant, writes via rec_slave_mem_write16)
// In:  ECX = EA, ST(0) = value (popped), scale_value = 32-bit float scale factor
// Out: ECX = EA + 2
// The scaled value is converted to a 32-bit integer under fpucontrol_roundzero
// and clamped to -32768..32767 before being written.
__declspec (naked) void self_qstore_type7(void)
{
	_asm
	{
		fmul dword ptr scale_value // apply the quantization scale
		fldcw fpucontrol_roundzero // switch control word for the conversion
		fistp int32_store // pop ST(0) as an integer
		fldcw fpucontrol_default // restore the default control word
		mov eax, dword ptr int32_store
		// clip it in range
		mov edx, -32768
		cmp eax, edx
		cmovl eax, edx // clip to -32768 if below -32768
		mov edx, 32767
		cmp eax, edx
		cmovg eax, edx // clip to 32767 if above 32767
		// store
		push ecx // keep EA across the call
		call rec_slave_mem_write16 // NOTE(review): presumably writes AX to the address in ECX - confirm
		pop ecx
		add ecx, 2 // advance EA past the 16-bit value written
		ret
	}
}
// =======================================================

// Quantized-load helpers indexed by the 3-bit GQR load type; the generated
// psq_l* code fetches an entry using (type << 2) as a byte offset.
void *qload_functions[8] =
{
	(void *)qload_type0,	// 0: unscaled float
	(void *)qload_type0,	// 1: same helper as type 0
	(void *)qload_type0,	// 2: same helper as type 0
	(void *)qload_type0,	// 3: same helper as type 0
	(void *)qload_type4,	// 4: scaled unsigned byte
	(void *)qload_type5,	// 5: scaled unsigned word
	(void *)qload_type6,	// 6: scaled signed byte
	(void *)qload_type7,	// 7: scaled signed word
};


// Quantized-store helpers indexed by the 3-bit GQR store type; the generated
// psq_st* code fetches an entry using (type << 2) as a byte offset.
void *qstore_functions[8] =
{
	(void *)qstore_type0,	// 0: unscaled float
	(void *)qstore_type0,	// 1: same helper as type 0
	(void *)qstore_type0,	// 2: same helper as type 0
	(void *)qstore_type0,	// 3: same helper as type 0
	(void *)qstore_type4,	// 4: scaled unsigned byte
	(void *)qstore_type5,	// 5: scaled unsigned word
	(void *)qstore_type6,	// 6: scaled signed byte
	(void *)qstore_type7,	// 7: scaled signed word
};

// Quantized-store helpers that write to the GX FIFO, indexed by the 3-bit GQR
// store type; psq_st selects this table for constant addresses in the
// 0xcc0080xx range.
void *qstore_gxfifo_functions[8] =
{
	(void *)qstore_gxfifo_type0,	// 0: unscaled float
	(void *)qstore_gxfifo_type0,	// 1: same helper as type 0
	(void *)qstore_gxfifo_type0,	// 2: same helper as type 0
	(void *)qstore_gxfifo_type0,	// 3: same helper as type 0
	(void *)qstore_gxfifo_type4,	// 4: scaled unsigned byte
	(void *)qstore_gxfifo_type5,	// 5: scaled unsigned word
	(void *)qstore_gxfifo_type6,	// 6: scaled signed byte
	(void *)qstore_gxfifo_type7,	// 7: scaled signed word
};

// selfcheck functions
// Load helpers used when config_cpumode == CPU_SELFCHECKMODE, indexed by the
// 3-bit GQR load type.
void *self_qload_functions[8] =
{
	(void *)self_qload_type0,	// 0: unscaled float
	(void *)self_qload_type0,	// 1: same helper as type 0
	(void *)self_qload_type0,	// 2: same helper as type 0
	(void *)self_qload_type0,	// 3: same helper as type 0
	(void *)self_qload_type4,	// 4: scaled unsigned byte
	(void *)self_qload_type5,	// 5: scaled unsigned word
	(void *)self_qload_type6,	// 6: scaled signed byte
	(void *)self_qload_type7,	// 7: scaled signed word
};


// Store helpers used when config_cpumode == CPU_SELFCHECKMODE, indexed by the
// 3-bit GQR store type.
void *self_qstore_functions[8] =
{
	(void *)self_qstore_type0,	// 0: unscaled float
	(void *)self_qstore_type0,	// 1: same helper as type 0
	(void *)self_qstore_type0,	// 2: same helper as type 0
	(void *)self_qstore_type0,	// 3: same helper as type 0
	(void *)self_qstore_type4,	// 4: scaled unsigned byte
	(void *)self_qstore_type5,	// 5: scaled unsigned word
	(void *)self_qstore_type6,	// 6: scaled signed byte
	(void *)self_qstore_type7,	// 7: scaled signed word
};

// psq_l frD, d(rA), W, I: emits code that loads one (W == 1) or two (W == 0)
// quantized values from EA = (rA|0) + sign-extended 12-bit displacement and
// stores them, dequantized, into ps0[frD] / ps1[frD].
// The load type and scale are NOT decoded here: the emitted code reads GQR[I]
// at run time, so a later change of the GQR is honoured by already generated code.
void trx_ppc_gen_psq_l(void)
{
	uint32 rA, rD, i, w;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	i = ((trxCPUrec.opcode)>>12)&0x7;
	w = ((trxCPUrec.opcode)>>15)&1;

	// Run-time decode of GQR[i]:
	//   scale_value = dq_factor[(gqr >> 24) & 0x3f]
	//   EAX         = load helper for type (gqr >> 16) & 7
	gen_asm(MOV_RM, EAX, (uint32)&trxCPUrec.spr[PPC_GQR0+i]);
	gen_asm(MOV_RR, EDX, EAX);
	gen_asm(SHR_RI8, EDX, 22);
	gen_asm(AND_RI32, EDX, 0x3f<<2);// indexing 32 bit values!
	gen_asm(MOV_RMRI32, EDX, EDX, (uint32)&dq_factor);
	gen_asm(MOV_MR, (uint32)&scale_value, EDX);
	gen_asm(SHR_RI8, EAX, 14);
	gen_asm(AND_RI32, EAX, 7<<2); //indexing 32bit values!
	if(config_cpumode == CPU_SELFCHECKMODE)
	{
		gen_asm(MOV_RMRI32, EAX, EAX, (uint32)&self_qload_functions);
	}
	else
	{
		gen_asm(MOV_RMRI32, EAX, EAX, (uint32)&qload_functions);
	}

	// displacement: pay attention ! only 12 bits, sign-extended
	uint32 EA = trxCPUrec.opcode & 0xfff;
	if(EA & 0x800) EA |= 0xfffff000;

	if(rA == 0)
	{
		//EA = 0 + imm;
		gen_asm(MOV_RI32, ECX, EA);
	}
	else
	{
		//EA = trxCPUrec.gpr[rA] + imm;
		regc_load(ECX, rA);
		gen_asm(ADD_RI32, ECX, EA);
	}

	if(w == 0)
	{
		// first element -> ps0; the helper leaves ECX pointing at the next element.
		// EAX (helper address) is saved around the call because it is needed again.
		gen_asm(PUSH_R, EAX);
		gen_asm(CALL_R, EAX);
		gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);
		gen_asm(POP_R, EAX);
		// second element -> ps1
		gen_asm(CALL_R, EAX);
		gen_asm(FSTP64_M, (uint32)&trx_rec_ps1_double[rD]);
	}
	else
	{
		// single element -> ps0
		gen_asm(CALL_R, EAX);
		gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);
		// ps1 = 1.0
		gen_asm(FLD1);
		gen_asm(FSTP64_M, (uint32)&trx_rec_ps1_double[rD]);
	}
}

// psq_lx frD, rA, rB, W, I: indexed form of the quantized load.
// Emits code that loads one (W == 1) or two (W == 0) quantized values from
// EA = (rA|0) + rB and stores them, dequantized, into ps0[frD] / ps1[frD].
// The load type and scale are NOT decoded here: the emitted code reads GQR[I]
// at run time.
void trx_ppc_gen_psq_lx(void)
{
	uint32 rA, rD, rB, i, w;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	i = ((trxCPUrec.opcode)>>7)&0x7;
	w = ((trxCPUrec.opcode)>>10)&1;

	// Run-time decode of GQR[i]:
	//   scale_value = dq_factor[(gqr >> 24) & 0x3f]
	//   EAX         = load helper for type (gqr >> 16) & 7
	gen_asm(MOV_RM, EAX, (uint32)&trxCPUrec.spr[PPC_GQR0+i]);
	gen_asm(MOV_RR, EDX, EAX);
	gen_asm(SHR_RI8, EDX, 22);
	gen_asm(AND_RI32, EDX, 0x3f<<2);// indexing 32 bit values!
	gen_asm(MOV_RMRI32, EDX, EDX, (uint32)&dq_factor);
	gen_asm(MOV_MR, (uint32)&scale_value, EDX);
	gen_asm(SHR_RI8, EAX, 14);
	gen_asm(AND_RI32, EAX, 7<<2); //indexing 32bit values!
	if(config_cpumode == CPU_SELFCHECKMODE)
	{
		gen_asm(MOV_RMRI32, EAX, EAX, (uint32)&self_qload_functions);
	}
	else
	{
		gen_asm(MOV_RMRI32, EAX, EAX, (uint32)&qload_functions);
	}

	if(rA == 0)
	{
		//EA = trxCPUrec.gpr[rB]
		regc_load(ECX, rB);
	}
	else
	{
		//EA = trxCPUrec.gpr[rA] + trxCPUrec.gpr[rB]
		// EDX is free again here: the scale has already been written to scale_value
		regc_load(ECX, rA);
		regc_load(EDX, rB);
		gen_asm(ADD_RR, ECX, EDX);
	}

	if(w == 0)
	{
		// first element -> ps0; the helper leaves ECX pointing at the next element.
		// EAX (helper address) is saved around the call because it is needed again.
		gen_asm(PUSH_R, EAX);
		gen_asm(CALL_R, EAX);
		gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);
		gen_asm(POP_R, EAX);
		// second element -> ps1
		gen_asm(CALL_R, EAX);
		gen_asm(FSTP64_M, (uint32)&trx_rec_ps1_double[rD]);
	}
	else
	{
		// single element -> ps0
		gen_asm(CALL_R, EAX);
		gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);
		// ps1 = 1.0
		gen_asm(FLD1);
		gen_asm(FSTP64_M, (uint32)&trx_rec_ps1_double[rD]);
	}
}

// psq_lu frD, d(rA), W, I: quantized load with update.
// Emits code that computes EA = rA + sign-extended 12-bit displacement, writes
// EA back to rA, then loads one (W == 1) or two (W == 0) quantized values into
// ps0[frD] / ps1[frD]. There is no special case for rA == 0 in this form.
// The load type and scale are NOT decoded here: the emitted code reads GQR[I]
// at run time.
void trx_ppc_gen_psq_lu(void)
{
	uint32 rA, rD, i, w;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	i = ((trxCPUrec.opcode)>>12)&0x7;
	w = ((trxCPUrec.opcode)>>15)&1;

	// Run-time decode of GQR[i]:
	//   scale_value = dq_factor[(gqr >> 24) & 0x3f]
	//   EAX         = load helper for type (gqr >> 16) & 7
	gen_asm(MOV_RM, EAX, (uint32)&trxCPUrec.spr[PPC_GQR0+i]);
	gen_asm(MOV_RR, EDX, EAX);
	gen_asm(SHR_RI8, EDX, 22);
	gen_asm(AND_RI32, EDX, 0x3f<<2);// indexing 32 bit values!
	gen_asm(MOV_RMRI32, EDX, EDX, (uint32)&dq_factor);
	gen_asm(MOV_MR, (uint32)&scale_value, EDX);
	gen_asm(SHR_RI8, EAX, 14);
	gen_asm(AND_RI32, EAX, 7<<2); //indexing 32bit values!
	if(config_cpumode == CPU_SELFCHECKMODE)
	{
		gen_asm(MOV_RMRI32, EAX, EAX, (uint32)&self_qload_functions);
	}
	else
	{
		gen_asm(MOV_RMRI32, EAX, EAX, (uint32)&qload_functions);
	}

	// displacement: pay attention ! only 12 bits, sign-extended
	uint32 EA = trxCPUrec.opcode & 0xfff;
	if(EA & 0x800) EA |= 0xfffff000;

	regc_load(ECX, rA);
	gen_asm(ADD_RI32, ECX, EA);
	// and update (before the helper advances ECX)
	regc_store(ECX, rA);

	if(w == 0)
	{
		// first element -> ps0; the helper leaves ECX pointing at the next element.
		// EAX (helper address) is saved around the call because it is needed again.
		gen_asm(PUSH_R, EAX);
		gen_asm(CALL_R, EAX);
		gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);
		gen_asm(POP_R, EAX);
		// second element -> ps1
		gen_asm(CALL_R, EAX);
		gen_asm(FSTP64_M, (uint32)&trx_rec_ps1_double[rD]);
	}
	else
	{
		// single element -> ps0
		gen_asm(CALL_R, EAX);
		gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);
		// ps1 = 1.0
		gen_asm(FLD1);
		gen_asm(FSTP64_M, (uint32)&trx_rec_ps1_double[rD]);
	}
}

// psq_lux frD, rA, rB, W, I: indexed quantized load with update.
// Emits code that computes EA = rA + rB, writes EA back to rA, then loads one
// (W == 1) or two (W == 0) quantized values into ps0[frD] / ps1[frD].
// The load type and scale are NOT decoded here: the emitted code reads GQR[I]
// at run time.
void trx_ppc_gen_psq_lux(void)
{
	uint32 rA, rD, rB, i, w;

	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	i = ((trxCPUrec.opcode)>>7)&0x7;
	w = ((trxCPUrec.opcode)>>10)&1;

	// Run-time decode of GQR[i]:
	//   scale_value = dq_factor[(gqr >> 24) & 0x3f]
	//   EAX         = load helper for type (gqr >> 16) & 7
	gen_asm(MOV_RM, EAX, (uint32)&trxCPUrec.spr[PPC_GQR0+i]);
	gen_asm(MOV_RR, EDX, EAX);
	gen_asm(SHR_RI8, EDX, 22);
	gen_asm(AND_RI32, EDX, 0x3f<<2);// indexing 32 bit values!
	gen_asm(MOV_RMRI32, EDX, EDX, (uint32)&dq_factor);
	gen_asm(MOV_MR, (uint32)&scale_value, EDX);
	gen_asm(SHR_RI8, EAX, 14);
	gen_asm(AND_RI32, EAX, 7<<2); //indexing 32bit values!
	if(config_cpumode == CPU_SELFCHECKMODE)
	{
		gen_asm(MOV_RMRI32, EAX, EAX, (uint32)&self_qload_functions);
	}
	else
	{
		gen_asm(MOV_RMRI32, EAX, EAX, (uint32)&qload_functions);
	}

	//EA = trxCPUrec.gpr[rA] + trxCPUrec.gpr[rB]
	// EDX is free again here: the scale has already been written to scale_value
	regc_load(ECX, rA);
	regc_load(EDX, rB);
	gen_asm(ADD_RR, ECX, EDX);
	// and update (before the helper advances ECX)
	regc_store(ECX, rA);

	if(w == 0)
	{
		// first element -> ps0; the helper leaves ECX pointing at the next element.
		// EAX (helper address) is saved around the call because it is needed again.
		gen_asm(PUSH_R, EAX);
		gen_asm(CALL_R, EAX);
		gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);
		gen_asm(POP_R, EAX);
		// second element -> ps1
		gen_asm(CALL_R, EAX);
		gen_asm(FSTP64_M, (uint32)&trx_rec_ps1_double[rD]);
	}
	else
	{
		// single element -> ps0
		gen_asm(CALL_R, EAX);
		gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);
		// ps1 = 1.0
		gen_asm(FLD1);
		gen_asm(FSTP64_M, (uint32)&trx_rec_ps1_double[rD]);
	}
}

void trx_ppc_gen_psq_st(void)
{
	uint32 rA, rS, type, scale, i, w;

	rS = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	i = ((trxCPUrec.opcode)>>12)&0x7;
	w = ((trxCPUrec.opcode)>>15)&1;

	type = (trxCPUrec.spr[PPC_GQR0+i])&7;
	scale = (trxCPUrec.spr[PPC_GQR0+i]>>8)&0x3f;

	//if(type != 0)gen_asm(BREAK);
	// scale_value = q_factor[trxCPUrec.spr[PPC_GQR0+i]>>8)&0x3f]
	// EAX = (uint32)&qload_functions[trxCPUrec.spr[PPC_GQR0+i])&7];
	gen_asm(MOV_RM, EAX, (uint32)&trxCPUrec.spr[PPC_GQR0+i]);
	gen_asm(MOV_RR, EDX, EAX);
	gen_asm(SHR_RI8, EDX, 6);
	gen_asm(AND_RI32, EDX, 0x3f<<2);// indexing 32 bit values!
	gen_asm(MOV_RMRI32, EDX, EDX, (uint32)&q_factor);
	gen_asm(MOV_MR, (uint32)&scale_value, EDX);
	gen_asm(SHL_RI8, EAX, 2);
	gen_asm(AND_RI32, EAX, 7<<2); //indexing 32bit values!

	uint32 EA = trxCPUrec.opcode & 0xfff; // pay attention ! only 12 bits !
    if(EA & 0x800) EA |= 0xfffff000;	

	if(rA == 0)
	{
		gen_asm(MOV_RI32, ECX, EA);
		if(config_cpumode == CPU_SELFCHECKMODE)
		{
			gen_asm(MOV_RMRI32, EAX, EAX, (uint32)&self_qstore_functions);
		}
		else
		{
			gen_asm(MOV_RMRI32, EAX, EAX, (uint32)&qstore_functions);
		}
	}
	else
	{
		if(regc_is_constant(rA) && (config_constantaddress == 1))
		{
			EA += regc_getconstant(rA);
			// gx fifo write ?

			if((EA & 0xffffff00) == 0xcc008000)
			{
				gen_asm(MOV_RMRI32, EAX, EAX, (uint32)&qstore_gxfifo_functions);			
			}
			else
			{
//				printf("psq_st constant store to %x\n", EA);
				gen_asm(MOV_RI32, ECX, EA);
				if(config_cpumode == CPU_SELFCHECKMODE)
				{
					gen_asm(MOV_RMRI32, EAX, EAX, (uint32)&self_qstore_functions);
				}
				else
				{
					gen_asm(MOV_RMRI32, EAX, EAX, (uint32)&qstore_functions);
				}
			}
		}
		else
		{
			regc_load(ECX, rA);
			gen_asm(ADD_RI32, ECX, EA);
			if(config_cpumode == CPU_SELFCHECKMODE)
			{
				gen_asm(MOV_RMRI32, EAX, EAX, (uint32)&self_qstore_functions);
			}
			else
			{
				gen_asm(MOV_RMRI32, EAX, EAX, (uint32)&qstore_functions);
			}
		}
	}	
	
	if(w == 0)
	{
		// ECX is assumed to contain EA, scale_value contains scale value
		// ST(0) is assumed to contain double to be stored
		// returns ECX incremented by size
		//trx_gekko_qstore(EA, 0, type, scale, &trx_rec_ps0_double[rS]);
		gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rS]);
		gen_asm(PUSH_R, EAX);
		gen_asm(CALL_R, EAX);
		gen_asm(POP_R, EAX);
		//trx_gekko_qstore(EA, 1, type, scale, &trx_rec_ps1_double[rS]);
		gen_asm(FLD64_M, (uint32)&trx_rec_ps1_double[rS]);
		gen_asm(CALL_R, EAX);
	}
	else
	{
		//trx_gekko_qstore(EA, 0, type, scale, &trx_rec_ps0_double[rS]);
		gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rS]);
		gen_asm(CALL_R, EAX);
	}
}

void trx_ppc_gen_psq_stu(void)
{
	uint32 rA, rS, type, scale, i, w;

	rS = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	i = ((trxCPUrec.opcode)>>12)&0x7;
	w = ((trxCPUrec.opcode)>>15)&1;

	type = (trxCPUrec.spr[PPC_GQR0+i])&7;
	//scale = (trxCPUrec.spr[PPC_GQR0+i]>>8)&0x3f;

	//if(type!= 0)gen_asm(BREAK);
	// scale_value = q_factor[trxCPUrec.spr[PPC_GQR0+i]>>8)&0x3f]
	// EAX = (uint32)&qload_functions[trxCPUrec.spr[PPC_GQR0+i])&7];
	gen_asm(MOV_RM, EAX, (uint32)&trxCPUrec.spr[PPC_GQR0+i]);
	gen_asm(MOV_RR, EDX, EAX);
	gen_asm(SHR_RI8, EDX, 6);
	gen_asm(AND_RI32, EDX, 0x3f<<2);// indexing 32 bit values!
	gen_asm(MOV_RMRI32, EDX, EDX, (uint32)&q_factor);
	gen_asm(MOV_MR, (uint32)&scale_value, EDX);
	gen_asm(SHL_RI8, EAX, 2);
	gen_asm(AND_RI32, EAX, 7<<2); //indexing 32bit values!
	if(config_cpumode == CPU_SELFCHECKMODE)
	{
		gen_asm(MOV_RMRI32, EAX, EAX, (uint32)&self_qstore_functions);
	}
	else
	{
		gen_asm(MOV_RMRI32, EAX, EAX, (uint32)&qstore_functions);
	}

	uint32 EA = trxCPUrec.opcode & 0xfff; // pay attention ! only 12 bits !
    if(EA & 0x800) EA |= 0xfffff000;	
	
	regc_load(ECX, rA);
	gen_asm(ADD_RI32, ECX, EA);
	// and update
	regc_store(ECX, rA);
	
	if(w == 0)
	{
		// ECX is assumed to contain EA, scale_value contains scale value
		// ST(0) is assumed to contain double to be stored
		// returns ECX incremented by size
		//trx_gekko_qstore(EA, 0, type, scale, &trx_rec_ps0_double[rS]);
		gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rS]);
		gen_asm(PUSH_R, EAX);
		gen_asm(CALL_R, EAX);
		gen_asm(POP_R, EAX);
		//trx_gekko_qstore(EA, 1, type, scale, &trx_rec_ps1_double[rS]);
		gen_asm(FLD64_M, (uint32)&trx_rec_ps1_double[rS]);
		gen_asm(CALL_R, EAX);
	}
	else
	{
		//trx_gekko_qstore(EA, 0, type, scale, &trx_rec_ps0_double[rS]);
		gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rS]);
		gen_asm(CALL_R, EAX);
	}
}

void trx_ppc_gen_psq_stx(void)
{
	uint32 rA, rS, rB, type, scale, i, w;

	rS = ((trxCPUrec.opcode)>>21)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	i = ((trxCPUrec.opcode)>>7)&0x7;
	w = ((trxCPUrec.opcode)>>10)&1;

	type = (trxCPUrec.spr[PPC_GQR0+i])&7;
	//scale = (trxCPUrec.spr[PPC_GQR0+i]>>8)&0x3f;

	//if(type != 0)gen_asm(BREAK);
	// scale_value = q_factor[trxCPUrec.spr[PPC_GQR0+i]>>8)&0x3f]
	// EAX = (uint32)&qload_functions[trxCPUrec.spr[PPC_GQR0+i])&7];
	gen_asm(MOV_RM, EAX, (uint32)&trxCPUrec.spr[PPC_GQR0+i]);
	gen_asm(MOV_RR, EDX, EAX);
	gen_asm(SHR_RI8, EDX, 6);
	gen_asm(AND_RI32, EDX, 0x3f<<2);// indexing 32 bit values!
	gen_asm(MOV_RMRI32, EDX, EDX, (uint32)&q_factor);
	gen_asm(MOV_MR, (uint32)&scale_value, EDX);
	gen_asm(SHL_RI8, EAX, 2);
	gen_asm(AND_RI32, EAX, 7<<2); //indexing 32bit values!
	if(config_cpumode == CPU_SELFCHECKMODE)
	{
		gen_asm(MOV_RMRI32, EAX, EAX, (uint32)&self_qstore_functions);
	}
	else
	{
		gen_asm(MOV_RMRI32, EAX, EAX, (uint32)&qstore_functions);
	}

	if(rA == 0)
	{
		//EA = trxCPUrec.gpr[rB]
		regc_load(ECX, rB);
	}
	else
	{
		//EA = trxCPUrec.gpr[rA] + trxCPUrec.gpr[rB]
		regc_load(EDX, rA);
		regc_load(ECX, rB);
		gen_asm(ADD_RR, ECX, EDX);
	}
	
	if(w == 0)
	{
		// ECX is assumed to contain EA, scale_value contains scale value
		// ST(0) is assumed to contain double to be stored
		// returns ECX incremented by size
		//trx_gekko_qstore(EA, 0, type, scale, &trx_rec_ps0_double[rS]);
		gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rS]);
		gen_asm(PUSH_R, EAX);
		gen_asm(CALL_R, EAX);
		gen_asm(POP_R, EAX);
		//trx_gekko_qstore(EA, 1, type, scale, &trx_rec_ps1_double[rS]);
		gen_asm(FLD64_M, (uint32)&trx_rec_ps1_double[rS]);
		gen_asm(CALL_R, EAX);
	}
	else
	{
		//trx_gekko_qstore(EA, 0, type, scale, &trx_rec_ps0_double[rS]);
		gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rS]);
		gen_asm(CALL_R, EAX);
	}
}

void trx_ppc_gen_psq_stux(void)
{
	uint32 rA, rS, rB, type, scale, i, w;

	rS = ((trxCPUrec.opcode)>>21)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	i = ((trxCPUrec.opcode)>>7)&0x7;
	w = ((trxCPUrec.opcode)>>10)&1;

	type = (trxCPUrec.spr[PPC_GQR0+i])&7;
	//scale = (trxCPUrec.spr[PPC_GQR0+i]>>8)&0x3f;

	//if(type!= 0)gen_asm(BREAK);
	// scale_value = q_factor[trxCPUrec.spr[PPC_GQR0+i]>>8)&0x3f]
	// EAX = (uint32)&qload_functions[trxCPUrec.spr[PPC_GQR0+i])&7];
	gen_asm(MOV_RM, EAX, (uint32)&trxCPUrec.spr[PPC_GQR0+i]);
	gen_asm(MOV_RR, EDX, EAX);
	gen_asm(SHR_RI8, EDX, 6);
	gen_asm(AND_RI32, EDX, 0x3f<<2);// indexing 32 bit values!
	gen_asm(MOV_RMRI32, EDX, EDX, (uint32)&q_factor);
	gen_asm(MOV_MR, (uint32)&scale_value, EDX);
	gen_asm(SHL_RI8, EAX, 2);
	gen_asm(AND_RI32, EAX, 7<<2); //indexing 32bit values!
	if(config_cpumode == CPU_SELFCHECKMODE)
	{
		gen_asm(MOV_RMRI32, EAX, EAX, (uint32)&self_qstore_functions);
	}
	else
	{
		gen_asm(MOV_RMRI32, EAX, EAX, (uint32)&qstore_functions);
	}

	//EA = trxCPUrec.gpr[rA] + trxCPUrec.gpr[rB]
	regc_load(EDX, rA);
	regc_load(ECX, rB);
	gen_asm(ADD_RR, ECX, EDX);
	// and update
	regc_store(ECX, rA);
	
	if(w == 0)
	{
		// ECX is assumed to contain EA, scale_value contains scale value
		// ST(0) is assumed to contain double to be stored
		// returns ECX incremented by size
		//trx_gekko_qstore(EA, 0, type, scale, &trx_rec_ps0_double[rS]);
		gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rS]);
		gen_asm(PUSH_R, EAX);
		gen_asm(CALL_R, EAX);
		gen_asm(POP_R, EAX);
		//trx_gekko_qstore(EA, 1, type, scale, &trx_rec_ps1_double[rS]);
		gen_asm(FLD64_M, (uint32)&trx_rec_ps1_double[rS]);
		gen_asm(CALL_R, EAX);
	}
	else
	{
		//trx_gekko_qstore(EA, 0, type, scale, &trx_rec_ps0_double[rS]);
		gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rS]);
		gen_asm(CALL_R, EAX);
	}
}

//==============================================================================
// Paired Single SIMD opcodes
//
// used
void trx_ppc_gen_ps_msub(void)
{
	int rD, rA, rB, rC;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;
	
//	res0 = (trx_rec_ps0_double[rA] * trx_rec_ps0_double[rC]) - trx_rec_ps0_double[rB];
//	res1 = (trx_rec_ps1_double[rA] * trx_rec_ps1_double[rC]) - trx_rec_ps1_double[rB];
//	trx_rec_ps0_double[rD] = res0;
//	trx_rec_ps1_double[rD] = res1;
	gen_asm(FLD64_M,(uint32)&trx_rec_ps0_double[rA]);
	gen_asm(FMUL64_M, (uint32)&trx_rec_ps0_double[rC]);
	gen_asm(FSUB64_M, (uint32)&trx_rec_ps0_double[rB]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);
	gen_asm(FLD64_M,(uint32)&trx_rec_ps1_double[rA]);
	gen_asm(FMUL64_M, (uint32)&trx_rec_ps1_double[rC]);
	gen_asm(FSUB64_M, (uint32)&trx_rec_ps1_double[rB]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps1_double[rD]);
}
// used
void trx_ppc_gen_ps_madd(void)
{
	int rD, rA, rB, rC;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;
	
	//res0 = (trx_rec_ps0_double[rA] * trx_rec_ps0_double[rC]) + trx_rec_ps0_double[rB];
	//res1 = (trx_rec_ps1_double[rA] * trx_rec_ps1_double[rC]) + trx_rec_ps1_double[rB];
	//trx_rec_ps0_double[rD] = res0;
	//trx_rec_ps1_double[rD] = res1;

	gen_asm(FLD64_M,(uint32)&trx_rec_ps0_double[rA]);
	gen_asm(FMUL64_M, (uint32)&trx_rec_ps0_double[rC]);
	gen_asm(FADD64_M, (uint32)&trx_rec_ps0_double[rB]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);
	gen_asm(FLD64_M,(uint32)&trx_rec_ps1_double[rA]);
	gen_asm(FMUL64_M, (uint32)&trx_rec_ps1_double[rC]);
	gen_asm(FADD64_M, (uint32)&trx_rec_ps1_double[rB]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps1_double[rD]);
}
// not used
void trx_ppc_gen_ps_nmsub(void)
{
	int rD, rA, rB, rC;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;
	
	//res0 = -((trx_rec_ps0_double[rA] * trx_rec_ps0_double[rC]) - trx_rec_ps0_double[rB]);
	//res1 = -((trx_rec_ps1_double[rA] * trx_rec_ps1_double[rC]) - trx_rec_ps1_double[rB]);
	//trx_rec_ps0_double[rD] = res0;
	//trx_rec_ps1_double[rD] = res1;
	gen_asm(FLD64_M,(uint32)&trx_rec_ps0_double[rA]);
	gen_asm(FMUL64_M, (uint32)&trx_rec_ps0_double[rC]);
	gen_asm(FSUB64_M, (uint32)&trx_rec_ps0_double[rB]);
	gen_asm(FCHS);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);
	gen_asm(FLD64_M,(uint32)&trx_rec_ps1_double[rA]);
	gen_asm(FMUL64_M, (uint32)&trx_rec_ps1_double[rC]);
	gen_asm(FSUB64_M, (uint32)&trx_rec_ps1_double[rB]);
	gen_asm(FCHS);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps1_double[rD]);
}
// not used
void trx_ppc_gen_ps_nmadd(void)
{
	int rD, rA, rB, rC;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;
	
	//res0 = -((trx_rec_ps0_double[rA] * trx_rec_ps0_double[rC]) + trx_rec_ps0_double[rB]);
	//res1 = -((trx_rec_ps1_double[rA] * trx_rec_ps1_double[rC]) + trx_rec_ps1_double[rB]);
	//trx_rec_ps0_double[rD] = res0;
	//trx_rec_ps1_double[rD] = res1;
	gen_asm(FLD64_M,(uint32)&trx_rec_ps0_double[rA]);
	gen_asm(FMUL64_M, (uint32)&trx_rec_ps0_double[rC]);
	gen_asm(FADD64_M, (uint32)&trx_rec_ps0_double[rB]);
	gen_asm(FCHS);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);
	gen_asm(FLD64_M,(uint32)&trx_rec_ps1_double[rA]);
	gen_asm(FMUL64_M, (uint32)&trx_rec_ps1_double[rC]);
	gen_asm(FADD64_M, (uint32)&trx_rec_ps1_double[rB]);
	gen_asm(FCHS);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps1_double[rD]);
}
// used
void trx_ppc_gen_ps_neg(void)
{
	int rD, rB;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	//trx_rec_ps0_int[rD] = trx_rec_ps0_int[rB] ^ FPU_SIGN_BIT;
	//trx_rec_ps1_int[rD] = trx_rec_ps1_int[rB] ^ FPU_SIGN_BIT;
	gen_asm(MOV_RM, EAX, (uint32)&trx_rec_ps0_int[rB]);
	gen_asm(MOV_RM, ECX, (uint32)&trx_rec_ps0_int[rB]+4);
	gen_asm(XOR_RI32, ECX, 0x80000000);
	gen_asm(MOV_MR, (uint32)&trx_rec_ps0_int[rD], EAX);
	gen_asm(MOV_MR, (uint32)&trx_rec_ps0_int[rD]+4, ECX);

	gen_asm(MOV_RM, EAX, (uint32)&trx_rec_ps1_int[rB]);
	gen_asm(MOV_RM, ECX, (uint32)&trx_rec_ps1_int[rB]+4);
	gen_asm(XOR_RI32, ECX, 0x80000000);
	gen_asm(MOV_MR, (uint32)&trx_rec_ps1_int[rD], EAX);
	gen_asm(MOV_MR, (uint32)&trx_rec_ps1_int[rD]+4, ECX);
}
// not used
void trx_ppc_gen_ps_nabs(void)
{
	int rD, rB;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	//trx_rec_ps0_int[rD] = trx_rec_ps0_int[rB] | FPU_SIGN_BIT;
	//trx_rec_ps1_int[rD] = trx_rec_ps1_int[rB] | FPU_SIGN_BIT;

	gen_asm(BREAK);
/*
	qps0rd = &trx_rec_ps0_int[rD];
	qps0rb = &trx_rec_ps0_int[rB];
	qps1rd = &trx_rec_ps1_int[rD];
	qps1rb = &trx_rec_ps1_int[rB];
	_asm
	{
		mov edx, dword ptr qps0rb
		mov eax, [edx]
		mov ecx, [edx+4]
		or ecx, 0x80000000
		mov edx, dword ptr qps0rd
		mov [edx], eax
		mov [edx+4], ecx
		mov edx, dword ptr qps1rb
		mov eax, [edx]
		mov ecx, [edx+4]
		or ecx, 0x80000000
		mov edx, dword ptr qps1rd
		mov [edx], eax
		mov [edx+4], ecx
	};
*/
}
// used
void trx_ppc_gen_ps_abs(void)
{
	int rD, rB;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	//trx_rec_ps0_int[rD] = trx_rec_ps0_int[rB] & ~FPU_SIGN_BIT;
	//trx_rec_ps1_int[rD] = trx_rec_ps1_int[rB] & ~FPU_SIGN_BIT;

	gen_asm(MOV_RM, EAX, (uint32)&trx_rec_ps0_int[rB]);
	gen_asm(MOV_RM, ECX, (uint32)&trx_rec_ps0_int[rB]+4);
	gen_asm(AND_RI32, ECX, 0x7fffffff);
	gen_asm(MOV_MR, (uint32)&trx_rec_ps0_int[rD], EAX);
	gen_asm(MOV_MR, (uint32)&trx_rec_ps0_int[rD]+4, ECX);

	gen_asm(MOV_RM, EAX, (uint32)&trx_rec_ps1_int[rB]);
	gen_asm(MOV_RM, ECX, (uint32)&trx_rec_ps1_int[rB]+4);
	gen_asm(AND_RI32, ECX, 0x7fffffff);
	gen_asm(MOV_MR, (uint32)&trx_rec_ps1_int[rD], EAX);
	gen_asm(MOV_MR, (uint32)&trx_rec_ps1_int[rD]+4, ECX);
}

// used
void trx_ppc_gen_ps_mr(void)
{
	int rD, rB;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

//	trx_rec_ps0_double[rD] = trx_rec_ps0_double[rB];
//	trx_rec_ps1_double[rD] = trx_rec_ps1_double[rB];
	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rB]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);
	gen_asm(FLD64_M, (uint32)&trx_rec_ps1_double[rB]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps1_double[rD]);
}
// used
void trx_ppc_gen_ps_merge00(void)
{
	int rD, rA, rB;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	//res0 = trx_rec_ps0_double[rA];
	//res1 = trx_rec_ps0_double[rB];
	//trx_rec_ps0_double[rD] = res0;
	//trx_rec_ps1_double[rD] = res1;
	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rB]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps1_double[rD]);
	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rA]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);
}

// used
void trx_ppc_gen_ps_merge01(void)
{
	int rD, rA, rB;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	//res0 = trx_rec_ps0_double[rA];
	//res1 = trx_rec_ps1_double[rB];
	//trx_rec_ps0_double[rD] = res0;
	//trx_rec_ps1_double[rD] = res1;

	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rA]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);
	gen_asm(FLD64_M, (uint32)&trx_rec_ps1_double[rB]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps1_double[rD]);
}
// used
void trx_ppc_gen_ps_merge10(void)
{
	int rD, rA, rB;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	
	//res0 = trx_rec_ps1_double[rA];
	//res1 = trx_rec_ps0_double[rB];
	//trx_rec_ps0_double[rD] = res0;
	//trx_rec_ps1_double[rD] = res1;
	gen_asm(FLD64_M, (uint32)&trx_rec_ps1_double[rA]);
	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rB]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps1_double[rD]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);
}
// used
void trx_ppc_gen_ps_merge11(void)
{
	int rD, rA, rB;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	
	//res0 = trx_rec_ps1_double[rA];
	//res1 = trx_rec_ps1_double[rB];
	//trx_rec_ps0_double[rD] = res0;
	//trx_rec_ps1_double[rD] = res1;

	gen_asm(FLD64_M, (uint32)&trx_rec_ps1_double[rA]);
	gen_asm(FLD64_M, (uint32)&trx_rec_ps1_double[rB]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps1_double[rD]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);
}

// used
void trx_ppc_gen_ps_mul(void)
{
	int rD, rA, rC;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;
	
	//res0 = trx_rec_ps0_double[rA] * trx_rec_ps0_double[rC];
	//res1 = trx_rec_ps1_double[rA] * trx_rec_ps1_double[rC];
	//trx_rec_ps0_double[rD] = res0;
	//trx_rec_ps1_double[rD] = res1;

	gen_asm(FLD64_M,(uint32)&trx_rec_ps0_double[rA]);
	gen_asm(FMUL64_M, (uint32)&trx_rec_ps0_double[rC]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);
	gen_asm(FLD64_M,(uint32)&trx_rec_ps1_double[rA]);
	gen_asm(FMUL64_M, (uint32)&trx_rec_ps1_double[rC]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps1_double[rD]);
}

// used
void trx_ppc_gen_ps_muls0(void)
{
	int rD, rA, rC;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;
	
	//res0 = trx_rec_ps0_double[rA] * trx_rec_ps0_double[rC];
	//res1 = trx_rec_ps1_double[rA] * trx_rec_ps0_double[rC];
	//trx_rec_ps0_double[rD] = res0;
	//trx_rec_ps1_double[rD] = res1;

	gen_asm(FLD64_M,(uint32)&trx_rec_ps0_double[rA]);
	gen_asm(FMUL64_M, (uint32)&trx_rec_ps0_double[rC]);
	gen_asm(FLD64_M,(uint32)&trx_rec_ps1_double[rA]);
	gen_asm(FMUL64_M, (uint32)&trx_rec_ps0_double[rC]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps1_double[rD]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);
}
// used
void trx_ppc_gen_ps_muls1(void)
{
	int rD, rA, rC;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;
	
	//res0 = trx_rec_ps0_double[rA] * trx_rec_ps1_double[rC];
	//res1 = trx_rec_ps1_double[rA] * trx_rec_ps1_double[rC];
	//trx_rec_ps0_double[rD] = res0;
	//trx_rec_ps1_double[rD] = res1;
	gen_asm(FLD64_M,(uint32)&trx_rec_ps0_double[rA]);
	gen_asm(FMUL64_M, (uint32)&trx_rec_ps1_double[rC]);
	gen_asm(FLD64_M,(uint32)&trx_rec_ps1_double[rA]);
	gen_asm(FMUL64_M, (uint32)&trx_rec_ps1_double[rC]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps1_double[rD]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);
}
// used
void trx_ppc_gen_ps_madds0(void)
{
	int rD, rA, rB, rC;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;
	
//	res0 = (trx_rec_ps0_double[rA] * trx_rec_ps0_double[rC]) + trx_rec_ps0_double[rB];
//	res1 = (trx_rec_ps1_double[rA] * trx_rec_ps0_double[rC]) + trx_rec_ps1_double[rB];
//	trx_rec_ps0_double[rD] = res0;
//	trx_rec_ps1_double[rD] = res1;
	gen_asm(FLD64_M,(uint32)&trx_rec_ps0_double[rA]);
	gen_asm(FMUL64_M, (uint32)&trx_rec_ps0_double[rC]);
	gen_asm(FADD64_M, (uint32)&trx_rec_ps0_double[rB]);
	gen_asm(FLD64_M,(uint32)&trx_rec_ps1_double[rA]);
	gen_asm(FMUL64_M, (uint32)&trx_rec_ps0_double[rC]);
	gen_asm(FADD64_M, (uint32)&trx_rec_ps1_double[rB]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps1_double[rD]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);
}
// used
void trx_ppc_gen_ps_madds1(void)
{
	int rD, rA, rB, rC;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;
	
//	res0 = (trx_rec_ps0_double[rA] * trx_rec_ps1_double[rC]) + trx_rec_ps0_double[rB];
//	res1 = (trx_rec_ps1_double[rA] * trx_rec_ps1_double[rC]) + trx_rec_ps1_double[rB];
//	trx_rec_ps0_double[rD] = res0;
//	trx_rec_ps1_double[rD] = res1;
	gen_asm(FLD64_M,(uint32)&trx_rec_ps0_double[rA]);
	gen_asm(FMUL64_M, (uint32)&trx_rec_ps1_double[rC]);
	gen_asm(FADD64_M, (uint32)&trx_rec_ps0_double[rB]);
	gen_asm(FLD64_M,(uint32)&trx_rec_ps1_double[rA]);
	gen_asm(FMUL64_M, (uint32)&trx_rec_ps1_double[rC]);
	gen_asm(FADD64_M, (uint32)&trx_rec_ps1_double[rB]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps1_double[rD]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);
}

// used
void trx_ppc_gen_ps_cmpo0()
{
	uint32 cr, rA, rB;
	uint32 shift, mask;

	cr = (trxCPUrec.opcode >> 23)& 0x7;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	cr = 7-cr;
	shift = 28-(cr*4); 
	mask = trx_ppc_cmp_and_mask[cr];

	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rB]);
	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rA]);
	gen_asm(XOR_RR, ECX, ECX);
	gen_asm(FUCOMIP, 1);
	gen_asm(FPOP);
	gen_asm(CMOVA_M, EAX, (uint32)&cr_gt);
	gen_asm(CMOVB_M, EAX, (uint32)&cr_lt);
	gen_asm(CMOVE_M, EAX, (uint32)&cr_eq);
	gen_asm(CMOVP_M, EAX, (uint32)&cr_so);
	gen_asm(CMOVP_M, ECX, (uint32)&fcmpo_nan_flags);
	//if(rA == NaN || rB == Nan)
	//{
	//	trxCPUrec.fpscr |= FPSCR_VXSNAN;
	//	trxCPUrec.fpscr |= FPSCR_VXVC;
	//}
	//trxCPUrec.fpscr &= ~0x1f000;
	//trxCPUrec.fpscr |= (cmp << 12); -> thats when the bits are in 0-3 now they are in 28-31
	//so we should do >>28 <<12 which is >>16
	gen_asm(MOV_RM, EDX, (uint32)&trxCPUrec.fpscr);
	gen_asm(OR_RR, EDX, ECX); // apply flags if needed
	gen_asm(AND_RI32, EDX, ~0x1f000);
	gen_asm(MOV_RR, ECX, EAX);
	gen_asm(SHR_RI8, ECX, 16);
	gen_asm(OR_RR, ECX, EDX);
	gen_asm(MOV_MR, (uint32)&trxCPUrec.fpscr, ECX);
	// store in designated flags
	gen_asm(SHR_RI8, EAX, shift);
	gen_asm(MOV_RM, ECX, (uint32)&trxCPUrec.cr);
	gen_asm(AND_RI32, ECX, mask);
	gen_asm(OR_RR, ECX, EAX);
	gen_asm(MOV_MR, (uint32)&trxCPUrec.cr, ECX);
}

// used
void trx_ppc_gen_ps_cmpo1()
{
	uint32 cr, rA, rB;
	uint32 shift, mask;

	cr = (trxCPUrec.opcode >> 23)& 0x7;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	cr = 7-cr;
	shift = 28-(cr*4); 
	mask = trx_ppc_cmp_and_mask[cr];

	gen_asm(FLD64_M, (uint32)&trx_rec_ps1_double[rB]);
	gen_asm(FLD64_M, (uint32)&trx_rec_ps1_double[rA]);
	gen_asm(XOR_RR, ECX, ECX);
	gen_asm(FUCOMIP, 1);
	gen_asm(FPOP);
	gen_asm(CMOVA_M, EAX, (uint32)&cr_gt);
	gen_asm(CMOVB_M, EAX, (uint32)&cr_lt);
	gen_asm(CMOVE_M, EAX, (uint32)&cr_eq);
	gen_asm(CMOVP_M, EAX, (uint32)&cr_so);
	gen_asm(CMOVP_M, ECX, (uint32)&fcmpo_nan_flags);
	//if(rA == NaN || rB == Nan)
	//{
	//	trxCPUrec.fpscr |= FPSCR_VXSNAN;
	//	trxCPUrec.fpscr |= FPSCR_VXVC;
	//}
	//trxCPUrec.fpscr &= ~0x1f000;
	//trxCPUrec.fpscr |= (cmp << 12); -> thats when the bits are in 0-3 now they are in 28-31
	//so we should do >>28 <<12 which is >>16
	gen_asm(MOV_RM, EDX, (uint32)&trxCPUrec.fpscr);
	gen_asm(OR_RR, EDX, ECX); // apply flags if needed
	gen_asm(AND_RI32, EDX, ~0x1f000);
	gen_asm(MOV_RR, ECX, EAX);
	gen_asm(SHR_RI8, ECX, 16);
	gen_asm(OR_RR, ECX, EDX);
	gen_asm(MOV_MR, (uint32)&trxCPUrec.fpscr, ECX);
	// store in designated flags
	gen_asm(SHR_RI8, EAX, shift);
	gen_asm(MOV_RM, ECX, (uint32)&trxCPUrec.cr);
	gen_asm(AND_RI32, ECX, mask);
	gen_asm(OR_RR, ECX, EAX);
	gen_asm(MOV_MR, (uint32)&trxCPUrec.cr, ECX);
}

// used
void trx_ppc_gen_ps_cmpu0()
{
	uint32 cr, rA, rB;
	uint32 shift, mask;

	cr = (trxCPUrec.opcode >> 23)& 0x7;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	cr = 7-cr;
	shift = 28-(cr*4); 
	mask = trx_ppc_cmp_and_mask[cr];

	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rB]);
	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rA]);
	gen_asm(XOR_RR, ECX, ECX);
	gen_asm(FUCOMIP, 1);
	gen_asm(FPOP); 
	gen_asm(CMOVA_M, EAX, (uint32)&cr_gt);
	gen_asm(CMOVB_M, EAX, (uint32)&cr_lt);
	gen_asm(CMOVE_M, EAX, (uint32)&cr_eq);
	gen_asm(CMOVP_M, EAX, (uint32)&cr_so);
	gen_asm(CMOVP_M, ECX, (uint32)&fcmpu_nan_flags);
	//if(rA == NaN || rB == Nan)
	//{
	//	trxCPUrec.fpscr |= FPSCR_VXSNAN;
	//}
	//trxCPUrec.fpscr &= ~0x1f000;
	//trxCPUrec.fpscr |= (cmp << 12); -> thats when the bits are in 0-3 now they are in 28-31
	//so we should do >>28 <<12 which is >>16
	gen_asm(MOV_RM, EDX, (uint32)&trxCPUrec.fpscr);
	gen_asm(OR_RR, EDX, ECX); // apply flags if needed
	gen_asm(AND_RI32, EDX, ~0x1f000);
	gen_asm(MOV_RR, ECX, EAX);
	gen_asm(SHR_RI8, ECX, 16);
	gen_asm(OR_RR, ECX, EDX);
	gen_asm(MOV_MR, (uint32)&trxCPUrec.fpscr, ECX);
	// store in designated flags
	gen_asm(SHR_RI8, EAX, shift);
	gen_asm(MOV_RM, ECX, (uint32)&trxCPUrec.cr);
	gen_asm(AND_RI32, ECX, mask);
	gen_asm(OR_RR, ECX, EAX);
	gen_asm(MOV_MR, (uint32)&trxCPUrec.cr, ECX);
}

// used
void trx_ppc_gen_ps_cmpu1()
{
	uint32 cr, rA, rB;
	uint32 shift, mask;

	cr = (trxCPUrec.opcode >> 23)& 0x7;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	cr = 7-cr;
	shift = 28-(cr*4); 
	mask = trx_ppc_cmp_and_mask[cr];

	gen_asm(FLD64_M, (uint32)&trx_rec_ps1_double[rB]);
	gen_asm(FLD64_M, (uint32)&trx_rec_ps1_double[rA]);
	gen_asm(XOR_RR, ECX, ECX);
	gen_asm(FUCOMIP, 1);
	gen_asm(FPOP); 
	gen_asm(CMOVA_M, EAX, (uint32)&cr_gt);
	gen_asm(CMOVB_M, EAX, (uint32)&cr_lt);
	gen_asm(CMOVE_M, EAX, (uint32)&cr_eq);
	gen_asm(CMOVP_M, EAX, (uint32)&cr_so);
	gen_asm(CMOVP_M, ECX, (uint32)&fcmpu_nan_flags);
	//if(rA == NaN || rB == Nan)
	//{
	//	trxCPUrec.fpscr |= FPSCR_VXSNAN;
	//}
	//trxCPUrec.fpscr &= ~0x1f000;
	//trxCPUrec.fpscr |= (cmp << 12); -> thats when the bits are in 0-3 now they are in 28-31
	//so we should do >>28 <<12 which is >>16
	gen_asm(MOV_RM, EDX, (uint32)&trxCPUrec.fpscr);
	gen_asm(OR_RR, EDX, ECX); // apply flags if needed
	gen_asm(AND_RI32, EDX, ~0x1f000);
	gen_asm(MOV_RR, ECX, EAX);
	gen_asm(SHR_RI8, ECX, 16);
	gen_asm(OR_RR, ECX, EDX);
	gen_asm(MOV_MR, (uint32)&trxCPUrec.fpscr, ECX);
	// store in designated flags
	gen_asm(SHR_RI8, EAX, shift);
	gen_asm(MOV_RM, ECX, (uint32)&trxCPUrec.cr);
	gen_asm(AND_RI32, ECX, mask);
	gen_asm(OR_RR, ECX, EAX);
	gen_asm(MOV_MR, (uint32)&trxCPUrec.cr, ECX);
}

// used
void trx_ppc_gen_ps_sum0(void)
{
	int rD, rA, rB, rC;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;
	
	//res0 = trx_rec_ps0_double[rA] + trx_rec_ps1_double[rB];
	//res1 = trx_rec_ps1_double[rC];
	//trx_rec_ps0_double[rD] = res0;
	//trx_rec_ps1_double[rD] = res1;
	gen_asm(FLD64_M,(uint32)&trx_rec_ps0_double[rA]);
	gen_asm(FADD64_M, (uint32)&trx_rec_ps1_double[rB]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);

	gen_asm(FLD64_M,(uint32)&trx_rec_ps1_double[rC]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps1_double[rD]);
}

// used
void trx_ppc_gen_ps_sum1(void)
{
	int rD, rA, rB, rC;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;
	
	//res0 = trx_rec_ps0_double[rC];
	//res1 = trx_rec_ps0_double[rA] + trx_rec_ps1_double[rB];
	//trx_rec_ps0_double[rD] = res0;
	//trx_rec_ps1_double[rD] = res1;
	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rC]);
	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rA]);
	gen_asm(FADD64_M, (uint32)&trx_rec_ps1_double[rB]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps1_double[rD]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);
}

// used
void trx_ppc_gen_ps_div(void)
{
	int rD, rA, rB;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	
	//res0 = trx_rec_ps0_double[rA] / trx_rec_ps0_double[rB];
	//res1 = trx_rec_ps1_double[rA] / trx_rec_ps1_double[rB];
	//trx_rec_ps0_double[rD] = res0;
	//trx_rec_ps1_double[rD] = res1;
	gen_asm(FLD64_M,(uint32)&trx_rec_ps0_double[rA]);
	gen_asm(FDIV64_M, (uint32)&trx_rec_ps0_double[rB]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);
	gen_asm(FLD64_M,(uint32)&trx_rec_ps1_double[rA]);
	gen_asm(FDIV64_M, (uint32)&trx_rec_ps1_double[rB]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps1_double[rD]);
}
// used
void trx_ppc_gen_ps_sub(void)
{
	int rD, rA, rB;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	
//	res0 = trx_rec_ps0_double[rA] - trx_rec_ps0_double[rB];
//	res1 = trx_rec_ps1_double[rA] - trx_rec_ps1_double[rB];
//	trx_rec_ps0_double[rD] = res0;
//	trx_rec_ps1_double[rD] = res1;
	gen_asm(FLD64_M,(uint32)&trx_rec_ps0_double[rA]);
	gen_asm(FSUB64_M, (uint32)&trx_rec_ps0_double[rB]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);
	gen_asm(FLD64_M,(uint32)&trx_rec_ps1_double[rA]);
	gen_asm(FSUB64_M, (uint32)&trx_rec_ps1_double[rB]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps1_double[rD]);
}
// used
void trx_ppc_gen_ps_add(void)
{
	int rD, rA, rB;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	
//	res0 = trx_rec_ps0_double[rA] + trx_rec_ps0_double[rB];
//	res1 = trx_rec_ps1_double[rA] + trx_rec_ps1_double[rB];
//	trx_rec_ps0_double[rD] = res0;
//	trx_rec_ps1_double[rD] = res1;
	gen_asm(FLD64_M,(uint32)&trx_rec_ps0_double[rA]);
	gen_asm(FADD64_M, (uint32)&trx_rec_ps0_double[rB]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);
	gen_asm(FLD64_M,(uint32)&trx_rec_ps1_double[rA]);
	gen_asm(FADD64_M, (uint32)&trx_rec_ps1_double[rB]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps1_double[rD]);
}

// used
void trx_ppc_gen_ps_sel(void)
{
	int rD, rA, rB, rC;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	rC = ((trxCPUrec.opcode)>>6)&0x1f;
	
	//if(trx_rec_ps0_double[rA] < 0.0f || NAN)	trx_rec_ps0_double[rD] = trx_rec_ps0_double[rB];
	//else trx_rec_ps0_double[rD] = trx_rec_ps0_double[rC];
	//if(trx_rec_ps1_double[rA] < 0.0f || NAN)	trx_rec_ps1_double[rD] = trx_rec_ps1_double[rB];
	//else trx_rec_ps1_double[rD] = trx_rec_ps1_double[rC];
	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rC]);
	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rB]);
	gen_asm(FLDZ);
	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rA]);
	gen_asm(FUCOMIP, 1); // cmp a, 0
	gen_asm(FCMOVB_M, 1); // < 0
	gen_asm(FCMOVU_M, 1); // NaN
	gen_asm(FCMOVNB_M, 2); // >= 0
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);
	gen_asm(FPOP);
	gen_asm(FPOP);

	gen_asm(FLD64_M, (uint32)&trx_rec_ps1_double[rC]);
	gen_asm(FLD64_M, (uint32)&trx_rec_ps1_double[rB]);
	gen_asm(FLDZ);
	gen_asm(FLD64_M, (uint32)&trx_rec_ps1_double[rA]);
	gen_asm(FUCOMIP, 1); // cmp a, 0
	gen_asm(FCMOVB_M, 1); // < 0
	gen_asm(FCMOVU_M, 1); // NaN
	gen_asm(FCMOVNB_M, 2); // >= 0
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps1_double[rD]);
	gen_asm(FPOP);
	gen_asm(FPOP);
}

// used
void trx_ppc_gen_ps_res(void)
{
	int rD, rB;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	//trx_rec_ps0_double[rD] = 1.0f / trx_rec_ps0_double[rB];
	//trx_rec_ps1_double[rD] = 1.0f / trx_rec_ps1_double[rB];
	gen_asm(FLD1);
	gen_asm(FDIV64_M, (uint32)&trx_rec_ps0_double[rB]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);
	gen_asm(FLD1);
	gen_asm(FDIV64_M, (uint32)&trx_rec_ps0_double[rB]);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);
}

// used (trusted)
void trx_ppc_gen_ps_rsqrte(void)
{
	int rD, rB;
	rD = ((trxCPUrec.opcode)>>21)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;
	
//	res0 = 1.0f / sqrt(trx_rec_ps0_double[rB]);
//	res1 = 1.0f / sqrt(trx_rec_ps1_double[rB]);
//	trx_rec_ps0_double[rD] = res0;
//	trx_rec_ps1_double[rD] = res1;
	gen_asm(FLD1);
	gen_asm(FLD64_M, (uint32)&trx_rec_ps0_double[rB]);
	gen_asm(FSQRT);
	gen_asm(FDIV);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps0_double[rD]);
	gen_asm(FLD1);
	gen_asm(FLD64_M, (uint32)&trx_rec_ps1_double[rB]);
	gen_asm(FSQRT);
	gen_asm(FDIV);
	gen_asm(FSTP64_M, (uint32)&trx_rec_ps1_double[rD]);
}

// locked cache support
// gekko locked cache allocate
void trx_ppc_gen_dcbz_l(void)
{
	uint32 rA, rB;
	rA = ((trxCPUrec.opcode)>>16)&0x1f;
	rB = ((trxCPUrec.opcode)>>11)&0x1f;

	if(rA == 0)
	{
		//EA = trxCPUrec.gpr[rB]
		regc_load(ECX, rB);
	}
	else
	{
		//EA = trxCPUrec.gpr[rA] + trxCPUrec.gpr[rB]
		regc_load(EAX, rA);
		regc_load(ECX, rB);
		gen_asm(ADD_RR, ECX, EAX);
	}

	// interpreter is doing checks if the locked cache is in correct 
	// memory area. we dont care, we assume it is right
	// clear the cache line
	//memset(&lockedcache[EA & LOCKEDCACHE_MASK], 0, 32);
	gen_asm(XOR_RR, EAX, EAX); // zero
	gen_asm(AND_RI32, ECX, LOCKEDCACHE_MASK);
	gen_asm(ADD_RI32, ECX, (uint32)&lockedcache[0]);
	gen_asm(MOV_MRRI32, EAX, ECX, 0);
	gen_asm(MOV_MRRI32, EAX, ECX, 4);
	gen_asm(MOV_MRRI32, EAX, ECX, 8);
	gen_asm(MOV_MRRI32, EAX, ECX, 12);
	gen_asm(MOV_MRRI32, EAX, ECX, 16);
	gen_asm(MOV_MRRI32, EAX, ECX, 20);
	gen_asm(MOV_MRRI32, EAX, ECX, 24);
	gen_asm(MOV_MRRI32, EAX, ECX, 28);
}
