[Mono-devel-list] AMD64 patches.

mono_devel at workingpages.com mono_devel at workingpages.com
Fri Nov 14 11:51:08 EST 2003


Hi,

The following patch set should get the interpreter working on AMD64.
I was able to make it through all the unit tests with some
fiddling. There are a number of failures, but they don't seem to be
obviously port related.

I'm onto getting mini working, but I wanted to get this out there. I
hope someone else can give it a try.

The port was done on SuSE 8.0. I doubt there are any OS/distro
dependencies in the architecture specific code. One must upgrade to
gcc-3.3.2 in order to compile Mono with -O2 on AMD64. (With 3.2.2,
mint will crash at startup in monitor.c if compiled -O2.)

I chose to make a minimal modification to atomic.h in that the code is
very close to IA32. Otherwise all the changes should be quite isolated
in architecture specific places. None of these changes should modify
the behavior on other platforms.

I'd like to thank AMD Developer Services for access to a machine to
work on this.

Zalman Stern

----------cut here ----------
Index: configure.in
===================================================================
RCS file: /mono/mono/configure.in,v
retrieving revision 1.173
diff -u -r1.173 configure.in
--- configure.in	8 Nov 2003 03:23:10 -0000	1.173
+++ configure.in	14 Nov 2003 08:34:48 -0000
@@ -756,6 +756,10 @@
 		arch_target=x86;
 		JIT_SUPPORTED=yes
 		;;
+	x86_64-*-*)
+		TARGET=X86_64;
+		arch_target=x86_64;
+		;;
 	sparc*-*-*)
 		TARGET=SPARC;
 		arch_target=sparc;
@@ -833,6 +837,7 @@
 AM_CONDITIONAL(MIPS_SGI, test ${TARGET}${ac_cv_prog_gcc} = MIPSno)
 AM_CONDITIONAL(SPARC, test x$TARGET = xSPARC)
 AM_CONDITIONAL(X86, test x$TARGET = xX86)
+AM_CONDITIONAL(X86_64, test x$TARGET = xX86_64)
 AM_CONDITIONAL(ALPHA, test x$TARGET = xALPHA)
 AM_CONDITIONAL(IA64, test x$TARGET = xIA64)
 AM_CONDITIONAL(M68K, test x$TARGET = xM68K)
@@ -863,6 +868,7 @@
 mono/os/win32/Makefile
 mono/os/unix/Makefile
 mono/arch/x86/Makefile
+mono/arch/x86_64/Makefile
 mono/arch/hppa/Makefile
 mono/arch/ppc/Makefile
 mono/arch/sparc/Makefile
Index: mono/io-layer/atomic.h
===================================================================
RCS file: /mono/mono/mono/io-layer/atomic.h,v
retrieving revision 1.16
diff -u -r1.16 atomic.h
--- mono/io-layer/atomic.h	21 Oct 2003 10:15:02 -0000	1.16
+++ mono/io-layer/atomic.h	14 Nov 2003 08:34:49 -0000
@@ -14,7 +14,7 @@
 
 #include "mono/io-layer/wapi.h"
 
-#ifdef __i386__
+#if defined(__i386__) || defined(__x86_64__)
 #define WAPI_ATOMIC_ASM
 
 /*
@@ -41,9 +41,16 @@
 {
 	gpointer old;
 
-	__asm__ __volatile__ ("lock; cmpxchgl %2, %0"
+	__asm__ __volatile__ ("lock; "
+#ifdef __x86_64__
+			      "cmpxchgq"
+#else
+			      "cmpxchgl"
+#endif
+			      " %2, %0"
 			      : "=m" (*dest), "=a" (old)
 			      : "r" (exch), "m" (*dest), "a" (comp));	
+
 	return(old);
 }
 
@@ -96,7 +103,13 @@
 {
 	gpointer ret;
 	
-	__asm__ __volatile__ ("1:; lock; cmpxchgl %2, %0; jne 1b"
+	__asm__ __volatile__ ("1:; lock; "
+#ifdef __x86_64__
+			      "cmpxchgq"
+#else
+			      "cmpxchgl"
+#endif
+			      " %2, %0; jne 1b"
 			      : "=m" (*val), "=a" (ret)
 			      : "r" (new_val), "m" (*val), "a" (*val));
 
Index: mono/interpreter/interp.c
===================================================================
RCS file: /mono/mono/mono/interpreter/interp.c,v
retrieving revision 1.265
diff -u -r1.265 interp.c
--- mono/interpreter/interp.c	31 Oct 2003 13:06:28 -0000	1.265
+++ mono/interpreter/interp.c	14 Nov 2003 08:34:55 -0000
@@ -5125,6 +5125,9 @@
 #ifdef __hpux /* generates very big stack frames */
 	mono_threads_set_default_stacksize(32*1024*1024);
 #endif
+#ifdef __x86_64__
+	mono_threads_set_default_stacksize(8*1024*1024);
+#endif
 	mono_config_parse (config_file);
 	mono_init_icall ();
 	mono_add_internal_call ("System.Diagnostics.StackFrame::get_frame_info", ves_icall_get_frame_info);
Index: mono/utils/strtod.c
===================================================================
RCS file: /mono/mono/mono/utils/strtod.c,v
retrieving revision 1.5
diff -u -r1.5 strtod.c
--- mono/utils/strtod.c	1 Sep 2003 10:52:11 -0000	1.5
+++ mono/utils/strtod.c	14 Nov 2003 08:34:58 -0000
@@ -143,6 +143,11 @@
 #define IEEE_8087
 #define Long long
 
+#elif defined(__x86_64__)
+
+#define IEEE_8087
+#define Long int
+
 #elif defined(__ia64)
 
 # ifndef __LP64__
Index: mono/arch/x86_64/Makefile.am
===================================================================
--- /dev/null	2003-03-27 11:16:05.000000000 -0800
+++ mono/arch/x86_64/Makefile.am	2003-10-12 18:06:57.000000000 -0700
@@ -0,0 +1,7 @@
+
+INCLUDES = $(GLIB_CFLAGS) -I$(top_srcdir)
+
+noinst_LTLIBRARIES = libmonoarch-x86_64.la
+
+libmonoarch_x86_64_la_SOURCES = tramp.c x86-codegen.h
+
Index: mono/arch/x86_64/tramp.c
===================================================================
--- /dev/null	2003-03-27 11:16:05.000000000 -0800
+++ mono/arch/x86_64/tramp.c	2003-11-14 02:13:57.000000000 -0800
@@ -0,0 +1,1055 @@
+/*
+ * Create trampolines to invoke arbitrary functions.
+ * 
+ * Copyright (C) Ximian Inc.
+ * 
+ * Authors: 
+ *   Zalman Stern
+ * Based on code by:
+ *   Paolo Molaro (lupus at ximian.com)
+ *   Dietmar Maurer (dietmar at ximian.com)
+ * 
+ * To understand this code, one will want to read http://x86-64.org/abi.pdf and the AMD64 architecture docs
+ * found at amd.com .
+ * The name x86_64 is used throughout this file for historical reasons to name the architecture now called AMD64. 
+ */
+
+#include "config.h"
+#include <stdlib.h>
+#include <string.h>
+#include "x86_64-codegen.h"
+#include "mono/metadata/class.h"
+#include "mono/metadata/tabledefs.h"
+#include "mono/interpreter/interp.h"
+#include "mono/metadata/appdomain.h"
+#include "mono/metadata/marshal.h"
+
+/*
+ * The resulting function takes the form:
+ * void func (void (*callme)(), void *retval, void *this_obj, stackval *arguments);
+ */
+#define FUNC_ADDR_POS	8
+#define RETVAL_POS	12
+#define THIS_POS	16
+#define ARGP_POS	20
+#define LOC_POS	-4
+
+#define ARG_SIZE	sizeof (stackval)
+
+#define MAX_INT_ARG_REGS	6
+#define MAX_FLOAT_ARG_REGS	8
+
+// TODO get these right. They are upper bounds anyway, so it doesn't much matter.
+#define PUSH_INT_STACK_ARG_SIZE		16
+#define MOVE_INT_REG_ARG_SIZE		16
+#define PUSH_FLOAT_STACK_ARG_SIZE	16
+#define MOVE_FLOAT_REG_ARG_SIZE		16
+#define COPY_STRUCT_STACK_ARG_SIZE	16
+
+/* Maps an argument number (starting at 0) to the register it is passed in (if it fits).
+ * E.g. int foo(int bar, int quux) has the bar arg in RDI and the quux arg in RSI.
+ * There is no such map for floating point args as they go in XMM0-XMM7 in order and thus the
+ * index is the register number.
+ */
+static int int_arg_regs[] = { X86_64_RDI, X86_64_RSI, X86_64_RDX, X86_64_RCX, X86_64_R8, X86_64_R9 };
+
+/* This next block of code resolves the ABI rules for passing structures in the argument registers.
+ * These basically amount to "Use up to two registers if they are all integer or all floating point."
+ * If the structure is bigger than two registers or would be in one integer register and one floating point,
+ * it is passed in memory instead.
+ *
+ * It is possible this code needs to be recursive to be correct in the case when one of the structure members
+ * is itself a structure.
+ *
+ * The 80-bit floating point stuff is ignored.
+ */
+typedef enum {
+	ARG_IN_MEMORY,		/* not passed in registers; the value is copied to the stack */
+	ARG_IN_INT_REGS,	/* integer/pointer class; uses the int_arg_regs sequence */
+	ARG_IN_FLOAT_REGS	/* R4/R8 class; uses the XMM registers in order */
+} struct_arg_type;
+
+/* Classify a single field type: integer, pointer and object-reference types (and enums)
+ * are ARG_IN_INT_REGS, R4/R8 are ARG_IN_FLOAT_REGS, and any other value type is reported
+ * as ARG_IN_MEMORY because nested structs are not analysed. Other types hit g_error.
+ */
+static struct_arg_type compute_arg_type(MonoType *type)
+{
+	guint32 simpletype = type->type;
+
+	switch (simpletype) {
+		case MONO_TYPE_BOOLEAN:
+		case MONO_TYPE_CHAR:
+		case MONO_TYPE_I1:
+		case MONO_TYPE_U1:
+		case MONO_TYPE_I2:
+		case MONO_TYPE_U2:
+		case MONO_TYPE_I4:
+		case MONO_TYPE_U4:
+		case MONO_TYPE_I:
+		case MONO_TYPE_U:
+		case MONO_TYPE_PTR:
+		case MONO_TYPE_SZARRAY:
+		case MONO_TYPE_CLASS:
+		case MONO_TYPE_OBJECT:
+		case MONO_TYPE_STRING:
+		case MONO_TYPE_I8:
+		case MONO_TYPE_U8:
+			return ARG_IN_INT_REGS;
+		case MONO_TYPE_VALUETYPE:
+			if (type->data.klass->enumtype)
+				return ARG_IN_INT_REGS;
+			return ARG_IN_MEMORY;
+		case MONO_TYPE_R4:
+		case MONO_TYPE_R8:
+			return ARG_IN_FLOAT_REGS;
+		default:
+			g_error ("Can't trampoline 0x%x", type->type);
+	}
+	return ARG_IN_MEMORY;
+}
+
+/* Decide how a value type (struct) is passed under the simplified rules used in this file:
+ * at most two fields, at most 8 native bytes, and both fields in the same register class.
+ *
+ * klass:          the value type to classify.
+ * native_size:    out, always set to the marshalled (native) size of klass.
+ * regs_used:      out, number of registers needed; 0 when the result is ARG_IN_MEMORY.
+ * offset1/size1:  out, native offset and size of the first field; offset1 is -1 if unused.
+ * offset2/size2:  out, the same for the second field; offset2 is -1 if unused.
+ * size1/size2 are only written for fields that are actually passed in a register.
+ * Returns the register class shared by the fields, or ARG_IN_MEMORY.
+ */
+static struct_arg_type value_type_info(MonoClass *klass, int *native_size, int *regs_used, int *offset1, int *size1, int *offset2, int *size2)
+{
+	MonoMarshalType *info = mono_marshal_load_type_info (klass);
+	struct_arg_type result1, result2;
+
+	*native_size = info->native_size;
+	*regs_used = 0;
+	*offset1 = -1;
+	*offset2 = -1;
+
+	/* Too big, too many fields, or no field at all to classify. */
+	if (info->native_size > 8 || info->num_fields > 2 || info->num_fields < 1)
+		return ARG_IN_MEMORY;
+
+	result1 = compute_arg_type(info->fields[0].field->type);
+	if (info->num_fields == 1) {
+		if (result1 != ARG_IN_MEMORY) {
+			*regs_used = 1;
+			*offset1 = info->fields[0].offset;
+			*size1 = mono_marshal_type_size (info->fields[0].field->type, info->fields[0].mspec, NULL, 1, 1);
+		}
+		return result1;
+	}
+
+	/* Two fields: both must be in the same register class. The second field has to be
+	 * classified from fields[1]; classifying fields[0] twice would always "match". */
+	result2 = compute_arg_type(info->fields[1].field->type);
+	if (result1 != result2 || result1 == ARG_IN_MEMORY)
+		return ARG_IN_MEMORY;
+
+	*regs_used = 2;
+	*offset1 = info->fields[0].offset;
+	*size1 = mono_marshal_type_size (info->fields[0].field->type, info->fields[0].mspec, NULL, 1, 1);
+	*offset2 = info->fields[1].offset;
+	*size2 = mono_marshal_type_size (info->fields[1].field->type, info->fields[1].mspec, NULL, 1, 1);
+	return result1;
+}
+
+/*
+ * mono_arch_create_trampoline:
+ * Build (and cache per signature) a stub that calls a native function with the AMD64
+ * calling convention. The stub is entered as
+ *     void stub (void (*callme)(), void *retval, void *this_obj, stackval *arguments);
+ * and moves each stackval into the integer/XMM argument register or stack slot the
+ * callee expects, calls callme, then stores the result through retval.
+ * sig describes the callee; string_ctor makes the result be handled as a pointer.
+ * Returns the generated code, copied with g_memdup.
+ */
+MonoPIFunc
+mono_arch_create_trampoline (MonoMethodSignature *sig, gboolean string_ctor)
+{
+	unsigned char *p, *code_buffer;
+	guint32 stack_size = 0, code_size = 50;
+	guint32 arg_pos, simpletype;
+	int i;
+	static GHashTable *cache = NULL;
+	MonoPIFunc res;
+
+	guint32 int_arg_regs_used = 0;
+	guint32 float_arg_regs_used = 0;
+	guint32 next_int_arg_reg = 0;
+	guint32 next_float_arg_reg = 0;
+	/* TRUE when the result pointer passed in RSI must be kept across the call. */
+	gboolean save_retval_ptr;
+	guint32 align_pad;	/* bytes subtracted from RSP before anything is pushed */
+	char *arg_in_reg_bitvector; /* A set indexed by argument number saying if it is in a register
+				       (integer or floating point according to type) */
+
+	if (!cache)
+		cache = g_hash_table_new ((GHashFunc)mono_signature_hash,
+					  (GCompareFunc)mono_metadata_signature_equal);
+
+	if ((res = (MonoPIFunc)g_hash_table_lookup (cache, sig)))
+		return res;
+
+	if (sig->ret->type == MONO_TYPE_VALUETYPE && !sig->ret->byref && !sig->ret->data.klass->enumtype) {
+		int_arg_regs_used++;
+		code_size += MOVE_INT_REG_ARG_SIZE;
+	}
+
+	if (sig->hasthis) {
+		int_arg_regs_used++;
+		code_size += MOVE_INT_REG_ARG_SIZE;
+	}
+
+	/* Run through stuff to calculate code size and argument bytes that will be pushed on stack (stack_size). */
+	for (i = 0; i < sig->param_count; ++i) {
+		if (sig->params [i]->byref)
+			simpletype = MONO_TYPE_PTR;
+		else
+			simpletype = sig->params [i]->type;
+enum_calc_size:
+		switch (simpletype) {
+		case MONO_TYPE_BOOLEAN:
+		case MONO_TYPE_CHAR:
+		case MONO_TYPE_I1:
+		case MONO_TYPE_U1:
+		case MONO_TYPE_I2:
+		case MONO_TYPE_U2:
+		case MONO_TYPE_I4:
+		case MONO_TYPE_U4:
+		case MONO_TYPE_I:
+		case MONO_TYPE_U:
+		case MONO_TYPE_PTR:
+		case MONO_TYPE_SZARRAY:
+		case MONO_TYPE_CLASS:
+		case MONO_TYPE_OBJECT:
+		case MONO_TYPE_STRING:
+		case MONO_TYPE_I8:
+		case MONO_TYPE_U8:
+			/* ">=": once all MAX_INT_ARG_REGS registers are taken the argument goes on the stack. */
+			if (int_arg_regs_used++ >= MAX_INT_ARG_REGS) {
+				stack_size += 8;
+				code_size += PUSH_INT_STACK_ARG_SIZE;
+			}
+			else
+				code_size += MOVE_INT_REG_ARG_SIZE;
+			break;
+		case MONO_TYPE_VALUETYPE: {
+			int size;
+			int arg_type;
+			int regs_used;
+			int offset1;
+			int size1;
+			int offset2;
+			int size2;
+
+			if (sig->params [i]->data.klass->enumtype) {
+				simpletype = sig->params [i]->data.klass->enum_basetype->type;
+				goto enum_calc_size;
+			}
+
+			arg_type = value_type_info(sig->params [i]->data.klass, &size, &regs_used, &offset1, &size1, &offset2, &size2);
+			if (arg_type == ARG_IN_INT_REGS &&
+			    (int_arg_regs_used + regs_used) <= MAX_INT_ARG_REGS) {
+				code_size += MOVE_INT_REG_ARG_SIZE;
+				int_arg_regs_used += regs_used;
+				break;
+			}
+
+			if (arg_type == ARG_IN_FLOAT_REGS &&
+			    (float_arg_regs_used + regs_used) <= MAX_FLOAT_ARG_REGS) {
+				code_size += MOVE_FLOAT_REG_ARG_SIZE;
+				float_arg_regs_used += regs_used;
+				break;
+			}
+
+			/* Else item is in memory. */
+			stack_size += size + 7;
+			stack_size &= ~7;
+			code_size += COPY_STRUCT_STACK_ARG_SIZE;
+			break;
+		}
+		case MONO_TYPE_R4:
+		case MONO_TYPE_R8:
+			if (float_arg_regs_used++ >= MAX_FLOAT_ARG_REGS) {
+				stack_size += 8;
+				code_size += PUSH_FLOAT_STACK_ARG_SIZE;
+			}
+			else
+				code_size += MOVE_FLOAT_REG_ARG_SIZE;
+			break;
+		default:
+			g_error ("Can't trampoline 0x%x", sig->params [i]->type);
+		}
+	}
+	/*
+	 * FIXME: take into account large return values.
+	 * (Comment carried over from IA32 code. Not sure what it means :-)
+	 */
+
+	code_buffer = p = alloca (code_size);
+
+	/*
+	 * Standard function prolog.
+	 */
+	x86_64_push_reg (p, X86_64_RBP);
+	x86_64_mov_reg_reg (p, X86_64_RBP, X86_64_RSP, 8);
+
+	/* Every non-void result is written through RSI after the call, so RSI gets a stack slot. */
+	save_retval_ptr = sig->ret->byref || string_ctor || sig->ret->type != MONO_TYPE_VOID;
+	if (save_retval_ptr)
+		stack_size += 8;
+
+	/* Ensure stack is 16 byte aligned when entering called function as required by calling convention.
+	 * Getting this wrong results in a general protection fault on an SSE load or store somewhere in the
+	 * code called under the trampoline. align_pad is 0 when stack_size is already a multiple of 16.
+	 */
+	align_pad = (16 - (stack_size & 15)) & 15;
+	if (align_pad != 0)
+		x86_64_alu_reg_imm (p, X86_SUB, X86_64_RSP, align_pad);
+
+	/*
+	 * On entry to generated function:
+	 *     RDI has target function address
+	 *     RSI has return value location address
+	 *     RDX has this pointer address
+	 *     RCX has the pointer to the args array.
+	 *
+	 * Inside the stub function:
+	 *     R10 holds the pointer to the args
+	 *     R11 holds the target function address.
+	 *     The return value address is saved on the stack at RBP - align_pad - 8.
+	 *     The this pointer is moved into the next free arg register at the start.
+	 *
+	 * Optimization note: we could keep the args pointer in RCX and then
+	 * load over itself at the end. Ditto the callee address could be left in RDI in some cases.
+	 */
+
+	if (save_retval_ptr) {
+		/* Push the retval register so it is saved across the call. */
+		x86_64_push_reg (p, X86_64_RSI);
+	}
+
+	/* Move args pointer to temp register. */
+	x86_64_mov_reg_reg (p, X86_64_R10, X86_64_RCX, 8);
+	x86_64_mov_reg_reg (p, X86_64_R11, X86_64_RDI, 8);
+
+	/* First args register gets return value pointer, if need be.
+	 * Note that "byref" equal true means the called function returns a pointer.
+	 */
+	if (sig->ret->type == MONO_TYPE_VALUETYPE && !sig->ret->byref) {
+		MonoClass *klass = sig->ret->data.klass;
+		if (!klass->enumtype) {
+			x86_64_mov_reg_reg (p, int_arg_regs[next_int_arg_reg], X86_64_RSI, 8);
+			next_int_arg_reg++;
+		}
+	}
+
+	/* this pointer goes in next args register. */
+	if (sig->hasthis) {
+		x86_64_mov_reg_reg (p, int_arg_regs[next_int_arg_reg], X86_64_RDX, 8);
+		next_int_arg_reg++;
+	}
+
+	/*
+	 * Generate code to handle arguments in registers. Stack arguments will happen in a loop after this.
+	 */
+	arg_in_reg_bitvector = (char *)alloca((sig->param_count + 7) / 8);
+	memset(arg_in_reg_bitvector, 0, (sig->param_count + 7) / 8);
+
+	/* First, load all the arguments that are passed in registers into the appropriate registers.
+	 * Below there is another loop to handle arguments passed on the stack.
+	 */
+	for (i = 0; i < sig->param_count; i++) {
+		arg_pos = ARG_SIZE * i;
+
+		if (sig->params [i]->byref)
+			simpletype = MONO_TYPE_PTR;
+		else
+			simpletype = sig->params [i]->type;
+enum_marshal:
+		switch (simpletype) {
+		case MONO_TYPE_BOOLEAN:
+		case MONO_TYPE_I1:
+		case MONO_TYPE_U1:
+		case MONO_TYPE_I2:
+		case MONO_TYPE_U2:
+		case MONO_TYPE_CHAR:
+		case MONO_TYPE_I4:
+		case MONO_TYPE_U4:
+		case MONO_TYPE_I:
+		case MONO_TYPE_U:
+		case MONO_TYPE_PTR:
+		case MONO_TYPE_OBJECT:
+		case MONO_TYPE_STRING:
+		case MONO_TYPE_SZARRAY:
+		case MONO_TYPE_I8:
+		case MONO_TYPE_U8:
+		case MONO_TYPE_CLASS:
+			if (next_int_arg_reg < MAX_INT_ARG_REGS) {
+				x86_64_mov_reg_membase (p, int_arg_regs[next_int_arg_reg], X86_64_R10, arg_pos, 8);
+				next_int_arg_reg++;
+				arg_in_reg_bitvector[i >> 3] |= (1 << (i & 7));
+			}
+			break;
+		case MONO_TYPE_R4:
+			if (next_float_arg_reg < MAX_FLOAT_ARG_REGS) {
+				x86_64_movss_reg_membase (p, next_float_arg_reg, X86_64_R10, arg_pos);
+				next_float_arg_reg++;
+				arg_in_reg_bitvector[i >> 3] |= (1 << (i & 7));
+			}
+			break;
+		case MONO_TYPE_R8:
+			if (next_float_arg_reg < MAX_FLOAT_ARG_REGS) {
+				x86_64_movsd_reg_membase (p, next_float_arg_reg, X86_64_R10, arg_pos);
+				next_float_arg_reg++;
+				arg_in_reg_bitvector[i >> 3] |= (1 << (i & 7));
+			}
+			break;
+		case MONO_TYPE_VALUETYPE: {
+			if (!sig->params [i]->data.klass->enumtype) {
+				int size;
+				int arg_type;
+				int regs_used;
+				int offset1;
+				int size1;
+				int offset2;
+				int size2;
+
+				arg_type = value_type_info(sig->params [i]->data.klass, &size, &regs_used, &offset1, &size1, &offset2, &size2);
+
+				if (arg_type == ARG_IN_INT_REGS &&
+				    (next_int_arg_reg + regs_used) <= MAX_INT_ARG_REGS) {
+					x86_64_mov_reg_membase (p, int_arg_regs[next_int_arg_reg], X86_64_R10, arg_pos + offset1, size1);
+					next_int_arg_reg++;
+					if (regs_used > 1) {
+						x86_64_mov_reg_membase (p, int_arg_regs[next_int_arg_reg], X86_64_R10, arg_pos + offset2, size2);
+						next_int_arg_reg++;
+					}
+					arg_in_reg_bitvector[i >> 3] |= (1 << (i & 7));
+					break;
+				}
+
+				if (arg_type == ARG_IN_FLOAT_REGS &&
+				    (next_float_arg_reg + regs_used) <= MAX_FLOAT_ARG_REGS) {
+					if (size1 == 4)
+						x86_64_movss_reg_membase (p, next_float_arg_reg, X86_64_R10, arg_pos + offset1);
+					else
+						x86_64_movsd_reg_membase (p, next_float_arg_reg, X86_64_R10, arg_pos + offset1);
+					next_float_arg_reg++;
+
+					if (regs_used > 1) {
+						if (size2 == 4)
+							x86_64_movss_reg_membase (p, next_float_arg_reg, X86_64_R10, arg_pos + offset2);
+						else
+							x86_64_movsd_reg_membase (p, next_float_arg_reg, X86_64_R10, arg_pos + offset2);
+						next_float_arg_reg++;
+					}
+					arg_in_reg_bitvector[i >> 3] |= (1 << (i & 7));
+					break;
+				}
+
+				/* Structs in memory are handled in the next loop. */
+			} else {
+				/* it's an enum value */
+				simpletype = sig->params [i]->data.klass->enum_basetype->type;
+				goto enum_marshal;
+			}
+			break;
+		}
+		default:
+			g_error ("Can't trampoline 0x%x", sig->params [i]->type);
+		}
+	}
+
+	/* Handle stack arguments, pushing the rightmost argument first. */
+	for (i = sig->param_count; i > 0; --i) {
+		arg_pos = ARG_SIZE * (i - 1);
+		if (sig->params [i - 1]->byref)
+			simpletype = MONO_TYPE_PTR;
+		else
+			simpletype = sig->params [i - 1]->type;
+enum_marshal2:
+		switch (simpletype) {
+		case MONO_TYPE_BOOLEAN:
+		case MONO_TYPE_I1:
+		case MONO_TYPE_U1:
+		case MONO_TYPE_I2:
+		case MONO_TYPE_U2:
+		case MONO_TYPE_CHAR:
+		case MONO_TYPE_I4:
+		case MONO_TYPE_U4:
+		case MONO_TYPE_I:
+		case MONO_TYPE_U:
+		case MONO_TYPE_PTR:
+		case MONO_TYPE_OBJECT:
+		case MONO_TYPE_STRING:
+		case MONO_TYPE_SZARRAY:
+		case MONO_TYPE_I8:
+		case MONO_TYPE_U8:
+		case MONO_TYPE_CLASS:
+			if ((arg_in_reg_bitvector[(i - 1) >> 3] & (1 << ((i - 1) & 7))) == 0) {
+				x86_64_push_membase (p, X86_64_R10, arg_pos);
+			}
+			break;
+		case MONO_TYPE_R4:
+		case MONO_TYPE_R8:
+			if ((arg_in_reg_bitvector[(i - 1) >> 3] & (1 << ((i - 1) & 7))) == 0) {
+				x86_64_push_membase (p, X86_64_R10, arg_pos);
+			}
+			break;
+		case MONO_TYPE_VALUETYPE:
+			if (!sig->params [i - 1]->data.klass->enumtype) {
+				if ((arg_in_reg_bitvector[(i - 1) >> 3] & (1 << ((i - 1) & 7))) == 0) {
+					int ss = mono_class_native_size (sig->params [i - 1]->data.klass, NULL);
+					ss += 7;
+					ss &= ~7;
+
+					/* NOTE(review): RCX, RSI and RDI are entries of int_arg_regs, which the
+					 * register-loading loop above may already have filled with arguments.
+					 * This copy overwrites them - needs confirming/fixing.
+					 */
+					x86_64_alu_reg_imm(p, X86_SUB, X86_64_RSP, ss);
+					/* Count register */
+					x86_64_mov_reg_imm(p, X86_64_RCX, ss);
+					/* Source register */
+					x86_64_lea_membase(p, X86_64_RSI, X86_64_R10, arg_pos);
+					/* Dest register */
+					x86_64_mov_reg_reg(p, X86_64_RDI, X86_64_RSP, 8);
+
+					/* AMD64 calling convention guarantees direction flag is clear at call boundary. */
+					x86_prefix(p, X86_64_REX(X86_64_REX_W));
+					x86_prefix(p, X86_REP_PREFIX);
+					x86_movsb(p);
+				}
+			} else {
+				/* it's an enum value */
+				simpletype = sig->params [i - 1]->data.klass->enum_basetype->type;
+				goto enum_marshal2;
+			}
+			break;
+		default:
+			g_error ("Can't trampoline 0x%x", sig->params [i - 1]->type);
+		}
+	}
+
+	/* TODO: Set AL to number of XMM registers used in case this is a varargs function? */
+
+	/*
+	 * Insert call to function
+	 */
+	x86_64_call_reg (p, X86_64_R11);
+
+	/* Reload the result pointer from its slot relative to RBP. A pop would only be correct
+	 * when no stack arguments were pushed after it, because those are still on the stack.
+	 */
+	if (save_retval_ptr)
+		x86_64_mov_reg_membase (p, X86_64_RSI, X86_64_RBP, -(gint32)(align_pad + 8), 8);
+	/*
+	 * Handle retval.
+	 * Integer and pointer values are in RAX.
+	 * FP values are in XMM0.
+	 * Small value types come back in RAX/RDX or XMM0/XMM1.
+	 */
+
+	if (sig->ret->byref || string_ctor) {
+		simpletype = MONO_TYPE_PTR;
+	} else {
+		simpletype = sig->ret->type;
+	}
+	enum_retvalue:
+	switch (simpletype) {
+		case MONO_TYPE_BOOLEAN:
+		case MONO_TYPE_I1:
+		case MONO_TYPE_U1:
+			x86_64_mov_regp_reg (p, X86_64_RSI, X86_EAX, 1);
+			break;
+		case MONO_TYPE_CHAR:
+		case MONO_TYPE_I2:
+		case MONO_TYPE_U2:
+			x86_64_mov_regp_reg (p, X86_64_RSI, X86_EAX, 2);
+			break;
+		case MONO_TYPE_I4:
+		case MONO_TYPE_U4:
+		case MONO_TYPE_I:
+		case MONO_TYPE_U:
+		case MONO_TYPE_CLASS:
+		case MONO_TYPE_OBJECT:
+		case MONO_TYPE_SZARRAY:
+		case MONO_TYPE_ARRAY:
+		case MONO_TYPE_STRING:
+		case MONO_TYPE_PTR:
+		case MONO_TYPE_I8:
+		case MONO_TYPE_U8:
+			x86_64_mov_regp_reg (p, X86_64_RSI, X86_EAX, 8);
+			break;
+		case MONO_TYPE_R4:
+			x86_64_movss_regp_reg (p, X86_64_RSI, X86_64_XMM0);
+			break;
+		case MONO_TYPE_R8:
+			x86_64_movsd_regp_reg (p, X86_64_RSI, X86_64_XMM0);
+			break;
+		case MONO_TYPE_VALUETYPE: {
+			int size;
+			int arg_type;
+			int regs_used;
+			int offset1;
+			int size1;
+			int offset2;
+			int size2;
+
+			if (sig->ret->data.klass->enumtype) {
+				simpletype = sig->ret->data.klass->enum_basetype->type;
+				goto enum_retvalue;
+			}
+			/* Classify the return type itself; no parameter index is valid at this point. */
+			arg_type = value_type_info(sig->ret->data.klass, &size, &regs_used, &offset1, &size1, &offset2, &size2);
+
+			if (arg_type == ARG_IN_INT_REGS) {
+				x86_64_mov_membase_reg (p, X86_64_RSI, offset1, X86_64_RAX, size1);
+				if (regs_used > 1)
+					x86_64_mov_membase_reg (p, X86_64_RSI, offset2, X86_64_RDX, size2);
+				break;
+			}
+
+			if (arg_type == ARG_IN_FLOAT_REGS) {
+				if (size1 == 4)
+					x86_64_movss_membase_reg (p, X86_64_RSI, offset1, X86_64_XMM0);
+				else
+					x86_64_movsd_membase_reg (p, X86_64_RSI, offset1, X86_64_XMM0);
+
+				if (regs_used > 1) {
+					if (size2 == 4)
+						x86_64_movss_membase_reg (p, X86_64_RSI, offset2, X86_64_XMM1);
+					else
+						x86_64_movsd_membase_reg (p, X86_64_RSI, offset2, X86_64_XMM1);
+				}
+				break;
+			}
+
+			/* Else result should have been stored in place already. */
+			break;
+		}
+		case MONO_TYPE_VOID:
+			break;
+		default:
+			g_error ("Can't handle as return value 0x%x", sig->ret->type);
+	}
+
+	/*
+	 * Standard epilog.
+	 */
+	x86_64_leave (p);
+	x86_64_ret (p);
+
+	g_assert (p - code_buffer < code_size);
+	res = (MonoPIFunc)g_memdup (code_buffer, p - code_buffer);
+
+	g_hash_table_insert (cache, sig, res);
+
+	return res;
+}
+
+/*
+ * Returns a pointer to a native function that can be used to
+ * call the specified method.
+ * The function created will receive the arguments according
+ * to the call convention specified in the method.
+ * This function works by creating a MonoInvocation structure,
+ * filling the fields in and calling ves_exec_method on it.
+ * Still need to figure out how to handle the exception stuff
+ * across the managed/unmanaged boundary.
+ */
+void *
+mono_arch_create_method_pointer (MonoMethod *method)
+{
+	MonoMethodSignature *sig;
+	MonoJitInfo *ji;
+	unsigned char *p, *code_buffer;
+	guint32 simpletype;
+	gint32 local_size;
+	gint32 stackval_pos;
+	gint32 mono_invocation_pos;
+	int i, cpos;
+	int *vtbuf;
+	int *rbpoffsets;
+	int int_arg_regs_used = 0;
+	int float_arg_regs_used = 0;
+	int stacked_args_size = 0; /* bytes of register passed arguments pushed on stack for safe keeping. Used to get alignment right. */
+	int next_stack_arg_rbp_offset = 16;
+	int retval_ptr_rbp_offset = 0;
+	int this_reg; /* Remember register this ptr is in. */
+
+	/*
+	 * If it is a static P/Invoke method, we can just return the pointer
+	 * to the method implementation.
+	 */
+	if (method->flags & METHOD_ATTRIBUTE_PINVOKE_IMPL && method->addr) {
+		ji = g_new0 (MonoJitInfo, 1);
+		ji->method = method;
+		ji->code_size = 1;
+		ji->code_start = method->addr;
+
+		mono_jit_info_table_add (mono_root_domain, ji);
+		return method->addr;
+	}
+
+	sig = method->signature;
+
+	code_buffer = p = alloca (512); /* FIXME: check for overflows... */
+	vtbuf = alloca (sizeof(int)*sig->param_count);
+	rbpoffsets = alloca (sizeof(int)*sig->param_count);
+
+
+	/*
+	 * Standard function prolog.
+	 */
+	x86_64_push_reg (p, X86_64_RBP);
+	x86_64_mov_reg_reg (p, X86_64_RBP, X86_64_RSP, 8);
+
+	/* If there is an implicit return value pointer in the first args reg, save it now so
+	 * the result can be stored through the pointer at the end.
+	 */
+	if (sig->ret->type == MONO_TYPE_VALUETYPE && !sig->ret->byref && !sig->ret->data.klass->enumtype) 
+	{
+		x86_64_push_reg (p, int_arg_regs[int_arg_regs_used]);
+		int_arg_regs_used++;
+		stacked_args_size += 8;
+		retval_ptr_rbp_offset = -stacked_args_size;
+	}
+
+	/*
+	 * If there is a this pointer, remember the number of the register it is in.
+	 */
+	if (sig->hasthis) {
+		this_reg = int_arg_regs[int_arg_regs_used++];
+	}
+
+	/* Put all arguments passed in registers on the stack.
+	 * Record offsets from RBP to each argument.
+	 */
+	cpos = 0;
+
+	for (i = 0; i < sig->param_count; i++) {
+		if (sig->params [i]->byref)
+			simpletype = MONO_TYPE_PTR;
+		else
+			simpletype = sig->params [i]->type;
+enum_calc_size:
+		switch (simpletype) {
+		case MONO_TYPE_BOOLEAN:
+		case MONO_TYPE_CHAR:
+		case MONO_TYPE_I1:
+		case MONO_TYPE_U1:
+		case MONO_TYPE_I2:
+		case MONO_TYPE_U2:
+		case MONO_TYPE_I4:
+		case MONO_TYPE_U4:
+		case MONO_TYPE_I:
+		case MONO_TYPE_U:
+		case MONO_TYPE_PTR:
+		case MONO_TYPE_SZARRAY:
+		case MONO_TYPE_CLASS:
+		case MONO_TYPE_OBJECT:
+		case MONO_TYPE_STRING:
+		case MONO_TYPE_I8:
+			if (int_arg_regs_used < MAX_INT_ARG_REGS) {
+				x86_64_push_reg (p, int_arg_regs[int_arg_regs_used]);
+				int_arg_regs_used++;
+				stacked_args_size += 8;
+				rbpoffsets[i] = -stacked_args_size;
+			}
+			else
+			{
+				rbpoffsets[i] = next_stack_arg_rbp_offset;
+				next_stack_arg_rbp_offset += 8;
+			}
+			break;
+		case MONO_TYPE_VALUETYPE: {
+			if (sig->params [i]->data.klass->enumtype) {
+				simpletype = sig->params [i]->data.klass->enum_basetype->type;
+				goto enum_calc_size;
+			}
+			else
+			{
+				int size;
+				int arg_type;
+				int regs_used;
+				int offset1;
+				int size1;
+				int offset2;
+				int size2;
+
+				arg_type = value_type_info(sig->params [i]->data.klass, &size, &regs_used, &offset1, &size1, &offset2, &size2);
+
+				if (arg_type == ARG_IN_INT_REGS &&
+				    (int_arg_regs_used + regs_used) <= MAX_INT_ARG_REGS)
+				{
+					x86_64_alu_reg_imm (p, X86_SUB, X86_64_RSP, size);
+					stacked_args_size += size;
+					rbpoffsets[i] = stacked_args_size;
+
+					x86_64_mov_reg_membase (p, int_arg_regs[int_arg_regs_used], X86_64_RSP, offset1, size1);
+					int_arg_regs_used++;
+					if (regs_used > 1)
+					{
+						x86_64_mov_reg_membase (p, int_arg_regs[int_arg_regs_used], X86_64_RSP, offset2, size2);
+						int_arg_regs_used++;
+					}
+					break;
+				}
+
+				if (arg_type == ARG_IN_FLOAT_REGS &&
+				    (float_arg_regs_used + regs_used) <= MAX_FLOAT_ARG_REGS)
+				{
+					x86_64_alu_reg_imm (p, X86_SUB, X86_64_RSP, size);
+					stacked_args_size += size;
+					rbpoffsets[i] = stacked_args_size;
+
+					if (size1 == 4)
+						x86_64_movss_reg_membase (p, float_arg_regs_used, X86_64_RSP, offset1);
+					else
+						x86_64_movsd_reg_membase (p, float_arg_regs_used, X86_64_RSP, offset1);
+					float_arg_regs_used++;
+
+					if (regs_used > 1)
+					{
+						if (size2 == 4)
+							x86_64_movss_reg_membase (p, float_arg_regs_used, X86_64_RSP, offset2);
+						else
+							x86_64_movsd_reg_membase (p, float_arg_regs_used, X86_64_RSP, offset2);
+						float_arg_regs_used++;
+					}
+					break;
+				}
+
+				rbpoffsets[i] = next_stack_arg_rbp_offset;
+				next_stack_arg_rbp_offset += size;
+			}
+			break;
+		}
+		case MONO_TYPE_R4:
+			if (float_arg_regs_used < MAX_FLOAT_ARG_REGS) {
+				x86_64_alu_reg_imm (p, X86_SUB, X86_64_RSP, 8);
+				x86_64_movss_regp_reg (p, X86_64_RSP, float_arg_regs_used);
+				float_arg_regs_used++;
+				stacked_args_size += 8;
+				rbpoffsets[i] = -stacked_args_size;
+			}
+			else
+			{
+				rbpoffsets[i] = next_stack_arg_rbp_offset;
+				next_stack_arg_rbp_offset += 8;
+			}
+			break;
+		case MONO_TYPE_R8:
+			stacked_args_size += 8;
+			if (float_arg_regs_used < MAX_FLOAT_ARG_REGS) {
+				x86_64_alu_reg_imm (p, X86_SUB, X86_64_RSP, 8);
+				x86_64_movsd_regp_reg (p, X86_64_RSP, float_arg_regs_used);
+				float_arg_regs_used++;
+				stacked_args_size += 8;
+				rbpoffsets[i] = -stacked_args_size;
+			}
+			else
+			{
+				rbpoffsets[i] = next_stack_arg_rbp_offset;
+				next_stack_arg_rbp_offset += 8;
+			}
+			break;
+		default:
+			g_error ("Can't trampoline 0x%x", sig->params [i]->type);
+		}
+	}
+
+	local_size = sizeof (MonoInvocation) + sizeof (stackval) * (sig->param_count + 1) + stacked_args_size;
+
+	local_size += 15;
+	local_size &= ~15;
+
+	stackval_pos = -local_size;
+	mono_invocation_pos = stackval_pos + sizeof (stackval) * (sig->param_count + 1);
+
+	/* stacked_args_size has already been pushed onto the stack. Make room for the rest of it. */
+	x86_64_alu_reg_imm (p, X86_SUB, X86_64_RSP, local_size - stacked_args_size);
+
+	/* Be careful not to trash any arg regs before saving this_reg to MonoInvocation structure below. */
+
+	/*
+	 * Initialize MonoInvocation fields, first the ones known now.
+	 */
+	x86_64_alu_reg_reg (p, X86_XOR, X86_64_RAX, X86_64_RAX);
+	x86_64_mov_membase_reg (p, X86_64_RBP, (mono_invocation_pos + G_STRUCT_OFFSET (MonoInvocation, ex)), X86_64_RAX, 8);
+	x86_64_mov_membase_reg (p, X86_64_RBP, (mono_invocation_pos + G_STRUCT_OFFSET (MonoInvocation, ex_handler)), X86_64_RAX, 8);
+	x86_64_mov_membase_reg (p, X86_64_RBP, (mono_invocation_pos + G_STRUCT_OFFSET (MonoInvocation, child)), X86_64_RAX, 8);
+	x86_64_mov_membase_reg (p, X86_64_RBP, (mono_invocation_pos + G_STRUCT_OFFSET (MonoInvocation, parent)), X86_64_RAX, 8);
+	/*
+	 * Set the method pointer.
+	 */
+	x86_64_mov_membase_imm (p, X86_64_RBP, (mono_invocation_pos + G_STRUCT_OFFSET (MonoInvocation, method)), (long)method, 8);
+
+	/*
+	 * Handle this.
+	 */
+	if (sig->hasthis)
+		x86_64_mov_membase_reg(p, X86_64_RBP, (mono_invocation_pos + G_STRUCT_OFFSET (MonoInvocation, obj)), this_reg, 8);
+
+	/*
+	 * Handle the arguments. stackval_pos is the offset from RBP of the stackval in the MonoInvocation args array .
+	 * arg_pos is the offset from RBP to the incoming arg on the stack.
+	 * We just call stackval_from_data to handle all the (nasty) issues....
+	 */
+	x86_64_lea_membase (p, X86_64_RAX, X86_64_RBP, stackval_pos);
+	x86_64_mov_membase_reg (p, X86_64_RBP, (mono_invocation_pos + G_STRUCT_OFFSET (MonoInvocation, stack_args)), X86_64_RAX, 8);
+	for (i = 0; i < sig->param_count; ++i) {
+/* Need to call stackval_from_data (MonoType *type, stackval *result, char *data, gboolean pinvoke); */
+		x86_64_mov_reg_imm (p, X86_64_R11, stackval_from_data);
+		x86_64_mov_reg_imm (p, int_arg_regs[0], sig->params[i]);
+		x86_64_lea_membase (p, int_arg_regs[1], X86_64_RBP, stackval_pos);
+		x86_64_lea_membase (p, int_arg_regs[2], X86_64_RBP, rbpoffsets[i]);
+		x86_64_mov_reg_imm (p, int_arg_regs[3], sig->pinvoke);
+		x86_64_call_reg (p, X86_64_R11);
+		stackval_pos += sizeof (stackval);
+#if 0
+		/* fixme: alignment */
+		if (sig->pinvoke)
+			arg_pos += mono_type_native_stack_size (sig->params [i], &align);
+		else
+			arg_pos += mono_type_stack_size (sig->params [i], &align);
+#endif
+	}
+
+	/*
+	 * Handle the return value storage area.
+	 */
+	x86_64_lea_membase (p, X86_64_RAX, X86_64_RBP, stackval_pos);
+	x86_64_mov_membase_reg (p, X86_64_RBP, (mono_invocation_pos + G_STRUCT_OFFSET (MonoInvocation, retval)), X86_64_RAX, 8);
+	if (sig->ret->type == MONO_TYPE_VALUETYPE && !sig->ret->byref) {
+		MonoClass *klass  = sig->ret->data.klass;
+		if (!klass->enumtype) {
+			x86_64_mov_reg_membase (p, X86_64_RCX, X86_64_RBP, retval_ptr_rbp_offset, 8);
+			x86_64_mov_membase_reg (p, X86_64_RBP, stackval_pos, X86_64_RCX, 8);
+		}
+	}
+
+	/*
+	 * Call the method.
+	 */
+	x86_64_lea_membase (p, int_arg_regs[0], X86_64_RBP, mono_invocation_pos);
+	x86_64_mov_reg_imm (p, X86_64_R11, ves_exec_method);
+	x86_64_call_reg (p, X86_64_R11);
+	
+	/*
+	 * Move the return value to the proper place.
+	 */
+	x86_64_lea_membase (p, X86_64_RAX, X86_64_RBP, stackval_pos);
+	if (sig->ret->byref) {
+		x86_64_mov_reg_membase (p, X86_64_RAX, X86_64_RAX, 0, 8);
+	} else {
+		int simpletype = sig->ret->type;	
+	enum_retvalue:
+		switch (sig->ret->type) {
+		case MONO_TYPE_VOID:
+			break;
+		case MONO_TYPE_BOOLEAN:
+		case MONO_TYPE_I1:
+		case MONO_TYPE_U1:
+			x86_64_movzx_reg_membase (p, X86_64_RAX, X86_64_RAX, 0, 1);
+			break;
+		case MONO_TYPE_CHAR:
+		case MONO_TYPE_I2:
+		case MONO_TYPE_U2:
+			x86_64_movzx_reg_membase (p, X86_64_RAX, X86_64_RAX, 0, 2);
+			break;
+		case MONO_TYPE_I4:
+		case MONO_TYPE_U4:
+		case MONO_TYPE_I:
+		case MONO_TYPE_U:
+		case MONO_TYPE_OBJECT:
+		case MONO_TYPE_STRING:
+		case MONO_TYPE_CLASS:
+			x86_64_movzx_reg_membase (p, X86_64_RAX, X86_64_RAX, 0, 4);
+			break;
+		case MONO_TYPE_I8:
+			x86_64_movzx_reg_membase (p, X86_64_RAX, X86_64_RAX, 0, 8);
+			break;
+		case MONO_TYPE_R4:
+			x86_64_movss_regp_reg (p, X86_64_RAX, X86_64_XMM0);
+			break;
+		case MONO_TYPE_R8:
+			x86_64_movsd_regp_reg (p, X86_64_RAX, X86_64_XMM0);
+			break;
+		case MONO_TYPE_VALUETYPE: {
+			int size;
+			int arg_type;
+			int regs_used;
+			int offset1;
+			int size1;
+			int offset2;
+			int size2;
+
+			if (sig->ret->data.klass->enumtype) {
+				simpletype = sig->ret->data.klass->enum_basetype->type;
+				goto enum_retvalue;
+			}
+
+			arg_type = value_type_info(sig->params [i]->data.klass, &size, &regs_used, &offset1, &size1, &offset2, &size2);
+
+			if (arg_type == ARG_IN_INT_REGS)
+			{
+				if (regs_used > 1)
+					x86_64_mov_membase_reg (p, X86_64_RAX, offset2, X86_64_RDX, size2);
+				x86_64_mov_membase_reg (p, X86_64_RAX, offset1, X86_64_RAX, size1);
+				break;
+			}
+
+			if (arg_type == ARG_IN_FLOAT_REGS)
+			{
+				if (size1 == 4)
+					x86_64_movss_membase_reg (p, X86_64_RAX, offset1, X86_64_XMM0);
+				else
+					x86_64_movsd_membase_reg (p, X86_64_RAX, offset1, X86_64_XMM0);
+
+				if (regs_used > 1)
+				{
+					if (size2 == 4)
+						x86_64_movss_membase_reg (p, X86_64_RAX, offset2, X86_64_XMM1);
+					else
+						x86_64_movsd_membase_reg (p, X86_64_RAX, offset2, X86_64_XMM1);
+				}
+				break;
+			}
+
+			/* Else result should have been stored in place already. IA32 code has a stackval_to_data call here, which
+			 * looks wrong to me as the pointer in the stack val being converted is setup to point to the output area anyway.
+			 * It all looks a bit suspect anyway.
+			 */
+			break;
+		}
+		default:
+			g_error ("Type 0x%x not handled yet in thunk creation", sig->ret->type);
+			break;
+		}
+	}
+	
+	/*
+	 * Standard epilog.
+	 */
+	x86_64_leave (p);
+	x86_64_ret (p);
+
+	g_assert (p - code_buffer < 512);
+
+	ji = g_new0 (MonoJitInfo, 1);
+	ji->method = method;
+	ji->code_size = p - code_buffer;
+	ji->code_start = g_memdup (code_buffer, p - code_buffer);
+
+	mono_jit_info_table_add (mono_root_domain, ji);
+
+	return ji->code_start;
+}
Index: mono/arch/x86_64/x86_64-codegen.h
===================================================================
--- /dev/null	2003-03-27 11:16:05.000000000 -0800
+++ mono/arch/x86_64/x86_64-codegen.h	2003-11-12 18:15:58.000000000 -0800
@@ -0,0 +1,1952 @@
+/*
+ * x86_64-codegen.h: Macros for generating x86 code
+ *
+ * Authors:
+ *   Paolo Molaro (lupus at ximian.com)
+ *   Intel Corporation (ORP Project)
+ *   Sergey Chaban (serge at wildwestsoftware.com)
+ *   Dietmar Maurer (dietmar at ximian.com)
+ *   Patrik Torstensson
+ *   Zalman Stern
+ * 
+ * Not all routines are done for AMD64. Much could also be removed from here if supporting tramp.c is the only goal.
+ * 
+ * Copyright (C)  2000 Intel Corporation.  All rights reserved.
+ * Copyright (C)  2001, 2002 Ximian, Inc.
+ */
+
+#ifndef X86_64_H
+#define X86_64_H
+#include <assert.h>
+/*
+// x86 register numbers.
+// The values are the 3-bit register encodings that x86_address_byte()
+// packs into the reg and r/m fields of an addressing byte.
+*/
+typedef enum {
+	X86_EAX = 0,
+	X86_ECX = 1,
+	X86_EDX = 2,
+	X86_EBX = 3,
+	X86_ESP = 4,
+	X86_EBP = 5,
+	X86_ESI = 6,
+	X86_EDI = 7,
+	X86_NREG	/* one past the last register number, i.e. 8 */
+} X86_Reg_No;
+
+/*
+ * AMD64 general-purpose register numbers.
+ * Numbers 0-7 share their encoding with X86_Reg_No. Numbers 8-15 do not fit
+ * in a 3-bit register field; the x86_64_* macros test for "(reg) > 7" and set
+ * the matching REX prefix bit to select them.
+ */
+typedef enum {
+	X86_64_RAX = 0,
+	X86_64_RCX = 1,
+	X86_64_RDX = 2,
+	X86_64_RBX = 3,
+	X86_64_RSP = 4,
+	X86_64_RBP = 5,
+	X86_64_RSI = 6,
+	X86_64_RDI = 7,
+	X86_64_R8 = 8,
+	X86_64_R9 = 9,
+	X86_64_R10 = 10,
+	X86_64_R11 = 11,
+	X86_64_R12 = 12,
+	X86_64_R13 = 13,
+	X86_64_R14 = 14,
+	X86_64R_14 = X86_64_R14,	/* original misspelling, kept so existing users still compile */
+	X86_64_R15 = 15,
+	X86_64_NREG	/* one past the last register number, i.e. 16 */
+} X86_64_Reg_No;
+
+/*
+ * Flag bits of the REX prefix byte; OR them together and pass the result to
+ * X86_64_REX().
+ *   REX_W - used by the x86_64_* macros to request a 64-bit operand size.
+ *   REX_R - set when the register placed in the ModRM reg field is > 7.
+ *   REX_B - set when the register placed in the ModRM r/m (or base) field is > 7.
+ *   REX_X - not used by the macros in this part of the file; presumably the
+ *           extension bit for the SIB index register.
+ */
+typedef enum
+{
+  X86_64_REX_B = 1,
+  X86_64_REX_X = 2,
+  X86_64_REX_R = 4,
+  X86_64_REX_W = 8
+} X86_64_REX_Bits;
+
+/* Build the REX prefix byte: the fixed 0x40 pattern ORed with the flag bits. */
+#define X86_64_REX(bits) ((unsigned char)(0x40 | (bits)))
+
+/*
+ * SSE (XMM) register numbers. X86_64_XMM_NREG is the number of registers (16).
+ */
+typedef enum {
+	X86_64_XMM0 = 0,
+	X86_64_XMM1 = 1,
+	X86_64_XMM2 = 2,
+	X86_64_XMM3 = 3,
+	X86_64_XMM4 = 4,
+	X86_64_XMM5 = 5,
+	X86_64_XMM6 = 6,
+	X86_64_XMM7 = 7,	/* was missing from the original list */
+	X86_64_XMM8 = 8,
+	X86_64_XMM9 = 9,
+	X86_64_XMM10 = 10,
+	X86_64_XMM11 = 11,
+	X86_64_XMM12 = 12,
+	X86_64_XMM13 = 13,
+	X86_64_XMM14 = 14,
+	X86_64_XMM15 = 15,
+	X86_64_XMM_NREG = 16
+} X86_64_XMM_Reg_No;
+
+/*
+// opcodes for alu instructions
+// The value is used in two ways by the x86_alu_* macros: as the ModRM reg
+// field of the 0x81/0x83 immediate forms, and shifted left by 3 to form
+// the opcode byte of the register/memory forms.
+*/
+typedef enum {
+	X86_ADD = 0,
+	X86_OR  = 1,
+	X86_ADC = 2,
+	X86_SBB = 3,
+	X86_AND = 4,
+	X86_SUB = 5,
+	X86_XOR = 6,
+	X86_CMP = 7,
+	X86_NALU	/* number of ALU opcodes (8) */
+} X86_ALU_Opcode;
+/*
+// opcodes for shift instructions
+// X86_ROL .. X86_SAR are placed in the ModRM reg field by the x86_shift_*
+// macros (opcodes 0xd1, 0xc1 and 0xd3).
+// NOTE(review): X86_SHLD and X86_SHLR have no explicit value, so they get
+// 0 and 1 and therefore compare equal to X86_ROL and X86_ROR. They are not
+// used by the shift macros visible in this part of the file - confirm
+// that no caller passes them as a shift opcode.
+*/
+typedef enum {
+	X86_SHLD,
+	X86_SHLR,
+	X86_ROL = 0,
+	X86_ROR = 1,
+	X86_RCL = 2,
+	X86_RCR = 3,
+	X86_SHL = 4,
+	X86_SHR = 5,
+	X86_SAR = 7,
+	X86_NSHIFT = 8
+} X86_Shift_Opcode;
+/*
+// opcodes for floating-point instructions
+// The macros that consume these values are not in this part of the file;
+// presumably each value is the ModRM reg field of the x87 arithmetic
+// opcodes - confirm against the x86_fp_* macros.
+*/
+typedef enum {
+	X86_FADD  = 0,
+	X86_FMUL  = 1,
+	X86_FCOM  = 2,
+	X86_FCOMP = 3,
+	X86_FSUB  = 4,
+	X86_FSUBR = 5,
+	X86_FDIV  = 6,
+	X86_FDIVR = 7,
+	X86_NFP   = 8
+} X86_FP_Opcode;
+/*
+// integer conditions codes
+// Each value is an index into x86_cc_unsigned_map / x86_cc_signed_map,
+// which hold the corresponding conditional-jump opcode. Several names
+// share one value because they are synonyms for the same condition.
+*/
+typedef enum {
+	X86_CC_EQ = 0, X86_CC_E = 0, X86_CC_Z = 0,
+	X86_CC_NE = 1, X86_CC_NZ = 1,
+	X86_CC_LT = 2, X86_CC_B = 2, X86_CC_C = 2, X86_CC_NAE = 2,
+	X86_CC_LE = 3, X86_CC_BE = 3, X86_CC_NA = 3,
+	X86_CC_GT = 4, X86_CC_A = 4, X86_CC_NBE = 4,
+	X86_CC_GE = 5, X86_CC_AE = 5, X86_CC_NB = 5, X86_CC_NC = 5,
+	X86_CC_LZ = 6, X86_CC_S = 6,
+	X86_CC_GEZ = 7, X86_CC_NS = 7,
+	X86_CC_P = 8, X86_CC_PE = 8,
+	X86_CC_NP = 9, X86_CC_PO = 9,
+	X86_CC_O = 10,
+	X86_CC_NO = 11,
+	X86_NCC	/* number of distinct condition codes (12); size of the cc maps */
+} X86_CC;
+
+/*
+ * FP status: bit masks for the condition flags C0-C3.
+ * X86_FP_CC_MASK is C0 | C2 | C3 (0x100 | 0x400 | 0x4000).
+ */
+enum {
+	X86_FP_C0 = 0x100,
+	X86_FP_C1 = 0x200,
+	X86_FP_C2 = 0x400,
+	X86_FP_C3 = 0x4000,
+	X86_FP_CC_MASK = 0x4500
+};
+
+/*
+ * FP control word: the *_MASK constants select fields of the word; the
+ * PREC_* and ROUND_* constants are the values those fields can take
+ * (each lies inside X86_FPCW_PRECC_MASK / X86_FPCW_ROUNDC_MASK respectively).
+ */
+enum {
+	X86_FPCW_INVOPEX_MASK = 0x1,
+	X86_FPCW_DENOPEX_MASK = 0x2,
+	X86_FPCW_ZERODIV_MASK = 0x4,
+	X86_FPCW_OVFEX_MASK   = 0x8,
+	X86_FPCW_UNDFEX_MASK  = 0x10,
+	X86_FPCW_PRECEX_MASK  = 0x20,
+	X86_FPCW_PRECC_MASK   = 0x300,
+	X86_FPCW_ROUNDC_MASK  = 0xc00,
+
+	/* values for precision control */
+	X86_FPCW_PREC_SINGLE    = 0,
+	X86_FPCW_PREC_DOUBLE    = 0x200,
+	X86_FPCW_PREC_EXTENDED  = 0x300,
+
+	/* values for rounding control */
+	X86_FPCW_ROUND_NEAREST  = 0,
+	X86_FPCW_ROUND_DOWN     = 0x400,
+	X86_FPCW_ROUND_UP       = 0x800,
+	X86_FPCW_ROUND_TOZERO   = 0xc00
+};
+
+/*
+// prefix code
+// Byte values to pass to x86_prefix(). Some names are aliases for the
+// same byte: REPZ/REP (0xF3), UNLIKELY/CS (0x2E) and LIKELY/DS (0x3E).
+*/
+typedef enum {
+	X86_LOCK_PREFIX = 0xF0,
+	X86_REPNZ_PREFIX = 0xF2,
+	X86_REPZ_PREFIX = 0xF3, 
+	X86_REP_PREFIX = 0xF3,
+	X86_CS_PREFIX = 0x2E,
+	X86_SS_PREFIX = 0x36,
+	X86_DS_PREFIX = 0x3E,
+	X86_ES_PREFIX = 0x26,
+	X86_FS_PREFIX = 0x64,
+	X86_GS_PREFIX = 0x65,
+	X86_UNLIKELY_PREFIX = 0x2E,
+	X86_LIKELY_PREFIX = 0x3E,
+	X86_OPERAND_PREFIX = 0x66,
+	X86_ADDRESS_PREFIX = 0x67
+} X86_Prefix;
+
+/*
+ * Conditional-jump opcode for each X86_CC value when the comparison is
+ * unsigned. Indexed by X86_CC; differs from the signed table only in
+ * the lt/le/gt/ge entries.
+ */
+static const unsigned char 
+x86_cc_unsigned_map [X86_NCC] = {
+	0x74, /* eq  */
+	0x75, /* ne  */
+	0x72, /* lt  */
+	0x76, /* le  */
+	0x77, /* gt  */
+	0x73, /* ge  */
+	0x78, /* lz  */
+	0x79, /* gez */
+	0x7a, /* p   */
+	0x7b, /* np  */
+	0x70, /* o  */
+	0x71, /* no  */
+};
+
+/*
+ * Conditional-jump opcode for each X86_CC value when the comparison is
+ * signed. Indexed by X86_CC.
+ */
+static const unsigned char 
+x86_cc_signed_map [X86_NCC] = {
+	0x74, /* eq  */
+	0x75, /* ne  */
+	0x7c, /* lt  */
+	0x7e, /* le  */
+	0x7f, /* gt  */
+	0x7d, /* ge  */
+	0x78, /* lz  */
+	0x79, /* gez */
+	0x7a, /* p   */
+	0x7b, /* np  */
+	0x70, /* o  */
+	0x71, /* no  */
+};
+
+/*
+ * Scratch union used by x86_imm_emit32 to split a 32-bit immediate into
+ * four bytes, which are then written to the code stream in host memory
+ * order (b[0] first).
+ */
+typedef union {
+	int val;
+	unsigned char b [4];
+} x86_imm_buf;
+
+/*
+ * 64-bit counterpart used by x86_imm_emit64.
+ * Assumes sizeof (long) == 8, since all eight bytes of b are emitted.
+ */
+typedef union {
+	long val;
+	unsigned char b [8];
+} x86_64_imm_buf;
+
+/* Passed as basereg to x86_memindex_emit to request "no base register". */
+#define X86_NOBASEREG (-1)
+
+/*
+// bitvector mask for callee-saved registers
+*/
+#define X86_ESI_MASK (1<<X86_ESI)
+#define X86_EDI_MASK (1<<X86_EDI)
+#define X86_EBX_MASK (1<<X86_EBX)
+#define X86_EBP_MASK (1<<X86_EBP)
+
+/*
+ * Register sets as bit vectors (bit n set = register number n is a member).
+ * NOTE(review): X86_CALLEE_REGS holds EAX/ECX/EDX and X86_CALLER_REGS holds
+ * EBX/EBP/ESI/EDI, so X86_IS_SCRATCH actually tests EBX/EBP/ESI/EDI and
+ * X86_IS_CALLEE tests EAX/ECX/EDX - the opposite of what the trailing
+ * comments on those two macros say. Confirm which is intended before
+ * relying on either the names or the comments.
+ */
+#define X86_CALLEE_REGS ((1<<X86_EAX) | (1<<X86_ECX) | (1<<X86_EDX))
+#define X86_CALLER_REGS ((1<<X86_EBX) | (1<<X86_EBP) | (1<<X86_ESI) | (1<<X86_EDI))
+#define X86_BYTE_REGS   ((1<<X86_EAX) | (1<<X86_ECX) | (1<<X86_EDX) | (1<<X86_EBX))
+
+#define X86_IS_SCRATCH(reg) (X86_CALLER_REGS & (1 << (reg))) /* X86_EAX, X86_ECX, or X86_EDX */
+#define X86_IS_CALLEE(reg)  (X86_CALLEE_REGS & (1 << (reg))) 	/* X86_ESI, X86_EDI, X86_EBX, or X86_EBP */
+
+/* True for register numbers 0-3 (EAX, ECX, EDX, EBX), the members of X86_BYTE_REGS. */
+#define X86_IS_BYTE_REG(reg) ((reg) < 4)
+
+/*
+// Frame structure:
+//
+//      +--------------------------------+
+//      | in_arg[0]       = var[0]	     |
+//      | in_arg[1]	      = var[1]	     |
+//      |	      . . .			         |
+//      | in_arg[n_arg-1] = var[n_arg-1] |
+//      +--------------------------------+
+//      |       return IP                |
+//      +--------------------------------+
+//      |       saved EBP                | <-- frame pointer (EBP)
+//      +--------------------------------+
+//      |            ...                 |  n_extra
+//      +--------------------------------+
+//      |	    var[n_arg]	             |
+//      |	    var[n_arg+1]             |  local variables area
+//      |          . . .                 |
+//      |	    var[n_var-1]             | 
+//      +--------------------------------+
+//      |			                     |
+//      |			                     |  
+//      |		spill area               | area for spilling mimic stack
+//      |			                     |
+//      +--------------------------------|
+//      |          ebx                   |
+//      |          ebp [ESP_Frame only]  |
+//      |	       esi                   |  0..3 callee-saved regs
+//      |          edi                   | <-- stack pointer (ESP)
+//      +--------------------------------+
+//      |	stk0	                     |
+//      |	stk1	                     |  operand stack area/
+//      |	. . .	                     |  out args
+//      |	stkn-1	                     |
+//      +--------------------------------|
+//
+//
+*/
+
+
+/*
+ * useful building blocks
+ *
+ * In every macro below, inst is an lvalue pointer into the code buffer; it
+ * is advanced past the bytes that were written. Arguments may be evaluated
+ * more than once, so do not pass expressions with side effects.
+ */
+
+/* Emit one addressing byte laid out as mm ooo rrr (2-bit m, 3-bit o, 3-bit r); extra high bits are masked off. */
+#define x86_address_byte(inst,m,o,r) do { *(inst)++ = ((((m)&0x03)<<6)|(((o)&0x07)<<3)|(((r)&0x07))); } while (0)
+/* Emit imm as 8 bytes, in host memory order. */
+#define x86_imm_emit64(inst,imm)     \
+	do {	\
+			x86_64_imm_buf imb; imb.val = (long) (imm);	\
+			*(inst)++ = imb.b [0];	\
+			*(inst)++ = imb.b [1];	\
+			*(inst)++ = imb.b [2];	\
+			*(inst)++ = imb.b [3];	\
+			*(inst)++ = imb.b [4];	\
+			*(inst)++ = imb.b [5];	\
+			*(inst)++ = imb.b [6];	\
+			*(inst)++ = imb.b [7];	\
+	} while (0)
+
+/* Emit imm, truncated to int, as 4 bytes in host memory order. */
+#define x86_imm_emit32(inst,imm)     \
+	do {	\
+			x86_imm_buf imb; imb.val = (int) (imm);	\
+			*(inst)++ = imb.b [0];	\
+			*(inst)++ = imb.b [1];	\
+			*(inst)++ = imb.b [2];	\
+			*(inst)++ = imb.b [3];	\
+	} while (0)
+/* Emit imm as 2 bytes via a direct short store through inst. */
+#define x86_imm_emit16(inst,imm)     do { *(short*)(inst) = (imm); (inst) += 2; } while (0)
+/* Emit the low 8 bits of imm. */
+#define x86_imm_emit8(inst,imm)      do { *(inst) = (unsigned char)((imm) & 0xff); ++(inst); } while (0)
+/* True when imm, converted to int, lies in -128..127. */
+#define x86_is_imm8(imm)             (((int)(imm) >= -128 && (int)(imm) <= 127))
+/*
+ * NOTE(review): accepts -(1<<16) .. (1<<16)-1, which is wider than a signed
+ * 16-bit value (-(1<<15) .. (1<<15)-1) - confirm the range is intended.
+ */
+#define x86_is_imm16(imm)            (((int)(imm) >= -(1<<16) && (int)(imm) <= ((1<<16)-1)))
+
+/* Register-direct operand: mod = 3, reg field r, r/m field regno. */
+#define x86_reg_emit(inst,r,regno)   do { x86_address_byte ((inst), 3, (r), (regno)); } while (0)
+/* As x86_reg_emit for 8-bit registers; a true is_rh / is_rnoh ORs 4 into r / regno (the high-byte registers, see x86_alu_reg8_reg8). */
+#define x86_reg8_emit(inst,r,regno,is_rh,is_rnoh)   do {x86_address_byte ((inst), 3, (is_rh)?((r)|4):(r), (is_rnoh)?((regno)|4):(regno));} while (0)
+/* Register-indirect operand with no displacement: mod = 0, r/m field regno. No special handling of any regno value. */
+#define x86_regp_emit(inst,r,regno)  do { x86_address_byte ((inst), 0, (r), (regno)); } while (0)
+/*
+ * 32-bit displacement-only operand: mod = 0, r/m = 5, then disp.
+ * NOTE(review): in 64-bit mode this ModRM form is RIP-relative rather than
+ * an absolute address - confirm callers on AMD64 account for that.
+ */
+#define x86_mem_emit(inst,r,disp)    do { x86_address_byte ((inst), 0, (r), 5); x86_imm_emit32((inst), (disp)); } while (0)
+
+/*
+ * x86_membase_emit: emit the addressing bytes for the operand [basereg + disp]
+ * with r in the ModRM reg field.
+ *
+ *   inst    - code pointer, advanced past the emitted bytes
+ *   r       - register number or opcode extension for the reg field
+ *   basereg - base register; may be an AMD64 register number (0-15), in
+ *             which case the caller is responsible for emitting REX.B
+ *   disp    - displacement; emitted as 0, 1 or 4 bytes depending on value
+ *
+ * Two base encodings need special treatment, and both are decided by the low
+ * three bits of the register number only:
+ *   - low bits 4 (ESP/RSP and R12): r/m = 4 means "SIB byte follows", so a
+ *     SIB byte with no index is emitted.
+ *   - low bits 5 (EBP/RBP and R13): mod = 0 with r/m = 5 means "disp32 only",
+ *     so a zero displacement is still emitted as an 8-bit 0.
+ * The register number is therefore masked with 0x7 before it is compared.
+ * For the 32-bit registers 0-7 this is identical to the unmasked comparison.
+ */
+#define x86_membase_emit(inst,r,basereg,disp)	do {\
+	if (((basereg) & 0x7) == X86_ESP) {	\
+		if ((disp) == 0) {	\
+			x86_address_byte ((inst), 0, (r), X86_ESP);	\
+			x86_address_byte ((inst), 0, X86_ESP, X86_ESP);	\
+		} else if (x86_is_imm8((disp))) {	\
+			x86_address_byte ((inst), 1, (r), X86_ESP);	\
+			x86_address_byte ((inst), 0, X86_ESP, X86_ESP);	\
+			x86_imm_emit8 ((inst), (disp));	\
+		} else {	\
+			x86_address_byte ((inst), 2, (r), X86_ESP);	\
+			x86_address_byte ((inst), 0, X86_ESP, X86_ESP);	\
+			x86_imm_emit32 ((inst), (disp));	\
+		}	\
+		break;	\
+	}	\
+	if ((disp) == 0 && ((basereg) & 0x7) != X86_EBP) {	\
+		x86_address_byte ((inst), 0, (r), (basereg));	\
+		break;	\
+	}	\
+	if (x86_is_imm8((disp))) {	\
+		x86_address_byte ((inst), 1, (r), (basereg));	\
+		x86_imm_emit8 ((inst), (disp));	\
+	} else {	\
+		x86_address_byte ((inst), 2, (r), (basereg));	\
+		x86_imm_emit32 ((inst), (disp));	\
+	}	\
+	} while (0)
+
+/*
+ * x86_memindex_emit: emit ModRM + SIB (+ displacement) for the operand
+ * [basereg + indexreg * (1 << shift) + disp], with r in the ModRM reg field.
+ *
+ *   basereg  - base register, or X86_NOBASEREG for [indexreg * scale + disp32]
+ *   indexreg - index register, placed in the SIB index field
+ *   shift    - scale as a 2-bit shift count, placed in the SIB scale field
+ *   disp     - displacement; emitted as 0, 1 or 4 bytes
+ *
+ * SIB base 5 is only correct in the no-base form (mod = 0), where it means
+ * "disp32, no base". In the 32-bit-displacement form (mod = 2) the SIB base
+ * field must carry the real base register; the original code hard-coded 5
+ * there, which addressed relative to EBP regardless of basereg.
+ */
+#define x86_memindex_emit(inst,r,basereg,disp,indexreg,shift)	\
+	do {	\
+		if ((basereg) == X86_NOBASEREG) {	\
+			x86_address_byte ((inst), 0, (r), 4);	\
+			x86_address_byte ((inst), (shift), (indexreg), 5);	\
+			x86_imm_emit32 ((inst), (disp));	\
+		} else if ((disp) == 0 && (basereg) != X86_EBP) {	\
+			x86_address_byte ((inst), 0, (r), 4);	\
+			x86_address_byte ((inst), (shift), (indexreg), (basereg));	\
+		} else if (x86_is_imm8((disp))) {	\
+			x86_address_byte ((inst), 1, (r), 4);	\
+			x86_address_byte ((inst), (shift), (indexreg), (basereg));	\
+			x86_imm_emit8 ((inst), (disp));	\
+		} else {	\
+			x86_address_byte ((inst), 2, (r), 4);	\
+			x86_address_byte ((inst), (shift), (indexreg), (basereg));	\
+			x86_imm_emit32 ((inst), (disp));	\
+		}	\
+	} while (0)
+
+/*
+ * x86_patch: fill in the displacement of an already-emitted branch.
+ *
+ * target is the position in the code where to jump to:
+ * target = code;
+ * .. output loop code...
+ * x86_mov_reg_imm (code, X86_EAX, 0);
+ * loop = code;
+ * x86_loop (code, -1);
+ * ... finish method
+ *
+ * patch displacement
+ * x86_patch (loop, target);
+ *
+ * ins should point at the start of the instruction that encodes a target.
+ * the instruction is inspected for validity and the correct displacement
+ * is inserted.
+ *
+ * The first opcode byte decides the displacement width: 0xe8/0xe9 and the
+ * two-byte 0x0f 0x70..0x8f forms get a 32-bit displacement; the loop,
+ * jump8 and one-byte conditional-jump opcodes get an 8-bit displacement.
+ * Any other opcode, or an 8-bit displacement that does not fit, trips
+ * assert (0). disp is computed from the byte after the opcode, so the
+ * width of the displacement field itself (4 or 1) is subtracted.
+ */
+#define x86_patch(ins,target)	\
+	do {	\
+		unsigned char* pos = (ins) + 1;	\
+		int disp, size = 0;	\
+		switch (*(unsigned char*)(ins)) {	\
+		case 0xe8: case 0xe9: ++size; break; /* call, jump32 */	\
+		case 0x0f: if (!(*pos >= 0x70 && *pos <= 0x8f)) assert (0);	\
+		   ++size; ++pos; break; /* prefix for 32-bit disp */	\
+		case 0xe0: case 0xe1: case 0xe2: /* loop */	\
+		case 0xeb: /* jump8 */	\
+		/* conditional jump opcodes */	\
+		case 0x70: case 0x71: case 0x72: case 0x73:	\
+		case 0x74: case 0x75: case 0x76: case 0x77:	\
+		case 0x78: case 0x79: case 0x7a: case 0x7b:	\
+		case 0x7c: case 0x7d: case 0x7e: case 0x7f:	\
+			break;	\
+		default: assert (0);	\
+		}	\
+		disp = (target) - pos;	\
+		if (size) x86_imm_emit32 (pos, disp - 4);	\
+		else if (x86_is_imm8 (disp - 1)) x86_imm_emit8 (pos, disp - 1);	\
+		else assert (0);	\
+	} while (0)
+
+/* Emit the single byte 0xcc. */
+#define x86_breakpoint(inst) \
+	do {	\
+		*(inst)++ = 0xcc;	\
+	} while (0)
+
+/*
+ * One-byte string/flag instructions: each macro appends exactly the opcode
+ * byte shown. x86_stosd and x86_movsd are alternate names for x86_stosl
+ * and x86_movsl.
+ */
+#define x86_cld(inst) do { *(inst)++ =(unsigned char)0xfc; } while (0)
+#define x86_stosb(inst) do { *(inst)++ =(unsigned char)0xaa; } while (0)
+#define x86_stosl(inst) do { *(inst)++ =(unsigned char)0xab; } while (0)
+#define x86_stosd(inst) x86_stosl((inst))
+#define x86_movsb(inst) do { *(inst)++ =(unsigned char)0xa4; } while (0)
+#define x86_movsl(inst) do { *(inst)++ =(unsigned char)0xa5; } while (0)
+#define x86_movsd(inst) x86_movsl((inst))
+
+/* Emit one prefix byte p (see X86_Prefix) ahead of the next instruction. */
+#define x86_prefix(inst,p) do { *(inst)++ =(unsigned char) (p); } while (0)
+
+/* Emit the two bytes 0x0f 0x31. */
+#define x86_rdtsc(inst) \
+	do {	\
+		*(inst)++ = 0x0f;	\
+		*(inst)++ = 0x31;	\
+	} while (0)
+
+/*
+ * cmpxchg family: opcode bytes 0x0f 0xb1 followed by the operand encoding.
+ * reg goes in the ModRM reg field; the other operand (register dreg,
+ * address mem, or [basereg + disp]) goes in the r/m part.
+ */
+#define x86_cmpxchg_reg_reg(inst,dreg,reg)	\
+	do {	\
+		*(inst)++ = (unsigned char)0x0f;	\
+		*(inst)++ = (unsigned char)0xb1;	\
+		x86_reg_emit ((inst), (reg), (dreg));	\
+	} while (0)
+	
+#define x86_cmpxchg_mem_reg(inst,mem,reg)	\
+	do {	\
+		*(inst)++ = (unsigned char)0x0f;	\
+		*(inst)++ = (unsigned char)0xb1;	\
+		x86_mem_emit ((inst), (reg), (mem));	\
+	} while (0)
+	
+#define x86_cmpxchg_membase_reg(inst,basereg,disp,reg)	\
+	do {	\
+		*(inst)++ = (unsigned char)0x0f;	\
+		*(inst)++ = (unsigned char)0xb1;	\
+		x86_membase_emit ((inst), (reg), (basereg), (disp));	\
+	} while (0)
+
+/*
+ * xchg family. A size of 1 selects the byte opcode 0x86; every other size
+ * selects 0x87. reg goes in the ModRM reg field, the other operand
+ * (register dreg, address mem, or [basereg + disp]) in the r/m part.
+ */
+#define x86_xchg_reg_reg(inst,dreg,reg,size)	\
+	do {	\
+		*(inst)++ = (unsigned char)(((size) == 1) ? 0x86 : 0x87);	\
+		x86_reg_emit ((inst), (reg), (dreg));	\
+	} while (0)
+
+#define x86_xchg_mem_reg(inst,mem,reg,size)	\
+	do {	\
+		*(inst)++ = (unsigned char)(((size) == 1) ? 0x86 : 0x87);	\
+		x86_mem_emit ((inst), (reg), (mem));	\
+	} while (0)
+
+#define x86_xchg_membase_reg(inst,basereg,disp,reg,size)	\
+	do {	\
+		*(inst)++ = (unsigned char)(((size) == 1) ? 0x86 : 0x87);	\
+		x86_membase_emit ((inst), (reg), (basereg), (disp));	\
+	} while (0)
+
+/*
+ * inc / dec family: opcode 0xff with ModRM reg field 0 (inc) or 1 (dec).
+ *
+ * The register forms also use 0xff rather than the one-byte 0x40+reg and
+ * 0x48+reg encodings: in 64-bit mode the bytes 0x40-0x4f are REX prefixes, so
+ * the short forms would be decoded as a prefix of the following instruction
+ * instead of as inc/dec. The 0xff forms are valid in both 32- and 64-bit
+ * mode, at the cost of one extra byte.
+ */
+#define x86_inc_mem(inst,mem)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xff;	\
+		x86_mem_emit ((inst), 0, (mem)); 	\
+	} while (0)
+
+#define x86_inc_membase(inst,basereg,disp)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xff;	\
+		x86_membase_emit ((inst), 0, (basereg), (disp));	\
+	} while (0)
+
+#define x86_inc_reg(inst,reg)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xff;	\
+		x86_reg_emit ((inst), 0, (reg));	\
+	} while (0)
+
+#define x86_dec_mem(inst,mem)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xff;	\
+		x86_mem_emit ((inst), 1, (mem));	\
+	} while (0)
+
+#define x86_dec_membase(inst,basereg,disp)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xff;	\
+		x86_membase_emit ((inst), 1, (basereg), (disp));	\
+	} while (0)
+
+#define x86_dec_reg(inst,reg)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xff;	\
+		x86_reg_emit ((inst), 1, (reg));	\
+	} while (0)
+
+/*
+ * not / neg family: opcode 0xf7 with ModRM reg field 2 (not) or 3 (neg),
+ * applied to an address (mem), [basereg + disp] (membase) or a register.
+ */
+#define x86_not_mem(inst,mem)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xf7;	\
+		x86_mem_emit ((inst), 2, (mem));	\
+	} while (0)
+
+#define x86_not_membase(inst,basereg,disp)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xf7;	\
+		x86_membase_emit ((inst), 2, (basereg), (disp));	\
+	} while (0)
+
+#define x86_not_reg(inst,reg)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xf7;	\
+		x86_reg_emit ((inst), 2, (reg));	\
+	} while (0)
+
+#define x86_neg_mem(inst,mem)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xf7;	\
+		x86_mem_emit ((inst), 3, (mem));	\
+	} while (0)
+
+#define x86_neg_membase(inst,basereg,disp)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xf7;	\
+		x86_membase_emit ((inst), 3, (basereg), (disp));	\
+	} while (0)
+
+#define x86_neg_reg(inst,reg)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xf7;	\
+		x86_reg_emit ((inst), 3, (reg));	\
+	} while (0)
+
+/* Emit the single byte 0x90. */
+#define x86_nop(inst) do { *(inst)++ = (unsigned char)0x90; } while (0)
+
+/*
+ * ALU operation opc (X86_ALU_Opcode) between register reg and immediate imm.
+ * Three encodings are chosen from, in this order:
+ *   - reg == X86_EAX: opcode (opc << 3) + 5 followed by a 32-bit immediate,
+ *     used even when imm would fit in 8 bits;
+ *   - imm fits in 8 bits: 0x83, ModRM with opc in the reg field, 8-bit imm;
+ *   - otherwise: 0x81, ModRM with opc in the reg field, 32-bit imm.
+ */
+#define x86_alu_reg_imm(inst,opc,reg,imm) 	\
+	do {	\
+		if ((reg) == X86_EAX) {	\
+			*(inst)++ = (((unsigned char)(opc)) << 3) + 5;	\
+			x86_imm_emit32 ((inst), (imm));	\
+			break;	\
+		}	\
+		if (x86_is_imm8((imm))) {	\
+			*(inst)++ = (unsigned char)0x83;	\
+			x86_reg_emit ((inst), (opc), (reg));	\
+			x86_imm_emit8 ((inst), (imm));	\
+		} else {	\
+			*(inst)++ = (unsigned char)0x81;	\
+			x86_reg_emit ((inst), (opc), (reg));	\
+			x86_imm_emit32 ((inst), (imm));	\
+		}	\
+	} while (0)
+
+/*
+ * 64-bit ALU operation opc (X86_ALU_Opcode) between register reg (0-15) and
+ * immediate imm. Every form carries a REX prefix with REX.W set; REX.B is
+ * added when reg > 7.
+ *
+ *   - reg == X86_64_RAX: opcode (opc << 3) + 5 and a 32-bit immediate;
+ *   - imm fits in 8 bits: 0x83, ModRM with opc in the reg field, 8-bit imm;
+ *   - otherwise: 0x81, ModRM with opc in the reg field, 32-bit imm.
+ *
+ * No ALU instruction takes a 64-bit immediate: with REX.W the 32-bit
+ * immediate is sign-extended by the CPU. The RAX short form therefore emits
+ * 4 immediate bytes, not 8 (emitting 8 left four stray bytes in the
+ * instruction stream). imm must be representable as a signed 32-bit value.
+ */
+#define x86_64_alu_reg_imm(inst,opc,reg,imm) 	\
+	do {	\
+		if ((reg) == X86_64_RAX) {	\
+			*(inst)++ = X86_64_REX(X86_64_REX_W);	\
+			*(inst)++ = (((unsigned char)(opc)) << 3) + 5;	\
+			x86_imm_emit32 ((inst), (imm));	\
+			break;	\
+		}	\
+		if (x86_is_imm8((imm))) {	\
+			*(inst)++ = X86_64_REX(X86_64_REX_W | (((reg) > 7) ? X86_64_REX_B : 0));	\
+			*(inst)++ = (unsigned char)0x83;	\
+			x86_reg_emit ((inst), (opc), (reg));	\
+			x86_imm_emit8 ((inst), (imm));	\
+		} else {	\
+			*(inst)++ = X86_64_REX(X86_64_REX_W | (((reg) > 7) ? X86_64_REX_B : 0));	\
+			*(inst)++ = (unsigned char)0x81;	\
+			x86_reg_emit ((inst), (opc), (reg));	\
+			x86_imm_emit32 ((inst), (imm));	\
+		}	\
+	} while (0)
+
+/*
+ * ALU operation opc between a memory operand and immediate imm. The operand
+ * is an address (mem) or [basereg + disp] (membase). Uses 0x83 with an
+ * 8-bit immediate when imm fits in 8 bits, otherwise 0x81 with a 32-bit
+ * immediate; opc goes in the ModRM reg field in both cases.
+ */
+#define x86_alu_mem_imm(inst,opc,mem,imm) 	\
+	do {	\
+		if (x86_is_imm8((imm))) {	\
+			*(inst)++ = (unsigned char)0x83;	\
+			x86_mem_emit ((inst), (opc), (mem));	\
+			x86_imm_emit8 ((inst), (imm));	\
+		} else {	\
+			*(inst)++ = (unsigned char)0x81;	\
+			x86_mem_emit ((inst), (opc), (mem));	\
+			x86_imm_emit32 ((inst), (imm));	\
+		}	\
+	} while (0)
+
+#define x86_alu_membase_imm(inst,opc,basereg,disp,imm) 	\
+	do {	\
+		if (x86_is_imm8((imm))) {	\
+			*(inst)++ = (unsigned char)0x83;	\
+			x86_membase_emit ((inst), (opc), (basereg), (disp));	\
+			x86_imm_emit8 ((inst), (imm));	\
+		} else {	\
+			*(inst)++ = (unsigned char)0x81;	\
+			x86_membase_emit ((inst), (opc), (basereg), (disp));	\
+			x86_imm_emit32 ((inst), (imm));	\
+		}	\
+	} while (0)
+
+/*
+ * ALU operation opc with a register source.
+ * The mem and membase forms use opcode (opc << 3) + 1, with reg in the ModRM
+ * reg field and the memory operand in the r/m part.
+ */
+#define x86_alu_mem_reg(inst,opc,mem,reg)	\
+	do {	\
+		*(inst)++ = (((unsigned char)(opc)) << 3) + 1;	\
+		x86_mem_emit ((inst), (reg), (mem));	\
+	} while (0)
+
+#define x86_alu_membase_reg(inst,opc,basereg,disp,reg)	\
+	do {	\
+		*(inst)++ = (((unsigned char)(opc)) << 3) + 1;	\
+		x86_membase_emit ((inst), (reg), (basereg), (disp));	\
+	} while (0)
+
+/* Register-register form: opcode (opc << 3) + 3, dreg in the ModRM reg field, reg in the r/m field. */
+#define x86_alu_reg_reg(inst,opc,dreg,reg)	\
+	do {	\
+		*(inst)++ = (((unsigned char)(opc)) << 3) + 3;	\
+		x86_reg_emit ((inst), (dreg), (reg));	\
+	} while (0)
+
+/*
+ * 64-bit register-register ALU operation: dreg := dreg opc reg.
+ * Emits a REX prefix with REX.W, opcode (opc << 3) + 3, and a ModRM byte with
+ * dreg in the reg field and reg in the r/m field.
+ *
+ * x86_reg_emit keeps only the low three bits of each register number, so
+ * REX.R (for dreg > 7) and REX.B (for reg > 7) must be set here; without
+ * them R8-R15 would silently be encoded as RAX-RDI. This mirrors the REX
+ * handling in x86_64_mov_reg_reg.
+ */
+#define x86_64_alu_reg_reg(inst,opc,dreg,reg)	\
+	do {	\
+		*(inst)++ = X86_64_REX(X86_64_REX_W | (((dreg) > 7) ? X86_64_REX_R : 0) | (((reg) > 7) ? X86_64_REX_B : 0)); \
+		*(inst)++ = (((unsigned char)(opc)) << 3) + 3;	\
+		x86_reg_emit ((inst), (dreg), (reg));	\
+	} while (0)
+
+/**
+ * @x86_alu_reg8_reg8:
+ * Supports ALU operations between two 8-bit registers.
+ * dreg := dreg opc reg
+ * X86_Reg_No enum is used to specify the registers.
+ * Additionally is_*_h flags are used to specify what part
+ * of a given 32-bit register is used - high (TRUE) or low (FALSE).
+ * For example: dreg = X86_EAX, is_dreg_h = TRUE -> use AH
+ * The opcode byte is (opc << 3) + 2.
+ */
+#define x86_alu_reg8_reg8(inst,opc,dreg,reg,is_dreg_h,is_reg_h)	\
+	do {	\
+		*(inst)++ = (((unsigned char)(opc)) << 3) + 2;	\
+		x86_reg8_emit ((inst), (dreg), (reg), (is_dreg_h), (is_reg_h));	\
+	} while (0)
+
+/*
+ * ALU operation opc with a register destination and a memory source:
+ * opcode (opc << 3) + 3, reg in the ModRM reg field, and the memory operand
+ * (address mem, or [basereg + disp]) in the r/m part.
+ */
+#define x86_alu_reg_mem(inst,opc,reg,mem)	\
+	do {	\
+		*(inst)++ = (((unsigned char)(opc)) << 3) + 3;	\
+		x86_mem_emit ((inst), (reg), (mem));	\
+	} while (0)
+
+#define x86_alu_reg_membase(inst,opc,reg,basereg,disp)	\
+	do {	\
+		*(inst)++ = (((unsigned char)(opc)) << 3) + 3;	\
+		x86_membase_emit ((inst), (reg), (basereg), (disp));	\
+	} while (0)
+
+/*
+ * test with an immediate. The immediate is always emitted as 32 bits.
+ * For reg == X86_EAX the one-byte opcode 0xa9 is used with no ModRM byte;
+ * every other operand uses 0xf7 with ModRM reg field 0.
+ */
+#define x86_test_reg_imm(inst,reg,imm)	\
+	do {	\
+		if ((reg) == X86_EAX) {	\
+			*(inst)++ = (unsigned char)0xa9;	\
+		} else {	\
+			*(inst)++ = (unsigned char)0xf7;	\
+			x86_reg_emit ((inst), 0, (reg));	\
+		}	\
+		x86_imm_emit32 ((inst), (imm));	\
+	} while (0)
+
+#define x86_test_mem_imm(inst,mem,imm)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xf7;	\
+		x86_mem_emit ((inst), 0, (mem));	\
+		x86_imm_emit32 ((inst), (imm));	\
+	} while (0)
+
+#define x86_test_membase_imm(inst,basereg,disp,imm)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xf7;	\
+		x86_membase_emit ((inst), 0, (basereg), (disp));	\
+		x86_imm_emit32 ((inst), (imm));	\
+	} while (0)
+
+/*
+ * test with a register: opcode 0x85, reg in the ModRM reg field and the
+ * other operand (register dreg, address mem, or [basereg + disp]) in the
+ * r/m part.
+ */
+#define x86_test_reg_reg(inst,dreg,reg)	\
+	do {	\
+		*(inst)++ = (unsigned char)0x85;	\
+		x86_reg_emit ((inst), (reg), (dreg));	\
+	} while (0)
+
+#define x86_test_mem_reg(inst,mem,reg)	\
+	do {	\
+		*(inst)++ = (unsigned char)0x85;	\
+		x86_mem_emit ((inst), (reg), (mem));	\
+	} while (0)
+
+#define x86_test_membase_reg(inst,basereg,disp,reg)	\
+	do {	\
+		*(inst)++ = (unsigned char)0x85;	\
+		x86_membase_emit ((inst), (reg), (basereg), (disp));	\
+	} while (0)
+
+/*
+ * Shift/rotate by an immediate count. opc is an X86_Shift_Opcode and goes
+ * in the ModRM reg field. Any count other than 1 uses opcode 0xc1 followed
+ * by the count as an 8-bit immediate; a count of exactly 1 uses opcode 0xd1
+ * with no immediate byte.
+ */
+#define x86_shift_reg_imm(inst,opc,reg,imm)	\
+	do {	\
+		if ((imm) != 1) {	\
+			*(inst)++ = (unsigned char)0xc1;	\
+			x86_reg_emit ((inst), (opc), (reg));	\
+			x86_imm_emit8 ((inst), (imm));	\
+		} else {	\
+			*(inst)++ = (unsigned char)0xd1;	\
+			x86_reg_emit ((inst), (opc), (reg));	\
+		}	\
+	} while (0)
+
+#define x86_shift_mem_imm(inst,opc,mem,imm)	\
+	do {	\
+		if ((imm) != 1) {	\
+			*(inst)++ = (unsigned char)0xc1;	\
+			x86_mem_emit ((inst), (opc), (mem));	\
+			x86_imm_emit8 ((inst), (imm));	\
+		} else {	\
+			*(inst)++ = (unsigned char)0xd1;	\
+			x86_mem_emit ((inst), (opc), (mem));	\
+		}	\
+	} while (0)
+
+#define x86_shift_membase_imm(inst,opc,basereg,disp,imm)	\
+	do {	\
+		if ((imm) != 1) {	\
+			*(inst)++ = (unsigned char)0xc1;	\
+			x86_membase_emit ((inst), (opc), (basereg), (disp));	\
+			x86_imm_emit8 ((inst), (imm));	\
+		} else {	\
+			*(inst)++ = (unsigned char)0xd1;	\
+			x86_membase_emit ((inst), (opc), (basereg), (disp));	\
+		}	\
+	} while (0)
+
+/*
+ * Shift/rotate with no count encoded in the instruction: opcode 0xd3 with
+ * opc in the ModRM reg field.
+ */
+#define x86_shift_reg(inst,opc,reg)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xd3;	\
+		x86_reg_emit ((inst), (opc), (reg));	\
+	} while (0)
+
+#define x86_shift_mem(inst,opc,mem)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xd3;	\
+		x86_mem_emit ((inst), (opc), (mem));	\
+	} while (0)
+
+#define x86_shift_membase(inst,opc,basereg,disp)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xd3;	\
+		x86_membase_emit ((inst), (opc), (basereg), (disp));	\
+	} while (0)
+
+/*
+ * Multi op shift missing.
+ *
+ * Double-register shifts. All four macros emit 0x0f, a second opcode
+ * byte, and a register-direct ModRM with reg in the reg field and dreg in
+ * the r/m field. The *_imm variants append shamt as an 8-bit immediate; the
+ * plain variants encode no count (presumably the forms that take the
+ * count from CL - confirm against the instruction reference).
+ */
+
+#define x86_shrd_reg(inst,dreg,reg)                     \
+        do {                                            \
+		*(inst)++ = (unsigned char)0x0f;	\
+		*(inst)++ = (unsigned char)0xad;	\
+		x86_reg_emit ((inst), (reg), (dreg));	\
+	} while (0)
+
+#define x86_shrd_reg_imm(inst,dreg,reg,shamt)           \
+        do {                                            \
+		*(inst)++ = (unsigned char)0x0f;	\
+		*(inst)++ = (unsigned char)0xac;	\
+		x86_reg_emit ((inst), (reg), (dreg));	\
+		x86_imm_emit8 ((inst), (shamt));	\
+	} while (0)
+
+#define x86_shld_reg(inst,dreg,reg)                     \
+        do {                                            \
+		*(inst)++ = (unsigned char)0x0f;	\
+		*(inst)++ = (unsigned char)0xa5;	\
+		x86_reg_emit ((inst), (reg), (dreg));	\
+	} while (0)
+
+#define x86_shld_reg_imm(inst,dreg,reg,shamt)           \
+        do {                                            \
+		*(inst)++ = (unsigned char)0x0f;	\
+		*(inst)++ = (unsigned char)0xa4;	\
+		x86_reg_emit ((inst), (reg), (dreg));	\
+		x86_imm_emit8 ((inst), (shamt));	\
+	} while (0)
+
+/*
+ * Widening multiply: EDX:EAX = EAX * rm.
+ * Opcode 0xf7; the ModRM reg field is 5 when is_signed is true and 4
+ * otherwise. rm is a register, an address (mem) or [basereg + disp].
+ */
+#define x86_mul_reg(inst,reg,is_signed)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xf7;	\
+		x86_reg_emit ((inst), ((is_signed) ? 5 : 4), (reg));	\
+	} while (0)
+
+#define x86_mul_mem(inst,mem,is_signed)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xf7;	\
+		x86_mem_emit ((inst), ((is_signed) ? 5 : 4), (mem));	\
+	} while (0)
+
+#define x86_mul_membase(inst,basereg,disp,is_signed)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xf7;	\
+		x86_membase_emit ((inst), ((is_signed) ? 5 : 4), (basereg), (disp));	\
+	} while (0)
+
+/*
+ * r *= rm
+ * Opcode bytes 0x0f 0xaf; the destination register goes in the ModRM reg
+ * field and the multiplier (register, address, or [basereg + disp]) in
+ * the r/m part.
+ */
+#define x86_imul_reg_reg(inst,dreg,reg)	\
+	do {	\
+		*(inst)++ = (unsigned char)0x0f;	\
+		*(inst)++ = (unsigned char)0xaf;	\
+		x86_reg_emit ((inst), (dreg), (reg));	\
+	} while (0)
+
+#define x86_imul_reg_mem(inst,reg,mem)	\
+	do {	\
+		*(inst)++ = (unsigned char)0x0f;	\
+		*(inst)++ = (unsigned char)0xaf;	\
+		x86_mem_emit ((inst), (reg), (mem));	\
+	} while (0)
+
+#define x86_imul_reg_membase(inst,reg,basereg,disp)	\
+	do {	\
+		*(inst)++ = (unsigned char)0x0f;	\
+		*(inst)++ = (unsigned char)0xaf;	\
+		x86_membase_emit ((inst), (reg), (basereg), (disp));	\
+	} while (0)
+
+/*
+ * dreg = rm * imm
+ * Opcode 0x6b with an 8-bit immediate when imm fits in 8 bits, otherwise
+ * opcode 0x69 with a 32-bit immediate.
+ */
+#define x86_imul_reg_reg_imm(inst,dreg,reg,imm)	\
+	do {	\
+		if (x86_is_imm8 ((imm))) {	\
+			*(inst)++ = (unsigned char)0x6b;	\
+			x86_reg_emit ((inst), (dreg), (reg));	\
+			x86_imm_emit8 ((inst), (imm));	\
+		} else {	\
+			*(inst)++ = (unsigned char)0x69;	\
+			x86_reg_emit ((inst), (dreg), (reg));	\
+			x86_imm_emit32 ((inst), (imm));	\
+		}	\
+	} while (0)
+
+/*
+ * reg = [mem] * imm, where mem is an address.
+ * Opcode 0x6b with an 8-bit immediate when imm fits in 8 bits, otherwise
+ * opcode 0x69 with a 32-bit immediate. Both branches must encode the source
+ * with x86_mem_emit; the 32-bit branch previously used x86_reg_emit, which
+ * treated the low bits of the address as a register number and dropped the
+ * displacement bytes entirely.
+ */
+#define x86_imul_reg_mem_imm(inst,reg,mem,imm)	\
+	do {	\
+		if (x86_is_imm8 ((imm))) {	\
+			*(inst)++ = (unsigned char)0x6b;	\
+			x86_mem_emit ((inst), (reg), (mem));	\
+			x86_imm_emit8 ((inst), (imm));	\
+		} else {	\
+			*(inst)++ = (unsigned char)0x69;	\
+			x86_mem_emit ((inst), (reg), (mem));	\
+			x86_imm_emit32 ((inst), (imm));	\
+		}	\
+	} while (0)
+
+/*
+ * reg = [basereg + disp] * imm.
+ * Opcode 0x6b with an 8-bit immediate when imm fits in 8 bits, otherwise
+ * opcode 0x69 with a 32-bit immediate.
+ */
+#define x86_imul_reg_membase_imm(inst,reg,basereg,disp,imm)	\
+	do {	\
+		if (x86_is_imm8 ((imm))) {	\
+			*(inst)++ = (unsigned char)0x6b;	\
+			x86_membase_emit ((inst), (reg), (basereg), (disp));	\
+			x86_imm_emit8 ((inst), (imm));	\
+		} else {	\
+			*(inst)++ = (unsigned char)0x69;	\
+			x86_membase_emit ((inst), (reg), (basereg), (disp));	\
+			x86_imm_emit32 ((inst), (imm));	\
+		}	\
+	} while (0)
+
+/*
+ * Divide EDX:EAX by rm; eax = quotient, edx = remainder.
+ * Opcode 0xf7; the ModRM reg field is 7 when is_signed is true and 6
+ * otherwise. rm is a register, an address (mem) or [basereg + disp].
+ */
+
+#define x86_div_reg(inst,reg,is_signed)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xf7;	\
+		x86_reg_emit ((inst), ((is_signed) ? 7 : 6), (reg));	\
+	} while (0)
+
+#define x86_div_mem(inst,mem,is_signed)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xf7;	\
+		x86_mem_emit ((inst), ((is_signed) ? 7 : 6), (mem));	\
+	} while (0)
+
+#define x86_div_membase(inst,basereg,disp,is_signed)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xf7;	\
+		x86_membase_emit ((inst), ((is_signed) ? 7 : 6), (basereg), (disp));	\
+	} while (0)
+
+/*
+ * Store register reg to memory. size is the operand size in bytes:
+ *   1 -> opcode 0x88
+ *   2 -> operand-size prefix 0x66, then (by fall-through) opcode 0x89
+ *   4 -> opcode 0x89
+ * Any other size trips assert (0). The destination is an address (mem).
+ */
+#define x86_mov_mem_reg(inst,mem,reg,size)	\
+	do {	\
+		switch ((size)) {	\
+		case 1: *(inst)++ = (unsigned char)0x88; break;	\
+		case 2: *(inst)++ = (unsigned char)0x66; /* fall through */	\
+		case 4: *(inst)++ = (unsigned char)0x89; break;	\
+		default: assert (0);	\
+		}	\
+		x86_mem_emit ((inst), (reg), (mem));	\
+	} while (0)
+
+/*
+ * Same as x86_mov_mem_reg, but the destination is the memory pointed to
+ * by register regp, encoded with x86_regp_emit (no displacement).
+ */
+#define x86_mov_regp_reg(inst,regp,reg,size)	\
+	do {	\
+		switch ((size)) {	\
+		case 1: *(inst)++ = (unsigned char)0x88; break;	\
+		case 2: *(inst)++ = (unsigned char)0x66; /* fall through */	\
+		case 4: *(inst)++ = (unsigned char)0x89; break;	\
+		default: assert (0);	\
+		}	\
+		x86_regp_emit ((inst), (reg), (regp));	\
+	} while (0)
+
+/*
+ * Store register reg to the memory pointed to by register regp (both 0-15).
+ * size is the operand size in bytes (1, 2, 4 or 8; anything else trips
+ * assert (0)).
+ *
+ * Byte order of the encoding: optional 0x66 prefix (size 2), optional REX
+ * prefix (REX.W for size 8, REX.R for reg > 7, REX.B for regp > 7), opcode
+ * 0x88 (size 1) or 0x89, then the addressing bytes.
+ *
+ * The addressing bytes are produced by x86_membase_emit with a zero
+ * displacement rather than x86_regp_emit, because mod = 0 cannot express
+ * every base register directly: r/m = 4 (RSP, R12) requires a SIB byte and
+ * r/m = 5 (RBP, R13) would be taken as a displacement-only form. The
+ * register number is masked to its low three bits so that R12/R13 get the
+ * same treatment as RSP/RBP; the fourth bit travels in REX.B.
+ */
+#define x86_64_mov_regp_reg(inst,regp,reg,size)	\
+	do {	\
+		if ((size) == 2) \
+			*(inst)++ = (unsigned char)0x66; \
+		if (((size) == 8) ||  (regp) > 7 || (reg) > 7 )	\
+			*(inst)++ = X86_64_REX((((size) == 8) ? X86_64_REX_W : 0) | (((reg) > 7) ? X86_64_REX_R : 0) | (((regp) > 7) ? X86_64_REX_B : 0));	\
+		switch ((size)) {	\
+		case 1: *(inst)++ = (unsigned char)0x88; break;	\
+		case 2: case 4: case 8: *(inst)++ = (unsigned char)0x89; break;	\
+		default: assert (0);	\
+		}	\
+		x86_membase_emit ((inst), (reg), ((regp) & 0x7), 0);	\
+	} while (0)
+
+/*
+ * Store register reg to [basereg + disp]. size is the operand size in
+ * bytes: 1 -> opcode 0x88; 2 -> prefix 0x66 then (by fall-through) opcode
+ * 0x89; 4 -> opcode 0x89. Any other size trips assert (0).
+ */
+#define x86_mov_membase_reg(inst,basereg,disp,reg,size)	\
+	do {	\
+		switch ((size)) {	\
+		case 1: *(inst)++ = (unsigned char)0x88; break;	\
+		case 2: *(inst)++ = (unsigned char)0x66; /* fall through */	\
+		case 4: *(inst)++ = (unsigned char)0x89; break;	\
+		default: assert (0);	\
+		}	\
+		x86_membase_emit ((inst), (reg), (basereg), (disp));	\
+	} while (0)
+
+/*
+ * x86_64_mov_membase_reg: store register `reg` to [basereg + disp].
+ * Opcode 0x88 is used for size 1 and 0x89 for sizes 2, 4 and 8; any
+ * other size trips assert (0).
+ * A 0x66 prefix is emitted first for size 2, followed by a REX prefix
+ * when size is 8 or either register number is above 7.
+ */
+#define x86_64_mov_membase_reg(inst,basereg,disp,reg,size)	\
+	do {	\
+		if ((size) == 2) \
+			*(inst)++ = (unsigned char)0x66; \
+		if (((size) == 8) ||  (basereg) > 7 || (reg) > 7 )	\
+			*(inst)++ = X86_64_REX((((size) == 8) ? X86_64_REX_W : 0) | (((reg) > 7) ? X86_64_REX_R : 0) | (((basereg) > 7) ? X86_64_REX_B : 0));	\
+		switch ((size)) {	\
+		case 1: *(inst)++ = (unsigned char)0x88; break;	\
+		case 2: case 4: case 8: *(inst)++ = (unsigned char)0x89; break;	\
+		default: assert (0);	\
+		}	\
+		x86_membase_emit ((inst), (reg), (basereg), (disp));	\
+	} while (0)
+
+
+#define x86_mov_memindex_reg(inst,basereg,disp,indexreg,shift,reg,size)	\
+	do {	\
+		switch ((size)) {	\
+		case 1: *(inst)++ = (unsigned char)0x88; break;	\
+		case 2: *(inst)++ = (unsigned char)0x66; /* fall through */	\
+		case 4: *(inst)++ = (unsigned char)0x89; break;	\
+		default: assert (0);	\
+		}	\
+		x86_memindex_emit ((inst), (reg), (basereg), (disp), (indexreg), (shift));	\
+	} while (0)
+
+#define x86_mov_reg_reg(inst,dreg,reg,size) x86_64_mov_reg_reg((inst),(dreg),(reg),(size))
+/*
+ * x86_64_mov_reg_reg: copy register `reg` into register `dreg`.
+ * Opcode 0x8a is used for size 1 and 0x8b for sizes 2, 4 and 8; any
+ * other size trips assert (0).
+ * A 0x66 prefix is emitted first for size 2, followed by a REX prefix
+ * when size is 8 or either register number is above 7 (REX.R for
+ * `dreg`, REX.B for `reg`).
+ */
+#define x86_64_mov_reg_reg(inst,dreg,reg,size)	\
+	do {	\
+		if ((size) == 2) \
+			*(inst)++ = (unsigned char)0x66; \
+		if (((size) == 8) ||  (reg) > 7 || (dreg) > 7 )	\
+			*(inst)++ = X86_64_REX((((size) == 8) ? X86_64_REX_W : 0) | (((dreg) > 7) ? X86_64_REX_R : 0) | (((reg) > 7) ? X86_64_REX_B : 0));	\
+		switch ((size)) {	\
+		case 1: *(inst)++ = (unsigned char)0x8a; break;	\
+		case 2: case 4: case 8: *(inst)++ = (unsigned char)0x8b; break;	\
+		default: assert (0);	\
+		}	\
+		x86_reg_emit ((inst), (dreg), (reg));	\
+	} while (0)
+
+#define x86_mov_reg_mem(inst,reg,mem,size) x86_64_mov_reg_mem((inst),(reg),(mem),(size))
+/*
+ * x86_64_mov_reg_mem: load register `reg` from the memory operand
+ * produced by x86_mem_emit for `mem`.
+ * Opcode 0x8a is used for size 1 and 0x8b for sizes 2, 4 and 8; any
+ * other size trips assert (0).
+ * A 0x66 prefix is emitted first for size 2, followed by a REX prefix
+ * when size is 8 or `reg` is above 7.
+ */
+#define x86_64_mov_reg_mem(inst,reg,mem,size)	\
+	do {	\
+		if ((size) == 2) \
+			*(inst)++ = (unsigned char)0x66; \
+		if (((size) == 8) ||  (reg) > 7)	\
+			*(inst)++ = X86_64_REX((((size) == 8) ? X86_64_REX_W : 0) | (((reg) > 7) ? X86_64_REX_R : 0));	\
+		switch ((size)) {	\
+		case 1: *(inst)++ = (unsigned char)0x8a; break;	\
+		case 2: case 4: case 8: *(inst)++ = (unsigned char)0x8b; break;	\
+		default: assert (0);	\
+		}	\
+		x86_mem_emit ((inst), (reg), (mem));	\
+	} while (0)
+
+#define x86_mov_reg_membase(inst,reg,basereg,disp,size)	\
+	do {	\
+		switch ((size)) {	\
+		case 1: *(inst)++ = (unsigned char)0x8a; break;	\
+		case 2: *(inst)++ = (unsigned char)0x66; /* fall through */	\
+		case 4: *(inst)++ = (unsigned char)0x8b; break;	\
+		default: assert (0);	\
+		}	\
+		x86_membase_emit ((inst), (reg), (basereg), (disp));	\
+	} while (0)
+
+/*
+ * x86_64_mov_reg_membase: load register `reg` from [basereg + disp].
+ * Opcode 0x8a is used for size 1 and 0x8b for sizes 2, 4 and 8; any
+ * other size trips assert (0).
+ * A 0x66 prefix is emitted first for size 2, followed by a REX prefix
+ * when size is 8 or either register number is above 7.
+ */
+#define x86_64_mov_reg_membase(inst,reg,basereg,disp,size)	\
+	do {	\
+		if ((size) == 2) \
+			*(inst)++ = (unsigned char)0x66; \
+		if (((size) == 8) ||  (basereg) > 7 || (reg) > 7 )	\
+			*(inst)++ = X86_64_REX((((size) == 8) ? X86_64_REX_W : 0) | (((reg) > 7) ? X86_64_REX_R : 0) | (((basereg) > 7) ? X86_64_REX_B : 0));	\
+		switch ((size)) {	\
+		case 1: *(inst)++ = (unsigned char)0x8a; break;	\
+		case 2: case 4: case 8: *(inst)++ = (unsigned char)0x8b; break;	\
+		default: assert (0);	\
+		}	\
+		x86_membase_emit ((inst), (reg), (basereg), (disp));	\
+	} while (0)
+
+/*
+ * x86_64_movzx_reg_membase: load `reg` from [basereg + disp].
+ * Sizes 1 and 2 use the two-byte opcodes 0x0f 0xb6 and 0x0f 0xb7;
+ * sizes 4 and 8 use the plain 0x8b load.  Any other size trips
+ * assert (0).
+ * A REX prefix is emitted first when size is 8 or either register
+ * number is above 7.
+ */
+#define x86_64_movzx_reg_membase(inst,reg,basereg,disp,size)	\
+	do {	\
+		if (((size) == 8) ||  (basereg) > 7 || (reg) > 7 )	\
+			*(inst)++ = X86_64_REX((((size) == 8) ? X86_64_REX_W : 0) | (((reg) > 7) ? X86_64_REX_R : 0) | (((basereg) > 7) ? X86_64_REX_B : 0));	\
+		switch ((size)) {	\
+		case 1: *(inst)++ = (unsigned char)0x0f; *(inst)++ = (unsigned char)0xb6; break;	\
+		case 2: *(inst)++ = (unsigned char)0x0f; *(inst)++ = (unsigned char)0xb7; break;	\
+		case 4: case 8: *(inst)++ = (unsigned char)0x8b; break;	\
+		default: assert (0);	\
+		}	\
+		x86_membase_emit ((inst), (reg), (basereg), (disp));	\
+	} while (0)
+
+#define x86_mov_reg_memindex(inst,reg,basereg,disp,indexreg,shift,size)	\
+	do {	\
+		switch ((size)) {	\
+		case 1: *(inst)++ = (unsigned char)0x8a; break;	\
+		case 2: *(inst)++ = (unsigned char)0x66; /* fall through */	\
+		case 4: *(inst)++ = (unsigned char)0x8b; break;	\
+		default: assert (0);	\
+		}	\
+		x86_memindex_emit ((inst), (reg), (basereg), (disp), (indexreg), (shift));	\
+	} while (0)
+
+/*
+ * Note: x86_clear_reg () changes the condition codes!
+ */
+#define x86_clear_reg(inst,reg) x86_alu_reg_reg((inst), X86_XOR, (reg), (reg))
+
+#define x86_mov_reg_imm(inst,reg,imm)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xb8 + (reg);	\
+		x86_imm_emit32 ((inst), (imm));	\
+	} while (0)
+
+/*
+ * x86_64_mov_reg_imm: load a 64-bit immediate into `reg`.
+ * Always emits a REX prefix with REX.W set (plus REX.B when `reg` is
+ * above 7), then opcode 0xb8 + (reg & 7), then the immediate via
+ * x86_imm_emit64.
+ */
+#define x86_64_mov_reg_imm(inst,reg,imm)	\
+	do {	\
+		*(inst)++ = X86_64_REX((((reg) > 7) ? X86_64_REX_B : 0) | X86_64_REX_W);	\
+		*(inst)++ = (unsigned char)0xb8 + ((reg) & 0x7);	\
+		x86_imm_emit64 ((inst), (imm));	\
+	} while (0)
+
+#define x86_mov_mem_imm(inst,mem,imm,size)	\
+	do {	\
+		if ((size) == 1) {	\
+			*(inst)++ = (unsigned char)0xc6;	\
+			x86_mem_emit ((inst), 0, (mem));	\
+			x86_imm_emit8 ((inst), (imm));	\
+		} else if ((size) == 2) {	\
+			*(inst)++ = (unsigned char)0x66;	\
+			*(inst)++ = (unsigned char)0xc7;	\
+			x86_mem_emit ((inst), 0, (mem));	\
+			x86_imm_emit16 ((inst), (imm));	\
+		} else {	\
+			*(inst)++ = (unsigned char)0xc7;	\
+			x86_mem_emit ((inst), 0, (mem));	\
+			x86_imm_emit32 ((inst), (imm));	\
+		}	\
+	} while (0)
+
+#define x86_mov_membase_imm(inst,basereg,disp,imm,size)	\
+	do {	\
+		if ((size) == 1) {	\
+			*(inst)++ = (unsigned char)0xc6;	\
+			x86_membase_emit ((inst), 0, (basereg), (disp));	\
+			x86_imm_emit8 ((inst), (imm));	\
+		} else if ((size) == 2) {	\
+			*(inst)++ = (unsigned char)0x66;	\
+			*(inst)++ = (unsigned char)0xc7;	\
+			x86_membase_emit ((inst), 0, (basereg), (disp));	\
+			x86_imm_emit16 ((inst), (imm));	\
+		} else {	\
+			*(inst)++ = (unsigned char)0xc7;	\
+			x86_membase_emit ((inst), 0, (basereg), (disp));	\
+			x86_imm_emit32 ((inst), (imm));	\
+		}	\
+	} while (0)
+
+/*
+ * x86_64_mov_membase_imm: store immediate `imm` to [basereg + disp].
+ * A 0x66 prefix is emitted first for size 2, followed by a REX prefix
+ * when size is 8 or `basereg` is above 7.
+ * Size 1 uses opcode 0xc6 with an 8-bit immediate, size 2 uses 0xc7
+ * with a 16-bit immediate, and every other size (including 8) uses
+ * 0xc7 with a 32-bit immediate; with REX.W the processor sign-extends
+ * that immediate to 64 bits.
+ */
+#define x86_64_mov_membase_imm(inst,basereg,disp,imm,size)	\
+	do {	\
+		if ((size) == 2) \
+			*(inst)++ = (unsigned char)0x66; \
+		if (((size) == 8) || (basereg) > 7 )	\
+			*(inst)++ = X86_64_REX((((size) == 8) ? X86_64_REX_W : 0) | (((basereg) > 7) ? X86_64_REX_B : 0));	\
+		if ((size) == 1) {	\
+			*(inst)++ = (unsigned char)0xc6;	\
+			x86_membase_emit ((inst), 0, (basereg), (disp));	\
+			x86_imm_emit8 ((inst), (imm));	\
+		} else if ((size) == 2) {	\
+			*(inst)++ = (unsigned char)0xc7;	\
+			x86_membase_emit ((inst), 0, (basereg), (disp));	\
+			x86_imm_emit16 ((inst), (imm));	\
+		} else {	\
+			*(inst)++ = (unsigned char)0xc7;	\
+			x86_membase_emit ((inst), 0, (basereg), (disp));	\
+			x86_imm_emit32 ((inst), (imm));	\
+		}	\
+	} while (0)
+
+#define x86_mov_memindex_imm(inst,basereg,disp,indexreg,shift,imm,size)	\
+	do {	\
+		if ((size) == 1) {	\
+			*(inst)++ = (unsigned char)0xc6;	\
+			x86_memindex_emit ((inst), 0, (basereg), (disp), (indexreg), (shift));	\
+			x86_imm_emit8 ((inst), (imm));	\
+		} else if ((size) == 2) {	\
+			*(inst)++ = (unsigned char)0x66;	\
+			*(inst)++ = (unsigned char)0xc7;	\
+			x86_memindex_emit ((inst), 0, (basereg), (disp), (indexreg), (shift));	\
+			x86_imm_emit16 ((inst), (imm));	\
+		} else {	\
+			*(inst)++ = (unsigned char)0xc7;	\
+			x86_memindex_emit ((inst), 0, (basereg), (disp), (indexreg), (shift));	\
+			x86_imm_emit32 ((inst), (imm));	\
+		}	\
+	} while (0)
+
+#define x86_lea_mem(inst,reg,mem)	\
+	do {	\
+		*(inst)++ = (unsigned char)0x8d;	\
+		x86_mem_emit ((inst), (reg), (mem));	\
+	} while (0)
+
+#define x86_lea_membase(inst,reg,basereg,disp)	\
+	do {	\
+		*(inst)++ = (unsigned char)0x8d;	\
+		x86_membase_emit ((inst), (reg), (basereg), (disp));	\
+	} while (0)
+
+#define x86_64_lea_membase(inst,reg,basereg,disp)	\
+	do {	\
+		*(inst)++ = X86_64_REX(X86_64_REX_W | (((reg) > 7) ? X86_64_REX_R : 0) | (((basereg) > 7) ? X86_64_REX_B : 0));	\
+		*(inst)++ = (unsigned char)0x8d;	\
+		x86_membase_emit ((inst), (reg), (basereg), (disp));	\
+	} while (0)
+
+#define x86_lea_memindex(inst,reg,basereg,disp,indexreg,shift)	\
+	do {	\
+		*(inst)++ = (unsigned char)0x8d;	\
+		x86_memindex_emit ((inst), (reg), (basereg), (disp), (indexreg), (shift));	\
+	} while (0)
+
+/*
+ * x86_widen_reg: widen register `reg` into `dreg`.
+ * Emits 0x0f followed by an opcode that starts at 0xb6 and gets 0x08
+ * added when is_signed is set and 0x01 added when is_half is set.
+ * Unless is_half is set, `reg` must satisfy X86_IS_BYTE_REG.
+ */
+#define x86_widen_reg(inst,dreg,reg,is_signed,is_half)	\
+	do {	\
+		unsigned char op = 0xb6;	\
+                g_assert ((is_half) ||  X86_IS_BYTE_REG ((reg))); \
+		*(inst)++ = (unsigned char)0x0f;	\
+		if ((is_signed)) op += 0x08;	\
+		if ((is_half)) op += 0x01;	\
+		*(inst)++ = op;	\
+		x86_reg_emit ((inst), (dreg), (reg));	\
+	} while (0)
+
+#define x86_widen_mem(inst,dreg,mem,is_signed,is_half)	\
+	do {	\
+		unsigned char op = 0xb6;	\
+		*(inst)++ = (unsigned char)0x0f;	\
+		if ((is_signed)) op += 0x08;	\
+		if ((is_half)) op += 0x01;	\
+		*(inst)++ = op;	\
+		x86_mem_emit ((inst), (dreg), (mem));	\
+	} while (0)
+
+#define x86_widen_membase(inst,dreg,basereg,disp,is_signed,is_half)	\
+	do {	\
+		unsigned char op = 0xb6;	\
+		*(inst)++ = (unsigned char)0x0f;	\
+		if ((is_signed)) op += 0x08;	\
+		if ((is_half)) op += 0x01;	\
+		*(inst)++ = op;	\
+		x86_membase_emit ((inst), (dreg), (basereg), (disp));	\
+	} while (0)
+
+#define x86_widen_memindex(inst,dreg,basereg,disp,indexreg,shift,is_signed,is_half)	\
+	do {	\
+		unsigned char op = 0xb6;	\
+		*(inst)++ = (unsigned char)0x0f;	\
+		if ((is_signed)) op += 0x08;	\
+		if ((is_half)) op += 0x01;	\
+		*(inst)++ = op;	\
+		x86_memindex_emit ((inst), (dreg), (basereg), (disp), (indexreg), (shift));	\
+	} while (0)
+
+#define x86_cdq(inst)  do { *(inst)++ = (unsigned char)0x99; } while (0)
+#define x86_wait(inst) do { *(inst)++ = (unsigned char)0x9b; } while (0)
+
+#define x86_fp_op_mem(inst,opc,mem,is_double)	\
+	do {	\
+		*(inst)++ = (is_double) ? (unsigned char)0xdc : (unsigned char)0xd8;	\
+		x86_mem_emit ((inst), (opc), (mem));	\
+	} while (0)
+
+#define x86_fp_op_membase(inst,opc,basereg,disp,is_double)	\
+	do {	\
+		*(inst)++ = (is_double) ? (unsigned char)0xdc : (unsigned char)0xd8;	\
+		x86_membase_emit ((inst), (opc), (basereg), (disp));	\
+	} while (0)
+
+#define x86_fp_op(inst,opc,index)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xd8;	\
+		*(inst)++ = (unsigned char)0xc0+((opc)<<3)+((index)&0x07);	\
+	} while (0)
+
+#define x86_fp_op_reg(inst,opc,index,pop_stack)	\
+	do {	\
+		static const unsigned char map[] = { 0, 1, 2, 3, 5, 4, 7, 6, 8};	\
+		*(inst)++ = (pop_stack) ? (unsigned char)0xde : (unsigned char)0xdc;	\
+		*(inst)++ = (unsigned char)0xc0+(map[(opc)]<<3)+((index)&0x07);	\
+	} while (0)
+
+/**
+ * @x86_fp_int_op_membase
+ * Supports FPU operations between ST(0) and integer operand in memory.
+ * Operation encoded using X86_FP_Opcode enum.
+ * Operand is addressed by [basereg + disp].
+ * is_int specifies whether operand is int32 (TRUE) or int16 (FALSE):
+ * opcode byte 0xda is emitted for int32 and 0xde for int16.
+ */
+#define x86_fp_int_op_membase(inst,opc,basereg,disp,is_int)	\
+	do {	\
+		*(inst)++ = (is_int) ? (unsigned char)0xda : (unsigned char)0xde;	\
+		x86_membase_emit ((inst), (opc), (basereg), (disp));	\
+	} while (0)
+
+#define x86_fstp(inst,index)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xdd;	\
+		*(inst)++ = (unsigned char)0xd8+(index);	\
+	} while (0)
+
+#define x86_fcompp(inst)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xde;	\
+		*(inst)++ = (unsigned char)0xd9;	\
+	} while (0)
+
+#define x86_fucompp(inst)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xda;	\
+		*(inst)++ = (unsigned char)0xe9;	\
+	} while (0)
+
+#define x86_fnstsw(inst)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xdf;	\
+		*(inst)++ = (unsigned char)0xe0;	\
+	} while (0)
+
+#define x86_fnstcw(inst,mem)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xd9;	\
+		x86_mem_emit ((inst), 7, (mem));	\
+	} while (0)
+
+#define x86_fnstcw_membase(inst,basereg,disp)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xd9;	\
+		x86_membase_emit ((inst), 7, (basereg), (disp));	\
+	} while (0)
+
+#define x86_fldcw(inst,mem)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xd9;	\
+		x86_mem_emit ((inst), 5, (mem));	\
+	} while (0)
+
+#define x86_fldcw_membase(inst,basereg,disp)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xd9;	\
+		x86_membase_emit ((inst), 5, (basereg), (disp));	\
+	} while (0)
+
+#define x86_fchs(inst)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xd9;	\
+		*(inst)++ = (unsigned char)0xe0;	\
+	} while (0)
+
+#define x86_frem(inst)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xd9;	\
+		*(inst)++ = (unsigned char)0xf8;	\
+	} while (0)
+
+#define x86_fxch(inst,index)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xd9;	\
+		*(inst)++ = (unsigned char)0xc8 + ((index) & 0x07);	\
+	} while (0)
+
+#define x86_fcomi(inst,index)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xdb;	\
+		*(inst)++ = (unsigned char)0xf0 + ((index) & 0x07);	\
+	} while (0)
+
+#define x86_fcomip(inst,index)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xdf;	\
+		*(inst)++ = (unsigned char)0xf0 + ((index) & 0x07);	\
+	} while (0)
+
+#define x86_fucomi(inst,index)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xdb;	\
+		*(inst)++ = (unsigned char)0xe8 + ((index) & 0x07);	\
+	} while (0)
+
+#define x86_fucomip(inst,index)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xdf;	\
+		*(inst)++ = (unsigned char)0xe8 + ((index) & 0x07);	\
+	} while (0)
+
+#define x86_fld(inst,mem,is_double)	\
+	do {	\
+		*(inst)++ = (is_double) ? (unsigned char)0xdd : (unsigned char)0xd9;	\
+		x86_mem_emit ((inst), 0, (mem));	\
+	} while (0)
+
+#define x86_fld_membase(inst,basereg,disp,is_double)	\
+	do {	\
+		*(inst)++ = (is_double) ? (unsigned char)0xdd : (unsigned char)0xd9;	\
+		x86_membase_emit ((inst), 0, (basereg), (disp));	\
+	} while (0)
+
+#define x86_fld80_mem(inst,mem)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xdb;	\
+		x86_mem_emit ((inst), 5, (mem));	\
+	} while (0)
+
+#define x86_fld80_membase(inst,basereg,disp)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xdb;	\
+		x86_membase_emit ((inst), 5, (basereg), (disp));	\
+	} while (0)
+
+#define x86_fild(inst,mem,is_long)	\
+	do {	\
+		if ((is_long)) {	\
+			*(inst)++ = (unsigned char)0xdf;	\
+			x86_mem_emit ((inst), 5, (mem));	\
+		} else {	\
+			*(inst)++ = (unsigned char)0xdb;	\
+			x86_mem_emit ((inst), 0, (mem));	\
+		}	\
+	} while (0)
+
+#define x86_fild_membase(inst,basereg,disp,is_long)	\
+	do {	\
+		if ((is_long)) {	\
+			*(inst)++ = (unsigned char)0xdf;	\
+			x86_membase_emit ((inst), 5, (basereg), (disp));	\
+		} else {	\
+			*(inst)++ = (unsigned char)0xdb;	\
+			x86_membase_emit ((inst), 0, (basereg), (disp));	\
+		}	\
+	} while (0)
+
+#define x86_fld_reg(inst,index)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xd9;	\
+		*(inst)++ = (unsigned char)0xc0 + ((index) & 0x07);	\
+	} while (0)
+
+#define x86_fldz(inst)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xd9;	\
+		*(inst)++ = (unsigned char)0xee;	\
+	} while (0)
+
+#define x86_fld1(inst)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xd9;	\
+		*(inst)++ = (unsigned char)0xe8;	\
+	} while (0)
+
+#define x86_fldpi(inst)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xd9;	\
+		*(inst)++ = (unsigned char)0xeb;	\
+	} while (0)
+
+#define x86_fst(inst,mem,is_double,pop_stack)	\
+	do {	\
+		*(inst)++ = (is_double) ? (unsigned char)0xdd: (unsigned char)0xd9;	\
+		x86_mem_emit ((inst), 2 + ((pop_stack) ? 1 : 0), (mem));	\
+	} while (0)
+
+#define x86_fst_membase(inst,basereg,disp,is_double,pop_stack)	\
+	do {	\
+		*(inst)++ = (is_double) ? (unsigned char)0xdd: (unsigned char)0xd9;	\
+		x86_membase_emit ((inst), 2 + ((pop_stack) ? 1 : 0), (basereg), (disp));	\
+	} while (0)
+
+#define x86_fst80_mem(inst,mem)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xdb;	\
+		x86_mem_emit ((inst), 7, (mem));	\
+	} while (0)
+
+
+#define x86_fst80_membase(inst,basereg,disp)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xdb;	\
+		x86_membase_emit ((inst), 7, (basereg), (disp));	\
+	} while (0)
+
+
+#define x86_fist_pop(inst,mem,is_long)	\
+	do {	\
+		if ((is_long)) {	\
+			*(inst)++ = (unsigned char)0xdf;	\
+			x86_mem_emit ((inst), 7, (mem));	\
+		} else {	\
+			*(inst)++ = (unsigned char)0xdb;	\
+			x86_mem_emit ((inst), 3, (mem));	\
+		}	\
+	} while (0)
+
+#define x86_fist_pop_membase(inst,basereg,disp,is_long)	\
+	do {	\
+		if ((is_long)) {	\
+			*(inst)++ = (unsigned char)0xdf;	\
+			x86_membase_emit ((inst), 7, (basereg), (disp));	\
+		} else {	\
+			*(inst)++ = (unsigned char)0xdb;	\
+			x86_membase_emit ((inst), 3, (basereg), (disp));	\
+		}	\
+	} while (0)
+
+#define x86_fstsw(inst)	\
+	do {	\
+			*(inst)++ = (unsigned char)0x9b;	\
+			*(inst)++ = (unsigned char)0xdf;	\
+			*(inst)++ = (unsigned char)0xe0;	\
+	} while (0)
+
+/**
+ * @x86_fist_membase
+ * Converts content of ST(0) to integer and stores it at memory location
+ * addressed by [basereg + disp].
+ * is_int specifies whether destination is int32 (TRUE) or int16 (FALSE).
+ */
+#define x86_fist_membase(inst,basereg,disp,is_int)	\
+	do {	\
+		if ((is_int)) {	\
+			*(inst)++ = (unsigned char)0xdb;	\
+			x86_membase_emit ((inst), 2, (basereg), (disp));	\
+		} else {	\
+			*(inst)++ = (unsigned char)0xdf;	\
+			x86_membase_emit ((inst), 2, (basereg), (disp));	\
+		}	\
+	} while (0)
+
+
+#define x86_push_reg(inst,reg)	\
+	do {	\
+		*(inst)++ = (unsigned char)0x50 + (reg);	\
+	} while (0)
+
+#define x86_64_push_reg(inst,reg)	\
+	do {	\
+		if ((reg) > 7)	\
+			*(inst)++ = X86_64_REX(X86_64_REX_B);	\
+		*(inst)++ = (unsigned char)0x50 + ((reg) & 0x7);	\
+	} while (0)
+
+#define x86_push_regp(inst,reg)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xff;	\
+		x86_regp_emit ((inst), 6, (reg));	\
+	} while (0)
+
+#define x86_push_mem(inst,mem)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xff;	\
+		x86_mem_emit ((inst), 6, (mem));	\
+	} while (0)
+
+#define x86_push_membase(inst,basereg,disp)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xff;	\
+		x86_membase_emit ((inst), 6, (basereg), (disp));	\
+	} while (0)
+
+#define x86_64_push_membase(inst,basereg,disp)	\
+	do {	\
+		if ((basereg) > 7)	\
+			*(inst)++ = X86_64_REX(X86_64_REX_B);	\
+		*(inst)++ = (unsigned char)0xff;	\
+		x86_membase_emit ((inst), 6, (basereg), (disp));	\
+	} while (0)
+
+#define x86_push_memindex(inst,basereg,disp,indexreg,shift)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xff;	\
+		x86_memindex_emit ((inst), 6, (basereg), (disp), (indexreg), (shift));	\
+	} while (0)
+
+#define x86_push_imm(inst,imm)	\
+	do {	\
+		*(inst)++ = (unsigned char)0x68;	\
+		x86_imm_emit32 ((inst), (imm));	\
+	} while (0)
+
+#define x86_pop_reg(inst,reg) x86_64_pop_reg(inst, reg)
+/*
+ * x86_64_pop_reg: pop the top of the stack into `reg`.
+ * Registers above 7 get a REX.B prefix; only the low three bits of the
+ * register number go into the opcode byte (0x58 + (reg & 7)), matching
+ * x86_64_push_reg.
+ */
+#define x86_64_pop_reg(inst,reg)	\
+	do {	\
+		if ((reg) > 7)	\
+			*(inst)++ = X86_64_REX(X86_64_REX_B);	\
+		*(inst)++ = (unsigned char)0x58 + ((reg) & 0x7);	\
+	} while (0)
+
+/*
+ * x86_pop_mem: pop the top of the stack into the memory operand `mem`.
+ * POP r/m is encoded as 8F /0 (0x87 would be XCHG).
+ */
+#define x86_pop_mem(inst,mem)	\
+	do {	\
+		*(inst)++ = (unsigned char)0x8f;	\
+		x86_mem_emit ((inst), 0, (mem));	\
+	} while (0)
+
+/*
+ * x86_pop_membase: pop the top of the stack into [basereg + disp].
+ * POP r/m is encoded as 8F /0 (0x87 would be XCHG).
+ */
+#define x86_pop_membase(inst,basereg,disp)	\
+	do {	\
+		*(inst)++ = (unsigned char)0x8f;	\
+		x86_membase_emit ((inst), 0, (basereg), (disp));	\
+	} while (0)
+
+#define x86_pushad(inst) do { *(inst)++ = (unsigned char)0x60; } while (0)
+#define x86_pushfd(inst) do { *(inst)++ = (unsigned char)0x9c; } while (0)
+#define x86_popad(inst)  do { *(inst)++ = (unsigned char)0x61; } while (0)
+#define x86_popfd(inst)  do { *(inst)++ = (unsigned char)0x9d; } while (0)
+
+#define x86_loop(inst,imm)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xe2;	\
+		x86_imm_emit8 ((inst), (imm));	\
+	} while (0)
+
+#define x86_loope(inst,imm)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xe1;	\
+		x86_imm_emit8 ((inst), (imm));	\
+	} while (0)
+
+#define x86_loopne(inst,imm)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xe0;	\
+		x86_imm_emit8 ((inst), (imm));	\
+	} while (0)
+
+#define x86_jump32(inst,imm)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xe9;	\
+		x86_imm_emit32 ((inst), (imm));	\
+	} while (0)
+
+#define x86_jump8(inst,imm)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xeb;	\
+		x86_imm_emit8 ((inst), (imm));	\
+	} while (0)
+
+#define x86_jump_reg(inst,reg)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xff;	\
+		x86_reg_emit ((inst), 4, (reg));	\
+	} while (0)
+
+#define x86_jump_mem(inst,mem)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xff;	\
+		x86_mem_emit ((inst), 4, (mem));	\
+	} while (0)
+
+#define x86_jump_membase(inst,basereg,disp)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xff;	\
+		x86_membase_emit ((inst), 4, (basereg), (disp));	\
+	} while (0)
+
+/*
+ * x86_jump_code: emit an unconditional jump to `target`, which is a
+ * pointer in our buffer.
+ * The displacement is first computed for the 2-byte short form
+ * (x86_jump8); if it does not fit in 8 bits, 3 more bytes are
+ * subtracted to account for the 5-byte x86_jump32 form.
+ * The local is named _x86_offset (as in x86_call_code) so it cannot
+ * capture a caller variable used in the `target` expression.
+ */
+#define x86_jump_code(inst,target)	\
+	do {	\
+		int _x86_offset = (unsigned char*)(target) - (inst) - 2;	\
+		if (x86_is_imm8(_x86_offset)) {	\
+			x86_jump8 ((inst), _x86_offset);	\
+		} else {	\
+			_x86_offset -= 3;	\
+			x86_jump32 ((inst), _x86_offset);	\
+		}	\
+	} while (0)
+
+/*
+ * x86_jump_disp: emit an unconditional jump given `disp`, a
+ * displacement measured from the start of the jump instruction.
+ * 2 bytes are subtracted for the short form (x86_jump8); if the result
+ * does not fit in 8 bits, 3 more are subtracted for the 5-byte
+ * x86_jump32 form.
+ * The local is named _x86_offset (as in x86_call_code) so it cannot
+ * capture a caller variable used in the `disp` expression.
+ */
+#define x86_jump_disp(inst,disp)	\
+	do {	\
+		int _x86_offset = (disp) - 2;	\
+		if (x86_is_imm8(_x86_offset)) {	\
+			x86_jump8 ((inst), _x86_offset);	\
+		} else {	\
+			_x86_offset -= 3;	\
+			x86_jump32 ((inst), _x86_offset);	\
+		}	\
+	} while (0)
+
+#define x86_branch8(inst,cond,imm,is_signed)	\
+	do {	\
+		if ((is_signed))	\
+			*(inst)++ = x86_cc_signed_map [(cond)];	\
+		else	\
+			*(inst)++ = x86_cc_unsigned_map [(cond)];	\
+		x86_imm_emit8 ((inst), (imm));	\
+	} while (0)
+
+#define x86_branch32(inst,cond,imm,is_signed)	\
+	do {	\
+		*(inst)++ = (unsigned char)0x0f;	\
+		if ((is_signed))	\
+			*(inst)++ = x86_cc_signed_map [(cond)] + 0x10;	\
+		else	\
+			*(inst)++ = x86_cc_unsigned_map [(cond)] + 0x10;	\
+		x86_imm_emit32 ((inst), (imm));	\
+	} while (0)
+
+/*
+ * x86_branch: emit a conditional branch on `cond` to `target`.
+ * The displacement is first computed for the 2-byte x86_branch8 form;
+ * if it does not fit in 8 bits, 4 more bytes are subtracted to account
+ * for the 6-byte x86_branch32 form.
+ * The local is named _x86_offset (as in x86_call_code) so it cannot
+ * capture a caller variable named `offset` used in the arguments.
+ */
+#define x86_branch(inst,cond,target,is_signed)	\
+	do {	\
+		int _x86_offset = (target) - (inst) - 2;	\
+		if (x86_is_imm8 ((_x86_offset)))	\
+			x86_branch8 ((inst), (cond), _x86_offset, (is_signed));	\
+		else {	\
+			_x86_offset -= 4;	\
+			x86_branch32 ((inst), (cond), _x86_offset, (is_signed));	\
+		}	\
+	} while (0)
+
+/*
+ * x86_branch_disp: emit a conditional branch on `cond` given `disp`, a
+ * displacement measured from the start of the branch instruction.
+ * 2 bytes are subtracted for the x86_branch8 form; if the result does
+ * not fit in 8 bits, 4 more are subtracted for the 6-byte x86_branch32
+ * form.
+ * The local is named _x86_offset (as in x86_call_code) so it cannot
+ * capture a caller variable named `offset` used in the arguments.
+ */
+#define x86_branch_disp(inst,cond,disp,is_signed)	\
+	do {	\
+		int _x86_offset = (disp) - 2;	\
+		if (x86_is_imm8 ((_x86_offset)))	\
+			x86_branch8 ((inst), (cond), _x86_offset, (is_signed));	\
+		else {	\
+			_x86_offset -= 4;	\
+			x86_branch32 ((inst), (cond), _x86_offset, (is_signed));	\
+		}	\
+	} while (0)
+
+#define x86_set_reg(inst,cond,reg,is_signed)	\
+	do {	\
+                g_assert (X86_IS_BYTE_REG (reg)); \
+		*(inst)++ = (unsigned char)0x0f;	\
+		if ((is_signed))	\
+			*(inst)++ = x86_cc_signed_map [(cond)] + 0x20;	\
+		else	\
+			*(inst)++ = x86_cc_unsigned_map [(cond)] + 0x20;	\
+		x86_reg_emit ((inst), 0, (reg));	\
+	} while (0)
+
+#define x86_set_mem(inst,cond,mem,is_signed)	\
+	do {	\
+		*(inst)++ = (unsigned char)0x0f;	\
+		if ((is_signed))	\
+			*(inst)++ = x86_cc_signed_map [(cond)] + 0x20;	\
+		else	\
+			*(inst)++ = x86_cc_unsigned_map [(cond)] + 0x20;	\
+		x86_mem_emit ((inst), 0, (mem));	\
+	} while (0)
+
+#define x86_set_membase(inst,cond,basereg,disp,is_signed)	\
+	do {	\
+		*(inst)++ = (unsigned char)0x0f;	\
+		if ((is_signed))	\
+			*(inst)++ = x86_cc_signed_map [(cond)] + 0x20;	\
+		else	\
+			*(inst)++ = x86_cc_unsigned_map [(cond)] + 0x20;	\
+		x86_membase_emit ((inst), 0, (basereg), (disp));	\
+	} while (0)
+
+#define x86_call_imm(inst,disp)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xe8;	\
+		x86_imm_emit32 ((inst), (int)(disp));	\
+	} while (0)
+
+#define x86_call_reg(inst,reg)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xff;	\
+		x86_reg_emit ((inst), 2, (reg));	\
+	} while (0)
+
+#define x86_64_call_reg(inst,reg)	\
+	do {	\
+		if ((reg) > 7)	\
+			*(inst)++ = X86_64_REX(X86_64_REX_W|X86_64_REX_B); \
+		*(inst)++ = (unsigned char)0xff;	\
+		x86_reg_emit ((inst), 2, (reg));	\
+	} while (0)
+
+#define x86_call_mem(inst,mem)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xff;	\
+		x86_mem_emit ((inst), 2, (mem));	\
+	} while (0)
+
+#define x86_call_membase(inst,basereg,disp)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xff;	\
+		x86_membase_emit ((inst), 2, (basereg), (disp));	\
+	} while (0)
+
+#define x86_call_code(inst,target)	\
+	do {	\
+		int _x86_offset = (unsigned char*)(target) - (inst);	\
+		_x86_offset -= 5;	\
+		x86_call_imm ((inst), _x86_offset);	\
+	} while (0)
+
+#define x86_ret(inst) do { *(inst)++ = (unsigned char)0xc3; } while (0)
+#define x86_64_ret(inst) do { *(inst)++ = (unsigned char)0xc3; } while (0)
+
+#define x86_ret_imm(inst,imm)	\
+	do {	\
+		if ((imm) == 0) {	\
+			x86_ret ((inst));	\
+		} else {	\
+			*(inst)++ = (unsigned char)0xc2;	\
+			x86_imm_emit16 ((inst), (imm));	\
+		}	\
+	} while (0)
+
+#define x86_cmov_reg(inst,cond,is_signed,dreg,reg)	\
+	do {	\
+		*(inst)++ = (unsigned char) 0x0f;	\
+		if ((is_signed))	\
+			*(inst)++ = x86_cc_signed_map [(cond)] - 0x30;	\
+		else	\
+			*(inst)++ = x86_cc_unsigned_map [(cond)] - 0x30;	\
+		x86_reg_emit ((inst), (dreg), (reg));	\
+	} while (0)
+
+#define x86_cmov_mem(inst,cond,is_signed,reg,mem)	\
+	do {	\
+		*(inst)++ = (unsigned char) 0x0f;	\
+		if ((is_signed))	\
+			*(inst)++ = x86_cc_signed_map [(cond)] - 0x30;	\
+		else	\
+			*(inst)++ = x86_cc_unsigned_map [(cond)] - 0x30;	\
+		x86_mem_emit ((inst), (reg), (mem));	\
+	} while (0)
+
+#define x86_cmov_membase(inst,cond,is_signed,reg,basereg,disp)	\
+	do {	\
+		*(inst)++ = (unsigned char) 0x0f;	\
+		if ((is_signed))	\
+			*(inst)++ = x86_cc_signed_map [(cond)] - 0x30;	\
+		else	\
+			*(inst)++ = x86_cc_unsigned_map [(cond)] - 0x30;	\
+		x86_membase_emit ((inst), (reg), (basereg), (disp));	\
+	} while (0)
+
+#define x86_enter(inst,framesize)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xc8;	\
+		x86_imm_emit16 ((inst), (framesize));	\
+		*(inst)++ = 0;	\
+	} while (0)
+	
+#define x86_leave(inst) do { *(inst)++ = (unsigned char)0xc9; } while (0)
+#define x86_64_leave(inst) do { *(inst)++ = (unsigned char)0xc9; } while (0)
+#define x86_sahf(inst)  do { *(inst)++ = (unsigned char)0x9e; } while (0)
+
+#define x86_fsin(inst) do { *(inst)++ = (unsigned char)0xd9; *(inst)++ = (unsigned char)0xfe; } while (0)
+#define x86_fcos(inst) do { *(inst)++ = (unsigned char)0xd9; *(inst)++ = (unsigned char)0xff; } while (0)
+#define x86_fabs(inst) do { *(inst)++ = (unsigned char)0xd9; *(inst)++ = (unsigned char)0xe1; } while (0)
+#define x86_ftst(inst) do { *(inst)++ = (unsigned char)0xd9; *(inst)++ = (unsigned char)0xe4; } while (0)
+#define x86_fxam(inst) do { *(inst)++ = (unsigned char)0xd9; *(inst)++ = (unsigned char)0xe5; } while (0)
+#define x86_fpatan(inst) do { *(inst)++ = (unsigned char)0xd9; *(inst)++ = (unsigned char)0xf3; } while (0)
+#define x86_fprem(inst) do { *(inst)++ = (unsigned char)0xd9; *(inst)++ = (unsigned char)0xf8; } while (0)
+#define x86_fprem1(inst) do { *(inst)++ = (unsigned char)0xd9; *(inst)++ = (unsigned char)0xf5; } while (0)
+#define x86_frndint(inst) do { *(inst)++ = (unsigned char)0xd9; *(inst)++ = (unsigned char)0xfc; } while (0)
+#define x86_fsqrt(inst) do { *(inst)++ = (unsigned char)0xd9; *(inst)++ = (unsigned char)0xfa; } while (0)
+#define x86_fptan(inst) do { *(inst)++ = (unsigned char)0xd9; *(inst)++ = (unsigned char)0xf2; } while (0)
+
+/*
+ * x86_padding: emit `size` (1..7) bytes of no-op filler; any other size
+ * trips assert (0).
+ *
+ * The IA-32 fillers (mov eax,eax / lea ebp,[ebp] / lea esp,[esp]) must
+ * not be used in 64-bit mode: a write to a 32-bit register zero-extends
+ * into the full 64-bit register, so they would clear the upper halves
+ * of RAX, RBP and RSP.  Padding is therefore built from NOP (0x90) with
+ * up to three 0x66 operand-size prefixes, which changes no state.
+ */
+#define x86_padding(inst,size)	\
+	do {	\
+		switch ((size)) {	\
+		case 1: x86_nop ((inst)); break;	\
+		case 2: *(inst)++ = (unsigned char)0x66;	\
+			*(inst)++ = (unsigned char)0x90; break;	\
+		case 3: *(inst)++ = (unsigned char)0x66; *(inst)++ = (unsigned char)0x66;	\
+			*(inst)++ = (unsigned char)0x90; break;	\
+		case 4: *(inst)++ = (unsigned char)0x66; *(inst)++ = (unsigned char)0x66;	\
+			*(inst)++ = (unsigned char)0x66; *(inst)++ = (unsigned char)0x90;	\
+			break;	\
+		case 5: *(inst)++ = (unsigned char)0x66; *(inst)++ = (unsigned char)0x66;	\
+			*(inst)++ = (unsigned char)0x90; *(inst)++ = (unsigned char)0x66;	\
+			*(inst)++ = (unsigned char)0x90; break;	\
+		case 6: *(inst)++ = (unsigned char)0x66; *(inst)++ = (unsigned char)0x66;	\
+			*(inst)++ = (unsigned char)0x90; *(inst)++ = (unsigned char)0x66;	\
+			*(inst)++ = (unsigned char)0x66; *(inst)++ = (unsigned char)0x90;	\
+			break;	\
+		case 7: *(inst)++ = (unsigned char)0x66; *(inst)++ = (unsigned char)0x66;	\
+			*(inst)++ = (unsigned char)0x90; *(inst)++ = (unsigned char)0x66;	\
+			*(inst)++ = (unsigned char)0x66; *(inst)++ = (unsigned char)0x66;	\
+			*(inst)++ = (unsigned char)0x90; break;	\
+		default: assert (0);	\
+		}	\
+	} while (0)
+
+/*
+ * x86_prolog: emit a function prologue.
+ * Emits x86_enter with `frame_size`, then pushes every register whose
+ * bit is set in `reg_mask` (bit n stands for register number n), in
+ * ascending register order from 0 up to X86_NREG - 1.
+ * NOTE(review): registers are pushed with x86_push_reg, which emits
+ * 0x50 + reg with no REX prefix; if X86_NREG is larger than 8 here,
+ * register numbers above 7 would not be encoded correctly -- confirm.
+ */
+#define x86_prolog(inst,frame_size,reg_mask)	\
+	do {	\
+		unsigned i, m = 1;	\
+		x86_enter ((inst), (frame_size));	\
+		for (i = 0; i < X86_NREG; ++i, m <<= 1) {	\
+			if ((reg_mask) & m)	\
+				x86_push_reg ((inst), i);	\
+		}	\
+	} while (0)
+
+/*
+ * x86_epilog: emit a function epilogue.
+ * Pops every register whose bit is set in `reg_mask`, walking register
+ * numbers downward from X86_EDI to 0, then emits x86_leave and x86_ret.
+ * The loop stops once the mask bit `m` has been shifted down to zero,
+ * i.e. after register 0 has been considered.
+ */
+#define x86_epilog(inst,reg_mask)	\
+	do {	\
+		unsigned i, m = 1 << X86_EDI;	\
+		for (i = X86_EDI; m != 0; i--, m=m>>1) {	\
+			if ((reg_mask) & m)	\
+				x86_pop_reg ((inst), i);	\
+		}	\
+		x86_leave ((inst));	\
+		x86_ret ((inst));	\
+	} while (0)
+
+#define x86_64_movsd_reg_regp(inst,reg,regp)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xf2;	\
+		if ((reg) > 7 || (regp) > 7)	\
+			*(inst)++ = X86_64_REX((((reg) > 7) ? X86_64_REX_R : 0) | (((regp) > 7) ? X86_64_REX_B : 0));	\
+		*(inst)++ = (unsigned char)0x0f;	\
+		*(inst)++ = (unsigned char)0x10;	\
+		x86_regp_emit ((inst), (reg), (regp));	\
+	} while (0)
+
+#define x86_64_movsd_regp_reg(inst,regp,reg)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xf2;	\
+		if ((reg) > 7 || (regp) > 7)	\
+			*(inst)++ = X86_64_REX((((reg) > 7) ? X86_64_REX_R : 0) | (((regp) > 7) ? X86_64_REX_B : 0));	\
+		*(inst)++ = (unsigned char)0x0f;	\
+		*(inst)++ = (unsigned char)0x11;	\
+		x86_regp_emit ((inst), (reg), (regp));	\
+	} while (0)
+
+#define x86_64_movss_reg_regp(inst,reg,regp)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xf3;	\
+		if ((reg) > 7 || (regp) > 7)	\
+			*(inst)++ = X86_64_REX((((reg) > 7) ? X86_64_REX_R : 0) | (((regp) > 7) ? X86_64_REX_B : 0));	\
+		*(inst)++ = (unsigned char)0x0f;	\
+		*(inst)++ = (unsigned char)0x10;	\
+		x86_regp_emit ((inst), (reg), (regp));	\
+	} while (0)
+
+#define x86_64_movss_regp_reg(inst,regp,reg)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xf3;	\
+		if ((reg) > 7 || (regp) > 7)	\
+			*(inst)++ = X86_64_REX((((reg) > 7) ? X86_64_REX_R : 0) | (((regp) > 7) ? X86_64_REX_B : 0));	\
+		*(inst)++ = (unsigned char)0x0f;	\
+		*(inst)++ = (unsigned char)0x11;	\
+		x86_regp_emit ((inst), (reg), (regp));	\
+	} while (0)
+
+#define x86_64_movsd_reg_membase(inst,reg,basereg,disp)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xf2;	\
+		if ((reg) > 7 || (basereg) > 7)	\
+			*(inst)++ = X86_64_REX((((reg) > 7) ? X86_64_REX_R : 0) | (((basereg) > 7) ? X86_64_REX_B : 0));	\
+		*(inst)++ = (unsigned char)0x0f;	\
+		*(inst)++ = (unsigned char)0x10;	\
+		x86_membase_emit ((inst), (reg), (basereg), (disp));	\
+	} while (0)
+
+#define x86_64_movss_reg_membase(inst,reg,basereg,disp)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xf3;	\
+		if ((reg) > 7 || (basereg) > 7)	\
+			*(inst)++ = X86_64_REX((((reg) > 7) ? X86_64_REX_R : 0) | (((basereg) > 7) ? X86_64_REX_B : 0));	\
+		*(inst)++ = (unsigned char)0x0f;	\
+		*(inst)++ = (unsigned char)0x10;	\
+		x86_membase_emit ((inst), (reg), (basereg), (disp));	\
+	} while (0)
+
+#define x86_64_movsd_membase_reg(inst,reg,basereg,disp)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xf2;	\
+		if ((reg) > 7 || (basereg) > 7)	\
+			*(inst)++ = X86_64_REX((((reg) > 7) ? X86_64_REX_R : 0) | (((basereg) > 7) ? X86_64_REX_B : 0));	\
+		*(inst)++ = (unsigned char)0x0f;	\
+		*(inst)++ = (unsigned char)0x11;	\
+		x86_membase_emit ((inst), (reg), (basereg), (disp));	\
+	} while (0)
+
+#define x86_64_movss_membase_reg(inst,reg,basereg,disp)	\
+	do {	\
+		*(inst)++ = (unsigned char)0xf3;	\
+		if ((reg) > 7 || (basereg) > 7)	\
+			*(inst)++ = X86_64_REX((((reg) > 7) ? X86_64_REX_R : 0) | (((basereg) > 7) ? X86_64_REX_B : 0));	\
+		*(inst)++ = (unsigned char)0x0f;	\
+		*(inst)++ = (unsigned char)0x11;	\
+		x86_membase_emit ((inst), (reg), (basereg), (disp));	\
+	} while (0)
+
+#endif // X86_64_H




More information about the Mono-devel-list mailing list