Add mesa headers from git commit fa7829c36b78b8ecc42238cbc0a02d1059320c77

2013-08-12 13:12:09 -07:00 · 2013-08-12 13:12:09 -07:00 · 535b1cb0ab
parent 810c434324
commit 535b1cb0ab
52 changed files with 17371 additions and 0 deletions
--- a/include/c99_compat.h
+++ b/include/c99_compat.h
@ -0,0 +1,145 @@
+/**************************************************************************
+ *
+ * Copyright 2007-2013 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef _C99_COMPAT_H_
+#define _C99_COMPAT_H_
+
+
+/*
+ * MSVC hacks.
+ */
+#if defined(_MSC_VER)
+   /*
+    * Visual Studio 2012 will complain if we define the `inline` keyword, but
+    * actually it only supports the keyword on C++.
+    *
+    * To avoid this, _ALLOW_KEYWORD_MACROS must be set.
+    */
+#  if (_MSC_VER >= 1700) && !defined(_ALLOW_KEYWORD_MACROS)
+#    define _ALLOW_KEYWORD_MACROS
+#  endif
+
+   /*
+    * XXX: MSVC has a `__restrict` keyword, but it also has a
+    * `__declspec(restrict)` modifier, so it is impossible to define a
+    * `restrict` macro without interfering with the latter.  Furthermore the
+    * MSVC standard library uses __declspec(restrict) under the _CRTRESTRICT
+    * macro.  For now resolve this issue by redefining _CRTRESTRICT, but going
+    * forward we should probably stop using restrict, especially
+    * considering that our code does not obey strict aliasing rules anyway.
+    */
+#  include <crtdefs.h>
+   /* Replace whatever _CRTRESTRICT was with an empty expansion. */
+#  undef _CRTRESTRICT
+#  define _CRTRESTRICT
+#endif
+
+
+/*
+ * C99 inline keyword
+ *
+ * Unless `inline` is already a macro, map it to a spelling the detected
+ * compiler accepts.  The branches are tested in order and the first match
+ * wins; branches that contain only a comment leave the real keyword alone.
+ * If nothing matches, `inline` expands to nothing.
+ */
+#ifndef inline
+#  ifdef __cplusplus
+     /* C++ supports inline keyword */
+#  elif defined(__GNUC__)
+#    define inline __inline__
+#  elif defined(_MSC_VER)
+#    define inline __inline
+#  elif defined(__ICL)
+#    define inline __inline
+#  elif defined(__INTEL_COMPILER)
+     /* Intel compiler supports inline keyword */
+#  elif defined(__WATCOMC__) && (__WATCOMC__ >= 1100)
+#    define inline __inline
+#  elif defined(__SUNPRO_C) && defined(__C99FEATURES__)
+     /* C99 supports inline keyword */
+#  elif (__STDC_VERSION__ >= 199901L)
+     /* C99 supports inline keyword (an undefined __STDC_VERSION__ is 0 here) */
+#  else
+#    define inline
+#  endif
+#endif
+
+
+/*
+ * C99 restrict keyword
+ *
+ * Unless `restrict` is already a macro: keep the native keyword when the
+ * compiler reports C99 (or Sun C with __C99FEATURES__), otherwise map it to
+ * __restrict__ (GCC) or __restrict (MSVC), and as a last resort make it
+ * expand to nothing.
+ *
+ * See also:
+ * - http://cellperformance.beyond3d.com/articles/2006/05/demystifying-the-restrict-keyword.html
+ */
+#ifndef restrict
+#  if (__STDC_VERSION__ >= 199901L)
+     /* C99 */
+#  elif defined(__SUNPRO_C) && defined(__C99FEATURES__)
+     /* C99 */
+#  elif defined(__GNUC__)
+#    define restrict __restrict__
+#  elif defined(_MSC_VER)
+#    define restrict __restrict
+#  else
+#    define restrict /* */
+#  endif
+#endif
+
+
+/*
+ * C99 __func__ macro
+ *
+ * Unless __func__ is already a macro: leave it alone when the compiler
+ * reports C99 (or Sun C with __C99FEATURES__); otherwise alias it to
+ * __FUNCTION__ on GCC >= 2 and on MSVC with _MSC_VER >= 1300.  Everything
+ * else gets the string literal "<unknown>".
+ */
+#ifndef __func__
+#  if (__STDC_VERSION__ >= 199901L)
+     /* C99 */
+#  elif defined(__SUNPRO_C) && defined(__C99FEATURES__)
+     /* C99 */
+#  elif defined(__GNUC__)
+#    if __GNUC__ >= 2
+#      define __func__ __FUNCTION__
+#    else
+#      define __func__ "<unknown>"
+#    endif
+#  elif defined(_MSC_VER)
+#    if _MSC_VER >= 1300
+#      define __func__ __FUNCTION__
+#    else
+#      define __func__ "<unknown>"
+#    endif
+#  else
+#    define __func__ "<unknown>"
+#  endif
+#endif
+
+
+/* Simple test case for debugging */
+#if 0
+static inline const char *
+test_c99_compat_h(const void * restrict a,
+                  const void * restrict b)
+{
+   return __func__;
+}
+#endif
+
+
+#endif /* _C99_COMPAT_H_ */
--- a/include/compiler.h
+++ b/include/compiler.h
@ -0,0 +1,445 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 1999-2008  Brian Paul   All Rights Reserved.
+ * Copyright (C) 2009  VMware, Inc.  All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+/**
+ * \file compiler.h
+ * Compiler-related stuff.
+ */
+
+
+#ifndef COMPILER_H
+#define COMPILER_H
+
+
+#include <assert.h>
+#include <ctype.h>
+#if defined(__alpha__) && defined(CCPML)
+#include <cpml.h> /* use Compaq's Fast Math Library on Alpha */
+#else
+#include <math.h>
+#endif
+#include <limits.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <float.h>
+#include <stdarg.h>
+
+#include "c99_compat.h" /* inline, __func__, etc. */
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/**
+ * Get standard integer types
+ */
+#include <stdint.h>
+
+
+/**
+ * Sun compilers define __i386 instead of the gcc-style __i386__
+ *
+ * Under __SUNPRO_C, add the gcc-style architecture macro for the first of
+ * __i386 / __amd64 / __sparc that still lacks one.  The three tests form a
+ * single #elif chain, so at most one macro is added.  __volatile is mapped
+ * to volatile unless it is already a macro.
+ */
+#ifdef __SUNPRO_C
+# if !defined(__i386__) && defined(__i386)
+#  define __i386__
+# elif !defined(__amd64__) && defined(__amd64)
+#  define __amd64__
+# elif !defined(__sparc__) && defined(__sparc)
+#  define __sparc__
+# endif
+# if !defined(__volatile)
+#  define __volatile volatile
+# endif
+#endif
+
+
+/**
+ * finite macro.
+ */
+#if defined(_MSC_VER)
+#  define finite _finite
+#elif defined(__WATCOMC__)
+#  define finite _finite
+#endif
+
+
+/**
+ * Disable assorted warnings
+ *
+ * The `#pragma warning` group applies to _WIN32 builds (OPENSTEP, Cygwin and
+ * BUILD_FOR_SNAP excluded) whose compiler is not GCC.  The second group is
+ * only disabled when MESA_MINWARN is defined.  Watcom has its own pragma.
+ */
+#if !defined(OPENSTEP) && (defined(_WIN32) && !defined(__CYGWIN__)) && !defined(BUILD_FOR_SNAP)
+#  if !defined(__GNUC__) /* skipped for gcc, i.e. the mingw environment */
+#    pragma warning( disable : 4068 ) /* unknown pragma */
+#    pragma warning( disable : 4710 ) /* function 'foo' not inlined */
+#    pragma warning( disable : 4711 ) /* function 'foo' selected for automatic inline expansion */
+#    pragma warning( disable : 4127 ) /* conditional expression is constant */
+#    if defined(MESA_MINWARN)
+#      pragma warning( disable : 4244 ) /* '=' : conversion from 'const double ' to 'float ', possible loss of data */
+#      pragma warning( disable : 4018 ) /* '<' : signed/unsigned mismatch */
+#      pragma warning( disable : 4305 ) /* '=' : truncation from 'const double ' to 'float ' */
+#      pragma warning( disable : 4550 ) /* 'function' undefined; assuming extern returning int */
+#      pragma warning( disable : 4761 ) /* integral size mismatch in argument; conversion supplied */
+#    endif
+#  endif
+#endif
+#if defined(__WATCOMC__)
+#  pragma disable_message(201) /* Disable unreachable code warnings */
+#endif
+
+
+
+/* XXX: Use standard `inline` keyword instead */
+#ifndef INLINE
+#  define INLINE inline
+#endif
+
+
+/**
+ * PUBLIC/USED macros
+ *
+ * If we build the library with gcc's -fvisibility=hidden flag, we'll
+ * use the PUBLIC macro to mark functions that are to be exported.
+ *
+ * We also need to define a USED attribute, so the optimizer doesn't
+ * inline a static function that we later use in an alias. - ajax
+ *
+ * Both expand to attributes on GCC >= 4 and Sun C >= 0x590, and to nothing
+ * elsewhere.
+ *
+ * NOTE(review): USED is defined only inside the `#ifndef PUBLIC` guard, so
+ * a build that pre-defines PUBLIC must also supply USED itself -- confirm
+ * this is intended.
+ */
+#ifndef PUBLIC
+#  if (defined(__GNUC__) && __GNUC__ >= 4) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))
+#    define PUBLIC __attribute__((visibility("default")))
+#    define USED __attribute__((used))
+#  else
+#    define PUBLIC
+#    define USED
+#  endif
+#endif
+
+
+/**
+ * __builtin_expect macros
+ *
+ * Without GCC, __builtin_expect(x, y) becomes a macro that yields x and
+ * drops the hint y.  likely()/unlikely() (unless already defined) wrap
+ * __builtin_expect on GCC, normalizing the condition with `!!`, and are
+ * plain pass-throughs on other compilers.
+ */
+#if !defined(__GNUC__)
+#  define __builtin_expect(x, y) (x)
+#endif
+
+#ifndef likely
+#  ifdef __GNUC__
+#    define likely(x)   __builtin_expect(!!(x), 1)
+#    define unlikely(x) __builtin_expect(!!(x), 0)
+#  else
+#    define likely(x)   (x)
+#    define unlikely(x) (x)
+#  endif
+#endif
+
+/* XXX: Use standard `__func__` instead */
+#ifndef __FUNCTION__
+#  define __FUNCTION__ __func__
+#endif
+
+/**
+ * Either define MESA_BIG_ENDIAN or MESA_LITTLE_ENDIAN, and CPU_TO_LE32.
+ * Do not use these unless absolutely necessary!
+ * Try to use a runtime test instead.
+ * For now, only used by some DRI hardware drivers for color/texel packing.
+ */
+#if defined(BYTE_ORDER) && defined(BIG_ENDIAN) && BYTE_ORDER == BIG_ENDIAN
+#if defined(__linux__)
+#include <byteswap.h>
+#define CPU_TO_LE32( x )	bswap_32( x )
+#elif defined(__APPLE__)
+#include <CoreFoundation/CFByteOrder.h>
+#define CPU_TO_LE32( x )	CFSwapInt32HostToLittle( x )
+#elif (defined(_AIX) || defined(__blrts))
+/* Hand-written 32-bit byte swap for the AIX/blrts branch of this chain.
+ * Takes and returns uint32_t from <stdint.h>, which this header includes
+ * itself, instead of GLuint, which neither this header nor its includes
+ * declare.
+ */
+static INLINE uint32_t CPU_TO_LE32(uint32_t x)
+{
+   return (((x & 0x000000ff) << 24) |
+           ((x & 0x0000ff00) <<  8) |
+           ((x & 0x00ff0000) >>  8) |
+           ((x & 0xff000000) >> 24));
+}
+#elif defined(__OpenBSD__)
+#include <sys/types.h>
+#define CPU_TO_LE32( x )	htole32( x )
+#else /*__linux__ */
+#include <sys/endian.h>
+#define CPU_TO_LE32( x )	bswap32( x )
+#endif /*__linux__*/
+#define MESA_BIG_ENDIAN 1
+#else
+#define CPU_TO_LE32( x )	( x )
+#define MESA_LITTLE_ENDIAN 1
+#endif
+#define LE32_TO_CPU( x )	CPU_TO_LE32( x )
+
+
+
+#if !defined(CAPI) && defined(_WIN32) && !defined(BUILD_FOR_SNAP)
+#define CAPI _cdecl
+#endif
+
+
+/**
+ * Create a macro so that asm functions can be linked into compilers other
+ * than GNU C
+ */
+#ifndef _ASMAPI
+#if defined(_WIN32) && !defined(BUILD_FOR_SNAP)/* was: !defined( __GNUC__ ) && !defined( VMS ) && !defined( __INTEL_COMPILER )*/
+#define _ASMAPI __cdecl
+#else
+#define _ASMAPI
+#endif
+#ifdef	PTR_DECL_IN_FRONT
+#define	_ASMAPIP * _ASMAPI
+#else
+#define	_ASMAPIP _ASMAPI *
+#endif
+#endif
+
+#ifdef USE_X86_ASM
+#define _NORMAPI _ASMAPI
+#define _NORMAPIP _ASMAPIP
+#else
+#define _NORMAPI
+#define _NORMAPIP *
+#endif
+
+
+/* Turn off macro checking systems used by other libraries */
+#ifdef CHECK
+#undef CHECK
+#endif
+
+
+/**
+ * ASSERT macro
+ *
+ * - BUILD_FOR_SNAP together with CHECKED: forwards to _CHECK(X).
+ * - DEBUG: forwards to assert(X).
+ * - otherwise: expands to nothing, so X is not evaluated.
+ *
+ * NOTE(review): nothing is defined here when _WIN32_WCE is set; presumably
+ * that platform supplies its own ASSERT -- confirm.
+ */
+#if !defined(_WIN32_WCE)
+#if defined(BUILD_FOR_SNAP) && defined(CHECKED)
+#  define ASSERT(X)   _CHECK(X) 
+#elif defined(DEBUG)
+#  define ASSERT(X)   assert(X)
+#else
+#  define ASSERT(X)
+#endif
+#endif
+
+
+/**
+ * Static (compile-time) assertion.
+ * Basically, use COND to dimension an array.  If COND is false/zero the
+ * array size will be -1 and we'll get a compilation error; if COND is true
+ * the size is 1 and the sizeof expression is simply discarded.
+ *
+ * The macro expands to a do { } while (0) statement, so it can only be used
+ * where a statement is allowed and needs a trailing semicolon.
+ */
+#define STATIC_ASSERT(COND) \
+   do { \
+      (void) sizeof(char [1 - 2*!(COND)]); \
+   } while (0)
+
+
+#if (__GNUC__ >= 3)
+#define PRINTFLIKE(f, a) __attribute__ ((format(__printf__, f, a)))
+#else
+#define PRINTFLIKE(f, a)
+#endif
+
+#ifndef NULL
+#define NULL 0
+#endif
+
+
+/**
+ * LONGSTRING macro
+ * gcc -pedantic warns about long string literals, LONGSTRING silences that.
+ */
+#if !defined(__GNUC__)
+# define LONGSTRING
+#else
+# define LONGSTRING __extension__
+#endif
+
+
+#ifndef M_PI
+#define M_PI (3.14159265358979323846)
+#endif
+
+#ifndef M_E
+#define M_E (2.7182818284590452354)
+#endif
+
+#ifndef M_LOG2E
+#define M_LOG2E     (1.4426950408889634074)
+#endif
+
+#ifndef ONE_DIV_SQRT_LN2
+#define ONE_DIV_SQRT_LN2 (1.201122408786449815)
+#endif
+
+#ifndef FLT_MAX_EXP
+#define FLT_MAX_EXP 128
+#endif
+
+
+/**
+ * USE_IEEE: Determine if we're using IEEE floating point
+ */
+#if defined(__i386__) || defined(__386__) || defined(__sparc__) || \
+    defined(__s390__) || defined(__s390x__) || defined(__powerpc__) || \
+    defined(__x86_64__) || \
+    defined(__m68k__) || \
+    defined(ia64) || defined(__ia64__) || \
+    defined(__hppa__) || defined(hpux) || \
+    defined(__mips) || defined(_MIPS_ARCH) || \
+    defined(__arm__) || \
+    defined(__sh__) || defined(__m32r__) || \
+    (defined(__sun) && defined(_IEEE_754)) || \
+    defined(__alpha__)
+#define USE_IEEE
+#define IEEE_ONE 0x3f800000
+#endif
+
+
+/**
+ * START/END_FAST_MATH macros:
+ *
+ * START_FAST_MATH: Set x86 FPU to faster, 32-bit precision mode (and save
+ *                  original mode to a temporary).
+ * END_FAST_MATH: Restore x86 FPU to original mode.
+ *
+ * Three implementations follow (GCC on i386, Watcom on 386, MSVC on x86).
+ * In each, defining NO_FAST_MATH makes START_FAST_MATH load
+ * DEFAULT_X86_FPU instead of FAST_X86_FPU.  On every other target
+ * START_FAST_MATH(x) just assigns 0 to x and END_FAST_MATH(x) only
+ * evaluates x.
+ *
+ * NOTE(review): the mask variables are declared GLuint/GLushort, which
+ * this header's own includes do not declare; presumably the GL headers are
+ * included wherever these macros are expanded -- verify.
+ */
+#if defined(__GNUC__) && defined(__i386__)
+/*
+ * Set the x86 FPU control word to guarantee only 32 bits of precision
+ * are stored in registers.  Allowing the FPU to store more introduces
+ * differences between situations where numbers are pulled out of memory
+ * vs. situations where the compiler is able to optimize register usage.
+ *
+ * In the worst case, we force the compiler to use a memory access to
+ * truncate the float, by specifying the 'volatile' keyword.
+ */
+/* Hardware default: All exceptions masked, extended double precision,
+ * round to nearest (IEEE compliant):
+ */
+#define DEFAULT_X86_FPU		0x037f
+/* All exceptions masked, single precision, round to nearest:
+ */
+#define FAST_X86_FPU		0x003f
+/* The fldcw instruction will cause any pending FP exceptions to be
+ * raised prior to entering the block, and we clear any pending
+ * exceptions before exiting the block.  Hence, asm code has free
+ * rein over the FPU while in the fast math block.
+ */
+#if defined(NO_FAST_MATH)
+#define START_FAST_MATH(x)						\
+do {									\
+   static GLuint mask = DEFAULT_X86_FPU;				\
+   __asm__ ( "fnstcw %0" : "=m" (*&(x)) );				\
+   __asm__ ( "fldcw %0" : : "m" (mask) );				\
+} while (0)
+#else
+#define START_FAST_MATH(x)						\
+do {									\
+   static GLuint mask = FAST_X86_FPU;					\
+   __asm__ ( "fnstcw %0" : "=m" (*&(x)) );				\
+   __asm__ ( "fldcw %0" : : "m" (mask) );				\
+} while (0)
+#endif
+/* Restore original FPU mode, and clear any exceptions that may have
+ * occurred in the FAST_MATH block.
+ */
+#define END_FAST_MATH(x)						\
+do {									\
+   __asm__ ( "fnclex ; fldcw %0" : : "m" (*&(x)) );			\
+} while (0)
+
+#elif defined(__WATCOMC__) && defined(__386__)
+#define DEFAULT_X86_FPU		0x037f /* See GCC comments above */
+#define FAST_X86_FPU		0x003f /* See GCC comments above */
+void _watcom_start_fast_math(unsigned short *x,unsigned short *mask);
+#pragma aux _watcom_start_fast_math =                                   \
+   "fnstcw  word ptr [eax]"                                             \
+   "fldcw   word ptr [ecx]"                                             \
+   parm [eax] [ecx]                                                     \
+   modify exact [];
+void _watcom_end_fast_math(unsigned short *x);
+#pragma aux _watcom_end_fast_math =                                     \
+   "fnclex"                                                             \
+   "fldcw   word ptr [eax]"                                             \
+   parm [eax]                                                           \
+   modify exact [];
+#if defined(NO_FAST_MATH)
+#define START_FAST_MATH(x)                                              \
+do {                                                                    \
+   static GLushort mask = DEFAULT_X86_FPU;	                        \
+   _watcom_start_fast_math(&x,&mask);                                   \
+} while (0)
+#else
+#define START_FAST_MATH(x)                                              \
+do {                                                                    \
+   static GLushort mask = FAST_X86_FPU;                                 \
+   _watcom_start_fast_math(&x,&mask);                                   \
+} while (0)
+#endif
+#define END_FAST_MATH(x)  _watcom_end_fast_math(&x)
+
+#elif defined(_MSC_VER) && defined(_M_IX86)
+#define DEFAULT_X86_FPU		0x037f /* See GCC comments above */
+#define FAST_X86_FPU		0x003f /* See GCC comments above */
+#if defined(NO_FAST_MATH)
+#define START_FAST_MATH(x) do {\
+	static GLuint mask = DEFAULT_X86_FPU;\
+	__asm fnstcw word ptr [x]\
+	__asm fldcw word ptr [mask]\
+} while(0)
+#else
+#define START_FAST_MATH(x) do {\
+	static GLuint mask = FAST_X86_FPU;\
+	__asm fnstcw word ptr [x]\
+	__asm fldcw word ptr [mask]\
+} while(0)
+#endif
+#define END_FAST_MATH(x) do {\
+	__asm fnclex\
+	__asm fldcw word ptr [x]\
+} while(0)
+
+#else
+/* No FPU control available: keep the same call shape, do no mode change. */
+#define START_FAST_MATH(x)  x = 0
+#define END_FAST_MATH(x)  (void)(x)
+#endif
+
+
+#ifndef Elements
+#define Elements(x) (sizeof(x)/sizeof(*(x)))
+#endif
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* COMPILER_H */
--- a/include/glheader.h
+++ b/include/glheader.h
@ -0,0 +1,196 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  7.5
+ *
+ * Copyright (C) 1999-2008  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+/**
+ * \file glheader.h
+ * Wrapper for GL/gl.h and GL/glext.h
+ */
+
+
+#ifndef GLHEADER_H
+#define GLHEADER_H
+
+
+/*
+ * WGLAPI: import/export decoration for Win32 builds.  Any earlier
+ * definition is discarded first.  Outside the __WIN32__ bracket below,
+ * WGLAPI is left undefined.
+ *
+ * NOTE(review): the `_DLL` branch and the final fallback both expand to
+ * __declspec(dllimport), even though the fallback is commented as the
+ * static-link case -- confirm that is intended.
+ */
+#ifdef WGLAPI
+#undef WGLAPI
+#endif
+
+
+#if !defined(OPENSTEP) && (defined(__WIN32__) && !defined(__CYGWIN__)) && !defined(BUILD_FOR_SNAP)
+#  if (defined(_MSC_VER) || defined(__MINGW32__)) && defined(BUILD_GL32) /* tag specifying we're building mesa as a DLL */
+#    define WGLAPI __declspec(dllexport)
+#  elif (defined(_MSC_VER) || defined(__MINGW32__)) && defined(_DLL) /* tag specifying we're building for DLL runtime support */
+#    define WGLAPI __declspec(dllimport)
+#  else /* for use with static link lib build of Win32 edition only */
+#    define WGLAPI __declspec(dllimport)
+#  endif /* _STATIC_MESA support */
+#endif /* WIN32 / CYGWIN bracket */
+
+
+#define GL_GLEXT_PROTOTYPES
+#include "GL/gl.h"
+#include "GL/glext.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/**
+ * GL_FIXED is defined in glext.h version 64 but these typedefs aren't (yet).
+ */
+typedef int GLfixed;
+typedef int GLclampx;
+
+
+#ifndef GL_OES_EGL_image
+typedef void *GLeglImageOES;
+#endif
+
+
+#ifndef GL_OES_EGL_image_external
+#define GL_TEXTURE_EXTERNAL_OES                                 0x8D65
+#define GL_SAMPLER_EXTERNAL_OES                                 0x8D66
+#define GL_TEXTURE_BINDING_EXTERNAL_OES                         0x8D67
+#define GL_REQUIRED_TEXTURE_IMAGE_UNITS_OES                     0x8D68
+#endif
+
+
+#ifndef GL_OES_point_size_array
+#define GL_POINT_SIZE_ARRAY_OES                                 0x8B9C
+#define GL_POINT_SIZE_ARRAY_TYPE_OES                            0x898A
+#define GL_POINT_SIZE_ARRAY_STRIDE_OES                          0x898B
+#define GL_POINT_SIZE_ARRAY_POINTER_OES                         0x898C
+#define GL_POINT_SIZE_ARRAY_BUFFER_BINDING_OES                  0x8B9F
+#endif
+
+
+#ifndef GL_OES_draw_texture
+#define GL_TEXTURE_CROP_RECT_OES  0x8B9D
+#endif
+
+
+#ifndef GL_PROGRAM_BINARY_LENGTH_OES
+#define GL_PROGRAM_BINARY_LENGTH_OES 0x8741
+#endif
+
+/* GLES 2.0 tokens */
+#ifndef GL_RGB565
+#define GL_RGB565 0x8D62
+#endif
+
+#ifndef GL_TEXTURE_GEN_STR_OES
+#define GL_TEXTURE_GEN_STR_OES                                  0x8D60
+#endif
+
+#ifndef GL_OES_compressed_paletted_texture
+#define GL_PALETTE4_RGB8_OES                                    0x8B90
+#define GL_PALETTE4_RGBA8_OES                                   0x8B91
+#define GL_PALETTE4_R5_G6_B5_OES                                0x8B92
+#define GL_PALETTE4_RGBA4_OES                                   0x8B93
+#define GL_PALETTE4_RGB5_A1_OES                                 0x8B94
+#define GL_PALETTE8_RGB8_OES                                    0x8B95
+#define GL_PALETTE8_RGBA8_OES                                   0x8B96
+#define GL_PALETTE8_R5_G6_B5_OES                                0x8B97
+#define GL_PALETTE8_RGBA4_OES                                   0x8B98
+#define GL_PALETTE8_RGB5_A1_OES                                 0x8B99
+#endif
+
+#ifndef GL_OES_matrix_get
+#define GL_MODELVIEW_MATRIX_FLOAT_AS_INT_BITS_OES               0x898D
+#define GL_PROJECTION_MATRIX_FLOAT_AS_INT_BITS_OES              0x898E
+#define GL_TEXTURE_MATRIX_FLOAT_AS_INT_BITS_OES                 0x898F
+#endif
+
+#ifndef GL_ES_VERSION_2_0
+#define GL_SHADER_BINARY_FORMATS            0x8DF8
+#define GL_NUM_SHADER_BINARY_FORMATS        0x8DF9
+#define GL_SHADER_COMPILER                  0x8DFA
+#define GL_MAX_VERTEX_UNIFORM_VECTORS       0x8DFB
+#define GL_MAX_VARYING_VECTORS              0x8DFC
+#define GL_MAX_FRAGMENT_UNIFORM_VECTORS     0x8DFD
+#endif
+
+#ifndef GL_ATI_texture_compression_3dc
+#define GL_ATI_texture_compression_3dc 1
+#define GL_COMPRESSED_LUMINANCE_ALPHA_3DC_ATI 0x8837
+#endif
+
+#ifndef GL_OES_compressed_ETC1_RGB8_texture
+#define GL_ETC1_RGB8_OES                                        0x8D64
+#endif
+
+
+/* Inexplicably, GL_HALF_FLOAT_OES has a different value than GL_HALF_FLOAT.
+ */
+#ifndef GL_HALF_FLOAT_OES
+#define GL_HALF_FLOAT_OES 0x8D61
+#endif
+
+
+/**
+ * Internal token to represent a GLSL shader program (a collection of
+ * one or more shaders that get linked together).  Note that GLSL
+ * shaders and shader programs share one name space (one hash table)
+ * so we need a value that's different from any of the
+ * GL_VERTEX/FRAGMENT/GEOMETRY_PROGRAM tokens.
+ */
+#define GL_SHADER_PROGRAM_MESA 0x9999
+
+
+/**
+ * Internal token for geometry programs.
+ * Use the value for GL_GEOMETRY_PROGRAM_NV for now.
+ */
+#define MESA_GEOMETRY_PROGRAM 0x8c26
+
+/* Several fields of struct gl_config can take these as values.  Since
+ * GLX header files may not be available everywhere they need to be used,
+ * redefine them here.
+ */
+#define GLX_NONE                           0x8000
+#define GLX_SLOW_CONFIG                    0x8001
+#define GLX_TRUE_COLOR                     0x8002
+#define GLX_DIRECT_COLOR                   0x8003
+#define GLX_PSEUDO_COLOR                   0x8004
+#define GLX_STATIC_COLOR                   0x8005
+#define GLX_GRAY_SCALE                     0x8006
+#define GLX_STATIC_GRAY                    0x8007
+#define GLX_TRANSPARENT_RGB                0x8008
+#define GLX_TRANSPARENT_INDEX              0x8009
+#define GLX_NON_CONFORMANT_CONFIG          0x800D
+#define GLX_SWAP_EXCHANGE_OML              0x8061
+#define GLX_SWAP_COPY_OML                  0x8062
+#define GLX_SWAP_UNDEFINED_OML             0x8063
+
+#define GLX_DONT_CARE                      0xFFFFFFFF
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* GLHEADER_H */
--- a/include/main/glheader.h
+++ b/include/main/glheader.h
@ -0,0 +1 @@
+../glheader.h
--- a/src/arch/sparc/norm.S
+++ b/src/arch/sparc/norm.S
@ -0,0 +1,605 @@
+
+#include "sparc_matrix.h"
+
+	.register %g2, #scratch
+	.register %g3, #scratch
+
+	.text
+
+	/* STACK_VAR_OFF: offset from %sp of the scratch words the routines
+	 * below use to move integer register values into FP registers.
+	 * NOTE(review): 2047 looks like the 64-bit SPARC stack bias and
+	 * 8*16 / 4*16 like a 16-register save area -- confirm against the ABI.
+	 */
+#ifdef __arch64__
+#define STACK_VAR_OFF	(2047 + (8 * 16))
+#else
+#define STACK_VAR_OFF	(4 * 16)
+#endif
+
+	/* Newton-Raphson approximation turns out to be slower
+	 * (and less accurate) than direct fsqrts/fdivs.
+	 */
+	/* Bit pattern that the routines below load into %f12 as 1.0f. */
+#define ONE_DOT_ZERO	0x3f800000
+
+	/*
+	 * _mesa_sparc_transform_normalize_normals
+	 *
+	 * Multiplies each input normal by elements 0,1,2,4,5,6,8,9,10 of
+	 * mat->inv and stores a rescaled result in dest.
+	 *
+	 * lengths == NULL: each transformed vector is multiplied by
+	 *                  1.0 / sqrt(tx*tx + ty*ty + tz*tz); scale is
+	 *                  loaded but not used on this path.
+	 * lengths != NULL: the matrix elements are multiplied by scale once,
+	 *                  then each transformed vector is multiplied by
+	 *                  lengths[i].
+	 *
+	 * dest->count is set to in->count before the count check; when
+	 * in->count < 1 nothing else is written.  Input vectors are
+	 * in->stride bytes apart, output vectors 0x10 bytes apart.
+	 */
+	.globl	_mesa_sparc_transform_normalize_normals
+_mesa_sparc_transform_normalize_normals:
+	/* o0=mat o1=scale o2=in o3=lengths o4=dest */
+
+	/* Bounce 1.0f and scale through the stack to get them into FP regs. */
+	sethi	%hi(ONE_DOT_ZERO), %g2
+	sub	%sp, 16, %sp
+	st	%g2, [%sp + STACK_VAR_OFF+0x0]
+	st	%o1, [%sp + STACK_VAR_OFF+0x4]
+	ld	[%sp + STACK_VAR_OFF+0x0], %f12	! f12 = 1.0f
+	ld	[%sp + STACK_VAR_OFF+0x4], %f15	! f15 = scale
+	add	%sp, 16, %sp
+
+	LDPTR	[%o0 + MAT_INV], %o0		! o0 = mat->inv
+	LDPTR	[%o2 + V4F_START], %o5		! o5 = 'from' in->start
+	ld	[%o2 + V4F_COUNT], %g1		! g1 = in->count
+	ld	[%o2 + V4F_STRIDE], %g2		! g2 = in->stride
+	LDPTR	[%o4 + V4F_START], %g3		! g3 = 'out' dest->start
+
+	LDMATRIX_0_1_2_4_5_6_8_9_10(%o0)
+
+	/* dest->count = in->count */
+	st	%g1, [%o4 + V4F_COUNT]
+
+	cmp	%g1, 1
+	bl	7f				! return if count < 1
+	 cmp	%o3, 0				! lengths == NULL ?
+	bne	4f
+	 clr	%o4				! 'i' for STRIDE_LOOP
+
+1:	/* LENGTHS == NULL */
+	ld	[%o5 + 0x00], %f0		! ux = from[0]
+	ld	[%o5 + 0x04], %f1		! uy = from[1]
+	ld	[%o5 + 0x08], %f2		! uz = from[2]
+	add	%o5, %g2, %o5			! STRIDE_F(from, stride)
+	add	%o4, 1, %o4			! i++
+
+	/* tx (f3) = (ux * m0) + (uy * m1) + (uz * m2)
+	 * ty (f5) = (ux * m4) + (uy * m5) + (uz * m6)
+	 * tz (f7) = (ux * m8) + (uy * m9) + (uz * m10)
+	 */
+	fmuls	%f0, M0, %f3			! FGM	Group
+	fmuls	%f1, M1, %f4			! FGM	Group
+	fmuls	%f0, M4, %f5			! FGM	Group
+	fmuls	%f1, M5, %f6			! FGM	Group
+	fmuls	%f0, M8, %f7			! FGM	Group	f3 available
+	fmuls	%f1, M9, %f8			! FGM	Group	f4 available
+	fadds	%f3, %f4, %f3			! FGA
+	fmuls	%f2, M2, %f10			! FGM	Group	f5 available
+	fmuls	%f2, M6, %f0			! FGM	Group	f6 available
+	fadds	%f5, %f6, %f5			! FGA
+	fmuls	%f2, M10, %f4			! FGM	Group	f7 available
+	fadds	%f7, %f8, %f7			! FGA	Group	f8,f3 available
+	fadds	%f3, %f10, %f3			! FGA	Group	f10 available
+	fadds	%f5, %f0, %f5			! FGA	Group	stall f0,f5 available
+	fadds	%f7, %f4, %f7			! FGA	Group	stall f4,f7 available
+
+	/* f3=tx, f5=ty, f7=tz */
+
+	/* len (f6) = (tx * tx) + (ty * ty) + (tz * tz) */
+	fmuls	%f3, %f3, %f6			! FGM	Group	f3 available
+	fmuls	%f5, %f5, %f8			! FGM	Group	f5 available
+	fmuls	%f7, %f7, %f10			! FGM	Group	f7 available
+	fadds	%f6, %f8, %f6			! FGA	Group	2cyc stall f6,f8 available
+	fadds	%f6, %f10, %f6			! FGA	Group	4cyc stall f6,f10 available
+
+	/* scale (f6) = 1.0 / sqrt(len) */
+	fsqrts	%f6, %f6			! FDIV  20 cycles
+	fdivs	%f12, %f6, %f6			! FDIV	14 cycles
+
+	fmuls	%f3, %f6, %f3
+	st	%f3, [%g3 + 0x00]		! out[i][0] = tx * scale
+	fmuls	%f5, %f6, %f5
+	st	%f5, [%g3 + 0x04]		! out[i][1] = ty * scale
+	fmuls	%f7, %f6, %f7
+	st	%f7, [%g3 + 0x08]		! out[i][2] = tz * scale
+
+	cmp	%o4, %g1			! continue if (i < count)
+	bl	1b
+	 add	%g3, 0x10, %g3			! advance out vector pointer
+
+	ba	7f
+	 nop
+
+4:	/* LENGTHS != NULL */
+	/* Fold scale into the matrix once, ahead of the loop. */
+	fmuls	M0, %f15, M0
+	fmuls	M1, %f15, M1
+	fmuls	M2, %f15, M2
+	fmuls	M4, %f15, M4
+	fmuls	M5, %f15, M5
+	fmuls	M6, %f15, M6
+	fmuls	M8, %f15, M8
+	fmuls	M9, %f15, M9
+	fmuls	M10, %f15, M10
+
+5:
+	ld	[%o5 + 0x00], %f0		! ux = from[0]
+	ld	[%o5 + 0x04], %f1		! uy = from[1]
+	ld	[%o5 + 0x08], %f2		! uz = from[2]
+	add	%o5, %g2, %o5			! STRIDE_F(from, stride)
+	add	%o4, 1, %o4			! i++
+
+	/* tx (f3) = (ux * m0) + (uy * m1) + (uz * m2)
+	 * ty (f5) = (ux * m4) + (uy * m5) + (uz * m6)
+	 * tz (f7) = (ux * m8) + (uy * m9) + (uz * m10)
+	 */
+	fmuls	%f0, M0, %f3			! FGM	Group
+	fmuls	%f1, M1, %f4			! FGM	Group
+	fmuls	%f0, M4, %f5			! FGM	Group
+	fmuls	%f1, M5, %f6			! FGM	Group
+	fmuls	%f0, M8, %f7			! FGM	Group	f3 available
+	fmuls	%f1, M9, %f8			! FGM	Group	f4 available
+	fadds	%f3, %f4, %f3			! FGA
+	fmuls	%f2, M2, %f10			! FGM	Group	f5 available
+	fmuls	%f2, M6, %f0			! FGM	Group	f6 available
+	fadds	%f5, %f6, %f5			! FGA
+	fmuls	%f2, M10, %f4			! FGM	Group	f7 available
+	fadds	%f7, %f8, %f7			! FGA	Group	f8,f3 available
+	fadds	%f3, %f10, %f3			! FGA	Group	f10 available
+	ld	[%o3], %f13			! LSU
+	fadds	%f5, %f0, %f5			! FGA	Group	stall f0,f5 available
+	add	%o3, 4, %o3			! IEU0
+	fadds	%f7, %f4, %f7			! FGA	Group	stall f4,f7 available
+
+	/* f3=tx, f5=ty, f7=tz, f13=lengths[i] */
+
+	fmuls	%f3, %f13, %f3
+	st	%f3, [%g3 + 0x00]		! out[i][0] = tx * len
+	fmuls	%f5, %f13, %f5
+	st	%f5, [%g3 + 0x04]		! out[i][1] = ty * len
+	fmuls	%f7, %f13, %f7
+	st	%f7, [%g3 + 0x08]		! out[i][2] = tz * len
+
+	cmp	%o4, %g1			! continue if (i < count)
+	bl	5b
+	 add	%g3, 0x10, %g3			! advance out vector pointer
+
+7:	retl
+	 nop
+	
+	/*
+	 * _mesa_sparc_transform_normalize_normals_no_rot
+	 *
+	 * Same contract as _mesa_sparc_transform_normalize_normals, but only
+	 * elements 0, 5 and 10 of mat->inv are used:
+	 * tx = ux * m0, ty = uy * m5, tz = uz * m10.
+	 *
+	 * lengths == NULL: result is (tx,ty,tz) * 1.0 / sqrt(tx*tx+ty*ty+tz*tz).
+	 * lengths != NULL: m0/m5/m10 are multiplied by scale once, then each
+	 *                  result is multiplied by lengths[i].
+	 *
+	 * dest->count is set to in->count; when in->count < 1 nothing else
+	 * is written.
+	 */
+	.globl	_mesa_sparc_transform_normalize_normals_no_rot
+_mesa_sparc_transform_normalize_normals_no_rot:
+	/* o0=mat o1=scale o2=in o3=lengths o4=dest */
+
+	/* Bounce 1.0f and scale through the stack to get them into FP regs. */
+	sethi	%hi(ONE_DOT_ZERO), %g2
+	sub	%sp, 16, %sp
+	st	%g2, [%sp + STACK_VAR_OFF+0x0]
+	st	%o1, [%sp + STACK_VAR_OFF+0x4]
+	ld	[%sp + STACK_VAR_OFF+0x0], %f12	! f12 = 1.0f
+	ld	[%sp + STACK_VAR_OFF+0x4], %f15	! f15 = scale
+	add	%sp, 16, %sp
+
+	LDPTR	[%o0 + MAT_INV], %o0		! o0 = mat->inv
+	LDPTR	[%o2 + V4F_START], %o5		! o5 = 'from' in->start
+	ld	[%o2 + V4F_COUNT], %g1		! g1 = in->count
+	ld	[%o2 + V4F_STRIDE], %g2		! g2 = in->stride
+	LDPTR	[%o4 + V4F_START], %g3		! g3 = 'out' dest->start
+
+	LDMATRIX_0_5_10(%o0)
+
+	/* dest->count = in->count */
+	st	%g1, [%o4 + V4F_COUNT]
+
+	cmp	%g1, 1
+	bl	7f				! return if count < 1
+	 cmp	%o3, 0				! lengths == NULL ?
+	bne	4f
+	 clr	%o4				! 'i' for STRIDE_LOOP
+
+1:	/* LENGTHS == NULL */
+	ld	[%o5 + 0x00], %f0		! ux = from[0]
+	ld	[%o5 + 0x04], %f1		! uy = from[1]
+	ld	[%o5 + 0x08], %f2		! uz = from[2]
+	add	%o5, %g2, %o5			! STRIDE_F(from, stride)
+	add	%o4, 1, %o4			! i++
+
+	/* tx (f3) = (ux * m0)
+	 * ty (f5) = (uy * m5)
+	 * tz (f7) = (uz * m10)
+	 */
+	fmuls	%f0, M0, %f3			! FGM	Group
+	fmuls	%f1, M5, %f5			! FGM	Group
+	fmuls	%f2, M10, %f7			! FGM	Group
+
+	/* f3=tx, f5=ty, f7=tz */
+
+	/* len (f6) = (tx * tx) + (ty * ty) + (tz * tz) */
+	fmuls	%f3, %f3, %f6			! FGM	Group	stall, f3 available
+	fmuls	%f5, %f5, %f8			! FGM	Group	f5 available
+	fmuls	%f7, %f7, %f10			! FGM	Group	f7 available
+	fadds	%f6, %f8, %f6			! FGA	Group	2cyc stall f6,f8 available
+	fadds	%f6, %f10, %f6			! FGA	Group	4cyc stall f6,f10 available
+
+	/* scale (f6) = 1.0 / sqrt(len) */
+	fsqrts	%f6, %f6			! FDIV  20 cycles
+	fdivs	%f12, %f6, %f6			! FDIV	14 cycles
+
+	fmuls	%f3, %f6, %f3
+	st	%f3, [%g3 + 0x00]		! out[i][0] = tx * scale
+	fmuls	%f5, %f6, %f5
+	st	%f5, [%g3 + 0x04]		! out[i][1] = ty * scale
+	fmuls	%f7, %f6, %f7
+	st	%f7, [%g3 + 0x08]		! out[i][2] = tz * scale
+
+	cmp	%o4, %g1			! continue if (i < count)
+	bl	1b
+	 add	%g3, 0x10, %g3			! advance out vector pointer
+
+	ba	7f
+	 nop
+
+4:	/* LENGTHS != NULL */
+	/* Fold scale into the three matrix elements once, ahead of the loop. */
+	fmuls	M0, %f15, M0
+	fmuls	M5, %f15, M5
+	fmuls	M10, %f15, M10
+
+5:
+	ld	[%o5 + 0x00], %f0		! ux = from[0]
+	ld	[%o5 + 0x04], %f1		! uy = from[1]
+	ld	[%o5 + 0x08], %f2		! uz = from[2]
+	add	%o5, %g2, %o5			! STRIDE_F(from, stride)
+	add	%o4, 1, %o4			! i++
+
+	/* tx (f3) = (ux * m0)
+	 * ty (f5) = (uy * m5)
+	 * tz (f7) = (uz * m10)
+	 */
+	fmuls	%f0, M0, %f3			! FGM	Group
+	ld	[%o3], %f13			! LSU
+	fmuls	%f1, M5, %f5			! FGM	Group
+	add	%o3, 4, %o3			! IEU0
+	fmuls	%f2, M10, %f7			! FGM	Group
+
+	/* f3=tx, f5=ty, f7=tz, f13=lengths[i] */
+
+	fmuls	%f3, %f13, %f3
+	st	%f3, [%g3 + 0x00]		! out[i][0] = tx * len
+	fmuls	%f5, %f13, %f5
+	st	%f5, [%g3 + 0x04]		! out[i][1] = ty * len
+	fmuls	%f7, %f13, %f7
+	st	%f7, [%g3 + 0x08]		! out[i][2] = tz * len
+
+	cmp	%o4, %g1			! continue if (i < count)
+	bl	5b
+	 add	%g3, 0x10, %g3			! advance out vector pointer
+
+7:	retl
+	 nop
+
+	.globl	_mesa_sparc_transform_rescale_normals_no_rot
+_mesa_sparc_transform_rescale_normals_no_rot:
+	/* o0=mat o1=scale o2=in o3=lengths o4=dest
+	 *
+	 * Scale every input normal by the diagonal of mat->inv, with the
+	 * diagonal pre-multiplied by 'scale' once before the loop:
+	 *   out[i] = (ux * (m0 * scale), uy * (m5 * scale), uz * (m10 * scale))
+	 * 'lengths' (o3) is not referenced by this routine.
+	 *
+	 * The scale value arrives in integer register o1 and is bounced
+	 * through a temporary stack slot to reach FP register f15.
+	 * Only the first three floats of each 16-byte output element are
+	 * written.  dest->count is updated even when in->count < 1, in
+	 * which case no element is processed.
+	 */
+	sub	%sp, 16, %sp
+	st	%o1, [%sp + STACK_VAR_OFF+0x0]
+	ld	[%sp + STACK_VAR_OFF+0x0], %f15	! f15 = scale
+	add	%sp, 16, %sp
+
+	LDPTR	[%o0 + MAT_INV], %o0		! o0 = mat->inv
+	LDPTR	[%o2 + V4F_START], %o5		! o5 = 'from' in->start
+	ld	[%o2 + V4F_COUNT], %g1		! g1 = in->count
+	ld	[%o2 + V4F_STRIDE], %g2		! g2 = in->stride
+	LDPTR	[%o4 + V4F_START], %g3		! g3 = 'out' dest->start
+
+	LDMATRIX_0_5_10(%o0)
+
+	/* dest->count = in->count */
+	st	%g1, [%o4 + V4F_COUNT]
+
+	cmp	%g1, 1
+	bl	7f
+	 clr	%o4				! 'i' for STRIDE_LOOP
+
+	fmuls	M0, %f15, M0
+	fmuls	M5, %f15, M5
+	fmuls	M10, %f15, M10
+
+1:	ld	[%o5 + 0x00], %f0		! ux = from[0]
+	ld	[%o5 + 0x04], %f1		! uy = from[1]
+	ld	[%o5 + 0x08], %f2		! uz = from[2]
+	add	%o5, %g2, %o5			! STRIDE_F(from, stride)
+	add	%o4, 1, %o4			! i++
+
+	/* tx (f3) = (ux * m0)
+	 * ty (f5) = (uy * m5)
+	 * tz (f7) = (uz * m10)
+	 * (m0, m5 and m10 already include the scale factor here)
+	 */
+	fmuls	%f0, M0, %f3			! FGM	Group
+	st	%f3, [%g3 + 0x00]		! LSU
+	fmuls	%f1, M5, %f5			! FGM	Group
+	st	%f5, [%g3 + 0x04]		! LSU
+	fmuls	%f2, M10, %f7			! FGM	Group
+	st	%f7, [%g3 + 0x08]		! LSU
+
+	cmp	%o4, %g1			! continue if (i < count)
+	bl	1b
+	 add	%g3, 0x10, %g3			! advance out vector pointer
+
+7:	retl
+	 nop
+
+	.globl	_mesa_sparc_transform_rescale_normals
+_mesa_sparc_transform_rescale_normals:
+	/* o0=mat o1=scale o2=in o3=lengths o4=dest
+	 *
+	 * Transform every input normal by elements 0,1,2,4,5,6,8,9,10 of
+	 * mat->inv, each pre-multiplied by 'scale' once before the loop:
+	 *   out[i][0] = ux * m0 + uy * m1 + uz * m2
+	 *   out[i][1] = ux * m4 + uy * m5 + uz * m6
+	 *   out[i][2] = ux * m8 + uy * m9 + uz * m10
+	 * 'lengths' (o3) is not referenced by this routine.
+	 *
+	 * The scale value arrives in integer register o1 and is bounced
+	 * through a temporary stack slot to reach FP register f15.
+	 * Inside the loop f0 and f4 are recycled as temporaries once ux
+	 * and the first partial product are no longer needed.
+	 * dest->count is updated even when in->count < 1, in which case
+	 * no element is processed.
+	 */
+	sub	%sp, 16, %sp
+	st	%o1, [%sp + STACK_VAR_OFF+0x0]
+	ld	[%sp + STACK_VAR_OFF+0x0], %f15	! f15 = scale
+	add	%sp, 16, %sp
+
+	LDPTR	[%o0 + MAT_INV], %o0		! o0 = mat->inv
+	LDPTR	[%o2 + V4F_START], %o5		! o5 = 'from' in->start
+	ld	[%o2 + V4F_COUNT], %g1		! g1 = in->count
+	ld	[%o2 + V4F_STRIDE], %g2		! g2 = in->stride
+	LDPTR	[%o4 + V4F_START], %g3		! g3 = 'out' dest->start
+
+	LDMATRIX_0_1_2_4_5_6_8_9_10(%o0)
+
+	/* dest->count = in->count */
+	st	%g1, [%o4 + V4F_COUNT]
+
+	cmp	%g1, 1
+	bl	7f
+	 clr	%o4				! 'i' for STRIDE_LOOP
+
+	fmuls	M0, %f15, M0
+	fmuls	M1, %f15, M1
+	fmuls	M2, %f15, M2
+	fmuls	M4, %f15, M4
+	fmuls	M5, %f15, M5
+	fmuls	M6, %f15, M6
+	fmuls	M8, %f15, M8
+	fmuls	M9, %f15, M9
+	fmuls	M10, %f15, M10
+
+1:	ld	[%o5 + 0x00], %f0		! ux = from[0]
+	ld	[%o5 + 0x04], %f1		! uy = from[1]
+	ld	[%o5 + 0x08], %f2		! uz = from[2]
+	add	%o5, %g2, %o5			! STRIDE_F(from, stride)
+	add	%o4, 1, %o4			! i++
+
+	fmuls	%f0, M0, %f3			! FGM	Group
+	fmuls	%f1, M1, %f4			! FGM	Group
+	fmuls	%f0, M4, %f5			! FGM	Group
+	fmuls	%f1, M5, %f6			! FGM	Group
+	fmuls	%f0, M8, %f7			! FGM	Group	f3 available
+	fmuls	%f1, M9, %f8			! FGM	Group	f4 available
+	fadds	%f3, %f4, %f3			! FGA
+	fmuls	%f2, M2, %f10			! FGM	Group	f5 available
+	fmuls	%f2, M6, %f0			! FGM	Group	f6 available
+	fadds	%f5, %f6, %f5			! FGA
+	fmuls	%f2, M10, %f4			! FGM	Group	f7 available
+	fadds	%f7, %f8, %f7			! FGA	Group	f8,f3 available
+	fadds	%f3, %f10, %f3			! FGA	Group	f10 available
+	st	%f3, [%g3 + 0x00]		! LSU
+	fadds	%f5, %f0, %f5			! FGA	Group	stall f0,f5 available
+	st	%f5, [%g3 + 0x04]		! LSU
+	fadds	%f7, %f4, %f7			! FGA	Group	stall f4,f7 available
+	st	%f7, [%g3 + 0x08]		! LSU
+
+	cmp	%o4, %g1			! continue if (i < count)
+	bl	1b
+	 add	%g3, 0x10, %g3			! advance out vector pointer
+
+7:	retl
+	 nop
+
+	.globl	_mesa_sparc_transform_normals_no_rot
+_mesa_sparc_transform_normals_no_rot:
+	/* o0=mat o1=scale o2=in o3=lengths o4=dest
+	 *
+	 * Scale every input normal by the diagonal of mat->inv:
+	 *   out[i] = (ux * m0, uy * m5, uz * m10)
+	 * 'scale' (o1) and 'lengths' (o3) are not referenced by this
+	 * routine.  Only the first three floats of each 16-byte output
+	 * element are written.  dest->count is updated even when
+	 * in->count < 1, in which case no element is processed.
+	 */
+	LDPTR	[%o0 + MAT_INV], %o0		! o0 = mat->inv
+	LDPTR	[%o2 + V4F_START], %o5		! o5 = 'from' in->start
+	ld	[%o2 + V4F_COUNT], %g1		! g1 = in->count
+	ld	[%o2 + V4F_STRIDE], %g2		! g2 = in->stride
+	LDPTR	[%o4 + V4F_START], %g3		! g3 = 'out' dest->start
+
+	LDMATRIX_0_5_10(%o0)
+
+	/* dest->count = in->count */
+	st	%g1, [%o4 + V4F_COUNT]
+
+	cmp	%g1, 1
+	bl	7f
+	 clr	%o4				! 'i' for STRIDE_LOOP
+
+1:	ld	[%o5 + 0x00], %f0		! ux = from[0]
+	ld	[%o5 + 0x04], %f1		! uy = from[1]
+	ld	[%o5 + 0x08], %f2		! uz = from[2]
+	add	%o5, %g2, %o5			! STRIDE_F(from, stride)
+	add	%o4, 1, %o4			! i++
+
+	/* tx (f3) = (ux * m0)
+	 * ty (f5) = (uy * m5)
+	 * tz (f7) = (uz * m10)
+	 */
+	fmuls	%f0, M0, %f3			! FGM	Group
+	st	%f3, [%g3 + 0x00]		! LSU
+	fmuls	%f1, M5, %f5			! FGM	Group
+	st	%f5, [%g3 + 0x04]		! LSU
+	fmuls	%f2, M10, %f7			! FGM	Group
+	st	%f7, [%g3 + 0x08]		! LSU
+
+	cmp	%o4, %g1			! continue if (i < count)
+	bl	1b
+	 add	%g3, 0x10, %g3			! advance out vector pointer
+
+7:	retl
+	 nop
+
+	.globl	_mesa_sparc_transform_normals
+_mesa_sparc_transform_normals:
+	/* o0=mat o1=scale o2=in o3=lengths o4=dest
+	 *
+	 * Transform every input normal by elements 0,1,2,4,5,6,8,9,10 of
+	 * mat->inv:
+	 *   out[i][0] = ux * m0 + uy * m1 + uz * m2
+	 *   out[i][1] = ux * m4 + uy * m5 + uz * m6
+	 *   out[i][2] = ux * m8 + uy * m9 + uz * m10
+	 * 'scale' (o1) and 'lengths' (o3) are not referenced by this
+	 * routine.  Inside the loop f0 and f4 are recycled as temporaries
+	 * once ux and the first partial product are no longer needed.
+	 * dest->count is updated even when in->count < 1, in which case
+	 * no element is processed.
+	 */
+	LDPTR	[%o0 + MAT_INV], %o0		! o0 = mat->inv
+	LDPTR	[%o2 + V4F_START], %o5		! o5 = 'from' in->start
+	ld	[%o2 + V4F_COUNT], %g1		! g1 = in->count
+	ld	[%o2 + V4F_STRIDE], %g2		! g2 = in->stride
+	LDPTR	[%o4 + V4F_START], %g3		! g3 = 'out' dest->start
+
+	LDMATRIX_0_1_2_4_5_6_8_9_10(%o0)
+
+	/* dest->count = in->count */
+	st	%g1, [%o4 + V4F_COUNT]
+
+	cmp	%g1, 1
+	bl	7f
+	 clr	%o4				! 'i' for STRIDE_LOOP
+
+1:	ld	[%o5 + 0x00], %f0		! ux = from[0]
+	ld	[%o5 + 0x04], %f1		! uy = from[1]
+	ld	[%o5 + 0x08], %f2		! uz = from[2]
+	add	%o5, %g2, %o5			! STRIDE_F(from, stride)
+	add	%o4, 1, %o4			! i++
+
+	fmuls	%f0, M0, %f3			! FGM	Group
+	fmuls	%f1, M1, %f4			! FGM	Group
+	fmuls	%f0, M4, %f5			! FGM	Group
+	fmuls	%f1, M5, %f6			! FGM	Group
+	fmuls	%f0, M8, %f7			! FGM	Group	f3 available
+	fmuls	%f1, M9, %f8			! FGM	Group	f4 available
+	fadds	%f3, %f4, %f3			! FGA
+	fmuls	%f2, M2, %f10			! FGM	Group	f5 available
+	fmuls	%f2, M6, %f0			! FGM	Group	f6 available
+	fadds	%f5, %f6, %f5			! FGA
+	fmuls	%f2, M10, %f4			! FGM	Group	f7 available
+	fadds	%f7, %f8, %f7			! FGA	Group	f8,f3 available
+	fadds	%f3, %f10, %f3			! FGA	Group	f10 available
+	st	%f3, [%g3 + 0x00]		! LSU
+	fadds	%f5, %f0, %f5			! FGA	Group	stall f0,f5 available
+	st	%f5, [%g3 + 0x04]		! LSU
+	fadds	%f7, %f4, %f7			! FGA	Group	stall f4,f7 available
+	st	%f7, [%g3 + 0x08]		! LSU
+
+	cmp	%o4, %g1			! continue if (i < count)
+	bl	1b
+	 add	%g3, 0x10, %g3			! advance out vector pointer
+
+7:	retl
+	 nop
+
+	.globl	_mesa_sparc_normalize_normals
+_mesa_sparc_normalize_normals:
+	/* o0=mat o1=scale o2=in o3=lengths o4=dest
+	 *
+	 * Normalize every input normal; 'mat' (o0) and 'scale' (o1) are
+	 * not referenced by this routine.
+	 *   lengths == NULL: out[i] = in[i] * (1.0 / sqrt(x*x + y*y + z*z))
+	 *   lengths != NULL: out[i] = in[i] * lengths[i]
+	 *
+	 * The constant formed from %hi(ONE_DOT_ZERO) is bounced through a
+	 * temporary stack slot to reach FP register f12.
+	 * "cmp %o3, 0" sits in the delay slot of the count check, so it
+	 * executes on both paths and feeds the "bne 4f" that follows.
+	 * dest->count is updated even when in->count < 1, in which case
+	 * no element is processed.
+	 * NOTE(review): the lengths == NULL loop has no special case for
+	 * a zero-length input vector (fdivs by sqrt(0)) -- confirm that
+	 * callers never pass one or that the result is acceptable.
+	 */
+
+	sethi	%hi(ONE_DOT_ZERO), %g2
+	sub	%sp, 16, %sp
+	st	%g2, [%sp + STACK_VAR_OFF+0x0]
+	ld	[%sp + STACK_VAR_OFF+0x0], %f12	! f12 = 1.0f
+	add	%sp, 16, %sp
+
+	LDPTR	[%o2 + V4F_START], %o5		! o5 = 'from' in->start
+	ld	[%o2 + V4F_COUNT], %g1		! g1 = in->count
+	ld	[%o2 + V4F_STRIDE], %g2		! g2 = in->stride
+	LDPTR	[%o4 + V4F_START], %g3		! g3 = 'out' dest->start
+
+	/* dest->count = in->count */
+	st	%g1, [%o4 + V4F_COUNT]
+
+	cmp	%g1, 1
+	bl	7f
+	 cmp	%o3, 0
+	bne	4f
+	 clr	%o4				! 'i' for STRIDE_LOOP
+
+1:	/* LENGTHS == NULL */
+	ld	[%o5 + 0x00], %f3		! ux = from[0]
+	ld	[%o5 + 0x04], %f5		! uy = from[1]
+	ld	[%o5 + 0x08], %f7		! uz = from[2]
+	add	%o5, %g2, %o5			! STRIDE_F(from, stride)
+	add	%o4, 1, %o4			! i++
+
+	/* f3=tx, f5=ty, f7=tz */
+
+	/* len (f6) = (tx * tx) + (ty * ty) + (tz * tz) */
+	fmuls	%f3, %f3, %f6			! FGM	Group	f3 available
+	fmuls	%f5, %f5, %f8			! FGM	Group	f5 available
+	fmuls	%f7, %f7, %f10			! FGM	Group	f7 available
+	fadds	%f6, %f8, %f6			! FGA	Group	2cyc stall f6,f8 available
+	fadds	%f6, %f10, %f6			! FGA	Group	4cyc stall f6,f10 available
+
+	/* scale (f6) = 1.0 / sqrt(len) */
+	fsqrts	%f6, %f6			! FDIV  20 cycles
+	fdivs	%f12, %f6, %f6			! FDIV	14 cycles
+
+	fmuls	%f3, %f6, %f3
+	st	%f3, [%g3 + 0x00]		! out[i][0] = tx * scale
+	fmuls	%f5, %f6, %f5
+	st	%f5, [%g3 + 0x04]		! out[i][1] = ty * scale
+	fmuls	%f7, %f6, %f7
+	st	%f7, [%g3 + 0x08]		! out[i][2] = tz * scale
+
+	cmp	%o4, %g1			! continue if (i < count)
+	bl	1b
+	 add	%g3, 0x10, %g3			! advance out vector pointer
+
+	ba	7f
+	 nop
+
+4:	/* LENGTHS != NULL */
+
+5:
+	ld	[%o5 + 0x00], %f3		! ux = from[0]
+	ld	[%o5 + 0x04], %f5		! uy = from[1]
+	ld	[%o5 + 0x08], %f7		! uz = from[2]
+	add	%o5, %g2, %o5			! STRIDE_F(from, stride)
+	add	%o4, 1, %o4			! i++
+
+	ld	[%o3], %f13			! LSU
+	add	%o3, 4, %o3			! IEU0
+
+	/* f3=tx, f5=ty, f7=tz, f13=lengths[i] */
+
+	fmuls	%f3, %f13, %f3
+	st	%f3, [%g3 + 0x00]		! out[i][0] = tx * len
+	fmuls	%f5, %f13, %f5
+	st	%f5, [%g3 + 0x04]		! out[i][1] = ty * len
+	fmuls	%f7, %f13, %f7
+	st	%f7, [%g3 + 0x08]		! out[i][2] = tz * len
+
+	cmp	%o4, %g1			! continue if (i < count)
+	bl	5b
+	 add	%g3, 0x10, %g3			! advance out vector pointer
+
+7:	retl
+	 nop
+
+	.globl	_mesa_sparc_rescale_normals
+_mesa_sparc_rescale_normals:
+	/* o0=mat o1=scale o2=in o3=lengths o4=dest
+	 *
+	 * Multiply the first three floats of every input element by
+	 * 'scale':
+	 *   out[i] = (ux * scale, uy * scale, uz * scale)
+	 * 'mat' (o0) and 'lengths' (o3) are not referenced by this
+	 * routine.
+	 *
+	 * The scale value arrives in integer register o1 and is bounced
+	 * through a temporary stack slot to reach FP register f15.
+	 * dest->count is updated even when in->count < 1, in which case
+	 * no element is processed.
+	 *
+	 * No 1.0f constant is needed here, so %g2 is first written by the
+	 * stride load below.
+	 */
+
+	sub	%sp, 16, %sp
+	st	%o1, [%sp + STACK_VAR_OFF+0x0]
+	ld	[%sp + STACK_VAR_OFF+0x0], %f15	! f15 = scale
+	add	%sp, 16, %sp
+
+	LDPTR	[%o2 + V4F_START], %o5		! o5 = 'from' in->start
+	ld	[%o2 + V4F_COUNT], %g1		! g1 = in->count
+	ld	[%o2 + V4F_STRIDE], %g2		! g2 = in->stride
+	LDPTR	[%o4 + V4F_START], %g3		! g3 = 'out' dest->start
+
+	/* dest->count = in->count */
+	st	%g1, [%o4 + V4F_COUNT]
+
+	cmp	%g1, 1
+	bl	7f
+	 clr	%o4				! 'i' for STRIDE_LOOP
+
+1:
+	ld	[%o5 + 0x00], %f3		! ux = from[0]
+	ld	[%o5 + 0x04], %f5		! uy = from[1]
+	ld	[%o5 + 0x08], %f7		! uz = from[2]
+	add	%o5, %g2, %o5			! STRIDE_F(from, stride)
+	add	%o4, 1, %o4			! i++
+
+	/* f3=tx, f5=ty, f7=tz */
+
+	fmuls	%f3, %f15, %f3
+	st	%f3, [%g3 + 0x00]		! out[i][0] = tx * scale
+	fmuls	%f5, %f15, %f5
+	st	%f5, [%g3 + 0x04]		! out[i][1] = ty * scale
+	fmuls	%f7, %f15, %f7
+	st	%f7, [%g3 + 0x08]		! out[i][2] = tz * scale
+
+	cmp	%o4, %g1			! continue if (i < count)
+	bl	1b
+	 add	%g3, 0x10, %g3			! advance out vector pointer
+
+7:	retl
+	 nop
--- a/src/arch/sparc/sparc.c
+++ b/src/arch/sparc/sparc.c
@ -0,0 +1,142 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  6.3
+ * 
+ * Copyright (C) 1999-2003  Brian Paul   All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Sparc assembly code by David S. Miller
+ */
+
+
+#include "sparc.h"
+
+#ifdef USE_SPARC_ASM
+
+#include "main/context.h"
+#include "math/m_xform.h"
+#include "tnl/t_context.h"
+
+#ifdef DEBUG
+#include "math/m_debug.h"
+#endif
+/*
+ * Parameter list shared by every point-transform routine declared below:
+ * destination vector, 16-float matrix, source vector.
+ */
+#define XFORM_ARGS 	GLvector4f *to_vec, 		\
+			const GLfloat m[16], 		\
+			const GLvector4f *from_vec
+/*
+ * Declare the seven external transform routines for one vector size 'sz'
+ * (general, identity, 3d_no_rot, perspective, 2d, 2d_no_rot, 3d).  Each is
+ * named _mesa_<pfx>_transform_points<sz>_<kind> and takes XFORM_ARGS.
+ */
+#define DECLARE_XFORM_GROUP(pfx, sz)					   \
+ extern void _mesa_##pfx##_transform_points##sz##_general(XFORM_ARGS);     \
+ extern void _mesa_##pfx##_transform_points##sz##_identity(XFORM_ARGS);    \
+ extern void _mesa_##pfx##_transform_points##sz##_3d_no_rot(XFORM_ARGS);   \
+ extern void _mesa_##pfx##_transform_points##sz##_perspective(XFORM_ARGS); \
+ extern void _mesa_##pfx##_transform_points##sz##_2d(XFORM_ARGS);          \
+ extern void _mesa_##pfx##_transform_points##sz##_2d_no_rot(XFORM_ARGS);   \
+ extern void _mesa_##pfx##_transform_points##sz##_3d(XFORM_ARGS);
+/*
+ * Store the seven routines for vector size 'sz' into _mesa_transform_tab[sz],
+ * one per MATRIX_* type.  The expansion is a run of complete statements, each
+ * already ending in ';', so the macro is invoked without a trailing semicolon
+ * and must not be used as the unbraced body of an if/else/loop.
+ */
+#define ASSIGN_XFORM_GROUP(pfx, sz)					\
+   _mesa_transform_tab[sz][MATRIX_GENERAL] =				\
+      _mesa_##pfx##_transform_points##sz##_general;			\
+   _mesa_transform_tab[sz][MATRIX_IDENTITY] =				\
+      _mesa_##pfx##_transform_points##sz##_identity;			\
+   _mesa_transform_tab[sz][MATRIX_3D_NO_ROT] =				\
+      _mesa_##pfx##_transform_points##sz##_3d_no_rot;			\
+   _mesa_transform_tab[sz][MATRIX_PERSPECTIVE] =			\
+      _mesa_##pfx##_transform_points##sz##_perspective;			\
+   _mesa_transform_tab[sz][MATRIX_2D] =					\
+      _mesa_##pfx##_transform_points##sz##_2d;				\
+   _mesa_transform_tab[sz][MATRIX_2D_NO_ROT] =				\
+      _mesa_##pfx##_transform_points##sz##_2d_no_rot;			\
+   _mesa_transform_tab[sz][MATRIX_3D] =					\
+      _mesa_##pfx##_transform_points##sz##_3d;
+
+/* Prototypes of the SPARC point-transform routines for vector sizes 1 to 4 (implemented outside this file). */
+DECLARE_XFORM_GROUP(sparc, 1)
+DECLARE_XFORM_GROUP(sparc, 2)
+DECLARE_XFORM_GROUP(sparc, 3)
+DECLARE_XFORM_GROUP(sparc, 4)
+/*
+ * Prototypes of the two SPARC clip-test routines.  Both take the same six
+ * arguments and return a GLvector4f pointer; the "_np" variant is installed
+ * in _mesa_clip_np_tab below, the other one in _mesa_clip_tab.
+ */
+extern GLvector4f  *_mesa_sparc_cliptest_points4(GLvector4f *clip_vec,
+						 GLvector4f *proj_vec,
+						 GLubyte clipMask[],
+						 GLubyte *orMask,
+						 GLubyte *andMask,
+						 GLboolean viewport_z_clip);
+
+extern GLvector4f  *_mesa_sparc_cliptest_points4_np(GLvector4f *clip_vec,
+						    GLvector4f *proj_vec,
+						    GLubyte clipMask[],
+						    GLubyte *orMask,
+						    GLubyte *andMask,
+						    GLboolean viewport_z_clip);
+/* Parameter list shared by every normal-transform routine declared below. */
+#define NORM_ARGS	const GLmatrix *mat,				\
+			GLfloat scale,					\
+			const GLvector4f *in,				\
+			const GLfloat *lengths,				\
+			GLvector4f *dest
+/* Prototypes of the SPARC normal routines; each is stored in one _mesa_normal_tab slot below. */
+extern void _mesa_sparc_transform_normalize_normals(NORM_ARGS);
+extern void _mesa_sparc_transform_normalize_normals_no_rot(NORM_ARGS);
+extern void _mesa_sparc_transform_rescale_normals_no_rot(NORM_ARGS);
+extern void _mesa_sparc_transform_rescale_normals(NORM_ARGS);
+extern void _mesa_sparc_transform_normals_no_rot(NORM_ARGS);
+extern void _mesa_sparc_transform_normals(NORM_ARGS);
+extern void _mesa_sparc_normalize_normals(NORM_ARGS);
+extern void _mesa_sparc_rescale_normals(NORM_ARGS);
+
+
+/*
+ * Install the SPARC assembly routines into Mesa's function tables:
+ *  - _mesa_transform_tab[1..4], every MATRIX_* slot (via ASSIGN_XFORM_GROUP);
+ *  - slot 4 of _mesa_clip_tab and of _mesa_clip_np_tab;
+ *  - eight _mesa_normal_tab slots, indexed by NORM_* flag combinations.
+ * When DEBUG_MATH is defined, the _math_test_all_*() checks are run
+ * afterwards with the label "sparc".
+ */
+void _mesa_init_all_sparc_transform_asm(void)
+{
+   ASSIGN_XFORM_GROUP(sparc, 1)
+   ASSIGN_XFORM_GROUP(sparc, 2)
+   ASSIGN_XFORM_GROUP(sparc, 3)
+   ASSIGN_XFORM_GROUP(sparc, 4)
+
+   _mesa_clip_tab[4] = _mesa_sparc_cliptest_points4;
+   _mesa_clip_np_tab[4] = _mesa_sparc_cliptest_points4_np;
+
+   _mesa_normal_tab[NORM_TRANSFORM | NORM_NORMALIZE] =
+	   _mesa_sparc_transform_normalize_normals;
+   _mesa_normal_tab[NORM_TRANSFORM_NO_ROT | NORM_NORMALIZE] =
+	   _mesa_sparc_transform_normalize_normals_no_rot;
+   _mesa_normal_tab[NORM_TRANSFORM_NO_ROT | NORM_RESCALE] =
+	   _mesa_sparc_transform_rescale_normals_no_rot;
+   _mesa_normal_tab[NORM_TRANSFORM | NORM_RESCALE] =
+	   _mesa_sparc_transform_rescale_normals;
+   _mesa_normal_tab[NORM_TRANSFORM_NO_ROT] =
+	   _mesa_sparc_transform_normals_no_rot;
+   _mesa_normal_tab[NORM_TRANSFORM] =
+	   _mesa_sparc_transform_normals;
+   _mesa_normal_tab[NORM_NORMALIZE] =
+	   _mesa_sparc_normalize_normals;
+   _mesa_normal_tab[NORM_RESCALE] =
+	   _mesa_sparc_rescale_normals;
+   /* NOTE(review): math/m_debug.h is included above only under DEBUG, while
+    * the calls below are guarded by DEBUG_MATH -- confirm the two agree. */
+#ifdef DEBUG_MATH
+   _math_test_all_transform_functions("sparc");
+   _math_test_all_cliptest_functions("sparc");
+   _math_test_all_normal_transform_functions("sparc");
+#endif
+}
+
+#endif /* USE_SPARC_ASM */
--- a/src/arch/sparc/sparc.h
+++ b/src/arch/sparc/sparc.h
@ -0,0 +1,36 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version:  3.1
+ * 
+ * Copyright (C) 1999  Brian Paul   All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Sparc assembly code by David S. Miller
+ */
+
+
+#ifndef SPARC_H
+#define SPARC_H
+/* Installs the SPARC assembly transform, clip-test and normal routines into Mesa's function tables. */
+extern void _mesa_init_all_sparc_transform_asm(void);
+
+#endif /* !(SPARC_H) */
--- a/src/arch/sparc/sparc_clip.S
+++ b/src/arch/sparc/sparc_clip.S
@ -0,0 +1,233 @@
+/*
+ * Clip testing in SPARC assembly
+ */
+
+/*
+ * Byte offsets of the GLvector4f fields used by the routines below, plus the
+ * load instruction for pointer-sized fields.  Pointer fields take 8 bytes
+ * under __arch64__ and 4 bytes otherwise, hence the two sets of values.
+ * The test uses #ifdef, like sparc_matrix.h, so an undefined __arch64__ is
+ * never evaluated as an expression.
+ * NOTE(review): the values duplicate those in sparc_matrix.h and presumably
+ * mirror the C layout of GLvector4f -- keep them in sync.
+ */
+#ifdef __arch64__
+#define LDPTR		ldx
+#define V4F_DATA	0x00
+#define V4F_START	0x08
+#define V4F_COUNT	0x10
+#define V4F_STRIDE	0x14
+#define V4F_SIZE	0x18
+#define V4F_FLAGS	0x1c
+#else
+#define LDPTR		ld
+#define V4F_DATA	0x00
+#define V4F_START	0x04
+#define V4F_COUNT	0x08
+#define V4F_STRIDE	0x0c
+#define V4F_SIZE	0x10
+#define V4F_FLAGS	0x14
+#endif
+
+/* Bit masks with the low 1, 2, 3 or 4 bits set; VEC_SIZE_4 is OR'ed into V4F_FLAGS below. */
+#define VEC_SIZE_1   	1
+#define VEC_SIZE_2   	3
+#define VEC_SIZE_3   	7
+#define VEC_SIZE_4   	15
+
+        .register %g2, #scratch
+        .register %g3, #scratch
+
+	.text
+	.align		64
+
+one_dot_zero:
+	.word		0x3f800000	/* 1.0f */
+
+	/* This trick is shamelessly stolen from the x86
+	 * Mesa asm.  Very clever, and we can do it too
+	 * since we have the necessary add with carry
+	 * instructions on Sparc.
+	 *
+	 * clip_table holds 128 clip-mask bytes.  The clip-test
+	 * loops build a 7-bit index into it with addcc/subcc/addx,
+	 * shifting in one carry bit per step.  From most to least
+	 * significant bit the index is:
+	 *   sign(w), sign(z), |w| < |z|, sign(y), |w| < |y|,
+	 *   sign(x), |w| < |x|
+	 * where the magnitude tests compare the raw 32-bit patterns
+	 * of the floats after the sign bit has been shifted out.
+	 *
+	 * The table must stay directly behind one_dot_zero: the
+	 * routines form its address as one_dot_zero + 4.
+	 */
+clip_table:
+	.byte	 0,  1,  0,  2,  4,  5,  4,  6
+	.byte	 0,  1,  0,  2,  8,  9,  8, 10
+	.byte	32, 33, 32, 34, 36, 37, 36, 38
+	.byte	32, 33, 32, 34, 40, 41, 40, 42
+	.byte	 0,  1,  0,  2,  4,  5,  4,  6
+	.byte	 0,  1,  0,  2,  8,  9,  8, 10
+	.byte	16, 17, 16, 18, 20, 21, 20, 22
+	.byte	16, 17, 16, 18, 24, 25, 24, 26
+	.byte	63, 61, 63, 62, 55, 53, 55, 54
+	.byte	63, 61, 63, 62, 59, 57, 59, 58
+	.byte	47, 45, 47, 46, 39, 37, 39, 38
+	.byte	47, 45, 47, 46, 43, 41, 43, 42
+	.byte	63, 61, 63, 62, 55, 53, 55, 54
+	.byte	63, 61, 63, 62, 59, 57, 59, 58
+	.byte	31, 29, 31, 30, 23, 21, 23, 22
+	.byte	31, 29, 31, 30, 27, 25, 27, 26
+
+/* Arguments of both clip-test routines below:
+   GLvector4f *clip_vec, GLvector4f *proj_vec,
+   GLubyte clipMask[], GLubyte *orMask, GLubyte *andMask,
+   GLboolean viewport_z_enable
+
+   __pc_tramp is an empty leaf routine.  The clip-test routines call it
+   only so that "call" leaves its own address in %o7; the sub in the
+   delay slot of that call turns %o7 into the address of one_dot_zero,
+   giving a PC-relative way to reach the constant and clip_table. */
+
+	.align		64
+__pc_tramp:
+	retl
+	 nop
+	/* _mesa_sparc_cliptest_points4
+	 *
+	 * For every 4-float vertex of clip_vec, look up its clip mask in
+	 * clip_table and store it in clipMask[i], then write proj_vec[i]:
+	 *   mask == 0: (x / w, y / w, z / w, 1 / w)
+	 *   mask != 0: (0, 0, 0, 1.0); the mask is also folded into the
+	 *              running OR and AND masks and 'c' is incremented.
+	 * The running masks start from the bytes found in *orMask and
+	 * *andMask.  On exit *orMask receives the OR mask and *andMask
+	 * receives the AND mask, or 0 when c < count.  proj_vec gets
+	 * VEC_SIZE_4 OR'ed into its flags, size = 3 and count =
+	 * clip_vec->count.  Returns proj_vec (%i1).
+	 * The sixth argument is not read: %i5 is overwritten with
+	 * proj_vec->start before any use.
+	 *
+	 * NOTE(review): the loop is bottom-tested with "bne", so a count
+	 * of 0 is not handled -- confirm callers never pass an empty vector.
+	 * NOTE(review): size is set to 3 although VEC_SIZE_4 is OR'ed into
+	 * the flags -- confirm this is intended.
+	 */
+	.globl		_mesa_sparc_cliptest_points4
+_mesa_sparc_cliptest_points4:
+	save		%sp, -64, %sp
+	call		__pc_tramp
+	 sub		%o7, (. - one_dot_zero - 4), %g1
+	ld		[%g1 + 0x0], %f4
+	add		%g1, 0x4, %g1
+
+	ld		[%i0 + V4F_STRIDE], %l1
+	ld		[%i0 + V4F_COUNT], %l3
+	LDPTR		[%i0 + V4F_START], %i0
+	LDPTR		[%i1 + V4F_START], %i5
+	ldub		[%i3], %g2
+	ldub		[%i4], %g3
+	sll		%g3, 8, %g3
+	or		%g2, %g3, %g2
+
+	ld		[%i1 + V4F_FLAGS], %g3
+	or		%g3, VEC_SIZE_4, %g3
+	st		%g3, [%i1 + V4F_FLAGS]
+	mov		3, %g3
+	st		%g3, [%i1 + V4F_SIZE]
+	st		%l3, [%i1 + V4F_COUNT]
+	clr		%l2
+	clr		%l0
+
+	/* l0:	i
+	 * l3:	count
+	 * l1:	stride
+	 * l2:	c (number of vertices whose mask was non-zero)
+	 * g2:	(tmpAndMask << 8) | tmpOrMask
+	 * g1:	clip_table
+	 * i0:	from[stride][i]
+	 * i2:	clipMask
+	 * i5:	vProj[4][i]
+	 * f4:	1.0f, loaded from one_dot_zero
+	 * f8:	1.0f / w of the current vertex
+	 * g3:	7-bit clip_table index, then the mask byte read from it
+	 * g5:	integer copy of w, doubled so its sign bit lands in carry
+	 * g4:	same for z, y and x in turn
+	 */
+
+1:	ld		[%i0 + 0x0c], %f3	! LSU	Group
+	ld		[%i0 + 0x0c], %g5	! LSU	Group
+	ld		[%i0 + 0x08], %g4	! LSU	Group
+	fdivs		%f4, %f3, %f8		! FGM
+	addcc		%g5, %g5, %g5		! IEU1	Group
+	addx		%g0, 0x0, %g3		! IEU1	Group
+	addcc		%g4, %g4, %g4		! IEU1	Group
+	addx		%g3, %g3, %g3		! IEU1	Group
+	subcc		%g5, %g4, %g0		! IEU1	Group
+	ld		[%i0 + 0x04], %g4	! LSU	Group
+	addx		%g3, %g3, %g3		! IEU1	Group
+	addcc		%g4, %g4, %g4		! IEU1	Group
+	addx		%g3, %g3, %g3		! IEU1	Group
+	subcc		%g5, %g4, %g0		! IEU1	Group
+	ld		[%i0 + 0x00], %g4	! LSU	Group
+	addx		%g3, %g3, %g3		! IEU1	Group
+	addcc		%g4, %g4, %g4		! IEU1	Group
+	addx		%g3, %g3, %g3		! IEU1	Group
+	subcc		%g5, %g4, %g0		! IEU1	Group
+	addx		%g3, %g3, %g3		! IEU1	Group
+	ldub		[%g1 + %g3], %g3	! LSU	Group
+	cmp		%g3, 0			! IEU1	Group, stall
+	be		2f			! CTI
+	 stb		%g3, [%i2]		! LSU
+	sll		%g3, 8, %g4		! IEU1	Group
+	add		%l2, 1, %l2		! IEU0
+	st		%g0, [%i5 + 0x00]	! LSU
+	or		%g4, 0xff, %g4		! IEU0	Group
+	or		%g2, %g3, %g2		! IEU1
+	st		%g0, [%i5 + 0x04]	! LSU
+	and		%g2, %g4, %g2		! IEU0	Group
+	st		%g0, [%i5 + 0x08]	! LSU
+	b		3f			! CTI
+	 st		%f4, [%i5 + 0x0c]	! LSU	Group
+2:	ld		[%i0 + 0x00], %f0	! LSU	Group
+	ld		[%i0 + 0x04], %f1	! LSU	Group
+	ld		[%i0 + 0x08], %f2	! LSU	Group
+	fmuls		%f0, %f8, %f0		! FGM
+	st		%f0, [%i5 + 0x00]	! LSU	Group
+	fmuls		%f1, %f8, %f1		! FGM
+	st		%f1, [%i5 + 0x04]	! LSU	Group
+	fmuls		%f2, %f8, %f2		! FGM
+	st		%f2, [%i5 + 0x08]	! LSU	Group
+	st		%f8, [%i5 + 0x0c]	! LSU	Group
+3:	add		%i5, 0x10, %i5		! IEU1
+	add		%l0, 1, %l0		! IEU0	Group
+	add		%i2, 1, %i2		! IEU0	Group
+	cmp		%l0, %l3		! IEU1	Group
+	bne		1b			! CTI
+	 add		%i0, %l1, %i0		! IEU0	Group
+	stb		%g2, [%i3]		! LSU
+	srl		%g2, 8, %g3		! IEU0	Group
+	cmp		%l2, %l3		! IEU1	Group
+	bl,a		1f			! CTI
+	 clr		%g3			! IEU0
+1:	stb		%g3, [%i4]		! LSU	Group
+	ret					! CTI	Group
+	 restore	%i1, 0x0, %o0
+	/* _mesa_sparc_cliptest_points4_np
+	 *
+	 * Same clip-mask computation as _mesa_sparc_cliptest_points4, but
+	 * without the projection: clipMask[i] is written for every vertex
+	 * of clip_vec and non-zero masks are folded into the running OR
+	 * and AND masks, while nothing is stored through proj_vec.
+	 * The running masks start from the bytes found in *orMask and
+	 * *andMask.  On exit *orMask receives the OR mask and *andMask
+	 * receives the AND mask, or 0 when c < count.
+	 *
+	 * NOTE(review): the value returned is %i1, i.e. the proj_vec
+	 * argument, even though proj_vec is never touched here -- confirm
+	 * this matches what callers expect.
+	 * NOTE(review): the loop is bottom-tested with "bne", so a count
+	 * of 0 is not handled -- confirm callers never pass an empty vector.
+	 */
+	.globl		_mesa_sparc_cliptest_points4_np
+_mesa_sparc_cliptest_points4_np:
+	save		%sp, -64, %sp
+
+	call		__pc_tramp
+	 sub		%o7, (. - one_dot_zero - 4), %g1
+	add		%g1, 0x4, %g1
+
+	ld		[%i0 + V4F_STRIDE], %l1
+	ld		[%i0 + V4F_COUNT], %l3
+	LDPTR		[%i0 + V4F_START], %i0
+	ldub		[%i3], %g2
+	ldub		[%i4], %g3
+	sll		%g3, 8, %g3
+	or		%g2, %g3, %g2
+
+	clr		%l2
+	clr		%l0
+
+	/* l0:	i
+	 * l3:	count
+	 * l1:	stride
+	 * l2:	c (number of vertices whose mask was non-zero)
+	 * g2:	(tmpAndMask << 8) | tmpOrMask
+	 * g1:	clip_table
+	 * i0:	from[stride][i]
+	 * i2:	clipMask
+	 * g3:	7-bit clip_table index, then the mask byte read from it
+	 * g5:	integer copy of w, doubled so its sign bit lands in carry
+	 * g4:	same for z, y and x in turn
+	 */
+
+1:	ld		[%i0 + 0x0c], %g5	! LSU	Group
+	ld		[%i0 + 0x08], %g4	! LSU	Group
+	addcc		%g5, %g5, %g5		! IEU1	Group
+	addx		%g0, 0x0, %g3		! IEU1	Group
+	addcc		%g4, %g4, %g4		! IEU1	Group
+	addx		%g3, %g3, %g3		! IEU1	Group
+	subcc		%g5, %g4, %g0		! IEU1	Group
+	ld		[%i0 + 0x04], %g4	! LSU	Group
+	addx		%g3, %g3, %g3		! IEU1	Group
+	addcc		%g4, %g4, %g4		! IEU1	Group
+	addx		%g3, %g3, %g3		! IEU1	Group
+	subcc		%g5, %g4, %g0		! IEU1	Group
+	ld		[%i0 + 0x00], %g4	! LSU	Group
+	addx		%g3, %g3, %g3		! IEU1	Group
+	addcc		%g4, %g4, %g4		! IEU1	Group
+	addx		%g3, %g3, %g3		! IEU1	Group
+	subcc		%g5, %g4, %g0		! IEU1	Group
+	addx		%g3, %g3, %g3		! IEU1	Group
+	ldub		[%g1 + %g3], %g3	! LSU	Group
+	cmp		%g3, 0			! IEU1	Group, stall
+	be		2f			! CTI
+	 stb		%g3, [%i2]		! LSU
+	sll		%g3, 8, %g4		! IEU1	Group
+	add		%l2, 1, %l2		! IEU0
+	or		%g4, 0xff, %g4		! IEU0	Group
+	or		%g2, %g3, %g2		! IEU1
+	and		%g2, %g4, %g2		! IEU0	Group
+2:	add		%l0, 1, %l0		! IEU0	Group
+	add		%i2, 1, %i2		! IEU0	Group
+	cmp		%l0, %l3		! IEU1	Group
+	bne		1b			! CTI
+	 add		%i0, %l1, %i0		! IEU0	Group
+	stb		%g2, [%i3]		! LSU
+	srl		%g2, 8, %g3		! IEU0	Group
+	cmp		%l2, %l3		! IEU1	Group
+	bl,a		1f			! CTI
+	 clr		%g3			! IEU0
+1:	stb		%g3, [%i4]		! LSU	Group
+	ret					! CTI	Group
+	 restore	%i1, 0x0, %o0
--- a/src/arch/sparc/sparc_matrix.h
+++ b/src/arch/sparc/sparc_matrix.h
@ -0,0 +1,170 @@
+/*
+ * SPARC assembly matrix code.
+ */
+
+#ifndef _SPARC_MATRIX_H
+#define _SPARC_MATRIX_H
+/*
+ * Byte offsets used by the assembly to reach matrix (MAT_*) and vector
+ * (V4F_*) fields, plus LDPTR, the load instruction for pointer-sized fields.
+ * Pointer fields take 8 bytes under __arch64__ and 4 bytes otherwise.
+ * NOTE(review): the values presumably mirror the C layout of GLmatrix and
+ * GLvector4f -- verify against the struct definitions when either changes.
+ */
+#ifdef __arch64__
+#define LDPTR		ldx
+#define MAT_M		0x00
+#define MAT_INV		0x08
+#define V4F_DATA	0x00
+#define V4F_START	0x08
+#define V4F_COUNT	0x10
+#define V4F_STRIDE	0x14
+#define V4F_SIZE	0x18
+#define V4F_FLAGS	0x1c
+#else
+#define LDPTR		ld
+#define MAT_M		0x00
+#define MAT_INV		0x04
+#define V4F_DATA	0x00
+#define V4F_START	0x04
+#define V4F_COUNT	0x08
+#define V4F_STRIDE	0x0c
+#define V4F_SIZE	0x10
+#define V4F_FLAGS	0x14
+#endif
+/* Bit masks with the low 1, 2, 3 or 4 bits set (NOTE(review): presumably one bit per valid vector component -- confirm). */
+#define VEC_SIZE_1   	1
+#define VEC_SIZE_2   	3
+#define VEC_SIZE_3   	7
+#define VEC_SIZE_4   	15
+/* Matrix element n is kept in single-precision FP register %f(16 + n): M0..M15 map to %f16..%f31. */
+#define M0		%f16
+#define M1		%f17
+#define M2		%f18
+#define M3		%f19
+#define M4		%f20
+#define M5		%f21
+#define M6		%f22
+#define M7		%f23
+#define M8		%f24
+#define M9		%f25
+#define M10		%f26
+#define M11		%f27
+#define M12		%f28
+#define M13		%f29
+#define M14		%f30
+#define M15		%f31
+
+#define LDMATRIX_0_1_2_3_12_13_14_15(BASE)	\
+	ldd	[BASE + ( 0 * 0x4)], M0;	\
+	ldd	[BASE + ( 2 * 0x4)], M2;	\
+	ldd	[BASE + (12 * 0x4)], M12;	\
+	ldd	[BASE + (14 * 0x4)], M14
+
+#define LDMATRIX_0_1_12_13(BASE)		\
+	ldd	[BASE + ( 0 * 0x4)], M0;	\
+	ldd	[BASE + (12 * 0x4)], M12
+
+#define LDMATRIX_0_12_13(BASE)			\
+	ld	[BASE + ( 0 * 0x4)], M0;	\
+	ldd	[BASE + (12 * 0x4)], M12
+
+#define LDMATRIX_0_1_2_12_13_14(BASE)		\
+	ldd	[BASE + ( 0 * 0x4)], M0;	\
+	ld	[BASE + ( 2 * 0x4)], M2;	\
+	ldd	[BASE + (12 * 0x4)], M12;	\
+	ld	[BASE + (14 * 0x4)], M14
+
+#define LDMATRIX_0_12_13_14(BASE)		\
+	ld	[BASE + ( 0 * 0x4)], M0;	\
+	ldd	[BASE + (12 * 0x4)], M12;	\
+	ld	[BASE + (14 * 0x4)], M14
+
+#define LDMATRIX_0_14(BASE)			\
+	ld	[BASE + ( 0 * 0x4)], M0;	\
+	ld	[BASE + (14 * 0x4)], M14
+
+#define LDMATRIX_0_1_2_3_4_5_6_7_12_13_14_15(BASE) \
+	ldd	[BASE + ( 0 * 0x4)], M0;	\
+	ldd	[BASE + ( 2 * 0x4)], M2;	\
+	ldd	[BASE + ( 4 * 0x4)], M4;	\
+	ldd	[BASE + ( 6 * 0x4)], M6;	\
+	ldd	[BASE + (12 * 0x4)], M12;	\
+	ldd	[BASE + (14 * 0x4)], M14
+
+#define LDMATRIX_0_5_12_13(BASE) 		\
+	ld	[BASE + ( 0 * 0x4)], M0;	\
+	ld	[BASE + ( 5 * 0x4)], M5;	\
+	ldd	[BASE + (12 * 0x4)], M12
+
+#define LDMATRIX_0_1_2_3_4_5_6_12_13_14(BASE)	\
+	ldd	[BASE + ( 0 * 0x4)], M0;	\
+	ldd	[BASE + ( 2 * 0x4)], M2;	\
+	ldd	[BASE + ( 4 * 0x4)], M4;	\
+	ld	[BASE + ( 6 * 0x4)], M6;	\
+	ldd	[BASE + (12 * 0x4)], M12;	\
+	ld	[BASE + (14 * 0x4)], M14
+
+#define LDMATRIX_0_5_12_13_14(BASE)		\
+	ld	[BASE + ( 0 * 0x4)], M0;	\
+	ld	[BASE + ( 5 * 0x4)], M5;	\
+	ldd	[BASE + (12 * 0x4)], M12;	\
+	ld	[BASE + (14 * 0x4)], M14
+
+#define LDMATRIX_0_5_14(BASE)			\
+	ld	[BASE + ( 0 * 0x4)], M0;	\
+	ld	[BASE + ( 5 * 0x4)], M5;	\
+	ld	[BASE + (14 * 0x4)], M14
+
+#define LDMATRIX_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15(BASE) \
+	ldd	[BASE + ( 0 * 0x4)], M0;	\
+	ldd	[BASE + ( 2 * 0x4)], M2;	\
+	ldd	[BASE + ( 4 * 0x4)], M4;	\
+	ldd	[BASE + ( 6 * 0x4)], M6;	\
+	ldd	[BASE + ( 8 * 0x4)], M8;	\
+	ldd	[BASE + (10 * 0x4)], M10;	\
+	ldd	[BASE + (12 * 0x4)], M12;	\
+	ldd	[BASE + (14 * 0x4)], M14
+
+#define LDMATRIX_0_1_4_5_12_13(BASE) 		\
+	ldd	[BASE + ( 0 * 0x4)], M0;	\
+	ldd	[BASE + ( 4 * 0x4)], M4;	\
+	ldd	[BASE + (12 * 0x4)], M12
+
+#define LDMATRIX_0_5_12_13(BASE) 		\
+	ld	[BASE + ( 0 * 0x4)], M0;	\
+	ld	[BASE + ( 5 * 0x4)], M5;	\
+	ldd	[BASE + (12 * 0x4)], M12
+
+#define LDMATRIX_0_1_2_4_5_6_8_9_10(BASE) \
+	ldd	[BASE + ( 0 * 0x4)], M0;	\
+	ld	[BASE + ( 2 * 0x4)], M2;	\
+	ldd	[BASE + ( 4 * 0x4)], M4;	\
+	ld	[BASE + ( 6 * 0x4)], M6;	\
+	ldd	[BASE + ( 8 * 0x4)], M8;	\
+	ld	[BASE + (10 * 0x4)], M10
+
+#define LDMATRIX_0_1_2_4_5_6_8_9_10_12_13_14(BASE) \
+	ldd	[BASE + ( 0 * 0x4)], M0;	\
+	ld	[BASE + ( 2 * 0x4)], M2;	\
+	ldd	[BASE + ( 4 * 0x4)], M4;	\
+	ld	[BASE + ( 6 * 0x4)], M6;	\
+	ldd	[BASE + ( 8 * 0x4)], M8;	\
+	ld	[BASE + (10 * 0x4)], M10;	\
+	ldd	[BASE + (12 * 0x4)], M12;	\
+	ld	[BASE + (14 * 0x4)], M14
+
+#define LDMATRIX_0_5_10(BASE) 			\
+	ld	[BASE + ( 0 * 0x4)], M0;	\
+	ld	[BASE + ( 5 * 0x4)], M5;	\
+	ld	[BASE + (10 * 0x4)], M10;	\
+
+#define LDMATRIX_0_5_10_12_13_14(BASE) 		\
+	ld	[BASE + ( 0 * 0x4)], M0;	\
+	ld	[BASE + ( 5 * 0x4)], M5;	\
+	ld	[BASE + (10 * 0x4)], M10;	\
+	ldd	[BASE + (12 * 0x4)], M12;	\
+	ld	[BASE + (14 * 0x4)], M14
+
+#define LDMATRIX_0_5_8_9_10_14(BASE) 		\
+	ld	[BASE + ( 0 * 0x4)], M0;	\
+	ld	[BASE + ( 5 * 0x4)], M5;	\
+	ldd	[BASE + ( 8 * 0x4)], M8;	\
+	ld	[BASE + (10 * 0x4)], M10;	\
+	ld	[BASE + (14 * 0x4)], M14
+
+#endif /* !(_SPARC_MATRIX_H) */
--- a/src/arch/sparc/xform.S
+++ b/src/arch/sparc/xform.S
--- a/src/arch/x86-64/Makefile.am
+++ b/src/arch/x86-64/Makefile.am
@ -0,0 +1,40 @@
+# Copyright © 2012 Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+if HAVE_X86_64_ASM
+# Everything up to the matching endif only takes effect when HAVE_X86_64_ASM is true.
+AM_CPPFLAGS = \
+	-I$(top_srcdir)/include \
+	-I$(top_srcdir)/src/mesa \
+	-I$(top_srcdir)/src/GLdispatch/mapi \
+	$(API_DEFINES) \
+	$(DEFINES)
+# gen_matypes is a build-time helper (noinst: not installed), compiled from the shared ../x86 source.
+noinst_PROGRAMS = gen_matypes
+
+gen_matypes_SOURCES = ../x86/gen_matypes.c
+BUILT_SOURCES = matypes.h
+CLEANFILES = matypes.h
+# matypes.h is the helper's standard output. NOTE(review): this runs a freshly built binary, which presumably breaks cross builds -- confirm.
+matypes.h: gen_matypes
+	$(AM_V_GEN)./gen_matypes > $@
+
+endif
--- a/src/arch/x86-64/calling_convention.txt
+++ b/src/arch/x86-64/calling_convention.txt
@ -0,0 +1,50 @@
+Register Usage
+rax      temporary register; with variable arguments passes information
+         about the number of SSE registers used; 1st return register
+
+rbx*     callee-saved register; optionally used as base pointer
+
+rcx      used to pass 4th integer argument to functions
+
+rdx      used to pass 3rd argument to functions; 2nd return register
+
+rsp*     stack pointer
+
+rbp*     callee-saved register; optionally used as frame pointer
+
+rsi      used to pass 2nd argument to functions
+
+rdi      used to pass 1st argument to functions
+
+r8       used to pass 5th argument to functions
+
+r9       used to pass 6th argument to functions
+
+r10      temporary register, used for passing a function's static chain pointer
+
+r11      temporary register
+
+r12-15*  callee-saved registers
+
+xmm0-1   used to pass and return floating point arguments
+
+xmm2-7   used to pass floating point arguments
+
+xmm8-15  temporary registers
+
+mmx0-7   temporary registers
+
+st0      temporary register; used to return long double arguments
+
+st1      temporary register; used to return long double arguments
+
+st2-7    temporary registers
+
+fs       Reserved for system use (as thread specific data register)
+
+	
+
+*) must be preserved across function calls
+
+Integer arguments from list: rdi,rsi,rdx,rcx,r8,r9,stack
+Floating point arguments from list: xmm0-xmm7
--- a/src/arch/x86-64/x86-64.c
+++ b/src/arch/x86-64/x86-64.c
@ -0,0 +1,119 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version:  6.3
+ *
+ * Copyright (C) 1999-2003  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * x86-64 optimizations shamelessly converted from x86/sse/3dnow assembly by
+ * Mikko Tiihonen
+ */
+
+#ifdef USE_X86_64_ASM
+
+#include "main/glheader.h"
+#include "main/context.h"
+#include "math/m_xform.h"
+#include "tnl/t_context.h"
+#include "x86-64.h"
+#include "../x86/x86_xform.h"
+
+/* The _math_test_* calls in _mesa_init_all_x86_64_transform_asm() are
+ * compiled under DEBUG_MATH, so make sure their header is pulled in for that
+ * configuration too, not only when DEBUG is defined.
+ */
+#if defined(DEBUG) || defined(DEBUG_MATH)
+#include "math/m_debug.h"
+#endif
+
+extern void _mesa_x86_64_cpuid(unsigned int *regs);
+
+DECLARE_XFORM_GROUP( x86_64, 4 )
+DECLARE_XFORM_GROUP( 3dnow, 4 )
+
+#else
+/* just to silence warning below */
+#include "x86-64.h"
+#endif
+
+/*
+extern void _mesa_x86_64_transform_points4_general( XFORM_ARGS );
+extern void _mesa_x86_64_transform_points4_identity( XFORM_ARGS );
+extern void _mesa_x86_64_transform_points4_perspective( XFORM_ARGS );
+extern void _mesa_x86_64_transform_points4_3d( XFORM_ARGS );
+extern void _mesa_x86_64_transform_points4_3d_no_rot( XFORM_ARGS );
+extern void _mesa_x86_64_transform_points4_2d_no_rot( XFORM_ARGS );
+extern void _mesa_x86_64_transform_points4_2d( XFORM_ARGS );
+*/
+
+#ifdef USE_X86_64_ASM
+/**
+ * Forward \p msg to _mesa_debug(), but only when the MESA_DEBUG environment
+ * variable is set; otherwise stay silent.
+ */
+static void message( const char *msg )
+{
+   if (!_mesa_getenv("MESA_DEBUG"))
+      return;
+
+   _mesa_debug( NULL, "%s", msg );
+}
+#endif
+
+
+/**
+ * Hook the x86-64 assembly transform routines into _mesa_transform_tab[4].
+ *
+ * Does nothing when built without USE_X86_64_ASM or when the MESA_NO_ASM
+ * environment variable is set.  The general, identity and 3D entries are
+ * always replaced; the remaining four entries are replaced only if CPUID
+ * reports the 3DNow! flag.
+ */
+void _mesa_init_all_x86_64_transform_asm(void)
+{
+#ifdef USE_X86_64_ASM
+   /* In/out block for _mesa_x86_64_cpuid(): eax, ebx, ecx, edx.
+    * eax selects leaf 0x80000001; the other inputs are zero.
+    */
+   unsigned int cpuid_regs[4] = {
+      0x80000001, 0x00000000, 0x00000000, 0x00000000
+   };
+   /* Bit of the returned edx that is tested as the 3DNow! flag. */
+   const unsigned int edx_3dnow_bit = 1U << 31;
+
+   if (_mesa_getenv("MESA_NO_ASM"))
+      return;
+
+   message("Initializing x86-64 optimizations\n");
+
+   /* Routines installed regardless of the CPUID result. */
+   _mesa_transform_tab[4][MATRIX_GENERAL]  = _mesa_x86_64_transform_points4_general;
+   _mesa_transform_tab[4][MATRIX_IDENTITY] = _mesa_x86_64_transform_points4_identity;
+   _mesa_transform_tab[4][MATRIX_3D]       = _mesa_x86_64_transform_points4_3d;
+
+   _mesa_x86_64_cpuid(cpuid_regs);
+
+   if ((cpuid_regs[3] & edx_3dnow_bit) != 0) {
+      message("3Dnow! detected\n");
+
+      _mesa_transform_tab[4][MATRIX_3D_NO_ROT]   = _mesa_3dnow_transform_points4_3d_no_rot;
+      _mesa_transform_tab[4][MATRIX_PERSPECTIVE] = _mesa_3dnow_transform_points4_perspective;
+      _mesa_transform_tab[4][MATRIX_2D_NO_ROT]   = _mesa_3dnow_transform_points4_2d_no_rot;
+      _mesa_transform_tab[4][MATRIX_2D]          = _mesa_3dnow_transform_points4_2d;
+   }
+
+#ifdef DEBUG_MATH
+   /* Self-test the tables that were just populated. */
+   _math_test_all_transform_functions("x86_64");
+   _math_test_all_cliptest_functions("x86_64");
+   _math_test_all_normal_transform_functions("x86_64");
+#endif
+
+#endif
+}
--- a/src/arch/x86-64/x86-64.h
+++ b/src/arch/x86-64/x86-64.h
@ -0,0 +1,31 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version:  3.5
+ *
+ * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __X86_64_ASM_H__
+#define __X86_64_ASM_H__
+
+/**
+ * Install the x86-64 assembly vertex-transform routines into
+ * _mesa_transform_tab.  The definition is an empty function when
+ * USE_X86_64_ASM is not defined, and returns early when the MESA_NO_ASM
+ * environment variable is set.
+ */
+extern void _mesa_init_all_x86_64_transform_asm( void );
+
+#endif
--- a/src/arch/x86-64/xform4.S
+++ b/src/arch/x86-64/xform4.S
@ -0,0 +1,483 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  7.1
+ *
+ * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifdef USE_X86_64_ASM
+
+#include "matypes.h"
+
+.text
+
+.align 16
+.globl _mesa_x86_64_cpuid
+.hidden _mesa_x86_64_cpuid
+/*
+ * Execute CPUID with inputs taken from, and results written back to, an
+ * array of four 32-bit values.
+ *
+ *	rdi = regs[4]: in  regs[0] -> eax, regs[2] -> ecx
+ *	               out eax, ebx, ecx, edx -> regs[0], regs[1], regs[2], regs[3]
+ */
+_mesa_x86_64_cpuid:
+	/* cpuid overwrites ebx, so save it here and restore it before returning */
+	pushq	%rbx
+	movl	(%rdi), %eax
+	movl	8(%rdi), %ecx
+
+	cpuid
+
+	movl	%ebx, 4(%rdi)
+	movl	%eax, (%rdi)
+	movl	%ecx, 8(%rdi)
+	movl	%edx, 12(%rdi)
+	popq	%rbx
+	ret
+
+.align 16
+.globl _mesa_x86_64_transform_points4_general
+.hidden _mesa_x86_64_transform_points4_general
+_mesa_x86_64_transform_points4_general:
+/*
+ * Multiply every 4-component source vertex by the full 4x4 matrix.
+ *
+ *	rdi = dest
+ *	rsi = matrix
+ *	rdx = source
+ *
+ * The source is read with unaligned loads and advanced by its stride; the
+ * destination is written with aligned stores, 16 bytes per vertex.
+ */
+	movl V4F_COUNT(%rdx), %ecx	/* count */
+	movzbl V4F_STRIDE(%rdx), %eax	/* stride */
+
+	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
+	movl $4, V4F_SIZE(%rdi)		/* set dest size */
+	.byte 0x66, 0x66, 0x66, 0x90		/* manual align += 4 */
+	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
+	
+	testl %ecx, %ecx		/* verify non-zero count */
+	prefetchnta 64(%rsi)
+	jz p4_general_done
+
+	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
+	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
+
+	prefetch 16(%rdx)
+
+	movaps 0(%rsi), %xmm4		/* m3  | m2  | m1  | m0  */
+	movaps 16(%rsi), %xmm5		/* m7  | m6  | m5  | m4  */
+	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
+	movaps 32(%rsi), %xmm6		/* m11 | m10 | m9  | m8  */
+        movaps 48(%rsi), %xmm7		/* m15 | m14 | m13 | m12 */
+
+p4_general_loop:
+
+	movups (%rdx), %xmm8		/* ox | oy | oz | ow */
+	prefetchw 16(%rdi)
+
+	pshufd $0x00, %xmm8, %xmm0	/* ox | ox | ox | ox */
+	addq %rax, %rdx
+	pshufd $0x55, %xmm8, %xmm1	/* oy | oy | oy | oy */
+	mulps %xmm4, %xmm0		/* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
+	pshufd $0xAA, %xmm8, %xmm2	/* oz | oz | oz | oz */
+	mulps %xmm5, %xmm1		/* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
+	pshufd $0xFF, %xmm8, %xmm3	/* ow | ow | ow | ow */
+	mulps %xmm6, %xmm2		/* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
+	addps %xmm1, %xmm0		/* ox*m3+oy*m7 | ... */
+	mulps %xmm7, %xmm3		/* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
+	addps %xmm2, %xmm0		/* ox*m3+oy*m7+oz*m11 | ... */
+	prefetch 16(%rdx)
+	addps %xmm3, %xmm0		/* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
+
+	movaps %xmm0, (%rdi)		/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
+	addq $16, %rdi
+
+	decl %ecx
+	jnz p4_general_loop
+
+p4_general_done:
+	.byte 0xf3			/* prefix byte: with the ret below this encodes "rep ret" */
+	ret
+	
+.section .rodata
+
+/*
+ * Two 16-byte constants loaded by _mesa_x86_64_transform_points4_3d:
+ *   bytes  0..15  AND mask that keeps the low three floats and clears the top one
+ *   bytes 16..31  0.0, 0.0, 0.0, 1.0 - OR'ed in to force the top float to 1.0
+ */
+.align 16
+p4_constants:
+.byte  0xff, 0xff, 0xff, 0xff
+.byte  0xff, 0xff, 0xff, 0xff
+.byte  0xff, 0xff, 0xff, 0xff
+.byte  0x00, 0x00, 0x00, 0x00
+
+.byte  0x00, 0x00, 0x00, 0x00
+.byte  0x00, 0x00, 0x00, 0x00
+.byte  0x00, 0x00, 0x00, 0x00
+.float 1.0
+
+.text
+.align 16
+.globl _mesa_x86_64_transform_points4_3d
+.hidden _mesa_x86_64_transform_points4_3d
+/*
+ * this is slower than _mesa_x86_64_transform_points4_general
+ * because it ensures that the last matrix row (or is it column?) is 0,0,0,1
+ *
+ *	rdi = dest
+ *	rsi = matrix
+ *	rdx = source
+ *
+ * The matrix in memory is not modified; m3, m7 and m11 are cleared and m15
+ * is set to 1.0 only in the register copies, using the p4_constants masks.
+ */
+_mesa_x86_64_transform_points4_3d:
+
+	leaq p4_constants(%rip), %rax
+
+	prefetchnta 64(%rsi)
+	
+	movaps (%rax), %xmm9		/* mask: keep low three floats */
+	movaps 16(%rax), %xmm10		/* 1.0 | 0.0 | 0.0 | 0.0 */
+
+	movl V4F_COUNT(%rdx), %ecx	/* count */
+	movzbl V4F_STRIDE(%rdx), %eax	/* stride */
+
+	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
+	movl $4, V4F_SIZE(%rdi)		/* set dest size */
+	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
+	
+	testl %ecx, %ecx		/* verify non-zero count */
+	jz p4_3d_done
+
+	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
+	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
+
+	prefetch 16(%rdx)
+
+	movaps 0(%rsi), %xmm4		/* m3  | m2  | m1  | m0  */
+	movaps 16(%rsi), %xmm5		/* m7  | m6  | m5  | m4  */
+	andps  %xmm9, %xmm4             /* 0.0 | m2  | m1  | m0  */
+	movaps 32(%rsi), %xmm6		/* m11 | m10 | m9  | m8  */
+	andps  %xmm9, %xmm5		/* 0.0 | m6  | m5  | m4  */
+        movaps 48(%rsi), %xmm7		/* m15 | m14 | m13 | m12 */
+	andps  %xmm9, %xmm6		/* 0.0 | m10 | m9  | m8  */
+	andps  %xmm9, %xmm7		/* 0.0 | m14 | m13 | m12  */
+	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
+	orps   %xmm10, %xmm7		/* 1.0 | m14 | m13 | m12  */
+
+p4_3d_loop:
+
+	movups (%rdx), %xmm8		/* ox | oy | oz | ow */
+	prefetchw 16(%rdi)
+
+	pshufd $0x00, %xmm8, %xmm0	/* ox | ox | ox | ox */
+	addq %rax, %rdx
+	pshufd $0x55, %xmm8, %xmm1	/* oy | oy | oy | oy */
+	mulps %xmm4, %xmm0		/* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
+	pshufd $0xAA, %xmm8, %xmm2	/* oz | oz | oz | oz */
+	mulps %xmm5, %xmm1		/* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
+	pshufd $0xFF, %xmm8, %xmm3	/* ow | ow | ow | ow */
+	mulps %xmm6, %xmm2		/* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
+	addps %xmm1, %xmm0		/* ox*m3+oy*m7 | ... */
+	mulps %xmm7, %xmm3		/* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
+	addps %xmm2, %xmm0		/* ox*m3+oy*m7+oz*m11 | ... */
+	prefetch 16(%rdx)
+	addps %xmm3, %xmm0		/* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
+
+	movaps %xmm0, (%rdi)		/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
+	addq $16, %rdi
+
+	dec %ecx
+	jnz p4_3d_loop
+
+p4_3d_done:
+	.byte 0xf3			/* prefix byte: with the ret below this encodes "rep ret" */
+	ret
+
+
+.align 16
+.globl _mesa_x86_64_transform_points4_identity
+.hidden _mesa_x86_64_transform_points4_identity
+/*
+ * Identity transform: copy each 4-component source vertex to dest unchanged.
+ *
+ *	rdi = dest
+ *	rsi = matrix (not read; the register is reused as the source pointer)
+ *	rdx = source
+ *
+ * Dest vertices are always written 16 bytes apart.  When the source is packed
+ * the same way the whole array is moved with a single rep movsq; for any
+ * other source stride the vertices are copied one at a time so that the
+ * stride is honoured, as it is by the other transform routines in this file.
+ */
+_mesa_x86_64_transform_points4_identity:
+
+	movl V4F_COUNT(%rdx), %ecx	/* count */
+	movzbl V4F_STRIDE(%rdx), %eax	/* stride */
+
+	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
+	movl $4, V4F_SIZE(%rdi)		/* set dest size */
+	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
+	
+	test %ecx, %ecx
+	jz p4_identity_done
+
+	movq V4F_START(%rdx), %rsi	/* ptr to first src vertex */
+	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
+	prefetch 64(%rsi)
+	prefetchw 64(%rdi)
+
+	cmpl $16, %eax			/* source tightly packed? */
+	jne p4_identity_loop
+
+	add %ecx, %ecx			/* two quadwords per vertex */
+
+	rep movsq
+	jmp p4_identity_done
+
+p4_identity_loop:
+
+	movups (%rsi), %xmm0		/* load one source vertex */
+	addq %rax, %rsi			/* advance source by its stride */
+	movups %xmm0, (%rdi)		/* unaligned store: same alignment tolerance as movsq */
+	addq $16, %rdi
+
+	decl %ecx
+	jnz p4_identity_loop
+
+p4_identity_done:
+	.byte 0xf3			/* prefix byte: with the ret below this encodes "rep ret" */
+	ret
+
+	
+.align 16
+.globl _mesa_3dnow_transform_points4_3d_no_rot
+.hidden _mesa_3dnow_transform_points4_3d_no_rot
+/*
+ * 3DNow! transform using only m00, m11, m22 and m30..m32:
+ *	r0 = x0*m00 + x3*m30
+ *	r1 = x1*m11 + x3*m31
+ *	r2 = x2*m22 + x3*m32
+ *	r3 = x3
+ *
+ *	rdi = dest
+ *	rsi = matrix
+ *	rdx = source
+ */
+_mesa_3dnow_transform_points4_3d_no_rot:
+
+	movl V4F_COUNT(%rdx), %ecx	/* count */
+	movzbl V4F_STRIDE(%rdx), %eax	/* stride */
+
+	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
+	movl $4, V4F_SIZE(%rdi)		/* set dest size */
+	.byte 0x66, 0x66, 0x90	        /* manual align += 3 */
+	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
+	
+	test %ecx, %ecx
+	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
+	jz p4_3d_no_rot_done
+
+	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
+	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
+
+	prefetch (%rdx)
+	
+	movd (%rsi), %mm0		/*                 | m00             */
+	.byte 0x66, 0x66, 0x90	        /* manual align += 3 */
+	punpckldq 20(%rsi), %mm0	/* m11             | m00             */
+
+	movd 40(%rsi), %mm2		/*                 | m22             */
+	movq 48(%rsi), %mm1		/* m31             | m30             */
+
+	punpckldq 56(%rsi), %mm2	/* m32             | m22             */
+
+p4_3d_no_rot_loop:
+
+	prefetchw 32(%rdi)
+	
+	movq  (%rdx), %mm4		/* x1              | x0              */
+	movq  8(%rdx), %mm5		/* x3              | x2              */
+	movd  12(%rdx), %mm7		/*                 | x3              */
+
+	movq  %mm5, %mm6		/* x3              | x2              */
+	pfmul %mm0, %mm4		/* x1*m11          | x0*m00          */
+
+	punpckhdq %mm6, %mm6		/* x3              | x3              */
+	pfmul %mm2, %mm5		/* x3*m32          | x2*m22          */
+
+	pfmul %mm1, %mm6		/* x3*m31          | x3*m30          */
+	pfacc %mm7, %mm5		/* x3              | x2*m22+x3*m32   */
+
+        pfadd %mm6, %mm4		/* x1*m11+x3*m31   | x0*m00+x3*m30   */
+
+	addq %rax, %rdx
+	movq %mm4, (%rdi)		/* write r0, r1                      */
+	movq %mm5, 8(%rdi)		/* write r2, r3                      */
+
+	addq $16, %rdi
+	
+	decl %ecx
+	prefetch 32(%rdx)
+	jnz p4_3d_no_rot_loop
+
+p4_3d_no_rot_done:
+	femms				/* fast emms: release the MMX register state */
+	ret
+
+	
+.align 16
+.globl _mesa_3dnow_transform_points4_perspective
+.hidden _mesa_3dnow_transform_points4_perspective
+/*
+ * 3DNow! transform for a perspective matrix:
+ *	r0 = x0*m00 + x2*m20
+ *	r1 = x1*m11 + x2*m21
+ *	r2 = x2*m22 + x3*m32
+ *	r3 = -x2
+ *
+ *	rdi = dest
+ *	rsi = matrix
+ *	rdx = source
+ */
+_mesa_3dnow_transform_points4_perspective:
+
+	movl V4F_COUNT(%rdx), %ecx	/* count */
+	movzbl V4F_STRIDE(%rdx), %eax	/* stride */
+
+	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
+	movl $4, V4F_SIZE(%rdi)		/* set dest size */
+	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
+	
+	test %ecx, %ecx
+	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
+	jz p4_perspective_done
+
+	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
+	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
+
+	movd (%rsi), %mm0		/*                 | m00             */
+        pxor %mm7, %mm7			/* 0               | 0               */
+	punpckldq 20(%rsi), %mm0	/* m11             | m00             */
+	
+	movq 32(%rsi), %mm2		/* m21             | m20             */
+	prefetch (%rdx)
+	
+	movd 40(%rsi), %mm1		/*                 | m22             */
+
+	.byte 0x66, 0x66, 0x90	        /* manual align += 3 */
+	punpckldq 56(%rsi), %mm1	/* m32             | m22             */
+
+
+p4_perspective_loop:
+
+	prefetchw 32(%rdi)		/* prefetch 2 vertices ahead         */
+
+	movq (%rdx), %mm4		/* x1              | x0              */
+	movq 8(%rdx), %mm5		/* x3              | x2              */
+	movd 8(%rdx), %mm3		/*                 | x2              */
+
+	movq %mm5, %mm6			/* x3              | x2              */
+	pfmul %mm0, %mm4		/* x1*m11          | x0*m00          */
+
+	punpckldq %mm5, %mm5		/* x2              | x2              */
+
+	pfmul %mm2, %mm5		/* x2*m21          | x2*m20          */
+	pfsubr %mm7, %mm3		/*                 | -x2             */
+
+	pfmul %mm1, %mm6		/* x3*m32          | x2*m22          */
+	pfadd %mm4, %mm5		/* x1*m11+x2*m21   | x0*m00+x2*m20   */
+
+	pfacc %mm3, %mm6		/* -x2             | x2*m22+x3*m32   */
+
+	movq %mm5, (%rdi)		/* write r0, r1                      */
+	addq %rax, %rdx	
+	movq %mm6, 8(%rdi)		/* write r2, r3                      */
+
+	addq $16, %rdi
+
+	decl %ecx
+	prefetch 32(%rdx)		/* hopefully stride is zero          */
+	jnz p4_perspective_loop
+
+p4_perspective_done:
+	femms				/* fast emms: release the MMX register state */
+	ret
+
+.align 16
+.globl _mesa_3dnow_transform_points4_2d_no_rot
+.hidden _mesa_3dnow_transform_points4_2d_no_rot
+/*
+ * 3DNow! transform using only m00, m11, m30 and m31:
+ *	r0 = x0*m00 + x3*m30
+ *	r1 = x1*m11 + x3*m31
+ *	r2 = x2
+ *	r3 = x3
+ *
+ *	rdi = dest
+ *	rsi = matrix
+ *	rdx = source
+ */
+_mesa_3dnow_transform_points4_2d_no_rot:
+
+	movl V4F_COUNT(%rdx), %ecx	/* count */
+	movzbl V4F_STRIDE(%rdx), %eax	/* stride */
+
+	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
+	movl $4, V4F_SIZE(%rdi)		/* set dest size */
+	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
+	
+	test %ecx, %ecx
+	.byte 0x90			/* manual align += 1 */
+	jz p4_2d_no_rot_done
+
+	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
+	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
+
+	movd (%rsi), %mm0		/*                 | m00             */
+	prefetch (%rdx)
+	punpckldq 20(%rsi), %mm0	/* m11             | m00             */
+	
+	movq 48(%rsi), %mm1		/* m31             | m30             */
+
+p4_2d_no_rot_loop:
+
+	prefetchw 32(%rdi)		/* prefetch 2 vertices ahead         */
+
+	movq (%rdx), %mm4		/* x1              | x0              */
+	movq 8(%rdx), %mm5		/* x3              | x2              */
+
+	pfmul %mm0, %mm4		/* x1*m11          | x0*m00          */
+	movq %mm5, %mm6			/* x3              | x2              */
+
+	punpckhdq %mm6, %mm6		/* x3              | x3              */
+
+	addq %rax, %rdx	
+	pfmul %mm1, %mm6		/* x3*m31          | x3*m30          */
+
+	prefetch 32(%rdx)		/* hopefully stride is zero          */
+	pfadd %mm4, %mm6		/* x1*m11+x3*m31   | x0*m00+x3*m30   */
+
+	movq %mm6, (%rdi)		/* write r0, r1                      */
+	movq %mm5, 8(%rdi)		/* write r2, r3 (x2, x3 unchanged)   */
+
+	addq $16, %rdi
+
+	decl %ecx
+	jnz p4_2d_no_rot_loop
+
+p4_2d_no_rot_done:
+	femms				/* fast emms: release the MMX register state */
+	ret
+
+	
+.align 16
+.globl _mesa_3dnow_transform_points4_2d
+.hidden _mesa_3dnow_transform_points4_2d
+/*
+ * 3DNow! transform using m00, m01, m10, m11, m30 and m31:
+ *	r0 = x0*m00 + x1*m10 + x3*m30
+ *	r1 = x0*m01 + x1*m11 + x3*m31
+ *	r2 = x2
+ *	r3 = x3
+ *
+ *	rdi = dest
+ *	rsi = matrix
+ *	rdx = source
+ */
+_mesa_3dnow_transform_points4_2d:
+
+	movl V4F_COUNT(%rdx), %ecx	/* count */
+	movzbl V4F_STRIDE(%rdx), %eax	/* stride */
+
+	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
+	movl $4, V4F_SIZE(%rdi)		/* set dest size */
+	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
+	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
+	
+	test %ecx, %ecx
+	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
+	jz p4_2d_done
+
+	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
+	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
+
+	movd (%rsi), %mm0		/*                 | m00             */
+	movd 4(%rsi), %mm1		/*                 | m01             */
+
+	prefetch (%rdx)
+
+	punpckldq 16(%rsi), %mm0	/* m10             | m00             */
+	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
+	punpckldq 20(%rsi), %mm1	/* m11             | m01             */
+
+	movq 48(%rsi), %mm2		/* m31             | m30             */
+
+p4_2d_loop:
+
+	prefetchw 32(%rdi)		/* prefetch 2 vertices ahead         */
+
+	movq (%rdx), %mm3		/* x1              | x0              */
+	movq 8(%rdx), %mm5		/* x3              | x2              */
+
+	movq %mm3, %mm4			/* x1              | x0              */
+	movq %mm5, %mm6			/* x3              | x2              */
+
+	pfmul %mm1, %mm4		/* x1*m11          | x0*m01          */
+	punpckhdq %mm6, %mm6		/* x3              | x3              */
+
+	pfmul %mm0, %mm3		/* x1*m10          | x0*m00          */
+
+	addq %rax, %rdx	
+	pfacc %mm4, %mm3		/* x0*m01+x1*m11   | x0*m00+x1*m10   */
+
+	pfmul %mm2, %mm6		/* x3*m31          | x3*m30          */
+	prefetch 32(%rdx)		/* hopefully stride is zero          */
+
+	pfadd %mm6, %mm3		/* r1              | r0              */
+
+	movq %mm3, (%rdi)		/* write r0, r1                      */
+	movq %mm5, 8(%rdi)		/* write r2, r3 (x2, x3 unchanged)   */
+
+	addq $16, %rdi
+
+	decl %ecx
+	jnz p4_2d_loop
+
+p4_2d_done:
+	femms				/* fast emms: release the MMX register state */
+	ret
+			
+#endif
+	
+#if defined (__ELF__) && defined (__linux__)
+	.section .note.GNU-stack,"",%progbits
+#endif
--- a/src/arch/x86/3dnow.c
+++ b/src/arch/x86/3dnow.c
@ -0,0 +1,91 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version:  5.0.1
+ *
+ * Copyright (C) 1999-2003  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * 3DNow! optimizations contributed by
+ * Holger Waechtler <holger@akaflieg.extern.tu-berlin.de>
+ */
+
+#include "main/glheader.h"
+#include "main/context.h"
+#include "math/m_xform.h"
+#include "tnl/t_context.h"
+
+#include "3dnow.h"
+#include "x86_xform.h"
+
+#ifdef DEBUG_MATH
+#include "math/m_debug.h"
+#endif
+
+
+#ifdef USE_3DNOW_ASM
+/* Macro-generated declarations for the 3DNow! transform groups (2-, 3- and
+ * 4-component input).  The macros are not defined in this file; presumably
+ * they come from x86_xform.h - TODO confirm.
+ */
+DECLARE_XFORM_GROUP( 3dnow, 2 )
+DECLARE_XFORM_GROUP( 3dnow, 3 )
+DECLARE_XFORM_GROUP( 3dnow, 4 )
+
+/* Declared even though the matching ASSIGN_NORM_GROUP call in
+ * _mesa_init_3dnow_transform_asm() is currently commented out.
+ */
+DECLARE_NORM_GROUP( 3dnow )
+
+
+/* The three prototypes below are not referenced anywhere else in this file. */
+extern void _ASMAPI
+_mesa_v16_3dnow_general_xform( GLfloat *first_vert,
+			       const GLfloat *m,
+			       const GLfloat *src,
+			       GLuint src_stride,
+			       GLuint count );
+
+extern void _ASMAPI
+_mesa_3dnow_project_vertices( GLfloat *first,
+			      GLfloat *last,
+			      const GLfloat *m,
+			      GLuint stride );
+
+extern void _ASMAPI
+_mesa_3dnow_project_clipped_vertices( GLfloat *first,
+				      GLfloat *last,
+				      const GLfloat *m,
+				      GLuint stride,
+				      const GLubyte *clipmask );
+#endif
+
+
+/**
+ * Hook up the 3DNow! transform groups for 2-, 3- and 4-component input.
+ *
+ * Compiles to an empty function when USE_3DNOW_ASM is not defined.  The
+ * normal-transform group is deliberately left unassigned (see below).  With
+ * DEBUG_MATH defined, the transform and normal-transform self-tests are run
+ * afterwards.
+ */
+void _mesa_init_3dnow_transform_asm( void )
+{
+#ifdef USE_3DNOW_ASM
+   ASSIGN_XFORM_GROUP( 3dnow, 2 );
+   ASSIGN_XFORM_GROUP( 3dnow, 3 );
+   ASSIGN_XFORM_GROUP( 3dnow, 4 );
+
+   /* There's a bug somewhere in the 3dnow_normal.S file that causes
+    * bad shading.  Disable for now.
+   ASSIGN_NORM_GROUP( 3dnow );
+   */
+
+#ifdef DEBUG_MATH
+   _math_test_all_transform_functions( "3DNow!" );
+   /* Still invoked even though the 3DNow! normal group is not assigned above. */
+   _math_test_all_normal_transform_functions( "3DNow!" );
+#endif
+#endif
+}
--- a/src/arch/x86/3dnow.h
+++ b/src/arch/x86/3dnow.h
@ -0,0 +1,36 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version:  3.5
+ *
+ * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * 3DNow! optimizations contributed by
+ * Holger Waechtler <holger@akaflieg.extern.tu-berlin.de>
+ */
+
+#ifndef __3DNOW_H__
+#define __3DNOW_H__
+
+/**
+ * Set up the 3DNow! transform routines.  The definition has an empty body
+ * when USE_3DNOW_ASM is not defined.
+ */
+void _mesa_init_3dnow_transform_asm( void );
+
+#endif
--- a/src/arch/x86/3dnow_normal.S
+++ b/src/arch/x86/3dnow_normal.S
@ -0,0 +1,852 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version:  5.1
+ *
+ * Copyright (C) 1999-2003  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * 3Dnow assembly code by Holger Waechtler
+ */
+
+#ifdef USE_3DNOW_ASM
+
+#include "assyntax.h"
+#include "matypes.h"
+#include "norm_args.h"
+
+        SEG_TEXT
+
+/* M(i): the 32-bit word at byte offset i * 4 from ECX.  Only meaningful
+ * after ECX has been loaded with the matrix array pointer. */
+#define M(i)    REGOFF(i * 4, ECX)
+/* STRIDE: the 32-bit word at byte offset 12 from ESI.  The routines below
+ * add it to the source pointer to advance to the next normal, so ESI must
+ * hold the input vector pointer wherever it is used. */
+#define STRIDE  REGOFF(12, ESI)
+
+
+/*
+ * _mesa_3dnow_transform_normalize_normals
+ *
+ * Pass 1 multiplies every source normal by the 3x3 block of mat->inv
+ * (elements 0-2, 4-6 and 8-10) and stores the result in dest.  Pass 2 walks
+ * dest again and multiplies each result either by the matching entry of the
+ * lengths array or, when lengths is 0, by a computed 1/sqrt(x.x).
+ *
+ * Stack arguments, read through the ARG_* macros:
+ *   ARG_MAT     - matrix object; only the array at MATRIX_INV is read
+ *   ARG_SCALE   - uniform scale factor (32-bit float)
+ *   ARG_IN      - source vector (V4F_START, V4F_COUNT, stride word)
+ *   ARG_LENGTHS - per-normal factors, or 0 to compute them here
+ *   ARG_DEST    - destination vector; V4F_COUNT is set to in->count and
+ *                 results are written 16 bytes apart, three floats each
+ *
+ * The scale factor is folded into the matrix only when a lengths array is
+ * supplied.  That path multiplies by lengths[i] without renormalizing, so
+ * it is the only one whose output depends on scale; without lengths the
+ * result is renormalized and a uniform scale would cancel out.
+ *
+ * Saves and restores EDI, ESI and EBP; uses EAX, ECX, EDX and MM0-MM7.
+ */
+ALIGNTEXT16
+GLOBL GLNAME(_mesa_3dnow_transform_normalize_normals)
+HIDDEN(_mesa_3dnow_transform_normalize_normals)
+GLNAME(_mesa_3dnow_transform_normalize_normals):
+
+#define FRAME_OFFSET 12
+
+    PUSH_L     ( EDI )
+    PUSH_L     ( ESI )
+    PUSH_L     ( EBP )
+
+    MOV_L      ( ARG_LENGTHS, EDI )
+    MOV_L      ( ARG_IN, ESI )
+    MOV_L      ( ARG_DEST, EAX )
+    MOV_L      ( REGOFF(V4F_COUNT, ESI), EBP ) /*  dest->count = in->count   */
+    MOV_L      ( EBP, REGOFF(V4F_COUNT, EAX) )
+    MOV_L      ( REGOFF(V4F_START, ESI), EDX ) /*  in->start    */
+    MOV_L      ( REGOFF(V4F_START, EAX), EAX ) /*  dest->start  */
+    MOV_L      ( ARG_MAT, ECX )
+    MOV_L      ( REGOFF(MATRIX_INV, ECX), ECX ) /*  mat->inv     */
+
+    CMP_L      ( CONST(0), EBP )        /*  count == 0 ?  nothing to do    */
+    JE         ( LLBL (G3TN_end) )
+
+    FEMMS
+
+    PUSH_L     ( EBP )
+    PUSH_L     ( EAX )
+    PUSH_L     ( EDX )                  /*  save counter & pointers for    */
+                                        /*  the normalize pass             */
+#undef  FRAME_OFFSET
+#define FRAME_OFFSET 24
+
+    MOVQ       ( M(0), MM3 )            /*  m1              | m0           */
+    MOVQ       ( M(4), MM4 )            /*  m5              | m4           */
+
+    MOVD       ( M(2), MM5 )            /*                  | m2           */
+    PUNPCKLDQ  ( M(6), MM5 )            /*  m6              | m2           */
+
+    MOVQ       ( M(8), MM6 )            /*  m9              | m8           */
+    MOVQ       ( M(10), MM7 )           /*  m11 (unused)    | m10          */
+
+    /* Scale matters only when precomputed lengths are used: that path is
+     * not renormalized.  Skip the scaling when lengths == 0. */
+    CMP_L      ( CONST(0), EDI )        /*  lengths == 0 ?                 */
+    JE         ( LLBL (G3TN_scale_end ) )
+
+    MOVD       ( ARG_SCALE, MM0 )       /*               | scale           */
+    PUNPCKLDQ  ( MM0, MM0 )             /* scale         | scale           */
+
+    PFMUL      ( MM0, MM3 )             /* scale * m1    | scale * m0      */
+    PFMUL      ( MM0, MM4 )             /* scale * m5    | scale * m4      */
+    PFMUL      ( MM0, MM5 )             /* scale * m6    | scale * m2      */
+    PFMUL      ( MM0, MM6 )             /* scale * m9    | scale * m8      */
+    PFMUL      ( MM0, MM7 )             /* (unused)      | scale * m10     */
+
+ALIGNTEXT32
+LLBL (G3TN_scale_end):
+LLBL (G3TN_transform):
+    MOVQ       ( REGIND (EDX), MM0 )    /*  x1              | x0           */
+    MOVD       ( REGOFF (8, EDX), MM2 ) /*                  | x2           */
+
+    MOVQ       ( MM0, MM1 )             /*  x1              | x0           */
+    PUNPCKLDQ  ( MM2, MM2 )             /*  x2              | x2           */
+
+    PFMUL      ( MM3, MM0 )             /*  x1*m1           | x0*m0        */
+    ADD_L      ( CONST(16), EAX )       /*  next r                         */
+
+    PREFETCHW  ( REGIND(EAX) )
+
+    PFMUL      ( MM4, MM1 )             /*  x1*m5           | x0*m4        */
+    PFACC      ( MM1, MM0 )             /*  x0*m4+x1*m5     | x0*m0+x1*m1  */
+
+    PFMUL      ( MM5, MM2 )             /*  x2*m6           | x2*m2        */
+    PFADD      ( MM2, MM0 )             /*  r1 = ..+x2*m6   | r0 = ..+x2*m2 */
+
+    MOVQ       ( REGIND (EDX), MM1 )    /*  x1           | x0              */
+    MOVQ       ( MM0, REGOFF(-16, EAX) ) /* write r0, r1                   */
+
+    PFMUL      ( MM6, MM1 )             /* x1*m9         | x0*m8           */
+    MOVD       ( REGOFF (8, EDX), MM2 ) /*               | x2              */
+
+    PFMUL      ( MM7, MM2 )             /*               | x2*m10          */
+    PFACC      ( MM1, MM1 )             /*  *not used*   | x0*m8+x1*m9     */
+
+    PFADD      ( MM2, MM1 )             /*  *not used*   | r2 = ..+x2*m10  */
+    ADD_L      ( STRIDE, EDX )          /*  next normal                    */
+
+    PREFETCH   ( REGIND(EDX) )
+
+    MOVD       ( MM1, REGOFF(-8, EAX) ) /*  write r2                       */
+    SUB_L      ( CONST(1), EBP )        /*  decrement normal counter       */
+    JNZ        ( LLBL (G3TN_transform) )
+
+
+    POP_L      ( EDX )                  /*  end of transform ---           */
+    POP_L      ( EAX )                  /*    now normalizing ...          */
+    POP_L      ( EBP )
+
+    CMP_L      ( CONST(0), EDI )        /*  lengths == 0 ?                 */
+    JE         ( LLBL (G3TN_norm ) )    /*  calculate lengths              */
+
+
+ALIGNTEXT32
+LLBL (G3TN_norm_w_lengths):             /*  in place on dest, using        */
+                                        /*  the supplied lengths           */
+    PREFETCHW  ( REGOFF(12,EAX) )
+
+    MOVQ       ( REGIND(EAX), MM0 )     /*  x1              | x0           */
+    MOVD       ( REGOFF(8, EAX), MM1 )  /*                  | x2           */
+
+    MOVD       ( REGIND (EDI), MM3 )    /*                  | length (x)   */
+    PFMUL      ( MM3, MM1 )             /*                  | x2 (normalized) */
+
+    PUNPCKLDQ  ( MM3, MM3 )             /*  length (x)      | length (x)   */
+    PFMUL      ( MM3, MM0 )             /*  x1 (normalized) | x0 (normalized) */
+
+    ADD_L      ( CONST(4), EDI )        /*  next length                    */
+
+    PREFETCH   ( REGIND(EDI) )
+
+    MOVQ       ( MM0, REGIND(EAX) )     /*  write new x0, x1               */
+    MOVD       ( MM1, REGOFF(8, EAX) )  /*  write new x2                   */
+
+    ADD_L      ( CONST(16), EAX )       /*  next r                         */
+    SUB_L      ( CONST(1), EBP )        /*  decrement normal counter       */
+
+    JNZ        ( LLBL (G3TN_norm_w_lengths) )
+    JMP        ( LLBL (G3TN_exit_3dnow) )
+
+ALIGNTEXT32
+LLBL (G3TN_norm):                       /*  in place on dest, computing    */
+                                        /*  1/sqrt(x.x) per normal         */
+    PREFETCHW  ( REGIND(EAX) )
+
+    MOVQ       ( REGIND (EAX), MM0 )    /*  x1              | x0           */
+    MOVD       ( REGOFF(8, EAX), MM1 )  /*                  | x2           */
+
+    MOVQ       ( MM0, MM3 )             /*  x1              | x0           */
+    MOVQ       ( MM1, MM4 )             /*                  | x2           */
+
+    PFMUL      ( MM0, MM3 )             /*  x1*x1           | x0*x0        */
+    ADD_L      ( CONST(16), EAX )       /*  next r                         */
+
+    PFMUL      ( MM1, MM4 )             /*                  | x2*x2        */
+    PFADD      ( MM4, MM3 )             /*  x1*x1           | x0*x0+x2*x2  */
+
+    PFACC      ( MM3, MM3 )             /*  x.x             | x.x          */
+    PFRSQRT    ( MM3, MM5 )             /*  estimate of 1/sqrt (x.x)       */
+
+    MOVQ       ( MM5, MM4 )             /*  keep the estimate              */
+    PUNPCKLDQ  ( MM3, MM3 )
+
+    /* SUB sets ZF for the JNZ below; the 3DNow!/MMX instructions in
+     * between do not modify EFLAGS. */
+    SUB_L      ( CONST(1), EBP )        /*  decrement normal counter       */
+    PFMUL      ( MM5, MM5 )             /*  estimate squared               */
+
+    PFRSQIT1   ( MM3, MM5 )             /*  refinement step 1              */
+    PFRCPIT2   ( MM4, MM5 )             /*  refinement step 2              */
+
+    PFMUL      ( MM5, MM0 )             /*  x1 (normalized) | x0 (normalized) */
+
+    MOVQ       ( MM0, REGOFF(-16, EAX) ) /*  write new x0, x1              */
+    PFMUL      ( MM5, MM1 )             /*                  | x2 (normalized) */
+
+    MOVD       ( MM1, REGOFF(-8, EAX) ) /*  write new x2                   */
+    JNZ        ( LLBL (G3TN_norm) )
+
+LLBL (G3TN_exit_3dnow):
+    FEMMS
+
+LLBL (G3TN_end):
+    POP_L      ( EBP )
+    POP_L      ( ESI )
+    POP_L      ( EDI )
+    RET
+
+
+
+/*
+ * _mesa_3dnow_transform_normalize_normals_no_rot
+ *
+ * Single-pass variant that uses only the diagonal elements 0, 5 and 10 of
+ * mat->inv: each source component is multiplied by its diagonal entry and
+ * the result is normalized before being stored in dest.
+ *
+ * Stack arguments, read through the ARG_* macros:
+ *   ARG_MAT     - matrix object; only the array at MATRIX_INV is read
+ *   ARG_SCALE   - uniform scale factor (32-bit float)
+ *   ARG_IN      - source vector (V4F_START, V4F_COUNT, stride word)
+ *   ARG_LENGTHS - per-normal factors, or 0 to compute them here
+ *   ARG_DEST    - destination vector; V4F_COUNT is set to in->count and
+ *                 results are written 16 bytes apart, three floats each
+ *
+ * The scale factor is folded into the diagonal only when a lengths array is
+ * supplied; without lengths the result is renormalized and a uniform scale
+ * would cancel out.  Each length is loaded at the top of its own iteration
+ * so that exactly `count` entries of the lengths array are read.
+ *
+ * Saves and restores EDI, ESI and EBP; uses EAX, ECX, EDX and MM0-MM7.
+ */
+ALIGNTEXT16
+GLOBL GLNAME(_mesa_3dnow_transform_normalize_normals_no_rot)
+HIDDEN(_mesa_3dnow_transform_normalize_normals_no_rot)
+GLNAME(_mesa_3dnow_transform_normalize_normals_no_rot):
+
+#undef FRAME_OFFSET
+#define FRAME_OFFSET 12
+
+    PUSH_L     ( EDI )
+    PUSH_L     ( ESI )
+    PUSH_L     ( EBP )
+
+    MOV_L      ( ARG_LENGTHS, EDI )
+    MOV_L      ( ARG_IN, ESI )
+    MOV_L      ( ARG_DEST, EAX )
+    MOV_L      ( REGOFF(V4F_COUNT, ESI), EBP ) /*  dest->count = in->count   */
+    MOV_L      ( EBP, REGOFF(V4F_COUNT, EAX) )
+    MOV_L      ( ARG_MAT, ECX )
+    MOV_L      ( REGOFF(V4F_START, EAX), EAX ) /*  dest->start  */
+    MOV_L      ( REGOFF(MATRIX_INV, ECX), ECX ) /*  mat->inv     */
+    MOV_L      ( REGOFF(V4F_START, ESI), EDX ) /*  in->start    */
+
+    CMP_L      ( CONST(0), EBP )        /* count == 0 ?  nothing to do        */
+    JE         ( LLBL (G3TNNR_end) )
+
+    FEMMS
+
+    MOVD       ( M(0), MM0 )            /*               | m0                 */
+    PUNPCKLDQ  ( M(5), MM0 )            /* m5            | m0                 */
+
+    MOVD       ( M(10), MM2 )           /*               | m10                */
+    PUNPCKLDQ  ( MM2, MM2 )             /* m10           | m10                */
+
+    /* Without a lengths array the result is renormalized, so the scale
+     * factor is irrelevant: go straight to the computing loop. */
+    CMP_L      ( CONST(0), EDI )        /* lengths == 0 ?                     */
+    JE         ( LLBL (G3TNNR_norm) )   /* need to calculate lengths          */
+
+    MOVD       ( ARG_SCALE, MM7 )       /*               | scale              */
+    PUNPCKLDQ  ( MM7, MM7 )             /* scale         | scale              */
+
+    PFMUL      ( MM7, MM0 )             /* scale * m5    | scale * m0         */
+    PFMUL      ( MM7, MM2 )             /* scale * m10   | scale * m10        */
+
+ALIGNTEXT32
+LLBL (G3TNNR_norm_w_lengths):           /* use precalculated lengths          */
+
+    PREFETCHW  ( REGIND(EAX) )
+
+    MOVQ       ( REGIND(EDX), MM6 )     /* x1            | x0                 */
+    MOVD       ( REGOFF(8, EDX), MM7 )  /*               | x2                 */
+
+    MOVD       ( REGIND(EDI), MM3 )     /*               | length (x)         */
+
+    PFMUL      ( MM0, MM6 )             /* x1*m5         | x0*m0              */
+    ADD_L      ( STRIDE, EDX )          /* next normal                        */
+
+    PREFETCH   ( REGIND(EDX) )
+
+    PFMUL      ( MM2, MM7 )             /*               | x2*m10             */
+    ADD_L      ( CONST(16), EAX )       /* next r                             */
+
+    PFMUL      ( MM3, MM7 )             /*               | x2 (normalized)    */
+    PUNPCKLDQ  ( MM3, MM3 )             /* length (x)    | length (x)         */
+
+    ADD_L      ( CONST(4), EDI )        /* next length                        */
+    PFMUL      ( MM3, MM6 )             /* x1 (normalized) | x0 (normalized)  */
+
+    /* SUB sets ZF for the JNZ below; the MMX stores do not touch EFLAGS. */
+    SUB_L      ( CONST(1), EBP )        /* decrement normal counter           */
+    MOVQ       ( MM6, REGOFF(-16, EAX) ) /* write r0, r1                      */
+
+    MOVD       ( MM7, REGOFF(-8, EAX) ) /* write r2                           */
+
+    JNZ        ( LLBL (G3TNNR_norm_w_lengths) )
+    JMP        ( LLBL (G3TNNR_exit_3dnow) )
+
+ALIGNTEXT32
+LLBL (G3TNNR_norm):                     /* need to calculate lengths          */
+
+    PREFETCHW  ( REGIND(EAX) )
+
+    MOVQ       ( REGIND(EDX), MM6 )     /* x1              | x0               */
+    MOVD       ( REGOFF(8, EDX), MM7 )  /*                 | x2               */
+
+    PFMUL      ( MM0, MM6 )             /* x1*m5           | x0*m0            */
+    ADD_L      ( CONST(16), EAX )       /* next r                             */
+
+    PFMUL      ( MM2, MM7 )             /*                 | x2*m10           */
+    MOVQ       ( MM6, MM3 )             /* x1 (transformed)| x0 (transformed) */
+
+    MOVQ       ( MM7, MM4 )             /*                 | x2 (transformed) */
+    PFMUL      ( MM6, MM3 )             /* x1*x1           | x0*x0            */
+
+
+    PFMUL      ( MM7, MM4 )             /*                 | x2*x2            */
+    PFACC      ( MM3, MM3 )             /* **not used**    | x0*x0+x1*x1      */
+
+    PFADD      ( MM4, MM3 )             /*                 | x0*x0+x1*x1+x2*x2*/
+    ADD_L      ( STRIDE, EDX )          /* next normal                        */
+
+    PREFETCH   ( REGIND(EDX) )
+
+    PFRSQRT    ( MM3, MM5 )             /* estimate of 1/sqrt (x.x)           */
+    MOVQ       ( MM5, MM4 )             /* keep the estimate                  */
+
+    PUNPCKLDQ  ( MM3, MM3 )
+    PFMUL      ( MM5, MM5 )             /* estimate squared                   */
+
+    PFRSQIT1   ( MM3, MM5 )             /* refinement step 1                  */
+    SUB_L      ( CONST(1), EBP )        /* decrement normal counter           */
+
+    PFRCPIT2   ( MM4, MM5 )             /* refinement step 2                  */
+    PFMUL      ( MM5, MM6 )             /* x1 (normalized) | x0 (normalized)  */
+
+    MOVQ       ( MM6, REGOFF(-16, EAX) ) /* write r0, r1                      */
+    PFMUL      ( MM5, MM7 )             /*                 | x2 (normalized)  */
+
+    MOVD       ( MM7, REGOFF(-8, EAX) ) /* write r2                           */
+    JNZ        ( LLBL (G3TNNR_norm) )
+
+
+LLBL (G3TNNR_exit_3dnow):
+    FEMMS
+
+LLBL (G3TNNR_end):
+    POP_L      ( EBP )
+    POP_L      ( ESI )
+    POP_L      ( EDI )
+    RET
+
+
+
+
+
+
+/*
+ * _mesa_3dnow_transform_rescale_normals_no_rot
+ *
+ * Multiplies each source normal component-wise by scale times the diagonal
+ * elements 0, 5 and 10 of mat->inv and stores the result in dest.  No
+ * normalization is performed.
+ *
+ * Stack arguments read (via the ARG_* macros): ARG_MAT, ARG_SCALE (32-bit
+ * float), ARG_IN and ARG_DEST.  dest->count is set to in->count; results
+ * are written 16 bytes apart, three floats each.
+ *
+ * Saves and restores EDI, ESI and EBP (EDI is saved but not otherwise
+ * used); uses EAX, ECX, EDX and MM0, MM2, MM4-MM6.
+ */
+ALIGNTEXT16
+GLOBL GLNAME(_mesa_3dnow_transform_rescale_normals_no_rot)
+HIDDEN(_mesa_3dnow_transform_rescale_normals_no_rot)
+GLNAME(_mesa_3dnow_transform_rescale_normals_no_rot):
+
+#undef FRAME_OFFSET
+#define FRAME_OFFSET 12
+
+    PUSH_L     ( EDI )
+    PUSH_L     ( ESI )
+    PUSH_L     ( EBP )
+
+    MOV_L      ( ARG_IN, EAX )
+    MOV_L      ( ARG_DEST, EDX )
+    MOV_L      ( REGOFF(V4F_COUNT, EAX), EBP ) /*  dest->count = in->count   */
+    MOV_L      ( EBP, REGOFF(V4F_COUNT, EDX) )
+    MOV_L      ( ARG_IN, ESI )
+    MOV_L      ( ARG_MAT, ECX )
+    MOV_L      ( REGOFF(MATRIX_INV, ECX), ECX ) /*  mat->inv     */
+    MOV_L      ( REGOFF(V4F_START, EDX), EAX ) /*  dest->start  */
+    MOV_L      ( REGOFF(V4F_START, ESI), EDX ) /*  in->start    */
+
+    CMP_L      ( CONST(0), EBP )        /* count == 0 ?  nothing to do        */
+    JE         ( LLBL (G3TRNR_end) )
+
+    FEMMS
+
+    MOVD       ( ARG_SCALE, MM6 )       /*               | scale              */
+    PUNPCKLDQ  ( MM6, MM6 )             /* scale         | scale              */
+
+    MOVD       ( REGIND(ECX), MM0 )     /*               | m0                 */
+    PUNPCKLDQ  ( REGOFF(20, ECX), MM0 ) /* m5            | m0                 */
+
+    PFMUL      ( MM6, MM0 )             /* scale*m5      | scale*m0           */
+    MOVD       ( REGOFF(40, ECX), MM2 ) /*               | m10                */
+
+    PFMUL      ( MM6, MM2 )             /*               | scale*m10          */
+
+ALIGNTEXT32
+LLBL (G3TRNR_rescale):
+
+    PREFETCHW  ( REGIND(EAX) )
+	
+    MOVQ       ( REGIND(EDX), MM4 )     /* x1            | x0                 */
+    MOVD       ( REGOFF(8, EDX), MM5 )  /*               | x2                 */
+	
+    PFMUL      ( MM0, MM4 )             /* x1*m5         | x0*m0              */
+    ADD_L      ( STRIDE, EDX )          /* next normal                        */
+
+    PREFETCH   ( REGIND(EDX) )
+
+    PFMUL      ( MM2, MM5 )             /*               | x2*m10             */
+    ADD_L      ( CONST(16), EAX )       /* next r                             */
+
+    /* SUB sets ZF for the JNZ below; the MMX stores do not touch EFLAGS. */
+    SUB_L      ( CONST(1), EBP )        /* decrement normal counter           */
+    MOVQ       ( MM4, REGOFF(-16, EAX) ) /* write r0, r1                      */
+
+    MOVD       ( MM5, REGOFF(-8, EAX) ) /* write r2                           */
+    JNZ        ( LLBL (G3TRNR_rescale) ) /* cnt > 0 ? -> process next normal  */
+
+    FEMMS
+
+LLBL (G3TRNR_end):
+    POP_L      ( EBP )
+    POP_L      ( ESI )
+    POP_L      ( EDI )
+    RET
+
+
+
+
+
+/*
+ * _mesa_3dnow_transform_rescale_normals
+ *
+ * Multiplies each source normal by the 3x3 block of mat->inv (elements 0-2,
+ * 4-6 and 8-10), pre-multiplied by scale, and stores the result in dest.
+ * No normalization is performed.
+ *
+ * Stack arguments read (via the ARG_* macros): ARG_MAT, ARG_SCALE (32-bit
+ * float), ARG_IN and ARG_DEST.  dest->count is set to in->count; results
+ * are written 16 bytes apart, three floats each.
+ *
+ * Saves and restores EDI and ESI; uses EAX, ECX, EDX and MM0-MM7.
+ */
+ALIGNTEXT16
+GLOBL GLNAME(_mesa_3dnow_transform_rescale_normals)
+HIDDEN(_mesa_3dnow_transform_rescale_normals)
+GLNAME(_mesa_3dnow_transform_rescale_normals):
+
+#undef  FRAME_OFFSET
+#define FRAME_OFFSET 8
+
+    PUSH_L     ( EDI )
+    PUSH_L     ( ESI )
+
+    MOV_L      ( ARG_IN, ESI )
+    MOV_L      ( ARG_DEST, EAX )
+    MOV_L      ( ARG_MAT, ECX )
+    MOV_L      ( REGOFF(V4F_COUNT, ESI), EDI ) /*  dest->count = in->count   */
+    MOV_L      ( EDI, REGOFF(V4F_COUNT, EAX) )
+    MOV_L      ( REGOFF(V4F_START, EAX), EAX ) /*  dest->start  */
+    MOV_L      ( REGOFF(V4F_START, ESI), EDX ) /*  in->start    */
+    MOV_L      ( REGOFF(MATRIX_INV, ECX), ECX ) /*  mat->inv     */
+
+    CMP_L      ( CONST(0), EDI )        /* count == 0 ?  nothing to do        */
+    JE         ( LLBL (G3TR_end) )
+
+    FEMMS
+
+    /* Load the matrix rows and fold the scale factor into them once,
+     * outside the loop. */
+    MOVQ       ( REGIND(ECX), MM3 )     /* m1            | m0                 */
+
+    MOVQ       ( REGOFF(16,ECX), MM4 )  /* m5            | m4                 */
+    MOVD       ( ARG_SCALE, MM0 )       /*               | scale              */
+
+    MOVD       ( REGOFF(8,ECX), MM5 )   /*               | m2                 */
+    PUNPCKLDQ  ( MM0, MM0 )             /* scale         | scale              */
+
+    PUNPCKLDQ  ( REGOFF(24, ECX), MM5 ) /* m6            | m2                 */
+    PFMUL      ( MM0, MM3 )             /* scale*m1      | scale*m0           */
+
+    MOVQ       ( REGOFF(32, ECX), MM6 ) /* m9            | m8                 */
+    PFMUL      ( MM0, MM4 )             /* scale*m5      | scale*m4           */
+
+    MOVD       ( REGOFF(40, ECX), MM7 ) /*               | m10                */
+    PFMUL      ( MM0, MM5 )             /* scale*m6      | scale*m2           */
+
+    PFMUL      ( MM0, MM6 )             /* scale*m9      | scale*m8           */
+
+    PFMUL      ( MM0, MM7 )             /*               | scale*m10          */
+
+ALIGNTEXT32
+LLBL (G3TR_rescale):
+
+    PREFETCHW  ( REGIND(EAX) )
+
+    MOVQ       ( REGIND(EDX), MM0 )     /* x1            | x0                 */
+    MOVD       ( REGOFF(8, EDX), MM2 )  /*               | x2                 */
+
+    MOVQ       ( MM0, MM1 )             /* x1            | x0                 */
+    PUNPCKLDQ  ( MM2, MM2 )             /* x2            | x2                 */
+
+    PFMUL      ( MM3, MM0 )             /* x1*m1         | x0*m0              */
+    ADD_L      ( CONST(16), EAX )       /* next r                             */
+
+    PFMUL      ( MM4, MM1 )             /* x1*m5         | x0*m4              */
+    PFACC      ( MM1, MM0 )             /* x0*m4+x1*m5   | x0*m0+x1*m1        */
+
+    MOVQ       ( REGIND(EDX), MM1 )     /* x1            | x0                 */
+
+    PFMUL      ( MM5, MM2 )             /* x2*m6         | x2*m2              */
+    PFADD      ( MM2, MM0 )             /* x0*m4...+x2*m6| x0*m0+x1*m1+x2*m2  */
+
+    MOVD       ( REGOFF(8, EDX), MM2 )  /*               | x2                 */
+    ADD_L      ( STRIDE, EDX )          /* next normal                        */
+
+    PREFETCH   ( REGIND(EDX) )
+
+    MOVQ       ( MM0, REGOFF(-16, EAX) ) /* write r0, r1                      */
+    PFMUL      ( MM6, MM1 )             /* x1*m9         | x0*m8              */
+
+    PFMUL      ( MM7, MM2 )             /*               | x2*m10             */
+    PFACC      ( MM1, MM1 )             /* *not used*    | x0*m8+x1*m9        */
+
+    PFADD      ( MM2, MM1 )             /* *not used*    | x0*m8+x1*m9+x2*m10 */
+    MOVD       ( MM1, REGOFF(-8, EAX) ) /* write r2                           */
+
+    SUB_L      ( CONST(1), EDI )        /* decrement normal counter           */
+    JNZ        ( LLBL (G3TR_rescale) )
+
+    FEMMS
+
+LLBL (G3TR_end):
+    POP_L       ( ESI )
+    POP_L       ( EDI )
+    RET
+
+
+
+
+
+
+
+/*
+ * _mesa_3dnow_transform_normals_no_rot
+ *
+ * Multiplies each source normal component-wise by the diagonal elements
+ * 0, 5 and 10 of mat->inv and stores the result in dest.  No scaling or
+ * normalization is performed.
+ *
+ * Stack arguments read (via the ARG_* macros): ARG_MAT, ARG_IN and
+ * ARG_DEST.  dest->count is set to in->count; results are written 16 bytes
+ * apart, three floats each.
+ *
+ * Saves and restores EDI and ESI; uses EAX, ECX, EDX and MM0, MM2, MM4, MM5.
+ */
+ALIGNTEXT16
+GLOBL GLNAME(_mesa_3dnow_transform_normals_no_rot)
+HIDDEN(_mesa_3dnow_transform_normals_no_rot)
+GLNAME(_mesa_3dnow_transform_normals_no_rot):
+
+#undef  FRAME_OFFSET
+#define FRAME_OFFSET 8
+
+    PUSH_L     ( EDI )
+    PUSH_L     ( ESI )
+
+    MOV_L      ( ARG_IN, ESI )
+    MOV_L      ( ARG_DEST, EAX )
+    MOV_L      ( ARG_MAT, ECX )
+    MOV_L      ( REGOFF(V4F_COUNT, ESI), EDI ) /*  dest->count = in->count   */
+    MOV_L      ( EDI, REGOFF(V4F_COUNT, EAX) )
+    MOV_L      ( REGOFF(V4F_START, EAX), EAX ) /*  dest->start  */
+    MOV_L      ( REGOFF(V4F_START, ESI), EDX ) /*  in->start    */
+    MOV_L      ( REGOFF(MATRIX_INV, ECX), ECX ) /*  mat->inv     */
+
+    CMP_L      ( CONST(0), EDI )        /* count == 0 ?  nothing to do        */
+    JE         ( LLBL (G3TNR_end) )
+
+    FEMMS
+
+    MOVD       ( REGIND(ECX), MM0 )     /*               | m0                 */
+    PUNPCKLDQ  ( REGOFF(20, ECX), MM0 ) /* m5            | m0                 */
+
+    MOVD       ( REGOFF(40, ECX), MM2 ) /*               | m10                */
+    PUNPCKLDQ  ( MM2, MM2 )             /* m10           | m10                */
+
+ALIGNTEXT32
+LLBL (G3TNR_transform):
+
+    PREFETCHW  ( REGIND(EAX) )
+
+    MOVQ       ( REGIND(EDX), MM4 )     /* x1            | x0                 */
+    MOVD       ( REGOFF(8, EDX), MM5 )  /*               | x2                 */
+
+    PFMUL      ( MM0, MM4 )             /* x1*m5         | x0*m0              */
+    ADD_L      ( STRIDE, EDX)           /* next normal                        */
+
+    PREFETCH   ( REGIND(EDX) )
+
+    PFMUL      ( MM2, MM5 )             /*               | x2*m10             */
+    ADD_L      ( CONST(16), EAX )       /* next r                             */
+
+    /* SUB sets ZF for the JNZ below; the MMX stores do not touch EFLAGS. */
+    SUB_L      ( CONST(1), EDI )        /* decrement normal counter           */
+    MOVQ       ( MM4, REGOFF(-16, EAX) ) /* write r0, r1                      */
+
+    MOVD       ( MM5, REGOFF(-8, EAX) ) /* write r2                           */
+    JNZ        ( LLBL (G3TNR_transform) )
+
+    FEMMS
+
+LLBL (G3TNR_end):
+    POP_L       ( ESI )
+    POP_L       ( EDI )
+    RET
+
+
+
+
+
+
+
+
+/*
+ * _mesa_3dnow_transform_normals
+ *
+ * Multiplies each source normal by the 3x3 block of mat->inv (elements 0-2,
+ * 4-6 and 8-10) and stores the result in dest.  No scaling or normalization
+ * is performed.
+ *
+ * Stack arguments read (via the ARG_* macros): ARG_MAT, ARG_IN and
+ * ARG_DEST.  dest->count is set to in->count; results are written 16 bytes
+ * apart, three floats each.
+ *
+ * Saves and restores EDI and ESI; uses EAX, ECX, EDX and MM0-MM7.
+ */
+ALIGNTEXT16
+GLOBL GLNAME(_mesa_3dnow_transform_normals)
+HIDDEN(_mesa_3dnow_transform_normals)
+GLNAME(_mesa_3dnow_transform_normals):
+
+#undef  FRAME_OFFSET
+#define FRAME_OFFSET 8
+
+    PUSH_L     ( EDI )
+    PUSH_L     ( ESI )
+
+    MOV_L      ( ARG_IN, ESI )
+    MOV_L      ( ARG_DEST, EAX )
+    MOV_L      ( ARG_MAT, ECX )
+    MOV_L      ( REGOFF(V4F_COUNT, ESI), EDI ) /*  dest->count = in->count   */
+    MOV_L      ( EDI, REGOFF(V4F_COUNT, EAX) )
+    MOV_L      ( REGOFF(V4F_START, EAX), EAX ) /*  dest->start  */
+    MOV_L      ( REGOFF(V4F_START, ESI), EDX ) /*  in->start    */
+    MOV_L      ( REGOFF(MATRIX_INV, ECX), ECX ) /*  mat->inv     */
+
+    CMP_L      ( CONST(0), EDI )        /* count == 0 ?  nothing to do        */
+    JE         ( LLBL (G3T_end) )
+
+    FEMMS
+
+    MOVQ       ( REGIND(ECX), MM3 )     /* m1            | m0                 */
+    MOVQ       ( REGOFF(16, ECX), MM4 ) /* m5            | m4                 */
+
+    MOVD       ( REGOFF(8, ECX), MM5 )  /*               | m2                 */
+    PUNPCKLDQ  ( REGOFF(24, ECX), MM5 ) /* m6            | m2                 */
+
+    MOVQ       ( REGOFF(32, ECX), MM6 ) /* m9            | m8                 */
+    MOVD       ( REGOFF(40, ECX), MM7 ) /*               | m10                */
+
+ALIGNTEXT32
+LLBL (G3T_transform):
+
+    PREFETCHW  ( REGIND(EAX) )
+
+    MOVQ       ( REGIND(EDX), MM0 )     /* x1            | x0                 */
+    MOVD       ( REGOFF(8, EDX), MM2 )  /*               | x2                 */
+
+    MOVQ       ( MM0, MM1 )             /* x1            | x0                 */
+    PUNPCKLDQ  ( MM2, MM2 )             /* x2            | x2                 */
+
+    PFMUL      ( MM3, MM0 )             /* x1*m1         | x0*m0              */
+    ADD_L      ( CONST(16), EAX )       /* next r                             */
+
+    PFMUL      ( MM4, MM1 )             /* x1*m5         | x0*m4              */
+    PFACC      ( MM1, MM0 )             /* x0*m4+x1*m5   | x0*m0+x1*m1        */
+
+    PFMUL      ( MM5, MM2 )             /* x2*m6         | x2*m2              */
+    PFADD      ( MM2, MM0 )             /* x0*m4...+x2*m6| x0*m0+x1*m1+x2*m2  */
+
+    MOVQ       ( REGIND(EDX), MM1 )     /* x1            | x0                 */
+    MOVQ       ( MM0, REGOFF(-16, EAX) ) /* write r0, r1                      */
+
+    PFMUL      ( MM6, MM1 )             /* x1*m9         | x0*m8              */
+    MOVD       ( REGOFF(8, EDX), MM2 )  /*               | x2                 */
+
+    PFMUL      ( MM7, MM2 )             /*               | x2*m10             */
+    ADD_L      ( STRIDE, EDX )          /* next normal                        */
+
+    PREFETCH   ( REGIND(EDX) )
+
+    PFACC      ( MM1, MM1 )             /* *not used*    | x0*m8+x1*m9        */
+    PFADD      ( MM2, MM1 )             /* *not used*    | x0*m8+x1*m9+x2*m10 */
+
+    MOVD       ( MM1, REGOFF(-8, EAX) ) /* write r2                           */
+    SUB_L      ( CONST(1), EDI )        /* decrement normal counter           */
+
+    JNZ        ( LLBL (G3T_transform) )
+
+    FEMMS
+
+LLBL (G3T_end):
+    POP_L  ( ESI )
+    POP_L  ( EDI )
+    RET
+
+
+
+
+
+
+/*
+ * _mesa_3dnow_normalize_normals
+ *
+ * Copies each source normal to dest multiplied by a per-normal factor:
+ * the matching entry of the lengths array when one is supplied, otherwise
+ * a computed 1/sqrt(x0*x0+x1*x1+x2*x2).  No matrix is applied.
+ *
+ * Stack arguments read (via the ARG_* macros): ARG_IN, ARG_LENGTHS (may be
+ * 0) and ARG_DEST.  dest->count is set to in->count; results are written
+ * 16 bytes apart, three floats each.
+ *
+ * Saves and restores EDI, ESI and EBP (EDI is saved but not otherwise
+ * used); uses EAX, ECX, EDX and MM0, MM1, MM3-MM5.
+ */
+ALIGNTEXT16
+GLOBL GLNAME(_mesa_3dnow_normalize_normals)
+HIDDEN(_mesa_3dnow_normalize_normals)
+GLNAME(_mesa_3dnow_normalize_normals):
+
+#undef  FRAME_OFFSET
+#define FRAME_OFFSET 12
+
+    PUSH_L     ( EDI )
+    PUSH_L     ( ESI )
+    PUSH_L     ( EBP )
+
+    MOV_L      ( ARG_IN, ESI )
+    MOV_L      ( ARG_DEST, EAX )
+    MOV_L      ( REGOFF(V4F_COUNT, ESI), EBP ) /*  dest->count = in->count   */
+    MOV_L      ( EBP, REGOFF(V4F_COUNT, EAX) )
+    MOV_L      ( REGOFF(V4F_START, EAX), EAX ) /*  dest->start  */
+    MOV_L      ( REGOFF(V4F_START, ESI), ECX ) /*  in->start    */
+    MOV_L      ( ARG_LENGTHS, EDX )
+
+    CMP_L      ( CONST(0), EBP )        /* count == 0 ?  nothing to do        */
+    JE         ( LLBL (G3N_end) )
+
+    FEMMS
+
+    CMP_L      ( CONST(0), EDX )        /* lengths == 0 ?                     */
+    JE         ( LLBL (G3N_norm2) )     /* calculate lengths                  */
+
+ALIGNTEXT32
+LLBL (G3N_norm1):                       /* use precalculated lengths          */
+
+    /* NOTE(review): EAX is the store target; the other loops in this file
+     * use PREFETCHW for it -- confirm whether PREFETCH is intended. */
+    PREFETCH   ( REGIND(EAX) )
+
+    MOVQ       ( REGIND(ECX), MM0 )     /* x1              | x0               */
+    MOVD       ( REGOFF(8, ECX), MM1 )  /*                 | x2               */
+
+    MOVD       ( REGIND(EDX), MM3 )     /*                 | length (x)       */
+    PFMUL      ( MM3, MM1 )             /*                 | x2 (normalized)  */
+
+    PUNPCKLDQ  ( MM3, MM3 )             /* length (x)      | length (x)       */
+    ADD_L      ( STRIDE, ECX )          /* next normal                        */
+
+    PREFETCH   ( REGIND(ECX) )
+
+    PFMUL      ( MM3, MM0 )             /* x1 (normalized) | x0 (normalized)  */
+    MOVQ       ( MM0, REGIND(EAX) )     /* write new x0, x1                   */
+
+    MOVD       ( MM1, REGOFF(8, EAX) )  /* write new x2                       */
+    ADD_L      ( CONST(16), EAX )       /* next r                             */
+
+    ADD_L      ( CONST(4), EDX )        /* next length                        */
+    SUB_L      ( CONST(1), EBP )        /* decrement normal counter           */
+
+    JNZ        ( LLBL (G3N_norm1) )
+
+    JMP        ( LLBL (G3N_end1) )
+
+ALIGNTEXT32
+LLBL (G3N_norm2):                       /* need to calculate lengths          */
+
+    PREFETCHW  ( REGIND(EAX) )
+
+    PREFETCH   ( REGIND(ECX) )
+
+    MOVQ       ( REGIND(ECX), MM0 )     /* x1              | x0               */
+    MOVD       ( REGOFF(8, ECX), MM1 )  /*                 | x2               */
+
+    MOVQ       ( MM0, MM3 )             /* x1              | x0               */
+    ADD_L      ( STRIDE, ECX )          /* next normal                        */
+
+    PFMUL      ( MM0, MM3 )             /* x1*x1           | x0*x0            */
+    MOVQ       ( MM1, MM4 )             /*                 | x2               */
+
+    ADD_L      ( CONST(16), EAX )       /* next r                             */
+    PFMUL      ( MM1, MM4 )             /*                 | x2*x2            */
+
+    PFADD      ( MM4, MM3 )             /* x1*x1           | x0*x0+x2*x2      */
+    PFACC      ( MM3, MM3 )             /* x0*x0+...+x2*x2 | x0*x0+x1*x1+x2*x2*/
+
+    PFRSQRT    ( MM3, MM5 )             /* estimate of 1/sqrt (x.x)           */
+    MOVQ       ( MM5, MM4 )             /* keep the estimate                  */
+
+    PUNPCKLDQ  ( MM3, MM3 )
+    PFMUL      ( MM5, MM5 )             /* estimate squared                   */
+
+    PFRSQIT1   ( MM3, MM5 )             /* refinement step 1                  */
+    /* SUB sets ZF for the JNZ below; the 3DNow!/MMX instructions that
+     * follow do not modify EFLAGS. */
+    SUB_L      ( CONST(1), EBP )        /* decrement normal counter           */
+
+    PFRCPIT2   ( MM4, MM5 )             /* refinement step 2                  */
+
+    PFMUL      ( MM5, MM0 )             /* x1 (normalized) | x0 (normalized)  */
+    MOVQ       ( MM0, REGOFF(-16, EAX) ) /* write new x0, x1                  */
+
+    PFMUL      ( MM5, MM1 )             /*                 | x2 (normalized)  */
+    MOVD       ( MM1, REGOFF(-8, EAX) ) /* write new x2                       */
+
+    JNZ        ( LLBL (G3N_norm2) )
+
+LLBL (G3N_end1):
+    FEMMS
+
+LLBL (G3N_end):
+    POP_L      ( EBP )
+    POP_L      ( ESI )
+    POP_L      ( EDI )
+    RET
+
+
+
+
+
+
+/*
+ * _mesa_3dnow_rescale_normals
+ *
+ * Copies each source normal to dest with all three components multiplied
+ * by scale.  No matrix is applied and no normalization is performed.
+ *
+ * Stack arguments read (via the ARG_* macros): ARG_SCALE (32-bit float),
+ * ARG_IN and ARG_DEST.  dest->count is set to in->count; results are
+ * written 16 bytes apart, three floats each.
+ *
+ * Saves and restores EDI and ESI (EDI is saved but not otherwise used);
+ * uses EAX, ECX, EDX and MM0-MM2.
+ */
+ALIGNTEXT16
+GLOBL GLNAME(_mesa_3dnow_rescale_normals)
+HIDDEN(_mesa_3dnow_rescale_normals)
+GLNAME(_mesa_3dnow_rescale_normals):
+
+#undef  FRAME_OFFSET
+#define FRAME_OFFSET 8
+    PUSH_L     ( EDI )
+    PUSH_L     ( ESI )
+
+    MOV_L      ( ARG_IN, ESI )
+    MOV_L      ( ARG_DEST, EAX )
+    MOV_L      ( REGOFF(V4F_COUNT, ESI), EDX ) /*  dest->count = in->count   */
+    MOV_L      ( EDX, REGOFF(V4F_COUNT, EAX) )
+    MOV_L      ( REGOFF(V4F_START, EAX), EAX ) /*  dest->start  */
+    MOV_L      ( REGOFF(V4F_START, ESI), ECX ) /*  in->start    */
+
+    CMP_L      ( CONST(0), EDX )        /* count == 0 ?  nothing to do        */
+    JE         ( LLBL (G3R_end) )
+
+    FEMMS
+
+    MOVD       ( ARG_SCALE, MM0 )       /*               | scale              */
+    PUNPCKLDQ  ( MM0, MM0 )             /* scale         | scale              */
+
+ALIGNTEXT32
+LLBL (G3R_rescale):
+
+    PREFETCHW  ( REGIND(EAX) )
+
+    MOVQ       ( REGIND(ECX), MM1 )     /* x1            | x0                 */
+    MOVD       ( REGOFF(8, ECX), MM2 )  /*               | x2                 */
+
+    PFMUL      ( MM0, MM1 )             /* x1*scale      | x0*scale           */
+    ADD_L      ( STRIDE, ECX )          /* next normal                        */
+
+    PREFETCH   ( REGIND(ECX) )
+
+    PFMUL      ( MM0, MM2 )             /*               | x2*scale           */
+    ADD_L      ( CONST(16), EAX )       /* next r                             */
+
+    MOVQ       ( MM1, REGOFF(-16, EAX) ) /* write r0, r1                      */
+    MOVD       ( MM2, REGOFF(-8, EAX) ) /* write r2                           */
+
+    SUB_L      ( CONST(1), EDX )        /* decrement normal counter           */
+    JNZ        ( LLBL (G3R_rescale) )
+
+    FEMMS
+
+LLBL (G3R_end):
+    POP_L      ( ESI )
+    POP_L      ( EDI )
+    RET
+
+#endif
+	
+#if defined (__ELF__) && defined (__linux__)
+	.section .note.GNU-stack,"",%progbits
+#endif
--- a/src/arch/x86/3dnow_xform1.S
+++ b/src/arch/x86/3dnow_xform1.S
@ -0,0 +1,437 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version:  3.5
+ *
+ * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifdef USE_3DNOW_ASM
+#include "assyntax.h"
+#include "matypes.h"
+#include "xform_args.h"
+
+    SEG_TEXT
+
+#define FRAME_OFFSET	4
+
+
+/*
+ * _mesa_3dnow_transform_points1_general
+ *
+ * Transforms vertices that supply only x0 by a general matrix:
+ *     r0 = x0*m00 + m30      r1 = x0*m01 + m31
+ *     r2 = x0*m02 + m32      r3 = x0*m03 + m33
+ * The destination is flagged as size 4, receives the source count, and is
+ * written 16 bytes per vertex.
+ *
+ * Loop registers: EAX = source pointer, EDX = destination pointer,
+ * ESI = vertices left, EDI = source stride (bytes), MM0-MM3 = matrix
+ * elements loaded once before the loop.
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points1_general )
+HIDDEN(_mesa_3dnow_transform_points1_general)
+GLNAME( _mesa_3dnow_transform_points1_general ):
+
+    PUSH_L    ( ESI )
+
+    /* Arguments are read while only ESI is on the stack: FRAME_OFFSET is 4
+     * in this file, presumably what the ARG_* macros expect -- TODO confirm. */
+    MOV_L     ( ARG_DEST, ECX )
+    MOV_L     ( ARG_MATRIX, ESI )
+    MOV_L     ( ARG_SOURCE, EAX )
+    MOV_L     ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+    OR_B      ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), EDX )
+    MOV_L     ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+    PUSH_L    ( EDI )
+
+    /* Loop setup: EDX = dest->start, ECX = matrix, ESI = count,
+     * EDI = source stride, EAX = source->start.  The destination start is
+     * addressed through V4F_START rather than a literal offset. */
+    MOV_L     ( REGOFF(V4F_START, ECX), EDX )
+    MOV_L     ( ESI, ECX )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), ESI )
+    MOV_L     ( REGOFF(V4F_STRIDE, EAX), EDI )
+    MOV_L     ( REGOFF(V4F_START, EAX), EAX )
+
+    /* Skip the loop entirely for an empty vertex array. */
+    TEST_L    ( ESI, ESI )
+    JZ        ( LLBL( G3TPGR_3 ) )
+
+    MOVQ      ( REGIND(ECX), MM0 )	/* m01             | m00             */
+    MOVQ      ( REGOFF(8, ECX), MM1 )	/* m03             | m02             */
+
+    MOVQ      ( REGOFF(48, ECX), MM2 )	/* m31             | m30             */
+    MOVQ      ( REGOFF(56, ECX), MM3 )	/* m33             | m32             */
+
+ALIGNTEXT16
+LLBL( G3TPGR_2 ):
+
+    MOVD      ( REGIND(EAX), MM4 )	/*                 | x0              */
+    PUNPCKLDQ ( MM4, MM4 )		/* x0              | x0              */
+
+    MOVQ      ( MM4, MM5 )		/* x0              | x0              */
+    PFMUL     ( MM0, MM4 )		/* x0*m01          | x0*m00          */
+
+    PFMUL     ( MM1, MM5 )		/* x0*m03          | x0*m02          */
+    PFADD     ( MM2, MM4 )		/* x0*m01+m31      | x0*m00+m30      */
+
+    PFADD     ( MM3, MM5 )		/* x0*m03+m33      | x0*m02+m32      */
+    MOVQ      ( MM4, REGIND(EDX) )	/* write r1, r0                      */
+
+    MOVQ      ( MM5, REGOFF(8, EDX) )	/* write r3, r2                      */
+    ADD_L     ( EDI, EAX )		/* next vertex                       */
+
+    ADD_L     ( CONST(16), EDX )	/* next r                            */
+    DEC_L     ( ESI )			/* decrement vertex counter          */
+
+    JNZ       ( LLBL( G3TPGR_2 ) )	/* cnt > 0 ? -> process next vertex  */
+
+LLBL( G3TPGR_3 ):
+
+    /* Also reached when count == 0, after both registers were pushed. */
+    FEMMS
+    POP_L     ( EDI )
+    POP_L     ( ESI )
+    RET
+
+
+
+
+/*
+ * _mesa_3dnow_transform_points1_identity
+ *
+ * Identity transform for vertices that supply only x0: copies x0 to r0.
+ * The destination is flagged as size 1, receives the source count, and
+ * advances 16 bytes per vertex; only the first 4 bytes of each output
+ * element are written.
+ *
+ * Loop registers: EAX = source pointer, EDX = destination pointer,
+ * ESI = vertices left, EDI = source stride (bytes).
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points1_identity )
+HIDDEN(_mesa_3dnow_transform_points1_identity)
+GLNAME( _mesa_3dnow_transform_points1_identity ):
+
+    PUSH_L    ( ESI )
+
+    /* Arguments are read while only ESI is on the stack: FRAME_OFFSET is 4
+     * in this file, presumably what the ARG_* macros expect -- TODO confirm. */
+    MOV_L     ( ARG_DEST, ECX )
+    MOV_L     ( ARG_MATRIX, ESI )
+    MOV_L     ( ARG_SOURCE, EAX )
+    MOV_L     ( CONST(1), REGOFF(V4F_SIZE, ECX) )
+    OR_B      ( CONST(VEC_SIZE_1), REGOFF(V4F_FLAGS, ECX) )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), EDX )
+    MOV_L     ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+    PUSH_L    ( EDI )
+
+    /* Loop setup: EDX = dest->start, ESI = count, EDI = source stride,
+     * EAX = source->start.  ECX receives the matrix pointer as in the
+     * sibling routines, but this variant never reads it.  The destination
+     * start is addressed through V4F_START rather than a literal offset. */
+    MOV_L     ( REGOFF(V4F_START, ECX), EDX )
+    MOV_L     ( ESI, ECX )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), ESI )
+    MOV_L     ( REGOFF(V4F_STRIDE, EAX), EDI )
+    MOV_L     ( REGOFF(V4F_START, EAX), EAX )
+
+    /* Skip the loop entirely for an empty vertex array. */
+    TEST_L    ( ESI, ESI )
+    JZ        ( LLBL( G3TPIR_4) )
+
+ALIGNTEXT16
+LLBL( G3TPIR_3 ):
+
+    MOVD      ( REGIND(EAX), MM0 )	/*                 | x0              */
+    ADD_L     ( EDI, EAX )		/* next vertex                       */
+
+    MOVD      ( MM0, REGIND(EDX) )	/*                 | r0              */
+    ADD_L     ( CONST(16), EDX )	/* next r                            */
+
+    DEC_L     ( ESI )			/* decrement vertex counter          */
+    JNZ       ( LLBL( G3TPIR_3 ) )	/* cnt > 0 ? -> process next vertex  */
+
+LLBL( G3TPIR_4 ):
+
+    /* Also reached when count == 0, after both registers were pushed. */
+    FEMMS
+    POP_L     ( EDI )
+    POP_L     ( ESI )
+    RET
+
+
+
+
+/*
+ * _mesa_3dnow_transform_points1_3d_no_rot
+ *
+ * Transforms vertices that supply only x0 by a 3D matrix without rotation:
+ *     r0 = x0*m00 + m30      r1 = m31      r2 = m32
+ * The destination is flagged as size 3, receives the source count, and
+ * advances 16 bytes per vertex.
+ *
+ * Loop registers: EAX = source pointer, EDX = destination pointer,
+ * ESI = vertices left, EDI = source stride (bytes), MM0/MM2/MM3 = matrix
+ * elements loaded once before the loop.
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points1_3d_no_rot )
+HIDDEN(_mesa_3dnow_transform_points1_3d_no_rot)
+GLNAME( _mesa_3dnow_transform_points1_3d_no_rot ):
+
+    PUSH_L    ( ESI )
+
+    /* Arguments are read while only ESI is on the stack: FRAME_OFFSET is 4
+     * in this file, presumably what the ARG_* macros expect -- TODO confirm. */
+    MOV_L     ( ARG_DEST, ECX )
+    MOV_L     ( ARG_MATRIX, ESI )
+    MOV_L     ( ARG_SOURCE, EAX )
+    MOV_L     ( CONST(3), REGOFF(V4F_SIZE, ECX) )
+    OR_B      ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, ECX) )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), EDX )
+    MOV_L     ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+    PUSH_L    ( EDI )
+
+    /* Loop setup: EDX = dest->start, ECX = matrix, ESI = count,
+     * EDI = source stride, EAX = source->start.  The destination start is
+     * addressed through V4F_START rather than a literal offset. */
+    MOV_L     ( REGOFF(V4F_START, ECX), EDX )
+    MOV_L     ( ESI, ECX )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), ESI )
+    MOV_L     ( REGOFF(V4F_STRIDE, EAX), EDI )
+    MOV_L     ( REGOFF(V4F_START, EAX), EAX )
+
+    /* Skip the loop entirely for an empty vertex array. */
+    TEST_L    ( ESI, ESI )
+    JZ        ( LLBL( G3TP3NRR_3 ) )
+
+    MOVD      ( REGIND(ECX), MM0 )	/*                 | m00             */
+    MOVQ      ( REGOFF(48, ECX), MM2 )	/* m31             | m30             */
+
+    MOVD      ( REGOFF(56, ECX), MM3 )	/*                 | m32             */
+
+ALIGNTEXT16
+LLBL( G3TP3NRR_2 ):
+
+    MOVD      ( REGIND(EAX), MM4 )	/*                 | x0              */
+    PFMUL     ( MM0, MM4 )		/*                 | x0*m00          */
+
+    PFADD     ( MM2, MM4 )		/* m31             | x0*m00+m30      */
+    MOVQ      ( MM4, REGIND(EDX) )	/* write r1, r0                      */
+
+    /* r2 does not depend on the vertex: it is always m32. */
+    MOVD      ( MM3, REGOFF(8, EDX) )	/* write r2                          */
+    ADD_L     ( EDI, EAX )		/* next vertex                       */
+
+    ADD_L     ( CONST(16), EDX )	/* next r                            */
+    DEC_L     ( ESI )			/* decrement vertex counter          */
+
+    JNZ       ( LLBL( G3TP3NRR_2 ) )	/* cnt > 0 ? -> process next vertex  */
+
+LLBL( G3TP3NRR_3 ):
+
+    /* Also reached when count == 0, after both registers were pushed. */
+    FEMMS
+    POP_L     ( EDI )
+    POP_L     ( ESI )
+    RET
+
+
+
+
+/*
+ * _mesa_3dnow_transform_points1_perspective
+ *
+ * Transforms vertices that supply only x0 by a perspective matrix:
+ *     r0 = x0*m00      r1 = 0      r2 = m32      r3 = 0
+ * The destination is flagged as size 4, receives the source count, and is
+ * written 16 bytes per vertex.
+ *
+ * Loop registers: EAX = source pointer, EDX = destination pointer,
+ * ESI = vertices left, EDI = source stride (bytes), MM0 = m00,
+ * MM3 = m32 (both with a zero upper half from the MOVD loads).
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points1_perspective )
+HIDDEN(_mesa_3dnow_transform_points1_perspective)
+GLNAME( _mesa_3dnow_transform_points1_perspective ):
+
+    PUSH_L    ( ESI )
+
+    /* Arguments are read while only ESI is on the stack: FRAME_OFFSET is 4
+     * in this file, presumably what the ARG_* macros expect -- TODO confirm. */
+    MOV_L     ( ARG_DEST, ECX )
+    MOV_L     ( ARG_MATRIX, ESI )
+    MOV_L     ( ARG_SOURCE, EAX )
+    MOV_L     ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+    OR_B      ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), EDX )
+    MOV_L     ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+    PUSH_L    ( EDI )
+
+    /* Loop setup: EDX = dest->start, ECX = matrix, ESI = count,
+     * EDI = source stride, EAX = source->start.  The destination start is
+     * addressed through V4F_START rather than a literal offset. */
+    MOV_L     ( REGOFF(V4F_START, ECX), EDX )
+    MOV_L     ( ESI, ECX )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), ESI )
+    MOV_L     ( REGOFF(V4F_STRIDE, EAX), EDI )
+    MOV_L     ( REGOFF(V4F_START, EAX), EAX )
+
+    /* Skip the loop entirely for an empty vertex array. */
+    TEST_L    ( ESI, ESI )
+    JZ        ( LLBL( G3TPPR_3 ) )
+
+    MOVD      ( REGIND(ECX), MM0 )	/*                 | m00             */
+    MOVD      ( REGOFF(56, ECX), MM3 )	/*                 | m32             */
+
+ALIGNTEXT16
+LLBL( G3TPPR_2 ):
+
+    MOVD      ( REGIND(EAX), MM4 )	/* 0               | x0              */
+    PFMUL     ( MM0, MM4 )		/* 0               | x0*m00          */
+
+    /* Both 64-bit stores rely on the zero upper halves to produce r1 and r3. */
+    MOVQ      ( MM4, REGIND(EDX) )	/* write r1, r0                      */
+    MOVQ      ( MM3, REGOFF(8, EDX) )	/* write r2  (=m32), r3 (=0)         */
+
+    ADD_L     ( EDI, EAX )		/* next vertex                       */
+    ADD_L     ( CONST(16), EDX )	/* next r                            */
+
+    DEC_L     ( ESI )			/* decrement vertex counter          */
+    JNZ       ( LLBL( G3TPPR_2 ) )	/* cnt > 0 ? -> process next vertex  */
+
+LLBL( G3TPPR_3 ):
+
+    /* Also reached when count == 0, after both registers were pushed. */
+    FEMMS
+    POP_L     ( EDI )
+    POP_L     ( ESI )
+    RET
+
+
+
+
+/*
+ * _mesa_3dnow_transform_points1_2d
+ *
+ * Transforms vertices that supply only x0 by a 2D matrix:
+ *     r0 = x0*m00 + m30      r1 = x0*m01 + m31
+ * The destination is flagged as size 2, receives the source count, and
+ * advances 16 bytes per vertex; only r0 and r1 are written.
+ *
+ * Loop registers: EAX = source pointer, EDX = destination pointer,
+ * ESI = vertices left, EDI = source stride (bytes), MM0/MM2 = matrix
+ * elements loaded once before the loop.
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points1_2d )
+HIDDEN(_mesa_3dnow_transform_points1_2d)
+GLNAME( _mesa_3dnow_transform_points1_2d ):
+
+    PUSH_L    ( ESI )
+
+    /* Arguments are read while only ESI is on the stack: FRAME_OFFSET is 4
+     * in this file, presumably what the ARG_* macros expect -- TODO confirm. */
+    MOV_L     ( ARG_DEST, ECX )
+    MOV_L     ( ARG_MATRIX, ESI )
+    MOV_L     ( ARG_SOURCE, EAX )
+    MOV_L     ( CONST(2), REGOFF(V4F_SIZE, ECX) )
+    OR_B      ( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, ECX) )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), EDX )
+    MOV_L     ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+    PUSH_L    ( EDI )
+
+    /* Loop setup: EDX = dest->start, ECX = matrix, ESI = count,
+     * EDI = source stride, EAX = source->start.  The destination start is
+     * addressed through V4F_START rather than a literal offset. */
+    MOV_L     ( REGOFF(V4F_START, ECX), EDX )
+    MOV_L     ( ESI, ECX )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), ESI )
+    MOV_L     ( REGOFF(V4F_STRIDE, EAX), EDI )
+    MOV_L     ( REGOFF(V4F_START, EAX), EAX )
+
+    /* Skip the loop entirely for an empty vertex array. */
+    TEST_L    ( ESI, ESI )
+    JZ        ( LLBL( G3TP2R_3 ) )
+
+    MOVQ      ( REGIND(ECX), MM0 )	/* m01             | m00             */
+    MOVQ      ( REGOFF(48, ECX), MM2 )	/* m31             | m30             */
+
+ALIGNTEXT16
+LLBL( G3TP2R_2 ):
+
+    MOVD      ( REGIND(EAX), MM4 )	/*                 | x0              */
+    PUNPCKLDQ ( MM4, MM4 )		/* x0              | x0              */
+
+    PFMUL     ( MM0, MM4 )		/* x0*m01          | x0*m00          */
+    PFADD     ( MM2, MM4 )		/* x0*m01+m31      | x0*m00+m30      */
+
+    MOVQ      ( MM4, REGIND(EDX) )	/* write r1, r0                      */
+    ADD_L     ( EDI, EAX )		/* next vertex                       */
+
+    ADD_L     ( CONST(16), EDX )	/* next r                            */
+    DEC_L     ( ESI )			/* decrement vertex counter          */
+
+    JNZ       ( LLBL( G3TP2R_2 ) )	/* cnt > 0 ? -> process next vertex  */
+
+LLBL( G3TP2R_3 ):
+
+    /* Also reached when count == 0, after both registers were pushed. */
+    FEMMS
+    POP_L     ( EDI )
+    POP_L     ( ESI )
+    RET
+
+
+
+
+/*
+ * _mesa_3dnow_transform_points1_2d_no_rot
+ *
+ * Transforms vertices that supply only x0 by a 2D matrix without rotation:
+ *     r0 = x0*m00 + m30      r1 = m31
+ * The destination is flagged as size 2, receives the source count, and
+ * advances 16 bytes per vertex; only r0 and r1 are written.
+ *
+ * Loop registers: EAX = source pointer, EDX = destination pointer,
+ * ESI = vertices left, EDI = source stride (bytes), MM0/MM2 = matrix
+ * elements loaded once before the loop.
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points1_2d_no_rot )
+HIDDEN(_mesa_3dnow_transform_points1_2d_no_rot)
+GLNAME( _mesa_3dnow_transform_points1_2d_no_rot ):
+
+    PUSH_L    ( ESI )
+
+    /* Arguments are read while only ESI is on the stack: FRAME_OFFSET is 4
+     * in this file, presumably what the ARG_* macros expect -- TODO confirm. */
+    MOV_L     ( ARG_DEST, ECX )
+    MOV_L     ( ARG_MATRIX, ESI )
+    MOV_L     ( ARG_SOURCE, EAX )
+    MOV_L     ( CONST(2), REGOFF(V4F_SIZE, ECX) )
+    OR_B      ( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, ECX) )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), EDX )
+    MOV_L     ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+    PUSH_L    ( EDI )
+
+    /* Loop setup: EDX = dest->start, ECX = matrix, ESI = count,
+     * EDI = source stride, EAX = source->start.  The destination start is
+     * addressed through V4F_START rather than a literal offset. */
+    MOV_L     ( REGOFF(V4F_START, ECX), EDX )
+    MOV_L     ( ESI, ECX )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), ESI )
+    MOV_L     ( REGOFF(V4F_STRIDE, EAX), EDI )
+    MOV_L     ( REGOFF(V4F_START, EAX), EAX )
+
+    /* Skip the loop entirely for an empty vertex array. */
+    TEST_L    ( ESI, ESI )
+    JZ        ( LLBL( G3TP2NRR_3 ) )
+
+    MOVD      ( REGIND(ECX), MM0 )	/*                 | m00             */
+    MOVQ      ( REGOFF(48, ECX), MM2 )	/* m31             | m30             */
+
+ALIGNTEXT16
+LLBL( G3TP2NRR_2 ):
+
+    MOVD      ( REGIND(EAX), MM4 )	/*                 | x0              */
+    ADD_L     ( EDI, EAX )		/* next vertex                       */
+
+    PFMUL     ( MM0, MM4 )		/*                 | x0*m00          */
+    PFADD     ( MM2, MM4 )		/* m31             | x0*m00+m30      */
+
+    MOVQ      ( MM4, REGIND(EDX) )	/* write r1, r0                      */
+    ADD_L     ( CONST(16), EDX )	/* next r                            */
+
+    DEC_L     ( ESI )			/* decrement vertex counter          */
+    JNZ       ( LLBL( G3TP2NRR_2 ) )	/* cnt > 0 ? -> process next vertex  */
+
+LLBL( G3TP2NRR_3 ):
+
+    /* Also reached when count == 0, after both registers were pushed. */
+    FEMMS
+    POP_L     ( EDI )
+    POP_L     ( ESI )
+    RET
+
+
+
+
+/*
+ * _mesa_3dnow_transform_points1_3d
+ *
+ * Transforms vertices that supply only x0 by a 3D matrix:
+ *     r0 = x0*m00 + m30      r1 = x0*m01 + m31      r2 = x0*m02 + m32
+ * The destination is flagged as size 3, receives the source count, and
+ * advances 16 bytes per vertex; r3 is not written.
+ *
+ * Loop registers: EAX = source pointer, EDX = destination pointer,
+ * ESI = vertices left, EDI = source stride (bytes), MM0-MM3 = matrix
+ * elements loaded once before the loop.
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points1_3d )
+HIDDEN(_mesa_3dnow_transform_points1_3d)
+GLNAME( _mesa_3dnow_transform_points1_3d ):
+
+    PUSH_L    ( ESI )
+
+    /* Arguments are read while only ESI is on the stack: FRAME_OFFSET is 4
+     * in this file, presumably what the ARG_* macros expect -- TODO confirm. */
+    MOV_L     ( ARG_DEST, ECX )
+    MOV_L     ( ARG_MATRIX, ESI )
+    MOV_L     ( ARG_SOURCE, EAX )
+    MOV_L     ( CONST(3), REGOFF(V4F_SIZE, ECX) )
+    OR_B      ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, ECX) )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), EDX )
+    MOV_L     ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+    PUSH_L    ( EDI )
+
+    /* Loop setup: EDX = dest->start, ECX = matrix, ESI = count,
+     * EDI = source stride, EAX = source->start.  The destination start is
+     * addressed through V4F_START rather than a literal offset. */
+    MOV_L     ( REGOFF(V4F_START, ECX), EDX )
+    MOV_L     ( ESI, ECX )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), ESI )
+    MOV_L     ( REGOFF(V4F_STRIDE, EAX), EDI )
+    MOV_L     ( REGOFF(V4F_START, EAX), EAX )
+
+    /* Skip the loop entirely for an empty vertex array. */
+    TEST_L    ( ESI, ESI )
+    JZ        ( LLBL( G3TP3R_3 ) )
+
+    MOVQ      ( REGIND(ECX), MM0 )	/* m01             | m00             */
+    MOVD      ( REGOFF(8, ECX), MM1 )	/*                 | m02             */
+
+    MOVQ      ( REGOFF(48, ECX), MM2 )	/* m31             | m30             */
+    MOVD      ( REGOFF(56, ECX), MM3 )	/*                 | m32             */
+
+ALIGNTEXT16
+LLBL( G3TP3R_2 ):
+
+    MOVD      ( REGIND(EAX), MM4 )	/*                 | x0              */
+    PUNPCKLDQ ( MM4, MM4 )		/* x0              | x0              */
+
+    MOVQ      ( MM4, MM5 )		/* x0              | x0              */
+    PFMUL     ( MM0, MM4 )		/* x0*m01          | x0*m00          */
+
+    PFMUL     ( MM1, MM5 )		/*                 | x0*m02          */
+    PFADD     ( MM2, MM4 )		/* x0*m01+m31      | x0*m00+m30      */
+
+    PFADD     ( MM3, MM5 )		/*                 | x0*m02+m32      */
+    MOVQ      ( MM4, REGIND(EDX) )	/* write r1, r0                      */
+
+    /* Only the low half of MM5 is stored, so r3 is left untouched. */
+    MOVD      ( MM5, REGOFF(8, EDX) )	/* write r2                          */
+    ADD_L     ( EDI, EAX )		/* next vertex                       */
+
+    ADD_L     ( CONST(16), EDX )	/* next r                            */
+    DEC_L     ( ESI )			/* decrement vertex counter          */
+
+    JNZ       ( LLBL( G3TP3R_2 ) )	/* cnt > 0 ? -> process next vertex  */
+
+LLBL( G3TP3R_3 ):
+
+    /* Also reached when count == 0, after both registers were pushed. */
+    FEMMS
+    POP_L     ( EDI )
+    POP_L     ( ESI )
+    RET
+
+#endif
+	
+#if defined (__ELF__) && defined (__linux__)
+	.section .note.GNU-stack,"",%progbits
+#endif
--- a/src/arch/x86/3dnow_xform2.S
+++ b/src/arch/x86/3dnow_xform2.S
@ -0,0 +1,477 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version:  3.5
+ *
+ * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifdef USE_3DNOW_ASM
+#include "assyntax.h"
+#include "matypes.h"
+#include "xform_args.h"
+
+    SEG_TEXT
+
+#define FRAME_OFFSET	4
+
+
+/*
+ * _mesa_3dnow_transform_points2_general
+ *
+ * Transforms vertices that supply x0 and x1 by a general matrix:
+ *     r0 = x0*m00 + x1*m10 + m30      r1 = x0*m01 + x1*m11 + m31
+ *     r2 = x0*m02 + x1*m12 + m32      r3 = x0*m03 + x1*m13 + m33
+ * The destination is flagged as size 4, receives the source count, and is
+ * written 16 bytes per vertex.
+ *
+ * Loop registers: EAX = source pointer, EDX = destination pointer,
+ * ESI = vertices left, EDI = source stride (bytes), MM0-MM5 = matrix
+ * elements loaded once before the loop.
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points2_general )
+HIDDEN(_mesa_3dnow_transform_points2_general)
+GLNAME( _mesa_3dnow_transform_points2_general ):
+
+    PUSH_L    ( ESI )
+
+    /* Arguments are read while only ESI is on the stack: FRAME_OFFSET is 4
+     * in this file, presumably what the ARG_* macros expect -- TODO confirm. */
+    MOV_L     ( ARG_DEST, ECX )
+    MOV_L     ( ARG_MATRIX, ESI )
+    MOV_L     ( ARG_SOURCE, EAX )
+    MOV_L     ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+    OR_B      ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), EDX )
+    MOV_L     ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+    PUSH_L    ( EDI )
+
+    /* Loop setup: EDX = dest->start, ECX = matrix, ESI = count,
+     * EDI = source stride, EAX = source->start. */
+    MOV_L     ( REGOFF(V4F_START, ECX), EDX )
+    MOV_L     ( ESI, ECX )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), ESI )
+    MOV_L     ( REGOFF(V4F_STRIDE, EAX), EDI )
+    MOV_L     ( REGOFF(V4F_START, EAX), EAX )
+
+    /* Skip the loop entirely for an empty vertex array. */
+    TEST_L    ( ESI, ESI )
+    JZ        ( LLBL( G3TPGR_3 ) )
+
+    /* Pair up the coefficients of x0 (low) and x1 (high) for each result,
+     * so one PFMUL + PFACC yields a full dot product. */
+    MOVD      ( REGIND(ECX), MM0 )	/*                 | m00             */
+    PUNPCKLDQ ( REGOFF(16, ECX), MM0 )	/* m10             | m00             */
+
+    MOVD      ( REGOFF(4, ECX), MM1 )	/*                 | m01             */
+    PUNPCKLDQ ( REGOFF(20, ECX), MM1 )	/* m11             | m01             */
+
+    MOVD      ( REGOFF(8, ECX), MM2 )	/*                 | m02             */
+    PUNPCKLDQ ( REGOFF(24, ECX), MM2 )	/* m12             | m02             */
+
+    MOVD      ( REGOFF(12, ECX), MM3 )	/*                 | m03             */
+    PUNPCKLDQ ( REGOFF(28, ECX), MM3 )	/* m13             | m03             */
+
+    MOVQ      ( REGOFF(48, ECX), MM4 )	/* m31             | m30             */
+    MOVQ      ( REGOFF(56, ECX), MM5 )	/* m33             | m32             */
+
+ALIGNTEXT16
+LLBL( G3TPGR_2 ):
+
+    MOVQ      ( REGIND(EAX), MM6 )	/* x1              | x0              */
+    MOVQ      ( MM6, MM7 )		/* x1              | x0              */
+
+    PFMUL     ( MM0, MM6 )		/* x1*m10          | x0*m00          */
+    PFMUL     ( MM1, MM7 )		/* x1*m11          | x0*m01          */
+
+    PFACC     ( MM7, MM6 )		/* x0*m01+x1*m11   | x0*m00+x1*m10   */
+    PFADD     ( MM4, MM6 )		/* x0*...*m11+m31  | x0*...*m10+m30  */
+
+    /* The vertex is re-read from memory for the r2/r3 half. */
+    MOVQ      ( MM6, REGIND(EDX) )	/* write r1, r0                      */
+    MOVQ      ( REGIND(EAX), MM6 )	/* x1              | x0              */
+
+    MOVQ      ( MM6, MM7 )		/* x1              | x0              */
+    PFMUL     ( MM2, MM6 )		/* x1*m12          | x0*m02          */
+
+    PFMUL     ( MM3, MM7 )		/* x1*m13          | x0*m03          */
+    ADD_L     ( EDI, EAX )		/* next vertex                       */
+
+    PFACC     ( MM7, MM6 )		/* x0*m03+x1*m13   | x0*m02+x1*m12   */
+    PFADD     ( MM5, MM6 )		/* x0*...*m13+m33  | x0*...*m12+m32  */
+
+    MOVQ      ( MM6, REGOFF(8, EDX) )	/* write r3, r2                      */
+    ADD_L     ( CONST(16), EDX )	/* next r                            */
+
+    DEC_L     ( ESI )			/* decrement vertex counter          */
+    JNZ       ( LLBL( G3TPGR_2 ) )	/* cnt > 0 ? -> process next vertex  */
+
+LLBL( G3TPGR_3 ):
+
+    /* Also reached when count == 0, after both registers were pushed. */
+    FEMMS
+    POP_L     ( EDI )
+    POP_L     ( ESI )
+    RET
+
+
+
+
+/*
+ * _mesa_3dnow_transform_points2_perspective
+ *
+ * Transforms vertices that supply x0 and x1 by a perspective matrix:
+ *     r0 = x0*m00      r1 = x1*m11      r2 = m32      r3 = 0
+ * The destination is flagged as size 4, receives the source count, and is
+ * written 16 bytes per vertex.
+ *
+ * Loop registers: EAX = source pointer, EDX = destination pointer,
+ * ESI = vertices left, EDI = source stride (bytes), MM0 = m11 | m00,
+ * MM3 = m32 with a zero upper half from the MOVD load.
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points2_perspective )
+HIDDEN(_mesa_3dnow_transform_points2_perspective)
+GLNAME( _mesa_3dnow_transform_points2_perspective ):
+
+    PUSH_L    ( ESI )
+
+    /* Arguments are read while only ESI is on the stack: FRAME_OFFSET is 4
+     * in this file, presumably what the ARG_* macros expect -- TODO confirm. */
+    MOV_L     ( ARG_DEST, ECX )
+    MOV_L     ( ARG_MATRIX, ESI )
+    MOV_L     ( ARG_SOURCE, EAX )
+    MOV_L     ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+    OR_B      ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), EDX )
+    MOV_L     ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+    PUSH_L    ( EDI )
+
+    /* Loop setup: EDX = dest->start, ECX = matrix, ESI = count,
+     * EDI = source stride, EAX = source->start. */
+    MOV_L     ( REGOFF(V4F_START, ECX), EDX )
+    MOV_L     ( ESI, ECX )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), ESI )
+    MOV_L     ( REGOFF(V4F_STRIDE, EAX), EDI )
+    MOV_L     ( REGOFF(V4F_START, EAX), EAX )
+
+    /* Skip the loop entirely for an empty vertex array. */
+    TEST_L    ( ESI, ESI )
+    JZ        ( LLBL( G3TPPR_3 ) )
+
+    MOVD      ( REGIND(ECX), MM0 )	/*                 | m00             */
+    PUNPCKLDQ ( REGOFF(20, ECX), MM0 )	/* m11             | m00             */
+
+    MOVD      ( REGOFF(56, ECX), MM3 )	/*                 | m32             */
+
+ALIGNTEXT16
+LLBL( G3TPPR_2 ):
+
+    MOVQ      ( REGIND(EAX), MM4 )	/* x1              | x0              */
+    PFMUL     ( MM0, MM4 )		/* x1*m11          | x0*m00          */
+
+    /* r2/r3 do not depend on the vertex; the same MM3 is stored each time. */
+    MOVQ      ( MM4, REGIND(EDX) )	/* write r1, r0                      */
+    MOVQ      ( MM3, REGOFF(8, EDX) )	/* write r2  (=m32), r3 (=0)         */
+
+    ADD_L     ( EDI, EAX )		/* next vertex                       */
+    ADD_L     ( CONST(16), EDX )	/* next r                            */
+
+    DEC_L     ( ESI )			/* decrement vertex counter          */
+    JNZ       ( LLBL( G3TPPR_2 ) )	/* cnt > 0 ? -> process next vertex  */
+
+LLBL( G3TPPR_3 ):
+
+    /* Also reached when count == 0, after both registers were pushed. */
+    FEMMS
+    POP_L     ( EDI )
+    POP_L     ( ESI )
+    RET
+
+
+
+
+/*
+ * _mesa_3dnow_transform_points2_3d
+ *
+ * Transforms vertices that supply x0 and x1 by a 3D matrix:
+ *     r0 = x0*m00 + x1*m10 + m30      r1 = x0*m01 + x1*m11 + m31
+ *     r2 = x0*m02 + x1*m12 + m32
+ * The destination is flagged as size 3, receives the source count, and
+ * advances 16 bytes per vertex; r3 is not written.
+ *
+ * Loop registers: EAX = source pointer, EDX = destination pointer,
+ * ESI = vertices left, EDI = source stride (bytes), MM0-MM2/MM4/MM5 =
+ * matrix elements loaded once before the loop.
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points2_3d )
+HIDDEN(_mesa_3dnow_transform_points2_3d)
+GLNAME( _mesa_3dnow_transform_points2_3d ):
+
+    PUSH_L    ( ESI )
+
+    /* Arguments are read while only ESI is on the stack: FRAME_OFFSET is 4
+     * in this file, presumably what the ARG_* macros expect -- TODO confirm. */
+    MOV_L     ( ARG_DEST, ECX )
+    MOV_L     ( ARG_MATRIX, ESI )
+    MOV_L     ( ARG_SOURCE, EAX )
+    MOV_L     ( CONST(3), REGOFF(V4F_SIZE, ECX) )
+    OR_B      ( CONST(VEC_SIZE_3 ), REGOFF(V4F_FLAGS, ECX) )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), EDX )
+    MOV_L     ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+    PUSH_L    ( EDI )
+
+    /* Loop setup: EDX = dest->start, ECX = matrix, ESI = count,
+     * EDI = source stride, EAX = source->start. */
+    MOV_L     ( REGOFF(V4F_START, ECX), EDX )
+    MOV_L     ( ESI, ECX )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), ESI )
+    MOV_L     ( REGOFF(V4F_STRIDE, EAX), EDI )
+    MOV_L     ( REGOFF(V4F_START, EAX), EAX )
+
+    /* Skip the loop entirely for an empty vertex array. */
+    TEST_L    ( ESI, ESI )
+    JZ        ( LLBL( G3TP3R_3 ) )
+
+    /* Pair up the coefficients of x0 (low) and x1 (high) for each result,
+     * so one PFMUL + PFACC yields a full dot product. */
+    MOVD      ( REGIND(ECX), MM0 )	/*                 | m00             */
+    PUNPCKLDQ ( REGOFF(16, ECX), MM0 )	/* m10             | m00             */
+
+    MOVD      ( REGOFF(4, ECX), MM1 )	/*                 | m01             */
+    PUNPCKLDQ ( REGOFF(20, ECX), MM1 )	/* m11             | m01             */
+
+    MOVD      ( REGOFF(8, ECX), MM2 )	/*                 | m02             */
+    PUNPCKLDQ ( REGOFF(24, ECX), MM2 )	/* m12             | m02             */
+
+    MOVQ      ( REGOFF(48, ECX), MM4 )	/* m31             | m30             */
+    MOVD      ( REGOFF(56, ECX), MM5 )	/*                 | m32             */
+
+ALIGNTEXT16
+LLBL( G3TP3R_2 ):
+
+    MOVQ      ( REGIND(EAX), MM6 )	/* x1              | x0              */
+    MOVQ      ( MM6, MM7 )		/* x1              | x0              */
+
+    PFMUL     ( MM0, MM6 )		/* x1*m10          | x0*m00          */
+    PFMUL     ( MM1, MM7 )		/* x1*m11          | x0*m01          */
+
+    PFACC     ( MM7, MM6 )		/* x0*m01+x1*m11   | x0*m00+x1*m10   */
+    PFADD     ( MM4, MM6 )		/* x0*...*m11+m31  | x0*...*m10+m30  */
+
+    /* The vertex is re-read from memory for the r2 computation. */
+    MOVQ      ( MM6, REGIND(EDX) )	/* write r1, r0                      */
+    MOVQ      ( REGIND(EAX), MM6 )	/* x1              | x0              */
+
+    /* MM7 is not multiplied here: it only feeds the upper half of the
+     * PFACC result, which is discarded by the 32-bit store below. */
+    MOVQ      ( MM6, MM7 )		/* x1              | x0              */
+    PFMUL     ( MM2, MM6 )		/* x1*m12          | x0*m02          */
+
+    PFACC     ( MM7, MM6 )		/* ***trash***     | x0*m02+x1*m12   */
+    PFADD     ( MM5, MM6 )		/* ***trash***     | x0*...*m12+m32  */
+
+    MOVD      ( MM6, REGOFF(8, EDX) )	/* write r2                          */
+    ADD_L     ( EDI, EAX )		/* next vertex                       */
+
+    ADD_L     ( CONST(16), EDX )	/* next r                            */
+    DEC_L     ( ESI )			/* decrement vertex counter          */
+
+    JNZ       ( LLBL( G3TP3R_2 ) )	/* cnt > 0 ? -> process next vertex  */
+
+LLBL( G3TP3R_3 ):
+
+    /* Also reached when count == 0, after both registers were pushed. */
+    FEMMS
+    POP_L     ( EDI )
+    POP_L     ( ESI )
+    RET
+
+
+
+
+/*
+ * _mesa_3dnow_transform_points2_3d_no_rot
+ *
+ * Transforms vertices that supply x0 and x1 by a 3D matrix without rotation:
+ *     r0 = x0*m00 + m30      r1 = x1*m11 + m31      r2 = m32
+ * The destination is flagged as size 3, receives the source count, and
+ * advances 16 bytes per vertex; r3 is not written.
+ *
+ * Loop registers: EAX = source pointer, EDX = destination pointer,
+ * ESI = vertices left, EDI = source stride (bytes), MM0/MM2/MM3 = matrix
+ * elements loaded once before the loop.
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points2_3d_no_rot )
+HIDDEN(_mesa_3dnow_transform_points2_3d_no_rot)
+GLNAME( _mesa_3dnow_transform_points2_3d_no_rot ):
+
+    PUSH_L    ( ESI )
+
+    /* Arguments are read while only ESI is on the stack: FRAME_OFFSET is 4
+     * in this file, presumably what the ARG_* macros expect -- TODO confirm. */
+    MOV_L     ( ARG_DEST, ECX )
+    MOV_L     ( ARG_MATRIX, ESI )
+    MOV_L     ( ARG_SOURCE, EAX )
+    MOV_L     ( CONST(3), REGOFF(V4F_SIZE, ECX) )
+    OR_B      ( CONST(VEC_SIZE_3 ), REGOFF(V4F_FLAGS, ECX) )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), EDX )
+    MOV_L     ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+    PUSH_L    ( EDI )
+
+    /* Loop setup: EDX = dest->start, ECX = matrix, ESI = count,
+     * EDI = source stride, EAX = source->start. */
+    MOV_L     ( REGOFF(V4F_START, ECX), EDX )
+    MOV_L     ( ESI, ECX )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), ESI )
+    MOV_L     ( REGOFF(V4F_STRIDE, EAX), EDI )
+    MOV_L     ( REGOFF(V4F_START, EAX), EAX )
+
+    /* Skip the loop entirely for an empty vertex array. */
+    TEST_L    ( ESI, ESI )
+    JZ        ( LLBL( G3TP3NRR_3 ) )
+
+    MOVD      ( REGIND(ECX), MM0 )	/*                 | m00             */
+    PUNPCKLDQ ( REGOFF(20, ECX), MM0 )	/* m11             | m00             */
+
+    MOVQ      ( REGOFF(48, ECX), MM2 )	/* m31             | m30             */
+    MOVD      ( REGOFF(56, ECX), MM3 )	/*                 | m32             */
+
+ALIGNTEXT16
+LLBL( G3TP3NRR_2 ):
+
+    MOVQ      ( REGIND(EAX), MM4 )	/* x1              | x0              */
+    PFMUL     ( MM0, MM4 )		/* x1*m11          | x0*m00          */
+
+    PFADD     ( MM2, MM4 )		/* x1*m11+m31      | x0*m00+m30      */
+    MOVQ      ( MM4, REGIND(EDX) )	/* write r1, r0                      */
+
+    /* r2 does not depend on the vertex: it is always m32. */
+    MOVD      ( MM3, REGOFF(8, EDX) )	/* write r2                          */
+    ADD_L     ( EDI, EAX )		/* next vertex                       */
+
+    ADD_L     ( CONST(16), EDX )	/* next r                            */
+    DEC_L     ( ESI )			/* decrement vertex counter          */
+
+    JNZ       ( LLBL( G3TP3NRR_2 ) )	/* cnt > 0 ? -> process next vertex  */
+
+LLBL( G3TP3NRR_3 ):
+
+    /* Also reached when count == 0, after both registers were pushed. */
+    FEMMS
+    POP_L     ( EDI )
+    POP_L     ( ESI )
+    RET
+
+
+
+
+/*
+ * _mesa_3dnow_transform_points2_2d
+ *
+ * Transforms vertices that supply x0 and x1 by a 2D matrix:
+ *     r0 = x0*m00 + x1*m10 + m30      r1 = x0*m01 + x1*m11 + m31
+ * The destination is flagged as size 2, receives the source count, and
+ * advances 16 bytes per vertex; only r0 and r1 are written.
+ *
+ * Loop registers: EAX = source pointer, EDX = destination pointer,
+ * ESI = vertices left, EDI = source stride (bytes), MM0-MM2 = matrix
+ * elements loaded once before the loop.
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points2_2d )
+HIDDEN(_mesa_3dnow_transform_points2_2d)
+GLNAME( _mesa_3dnow_transform_points2_2d ):
+
+    PUSH_L    ( ESI )
+
+    /* Arguments are read while only ESI is on the stack: FRAME_OFFSET is 4
+     * in this file, presumably what the ARG_* macros expect -- TODO confirm. */
+    MOV_L     ( ARG_DEST, ECX )
+    MOV_L     ( ARG_MATRIX, ESI )
+    MOV_L     ( ARG_SOURCE, EAX )
+    MOV_L     ( CONST(2), REGOFF(V4F_SIZE, ECX) )
+    OR_B      ( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, ECX) )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), EDX )
+    MOV_L     ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+    PUSH_L    ( EDI )
+
+    /* Loop setup: EDX = dest->start, ECX = matrix, ESI = count,
+     * EDI = source stride, EAX = source->start. */
+    MOV_L     ( REGOFF(V4F_START, ECX), EDX )
+    MOV_L     ( ESI, ECX )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), ESI )
+    MOV_L     ( REGOFF(V4F_STRIDE, EAX), EDI )
+    MOV_L     ( REGOFF(V4F_START, EAX), EAX )
+
+    /* Skip the loop entirely for an empty vertex array. */
+    TEST_L    ( ESI, ESI )
+    JZ        ( LLBL( G3TP2R_3 ) )
+
+    MOVQ      ( REGIND(ECX), MM0 )	/* m01             | m00             */
+    MOVQ      ( REGOFF(16, ECX), MM1 )	/* m11             | m10             */
+
+    MOVQ      ( REGOFF(48, ECX), MM2 )	/* m31             | m30             */
+
+ALIGNTEXT16
+LLBL( G3TP2R_2 ):
+
+    MOVD      ( REGIND(EAX), MM4 )	/*                 | x0              */
+    MOVD      ( REGOFF(4, EAX), MM5 )	/*                 | x1              */
+
+    PUNPCKLDQ ( MM4, MM4 )		/* x0              | x0              */
+    ADD_L     ( EDI, EAX )		/* next vertex                       */
+
+    PFMUL     ( MM0, MM4 )		/* x0*m01          | x0*m00          */
+    PUNPCKLDQ ( MM5, MM5 )		/* x1              | x1              */
+
+    PFMUL     ( MM1, MM5 )		/* x1*m11          | x1*m10          */
+    PFADD     ( MM2, MM4 )		/* x0*m01+m31      | x0*m00+m30      */
+
+    PFADD     ( MM5, MM4 )		/* x0*m01+x1*m11+m31 | x0*m00+x1*m10+m30 */
+    MOVQ      ( MM4, REGIND(EDX) )	/* write r1, r0                      */
+
+    ADD_L     ( CONST(16), EDX )	/* next r                            */
+    DEC_L     ( ESI )			/* decrement vertex counter          */
+
+    JNZ       ( LLBL( G3TP2R_2 ) )	/* cnt > 0 ? -> process next vertex  */
+
+LLBL( G3TP2R_3 ):
+
+    /* Also reached when count == 0, after both registers were pushed. */
+    FEMMS
+    POP_L     ( EDI )
+    POP_L     ( ESI )
+    RET
+
+
+
+
+/*
+ * _mesa_3dnow_transform_points2_2d_no_rot
+ *
+ * Transforms vertices that supply x0 and x1 by a 2D matrix without rotation:
+ *     r0 = x0*m00 + m30      r1 = x1*m11 + m31
+ * The destination is flagged as size 2, receives the source count, and
+ * advances 16 bytes per vertex; only r0 and r1 are written.
+ *
+ * Loop registers: EAX = source pointer, EDX = destination pointer,
+ * ESI = vertices left, EDI = source stride (bytes), MM0/MM2 = matrix
+ * elements loaded once before the loop.
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points2_2d_no_rot )
+HIDDEN(_mesa_3dnow_transform_points2_2d_no_rot)
+GLNAME( _mesa_3dnow_transform_points2_2d_no_rot ):
+
+    PUSH_L    ( ESI )
+
+    /* Arguments are read while only ESI is on the stack: FRAME_OFFSET is 4
+     * in this file, presumably what the ARG_* macros expect -- TODO confirm. */
+    MOV_L     ( ARG_DEST, ECX )
+    MOV_L     ( ARG_MATRIX, ESI )
+    MOV_L     ( ARG_SOURCE, EAX )
+    MOV_L     ( CONST(2), REGOFF(V4F_SIZE, ECX) )
+    OR_B      ( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, ECX) )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), EDX )
+    MOV_L     ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+    PUSH_L    ( EDI )
+
+    /* Loop setup: EDX = dest->start, ECX = matrix, ESI = count,
+     * EDI = source stride, EAX = source->start. */
+    MOV_L     ( REGOFF(V4F_START, ECX), EDX )
+    MOV_L     ( ESI, ECX )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), ESI )
+    MOV_L     ( REGOFF(V4F_STRIDE, EAX), EDI )
+    MOV_L     ( REGOFF(V4F_START, EAX), EAX )
+
+    /* Skip the loop entirely for an empty vertex array. */
+    TEST_L    ( ESI, ESI )
+    JZ        ( LLBL( G3TP2NRR_3 ) )
+
+    MOVD      ( REGIND(ECX), MM0 )	/*                 | m00             */
+    PUNPCKLDQ ( REGOFF(20, ECX), MM0 )	/* m11             | m00             */
+
+    MOVQ      ( REGOFF(48, ECX), MM2 )	/* m31             | m30             */
+
+ALIGNTEXT16
+LLBL( G3TP2NRR_2 ):
+
+    MOVQ      ( REGIND(EAX), MM4 )	/* x1              | x0              */
+    ADD_L     ( EDI, EAX )		/* next vertex                       */
+
+    PFMUL     ( MM0, MM4 )		/* x1*m11          | x0*m00          */
+    PFADD     ( MM2, MM4 )		/* x1*m11+m31      | x0*m00+m30      */
+
+    MOVQ      ( MM4, REGIND(EDX) )	/* write r1, r0                      */
+    ADD_L     ( CONST(16), EDX )	/* next r                            */
+
+    DEC_L     ( ESI )			/* decrement vertex counter          */
+    JNZ       ( LLBL( G3TP2NRR_2 ) )	/* cnt > 0 ? -> process next vertex  */
+
+LLBL( G3TP2NRR_3 ):
+
+    /* Also reached when count == 0, after both registers were pushed. */
+    FEMMS
+    POP_L     ( EDI )
+    POP_L     ( ESI )
+    RET
+
+
+
+
+/*
+ * _mesa_3dnow_transform_points2_identity
+ *
+ * Identity transform for vertices that supply x0 and x1: copies both
+ * floats to r0 and r1.  The destination is flagged as size 2, receives the
+ * source count, and advances 16 bytes per vertex; only the first 8 bytes
+ * of each output element are written.
+ *
+ * Loop registers: EAX = source pointer, EDX = destination pointer,
+ * ESI = vertices left, EDI = source stride (bytes).
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points2_identity )
+HIDDEN(_mesa_3dnow_transform_points2_identity)
+GLNAME( _mesa_3dnow_transform_points2_identity ):
+
+    PUSH_L    ( ESI )
+
+    /* Arguments are read while only ESI is on the stack: FRAME_OFFSET is 4
+     * in this file, presumably what the ARG_* macros expect -- TODO confirm. */
+    MOV_L     ( ARG_DEST, ECX )
+    MOV_L     ( ARG_MATRIX, ESI )
+    MOV_L     ( ARG_SOURCE, EAX )
+    MOV_L     ( CONST(2), REGOFF(V4F_SIZE, ECX) )
+    OR_B      ( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, ECX) )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), EDX )
+    MOV_L     ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+    PUSH_L    ( EDI )
+
+    /* Loop setup: EDX = dest->start, ESI = count, EDI = source stride,
+     * EAX = source->start.  ECX receives the matrix pointer as in the
+     * sibling routines, but this variant never reads it. */
+    MOV_L     ( REGOFF(V4F_START, ECX), EDX )
+    MOV_L     ( ESI, ECX )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), ESI )
+    MOV_L     ( REGOFF(V4F_STRIDE, EAX), EDI )
+    MOV_L     ( REGOFF(V4F_START, EAX), EAX )
+
+    /* Empty vertex array: go straight to the exit label.  Jumping to the
+     * loop head instead would copy one vertex, wrap the counter below
+     * zero and keep copying far past both buffers. */
+    TEST_L    ( ESI, ESI )
+    JZ        ( LLBL( G3TPIR_4 ) )
+
+ALIGNTEXT16
+LLBL( G3TPIR_3 ):
+
+    MOVQ      ( REGIND(EAX), MM0 )	/* x1              | x0              */
+    ADD_L     ( EDI, EAX )		/* next vertex                       */
+
+    MOVQ      ( MM0, REGIND(EDX) )	/* r1              | r0              */
+    ADD_L     ( CONST(16), EDX )	/* next r                            */
+
+    DEC_L     ( ESI )			/* decrement vertex counter          */
+    JNZ       ( LLBL( G3TPIR_3 ) )	/* cnt > 0 ? -> process next vertex  */
+
+LLBL( G3TPIR_4 ):
+
+    /* Also reached when count == 0, after both registers were pushed. */
+    FEMMS
+    POP_L     ( EDI )
+    POP_L     ( ESI )
+    RET
+#endif
+	
+#if defined (__ELF__) && defined (__linux__)
+	.section .note.GNU-stack,"",%progbits
+#endif
--- a/src/arch/x86/3dnow_xform3.S
+++ b/src/arch/x86/3dnow_xform3.S
@ -0,0 +1,561 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version:  3.5
+ *
+ * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifdef USE_3DNOW_ASM
+#include "assyntax.h"
+#include "matypes.h"
+#include "xform_args.h"
+
+    SEG_TEXT
+
+/*
+ * NOTE(review): FRAME_OFFSET is not referenced directly by the routines
+ * below.  Presumably the ARG_* macros from xform_args.h use it to skip the
+ * one register (ESI) that every routine pushes before reading its
+ * arguments -- confirm against xform_args.h.
+ */
+#define FRAME_OFFSET	4
+
+
+/*
+ * _mesa_3dnow_transform_points3_general
+ *
+ * Transforms 3-component points by a full 4x4 matrix, treating the missing
+ * fourth input component as 1.  For i = 0..3 (matrix elements are floats,
+ * m[k] at byte offset 4*k from ECX):
+ *
+ *     r[i] = x0*m[i] + x1*m[4+i] + x2*m[8+i] + m[12+i]
+ *
+ * Setup: stores 4 into the destination's V4F_SIZE field, ORs VEC_SIZE_4
+ * into its V4F_FLAGS byte and copies V4F_COUNT from source to destination.
+ *
+ * Loop registers:
+ *   EAX = current source vertex (advanced by the source V4F_STRIDE in EDI)
+ *   ECX = matrix
+ *   EDX = current destination vertex (advanced by 16 bytes per vertex)
+ *   ESI = remaining vertex count; a count of 0 skips the loop
+ *
+ * EDX is advanced in the middle of the loop body, so the result stores use
+ * negative offsets (-16, -8) from the already-incremented pointer.
+ * Register comments are written "high dword | low dword".
+ * ESI and EDI are saved on entry and restored before RET; FEMMS is issued
+ * on every exit path.
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points3_general )
+HIDDEN(_mesa_3dnow_transform_points3_general)
+GLNAME( _mesa_3dnow_transform_points3_general ):
+
+    PUSH_L    ( ESI )
+
+    MOV_L     ( ARG_DEST, ECX )
+    MOV_L     ( ARG_MATRIX, ESI )
+    MOV_L     ( ARG_SOURCE, EAX )
+    MOV_L     ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+    OR_B      ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), EDX )
+    MOV_L     ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+    PUSH_L    ( EDI )
+
+    MOV_L     ( REGOFF(V4F_START, ECX), EDX )
+    MOV_L     ( ESI, ECX )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), ESI )
+    MOV_L     ( REGOFF(V4F_STRIDE, EAX), EDI )
+    MOV_L     ( REGOFF(V4F_START, EAX), EAX )
+
+    TEST_L    ( ESI, ESI )
+    JZ        ( LLBL( G3TPGR_2 ) )	/* count == 0 -> nothing to do       */
+
+    PREFETCHW ( REGIND(EDX) )
+
+ALIGNTEXT16
+LLBL( G3TPGR_1 ):
+
+    PREFETCHW ( REGOFF(32, EDX) )	/* prefetch 2 vertices ahead         */
+
+    MOVQ      ( REGIND(EAX), MM0 )	/* x1              | x0              */
+    MOVD      ( REGOFF(8, EAX), MM2 )	/*                 | x2              */
+
+    ADD_L     ( EDI, EAX )		/* next vertex                       */
+    PREFETCH  ( REGIND(EAX) )
+
+    MOVQ      ( MM0, MM1 )		/* x1              | x0              */
+    PUNPCKLDQ ( MM2, MM2 )		/* x2              | x2              */
+
+    PUNPCKLDQ ( MM0, MM0 )		/* x0              | x0              */
+    MOVQ      ( MM2, MM5 )		/* x2              | x2              */
+
+    PUNPCKHDQ ( MM1, MM1 )		/* x1              | x1              */
+    PFMUL     ( REGOFF(32, ECX), MM2 )	/* x2*m9           | x2*m8           */
+
+    MOVQ      ( MM0, MM3 )		/* x0              | x0              */
+    PFMUL     ( REGOFF(40, ECX), MM5 )	/* x2*m11          | x2*m10          */
+
+    MOVQ      ( MM1, MM4 )		/* x1              | x1              */
+    PFMUL     ( REGIND(ECX), MM0 )	/* x0*m1           | x0*m0           */
+
+    PFADD     ( REGOFF(48, ECX), MM2 )	/* x2*m9+m13       | x2*m8+m12       */
+    PFMUL     ( REGOFF(16, ECX), MM1 )	/* x1*m5           | x1*m4           */
+
+    PFADD     ( REGOFF(56, ECX), MM5 )	/* x2*m11+m15      | x2*m10+m14      */
+    PFADD     ( MM0, MM1 )		/* x0*m1+x1*m5     | x0*m0+x1*m4     */
+
+    PFMUL     ( REGOFF(8, ECX), MM3 )	/* x0*m3           | x0*m2           */
+    PFADD     ( MM1, MM2 )		/* r1              | r0              */
+
+    PFMUL     ( REGOFF(24, ECX), MM4 )	/* x1*m7           | x1*m6           */
+    ADD_L     ( CONST(16), EDX )	/* next output vertex                */
+
+    PFADD     ( MM3, MM4 )		/* x0*m3+x1*m7     | x0*m2+x1*m6     */
+    MOVQ      ( MM2, REGOFF(-16, EDX) )	/* write r0, r1                      */
+
+    PFADD     ( MM4, MM5 )		/* r3              | r2              */
+    MOVQ      ( MM5, REGOFF(-8, EDX) )	/* write r2, r3                      */
+
+    DEC_L     ( ESI )			/* decrement vertex counter          */
+    JNZ       ( LLBL( G3TPGR_1 ) )	/* cnt > 0 ? -> process next vertex  */
+
+LLBL( G3TPGR_2 ):
+
+    FEMMS
+    POP_L     ( EDI )
+    POP_L     ( ESI )
+    RET
+
+
+
+
+/*
+ * _mesa_3dnow_transform_points3_perspective
+ *
+ * Transforms 3-component points using only the matrix elements read below
+ * (named mCR in the comments; element mCR is at byte offset 4*(4*C+R)):
+ *
+ *     r0 = x0*m00 + x2*m20
+ *     r1 = x1*m11 + x2*m21
+ *     r2 = x2*m22 + m32
+ *     r3 = -x2
+ *
+ * Setup: stores 4 into the destination's V4F_SIZE field, ORs VEC_SIZE_4
+ * into its V4F_FLAGS byte and copies V4F_COUNT from source to destination.
+ *
+ * Loop registers:
+ *   EAX = current source vertex (advanced by the source V4F_STRIDE in EDI)
+ *   EDX = current destination vertex (advanced by 16 bytes per vertex)
+ *   ESI = remaining vertex count; a count of 0 skips the loop
+ *   MM0..MM3 = matrix constants, loaded once before the loop
+ *
+ * Register comments are written "high dword | low dword".
+ * ESI and EDI are saved on entry and restored before RET; FEMMS is issued
+ * on every exit path.
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points3_perspective )
+HIDDEN(_mesa_3dnow_transform_points3_perspective)
+GLNAME( _mesa_3dnow_transform_points3_perspective ):
+
+    PUSH_L    ( ESI )
+
+    MOV_L     ( ARG_DEST, ECX )
+    MOV_L     ( ARG_MATRIX, ESI )
+    MOV_L     ( ARG_SOURCE, EAX )
+    MOV_L     ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+    OR_B      ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), EDX )
+    MOV_L     ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+    PUSH_L    ( EDI )
+
+    MOV_L     ( REGOFF(V4F_START, ECX), EDX )
+    MOV_L     ( ESI, ECX )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), ESI )
+    MOV_L     ( REGOFF(V4F_STRIDE, EAX), EDI )
+    MOV_L     ( REGOFF(V4F_START, EAX), EAX )
+
+    TEST_L    ( ESI, ESI )
+    JZ        ( LLBL( G3TPPR_2 ) )	/* count == 0 -> nothing to do       */
+
+    PREFETCH  ( REGIND(EAX) )
+    PREFETCHW ( REGIND(EDX) )
+
+    MOVD      ( REGIND(ECX), MM0 )	/*                 | m00             */
+    PUNPCKLDQ ( REGOFF(20, ECX), MM0 )	/* m11             | m00             */
+
+    MOVQ      ( REGOFF(32, ECX), MM1 )	/* m21             | m20             */
+    MOVD      ( REGOFF(40, ECX), MM2 )	/*                 | m22             */
+
+    MOVD      ( REGOFF(56, ECX), MM3 )	/*                 | m32             */
+
+ALIGNTEXT16
+LLBL( G3TPPR_1 ):
+
+    PREFETCHW ( REGOFF(32, EDX) )	/* prefetch 2 vertices ahead         */
+
+    MOVD      ( REGOFF(8, EAX), MM5 )	/*                 | x2              */
+    MOVQ      ( REGIND(EAX), MM4 )	/* x1              | x0              */
+
+    ADD_L     ( EDI, EAX )		/* next vertex                       */
+    PREFETCH  ( REGIND(EAX) )
+
+    PXOR      ( MM7, MM7 )		/* 0               | 0               */
+    MOVQ      ( MM5, MM6 )		/*                 | x2              */
+
+    PFMUL     ( MM0, MM4 )		/* x1*m11          | x0*m00          */
+    PFSUB     ( MM5, MM7 )		/*                 | 0 - x2 = -x2    */
+
+    PFMUL     ( MM2, MM6 )		/*                 | x2*m22          */
+    PUNPCKLDQ ( MM5, MM5 )		/* x2              | x2              */
+
+    ADD_L     ( CONST(16), EDX )	/* next r                            */
+    PFMUL     ( MM1, MM5 )		/* x2*m21          | x2*m20          */
+
+    PFADD     ( MM3, MM6 )		/*                 | x2*m22+m32      */
+    PFADD     ( MM4, MM5 )		/* x1*m11+x2*m21   | x0*m00+x2*m20   */
+
+    MOVQ      ( MM5, REGOFF(-16, EDX) )	/* write r0, r1                      */
+    MOVD      ( MM6, REGOFF(-8, EDX) )	/* write r2                          */
+
+    MOVD      ( MM7, REGOFF(-4, EDX) )	/* write r3 (= -x2)                  */
+
+    DEC_L     ( ESI )			/* decrement vertex counter          */
+    JNZ       ( LLBL( G3TPPR_1 ) )	/* cnt > 0 ? -> process next vertex  */
+
+LLBL( G3TPPR_2 ):
+
+    FEMMS
+    POP_L     ( EDI )
+    POP_L     ( ESI )
+    RET
+
+
+
+
+/*
+ * _mesa_3dnow_transform_points3_3d
+ *
+ * Transforms 3-component points by the upper three rows of the matrix,
+ * treating the missing fourth input component as 1 (matrix elements are
+ * floats, m[k] at byte offset 4*k from ECX):
+ *
+ *     r0 = x0*m0 + x1*m4 + x2*m8  + m12
+ *     r1 = x0*m1 + x1*m5 + x2*m9  + m13
+ *     r2 = x0*m2 + x1*m6 + x2*m10 + m14
+ *
+ * Only 12 bytes of each 16-byte destination slot are written.
+ *
+ * Setup: stores 3 into the destination's V4F_SIZE field, ORs VEC_SIZE_3
+ * into its V4F_FLAGS byte and copies V4F_COUNT from source to destination.
+ *
+ * Loop registers:
+ *   EAX = current source vertex (advanced by the source V4F_STRIDE in EDI)
+ *   ECX = matrix
+ *   EDX = current destination vertex (advanced by 16 bytes per vertex)
+ *   ESI = remaining vertex count; a count of 0 skips the loop
+ *   MM7 = m6 | m2, loaded once before the loop
+ *
+ * The destination is primed with PREFETCHW (it is only ever written), in
+ * line with the other transform_points3 routines.
+ * Register comments are written "high dword | low dword".
+ * ESI and EDI are saved on entry and restored before RET; FEMMS is issued
+ * on every exit path.
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points3_3d )
+HIDDEN(_mesa_3dnow_transform_points3_3d)
+GLNAME( _mesa_3dnow_transform_points3_3d ):
+
+    PUSH_L    ( ESI )
+
+    MOV_L     ( ARG_DEST, ECX )
+    MOV_L     ( ARG_MATRIX, ESI )
+    MOV_L     ( ARG_SOURCE, EAX )
+    MOV_L     ( CONST(3), REGOFF(V4F_SIZE, ECX) )
+    OR_B      ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, ECX) )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), EDX )
+    MOV_L     ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+    PUSH_L    ( EDI )
+
+    MOV_L     ( REGOFF(V4F_START, ECX), EDX )
+    MOV_L     ( ESI, ECX )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), ESI )
+    MOV_L     ( REGOFF(V4F_STRIDE, EAX), EDI )
+    MOV_L     ( REGOFF(V4F_START, EAX), EAX )
+
+    TEST_L    ( ESI, ESI )
+    JZ        ( LLBL( G3TP3R_2 ) )	/* count == 0 -> nothing to do       */
+
+    PREFETCH  ( REGIND(EAX) )
+    PREFETCHW ( REGIND(EDX) )		/* destination is write-only         */
+
+    MOVD      ( REGOFF(8, ECX), MM7 )	/*                 | m2              */
+    PUNPCKLDQ ( REGOFF(24, ECX), MM7 )	/* m6              | m2              */
+
+
+ALIGNTEXT16
+LLBL( G3TP3R_1 ):
+
+    PREFETCHW ( REGOFF(32, EDX) )	/* prefetch 2 vertices ahead         */
+
+    MOVQ      ( REGIND(EAX), MM0 )	/* x1              | x0              */
+    MOVD      ( REGOFF(8, EAX), MM1 )	/*                 | x2              */
+
+    ADD_L     ( EDI, EAX )		/* next vertex                       */
+    PREFETCH  ( REGIND(EAX) )
+
+    MOVQ      ( MM0, MM2 )		/* x1              | x0              */
+    ADD_L     ( CONST(16), EDX )	/* next r                            */
+
+    PUNPCKLDQ ( MM2, MM2 )		/* x0              | x0              */
+    MOVQ      ( MM0, MM3 )		/* x1              | x0              */
+
+    PFMUL     ( REGIND(ECX), MM2 )	/* x0*m1           | x0*m0           */
+    PUNPCKHDQ ( MM3, MM3 )		/* x1              | x1              */
+
+    MOVQ      ( MM1, MM4 )		/*                 | x2              */
+    PFMUL     ( REGOFF(16, ECX), MM3 )	/* x1*m5           | x1*m4           */
+
+    PUNPCKLDQ ( MM4, MM4 )		/* x2              | x2              */
+    PFADD     ( MM2, MM3 )		/* x0*m1+x1*m5     | x0*m0+x1*m4     */
+
+    PFMUL     ( REGOFF(32, ECX), MM4 )	/* x2*m9           | x2*m8           */
+    PFADD     ( REGOFF(48, ECX), MM3 )	/* x0*m1+x1*m5+m13 | x0*m0+x1*m4+m12 */
+
+    PFMUL     ( MM7, MM0 )		/* x1*m6           | x0*m2           */
+    PFADD     ( MM4, MM3 )		/* r1              | r0              */
+
+    PFMUL     ( REGOFF(40, ECX), MM1 )	/*                 | x2*m10          */
+    PUNPCKLDQ ( REGOFF(56, ECX), MM1 )	/* m14             | x2*m10          */
+
+    PFACC     ( MM0, MM1 )		/* x0*m2+x1*m6     | x2*m10+m14      */
+
+    MOVQ      ( MM3, REGOFF(-16, EDX) )	/* write r0, r1                      */
+    PFACC     ( MM1, MM1 )		/*                 | r2              */
+
+    MOVD      ( MM1, REGOFF(-8, EDX) )	/* write r2                          */
+
+    DEC_L     ( ESI )			/* decrement vertex counter          */
+    JNZ       ( LLBL( G3TP3R_1 ) )	/* cnt > 0 ? -> process next vertex  */
+
+LLBL( G3TP3R_2 ):
+
+    FEMMS
+    POP_L     ( EDI )
+    POP_L     ( ESI )
+    RET
+
+
+
+
+/*
+ * _mesa_3dnow_transform_points3_3d_no_rot
+ *
+ * Scale-and-translate transform of 3-component points (elements named mCR
+ * in the comments; element mCR is at byte offset 4*(4*C+R) from ECX):
+ *
+ *     r0 = x0*m00 + m30
+ *     r1 = x1*m11 + m31
+ *     r2 = x2*m22 + m32
+ *
+ * Only 12 bytes of each 16-byte destination slot are written.
+ *
+ * Setup: stores 3 into the destination's V4F_SIZE field, ORs VEC_SIZE_3
+ * into its V4F_FLAGS byte and copies V4F_COUNT from source to destination.
+ *
+ * Loop registers:
+ *   EAX = current source vertex (advanced by the source V4F_STRIDE in EDI)
+ *   EDX = current destination vertex (advanced by 16 bytes per vertex)
+ *   ESI = remaining vertex count; a count of 0 skips the loop
+ *   MM0..MM3 = matrix constants, loaded once before the loop
+ *
+ * The source is only read, so the in-loop source prefetch uses PREFETCH
+ * rather than PREFETCHW, in line with the other routines in this file.
+ * The final MOVD sits between DEC_L and JNZ; it does not modify EFLAGS, so
+ * JNZ still tests the result of DEC_L.
+ * Register comments are written "high dword | low dword".
+ * ESI and EDI are saved on entry and restored before RET; FEMMS is issued
+ * on every exit path.
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points3_3d_no_rot )
+HIDDEN(_mesa_3dnow_transform_points3_3d_no_rot)
+GLNAME( _mesa_3dnow_transform_points3_3d_no_rot ):
+
+    PUSH_L    ( ESI )
+
+    MOV_L     ( ARG_DEST, ECX )
+    MOV_L     ( ARG_MATRIX, ESI )
+    MOV_L     ( ARG_SOURCE, EAX )
+    MOV_L     ( CONST(3), REGOFF(V4F_SIZE, ECX) )
+    OR_B      ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, ECX) )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), EDX )
+    MOV_L     ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+    PUSH_L    ( EDI )
+
+    MOV_L     ( REGOFF(V4F_START, ECX), EDX )
+    MOV_L     ( ESI, ECX )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), ESI )
+    MOV_L     ( REGOFF(V4F_STRIDE, EAX), EDI )
+    MOV_L     ( REGOFF(V4F_START, EAX), EAX )
+
+    TEST_L    ( ESI, ESI )
+    JZ        ( LLBL( G3TP3NRR_2 ) )	/* count == 0 -> nothing to do       */
+
+    PREFETCH  ( REGIND(EAX) )
+    PREFETCHW ( REGIND(EDX) )
+
+    MOVD      ( REGIND(ECX), MM0 )	/*                 | m00             */
+    PUNPCKLDQ ( REGOFF(20, ECX), MM0 )	/* m11             | m00             */
+
+    MOVD      ( REGOFF(40, ECX), MM2 )	/*                 | m22             */
+    PUNPCKLDQ ( MM2, MM2 )		/* m22             | m22             */
+
+    MOVQ      ( REGOFF(48, ECX), MM1 )	/* m31             | m30             */
+    MOVD      ( REGOFF(56, ECX), MM3 )	/*                 | m32             */
+
+    PUNPCKLDQ ( MM3, MM3 )		/* m32             | m32             */
+
+
+ALIGNTEXT16
+LLBL( G3TP3NRR_1 ):
+
+    PREFETCHW ( REGOFF(32, EDX) )	/* prefetch 2 vertices ahead         */
+
+    MOVQ      ( REGIND(EAX), MM4 )	/* x1              | x0              */
+    MOVD      ( REGOFF(8, EAX), MM5 )	/*                 | x2              */
+
+    ADD_L     ( EDI, EAX )		/* next vertex                       */
+    PREFETCH  ( REGIND(EAX) )		/* source is read-only               */
+
+    PFMUL     ( MM0, MM4 )		/* x1*m11          | x0*m00          */
+
+    PFADD     ( MM1, MM4 )		/* x1*m11+m31      | x0*m00+m30      */
+    PFMUL     ( MM2, MM5 )		/*                 | x2*m22          */
+
+    PFADD     ( MM3, MM5 )		/*                 | x2*m22+m32      */
+    MOVQ      ( MM4, REGIND(EDX) )	/* write r0, r1                      */
+
+    ADD_L     ( CONST(16), EDX )	/* next r                            */
+    DEC_L     ( ESI )			/* decrement vertex counter          */
+
+    MOVD      ( MM5, REGOFF(-8, EDX) )	/* write r2                          */
+    JNZ       ( LLBL( G3TP3NRR_1 ) )	/* cnt > 0 ? -> process next vertex  */
+
+LLBL( G3TP3NRR_2 ):
+
+    FEMMS
+    POP_L     ( EDI )
+    POP_L     ( ESI )
+    RET
+
+
+
+
+/*
+ * _mesa_3dnow_transform_points3_2d
+ *
+ * 2D transform of 3-component points; the third component is passed
+ * through unchanged (elements named mCR in the comments; element mCR is at
+ * byte offset 4*(4*C+R) from ECX):
+ *
+ *     r0 = x0*m00 + x1*m10 + m30
+ *     r1 = x0*m01 + x1*m11 + m31
+ *     r2 = x2
+ *
+ * Only 12 bytes of each 16-byte destination slot are written.
+ *
+ * Setup: stores 3 into the destination's V4F_SIZE field, ORs VEC_SIZE_3
+ * into its V4F_FLAGS byte and copies V4F_COUNT from source to destination.
+ *
+ * Loop registers:
+ *   EAX = current source vertex (advanced by the source V4F_STRIDE in EDI)
+ *   EDX = current destination vertex (advanced by 16 bytes per vertex)
+ *   ESI = remaining vertex count; a count of 0 skips the loop
+ *   MM0..MM2 = matrix constants, loaded once before the loop
+ *
+ * Register comments are written "high dword | low dword".
+ * ESI and EDI are saved on entry and restored before RET; FEMMS is issued
+ * on every exit path.
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points3_2d )
+HIDDEN(_mesa_3dnow_transform_points3_2d)
+GLNAME( _mesa_3dnow_transform_points3_2d ):
+
+    PUSH_L    ( ESI )
+
+    MOV_L     ( ARG_DEST, ECX )
+    MOV_L     ( ARG_MATRIX, ESI )
+    MOV_L     ( ARG_SOURCE, EAX )
+    MOV_L     ( CONST(3), REGOFF(V4F_SIZE, ECX) )
+    OR_B      ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, ECX) )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), EDX )
+    MOV_L     ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+    PUSH_L    ( EDI )
+
+    MOV_L     ( REGOFF(V4F_START, ECX), EDX )
+    MOV_L     ( ESI, ECX )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), ESI )
+    MOV_L     ( REGOFF(V4F_STRIDE, EAX), EDI )
+    MOV_L     ( REGOFF(V4F_START, EAX), EAX )
+
+    TEST_L    ( ESI, ESI )
+    JZ        ( LLBL( G3TP2R_3) )	/* count == 0 -> nothing to do       */
+
+    PREFETCH  ( REGIND(EAX) )
+    PREFETCHW ( REGIND(EDX) )
+
+    MOVD      ( REGIND(ECX), MM0 )	/*                 | m00             */
+    PUNPCKLDQ ( REGOFF(16, ECX), MM0 )	/* m10             | m00             */
+
+    MOVD      ( REGOFF(4, ECX), MM1 )	/*                 | m01             */
+    PUNPCKLDQ ( REGOFF(20, ECX), MM1 )	/* m11             | m01             */
+
+    MOVQ      ( REGOFF(48, ECX), MM2 )	/* m31             | m30             */
+
+ALIGNTEXT16
+LLBL( G3TP2R_2 ):
+
+    PREFETCHW ( REGOFF(32, EDX) )	/* prefetch 2 vertices ahead         */
+
+    MOVQ      ( REGIND(EAX), MM3 )	/* x1              | x0              */
+    MOVD      ( REGOFF(8, EAX), MM5 )	/*                 | x2              */
+
+    ADD_L     ( EDI, EAX )		/* next vertex                       */
+    PREFETCH  ( REGIND(EAX) )
+
+    MOVQ      ( MM3, MM4 )		/* x1              | x0              */
+    PFMUL     ( MM0, MM3 )		/* x1*m10          | x0*m00          */
+
+    ADD_L     ( CONST(16), EDX )	/* next r                            */
+    PFMUL     ( MM1, MM4 )		/* x1*m11          | x0*m01          */
+
+    PFACC     ( MM4, MM3 )		/* x0*m01+x1*m11   | x0*m00+x1*m10   */
+    MOVD      ( MM5, REGOFF(-8, EDX) )	/* write r2 (=x2)                    */
+
+    PFADD     ( MM2, MM3 )		/* ...+m31 = r1    | ...+m30 = r0    */
+    MOVQ      ( MM3, REGOFF(-16, EDX) )	/* write r0, r1                      */
+
+    DEC_L     ( ESI )			/* decrement vertex counter          */
+    JNZ       ( LLBL( G3TP2R_2 ) )	/* cnt > 0 ? -> process next vertex  */
+
+LLBL( G3TP2R_3 ):
+
+    FEMMS
+    POP_L     ( EDI )
+    POP_L     ( ESI )
+    RET
+
+
+
+
+/*
+ * _mesa_3dnow_transform_points3_2d_no_rot
+ *
+ * 2D scale-and-translate transform of 3-component points; the third
+ * component is passed through unchanged (elements named mCR in the
+ * comments; element mCR is at byte offset 4*(4*C+R) from ECX):
+ *
+ *     r0 = x0*m00 + m30
+ *     r1 = x1*m11 + m31
+ *     r2 = x2
+ *
+ * Only 12 bytes of each 16-byte destination slot are written.
+ *
+ * Setup: stores 3 into the destination's V4F_SIZE field, ORs VEC_SIZE_3
+ * into its V4F_FLAGS byte and copies V4F_COUNT from source to destination.
+ *
+ * Loop registers:
+ *   EAX = current source vertex (advanced by the source V4F_STRIDE in EDI)
+ *   EDX = current destination vertex (advanced by 16 bytes per vertex)
+ *   ESI = remaining vertex count; a count of 0 skips the loop
+ *   MM0, MM1 = matrix constants, loaded once before the loop
+ *
+ * Register comments are written "high dword | low dword".
+ * ESI and EDI are saved on entry and restored before RET; FEMMS is issued
+ * on every exit path.
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points3_2d_no_rot )
+HIDDEN(_mesa_3dnow_transform_points3_2d_no_rot)
+GLNAME( _mesa_3dnow_transform_points3_2d_no_rot ):
+
+    PUSH_L    ( ESI )
+
+    MOV_L     ( ARG_DEST, ECX )
+    MOV_L     ( ARG_MATRIX, ESI )
+    MOV_L     ( ARG_SOURCE, EAX )
+    MOV_L     ( CONST(3), REGOFF(V4F_SIZE, ECX) )
+    OR_B      ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, ECX) )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), EDX )
+    MOV_L     ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+    PUSH_L    ( EDI )
+
+    MOV_L     ( REGOFF(V4F_START, ECX), EDX )
+    MOV_L     ( ESI, ECX )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), ESI )
+    MOV_L     ( REGOFF(V4F_STRIDE, EAX), EDI )
+    MOV_L     ( REGOFF(V4F_START, EAX), EAX )
+
+    TEST_L    ( ESI, ESI )
+    JZ        ( LLBL( G3TP2NRR_2 ) )	/* count == 0 -> nothing to do       */
+
+    PREFETCH  ( REGIND(EAX) )
+    PREFETCHW ( REGIND(EDX) )
+
+    MOVD      ( REGIND(ECX), MM0 )	/*                 | m00             */
+    PUNPCKLDQ ( REGOFF(20, ECX), MM0 )	/* m11             | m00             */
+
+    MOVQ      ( REGOFF(48, ECX), MM1 )	/* m31             | m30             */
+
+
+ALIGNTEXT16
+LLBL( G3TP2NRR_1 ):
+
+    PREFETCHW ( REGOFF(32, EDX) )	/* prefetch 2 vertices ahead         */
+
+    MOVQ      ( REGIND(EAX), MM4 )	/* x1              | x0              */
+    MOVD      ( REGOFF(8, EAX), MM5 )	/*                 | x2              */
+
+    ADD_L     ( EDI, EAX )		/* next vertex                       */
+    PREFETCH  ( REGIND(EAX) )
+
+    PFMUL     ( MM0, MM4 )		/* x1*m11          | x0*m00          */
+    ADD_L     ( CONST(16), EDX )	/* next r                            */
+
+    PFADD     ( MM1, MM4 )		/* x1*m11+m31      | x0*m00+m30      */
+
+    MOVQ      ( MM4, REGOFF(-16, EDX) )	/* write r0, r1                      */
+    MOVD      ( MM5, REGOFF(-8, EDX) )	/* write r2 (=x2)                    */
+
+    DEC_L     ( ESI )			/* decrement vertex counter          */
+    JNZ       ( LLBL( G3TP2NRR_1 ) )	/* cnt > 0 ? -> process next vertex  */
+
+LLBL( G3TP2NRR_2 ):
+
+    FEMMS
+    POP_L     ( EDI )
+    POP_L     ( ESI )
+    RET
+
+
+
+
+/*
+ * _mesa_3dnow_transform_points3_identity
+ *
+ * Identity transform for 3-component points: copies the first three floats
+ * (x0, x1, x2) of every source vertex to the destination unchanged.  Only
+ * 12 bytes of each 16-byte destination slot are written.
+ *
+ * Setup: stores 3 into the destination's V4F_SIZE field, ORs VEC_SIZE_3
+ * into its V4F_FLAGS byte and copies V4F_COUNT from source to destination.
+ * ARG_MATRIX is loaded but never dereferenced.
+ *
+ * Loop registers:
+ *   EAX = current source vertex (advanced by the source V4F_STRIDE in EDI)
+ *   EDX = current destination vertex (advanced by 16 bytes per vertex)
+ *   ESI = remaining vertex count; a count of 0 skips the loop
+ *
+ * The two MMX stores sit between DEC_L and JNZ; they do not modify EFLAGS,
+ * so JNZ still tests the result of DEC_L.
+ * Register comments are written "high dword | low dword".
+ * ESI and EDI are saved on entry and restored before RET; FEMMS is issued
+ * on every exit path.
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points3_identity )
+HIDDEN(_mesa_3dnow_transform_points3_identity)
+GLNAME( _mesa_3dnow_transform_points3_identity ):
+
+    PUSH_L    ( ESI )
+
+    MOV_L     ( ARG_DEST, ECX )
+    MOV_L     ( ARG_MATRIX, ESI )
+    MOV_L     ( ARG_SOURCE, EAX )
+    MOV_L     ( CONST(3), REGOFF(V4F_SIZE, ECX) )
+    OR_B      ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, ECX) )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), EDX )
+    MOV_L     ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+    PUSH_L    ( EDI )
+
+    MOV_L     ( REGOFF(V4F_START, ECX), EDX )
+    MOV_L     ( ESI, ECX )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), ESI )
+    MOV_L     ( REGOFF(V4F_STRIDE, EAX), EDI )
+    MOV_L     ( REGOFF(V4F_START, EAX), EAX )
+
+    TEST_L    ( ESI, ESI )
+    JZ        ( LLBL( G3TPIR_2 ) )	/* count == 0 -> nothing to do       */
+
+    PREFETCHW ( REGIND(EDX) )
+
+ALIGNTEXT16
+LLBL( G3TPIR_1 ):
+
+    PREFETCHW ( REGOFF(32, EDX) )	/* prefetch 2 vertices ahead         */
+
+    MOVQ      ( REGIND(EAX), MM0 )	/* x1              | x0              */
+    MOVD      ( REGOFF(8, EAX), MM1 )	/*                 | x2              */
+
+    ADD_L     ( EDI, EAX )		/* next vertex                       */
+    ADD_L     ( CONST(16), EDX )	/* next r                            */
+
+    DEC_L     ( ESI )			/* decrement vertex counter          */
+    MOVQ      ( MM0, REGOFF(-16, EDX) )	/* r1              | r0              */
+
+    MOVD      ( MM1, REGOFF(-8, EDX) )	/*                 | r2              */
+    JNZ       ( LLBL( G3TPIR_1 ) )	/* cnt > 0 ? -> process next vertex  */
+
+LLBL( G3TPIR_2 ):
+
+    FEMMS
+    POP_L     ( EDI )
+    POP_L     ( ESI )
+    RET
+#endif
+	
+#if defined (__ELF__) && defined (__linux__)
+	.section .note.GNU-stack,"",%progbits
+#endif
--- a/src/arch/x86/3dnow_xform4.S
+++ b/src/arch/x86/3dnow_xform4.S
@ -0,0 +1,570 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version:  3.5
+ *
+ * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifdef USE_3DNOW_ASM
+#include "assyntax.h"
+#include "matypes.h"
+#include "xform_args.h"
+
+    SEG_TEXT
+
+/*
+ * NOTE(review): FRAME_OFFSET is not referenced directly by the routines
+ * visible below.  Presumably the ARG_* macros from xform_args.h use it to
+ * skip the one register (ESI) that every routine pushes before reading its
+ * arguments -- confirm against xform_args.h.
+ */
+#define FRAME_OFFSET	4
+
+
+/*
+ * _mesa_3dnow_transform_points4_general
+ *
+ * Transforms 4-component points by a full 4x4 matrix.  For i = 0..3
+ * (matrix elements are floats, m[k] at byte offset 4*k from ECX):
+ *
+ *     r[i] = x0*m[i] + x1*m[4+i] + x2*m[8+i] + x3*m[12+i]
+ *
+ * Setup: stores 4 into the destination's V4F_SIZE field, ORs VEC_SIZE_4
+ * into its V4F_FLAGS byte and copies V4F_COUNT from source to destination.
+ *
+ * Loop registers:
+ *   EAX = current source vertex (advanced by the source V4F_STRIDE in EDI)
+ *   ECX = matrix
+ *   EDX = current destination vertex (advanced by 16 bytes per vertex)
+ *   ESI = remaining vertex count; a count of 0 skips the loop
+ *
+ * Register comments are written "high dword | low dword".
+ * ESI and EDI are saved on entry and restored before RET; FEMMS is issued
+ * on every exit path.
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points4_general )
+HIDDEN(_mesa_3dnow_transform_points4_general)
+GLNAME( _mesa_3dnow_transform_points4_general ):
+
+    PUSH_L    ( ESI )
+
+    MOV_L     ( ARG_DEST, ECX )
+    MOV_L     ( ARG_MATRIX, ESI )
+    MOV_L     ( ARG_SOURCE, EAX )
+    MOV_L     ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+    OR_B      ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), EDX )
+    MOV_L     ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+    PUSH_L    ( EDI )
+
+    MOV_L     ( REGOFF(V4F_START, ECX), EDX )
+    MOV_L     ( ESI, ECX )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), ESI )
+    MOV_L     ( REGOFF(V4F_STRIDE, EAX), EDI )
+    MOV_L     ( REGOFF(V4F_START, EAX), EAX )
+
+    TEST_L    ( ESI, ESI )
+    JZ        ( LLBL( G3TPGR_2 ) )	/* count == 0 -> nothing to do       */
+
+    PREFETCHW ( REGIND(EDX) )
+
+ALIGNTEXT16
+LLBL( G3TPGR_1 ):
+
+    PREFETCHW ( REGOFF(32, EDX) )	/* prefetch 2 vertices ahead         */
+
+    MOVQ      ( REGIND(EAX), MM0 )	/* x1              | x0              */
+    MOVQ      ( REGOFF(8, EAX), MM4 )	/* x3              | x2              */
+	
+    ADD_L     ( EDI, EAX )		/* next vertex                       */
+    PREFETCH  ( REGIND(EAX) )
+	
+    MOVQ      ( MM0, MM2 )		/* x1              | x0              */
+    MOVQ      ( MM4, MM6 )		/* x3              | x2              */
+
+    PUNPCKLDQ ( MM0, MM0 )		/* x0              | x0              */
+    PUNPCKHDQ ( MM2, MM2 )		/* x1              | x1              */
+
+    MOVQ      ( MM0, MM1 )		/* x0              | x0              */
+    ADD_L     ( CONST(16), EDX )	/* next r                            */
+
+    PFMUL     ( REGIND(ECX), MM0 )	/* x0*m1           | x0*m0           */
+    MOVQ      ( MM2, MM3 )		/* x1              | x1              */
+
+    PFMUL     ( REGOFF(8, ECX), MM1 )	/* x0*m3           | x0*m2           */
+    PUNPCKLDQ ( MM4, MM4 )		/* x2              | x2              */
+
+    PFMUL     ( REGOFF(16, ECX), MM2 )	/* x1*m5           | x1*m4           */
+    MOVQ      ( MM4, MM5 )		/* x2              | x2              */
+
+    PFMUL     ( REGOFF(24, ECX), MM3 )	/* x1*m7           | x1*m6           */
+    PUNPCKHDQ ( MM6, MM6 )		/* x3              | x3              */
+
+    PFMUL     ( REGOFF(32, ECX), MM4 )	/* x2*m9           | x2*m8           */
+    MOVQ      ( MM6, MM7 )		/* x3              | x3              */
+
+    PFMUL     ( REGOFF(40, ECX), MM5 )	/* x2*m11          | x2*m10          */
+    PFADD     ( MM0, MM2 )		/* x0*m1+x1*m5     | x0*m0+x1*m4     */
+
+    PFMUL     ( REGOFF(48, ECX), MM6 )	/* x3*m13          | x3*m12          */
+    PFADD     ( MM1, MM3 )		/* x0*m3+x1*m7     | x0*m2+x1*m6     */
+
+    PFMUL     ( REGOFF(56, ECX), MM7 )	/* x3*m15          | x3*m14          */
+    PFADD     ( MM4, MM6 )		/* x2*m9+x3*m13    | x2*m8+x3*m12    */
+
+    PFADD     ( MM5, MM7 )		/* x2*m11+x3*m15   | x2*m10+x3*m14   */
+    PFADD     ( MM2, MM6 )		/* r1              | r0              */
+
+    PFADD     ( MM3, MM7 )		/* r3              | r2              */
+    MOVQ      ( MM6, REGOFF(-16, EDX) )	/* write r0, r1                      */
+
+    MOVQ      ( MM7, REGOFF(-8, EDX) )	/* write r2, r3                      */
+
+    DEC_L     ( ESI )			/* decrement vertex counter          */
+    JNZ       ( LLBL( G3TPGR_1 ) )	/* cnt > 0 ? -> process next vertex  */
+
+LLBL( G3TPGR_2 ):
+
+    FEMMS
+    POP_L     ( EDI )
+    POP_L     ( ESI )
+    RET
+
+
+
+
+/*
+ * _mesa_3dnow_transform_points4_perspective
+ *
+ * Transforms 4-component points using only the matrix elements read below
+ * (named mCR in the comments; element mCR is at byte offset 4*(4*C+R)):
+ *
+ *     r0 = x0*m00 + x2*m20
+ *     r1 = x1*m11 + x2*m21
+ *     r2 = x2*m22 + x3*m32
+ *     r3 = -x2
+ *
+ * Setup: stores 4 into the destination's V4F_SIZE field, ORs VEC_SIZE_4
+ * into its V4F_FLAGS byte and copies V4F_COUNT from source to destination.
+ *
+ * Loop registers:
+ *   EAX = current source vertex (advanced by the source V4F_STRIDE in EDI)
+ *   EDX = current destination vertex (advanced by 16 bytes per vertex)
+ *   ESI = remaining vertex count; a count of 0 skips the loop
+ *   MM0..MM2 = matrix constants, MM7 = 0, all set once before the loop
+ *
+ * NOTE(review): the source prefetch targets 32 bytes past the *next*
+ * vertex; the original comment read "hopefully stride is zero", whose
+ * intent is unclear -- presumably it assumes a tightly packed array.
+ * Register comments are written "high dword | low dword".
+ * ESI and EDI are saved on entry and restored before RET; FEMMS is issued
+ * on every exit path.
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points4_perspective )
+HIDDEN(_mesa_3dnow_transform_points4_perspective)
+GLNAME( _mesa_3dnow_transform_points4_perspective ):
+
+    PUSH_L    ( ESI )
+
+    MOV_L     ( ARG_DEST, ECX )
+    MOV_L     ( ARG_MATRIX, ESI )
+    MOV_L     ( ARG_SOURCE, EAX )
+    MOV_L     ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+    OR_B      ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), EDX )
+    MOV_L     ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+    PUSH_L    ( EDI )
+
+    MOV_L     ( REGOFF(V4F_START, ECX), EDX )
+    MOV_L     ( ESI, ECX )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), ESI )
+    MOV_L     ( REGOFF(V4F_STRIDE, EAX), EDI )
+    MOV_L     ( REGOFF(V4F_START, EAX), EAX )
+
+    TEST_L    ( ESI, ESI )
+    JZ        ( LLBL( G3TPPR_2 ) )	/* count == 0 -> nothing to do       */
+
+    PREFETCH  ( REGIND(EAX) )
+    PREFETCHW ( REGIND(EDX) )
+
+    MOVD      ( REGIND(ECX), MM0 )	/*                 | m00             */
+    PUNPCKLDQ ( REGOFF(20, ECX), MM0 )	/* m11             | m00             */
+
+    MOVD      ( REGOFF(40, ECX), MM1 )	/*                 | m22             */
+    PUNPCKLDQ ( REGOFF(56, ECX), MM1 )	/* m32             | m22             */
+
+    MOVQ      ( REGOFF(32, ECX), MM2 )	/* m21             | m20             */
+    PXOR      ( MM7, MM7 )		/* 0               | 0               */
+
+ALIGNTEXT16
+LLBL( G3TPPR_1 ):
+
+    PREFETCHW ( REGOFF(32, EDX) )	/* prefetch 2 vertices ahead         */
+
+    MOVQ      ( REGIND(EAX), MM4 )	/* x1              | x0              */
+    MOVQ      ( REGOFF(8, EAX), MM5 )	/* x3              | x2              */
+    MOVD      ( REGOFF(8, EAX), MM3 )	/*                 | x2              */
+
+    ADD_L     ( EDI, EAX )		/* next vertex                       */
+    PREFETCH  ( REGOFF(32, EAX) )	/* 32 bytes past the next vertex     */
+
+    MOVQ      ( MM5, MM6 )		/* x3              | x2              */
+    PFMUL     ( MM0, MM4 )		/* x1*m11          | x0*m00          */
+
+    PUNPCKLDQ ( MM5, MM5 )		/* x2              | x2              */
+    ADD_L     ( CONST(16), EDX )	/* next r                            */
+
+    PFMUL     ( MM2, MM5 )		/* x2*m21          | x2*m20          */
+    PFSUBR    ( MM7, MM3 )		/*                 | 0 - x2 = -x2    */
+
+    PFMUL     ( MM1, MM6 )		/* x3*m32          | x2*m22          */
+    PFADD     ( MM4, MM5 )		/* x1*m11+x2*m21   | x0*m00+x2*m20   */
+
+    PFACC     ( MM3, MM6 )		/* -x2             | x2*m22+x3*m32   */
+    MOVQ      ( MM5, REGOFF(-16, EDX) )	/* write r0, r1                      */
+
+    MOVQ      ( MM6, REGOFF(-8, EDX) )	/* write r2, r3                      */
+    DEC_L     ( ESI )			/* decrement vertex counter          */
+
+    JNZ       ( LLBL( G3TPPR_1 ) )	/* cnt > 0 ? -> process next vertex  */
+
+LLBL( G3TPPR_2 ):
+
+    FEMMS
+    POP_L     ( EDI )
+    POP_L     ( ESI )
+    RET
+
+
+
+
+/*
+ * _mesa_3dnow_transform_points4_3d
+ *
+ * Transforms 4-component points by the upper three rows of the matrix and
+ * passes the fourth component through (matrix elements are floats, m[k] at
+ * byte offset 4*k from ECX):
+ *
+ *     r0 = x0*m0 + x1*m4 + x2*m8  + x3*m12
+ *     r1 = x0*m1 + x1*m5 + x2*m9  + x3*m13
+ *     r2 = x0*m2 + x1*m6 + x2*m10 + x3*m14
+ *     r3 = x3
+ *
+ * Setup: stores 4 into the destination's V4F_SIZE field, ORs VEC_SIZE_4
+ * into its V4F_FLAGS byte and copies V4F_COUNT from source to destination.
+ *
+ * Loop registers:
+ *   EAX = current source vertex (advanced by the source V4F_STRIDE in EDI)
+ *   ECX = matrix
+ *   EDX = current destination vertex (advanced by 16 bytes per vertex)
+ *   ESI = remaining vertex count; a count of 0 skips the loop
+ *   MM6 = m6 | m2, MM7 = m14 | m10, loaded once before the loop
+ *
+ * Unlike its siblings, this routine advances EAX late in the loop body
+ * because x3 is re-read from the current vertex just before the final
+ * PFACC.
+ * Register comments are written "high dword | low dword".
+ * ESI and EDI are saved on entry and restored before RET; FEMMS is issued
+ * on every exit path.
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points4_3d )
+HIDDEN(_mesa_3dnow_transform_points4_3d)
+GLNAME( _mesa_3dnow_transform_points4_3d ):
+
+    PUSH_L    ( ESI )
+
+    MOV_L     ( ARG_DEST, ECX )
+    MOV_L     ( ARG_MATRIX, ESI )
+    MOV_L     ( ARG_SOURCE, EAX )
+    MOV_L     ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+    OR_B      ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), EDX )
+    MOV_L     ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+    PUSH_L    ( EDI )
+
+    MOV_L     ( REGOFF(V4F_START, ECX), EDX )
+    MOV_L     ( ESI, ECX )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), ESI )
+    MOV_L     ( REGOFF(V4F_STRIDE, EAX), EDI )
+    MOV_L     ( REGOFF(V4F_START, EAX), EAX )
+
+    TEST_L    ( ESI, ESI )
+    JZ        ( LLBL( G3TP3R_2 ) )	/* count == 0 -> nothing to do       */
+
+    MOVD      ( REGOFF(8, ECX), MM6 )	/*                 | m2              */
+    PUNPCKLDQ ( REGOFF(24, ECX), MM6 )	/* m6              | m2              */
+
+    MOVD      ( REGOFF(40, ECX), MM7 )	/*                 | m10             */
+    PUNPCKLDQ ( REGOFF(56, ECX), MM7 )	/* m14             | m10             */
+
+ALIGNTEXT16
+LLBL( G3TP3R_1 ):
+
+    PREFETCHW ( REGOFF(32, EDX) )	/* prefetch 2 vertices ahead         */
+    PREFETCH  ( REGOFF(32, EAX) )	/* hopefully array is tightly packed */
+
+    MOVQ      ( REGIND(EAX), MM2 )	/* x1              | x0              */
+    MOVQ      ( REGOFF(8, EAX), MM3 )	/* x3              | x2              */
+
+    MOVQ      ( MM2, MM0 )		/* x1              | x0              */
+    MOVQ      ( MM3, MM4 )		/* x3              | x2              */
+
+    MOVQ      ( MM0, MM1 )		/* x1              | x0              */
+    MOVQ      ( MM4, MM5 )		/* x3              | x2              */
+
+    PUNPCKLDQ ( MM0, MM0 )		/* x0              | x0              */
+    PUNPCKHDQ ( MM1, MM1 )		/* x1              | x1              */
+
+    PFMUL     ( REGIND(ECX), MM0 )	/* x0*m1           | x0*m0           */
+    PUNPCKLDQ ( MM3, MM3 )		/* x2              | x2              */
+
+    PFMUL     ( REGOFF(16, ECX), MM1 )	/* x1*m5           | x1*m4           */
+    PUNPCKHDQ ( MM4, MM4 )		/* x3              | x3              */
+
+    PFMUL     ( MM6, MM2 )		/* x1*m6           | x0*m2           */
+    PFADD     ( MM0, MM1 )		/* x0*m1+x1*m5     | x0*m0+x1*m4     */
+
+    PFMUL     ( REGOFF(32, ECX), MM3 )	/* x2*m9           | x2*m8           */
+    ADD_L     ( CONST(16), EDX )	/* next r                            */
+
+    PFMUL     ( REGOFF(48, ECX), MM4 )	/* x3*m13          | x3*m12          */
+    PFADD     ( MM1, MM3 )		/* x0*m1+..+x2*m9  | x0*m0+...+x2*m8 */
+
+    PFMUL     ( MM7, MM5 )		/* x3*m14          | x2*m10          */
+    PFADD     ( MM3, MM4 )		/* r1              | r0              */
+
+    PFACC     ( MM2, MM5 )		/* x0*m2+x1*m6     | x2*m10+x3*m14   */
+    MOVD      ( REGOFF(12, EAX), MM0 )	/*                 | x3              */
+
+    ADD_L     ( EDI, EAX )		/* next vertex                       */
+    PFACC     ( MM0, MM5 )		/* r3 (= x3)       | r2              */
+
+    MOVQ      ( MM4, REGOFF(-16, EDX) )	/* write r0, r1                      */
+    MOVQ      ( MM5, REGOFF(-8, EDX) )	/* write r2, r3                      */
+
+    DEC_L     ( ESI )			/* decrement vertex counter          */
+    JNZ       ( LLBL( G3TP3R_1 ) )	/* cnt > 0 ? -> process next vertex  */
+
+LLBL( G3TP3R_2 ):
+
+    FEMMS
+    POP_L     ( EDI )
+    POP_L     ( ESI )
+    RET
+
+
+
+
+/*
+ * _mesa_3dnow_transform_points4_3d_no_rot
+ *
+ * Scale-and-translate transform of 4-component points; the fourth
+ * component is passed through (elements named mCR in the comments; element
+ * mCR is at byte offset 4*(4*C+R) from ECX):
+ *
+ *     r0 = x0*m00 + x3*m30
+ *     r1 = x1*m11 + x3*m31
+ *     r2 = x2*m22 + x3*m32
+ *     r3 = x3
+ *
+ * Setup: stores 4 into the destination's V4F_SIZE field, ORs VEC_SIZE_4
+ * into its V4F_FLAGS byte and copies V4F_COUNT from source to destination.
+ *
+ * Loop registers:
+ *   EAX = current source vertex (advanced by the source V4F_STRIDE in EDI)
+ *   EDX = current destination vertex (advanced by 16 bytes per vertex)
+ *   ESI = remaining vertex count; a count of 0 skips the loop
+ *   MM0..MM2 = matrix constants, loaded once before the loop
+ *
+ * NOTE(review): the source prefetch targets 32 bytes past the *next*
+ * vertex; the original comment read "hopefully stride is zero", whose
+ * intent is unclear -- presumably it assumes a tightly packed array.
+ * Register comments are written "high dword | low dword".
+ * ESI and EDI are saved on entry and restored before RET; FEMMS is issued
+ * on every exit path.
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points4_3d_no_rot )
+HIDDEN(_mesa_3dnow_transform_points4_3d_no_rot)
+GLNAME( _mesa_3dnow_transform_points4_3d_no_rot ):
+
+    PUSH_L    ( ESI )
+    MOV_L     ( ARG_DEST, ECX )
+    MOV_L     ( ARG_MATRIX, ESI )
+    MOV_L     ( ARG_SOURCE, EAX )
+    MOV_L     ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+    OR_B      ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), EDX )
+    MOV_L     ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+    PUSH_L    ( EDI )
+
+    MOV_L     ( REGOFF(V4F_START, ECX), EDX )
+    MOV_L     ( ESI, ECX )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), ESI )
+    MOV_L     ( REGOFF(V4F_STRIDE, EAX), EDI )
+    MOV_L     ( REGOFF(V4F_START, EAX), EAX )
+
+    TEST_L    ( ESI, ESI )
+    JZ        ( LLBL( G3TP3NRR_2 ) )	/* count == 0 -> nothing to do       */
+
+    MOVD      ( REGIND(ECX), MM0 )	/*                 | m00             */
+    PUNPCKLDQ ( REGOFF(20, ECX), MM0 )	/* m11             | m00             */
+
+    MOVD      ( REGOFF(40, ECX), MM2 )	/*                 | m22             */
+    PUNPCKLDQ ( REGOFF(56, ECX), MM2 )	/* m32             | m22             */
+
+    MOVQ      ( REGOFF(48, ECX), MM1 )	/* m31             | m30             */
+
+ALIGNTEXT16
+LLBL( G3TP3NRR_1 ):
+
+    PREFETCHW ( REGOFF(32, EDX) )	/* prefetch 2 vertices ahead         */
+
+    MOVQ      ( REGIND(EAX), MM4 )	/* x1              | x0              */
+    MOVQ      ( REGOFF(8, EAX), MM5 )	/* x3              | x2              */
+    MOVD      ( REGOFF(12, EAX), MM7 )	/*                 | x3              */
+
+    ADD_L     ( EDI, EAX )		/* next vertex                       */
+    PREFETCH  ( REGOFF(32, EAX) )	/* 32 bytes past the next vertex     */
+
+    MOVQ      ( MM5, MM6 )		/* x3              | x2              */
+    PFMUL     ( MM0, MM4 )		/* x1*m11          | x0*m00          */
+
+    PUNPCKHDQ ( MM6, MM6 )		/* x3              | x3              */
+    PFMUL     ( MM2, MM5 )		/* x3*m32          | x2*m22          */
+
+    PFMUL     ( MM1, MM6 )		/* x3*m31          | x3*m30          */
+    PFACC     ( MM7, MM5 )		/* x3              | x2*m22+x3*m32   */
+
+    PFADD     ( MM6, MM4 )		/* x1*m11+x3*m31   | x0*m00+x3*m30   */
+    ADD_L     ( CONST(16), EDX )	/* next r                            */
+
+    MOVQ      ( MM4, REGOFF(-16, EDX) )	/* write r0, r1                      */
+    MOVQ      ( MM5, REGOFF(-8, EDX) )	/* write r2, r3                      */
+
+    DEC_L     ( ESI )			/* decrement vertex counter          */
+    JNZ       ( LLBL( G3TP3NRR_1 ) )	/* cnt > 0 ? -> process next vertex  */
+
+LLBL( G3TP3NRR_2 ):
+
+    FEMMS
+    POP_L     ( EDI )
+    POP_L     ( ESI )
+    RET
+
+
+
+/*
+ * _mesa_3dnow_transform_points4_2d
+ *
+ * Arguments are fetched through the ARG_DEST, ARG_MATRIX and ARG_SOURCE
+ * macros (defined outside this excerpt).
+ *
+ * The field at V4F_SIZE of the destination is set to 4, VEC_SIZE_4 is
+ * OR'ed into its V4F_FLAGS byte and its V4F_COUNT is copied from the
+ * source.  Then, for every source vertex (x0, x1, x2, x3), using only
+ * the matrix elements m00, m01, m10, m11, m30 and m31:
+ *
+ *    r0 = x0*m00 + x1*m10 + x3*m30
+ *    r1 = x0*m01 + x1*m11 + x3*m31
+ *    r2 = x2
+ *    r3 = x3
+ *
+ * Register use inside the loop:
+ *    EAX = current source vertex (advanced by the source V4F_STRIDE)
+ *    EDX = current destination vertex (advanced by 16 bytes)
+ *    ECX = matrix
+ *    ESI = number of vertices left
+ *    EDI = source stride in bytes
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points4_2d )
+HIDDEN(_mesa_3dnow_transform_points4_2d)
+GLNAME( _mesa_3dnow_transform_points4_2d ):
+
+    PUSH_L    ( ESI )
+
+    MOV_L     ( ARG_DEST, ECX )
+    MOV_L     ( ARG_MATRIX, ESI )
+    MOV_L     ( ARG_SOURCE, EAX )
+    MOV_L     ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+    OR_B      ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), EDX )
+    MOV_L     ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+    PUSH_L    ( EDI )
+
+    MOV_L     ( REGOFF(V4F_START, ECX), EDX )
+    MOV_L     ( ESI, ECX )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), ESI )
+    MOV_L     ( REGOFF(V4F_STRIDE, EAX), EDI )
+    MOV_L     ( REGOFF(V4F_START, EAX), EAX )
+
+    TEST_L    ( ESI, ESI )
+    JZ        ( LLBL( G3TP2R_2 ) )	/* count == 0: nothing to transform  */
+
+    MOVD      ( REGIND(ECX), MM0 )	/*                 | m00             */
+    PUNPCKLDQ ( REGOFF(16, ECX), MM0 )	/* m10             | m00             */
+
+    MOVD      ( REGOFF(4, ECX), MM1 )	/*                 | m01             */
+    PUNPCKLDQ ( REGOFF(20, ECX), MM1 )	/* m11             | m01             */
+
+    MOVQ      ( REGOFF(48, ECX), MM2 )	/* m31             | m30             */
+
+ALIGNTEXT16
+LLBL( G3TP2R_1 ):
+
+    PREFETCHW ( REGOFF(32, EDX) )       /* prefetch 2 vertices ahead         */
+
+    MOVQ      ( REGIND(EAX), MM3 )	/* x1              | x0              */
+    MOVQ      ( REGOFF(8, EAX), MM5 )	/* x3              | x2              */
+
+    ADD_L     ( EDI, EAX )		/* next vertex                       */
+    PREFETCH  ( REGIND(EAX) )
+
+    MOVQ      ( MM3, MM4 )		/* x1              | x0              */
+    MOVQ      ( MM5, MM6 )		/* x3              | x2              */
+
+    PFMUL     ( MM1, MM4 )		/* x1*m11          | x0*m01          */
+    PUNPCKHDQ ( MM6, MM6 )		/* x3              | x3              */
+
+    PFMUL     ( MM0, MM3 )		/* x1*m10          | x0*m00          */
+    ADD_L     ( CONST(16), EDX )	/* next r                            */
+
+    PFACC     ( MM4, MM3 )		/* x0*m01+x1*m11   | x0*m00+x1*m10   */
+    PFMUL     ( MM2, MM6 )		/* x3*m31          | x3*m30          */
+
+    PFADD     ( MM6, MM3 )		/* r1              | r0              */
+    MOVQ      ( MM5, REGOFF(-8, EDX) )	/* write r2, r3                      */
+
+    MOVQ      ( MM3, REGOFF(-16, EDX) )	/* write r0, r1                      */
+
+    DEC_L     ( ESI )			/* decrement vertex counter          */
+    JNZ       ( LLBL( G3TP2R_1 ) )	/* cnt > 0 ? -> process next vertex  */
+
+LLBL( G3TP2R_2 ):
+
+    FEMMS				/* leave the MMX/3DNow! state        */
+    POP_L     ( EDI )
+    POP_L     ( ESI )
+    RET
+
+
+
+/*
+ * _mesa_3dnow_transform_points4_2d_no_rot
+ *
+ * Arguments are fetched through the ARG_DEST, ARG_MATRIX and ARG_SOURCE
+ * macros (defined outside this excerpt).
+ *
+ * The field at V4F_SIZE of the destination is set to 4, VEC_SIZE_4 is
+ * OR'ed into its V4F_FLAGS byte and its V4F_COUNT is copied from the
+ * source.  Then, for every source vertex (x0, x1, x2, x3), using only
+ * the matrix elements m00, m11, m30 and m31:
+ *
+ *    r0 = x0*m00 + x3*m30
+ *    r1 = x1*m11 + x3*m31
+ *    r2 = x2
+ *    r3 = x3
+ *
+ * Register use inside the loop:
+ *    EAX = current source vertex (advanced by the source V4F_STRIDE)
+ *    EDX = current destination vertex (advanced by 16 bytes)
+ *    ECX = matrix
+ *    ESI = number of vertices left
+ *    EDI = source stride in bytes
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points4_2d_no_rot )
+HIDDEN(_mesa_3dnow_transform_points4_2d_no_rot)
+GLNAME( _mesa_3dnow_transform_points4_2d_no_rot ):
+
+    PUSH_L    ( ESI )
+
+    MOV_L     ( ARG_DEST, ECX )
+    MOV_L     ( ARG_MATRIX, ESI )
+    MOV_L     ( ARG_SOURCE, EAX )
+    MOV_L     ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+    OR_B      ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), EDX )
+    MOV_L     ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+    PUSH_L    ( EDI )
+
+    MOV_L     ( REGOFF(V4F_START, ECX), EDX )
+    MOV_L     ( ESI, ECX )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), ESI )
+    MOV_L     ( REGOFF(V4F_STRIDE, EAX), EDI )
+    MOV_L     ( REGOFF(V4F_START, EAX), EAX )
+
+    TEST_L    ( ESI, ESI )
+    JZ        ( LLBL( G3TP2NRR_3 ) )	/* count == 0: nothing to transform  */
+
+    MOVD      ( REGIND(ECX), MM0 )	/*                 | m00             */
+    PUNPCKLDQ ( REGOFF(20, ECX), MM0 )	/* m11             | m00             */
+
+    MOVQ      ( REGOFF(48, ECX), MM1 )	/* m31             | m30             */
+
+ALIGNTEXT16
+LLBL( G3TP2NRR_2 ):
+
+    PREFETCHW ( REGOFF(32, EDX) )	/* prefetch 2 vertices ahead         */
+
+    MOVQ      ( REGIND(EAX), MM4 )	/* x1              | x0              */
+    MOVQ      ( REGOFF(8, EAX), MM5 )	/* x3              | x2              */
+
+    ADD_L     ( EDI, EAX )		/* next vertex                       */
+    PREFETCH  ( REGIND(EAX) )
+
+    PFMUL     ( MM0, MM4 )		/* x1*m11          | x0*m00          */
+    MOVQ      ( MM5, MM6 )		/* x3              | x2              */
+
+    ADD_L     ( CONST(16), EDX )	/* next r                            */
+    PUNPCKHDQ ( MM6, MM6 )		/* x3              | x3              */
+
+    PFMUL     ( MM1, MM6 )		/* x3*m31          | x3*m30          */
+    PFADD     ( MM4, MM6 )		/* x1*m11+x3*m31   | x0*m00+x3*m30   */
+
+    MOVQ      ( MM6, REGOFF(-16, EDX) )	/* write r0, r1                      */
+    MOVQ      ( MM5, REGOFF(-8, EDX) )	/* write r2, r3                      */
+
+    DEC_L     ( ESI )			/* decrement vertex counter          */
+
+    JNZ       ( LLBL( G3TP2NRR_2 ) )	/* cnt > 0 ? -> process next vertex  */
+
+LLBL( G3TP2NRR_3 ):
+
+    FEMMS				/* leave the MMX/3DNow! state        */
+    POP_L     ( EDI )
+    POP_L     ( ESI )
+    RET
+
+
+
+/*
+ * _mesa_3dnow_transform_points4_identity
+ *
+ * Arguments are fetched through the ARG_DEST, ARG_MATRIX and ARG_SOURCE
+ * macros (defined outside this excerpt).
+ *
+ * The field at V4F_SIZE of the destination is set to 4, VEC_SIZE_4 is
+ * OR'ed into its V4F_FLAGS byte and its V4F_COUNT is copied from the
+ * source.  Every source vertex (16 bytes, read V4F_STRIDE bytes apart)
+ * is then copied unchanged to the destination, where the vertices are
+ * stored 16 bytes apart.  The matrix pointer is loaded into ECX but its
+ * contents are never read.
+ *
+ * Register use inside the loop:
+ *    EAX = current source vertex
+ *    EDX = current destination vertex
+ *    ESI = number of vertices left
+ *    EDI = source stride in bytes
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points4_identity )
+HIDDEN(_mesa_3dnow_transform_points4_identity)
+GLNAME( _mesa_3dnow_transform_points4_identity ):
+
+    PUSH_L    ( ESI )
+
+    MOV_L     ( ARG_DEST, ECX )
+    MOV_L     ( ARG_MATRIX, ESI )
+    MOV_L     ( ARG_SOURCE, EAX )
+    MOV_L     ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+    OR_B      ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), EDX )
+    MOV_L     ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+    PUSH_L    ( EDI )
+
+    MOV_L     ( REGOFF(V4F_START, ECX), EDX )
+    MOV_L     ( ESI, ECX )
+    MOV_L     ( REGOFF(V4F_COUNT, EAX), ESI )
+    MOV_L     ( REGOFF(V4F_STRIDE, EAX), EDI )
+    MOV_L     ( REGOFF(V4F_START, EAX), EAX )
+
+    TEST_L    ( ESI, ESI )
+    JZ        ( LLBL( G3TPIR_2 ) )	/* count == 0: nothing to copy       */
+
+ALIGNTEXT16
+LLBL( G3TPIR_1 ):
+
+    PREFETCHW ( REGOFF(32, EDX) )       /* prefetch 2 vertices ahead         */
+	
+    MOVQ      ( REGIND(EAX), MM0 )	/* x1              | x0              */
+    MOVQ      ( REGOFF(8, EAX), MM1 )	/* x3              | x2              */
+
+    ADD_L     ( EDI, EAX )		/* next vertex                       */
+    PREFETCH  ( REGIND(EAX) )
+
+    ADD_L     ( CONST(16), EDX )	/* next r                            */
+    MOVQ      ( MM0, REGOFF(-16, EDX) )	/* r1              | r0              */
+
+    MOVQ      ( MM1, REGOFF(-8, EDX) )	/* r3              | r2              */
+
+    DEC_L     ( ESI )			/* decrement vertex counter          */
+    JNZ       ( LLBL( G3TPIR_1 ) )	/* cnt > 0 ? -> process next vertex  */
+
+LLBL( G3TPIR_2 ):
+
+    FEMMS				/* leave the MMX/3DNow! state        */
+    POP_L     ( EDI )
+    POP_L     ( ESI )
+    RET
+#endif
+	
+#if defined (__ELF__) && defined (__linux__)
+	.section .note.GNU-stack,"",%progbits
+#endif
--- a/src/arch/x86/Makefile.am
+++ b/src/arch/x86/Makefile.am
@ -0,0 +1,40 @@
+# Copyright © 2012 Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+# Everything up to the matching endif is only active when the HAVE_X86_ASM automake conditional is true.
+if HAVE_X86_ASM
+
+AM_CPPFLAGS = \
+	-I$(top_srcdir)/include \
+	-I$(top_srcdir)/src/mesa \
+	-I$(top_srcdir)/src/GLdispatch/mapi \
+	$(API_DEFINES) \
+	$(DEFINES)
+# gen_matypes is built from gen_matypes.c but, being a noinst program, is never installed.
+noinst_PROGRAMS = gen_matypes
+
+gen_matypes_SOURCES = gen_matypes.c
+BUILT_SOURCES = matypes.h
+CLEANFILES = matypes.h
+# matypes.h is generated by running gen_matypes from the build directory and capturing its standard output.
+matypes.h: gen_matypes
+	$(AM_V_GEN)./gen_matypes > $@
+
+endif
--- a/src/arch/x86/assyntax.h
+++ b/src/arch/x86/assyntax.h
--- a/src/arch/x86/clip_args.h
+++ b/src/arch/x86/clip_args.h
@ -0,0 +1,59 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version:  3.5
+ *
+ * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Clip test function interface for assembly code.  Simply define
+ * FRAME_OFFSET to the number of bytes pushed onto the stack before
+ * using the ARG_* argument macros.
+ *
+ * Gareth Hughes
+ */
+
+#ifndef __CLIP_ARGS_H__
+#define __CLIP_ARGS_H__
+
+/*
+ * Offsets for clip_func arguments
+ *
+ * typedef GLvector4f *(*clip_func)( GLvector4f *clip_vec,
+ *	                             GLvector4f *proj_vec,
+ *	                             GLubyte clipMask[],
+ *	                             GLubyte *orMask,
+ *	                             GLubyte *andMask );
+ *
+ * The OFFSET_* values are the byte offsets of the five arguments, in
+ * declaration order, 4 bytes apart and starting at 4.  Each ARG_* macro
+ * turns one of them into an ESP-relative memory operand; FRAME_OFFSET is
+ * not defined here and must be supplied by the including file (see the
+ * comment at the top of this header).
+ * NOTE(review): the first offset being 4 presumably accounts for the
+ * return address at 0(ESP) on function entry -- confirm against callers.
+ */
+
+#define OFFSET_SOURCE	4
+#define OFFSET_DEST	8
+#define OFFSET_CLIP	12
+#define OFFSET_OR	16
+#define OFFSET_AND	20
+
+#define ARG_SOURCE	REGOFF(FRAME_OFFSET+OFFSET_SOURCE, ESP)
+#define ARG_DEST	REGOFF(FRAME_OFFSET+OFFSET_DEST, ESP)
+#define ARG_CLIP	REGOFF(FRAME_OFFSET+OFFSET_CLIP, ESP)
+#define ARG_OR		REGOFF(FRAME_OFFSET+OFFSET_OR, ESP)
+#define ARG_AND		REGOFF(FRAME_OFFSET+OFFSET_AND, ESP)
+
+#endif
--- a/src/arch/x86/common_x86.c
+++ b/src/arch/x86/common_x86.c
@ -0,0 +1,336 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  6.5.1
+ *
+ * Copyright (C) 1999-2006  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file common_x86.c
+ *
+ * Check CPU capabilities & initialize optimized functions for this particular
+ * processor.
+ *
+ * Changed by Andre Werthmann for using the new SSE functions.
+ *
+ * \author Holger Waechtler <holger@akaflieg.extern.tu-berlin.de>
+ * \author Andre Werthmann <wertmann@cs.uni-potsdam.de>
+ */
+
+/* XXX these includes should probably go into imports.h or glheader.h */
+#if defined(USE_SSE_ASM) && defined(__linux__)
+#include <linux/version.h>
+#endif
+#if defined(USE_SSE_ASM) && defined(__FreeBSD__)
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#endif
+#if defined(USE_SSE_ASM) && defined(__OpenBSD__)
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#include <machine/cpu.h>
+#endif
+
+#include "main/imports.h"
+#include "common_x86_asm.h"
+
+
+/** Bitmask of X86_FEATURE_x bits; filled in by _mesa_get_x86_features(). */
+int _mesa_x86_cpu_features = 0x0;
+/* When non-zero, the detection code reports what it finds via _mesa_debug(). */
+static int detection_debug = GL_FALSE;
+
+/* No reason for this to be public.
+ *
+ * These routines are implemented in common_x86_asm.S:
+ *  - _mesa_x86_has_cpuid() probes whether the ID bit of EFLAGS can be
+ *    toggled.
+ *  - _mesa_x86_cpuid() executes CPUID for function 'op' and stores the
+ *    resulting EAX, EBX, ECX and EDX through the four pointers.
+ *  - _mesa_x86_cpuid_eax/ebx/ecx/edx() execute CPUID for function 'op'
+ *    and return only the named register.
+ */
+extern GLuint	_ASMAPI _mesa_x86_has_cpuid(void);
+extern void	_ASMAPI _mesa_x86_cpuid(GLuint op, GLuint *reg_eax, GLuint *reg_ebx, GLuint *reg_ecx, GLuint *reg_edx);
+extern GLuint	_ASMAPI _mesa_x86_cpuid_eax(GLuint op);
+extern GLuint	_ASMAPI _mesa_x86_cpuid_ebx(GLuint op);
+extern GLuint	_ASMAPI _mesa_x86_cpuid_ecx(GLuint op);
+extern GLuint	_ASMAPI _mesa_x86_cpuid_edx(GLuint op);
+
+
+#if defined(USE_SSE_ASM)
+/*
+ * We must verify that the Streaming SIMD Extensions are truly supported
+ * on this processor before we go ahead and hook out the optimized code.
+ *
+ * However, I have been told by Alan Cox that all 2.4 (and later) Linux
+ * kernels provide full SSE support on all processors that expose SSE via
+ * the CPUID mechanism.
+ */
+
+/* These are assembly functions: */
+extern void _mesa_test_os_sse_support( void );
+extern void _mesa_test_os_sse_exception_support( void );
+
+
+#if defined(_WIN32)
+#ifndef STATUS_FLOAT_MULTIPLE_TRAPS
+# define STATUS_FLOAT_MULTIPLE_TRAPS (0xC00002B5L)
+#endif
+/**
+ * Unhandled-exception filter that is installed while the OS support for
+ * SSE is being probed.
+ *
+ *  - EXCEPTION_ILLEGAL_INSTRUCTION: X86_FEATURE_XMM is cleared in
+ *    _mesa_x86_cpu_features.
+ *  - STATUS_FLOAT_MULTIPLE_TRAPS: only logged.
+ *  - Any other exception: logged together with its code, and
+ *    EXCEPTION_EXECUTE_HANDLER is returned.
+ *
+ * For the first two cases the instruction pointer is advanced past the
+ * faulting instruction and EXCEPTION_CONTINUE_EXECUTION is returned,
+ * unless the context record carries no control registers, in which case
+ * EXCEPTION_EXECUTE_HANDLER is returned as well.
+ *
+ * \param exp  exception record and thread context supplied by Windows.
+ */
+static LONG WINAPI ExceptionFilter(LPEXCEPTION_POINTERS exp)
+{
+   PEXCEPTION_RECORD rec = exp->ExceptionRecord;
+   PCONTEXT ctx = exp->ContextRecord;
+
+   if ( rec->ExceptionCode == EXCEPTION_ILLEGAL_INSTRUCTION ) {
+      _mesa_debug(NULL, "EXCEPTION_ILLEGAL_INSTRUCTION\n" );
+      _mesa_x86_cpu_features &= ~(X86_FEATURE_XMM);
+   } else if ( rec->ExceptionCode == STATUS_FLOAT_MULTIPLE_TRAPS ) {
+      _mesa_debug(NULL, "STATUS_FLOAT_MULTIPLE_TRAPS\n");
+      /* Windows seems to clear the exception flag itself, we just have to increment Eip */
+   } else {
+      /* The format string consumes one argument; the exception code must
+       * actually be passed, and in the type that %x expects.
+       */
+      _mesa_debug(NULL, "UNEXPECTED EXCEPTION (0x%08x), terminating!\n",
+                  (unsigned int) rec->ExceptionCode );
+      return EXCEPTION_EXECUTE_HANDLER;
+   }
+
+   if ( (ctx->ContextFlags & CONTEXT_CONTROL) != CONTEXT_CONTROL ) {
+      _mesa_debug(NULL, "Context does not contain control registers, terminating!\n");
+      return EXCEPTION_EXECUTE_HANDLER;
+   }
+
+   /* NOTE(review): 3 is presumably the encoded length of the probing
+    * XORPS/DIVPS instruction -- confirm against the assembled test code.
+    */
+   ctx->Eip += 3;
+
+   return EXCEPTION_CONTINUE_EXECUTION;
+}
+#endif /* _WIN32 */
+
+
+/**
+ * Check if SSE is supported by the operating system.
+ * If not, turn off the X86_FEATURE_XMM flag in _mesa_x86_cpu_features.
+ *
+ *  - FreeBSD, NetBSD, OpenBSD: the kernel is queried through sysctl
+ *    ("hw.instruction_sse", "machdep.sse" and CTL_MACHDEP/CPU_SSE
+ *    respectively).  A failing query is treated like "not supported".
+ *  - Windows: ExceptionFilter is installed as the unhandled-exception
+ *    filter while _mesa_test_os_sse_support() and
+ *    _mesa_test_os_sse_exception_support() are run; the previous filter
+ *    is restored afterwards.
+ *  - Other platforms: nothing is tested and the flag is left untouched.
+ */
+void _mesa_check_os_sse_support( void )
+{
+#if defined(__FreeBSD__)
+   {
+      int ret, enabled;
+      /* sysctlbyname() takes the length of the result buffer as a
+       * size_t *, exactly as in the NetBSD and OpenBSD branches below.
+       */
+      size_t len = sizeof(enabled);
+      ret = sysctlbyname("hw.instruction_sse", &enabled, &len, NULL, 0);
+      if (ret || !enabled)
+         _mesa_x86_cpu_features &= ~(X86_FEATURE_XMM);
+   }
+#elif defined (__NetBSD__)
+   {
+      int ret, enabled;
+      size_t len = sizeof(enabled);
+      ret = sysctlbyname("machdep.sse", &enabled, &len, (void *)NULL, 0);
+      if (ret || !enabled)
+         _mesa_x86_cpu_features &= ~(X86_FEATURE_XMM);
+   }
+#elif defined(__OpenBSD__)
+   {
+      int mib[2];
+      int ret, enabled;
+      size_t len = sizeof(enabled);
+
+      mib[0] = CTL_MACHDEP;
+      mib[1] = CPU_SSE;
+
+      ret = sysctl(mib, 2, &enabled, &len, NULL, 0);
+      if (ret || !enabled)
+         _mesa_x86_cpu_features &= ~(X86_FEATURE_XMM);
+   }
+#elif defined(_WIN32)
+   LPTOP_LEVEL_EXCEPTION_FILTER oldFilter;
+
+   /* Install our ExceptionFilter */
+   oldFilter = SetUnhandledExceptionFilter( ExceptionFilter );
+
+   /* cpu_has_xmm is re-evaluated after each probe because the filter
+    * clears X86_FEATURE_XMM when the probe raises an illegal-instruction
+    * exception.
+    */
+   if ( cpu_has_xmm ) {
+      _mesa_debug(NULL, "Testing OS support for SSE...\n");
+
+      _mesa_test_os_sse_support();
+
+      if ( cpu_has_xmm ) {
+         _mesa_debug(NULL, "Yes.\n");
+      } else {
+         _mesa_debug(NULL, "No!\n");
+      }
+   }
+
+   if ( cpu_has_xmm ) {
+      _mesa_debug(NULL, "Testing OS support for SSE unmasked exceptions...\n");
+
+      _mesa_test_os_sse_exception_support();
+
+      if ( cpu_has_xmm ) {
+         _mesa_debug(NULL, "Yes.\n");
+      } else {
+         _mesa_debug(NULL, "No!\n");
+      }
+   }
+
+   /* Restore previous exception filter */
+   SetUnhandledExceptionFilter( oldFilter );
+
+   if ( cpu_has_xmm ) {
+      _mesa_debug(NULL, "Tests of OS support for SSE passed.\n");
+   } else {
+      _mesa_debug(NULL, "Tests of OS support for SSE failed!\n");
+   }
+#else
+   /* Do nothing on other platforms for now.
+    */
+   if (detection_debug)
+      _mesa_debug(NULL, "Not testing OS support for SSE, leaving enabled.\n");
+#endif /* __FreeBSD__ */
+}
+
+#endif /* USE_SSE_ASM */
+
+
+/**
+ * Initialize the _mesa_x86_cpu_features bitfield.
+ * This is a no-op if called more than once.
+ *
+ * When USE_X86_ASM is defined the processor is queried with CPUID and an
+ * X86_FEATURE_x bit is set for every feature that is reported by the CPU
+ * and whose code path is compiled in (USE_MMX_ASM, USE_3DNOW_ASM,
+ * USE_SSE_ASM).  The following environment variables are honoured:
+ *
+ *  - MESA_NO_ASM:    leave the bitfield empty.
+ *  - MESA_NO_MMX:    clear X86_FEATURE_MMX.
+ *  - MESA_NO_3DNOW:  clear X86_FEATURE_3DNOW.
+ *  - MESA_NO_SSE:    clear X86_FEATURE_XMM.
+ *  - MESA_FORCE_SSE: skip _mesa_check_os_sse_support().
+ *
+ * Without USE_X86_ASM the function only records that it has been called.
+ */
+void
+_mesa_get_x86_features(void)
+{
+   static int called = 0;
+
+   if (called)
+      return;
+
+   called = 1;
+
+#ifdef USE_X86_ASM
+   _mesa_x86_cpu_features = 0x0;
+
+   if (_mesa_getenv( "MESA_NO_ASM")) {
+      return;
+   }
+
+   if (!_mesa_x86_has_cpuid()) {
+      _mesa_debug(NULL, "CPUID not detected\n");
+   }
+   else {
+      GLuint cpu_features;
+      GLuint cpu_ext_features;
+      GLuint cpu_ext_info;
+      GLuint result;
+      /* CPUID results are stored through GLuint lvalues and read back as
+       * text through the char member, so no misaligned or wrongly typed
+       * pointer into a char array is ever handed to the asm helper.
+       */
+      union {
+         GLuint regs[3];
+         char str[13];
+      } cpu_vendor;
+
+      /* get vendor name: the text is spread over EBX, EDX, ECX (in that
+       * order), hence ECX goes to regs[2] and EDX to regs[1].
+       */
+      _mesa_x86_cpuid(0, &result, &cpu_vendor.regs[0],
+                      &cpu_vendor.regs[2], &cpu_vendor.regs[1]);
+      cpu_vendor.str[12] = '\0';
+
+      if (detection_debug)
+         _mesa_debug(NULL, "CPU vendor: %s\n", cpu_vendor.str);
+
+      /* get cpu features */
+      cpu_features = _mesa_x86_cpuid_edx(1);
+
+      if (cpu_features & X86_CPU_FPU)
+         _mesa_x86_cpu_features |= X86_FEATURE_FPU;
+      if (cpu_features & X86_CPU_CMOV)
+         _mesa_x86_cpu_features |= X86_FEATURE_CMOV;
+
+#ifdef USE_MMX_ASM
+      if (cpu_features & X86_CPU_MMX)
+         _mesa_x86_cpu_features |= X86_FEATURE_MMX;
+#endif
+
+#ifdef USE_SSE_ASM
+      if (cpu_features & X86_CPU_XMM)
+         _mesa_x86_cpu_features |= X86_FEATURE_XMM;
+      if (cpu_features & X86_CPU_XMM2)
+         _mesa_x86_cpu_features |= X86_FEATURE_XMM2;
+#endif
+
+      /* query extended cpu features */
+      if ((cpu_ext_info = _mesa_x86_cpuid_eax(0x80000000)) > 0x80000000) {
+         if (cpu_ext_info >= 0x80000001) {
+
+            cpu_ext_features = _mesa_x86_cpuid_edx(0x80000001);
+
+            /* the 3DNow! and extended MMX bits are only honoured when
+             * the base MMX bit is reported as well
+             */
+            if (cpu_features & X86_CPU_MMX) {
+
+#ifdef USE_3DNOW_ASM
+               if (cpu_ext_features & X86_CPUEXT_3DNOW)
+                  _mesa_x86_cpu_features |= X86_FEATURE_3DNOW;
+               if (cpu_ext_features & X86_CPUEXT_3DNOW_EXT)
+                  _mesa_x86_cpu_features |= X86_FEATURE_3DNOWEXT;
+#endif
+
+#ifdef USE_MMX_ASM
+               if (cpu_ext_features & X86_CPUEXT_MMX_EXT)
+                  _mesa_x86_cpu_features |= X86_FEATURE_MMXEXT;
+#endif
+            }
+         }
+
+         /* query cpu name: the string spans functions 0x80000002 through
+          * 0x80000004, so all three have to be supported before any of
+          * them is executed.
+          */
+         if (cpu_ext_info >= 0x80000004) {
+            GLuint ofs;
+            union {
+               GLuint regs[12];
+               char str[49];
+            } cpu_name;
+
+            for (ofs = 0; ofs < 3; ofs++) {
+               /* each function yields 16 characters in EAX, EBX, ECX, EDX */
+               GLuint *chunk = &cpu_name.regs[4 * ofs];
+               _mesa_x86_cpuid(0x80000002 + ofs,
+                               chunk + 0, chunk + 1, chunk + 2, chunk + 3);
+            }
+            cpu_name.str[48] = '\0'; /* the name should be NULL terminated, but just to be sure */
+
+            if (detection_debug)
+               _mesa_debug(NULL, "CPU name: %s\n", cpu_name.str);
+         }
+      }
+
+   }
+
+#ifdef USE_MMX_ASM
+   if ( cpu_has_mmx ) {
+      if ( _mesa_getenv( "MESA_NO_MMX" ) == 0 ) {
+         if (detection_debug)
+            _mesa_debug(NULL, "MMX cpu detected.\n");
+      } else {
+         _mesa_x86_cpu_features &= ~(X86_FEATURE_MMX);
+      }
+   }
+#endif
+
+#ifdef USE_3DNOW_ASM
+   if ( cpu_has_3dnow ) {
+      if ( _mesa_getenv( "MESA_NO_3DNOW" ) == 0 ) {
+         if (detection_debug)
+            _mesa_debug(NULL, "3DNow! cpu detected.\n");
+      } else {
+         _mesa_x86_cpu_features &= ~(X86_FEATURE_3DNOW);
+      }
+   }
+#endif
+
+#ifdef USE_SSE_ASM
+   if ( cpu_has_xmm ) {
+      if ( _mesa_getenv( "MESA_NO_SSE" ) == 0 ) {
+         if (detection_debug)
+            _mesa_debug(NULL, "SSE cpu detected.\n");
+         if ( _mesa_getenv( "MESA_FORCE_SSE" ) == 0 ) {
+            _mesa_check_os_sse_support();
+         }
+      } else {
+         _mesa_debug(NULL, "SSE cpu detected, but switched off by user.\n");
+         _mesa_x86_cpu_features &= ~(X86_FEATURE_XMM);
+      }
+   }
+#endif
+
+#endif /* USE_X86_ASM */
+
+   /* keeps detection_debug referenced when none of the branches above
+    * that use it are compiled in
+    */
+   (void) detection_debug;
+}
--- a/src/arch/x86/common_x86_asm.S
+++ b/src/arch/x86/common_x86_asm.S
@ -0,0 +1,220 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  6.3
+ *
+ * Copyright (C) 1999-2004  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Check extended CPU capabilities.  Now just returns the raw CPUID
+ * feature information, allowing the higher level code to interpret the
+ * results.
+ *
+ * Written by Holger Waechtler <holger@akaflieg.extern.tu-berlin.de>
+ *
+ * Cleaned up and simplified by Gareth Hughes <gareth@valinux.com>
+ *
+ */
+
+/*
+ * NOTE: Avoid using spaces in between '(' ')' and arguments, especially
+ * with macros like CONST, LLBL that expand to CONCAT(...).  Putting spaces
+ * in there will break the build on some platforms.
+ */
+
+#include "matypes.h"
+#include "assyntax.h"
+#include "common_x86_features.h"
+
+	SEG_TEXT
+
+/*
+ * GLuint _mesa_x86_has_cpuid(void)
+ *
+ * Returns 1 in EAX when the CPUID instruction is available and 0 when it
+ * is not.  The test toggles the ID flag (bit 21) of EFLAGS and checks
+ * whether the change sticks.  Clobbers EAX and ECX.
+ */
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_x86_has_cpuid)
+HIDDEN(_mesa_x86_has_cpuid)
+GLNAME(_mesa_x86_has_cpuid):
+
+	/* Test for the CPUID command.  If the ID Flag bit in EFLAGS
+	 * (bit 21) is writable, the CPUID command is present */
+	PUSHF_L
+	POP_L	(EAX)
+	MOV_L	(EAX, ECX)			/* ECX = original EFLAGS */
+	XOR_L	(CONST(0x00200000), EAX)	/* flip the ID bit */
+	PUSH_L	(EAX)
+	POPF_L
+	PUSHF_L
+	POP_L	(EAX)				/* EAX = EFLAGS as re-read */
+
+	/* Verify the ID Flag bit has been written. */
+	CMP_L	(ECX, EAX)
+	SETNE	(AL)				/* AL = 1 if the bit changed */
+	/* SETNE only writes AL; the upper 24 bits of EAX still hold
+	 * EFLAGS.  Mask them off so the result is exactly 0 or 1.  (An
+	 * XOR with 0xff here would yield a non-zero value in both cases.)
+	 */
+	AND_L	(CONST(0xff), EAX)
+
+	RET
+
+/*
+ * _mesa_x86_cpuid
+ *
+ * Executes CPUID for the function number found at 4(ESP) on entry and
+ * stores the resulting EAX, EBX, ECX and EDX through the four pointers
+ * that follow it on the stack.  EDI and EBX are saved and restored;
+ * because of those two pushes the pointer arguments are addressed at
+ * 16..28(ESP).
+ */
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_x86_cpuid)
+HIDDEN(_mesa_x86_cpuid)
+GLNAME(_mesa_x86_cpuid):
+
+	MOV_L	(REGOFF(4, ESP), EAX)		/* cpuid op */
+	PUSH_L	(EDI)
+	PUSH_L	(EBX)
+
+	CPUID
+
+	MOV_L	(REGOFF(16, ESP), EDI)	/* *eax */
+	MOV_L	(EAX, REGIND(EDI))
+	MOV_L	(REGOFF(20, ESP), EDI)	/* *ebx */
+	MOV_L	(EBX, REGIND(EDI))
+	MOV_L	(REGOFF(24, ESP), EDI)	/* *ecx */
+	MOV_L	(ECX, REGIND(EDI))
+	MOV_L	(REGOFF(28, ESP), EDI)	/* *edx */
+	MOV_L	(EDX, REGIND(EDI))
+
+	POP_L	(EBX)
+	POP_L	(EDI)
+	RET
+/* Executes CPUID for the function number at 4(ESP) and returns the resulting EAX.  EBX is saved and restored. */
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_x86_cpuid_eax)
+HIDDEN(_mesa_x86_cpuid_eax)
+GLNAME(_mesa_x86_cpuid_eax):
+
+	MOV_L	(REGOFF(4, ESP), EAX)		/* cpuid op */
+	PUSH_L	(EBX)
+
+	CPUID
+
+	POP_L	(EBX)
+	RET
+/* Executes CPUID for the function number at 4(ESP) and returns the resulting EBX in EAX.  The caller's EBX is saved and restored. */
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_x86_cpuid_ebx)
+HIDDEN(_mesa_x86_cpuid_ebx)
+GLNAME(_mesa_x86_cpuid_ebx):
+
+	MOV_L	(REGOFF(4, ESP), EAX)		/* cpuid op */
+	PUSH_L	(EBX)
+
+	CPUID
+	MOV_L	(EBX, EAX)			/* return EBX */
+
+	POP_L	(EBX)
+	RET
+/* Executes CPUID for the function number at 4(ESP) and returns the resulting ECX in EAX.  EBX is saved and restored. */
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_x86_cpuid_ecx)
+HIDDEN(_mesa_x86_cpuid_ecx)
+GLNAME(_mesa_x86_cpuid_ecx):
+
+	MOV_L	(REGOFF(4, ESP), EAX)		/* cpuid op */
+	PUSH_L	(EBX)
+
+	CPUID
+	MOV_L	(ECX, EAX)			/* return ECX */
+
+	POP_L	(EBX)
+	RET
+/* Executes CPUID for the function number at 4(ESP) and returns the resulting EDX in EAX.  EBX is saved and restored. */
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_x86_cpuid_edx)
+HIDDEN(_mesa_x86_cpuid_edx)
+GLNAME(_mesa_x86_cpuid_edx):
+
+	MOV_L	(REGOFF(4, ESP), EAX)		/* cpuid op */
+	PUSH_L	(EBX)
+
+	CPUID
+	MOV_L	(EDX, EAX)			/* return EDX */
+
+	POP_L	(EBX)
+	RET
+
+#ifdef USE_SSE_ASM
+/* Execute an SSE instruction to see if the operating system correctly
+ * supports SSE.  A signal handler for SIGILL should have been set
+ * before calling this function, otherwise this could kill the client
+ * application.
+ *
+ *        -----> !!!! ATTENTION DEVELOPERS !!!! <-----
+ *
+ * If you're debugging with gdb and you get stopped in this function,
+ * just type 'continue'!  Execution will proceed normally.
+ * See freedesktop.org bug #1709 for more info.
+ *
+ * The function takes no arguments and returns nothing: the outcome is
+ * observed only through whether the XORPS below raises an exception.
+ * XMM0 is zeroed as a side effect.
+ */
+ALIGNTEXT4
+GLOBL GLNAME( _mesa_test_os_sse_support )
+HIDDEN(_mesa_test_os_sse_support)
+GLNAME( _mesa_test_os_sse_support ):
+
+	XORPS	( XMM0, XMM0 )
+
+	RET
+
+
+/* Perform an SSE divide-by-zero to see if the operating system
+ * correctly supports unmasked SIMD FPU exceptions.  Signal handlers for
+ * SIGILL and SIGFPE should have been set before calling this function,
+ * otherwise this could kill the client application.
+ *
+ * An 8-byte scratch area below EBP holds two copies of MXCSR: the saved
+ * original at -4(EBP) and the modified working copy at -8(EBP).  The
+ * 16 bytes pushed as the dividend are not popped explicitly; LEAVE
+ * restores ESP from EBP and thereby discards them.
+ */
+ALIGNTEXT4
+GLOBL GLNAME( _mesa_test_os_sse_exception_support )
+HIDDEN(_mesa_test_os_sse_exception_support)
+GLNAME( _mesa_test_os_sse_exception_support ):
+
+	PUSH_L	( EBP )
+	MOV_L	( ESP, EBP )
+	SUB_L	( CONST( 8 ), ESP )
+
+	/* Save the original MXCSR register value.
+	 */
+	STMXCSR	( REGOFF( -4, EBP ) )
+
+	/* Unmask the divide-by-zero exception and perform one.
+	 * The AND mask 0xfffffdff clears bit 9 of the MXCSR copy.
+	 */
+	STMXCSR	( REGOFF( -8, EBP ) )
+	AND_L	( CONST( 0xfffffdff ), REGOFF( -8, EBP ) )
+	LDMXCSR	( REGOFF( -8, EBP ) )
+
+	XORPS	( XMM0, XMM0 )			/* divisor: four zeros */
+
+	PUSH_L	( CONST( 0x3f800000 ) )		/* 0x3f800000 is 1.0f */
+	PUSH_L	( CONST( 0x3f800000 ) )
+	PUSH_L	( CONST( 0x3f800000 ) )
+	PUSH_L	( CONST( 0x3f800000 ) )
+
+	MOVUPS	( REGIND( ESP ), XMM1 )		/* dividend: four 1.0f values */
+
+	DIVPS	( XMM0, XMM1 )			/* XMM1 = XMM1 / XMM0 */
+
+	/* Restore the original MXCSR register value.
+	 */
+	LDMXCSR	( REGOFF( -4, EBP ) )
+
+	LEAVE
+	RET
+
+#endif
+
+	
+#if defined (__ELF__) && defined (__linux__)
+	.section .note.GNU-stack,"",%progbits
+#endif
--- a/src/arch/x86/common_x86_asm.h
+++ b/src/arch/x86/common_x86_asm.h
@ -0,0 +1,53 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version:  3.5
+ *
+ * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Check CPU capabilities & initialize optimized functions for this particular
+ * processor.
+ *
+ * Written by Holger Waechtler <holger@akaflieg.extern.tu-berlin.de>
+ * Changed by Andre Werthmann <wertmann@cs.uni-potsdam.de> for using the
+ * new SSE functions
+ *
+ * Reimplemented by Gareth Hughes in a more
+ * future-proof manner, based on code in the Linux kernel.
+ */
+
+#ifndef __COMMON_X86_ASM_H__
+#define __COMMON_X86_ASM_H__
+
+/* Do not reference mtypes.h from this file.
+ */
+#include "common_x86_features.h"
+/* Bitmask of X86_FEATURE_x bits; tested through the cpu_has_* macros. */
+extern int _mesa_x86_cpu_features;
+/* Initializes _mesa_x86_cpu_features; a no-op when called more than once. */
+extern void _mesa_get_x86_features(void);
+/* Clears X86_FEATURE_XMM in _mesa_x86_cpu_features if the OS does not support SSE. */
+extern void _mesa_check_os_sse_support(void);
+/* Defined outside this header; NOTE(review): presumably installs the x86 transform routines -- confirm at the definition. */
+extern void _mesa_init_all_x86_transform_asm( void );
+
+#endif
--- a/src/arch/x86/common_x86_features.h
+++ b/src/arch/x86/common_x86_features.h
@ -0,0 +1,67 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version:  5.1
+ *
+ * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * x86 CPUID feature information.  The feature bits are collected in
+ * _mesa_x86_cpu_features by _mesa_get_x86_features() and interpreted with
+ * the cpu_has_* helper macros.
+ *
+ * Gareth Hughes
+ */
+
+#ifndef __COMMON_X86_FEATURES_H__
+#define __COMMON_X86_FEATURES_H__
+/* Bits of the _mesa_x86_cpu_features bitmask (see the cpu_has_* macros at the end of this file). */
+#define X86_FEATURE_FPU		(1<<0)
+#define X86_FEATURE_CMOV	(1<<1)
+#define X86_FEATURE_MMXEXT	(1<<2)
+#define X86_FEATURE_MMX		(1<<3)
+#define X86_FEATURE_FXSR	(1<<4)
+#define X86_FEATURE_XMM		(1<<5)
+#define X86_FEATURE_XMM2	(1<<6)
+#define X86_FEATURE_3DNOWEXT	(1<<7)
+#define X86_FEATURE_3DNOW	(1<<8)
+
+/* standard X86 CPU features
+ * (bit positions in the EDX value returned by CPUID function 1) */
+#define X86_CPU_FPU		(1<<0)
+#define X86_CPU_CMOV		(1<<15)
+#define X86_CPU_MMX		(1<<23)
+#define X86_CPU_XMM		(1<<25)
+#define X86_CPU_XMM2		(1<<26)
+
+/* extended X86 CPU features
+ * (bit positions in the EDX value returned by CPUID function 0x80000001)
+ *
+ * NOTE(review): (1<<31) shifts into the sign bit of a signed int when
+ * evaluated in C.  An unsigned suffix would avoid that, but this header
+ * is also included from assembly sources -- confirm before changing. */
+#define X86_CPUEXT_MMX_EXT	(1<<22)
+#define X86_CPUEXT_3DNOW_EXT	(1<<30)
+#define X86_CPUEXT_3DNOW	(1<<31)
+/* Non-zero when the corresponding X86_FEATURE_x bit is set in _mesa_x86_cpu_features. */
+#define cpu_has_mmx		(_mesa_x86_cpu_features & X86_FEATURE_MMX)
+#define cpu_has_mmxext		(_mesa_x86_cpu_features & X86_FEATURE_MMXEXT)
+#define cpu_has_xmm		(_mesa_x86_cpu_features & X86_FEATURE_XMM)
+#define cpu_has_xmm2		(_mesa_x86_cpu_features & X86_FEATURE_XMM2)
+#define cpu_has_3dnow		(_mesa_x86_cpu_features & X86_FEATURE_3DNOW)
+#define cpu_has_3dnowext	(_mesa_x86_cpu_features & X86_FEATURE_3DNOWEXT)
+
+#endif
+
--- a/src/arch/x86/gen_matypes.c
+++ b/src/arch/x86/gen_matypes.c
@ -0,0 +1,240 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  6.5.1
+ *
+ * Copyright (C) 1999-2006  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Gareth Hughes
+ */
+
+/*
+ * This generates an asm version of mtypes.h (called matypes.h), so that
+ * Mesa's x86 assembly code can access the internal structures easily.
+ * This will be particularly useful when developing new x86 asm code for
+ * Mesa, including lighting, clipping, texture image conversion etc.
+ */
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+#include <inttypes.h>
+
+#include "main/glheader.h"
+#include "main/mtypes.h"
+#include "tnl/t_context.h"
+
+
+/* Replace any offsetof already in scope with the null-pointer form: the
+ * byte offset of `member` is taken from the address that member would have
+ * in a `type` object located at address 0.
+ * NOTE(review): why the toolchain's own offsetof is overridden is not visible
+ * in this file -- confirm before switching to the <stddef.h> definition.
+ */
+#undef offsetof
+#define offsetof( type, member ) ((size_t) &((type *)0)->member)
+
+
+/* Print the banner that opens an "Offsets for <x>" section of the generated
+ * header: two blank lines, a three-line C comment naming x, then one blank
+ * line.  x must be a C string.
+ */
+#define OFFSET_HEADER( x )						\
+do {									\
+   printf( "\n\n"							\
+	   "/* ====================================================="	\
+	   "========\n"						\
+	   " * Offsets for %s\n"					\
+	   " */\n"							\
+	   "\n", x );							\
+} while (0)
+
+/* Print the banner that opens a "Flags for <x>" section of the generated
+ * header: one blank line, a three-line C comment naming x, then one blank
+ * line.  x must be a C string.
+ */
+#define DEFINE_HEADER( x )						\
+do {									\
+   printf( "\n"								\
+	   "/*\n"							\
+	   " * Flags for %s\n"						\
+	   " */\n"							\
+	   "\n", x );							\
+} while (0)
+
+/* The three emitters below expand to a single printf expression with no
+ * trailing semicolon; the caller supplies it, so they behave like ordinary
+ * function calls (including inside if/else).
+ */
+
+/* Emit "#define <s>\t<n>" where n is the byte offset, in decimal, of member
+ * m within type t.  s is the macro name as a C string.
+ */
+#define OFFSET( s, t, m )						\
+   printf( "#define %s\t%lu\n", (s), (unsigned long) offsetof( t, m ) )
+
+/* Emit "#define <s>\t<n>" where n is sizeof(t) in decimal. */
+#define SIZEOF( s, t )							\
+   printf( "#define %s\t%lu\n", (s), (unsigned long) sizeof(t) )
+
+/* Emit "#define <s>\t0x<v>" where v is d converted to uint64_t, in hex.
+ * d is parenthesized so the cast covers the whole argument expression and
+ * not only its first operand.
+ */
+#define DEFINE( s, d )							\
+   printf( "#define %s\t0x%" PRIx64 "\n", (s), (uint64_t) (d) )
+
+
+
+/* Write the generated assembly-types header to stdout: an include guard
+ * (__ASM_TYPES_H__) wrapping one #define per structure offset, size or flag
+ * value listed below.  argc and argv are not used.  Always returns 0.
+ */
+int main( int argc, char **argv )
+{
+   printf( "/*\n" );
+   printf( " * This file is automatically generated from the Mesa internal type\n" );
+   printf( " * definitions.  Do not edit directly.\n" );
+   printf( " */\n" );
+   printf( "\n" );
+   printf( "#ifndef __ASM_TYPES_H__\n" );
+   printf( "#define __ASM_TYPES_H__\n" );
+   printf( "\n" );
+
+
+   /* struct gl_context offsets:
+    */
+   OFFSET_HEADER( "struct gl_context" );
+
+   printf( "\n" );
+   OFFSET( "CTX_LIGHT_ENABLED           ", struct gl_context, Light.Enabled );
+   OFFSET( "CTX_LIGHT_SHADE_MODEL       ", struct gl_context, Light.ShadeModel );
+   OFFSET( "CTX_LIGHT_COLOR_MAT_FACE    ", struct gl_context, Light.ColorMaterialFace );
+   OFFSET( "CTX_LIGHT_COLOR_MAT_MODE    ", struct gl_context, Light.ColorMaterialMode );
+   OFFSET( "CTX_LIGHT_COLOR_MAT_MASK    ", struct gl_context, Light._ColorMaterialBitmask );
+   OFFSET( "CTX_LIGHT_COLOR_MAT_ENABLED ", struct gl_context, Light.ColorMaterialEnabled );
+   OFFSET( "CTX_LIGHT_ENABLED_LIST      ", struct gl_context, Light.EnabledList );
+   OFFSET( "CTX_LIGHT_NEED_VERTS        ", struct gl_context, Light._NeedVertices );
+   OFFSET( "CTX_LIGHT_BASE_COLOR        ", struct gl_context, Light._BaseColor );
+
+
+   /* struct vertex_buffer offsets:
+    */
+   OFFSET_HEADER( "struct vertex_buffer" );
+
+   OFFSET( "VB_SIZE                ", struct vertex_buffer, Size );
+   OFFSET( "VB_COUNT               ", struct vertex_buffer, Count );
+   printf( "\n" );
+   OFFSET( "VB_ELTS                ", struct vertex_buffer, Elts );
+   OFFSET( "VB_OBJ_PTR             ", struct vertex_buffer, AttribPtr[_TNL_ATTRIB_POS] );
+   OFFSET( "VB_EYE_PTR             ", struct vertex_buffer, EyePtr );
+   OFFSET( "VB_CLIP_PTR            ", struct vertex_buffer, ClipPtr );
+   OFFSET( "VB_PROJ_CLIP_PTR       ", struct vertex_buffer, NdcPtr );
+   OFFSET( "VB_CLIP_OR_MASK        ", struct vertex_buffer, ClipOrMask );
+   OFFSET( "VB_CLIP_MASK           ", struct vertex_buffer, ClipMask );
+   OFFSET( "VB_NORMAL_PTR          ", struct vertex_buffer, AttribPtr[_TNL_ATTRIB_NORMAL] );
+   OFFSET( "VB_EDGE_FLAG           ", struct vertex_buffer, EdgeFlag );
+   OFFSET( "VB_TEX0_COORD_PTR      ", struct vertex_buffer, AttribPtr[_TNL_ATTRIB_TEX0] );
+   OFFSET( "VB_TEX1_COORD_PTR      ", struct vertex_buffer, AttribPtr[_TNL_ATTRIB_TEX1] );
+   OFFSET( "VB_TEX2_COORD_PTR      ", struct vertex_buffer, AttribPtr[_TNL_ATTRIB_TEX2] );
+   OFFSET( "VB_TEX3_COORD_PTR      ", struct vertex_buffer, AttribPtr[_TNL_ATTRIB_TEX3] );
+   OFFSET( "VB_INDEX_PTR           ", struct vertex_buffer, AttribPtr[_TNL_ATTRIB_COLOR_INDEX] );
+   OFFSET( "VB_COLOR_PTR           ", struct vertex_buffer, AttribPtr[_TNL_ATTRIB_COLOR0] );
+   OFFSET( "VB_SECONDARY_COLOR_PTR ", struct vertex_buffer, AttribPtr[_TNL_ATTRIB_COLOR1] );
+   OFFSET( "VB_FOG_COORD_PTR       ", struct vertex_buffer, AttribPtr[_TNL_ATTRIB_FOG] );
+   OFFSET( "VB_PRIMITIVE           ", struct vertex_buffer, Primitive );
+   printf( "\n" );
+
+   DEFINE_HEADER( "struct vertex_buffer" );
+
+   /* XXX use new labels here someday after vertex program is done */
+   DEFINE( "VERT_BIT_OBJ           ", VERT_BIT_POS );
+   DEFINE( "VERT_BIT_NORM          ", VERT_BIT_NORMAL );
+   DEFINE( "VERT_BIT_RGBA          ", VERT_BIT_COLOR0 );
+   DEFINE( "VERT_BIT_SPEC_RGB      ", VERT_BIT_COLOR1 );
+   DEFINE( "VERT_BIT_FOG_COORD     ", VERT_BIT_FOG );
+   DEFINE( "VERT_BIT_TEX0          ", VERT_BIT_TEX0 );
+   DEFINE( "VERT_BIT_TEX1          ", VERT_BIT_TEX1 );
+   DEFINE( "VERT_BIT_TEX2          ", VERT_BIT_TEX2 );
+   DEFINE( "VERT_BIT_TEX3          ", VERT_BIT_TEX3 );
+
+
+   /* GLvector4f offsets:
+    */
+   OFFSET_HEADER( "GLvector4f" );
+
+   OFFSET( "V4F_DATA          ", GLvector4f, data );
+   OFFSET( "V4F_START         ", GLvector4f, start );
+   OFFSET( "V4F_COUNT         ", GLvector4f, count );
+   OFFSET( "V4F_STRIDE        ", GLvector4f, stride );
+   OFFSET( "V4F_SIZE          ", GLvector4f, size );
+   OFFSET( "V4F_FLAGS         ", GLvector4f, flags );
+
+   DEFINE_HEADER( "GLvector4f" );
+
+   DEFINE( "VEC_MALLOC        ", VEC_MALLOC );
+   DEFINE( "VEC_NOT_WRITEABLE ", VEC_NOT_WRITEABLE );
+   DEFINE( "VEC_BAD_STRIDE    ", VEC_BAD_STRIDE );
+   printf( "\n" );
+   DEFINE( "VEC_SIZE_1        ", VEC_SIZE_1 );
+   DEFINE( "VEC_SIZE_2        ", VEC_SIZE_2 );
+   DEFINE( "VEC_SIZE_3        ", VEC_SIZE_3 );
+   DEFINE( "VEC_SIZE_4        ", VEC_SIZE_4 );
+
+
+   /* GLmatrix offsets:
+    */
+   OFFSET_HEADER( "GLmatrix" );
+
+   OFFSET( "MATRIX_DATA   ", GLmatrix, m );
+   OFFSET( "MATRIX_INV    ", GLmatrix, inv );
+   OFFSET( "MATRIX_FLAGS  ", GLmatrix, flags );
+   OFFSET( "MATRIX_TYPE   ", GLmatrix, type );
+
+
+   /* struct gl_light offsets:
+    */
+   OFFSET_HEADER( "struct gl_light" );
+
+   OFFSET( "LIGHT_NEXT              ", struct gl_light, next );
+   OFFSET( "LIGHT_PREV              ", struct gl_light, prev );
+   printf( "\n" );
+   OFFSET( "LIGHT_AMBIENT           ", struct gl_light, Ambient );
+   OFFSET( "LIGHT_DIFFUSE           ", struct gl_light, Diffuse );
+   OFFSET( "LIGHT_SPECULAR          ", struct gl_light, Specular );
+   OFFSET( "LIGHT_EYE_POSITION      ", struct gl_light, EyePosition );
+   OFFSET( "LIGHT_SPOT_DIRECTION    ", struct gl_light, SpotDirection );
+   OFFSET( "LIGHT_SPOT_EXPONENT     ", struct gl_light, SpotExponent );
+   OFFSET( "LIGHT_SPOT_CUTOFF       ", struct gl_light, SpotCutoff );
+   OFFSET( "LIGHT_COS_CUTOFF        ", struct gl_light, _CosCutoff );
+   OFFSET( "LIGHT_CONST_ATTEN       ", struct gl_light, ConstantAttenuation );
+   OFFSET( "LIGHT_LINEAR_ATTEN      ", struct gl_light, LinearAttenuation );
+   OFFSET( "LIGHT_QUADRATIC_ATTEN   ", struct gl_light, QuadraticAttenuation );
+   OFFSET( "LIGHT_ENABLED           ", struct gl_light, Enabled );
+   printf( "\n" );
+   OFFSET( "LIGHT_FLAGS             ", struct gl_light, _Flags );
+   printf( "\n" );
+   OFFSET( "LIGHT_POSITION          ", struct gl_light, _Position );
+   OFFSET( "LIGHT_VP_INF_NORM       ", struct gl_light, _VP_inf_norm );
+   OFFSET( "LIGHT_H_INF_NORM        ", struct gl_light, _h_inf_norm );
+   OFFSET( "LIGHT_NORM_DIRECTION    ", struct gl_light, _NormSpotDirection );
+   OFFSET( "LIGHT_VP_INF_SPOT_ATTEN ", struct gl_light, _VP_inf_spot_attenuation );
+   printf( "\n" );
+   OFFSET( "LIGHT_MAT_AMBIENT       ", struct gl_light, _MatAmbient );
+   OFFSET( "LIGHT_MAT_DIFFUSE       ", struct gl_light, _MatDiffuse );
+   OFFSET( "LIGHT_MAT_SPECULAR      ", struct gl_light, _MatSpecular );
+   printf( "\n" );
+   SIZEOF( "SIZEOF_GL_LIGHT         ", struct gl_light );
+
+   DEFINE_HEADER( "struct gl_light" );
+
+   DEFINE( "LIGHT_SPOT              ", LIGHT_SPOT );
+   DEFINE( "LIGHT_LOCAL_VIEWER      ", LIGHT_LOCAL_VIEWER );
+   DEFINE( "LIGHT_POSITIONAL        ", LIGHT_POSITIONAL );
+   printf( "\n" );
+   DEFINE( "LIGHT_NEED_VERTICES     ", LIGHT_NEED_VERTICES );
+
+
+   /* struct gl_lightmodel offsets:
+    */
+   OFFSET_HEADER( "struct gl_lightmodel" );
+
+   OFFSET( "LIGHT_MODEL_AMBIENT       ", struct gl_lightmodel, Ambient );
+   OFFSET( "LIGHT_MODEL_LOCAL_VIEWER  ", struct gl_lightmodel, LocalViewer );
+   OFFSET( "LIGHT_MODEL_TWO_SIDE      ", struct gl_lightmodel, TwoSide );
+   OFFSET( "LIGHT_MODEL_COLOR_CONTROL ", struct gl_lightmodel, ColorControl );
+
+
+   printf( "\n" );
+   printf( "\n" );
+   printf( "#endif /* __ASM_TYPES_H__ */\n" );
+
+   return 0;
+}
--- a/src/arch/x86/mmx.h
+++ b/src/arch/x86/mmx.h
@ -0,0 +1,59 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  6.5.2
+ *
+ * Copyright (C) 1999-2006  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+#ifndef ASM_MMX_H
+#define ASM_MMX_H
+
+#include "main/compiler.h"
+#include "main/glheader.h"
+
+struct gl_context;
+
+/*
+ * MMX blend routines, generated in mmx_blend.S from the mmx_blendtmp.h
+ * template.  Each one walks n four-byte pixels, combines rgba with dest and
+ * writes the result back into rgba.  mask is used to skip work: a pixel (or,
+ * in the two-pixel main loop, a pair of pixels) is left untouched when all
+ * of its mask bytes are zero.  The assembly loads only n, mask, rgba and
+ * dest; it never reads ctx or chanType.
+ */
+
+/* rgba = dest + (rgba - dest) * alpha / 255 (approximated), where alpha is
+ * the fourth byte of each rgba pixel.
+ */
+extern void _ASMAPI
+_mesa_mmx_blend_transparency( struct gl_context *ctx, GLuint n, const GLubyte mask[],
+                              GLvoid *rgba, const GLvoid *dest,
+                              GLenum chanType );
+
+/* rgba = rgba + dest, per byte, with unsigned saturation. */
+extern void _ASMAPI
+_mesa_mmx_blend_add( struct gl_context *ctx, GLuint n, const GLubyte mask[],
+                     GLvoid *rgba, const GLvoid *dest,
+                     GLenum chanType );
+
+/* rgba = min(rgba, dest), per unsigned byte. */
+extern void _ASMAPI
+_mesa_mmx_blend_min( struct gl_context *ctx, GLuint n, const GLubyte mask[],
+                     GLvoid *rgba, const GLvoid *dest,
+                     GLenum chanType );
+
+/* rgba = max(rgba, dest), per unsigned byte. */
+extern void _ASMAPI
+_mesa_mmx_blend_max( struct gl_context *ctx, GLuint n, const GLubyte mask[],
+                     GLvoid *rgba, const GLvoid *dest,
+                     GLenum chanType );
+
+/* rgba = rgba * dest / 255, per byte, using the rounded geometric-series
+ * division.
+ */
+extern void _ASMAPI
+_mesa_mmx_blend_modulate( struct gl_context *ctx, GLuint n, const GLubyte mask[],
+                          GLvoid *rgba, const GLvoid *dest,
+                          GLenum chanType );
+
+#endif
--- a/src/arch/x86/mmx_blend.S
+++ b/src/arch/x86/mmx_blend.S
@ -0,0 +1,402 @@
+	;
+/*
+ * Written by José Fonseca <j_r_fonseca@yahoo.co.uk>
+ */
+
+
+#ifdef USE_MMX_ASM
+#include "assyntax.h"
+#include "matypes.h"
+
+/* integer multiplication - alpha plus one
+ *
+ * makes the following approximation to the division (Sree)
+ *
+ *   rgb*a/255 ~= (rgb*(a+1)) >> 8
+ *
+ * which is the fastest method that satisfies the following OpenGL criteria
+ *
+ *   0*0 = 0 and 255*255 = 255
+ *
+ * note that MX1 is a register with 0xffffffffffffffff constant which can be easily obtained making
+ *
+ *   PCMPEQW    ( MX1, MX1 )
+ */
+#define GMB_MULT_AP1( MP1, MA1, MP2, MA2, MX1 ) \
+    PSUBW      ( MX1, MA1 )			/*   a1 + 1  |   a1 + 1  |   a1 + 1  |   a1 + 1  */	;\
+    PMULLW     ( MP1, MA1 )			/*                  t1 = p1*a1                   */	;\
+													;\
+TWO(PSUBW      ( MX1, MA2 ))			/*   a2 + 1  |   a2 + 1  |   a2 + 1  |   a2 + 1  */	;\
+TWO(PMULLW     ( MP2, MA2 ))			/*                  t2 = p2*a2                   */	;\
+													;\
+    PSRLW      ( CONST(8), MA1 )		/*               t1 >> 8 ~= t1/255               */	;\
+TWO(PSRLW      ( CONST(8), MA2 ))		/*               t2 >> 8 ~= t2/255               */	
+
+
+/* integer multiplication - geometric series
+ *
+ * takes the geometric series approximation to the division
+ *
+ *   t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
+ *
+ * in this case just the first two terms to fit in 16bit arithmetic
+ *
+ *   t/255 ~= (t + (t >> 8)) >> 8
+ *
+ * note that by itself it doesn't satisfy the OpenGL criteria, as 255*255 = 254,
+ * so the special case a = 255 must be accounted for or roundoff must be used
+ */
+#define GMB_MULT_GS( MP1, MA1, MP2, MA2 ) \
+    PMULLW     ( MP1, MA1 )			/*                  t1 = p1*a1                   */	;\
+TWO(PMULLW     ( MP2, MA2 ))			/*                  t2 = p2*a2                   */	;\
+													;\
+    MOVQ       ( MA1, MP1 )										;\
+    PSRLW      ( CONST(8), MA1 )		/*                    t1 >> 8                    */	;\
+													;\
+TWO(MOVQ       ( MA2, MP2 ))										;\
+TWO(PSRLW      ( CONST(8), MA2 ))		/*                    t2 >> 8                    */	;\
+													;\
+    PADDW      ( MP1, MA1 )			/*        t1 + (t1 >> 8) ~= (t1/255) << 8        */	;\
+    PSRLW      ( CONST(8), MA1 )		/*    sa1    |    sb1    |    sg1    |    sr1    */	;\
+													;\
+TWO(PADDW      ( MP2, MA2 ))			/*        t2 + (t2 >> 8) ~= (t2/255) << 8        */	;\
+TWO(PSRLW      ( CONST(8), MA2 ))		/*    sa2    |    sb2    |    sg2    |    sr2    */
+
+
+/* integer multiplication - geometric series plus rounding
+ *
+ * when using a geometric series division instead of truncating the result 
+ * use roundoff in the approximation (Jim Blinn)
+ *
+ *   t = rgb*a + 0x80
+ *
+ * achieving the exact results
+ *
+ * note that M80 is a register with the 0x0080008000800080 constant
+ */
+#define GMB_MULT_GSR( MP1, MA1, MP2, MA2, M80 ) \
+    PMULLW     ( MP1, MA1 )			/*                  t1 = p1*a1                   */	;\
+    PADDW      ( M80, MA1 )			/*                 t1 += 0x80                    */	;\
+													;\
+TWO(PMULLW     ( MP2, MA2 ))			/*                  t2 = p2*a2                   */	;\
+TWO(PADDW      ( M80, MA2 ))			/*                 t2 += 0x80                    */	;\
+													;\
+    MOVQ       ( MA1, MP1 )										;\
+    PSRLW      ( CONST(8), MA1 )		/*                    t1 >> 8                    */	;\
+													;\
+TWO(MOVQ       ( MA2, MP2 ))										;\
+TWO(PSRLW      ( CONST(8), MA2 ))		/*                    t2 >> 8                    */	;\
+													;\
+    PADDW      ( MP1, MA1 )			/*        t1 + (t1 >> 8) ~= (t1/255) << 8        */	;\
+    PSRLW      ( CONST(8), MA1 )		/*    sa1    |    sb1    |    sg1    |    sr1    */	;\
+													;\
+TWO(PADDW      ( MP2, MA2 ))			/*        t2 + (t2 >> 8) ~= (t2/255) << 8        */	;\
+TWO(PSRLW      ( CONST(8), MA2 ))		/*    sa2    |    sb2    |    sg2    |    sr2    */
+
+
+/* linear interpolation - geometric series 
+ */
+/* Operands are written (source, destination).  On entry MP* hold p, MQ* hold
+ * q and MA* hold the per-channel weight a, all as 16-bit words; the result
+ * q + (p - q)*a/255 (truncated approximation) is left in MA*.  MP* and MQ*
+ * are overwritten.
+ */
+#define GMB_LERP_GS( MP1, MQ1, MA1, MP2, MQ2, MA2) \
+    PSUBW      ( MQ1, MP1 )                     /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */	;\
+    PSLLW      ( CONST(8), MQ1 )		/*                    q1 << 8                    */	;\
+    PMULLW     ( MP1, MA1 )			/*              t1 = (p1 - q1)*pa1               */	;\
+													;\
+TWO(PSUBW      ( MQ2, MP2 ))                    /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */	;\
+TWO(PSLLW      ( CONST(8), MQ2 ))		/*                    q2 << 8                    */	;\
+TWO(PMULLW     ( MP2, MA2 ))			/*              t2 = (p2 - q2)*pa2               */	;\
+													;\
+    MOVQ       ( MA1, MP1 )										;\
+    PSRLW      ( CONST(8), MA1 )		/*                    t1 >> 8                    */	;\
+													;\
+TWO(MOVQ       ( MA2, MP2 ))										;\
+TWO(PSRLW      ( CONST(8), MA2 ))		/*                    t2 >> 8                    */	;\
+													;\
+    PADDW      ( MP1, MA1 )			/*        t1 + (t1 >> 8) ~= (t1/255) << 8        */	;\
+TWO(PADDW      ( MP2, MA2 ))			/*        t2 + (t2 >> 8) ~= (t2/255) << 8        */	;\
+													;\
+    PADDW      ( MQ1, MA1 )			/*              (t1/255 + q1) << 8               */	;\
+TWO(PADDW      ( MQ2, MA2 ))			/*              (t2/255 + q2) << 8               */	;\
+													;\
+    PSRLW      ( CONST(8), MA1 )		/*    sa1    |    sb1    |    sg1    |    sr1    */	;\
+TWO(PSRLW      ( CONST(8), MA2 ))		/*    sa2    |    sb2    |    sg2    |    sr2    */
+
+
+/* linear interpolation - geometric series with roundoff
+ *
+ * this is a generalization of Blinn's formula to signed arithmetic
+ *
+ * note that M80 is a register with the 0x0080008000800080 constant
+ */
+/* Operands are written (source, destination).  On entry MP* hold p, MQ* hold
+ * q, MA* hold the per-channel weight a and M80 holds 0x0080 in every word;
+ * the result is left in MA*.  MP* and MQ* are overwritten.  The sign bit of
+ * (p - q) is used to subtract 0x100 from t when q > p, before rounding.
+ */
+#define GMB_LERP_GSR( MP1, MQ1, MA1, MP2, MQ2, MA2, M80) \
+    PSUBW      ( MQ1, MP1 )                     /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */	;\
+    PSLLW      ( CONST(8), MQ1 )		/*                    q1 << 8                    */	;\
+    PMULLW     ( MP1, MA1 )			/*              t1 = (p1 - q1)*pa1               */	;\
+													;\
+TWO(PSUBW      ( MQ2, MP2 ))                    /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */	;\
+TWO(PSLLW      ( CONST(8), MQ2 ))		/*                    q2 << 8                    */	;\
+TWO(PMULLW     ( MP2, MA2 ))			/*              t2 = (p2 - q2)*pa2               */	;\
+													;\
+    PSRLW      ( CONST(15), MP1 )		/*                 q1 > p1 ? 1 : 0               */	;\
+TWO(PSRLW      ( CONST(15), MP2 ))		/*                 q2 > p2 ? 1 : 0               */	;\
+													;\
+    PSLLW      ( CONST(8), MP1 )		/*             q1 > p1 ? 0x100 : 0               */	;\
+TWO(PSLLW      ( CONST(8), MP2 ))		/*             q2 > p2 ? 0x100 : 0               */	;\
+													;\
+    PSUBW      ( MP1, MA1 )			/*                  t1 -=? 0x100                 */	;\
+TWO(PSUBW      ( MP2, MA2 ))			/*                  t2 -=? 0x100                 */	;\
+ 													;\
+    PADDW      ( M80, MA1 )			/*                 t1 += 0x80                    */	;\
+TWO(PADDW      ( M80, MA2 ))			/*                 t2 += 0x80                    */	;\
+													;\
+    MOVQ       ( MA1, MP1 )										;\
+    PSRLW      ( CONST(8), MA1 )		/*                    t1 >> 8                    */	;\
+													;\
+TWO(MOVQ       ( MA2, MP2 ))										;\
+TWO(PSRLW      ( CONST(8), MA2 ))		/*                    t2 >> 8                    */	;\
+													;\
+    PADDW      ( MP1, MA1 )			/*        t1 + (t1 >> 8) ~= (t1/255) << 8        */	;\
+TWO(PADDW      ( MP2, MA2 ))			/*        t2 + (t2 >> 8) ~= (t2/255) << 8        */	;\
+													;\
+    PADDW      ( MQ1, MA1 )			/*              (t1/255 + q1) << 8               */	;\
+TWO(PADDW      ( MQ2, MA2 ))			/*              (t2/255 + q2) << 8               */	;\
+													;\
+    PSRLW      ( CONST(8), MA1 )		/*    sa1    |    sb1    |    sg1    |    sr1    */	;\
+TWO(PSRLW      ( CONST(8), MA2 ))		/*    sa2    |    sb2    |    sg2    |    sr2    */
+
+
+/* linear interpolation - geometric series with correction
+ *
+ * instead of the roundoff this adds a small correction to satisfy the OpenGL criteria
+ *
+ *   t/255 ~= (t + (t >> 8) + (t >> 15)) >> 8
+ *
+ * note that although it is faster than rounding off, it doesn't always give the exact results
+ */
+/* Operands are written (source, destination).  On entry MP* hold p, MQ* hold
+ * q and MA* hold the per-channel weight a, all as 16-bit words; the result
+ * is left in MA*.  MP* and MQ* are overwritten.
+ */
+#define GMB_LERP_GSC( MP1, MQ1, MA1, MP2, MQ2, MA2) \
+    PSUBW      ( MQ1, MP1 )                     /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */	;\
+    PSLLW      ( CONST(8), MQ1 )		/*                    q1 << 8                    */	;\
+    PMULLW     ( MP1, MA1 )			/*              t1 = (p1 - q1)*pa1               */	;\
+													;\
+TWO(PSUBW      ( MQ2, MP2 ))                    /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */	;\
+TWO(PSLLW      ( CONST(8), MQ2 ))		/*                    q2 << 8                    */	;\
+TWO(PMULLW     ( MP2, MA2 ))			/*              t2 = (p2 - q2)*pa2               */	;\
+													;\
+    MOVQ       ( MA1, MP1 )										;\
+    PSRLW      ( CONST(8), MA1 )		/*                    t1 >> 8                    */	;\
+													;\
+TWO(MOVQ       ( MA2, MP2 ))										;\
+TWO(PSRLW      ( CONST(8), MA2 ))		/*                    t2 >> 8                    */	;\
+													;\
+    PADDW      ( MA1, MP1 )			/*        t1 + (t1 >> 8) ~= (t1/255) << 8        */	;\
+    PSRLW      ( CONST(7), MA1 )		/*                    t1 >> 15                   */	;\
+													;\
+TWO(PADDW      ( MA2, MP2 ))			/*        t2 + (t2 >> 8) ~= (t2/255) << 8        */	;\
+TWO(PSRLW      ( CONST(7), MA2 ))		/*                    t2 >> 15                   */	;\
+													;\
+    PADDW      ( MP1, MA1 )			/*  t1 + (t1 >> 8) + (t1 >>15) ~= (t1/255) << 8  */	;\
+TWO(PADDW      ( MP2, MA2 ))			/*  t2 + (t2 >> 8) + (t2 >>15) ~= (t2/255) << 8  */	;\
+													;\
+    PADDW      ( MQ1, MA1 )			/*              (t1/255 + q1) << 8               */	;\
+TWO(PADDW      ( MQ2, MA2 ))			/*              (t2/255 + q2) << 8               */	;\
+													;\
+    PSRLW      ( CONST(8), MA1 )		/*    sa1    |    sb1    |    sg1    |    sr1    */	;\
+TWO(PSRLW      ( CONST(8), MA2 ))		/*    sa2    |    sb2    |    sg2    |    sr2    */
+
+
+/* common blending setup code
+ *
+ * note that M00 is a register with 0x0000000000000000 constant which can be easily obtained making
+ *
+ *   PXOR      ( M00, M00 )
+ */
+/* Load pixels from memory: the bytes at rgba go to MPP (the "p" operand) and
+ * the bytes at dest go to MQQ (the "q" operand).  The ONE variant loads one
+ * 4-byte pixel with MOVD, the TWO variant loads two pixels with MOVQ.
+ */
+#define GMB_LOAD(rgba, dest, MPP, MQQ) \
+ONE(MOVD       ( REGIND(rgba), MPP ))		/*     |     |     |     | pa1 | pb1 | pg1 | pr1 */	;\
+ONE(MOVD       ( REGIND(dest), MQQ ))		/*     |     |     |     | qa1 | qb1 | qg1 | qr1 */	;\
+													;\
+TWO(MOVQ       ( REGIND(rgba), MPP ))		/* pa2 | pb2 | pg2 | pr2 | pa1 | pb1 | pg1 | pr1 */	;\
+TWO(MOVQ       ( REGIND(dest), MQQ ))		/* qa2 | qb2 | qg2 | qr2 | qa1 | qb1 | qg1 | qr1 */
+
+/* Widen packed bytes to 16-bit words by interleaving with the all-zero
+ * register M00.  The low four bytes (pixel 1) are expanded in place in
+ * MP1/MQ1; in the TWO variant the registers are first copied to MP2/MQ2 and
+ * the high four bytes (pixel 2) are expanded there.
+ */
+#define GMB_UNPACK(MP1, MQ1, MP2, MQ2, M00) \
+TWO(MOVQ       ( MP1, MP2 ))										;\
+TWO(MOVQ       ( MQ1, MQ2 ))										;\
+													;\
+    PUNPCKLBW  ( M00, MQ1 )			/*    qa1    |    qb1    |    qg1    |    qr1    */	;\
+TWO(PUNPCKHBW  ( M00, MQ2 ))                    /*    qa2    |    qb2    |    qg2    |    qr2    */	;\
+    PUNPCKLBW  ( M00, MP1 )			/*    pa1    |    pb1    |    pg1    |    pr1    */	;\
+TWO(PUNPCKHBW  ( M00, MP2 ))                    /*    pa2    |    pb2    |    pg2    |    pr2    */
+
+/* Broadcast the alpha word of an unpacked pixel: copy MP1 to MA1 and
+ * replicate its highest 16-bit word into all four words of MA1 (likewise
+ * MP2 into MA2 in the TWO variant).  MP1/MP2 are left unchanged.
+ */
+#define GMB_ALPHA(MP1, MA1, MP2, MA2) \
+    MOVQ       ( MP1, MA1 )										;\
+TWO(MOVQ       ( MP2, MA2 ))										;\
+													;\
+    PUNPCKHWD  ( MA1, MA1 )			/*    pa1    |    pa1    |           |           */	;\
+TWO(PUNPCKHWD  ( MA2, MA2 ))			/*    pa2    |    pa2    |           |           */	;\
+    PUNPCKHDQ  ( MA1, MA1 )                     /*    pa1    |    pa1    |    pa1    |    pa1    */	;\
+TWO(PUNPCKHDQ  ( MA2, MA2 ))                    /*    pa2    |    pa2    |    pa2    |    pa2    */
+
+/* Pack the 16-bit result words of MS1 (pixel 1) and MS2 (pixel 2) back into
+ * bytes with unsigned saturation; the packed pixels are left in MS1.
+ * The definition ends without a trailing line continuation so that it does
+ * not depend on a blank line following it.
+ */
+#define GMB_PACK( MS1, MS2 ) \
+    PACKUSWB   ( MS2, MS1 )			/* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */
+
+/* Write the packed result in MSS back to memory at rgba: the low 32 bits
+ * (one pixel) in the ONE variant, all 64 bits (two pixels) in the TWO
+ * variant.
+ */
+#define GMB_STORE(rgba, MSS ) \
+ONE(MOVD       ( MSS, REGIND(rgba) ))		/*     |     |     |     | sa1 | sb1 | sg1 | sr1 */	;\
+TWO(MOVQ       ( MSS, REGIND(rgba) ))		/* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */
+
+/* Kevin F. Quinn <kevquinn@gentoo.org> 2 July 2006
+ * Replace data segment constants with text-segment
+ * constants (via pushl/movq)
+    SEG_DATA
+
+ALIGNDATA8
+const_0080:
+    D_LONG 0x00800080, 0x00800080
+
+const_80:
+    D_LONG 0x80808080, 0x80808080
+*/
+#define const_0080_l 0x00800080
+#define const_0080_h 0x00800080
+#define const_80_l 0x80808080
+#define const_80_h 0x80808080
+
+    SEG_TEXT
+
+
+/* Blend transparency function
+ */
+
+#define TAG(x) CONCAT(x,_transparency)
+#define LLTAG(x) LLBL2(x,_transparency)
+
+#define INIT \
+    PXOR       ( MM0, MM0 )			/*   0x0000  |   0x0000  |   0x0000  |   0x0000  */
+
+#define MAIN( rgba, dest ) \
+    GMB_LOAD( rgba, dest, MM1, MM2 )									;\
+    GMB_UNPACK( MM1, MM2, MM4, MM5, MM0 )								;\
+    GMB_ALPHA( MM1, MM3, MM4, MM6 )									;\
+    GMB_LERP_GSC( MM1, MM2, MM3, MM4, MM5, MM6 )							;\
+    GMB_PACK( MM3, MM6 )										;\
+    GMB_STORE( rgba, MM3 )
+
+#include "mmx_blendtmp.h"
+
+
+/* Blend add function
+ *
+ * FIXME: Add some loop unrolling here...
+ */
+
+#define TAG(x) CONCAT(x,_add)
+#define LLTAG(x) LLBL2(x,_add)
+
+#define INIT
+
+/* Per-byte saturating add of dest into rgba.  p denotes the bytes read from
+ * rgba and q the bytes read from dest; the sum s is stored back at rgba.
+ * The ONE variant handles one pixel through MM1/MM2, the TWO variant adds
+ * two pixels directly from memory.
+ */
+#define MAIN( rgba, dest ) \
+ONE(MOVD       ( REGIND(rgba), MM1 ))		/*     |     |     |     | pa1 | pb1 | pg1 | pr1 */	;\
+ONE(MOVD       ( REGIND(dest), MM2 ))		/*     |     |     |     | qa1 | qb1 | qg1 | qr1 */	;\
+ONE(PADDUSB    ( MM2, MM1 ))										;\
+ONE(MOVD       ( MM1, REGIND(rgba) ))		/*     |     |     |     | sa1 | sb1 | sg1 | sr1 */	;\
+													;\
+TWO(MOVQ       ( REGIND(rgba), MM1 ))		/* pa2 | pb2 | pg2 | pr2 | pa1 | pb1 | pg1 | pr1 */	;\
+TWO(PADDUSB    ( REGIND(dest), MM1 ))		/* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */	;\
+TWO(MOVQ       ( MM1, REGIND(rgba) ))
+
+#include "mmx_blendtmp.h"
+
+
+/* Blend min function
+ */
+
+#define TAG(x) CONCAT(x,_min)
+#define LLTAG(x) LLBL2(x,_min)
+
+/* Kevin F. Quinn 2nd July 2006
+ * Replace data segment constants with text-segment instructions
+#define INIT \
+    MOVQ       ( CONTENT(const_80), MM7 )
+ */
+#define INIT \
+    PUSH_L     ( CONST(const_80_h) ) 		/* 0x80| 0x80| 0x80| 0x80| 0x80| 0x80| 0x80| 0x80*/	;\
+    PUSH_L     ( CONST(const_80_l) ) 									;\
+    MOVQ       ( REGIND(ESP), MM7 ) 									;\
+    ADD_L      ( CONST(8), ESP)
+
+#define MAIN( rgba, dest ) \
+    GMB_LOAD( rgba, dest, MM1, MM2 )									;\
+    MOVQ       ( MM1, MM3 )										;\
+    MOVQ       ( MM2, MM4 )										;\
+    PXOR       ( MM7, MM3 )			/*              unsigned -> signed               */	;\
+    PXOR       ( MM7, MM4 )			/*              unsigned -> signed               */	;\
+    PCMPGTB    ( MM3, MM4 )			/*                 q > p ? 0xff : 0x00           */	;\
+    PAND       ( MM4, MM1 )			/*                 q > p ? p : 0                 */	;\
+    PANDN      ( MM2, MM4 )			/*                 q > p ? 0 : q                 */	;\
+    POR        ( MM1, MM4 )			/*                 q > p ? p : q                 */	;\
+    GMB_STORE( rgba, MM4 )
+
+#include "mmx_blendtmp.h"
+
+
+/* Blend max function
+ */
+
+#define TAG(x) CONCAT(x,_max)
+#define LLTAG(x) LLBL2(x,_max)
+
+/* Kevin F. Quinn 2nd July 2006
+ * Replace data segment constants with text-segment instructions
+#define INIT \
+    MOVQ       ( CONTENT(const_80), MM7 )
+ */
+/* Load 0x80 into every byte of MM7 by pushing the constant's two 32-bit
+ * halves and reading them back as one quadword.  The high half is pushed
+ * first so that the low half ends up at the lower address (ESP) and so in
+ * the low dword of MM7.  The 8 bytes are then released from the stack.
+ */
+#define INIT \
+    PUSH_L     ( CONST(const_80_h) ) 		/* 0x80| 0x80| 0x80| 0x80| 0x80| 0x80| 0x80| 0x80*/	;\
+    PUSH_L     ( CONST(const_80_l) ) 									;\
+    MOVQ       ( REGIND(ESP), MM7 ) 									;\
+    ADD_L      ( CONST(8), ESP)
+
+/* Per-byte unsigned maximum of p (from rgba) and q (from dest), stored back
+ * at rgba.  Both inputs are XORed with 0x80 (MM7) so that the signed byte
+ * compare PCMPGTB orders them as unsigned values; the resulting mask then
+ * selects q where q > p and p elsewhere.
+ */
+#define MAIN( rgba, dest ) \
+    GMB_LOAD( rgba, dest, MM1, MM2 )									;\
+    MOVQ       ( MM1, MM3 )										;\
+    MOVQ       ( MM2, MM4 )										;\
+    PXOR       ( MM7, MM3 )			/*              unsigned -> signed               */	;\
+    PXOR       ( MM7, MM4 )			/*              unsigned -> signed               */	;\
+    PCMPGTB    ( MM3, MM4 )			/*                 q > p ? 0xff : 0x00           */	;\
+    PAND       ( MM4, MM2 )			/*                 q > p ? q : 0                 */	;\
+    PANDN      ( MM1, MM4 )			/*                 q > p ? 0 : p                 */	;\
+    POR        ( MM2, MM4 )			/*                 q > p ? q : p                 */	;\
+    GMB_STORE( rgba, MM4 )
+
+#include "mmx_blendtmp.h"
+
+
+/* Blend modulate function
+ */
+
+#define TAG(x) CONCAT(x,_modulate)
+#define LLTAG(x) LLBL2(x,_modulate)
+
+/* Kevin F. Quinn 2nd July 2006
+ * Replace data segment constants with text-segment instructions
+#define INIT \
+    MOVQ       ( CONTENT(const_0080), MM7 )
+ */
+/* Clear MM0 (the zero register used for unpacking) and load 0x0080 into
+ * every word of MM7 by pushing the constant's two 32-bit halves and reading
+ * them back as one quadword.  The high half is pushed first so that the low
+ * half ends up at the lower address (ESP) and so in the low dword of MM7.
+ * The 8 bytes are then released from the stack.
+ */
+#define INIT \
+    PXOR       ( MM0, MM0 )			/*   0x0000  |   0x0000  |   0x0000  |   0x0000  */	;\
+    PUSH_L     ( CONST(const_0080_h) ) 	/*   0x0080  |   0x0080  |   0x0080  |   0x0080  */	;\
+    PUSH_L     ( CONST(const_0080_l) ) 								;\
+    MOVQ       ( REGIND(ESP), MM7 ) 									;\
+    ADD_L      ( CONST(8), ESP)
+
+#define MAIN( rgba, dest ) \
+    GMB_LOAD( rgba, dest, MM1, MM2 )									;\
+    GMB_UNPACK( MM1, MM2, MM4, MM5, MM0 )								;\
+    GMB_MULT_GSR( MM1, MM2, MM4, MM5, MM7 )								;\
+    GMB_PACK( MM2, MM5 )										;\
+    GMB_STORE( rgba, MM2 )
+
+#include "mmx_blendtmp.h"
+
+#endif
+	
+#if defined (__ELF__) && defined (__linux__)
+	.section .note.GNU-stack,"",%progbits
+#endif
--- a/src/arch/x86/mmx_blendtmp.h
+++ b/src/arch/x86/mmx_blendtmp.h
@ -0,0 +1,114 @@
+/*
+ * Written by José Fonseca <j_r_fonseca@yahoo.co.uk>
+ */
+
+
+/*
+ * void _mesa_mmx_blend( struct gl_context *ctx,
+ *                       GLuint n,
+ *                       const GLubyte mask[],
+ *                       GLvoid *rgba,
+ *                       const GLvoid *dest,
+ *                       GLenum chanType )
+ *
+ */
+/* Function template.  The including file defines TAG/LLTAG (suffix for the
+ * function name and its local labels), INIT (one-time register setup) and
+ * MAIN (the per-pixel work) before including this file; all four are
+ * #undef'd at the end so the file can be included again for the next blend
+ * mode.  MAIN is expanded three times with different ONE/TWO definitions to
+ * produce the single-pixel and two-pixel code paths.
+ */
+ALIGNTEXT16
+GLOBL GLNAME( TAG(_mesa_mmx_blend) )
+HIDDEN( TAG(_mesa_mmx_blend) )
+GLNAME( TAG(_mesa_mmx_blend) ):
+
+    /* set up the frame pointer and save the registers used below */
+    PUSH_L     ( EBP )
+    MOV_L      ( ESP, EBP )
+    PUSH_L     ( ESI )
+    PUSH_L     ( EDI )
+    PUSH_L     ( EBX )
+
+    /* nothing to do for n == 0; no MMX state has been touched yet, so the
+     * exit path taken here skips EMMS
+     */
+    MOV_L      ( REGOFF(12, EBP), ECX )		/* n */
+    CMP_L      ( CONST(0), ECX)
+    JE         ( LLTAG(GMB_return) )
+
+    /* the argument slot at 8(EBP) is never loaded */
+    MOV_L      ( REGOFF(16, EBP), EBX )		/* mask */
+    MOV_L      ( REGOFF(20, EBP), EDI )         /* rgba */
+    MOV_L      ( REGOFF(24, EBP), ESI )         /* dest */
+
+    INIT
+    
+    TEST_L     ( CONST(4), EDI )		/* align rgba on an 8-byte boundary */
+    JZ         ( LLTAG(GMB_align_end) )
+
+    CMP_B      ( CONST(0), REGIND(EBX) )	/* *mask == 0 */
+    JE         ( LLTAG(GMB_align_continue) )
+
+    /* runin */
+    /* single-pixel variant: keep ONE(...) instructions, drop TWO(...) */
+#define ONE(x)	x
+#define TWO(x)  
+    MAIN       ( EDI, ESI )
+#undef ONE
+#undef TWO
+
+LLTAG(GMB_align_continue):
+
+    DEC_L      ( ECX )				/* n -= 1 */
+    INC_L      ( EBX )		                /* mask += 1 */
+    ADD_L      ( CONST(4), EDI )		/* rgba += 1 */
+    ADD_L      ( CONST(4), ESI )		/* dest += 1 */ 
+
+LLTAG(GMB_align_end):
+
+    /* fewer than two pixels left: skip the two-pixel loop */
+    CMP_L      ( CONST(2), ECX)
+    JB         ( LLTAG(GMB_loop_end) )
+
+ALIGNTEXT16
+LLTAG(GMB_loop_begin):
+
+    /* the pair is processed whenever either of its two mask bytes is set */
+    CMP_W      ( CONST(0), REGIND(EBX) )	/* *mask == 0 && *(mask + 1) == 0 */
+    JE         ( LLTAG(GMB_loop_continue) )
+
+    /* main loop */
+    /* two-pixel variant: keep TWO(...) instructions, drop ONE(...) */
+#define ONE(x)
+#define TWO(x)	x
+    MAIN       ( EDI, ESI )
+#undef ONE
+#undef TWO
+
+LLTAG(GMB_loop_continue):
+
+    DEC_L      ( ECX )
+    DEC_L      ( ECX )				/* n -= 2 */
+    ADD_L      ( CONST(2), EBX )		/* mask += 2 */
+    ADD_L      ( CONST(8), EDI )		/* rgba += 2 */
+    ADD_L      ( CONST(8), ESI )		/* dest += 2 */ 
+    CMP_L      ( CONST(2), ECX )
+    JAE        ( LLTAG(GMB_loop_begin) )
+
+LLTAG(GMB_loop_end):
+
+    /* at most one pixel remains */
+    CMP_L      ( CONST(1), ECX )
+    JB         ( LLTAG(GMB_done) )
+
+    CMP_B      ( CONST(0), REGIND(EBX) )	/* *mask == 0 */
+    JE         ( LLTAG(GMB_done) )
+
+    /* runout */
+    /* single-pixel variant again for the trailing pixel */
+#define ONE(x)	x
+#define TWO(x)
+    MAIN       ( EDI, ESI )
+#undef ONE
+#undef TWO
+
+LLTAG(GMB_done):
+
+    /* leave MMX state before returning */
+    EMMS
+
+LLTAG(GMB_return):
+
+    /* restore the saved registers in reverse order of the pushes */
+    POP_L      ( EBX )
+    POP_L      ( EDI )
+    POP_L      ( ESI )
+    MOV_L      ( EBP, ESP )
+    POP_L      ( EBP )
+    RET
+
+/* release the template parameters so the next inclusion can redefine them */
+#undef TAG
+#undef LLTAG
+#undef INIT
+#undef MAIN
--- a/src/arch/x86/norm_args.h
+++ b/src/arch/x86/norm_args.h
@ -0,0 +1,57 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version:  3.5
+ *
+ * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Normal transform function interface for assembly code.  Simply define
+ * FRAME_OFFSET to the number of bytes pushed onto the stack before
+ * using the ARG_* argument macros.
+ *
+ * Gareth Hughes
+ */
+
+#ifndef __NORM_ARGS_H__
+#define __NORM_ARGS_H__
+
+/* Offsets for normal_func arguments
+ *
+ * typedef void (*normal_func)( const GLmatrix *mat,
+ *                              GLfloat scale,
+ *                              const GLvector4f *in,
+ *                              const GLfloat lengths[],
+ *                              GLvector4f *dest );
+ */
+#define OFFSET_MAT	4
+#define OFFSET_SCALE	8
+#define OFFSET_IN	12
+#define OFFSET_LENGTHS	16
+#define OFFSET_DEST	20
+
+#define ARG_MAT         REGOFF(FRAME_OFFSET+OFFSET_MAT, ESP)
+#define ARG_SCALE       REGOFF(FRAME_OFFSET+OFFSET_SCALE, ESP)
+#define ARG_IN          REGOFF(FRAME_OFFSET+OFFSET_IN, ESP)
+#define ARG_LENGTHS     REGOFF(FRAME_OFFSET+OFFSET_LENGTHS, ESP)
+#define ARG_DEST        REGOFF(FRAME_OFFSET+OFFSET_DEST, ESP)
+
+#endif
--- a/src/arch/x86/read_rgba_span_x86.S
+++ b/src/arch/x86/read_rgba_span_x86.S
@ -0,0 +1,686 @@
+/*
+ * (C) Copyright IBM Corporation 2004
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+ 
+/**
+ * \file read_rgba_span_x86.S
+ * Optimized routines to transfer pixel data from the framebuffer to a
+ * buffer in main memory.
+ *
+ * \author Ian Romanick <idr@us.ibm.com>
+ */
+
+	.file	"read_rgba_span_x86.S"
+#if !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) /* this one cries for assyntax.h */
+/* Kevin F. Quinn 2nd July 2006
+ * Replaced data segment constants with text-segment instructions.
+ *
+ * LOAD_MASK(mvins, m1, m2) builds both byte masks on the stack and loads
+ * them using the move instruction passed as `mvins`:
+ *   m1 <- 0xff00ff00 repeated in every 32-bit lane
+ *   m2 <- 0x00ff0000 repeated in every 32-bit lane
+ * Four copies (16 bytes) of each constant are pushed, so the same macro
+ * serves both the 8-byte movq loads into MMX registers and the 16-byte
+ * movdqu load into XMM registers used further down in this file.  The
+ * final addl releases all 32 bytes that were pushed, leaving %esp at the
+ * value it had before the macro.
+ */
+#define	LOAD_MASK(mvins,m1,m2) \
+   	pushl	$0xff00ff00 ;\
+   	pushl	$0xff00ff00 ;\
+   	pushl	$0xff00ff00 ;\
+   	pushl	$0xff00ff00 ;\
+	mvins	(%esp), m1	;\
+   	pushl	$0x00ff0000 ;\
+   	pushl	$0x00ff0000 ;\
+   	pushl	$0x00ff0000 ;\
+   	pushl	$0x00ff0000 ;\
+	mvins	(%esp), m2	;\
+	addl	$32, %esp
+
+/* I implemented these as macros because they appear in several places,
+ * and I've tweaked them a number of times.  I got tired of changing every
+ * place they appear. :)
+ *
+ * DO_ONE_PIXEL converts a single 32-bit pixel with integer instructions
+ * only: it loads a dword from (%ebx), byte-swaps it, rotates it right by
+ * 8 bits, stores the result to (%ecx) and advances both %ebx and %ecx by
+ * 4.  %eax is used as scratch and is overwritten.
+ */
+
+#define DO_ONE_PIXEL() \
+	movl	(%ebx), %eax ; \
+	addl	$4, %ebx ; \
+	bswap	%eax          /* ARGB -> BGRA */ ; \
+	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
+	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */ ; \
+	addl	$4, %ecx
+
+/* Convert the final pixel of a span.  Same conversion as DO_ONE_PIXEL
+ * (load from (%ebx), byte-swap, rotate right by 8, store to (%ecx), with
+ * %eax as scratch) but neither pointer is advanced.
+ *
+ * The last line intentionally carries no trailing backslash, so the
+ * macro body ends here and cannot absorb whatever line follows it.
+ */
+#define DO_ONE_LAST_PIXEL() \
+	movl	(%ebx), %eax ; \
+	bswap	%eax          /* ARGB -> BGRA */ ; \
+	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
+	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */
+
+
+/**
+ * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
+ *
+ * Stack arguments, in order: source pointer, destination pointer, number
+ * of pixels to copy.  A count <= 0 returns without touching memory.
+ *
+ * If bit 2 of the source address is set, one pixel is converted first
+ * with the scalar DO_ONE_PIXEL path so that the 8-byte loads in the main
+ * loop start on an 8-byte boundary (for a 4-byte aligned source).  The
+ * main loop then converts two pixels per iteration, and a final odd pixel
+ * is handled with DO_ONE_LAST_PIXEL.
+ *
+ * \warning
+ * Unless USE_INNER_EMMS is defined, this function assumes that the caller
+ * will issue the EMMS instruction at the correct places.  With
+ * USE_INNER_EMMS, EMMS is issued on entry and once more at the common
+ * exit, so the early-out path (taken after the masks have already been
+ * loaded into %mm1/%mm2) is covered as well.
+ */
+
+.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
+#ifndef USE_DRICORE
+.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
+#endif
+	.type	_generic_read_RGBA_span_BGRA8888_REV_MMX, @function
+_generic_read_RGBA_span_BGRA8888_REV_MMX:
+	pushl	%ebx
+
+#ifdef USE_INNER_EMMS
+	emms
+#endif
+	LOAD_MASK(movq,%mm1,%mm2)
+
+	movl	8(%esp), %ebx	/* source pointer */
+	movl	16(%esp), %edx	/* number of pixels to copy */
+	movl	12(%esp), %ecx	/* destination pointer */
+
+	testl	%edx, %edx
+	jle	.L20		/* Bail if there's nothing to do. */
+
+	movl	%ebx, %eax
+
+	/* %eax = 1 if bit 2 of the source address is set, else 0. */
+	negl	%eax
+	sarl	$2, %eax
+	andl	$1, %eax
+	je	.L17
+
+	subl	%eax, %edx
+	DO_ONE_PIXEL()
+.L17:
+
+	/* Would it be faster to unroll this loop once and process 4 pixels
+	 * per pass, instead of just two?
+	 */
+
+	movl	%edx, %eax
+	shrl	%eax		/* %eax = number of pixel pairs; sets ZF for .L18 */
+	jmp	.L18
+.L19:
+	movq	(%ebx), %mm0
+	addl	$8, %ebx
+
+	/* These 9 instructions do what PSHUFB (if there were such an
+	 * instruction) could do in 1. :(
+	 */
+
+	movq	%mm0, %mm3
+	movq	%mm0, %mm4
+
+	pand	%mm2, %mm3
+	psllq	$16, %mm4
+	psrlq	$16, %mm3
+	pand	%mm2, %mm4
+
+	pand	%mm1, %mm0
+	por	%mm4, %mm3
+	por	%mm3, %mm0
+
+	movq	%mm0, (%ecx)
+	addl	$8, %ecx
+	subl	$1, %eax
+.L18:
+	jne	.L19
+
+	/* At this point there are either 1 or 0 pixels remaining to be
+	 * converted.  Convert the last pixel, if needed.
+	 */
+
+	testl	$1, %edx
+	je	.L20
+
+	DO_ONE_LAST_PIXEL()
+
+.L20:
+	/* Common exit.  EMMS lives here rather than after the loop so that
+	 * the early-out above, which has already written %mm1/%mm2, also
+	 * leaves the MMX state clean.
+	 */
+#ifdef USE_INNER_EMMS
+	emms
+#endif
+	popl	%ebx
+	ret
+	.size	_generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX
+
+
+/**
+ * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
+ * instructions are only actually used to read data from the framebuffer.
+ * In practice, the speed-up is pretty small.
+ *
+ * Stack arguments, in order: source pointer, destination pointer, number
+ * of pixels to copy.  A count <= 0 returns without touching memory.
+ *
+ * Up to three leading pixels are converted with the scalar / MMX paths
+ * until the source pointer is 16-byte aligned, the main loop converts
+ * four pixels per iteration (aligned movaps load, bounced through a
+ * 16-byte aligned stack scratch area into two MMX registers), and up to
+ * three trailing pixels are converted afterwards.
+ *
+ * \todo
+ * Do some more testing and determine if there's any reason to have this
+ * function in addition to the MMX version.
+ *
+ * \warning
+ * Unless USE_INNER_EMMS is defined, this function assumes that the caller
+ * will issue the EMMS instruction at the correct places.  With
+ * USE_INNER_EMMS, EMMS is issued on entry and once more at the common
+ * exit, after the last MMX instruction on every path.
+ */
+
+.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
+#ifndef USE_DRICORE
+.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
+#endif
+	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE, @function
+_generic_read_RGBA_span_BGRA8888_REV_SSE:
+	pushl	%esi
+	pushl	%ebx
+	pushl	%ebp
+
+#ifdef USE_INNER_EMMS
+	emms
+#endif
+
+	LOAD_MASK(movq,%mm1,%mm2)
+
+	movl	16(%esp), %ebx	/* source pointer */
+	movl	24(%esp), %edx	/* number of pixels to copy */
+	movl	20(%esp), %ecx	/* destination pointer */
+
+	testl	%edx, %edx
+	jle	.L35		/* Bail if there's nothing to do. */
+
+	/* Reserve a 16-byte aligned scratch slot on the stack; %ebp keeps
+	 * the original %esp so it can be restored after the main loop.
+	 */
+	movl	%esp, %ebp
+	subl	$16, %esp
+	andl	$0xfffffff0, %esp
+
+	movl	%ebx, %eax
+	movl	%edx, %esi
+
+	/* %eax = pixels needed to reach a 16-byte boundary (0..3);
+	 * %esi = min(%eax, count) = pixels handled before the main loop.
+	 */
+	negl	%eax
+	andl	$15, %eax
+	sarl	$2, %eax
+	cmpl	%edx, %eax
+	cmovle	%eax, %esi
+
+	subl	%esi, %edx
+
+	testl	$1, %esi
+	je	.L32
+
+	DO_ONE_PIXEL()
+.L32:
+
+	testl	$2, %esi
+	je	.L31
+
+	movq	(%ebx), %mm0
+	addl	$8, %ebx
+
+	movq	%mm0, %mm3
+	movq	%mm0, %mm4
+
+	pand	%mm2, %mm3
+	psllq	$16, %mm4
+	psrlq	$16, %mm3
+	pand	%mm2, %mm4
+
+	pand	%mm1, %mm0
+	por	%mm4, %mm3
+	por	%mm3, %mm0
+
+	movq	%mm0, (%ecx)
+	addl	$8, %ecx
+.L31:
+
+	movl	%edx, %eax
+	shrl	$2, %eax	/* %eax = groups of 4 pixels; sets ZF for .L33 */
+	jmp	.L33
+.L34:
+	movaps	(%ebx), %xmm0
+	addl	$16, %ebx
+
+	/* This would be so much better if we could just move directly from
+	 * an SSE register to an MMX register.  Unfortunately, that
+	 * functionality wasn't introduced until SSE2 with the MOVDQ2Q
+	 * instruction.
+	 */
+
+	movaps	%xmm0, (%esp)
+	movq	(%esp), %mm0
+	movq	8(%esp), %mm5
+
+	movq	%mm0, %mm3
+	movq	%mm0, %mm4
+	movq	%mm5, %mm6
+	movq	%mm5, %mm7
+
+	pand	%mm2, %mm3
+	pand	%mm2, %mm6
+
+	psllq	$16, %mm4
+	psllq	$16, %mm7
+
+	psrlq	$16, %mm3
+	psrlq	$16, %mm6
+
+	pand	%mm2, %mm4
+	pand	%mm2, %mm7
+
+	pand	%mm1, %mm0
+	pand	%mm1, %mm5
+
+	por	%mm4, %mm3
+	por	%mm7, %mm6
+
+	por	%mm3, %mm0
+	por	%mm6, %mm5
+
+	movq	%mm0, (%ecx)
+	movq	%mm5, 8(%ecx)
+	addl	$16, %ecx
+
+	subl	$1, %eax
+.L33:
+	jne	.L34
+
+	movl	%ebp, %esp	/* release the scratch slot */
+
+	/* At this point there are between 0 and 3 pixels remaining to be
+	 * converted.
+	 */
+
+	testl	$2, %edx
+	je	.L36
+
+	movq	(%ebx), %mm0
+	addl	$8, %ebx
+
+	movq	%mm0, %mm3
+	movq	%mm0, %mm4
+
+	pand	%mm2, %mm3
+	psllq	$16, %mm4
+	psrlq	$16, %mm3
+	pand	%mm2, %mm4
+
+	pand	%mm1, %mm0
+	por	%mm4, %mm3
+	por	%mm3, %mm0
+
+	movq	%mm0, (%ecx)
+	addl	$8, %ecx
+.L36:
+
+	testl	$1, %edx
+	je	.L35
+
+	DO_ONE_LAST_PIXEL()
+.L35:
+	/* Common exit.  EMMS must come after the 2-pixel tail above, which
+	 * still uses MMX registers, and must also cover the early-out taken
+	 * after LOAD_MASK; issuing it here satisfies both.
+	 */
+#ifdef USE_INNER_EMMS
+	emms
+#endif
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	ret
+	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE
+
+
+/**
+ * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
+ */
+
+	.text
+.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
+#ifndef USE_DRICORE
+.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
+#endif
+	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
+_generic_read_RGBA_span_BGRA8888_REV_SSE2:
+	pushl	%esi
+	pushl	%ebx
+
+	LOAD_MASK(movdqu,%xmm1,%xmm2)
+
+	movl	12(%esp), %ebx	/* source pointer */
+	movl	20(%esp), %edx	/* number of pixels to copy */
+	movl	16(%esp), %ecx	/* destination pointer */
+
+	movl	%ebx, %eax
+	movl	%edx, %esi
+
+	testl	%edx, %edx
+	jle	.L46		/* Bail if there's nothing to do. */
+
+	/* If the source pointer isn't a multiple of 16 we have to process
+	 * a few pixels the "slow" way to get the address aligned for
+	 * the SSE fetch instructions.
+	 */
+
+	negl	%eax
+	andl	$15, %eax
+	sarl	$2, %eax
+
+	cmpl	%edx, %eax
+	cmovbe	%eax, %esi
+	subl	%esi, %edx
+
+	testl	$1, %esi
+	je	.L41
+
+	DO_ONE_PIXEL()  
+.L41:
+	testl	$2, %esi
+	je	.L40
+
+	movq	(%ebx), %xmm0
+	addl	$8, %ebx
+
+	movdqa	%xmm0, %xmm3
+	movdqa	%xmm0, %xmm4
+	andps	%xmm1, %xmm0
+
+	andps	%xmm2, %xmm3
+	pslldq	$2, %xmm4
+	psrldq	$2, %xmm3
+	andps	%xmm2, %xmm4
+
+	orps	%xmm4, %xmm3
+	orps	%xmm3, %xmm0
+
+	movq	%xmm0, (%ecx)
+	addl	$8, %ecx
+.L40:
+
+	/* Would it be worth having a specialized version of this loop for
+	 * the case where the destination is 16-byte aligned?  That version
+	 * would be identical except that it could use movdqa instead of
+	 * movdqu.
+	 */
+
+	movl	%edx, %eax
+	shrl	$2, %eax
+	jmp	.L42
+.L43:
+	movdqa	(%ebx), %xmm0
+	addl	$16, %ebx
+
+	movdqa	%xmm0, %xmm3
+	movdqa	%xmm0, %xmm4
+	andps	%xmm1, %xmm0
+
+	andps	%xmm2, %xmm3
+	pslldq	$2, %xmm4
+	psrldq	$2, %xmm3
+	andps	%xmm2, %xmm4
+
+	orps	%xmm4, %xmm3
+	orps	%xmm3, %xmm0
+
+	movdqu	%xmm0, (%ecx)
+	addl	$16, %ecx
+	subl	$1, %eax
+.L42:
+	jne	.L43
+
+
+	/* There may be up to 3 pixels remaining to be copied.  Take care
+	 * of them now.  We do the 2 pixel case first because the data
+	 * will be aligned.
+	 */
+
+	testl	$2, %edx
+	je	.L47
+
+	movq	(%ebx), %xmm0
+	addl	$8, %ebx
+        
+	movdqa	%xmm0, %xmm3
+	movdqa	%xmm0, %xmm4
+	andps	%xmm1, %xmm0
+
+	andps	%xmm2, %xmm3
+	pslldq	$2, %xmm4
+	psrldq	$2, %xmm3
+	andps	%xmm2, %xmm4
+
+	orps	%xmm4, %xmm3
+	orps	%xmm3, %xmm0
+
+	movq	%xmm0, (%ecx)
+	addl	$8, %ecx        
+.L47:
+
+	testl	$1, %edx
+	je	.L46
+
+	DO_ONE_LAST_PIXEL()  
+.L46:
+
+	popl	%ebx
+	popl	%esi
+	ret
+	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2
+
+
+
+#define MASK_565_L	0x07e0f800
+#define MASK_565_H	0x0000001f
+/* Setting SCALE_ADJUST to 5 gives a perfect match with the
+ * classic C implementation in Mesa.  Setting SCALE_ADJUST
+ * to 0 is slightly faster but at a small cost to accuracy.
+ */
+#define SCALE_ADJUST	5
+#if SCALE_ADJUST == 5
+#define PRESCALE_L 0x00100001
+#define PRESCALE_H 0x00000200
+#define SCALE_L 0x40C620E8
+#define SCALE_H 0x0000839d
+#elif SCALE_ADJUST == 0
+#define PRESCALE_L 0x00200001
+#define PRESCALE_H 0x00000800
+#define SCALE_L 0x01040108
+#define SCALE_H 0x00000108
+#else
+#error SCALE_ADJUST must either be 5 or 0.
+#endif
+#define ALPHA_L 0x00000000
+#define ALPHA_H 0x00ff0000
+
+/**
+ * MMX optimized version of the RGB565 to RGBA copy routine.
+ */
+
+	.text
+	.globl	_generic_read_RGBA_span_RGB565_MMX
+#ifndef USE_DRICORE
+        .hidden _generic_read_RGBA_span_RGB565_MMX
+#endif
+	.type	_generic_read_RGBA_span_RGB565_MMX, @function
+
+_generic_read_RGBA_span_RGB565_MMX:
+
+#ifdef USE_INNER_EMMS
+	emms
+#endif
+
+	movl	4(%esp), %eax	/* source pointer */
+	movl	8(%esp), %edx	/* destination pointer */
+	movl	12(%esp), %ecx	/* number of pixels to copy */
+
+	pushl	$MASK_565_H
+	pushl	$MASK_565_L
+	movq	(%esp), %mm5
+	pushl	$PRESCALE_H
+	pushl	$PRESCALE_L
+	movq	(%esp), %mm6
+	pushl	$SCALE_H
+	pushl	$SCALE_L
+	movq	(%esp), %mm7
+	pushl	$ALPHA_H
+	pushl	$ALPHA_L
+	movq	(%esp), %mm3
+	addl	$32,%esp
+
+	sarl	$2, %ecx
+	jl	.L01		/* Bail early if the count is negative. */
+	jmp	.L02
+
+.L03:
+	/* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
+	 * second pixels into the four words of %mm0 and %mm2.
+      	 */
+
+	movq	(%eax), %mm4
+	addl	$8, %eax
+
+	pshufw	$0x00, %mm4, %mm0
+	pshufw	$0x55, %mm4, %mm2
+
+
+	/* Mask the pixels so that each word of each register contains only
+	 * one color component.
+	 */
+
+	pand	%mm5, %mm0
+	pand	%mm5, %mm2
+
+
+	/* Adjust the component values so that they are as small as possible,
+	 * but large enough so that we can multiply them by an unsigned 16-bit
+	 * number and get a value as large as 0x00ff0000.
+ 	 */
+
+	pmullw	%mm6, %mm0
+	pmullw	%mm6, %mm2
+#if SCALE_ADJUST > 0
+	psrlw	$SCALE_ADJUST, %mm0
+	psrlw	$SCALE_ADJUST, %mm2
+#endif
+
+	/* Scale the input component values to be on the range
+	 * [0, 0x00ff0000].  This is the real magic of the whole routine.
+	 */
+
+	pmulhuw	%mm7, %mm0
+	pmulhuw	%mm7, %mm2
+
+
+	/* Always set the alpha value to 0xff.
+	 */
+
+ 	por %mm3, %mm0
+ 	por %mm3, %mm2
+
+
+	/* Pack the 16-bit values to 8-bit values and store the converted
+	 * pixel data.
+	 */
+
+	packuswb	%mm2, %mm0
+	movq	%mm0, (%edx)
+	addl	$8, %edx
+
+	pshufw	$0xaa, %mm4, %mm0
+	pshufw	$0xff, %mm4, %mm2
+
+	pand	%mm5, %mm0
+	pand	%mm5, %mm2
+	pmullw	%mm6, %mm0
+	pmullw	%mm6, %mm2
+#if SCALE_ADJUST > 0
+	psrlw	$SCALE_ADJUST, %mm0
+	psrlw	$SCALE_ADJUST, %mm2
+#endif
+	pmulhuw	%mm7, %mm0
+	pmulhuw	%mm7, %mm2
+
+ 	por %mm3, %mm0
+ 	por %mm3, %mm2
+
+	packuswb	%mm2, %mm0
+
+	movq	%mm0, (%edx)
+	addl	$8, %edx
+
+	subl	$1, %ecx
+.L02:
+	jne	.L03
+
+
+	/* At this point there can be at most 3 pixels left to process.  If
+	 * there are 2 or 3 left, process 2.
+         */
+
+	movl	12(%esp), %ecx
+	testl	$0x02, %ecx
+	je	.L04
+
+	movd	(%eax), %mm4
+	addl	$4, %eax
+
+	pshufw	$0x00, %mm4, %mm0
+	pshufw	$0x55, %mm4, %mm2
+
+	pand	%mm5, %mm0
+	pand	%mm5, %mm2
+	pmullw	%mm6, %mm0
+	pmullw	%mm6, %mm2
+#if SCALE_ADJUST > 0
+	psrlw	$SCALE_ADJUST, %mm0
+	psrlw	$SCALE_ADJUST, %mm2
+#endif
+	pmulhuw	%mm7, %mm0
+	pmulhuw	%mm7, %mm2
+
+ 	por %mm3, %mm0
+ 	por %mm3, %mm2
+
+	packuswb	%mm2, %mm0
+
+	movq	%mm0, (%edx)
+	addl	$8, %edx
+
+.L04:
+	/* At this point there can be at most 1 pixel left to process.
+	 * Process it if needed.
+         */
+
+	testl	$0x01, %ecx
+	je	.L01
+
+	movzwl	(%eax), %ecx
+	movd	%ecx, %mm4
+
+	pshufw	$0x00, %mm4, %mm0
+
+	pand	%mm5, %mm0
+	pmullw	%mm6, %mm0
+#if SCALE_ADJUST > 0
+	psrlw	$SCALE_ADJUST, %mm0
+#endif
+	pmulhuw	%mm7, %mm0
+
+ 	por %mm3, %mm0
+
+	packuswb	%mm0, %mm0
+
+	movd	%mm0, (%edx)
+
+.L01:
+#ifdef USE_INNER_EMMS
+	emms
+#endif
+	ret
+#endif /* !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) */
+	
+#if defined (__ELF__) && defined (__linux__)
+	.section .note.GNU-stack,"",%progbits
+#endif
--- a/src/arch/x86/read_rgba_span_x86.h
+++ b/src/arch/x86/read_rgba_span_x86.h
@ -0,0 +1,56 @@
+/*
+ * (C) Copyright IBM Corporation 2004
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+ 
+/**
+ * \file read_rgba_span_x86.h
+ *
+ * \author Ian Romanick <idr@us.ibm.com>
+ */
+
+#ifndef READ_RGBA_SPAN_X86_H
+#define READ_RGBA_SPAN_X86_H
+
+#if defined(USE_SSE_ASM) || defined(USE_MMX_ASM)
+#include "x86/common_x86_asm.h"
+#endif
+
+#if defined(USE_SSE_ASM)
+extern void _generic_read_RGBA_span_BGRA8888_REV_SSE2( const unsigned char *,
+    unsigned char *, unsigned );
+#endif
+
+#if defined(USE_SSE_ASM)
+extern void _generic_read_RGBA_span_BGRA8888_REV_SSE( const unsigned char *,
+    unsigned char *, unsigned );
+#endif
+
+#if defined(USE_MMX_ASM)
+extern void _generic_read_RGBA_span_BGRA8888_REV_MMX( const unsigned char *,
+    unsigned char *, unsigned );
+
+extern void _generic_read_RGBA_span_RGB565_MMX( const unsigned char *,
+    unsigned char *, unsigned );
+#endif
+
+#endif /* READ_RGBA_SPAN_X86_H */
--- a/src/arch/x86/rtasm/x86sse.c
+++ b/src/arch/x86/rtasm/x86sse.c
--- a/src/arch/x86/rtasm/x86sse.h
+++ b/src/arch/x86/rtasm/x86sse.h
@ -0,0 +1,256 @@
+
+#ifndef _X86SSE_H_
+#define _X86SSE_H_
+
+#if defined(__i386__) || defined(__386__)
+
+/* It is up to the caller to ensure that instructions issued are
+ * suitable for the host cpu.  There are no checks made in this module
+ * for mmx/sse/sse2 support on the cpu.
+ *
+ * struct x86_reg describes one instruction operand (a register, or a
+ * register-based memory reference) using bitfields that total 32 bits
+ * (3 + 3 + 2 + 24).
+ */
+struct x86_reg {
+   unsigned file:3;		/* register file; x86_make_reg() takes an enum x86_reg_file for this */
+   unsigned idx:3;		/* register number; x86_make_reg() takes an enum x86_reg_name for this */
+   unsigned mod:2;		/* mod_REG if this is just a register; other values are in enum x86_reg_mod */
+   int      disp:24;		/* only +/- 23bits of offset - should be enough... */
+};
+
+struct x86_function {
+   unsigned size;
+   unsigned char *store;
+   unsigned char *csr;
+   unsigned stack_offset;
+   int need_emms;
+   const char *fn;
+};
+
+enum x86_reg_file {
+   file_REG32,
+   file_MMX,
+   file_XMM,
+   file_x87
+};
+
+/* Values for mod field of modr/m byte
+ */
+enum x86_reg_mod {
+   mod_INDIRECT,
+   mod_DISP8,
+   mod_DISP32,
+   mod_REG
+};
+
+enum x86_reg_name {
+   reg_AX,
+   reg_CX,
+   reg_DX,
+   reg_BX,
+   reg_SP,
+   reg_BP,
+   reg_SI,
+   reg_DI
+};
+
+
+enum x86_cc {
+   cc_O,			/* overflow */
+   cc_NO,			/* not overflow */
+   cc_NAE,			/* not above or equal / carry */
+   cc_AE,			/* above or equal / not carry */
+   cc_E,			/* equal / zero */
+   cc_NE			/* not equal / not zero */
+};
+
+enum sse_cc {
+   cc_Equal,
+   cc_LessThan,
+   cc_LessThanEqual,
+   cc_Unordered,
+   cc_NotEqual,
+   cc_NotLessThan,
+   cc_NotLessThanEqual,
+   cc_Ordered
+};
+
+#define cc_Z  cc_E
+#define cc_NZ cc_NE
+
+/* Begin/end/retrieve function creation:
+ */
+
+
+void x86_init_func( struct x86_function *p );
+int x86_init_func_size( struct x86_function *p, unsigned code_size );
+void x86_release_func( struct x86_function *p );
+void (*x86_get_func( struct x86_function *p ))( void );
+
+
+
+/* Create and manipulate registers and regmem values:
+ */
+struct x86_reg x86_make_reg( enum x86_reg_file file,
+			     enum x86_reg_name idx );
+
+struct x86_reg x86_make_disp( struct x86_reg reg,
+			      int disp );
+
+struct x86_reg x86_deref( struct x86_reg reg );
+
+struct x86_reg x86_get_base_reg( struct x86_reg reg );
+
+
+/* Labels, jumps and fixup:
+ */
+unsigned char *x86_get_label( struct x86_function *p );
+
+void x86_jcc( struct x86_function *p,
+	      enum x86_cc cc,
+	      unsigned char *label );
+
+unsigned char *x86_jcc_forward( struct x86_function *p,
+			  enum x86_cc cc );
+
+unsigned char *x86_jmp_forward( struct x86_function *p);
+
+unsigned char *x86_call_forward( struct x86_function *p);
+
+void x86_fixup_fwd_jump( struct x86_function *p,
+			 unsigned char *fixup );
+
+void x86_jmp( struct x86_function *p, unsigned char *label );
+
+/* void x86_call( struct x86_function *p, void (*label)() ); */
+void x86_call( struct x86_function *p, struct x86_reg reg);
+
+/* michal:
+ * Temporary. As I need immediate operands, and don't want to mess with the
+ * codegen, I load the immediate into a general purpose register and use it.
+ */
+void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm );
+
+
+/* Shuffle-selector helpers for sse_shufps() and sse2_pshufd().
+ *
+ * SHUF(x,y,z,w) packs four 2-bit selectors into one byte: x in bits 0-1,
+ * y in bits 2-3, z in bits 4-5 and w in bits 6-7.
+ *
+ * SHUF_NOOP is SHUF(0,1,2,3) (0xE4), i.e. selector i for position i.  It
+ * used to be spelled with RSW(), a macro this header does not define, so
+ * any use of SHUF_NOOP would have failed to compile.
+ *
+ * GET_SHUF(swz, idx) extracts the 2-bit selector at position idx (0..3)
+ * from a packed value.
+ */
+#define SHUF(_x,_y,_z,_w)       (((_x)<<0) | ((_y)<<2) | ((_z)<<4) | ((_w)<<6))
+#define SHUF_NOOP               SHUF(0,1,2,3)
+#define GET_SHUF(swz, idx)      (((swz) >> ((idx)*2)) & 0x3)
+
+void mmx_emms( struct x86_function *p );
+void mmx_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void mmx_movq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void mmx_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void mmx_packuswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+
+void sse2_cvtps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_cvttps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_packsswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_packuswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_pshufd( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0,
+                  unsigned char shuf );
+void sse2_rcpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_rcpss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+
+void sse_addps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_addss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_cvtps2pi( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_divss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_andnps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_andps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_cmpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src,
+                unsigned char cc );
+void sse_maxps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_maxss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_minps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_movaps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_movhlps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_movhps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_movlhps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_movlps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_movss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_movups( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_mulps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_mulss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_orps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_xorps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_subps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_rsqrtps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_rsqrtss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_shufps( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0,
+                 unsigned char shuf );
+void sse_pmovmskb( struct x86_function *p, struct x86_reg dest, struct x86_reg src );
+
+void x86_add( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_and( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_cmp( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_dec( struct x86_function *p, struct x86_reg reg );
+void x86_inc( struct x86_function *p, struct x86_reg reg );
+void x86_lea( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_mov( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_mul( struct x86_function *p, struct x86_reg src );
+void x86_or( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_pop( struct x86_function *p, struct x86_reg reg );
+void x86_push( struct x86_function *p, struct x86_reg reg );
+void x86_ret( struct x86_function *p );
+void x86_sub( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_test( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_xor( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_sahf( struct x86_function *p );
+
+void x87_f2xm1( struct x86_function *p );
+void x87_fabs( struct x86_function *p );
+void x87_fadd( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
+void x87_faddp( struct x86_function *p, struct x86_reg dst );
+void x87_fchs( struct x86_function *p );
+void x87_fclex( struct x86_function *p );
+void x87_fcom( struct x86_function *p, struct x86_reg dst );
+void x87_fcomp( struct x86_function *p, struct x86_reg dst );
+void x87_fcos( struct x86_function *p );
+void x87_fdiv( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
+void x87_fdivp( struct x86_function *p, struct x86_reg dst );
+void x87_fdivr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
+void x87_fdivrp( struct x86_function *p, struct x86_reg dst );
+void x87_fild( struct x86_function *p, struct x86_reg arg );
+void x87_fist( struct x86_function *p, struct x86_reg dst );
+void x87_fistp( struct x86_function *p, struct x86_reg dst );
+void x87_fld( struct x86_function *p, struct x86_reg arg );
+void x87_fld1( struct x86_function *p );
+void x87_fldcw( struct x86_function *p, struct x86_reg arg );
+void x87_fldl2e( struct x86_function *p );
+void x87_fldln2( struct x86_function *p );
+void x87_fldz( struct x86_function *p );
+void x87_fmul( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
+void x87_fmulp( struct x86_function *p, struct x86_reg dst );
+void x87_fnclex( struct x86_function *p );
+void x87_fprndint( struct x86_function *p );
+void x87_fscale( struct x86_function *p );
+void x87_fsin( struct x86_function *p );
+void x87_fsincos( struct x86_function *p );
+void x87_fsqrt( struct x86_function *p );
+void x87_fst( struct x86_function *p, struct x86_reg dst );
+void x87_fstp( struct x86_function *p, struct x86_reg dst );
+void x87_fsub( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
+void x87_fsubp( struct x86_function *p, struct x86_reg dst );
+void x87_fsubr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
+void x87_fsubrp( struct x86_function *p, struct x86_reg dst );
+void x87_fxch( struct x86_function *p, struct x86_reg dst );
+void x87_fxtract( struct x86_function *p );
+void x87_fyl2x( struct x86_function *p );
+void x87_fyl2xp1( struct x86_function *p );
+void x87_fwait( struct x86_function *p );
+void x87_fnstsw( struct x86_function *p, struct x86_reg dst );
+void x87_fucompp( struct x86_function *p );
+void x87_fucomp( struct x86_function *p, struct x86_reg arg );
+void x87_fucom( struct x86_function *p, struct x86_reg arg );
+
+
+
+/* Retrieve a reference to one of the function arguments, taking into
+ * account any push/pop activity.  Note - doesn't track explicit
+ * manipulation of ESP by other instructions.
+ */
+struct x86_reg x86_fn_arg( struct x86_function *p, unsigned arg );
+
+#endif
+#endif
--- a/src/arch/x86/sse.c
+++ b/src/arch/x86/sse.c
@ -0,0 +1,123 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  6.0
+ *
+ * Copyright (C) 1999-2004  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * PentiumIII-SIMD (SSE) optimizations contributed by
+ * Andre Werthmann <wertmann@cs.uni-potsdam.de>
+ */
+
+#include "main/glheader.h"
+#include "main/context.h"
+#include "math/m_xform.h"
+#include "tnl/t_context.h"
+
+#include "sse.h"
+#include "x86_xform.h"
+
+#ifdef DEBUG_MATH
+#include "math/m_debug.h"
+#endif
+
+
+#ifdef USE_SSE_ASM
+DECLARE_XFORM_GROUP( sse, 2 )
+DECLARE_XFORM_GROUP( sse, 3 )
+
+#if 1
+/* Some functions are not written in SSE-assembly, because the fpu ones are faster */
+extern void _ASMAPI _mesa_sse_transform_normals_no_rot( NORM_ARGS );
+extern void _ASMAPI _mesa_sse_transform_rescale_normals( NORM_ARGS );
+extern void _ASMAPI _mesa_sse_transform_rescale_normals_no_rot( NORM_ARGS );
+
+extern void _ASMAPI _mesa_sse_transform_points4_general( XFORM_ARGS );
+extern void _ASMAPI _mesa_sse_transform_points4_3d( XFORM_ARGS );
+/* XXX this function segfaults, see below */
+extern void _ASMAPI _mesa_sse_transform_points4_identity( XFORM_ARGS );
+/* XXX this one works, see below */
+extern void _ASMAPI _mesa_x86_transform_points4_identity( XFORM_ARGS );
+#else
+DECLARE_NORM_GROUP( sse )
+#endif
+
+
+extern void _ASMAPI
+_mesa_v16_sse_general_xform( GLfloat *first_vert,
+			     const GLfloat *m,
+			     const GLfloat *src,
+			     GLuint src_stride,
+			     GLuint count );
+
+extern void _ASMAPI
+_mesa_sse_project_vertices( GLfloat *first,
+			    GLfloat *last,
+			    const GLfloat *m,
+			    GLuint stride );
+
+extern void _ASMAPI
+_mesa_sse_project_clipped_vertices( GLfloat *first,
+				    GLfloat *last,
+				    const GLfloat *m,
+				    GLuint stride,
+				    const GLubyte *clipmask );
+#endif
+
+
+void _mesa_init_sse_transform_asm( void )	/* installs the SSE routines into the transform/normal tables; empty body unless USE_SSE_ASM is defined */
+{
+#ifdef USE_SSE_ASM
+   ASSIGN_XFORM_GROUP( sse, 2 );
+   ASSIGN_XFORM_GROUP( sse, 3 );
+
+#if 1	/* active branch: install only a hand-picked subset of size-4 and normal routines */
+   /* TODO: Finish these off.
+    *
+    * Only the MATRIX_GENERAL, MATRIX_3D and MATRIX_IDENTITY entries of
+    * _mesa_transform_tab[4] and three _mesa_normal_tab entries are
+    * overridden below; every other entry is left as it was.
+    */
+   _mesa_transform_tab[4][MATRIX_GENERAL] =
+      _mesa_sse_transform_points4_general;
+   _mesa_transform_tab[4][MATRIX_3D] =
+      _mesa_sse_transform_points4_3d;
+   /* XXX NOTE: _mesa_sse_transform_points4_identity segfaults with the
+      conformance tests, so use the x86 version.
+   */
+   _mesa_transform_tab[4][MATRIX_IDENTITY] =
+      _mesa_x86_transform_points4_identity;/*_mesa_sse_transform_points4_identity;*/
+
+   _mesa_normal_tab[NORM_TRANSFORM_NO_ROT] =
+      _mesa_sse_transform_normals_no_rot;
+   _mesa_normal_tab[NORM_TRANSFORM | NORM_RESCALE] =
+      _mesa_sse_transform_rescale_normals;
+   _mesa_normal_tab[NORM_TRANSFORM_NO_ROT | NORM_RESCALE] =
+      _mesa_sse_transform_rescale_normals_no_rot;
+#else	/* disabled branch: would install the complete size-4 and normal groups via the group macros */
+   ASSIGN_XFORM_GROUP( sse, 4 );
+
+   ASSIGN_NORM_GROUP( sse );
+#endif	/* 1 */
+
+#ifdef DEBUG_MATH
+   _math_test_all_transform_functions( "SSE" );
+   _math_test_all_normal_transform_functions( "SSE" );
+#endif	/* DEBUG_MATH */
+#endif	/* USE_SSE_ASM */
+}
+
--- a/src/arch/x86/sse.h
+++ b/src/arch/x86/sse.h
@ -0,0 +1,36 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version:  3.5
+ *
+ * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * PentiumIII-SIMD (SSE) optimizations contributed by
+ * Andre Werthmann <wertmann@cs.uni-potsdam.de>
+ */
+
+#ifndef __SSE_H__
+#define __SSE_H__
+
+/*
+ * Installs the SSE assembly transform and normal routines into
+ * _mesa_transform_tab / _mesa_normal_tab.  The definition is empty when
+ * USE_SSE_ASM is not defined, so calling it is always safe.
+ */
+void _mesa_init_sse_transform_asm( void );
+
+#endif
--- a/src/arch/x86/sse_normal.S
+++ b/src/arch/x86/sse_normal.S
@ -0,0 +1,261 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version:  3.5
+ *
+ * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/** TODO:
+  * - insert PREFETCH instructions to avoid cache-misses !
+  * - some more optimizations are possible...
+  * - for 40-50% more performance in the SSE-functions, the
+  *   data (trans-matrix, src_vert, dst_vert) needs to be 16byte aligned !
+  */
+
+#ifdef USE_SSE_ASM
+#include "assyntax.h"
+#include "matypes.h"
+#include "norm_args.h"
+
+   SEG_TEXT
+
+/*
+ * Operand shorthands used by every routine in this file:
+ *   M(i)   - i-th 32-bit element of the matrix addressed by EDX
+ *   S(i)   - i-th 32-bit element of the current source element (ESI)
+ *   D(i)   - i-th 32-bit element of the current dest element (EDI)
+ *   STRIDE - stride field of the source vector struct; only valid while
+ *            ESI still points at the struct (before V4F_START is loaded).
+ *
+ * STRIDE uses the symbolic V4F_STRIDE offset, like the V4F_COUNT and
+ * V4F_START accesses in this file, instead of a hard-coded byte offset.
+ */
+#define M(i)    REGOFF(i * 4, EDX)
+#define S(i)	REGOFF(i * 4, ESI)
+#define D(i)	REGOFF(i * 4, EDI)
+#define STRIDE  REGOFF(V4F_STRIDE, ESI)
+
+
+/*
+ * _mesa_sse_transform_rescale_normals_no_rot
+ *
+ * For every source element writes, with m = the pointer stored at
+ * MATRIX_INV in the matrix argument and scale = ARG_SCALE:
+ *    out[0] = in[0] * m[0]  * scale
+ *    out[1] = in[1] * m[5]  * scale
+ *    out[2] = in[2] * m[10] * scale
+ *
+ * The source pointer advances by the stride read from the source vector;
+ * the destination advances by a fixed 16 bytes per element.  The dest
+ * count is set to the source count.  With a zero count nothing is written.
+ *
+ * ESI/EDI are saved and restored; EAX, ECX, EDX and XMM0-XMM2 are
+ * overwritten.
+ */
+ALIGNTEXT16
+GLOBL GLNAME(_mesa_sse_transform_rescale_normals_no_rot)
+HIDDEN(_mesa_sse_transform_rescale_normals_no_rot)
+GLNAME(_mesa_sse_transform_rescale_normals_no_rot):
+
+/* Two 4-byte pushes follow; presumably consumed by the ARG_* macros -- confirm in norm_args.h. */
+#define FRAME_OFFSET 8
+	PUSH_L  ( ESI )
+	PUSH_L  ( EDI )
+
+	MOV_L	( ARG_IN, ESI )				/* ptr to source vector (V4F_* layout) */
+	MOV_L	( ARG_DEST, EDI )			/* ptr to dest vector (V4F_* layout) */
+
+	MOV_L	( ARG_MAT, EDX )			/* ptr to matrix */
+	MOV_L	( REGOFF(MATRIX_INV, EDX), EDX)		/* matrix->inv */
+
+	MOV_L	( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
+
+	TEST_L	( ECX, ECX )
+	JZ( LLBL(K_G3TRNNRR_finish) )			/* count was zero; go to finish */
+
+	MOV_L	( STRIDE, EAX )				/* source stride in bytes */
+	MOV_L	( ECX, REGOFF(V4F_COUNT, EDI) )		/* set dest-count */
+
+	IMUL_L( CONST(16), ECX )			/* count *= 16 (dest element size) */
+	MOV_L( REGOFF(V4F_START, ESI), ESI )		/* ptr to first source vertex */
+
+	MOV_L( REGOFF(V4F_START, EDI), EDI )		/* ptr to first dest vertex */
+	ADD_L( EDI, ECX ) 				/* ECX = one past the last dest element */
+
+	/* Loop-invariant setup: XMM1 = m5*scale | m0*scale, XMM0 (low) = m10*scale. */
+ALIGNTEXT32
+	MOVSS	( M(0), XMM1 )				/* m0 */
+	MOVSS	( M(5), XMM2 )				/* m5 */
+	UNPCKLPS( XMM2, XMM1 )				/* m5 | m0 */
+	MOVSS	( ARG_SCALE, XMM0 )			/* scale */
+	SHUFPS	( CONST(0x0), XMM0, XMM0 )		/* scale broadcast to all lanes */
+	MULPS	( XMM0, XMM1 )				/* m5*scale | m0*scale */
+	MULSS	( M(10), XMM0 )				/* m10*scale */
+
+ALIGNTEXT32
+LLBL(K_G3TRNNRR_top):
+	MOVLPS	( S(0), XMM2 )				/* uy | ux */
+	MULPS	( XMM1, XMM2 )				/* uy*m5*scale | ux*m0*scale */
+	MOVLPS	( XMM2, D(0) )				/* ->D(1) | D(0) */
+
+	MOVSS	( S(2), XMM2 )				/* uz */
+	MULSS	( XMM0, XMM2 )				/* uz*m10*scale */
+	MOVSS	( XMM2, D(2) )				/* ->D(2) */
+
+	/* The _skip label is not jumped to anywhere in this routine. */
+LLBL(K_G3TRNNRR_skip):
+	ADD_L	( CONST(16), EDI )			/* next dest element */
+	ADD_L	( EAX, ESI )				/* next source element */
+	CMP_L	( ECX, EDI )				/* reached the end of dest? */
+	JNE	( LLBL(K_G3TRNNRR_top) )
+
+LLBL(K_G3TRNNRR_finish):
+	POP_L	( EDI )
+	POP_L	( ESI )
+	RET
+#undef FRAME_OFFSET
+
+
+
+/*
+ * _mesa_sse_transform_rescale_normals
+ *
+ * For every source element writes, with m = the pointer stored at
+ * MATRIX_INV in the matrix argument and s = ARG_SCALE:
+ *    out[0] = s * (ux*m[0] + uy*m[1] + uz*m[2])
+ *    out[1] = s * (ux*m[4] + uy*m[5] + uz*m[6])
+ *    out[2] = s * (ux*m[8] + uy*m[9] + uz*m[10])
+ * (the scale is folded into the matrix elements before multiplying).
+ *
+ * The source pointer advances by the stride read from the source vector;
+ * the destination advances by a fixed 16 bytes per element.  The dest
+ * count is set to the source count.  With a zero count nothing is written.
+ *
+ * ESI/EDI are saved and restored; EAX, ECX, EDX and XMM0-XMM7 are
+ * overwritten.
+ */
+ALIGNTEXT16
+GLOBL GLNAME(_mesa_sse_transform_rescale_normals)
+HIDDEN(_mesa_sse_transform_rescale_normals)
+GLNAME(_mesa_sse_transform_rescale_normals):
+
+/* Two 4-byte pushes follow; presumably consumed by the ARG_* macros -- confirm in norm_args.h. */
+#define FRAME_OFFSET 8
+	PUSH_L  ( ESI )
+	PUSH_L  ( EDI )
+
+	MOV_L	( ARG_IN, ESI )				/* ptr to source vector (V4F_* layout) */
+	MOV_L	( ARG_DEST, EDI )			/* ptr to dest vector (V4F_* layout) */
+
+	MOV_L	( ARG_MAT, EDX )			/* ptr to matrix */
+	MOV_L	( REGOFF(MATRIX_INV, EDX), EDX)		/* matrix->inv */
+
+	MOV_L	( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
+
+	TEST_L	( ECX, ECX )
+	JZ( LLBL(K_G3TRNR_finish) )			/* count was zero; go to finish */
+
+	MOV_L	( STRIDE, EAX )				/* source stride in bytes */
+	MOV_L	( ECX, REGOFF(V4F_COUNT, EDI) )		/* set dest-count */
+
+	IMUL_L( CONST(16), ECX )			/* count *= 16 (dest element size) */
+	MOV_L( REGOFF(V4F_START, ESI), ESI )		/* ptr to first source vertex */
+
+	MOV_L( REGOFF(V4F_START, EDI), EDI )		/* ptr to first dest vertex */
+	ADD_L( EDI, ECX ) 				/* ECX = one past the last dest element */
+
+	/* Loop-invariant setup: XMM0-XMM2 hold the scaled coefficient pairs for
+	 * out[0]/out[1]; XMM6/XMM7 hold the scaled m8/m9 for out[2]. */
+ALIGNTEXT32
+	MOVSS	( M(0), XMM0 )				/* m0 */
+	MOVSS	( M(4), XMM1 )				/* m4 */
+	UNPCKLPS( XMM1, XMM0 )				/* m4 | m0 */
+
+	MOVSS	( ARG_SCALE, XMM4 )			/* scale */
+	SHUFPS	( CONST(0x0), XMM4, XMM4 )		/* scale broadcast to all lanes */
+
+	MULPS	( XMM4, XMM0 )				/* m4*scale | m0*scale */
+	MOVSS	( M(1), XMM1 )				/* m1 */
+	MOVSS	( M(5), XMM2 )				/* m5 */
+	UNPCKLPS( XMM2, XMM1 )				/* m5 | m1 */
+	MULPS	( XMM4, XMM1 )				/* m5*scale | m1*scale */
+	MOVSS	( M(2), XMM2 )				/* m2 */
+	MOVSS	( M(6), XMM3 )				/* m6 */
+	UNPCKLPS( XMM3, XMM2 )				/* m6 | m2 */
+	MULPS	( XMM4, XMM2 )				/* m6*scale | m2*scale */
+
+	MOVSS	( M(8), XMM6 )				/* m8 */
+	MULSS	( ARG_SCALE, XMM6 )			/* m8*scale */
+	MOVSS	( M(9), XMM7 )				/* m9 */
+	MULSS	( ARG_SCALE, XMM7 )			/* m9*scale */
+
+ALIGNTEXT32
+LLBL(K_G3TRNR_top):
+	/* out[0], out[1]: two lanes at a time (coefficients already scaled). */
+	MOVSS	( S(0), XMM3 )				/* ux */
+	SHUFPS	( CONST(0x0), XMM3, XMM3 )		/* ux | ux */
+	MULPS	( XMM0, XMM3 )				/* ux*m4 | ux*m0 */
+	MOVSS	( S(1), XMM4 )				/* uy */
+	SHUFPS	( CONST(0x0), XMM4, XMM4 )		/* uy | uy */
+	MULPS	( XMM1, XMM4 )				/* uy*m5 | uy*m1 */
+	MOVSS	( S(2), XMM5 )				/* uz */
+	SHUFPS	( CONST(0x0), XMM5, XMM5 )		/* uz | uz */
+	MULPS	( XMM2, XMM5 )				/* uz*m6 | uz*m2 */
+
+	ADDPS	( XMM4, XMM3 )
+	ADDPS	( XMM5, XMM3 )
+	MOVLPS	( XMM3, D(0) )				/* ->D(1) | D(0) */
+
+	/* out[2]: m10*scale is recomputed on every iteration; XMM0-XMM2, XMM6
+	 * and XMM7 are occupied by the other constants. */
+	MOVSS	( M(10), XMM3 )				/* m10 */
+	MULSS	( ARG_SCALE, XMM3 )			/* m10*scale */
+	MULSS	( S(2), XMM3 )				/* m10*scale*uz */
+	MOVSS	( S(1), XMM4 )				/* uy */
+	MULSS	( XMM7, XMM4 )				/* uy*m9*scale */
+	MOVSS	( S(0), XMM5 )				/* ux */
+	MULSS	( XMM6, XMM5 )				/* ux*m8*scale */
+
+	ADDSS	( XMM4, XMM3 )
+	ADDSS	( XMM5, XMM3 )
+	MOVSS	( XMM3, D(2) )				/* ->D(2) */
+
+	/* The _skip label is not jumped to anywhere in this routine. */
+LLBL(K_G3TRNR_skip):
+	ADD_L	( CONST(16), EDI )			/* next dest element */
+	ADD_L	( EAX, ESI )				/* next source element */
+	CMP_L	( ECX, EDI )				/* reached the end of dest? */
+	JNE	( LLBL(K_G3TRNR_top) )
+
+LLBL(K_G3TRNR_finish):
+	POP_L	( EDI )
+	POP_L	( ESI )
+	RET
+#undef FRAME_OFFSET
+
+
+/*
+ * _mesa_sse_transform_normals_no_rot
+ *
+ * For every source element writes, with m = the pointer stored at
+ * MATRIX_INV in the matrix argument:
+ *    out[0] = in[0] * m[0]
+ *    out[1] = in[1] * m[5]
+ *    out[2] = in[2] * m[10]
+ * No scale argument is read by this routine.
+ *
+ * The source pointer advances by the stride read from the source vector;
+ * the destination advances by a fixed 16 bytes per element.  The dest
+ * count is set to the source count.  With a zero count nothing is written.
+ *
+ * ESI/EDI are saved and restored; EAX, ECX, EDX and XMM0-XMM2 are
+ * overwritten.
+ */
+ALIGNTEXT16
+GLOBL GLNAME(_mesa_sse_transform_normals_no_rot)
+HIDDEN(_mesa_sse_transform_normals_no_rot)
+GLNAME(_mesa_sse_transform_normals_no_rot):
+
+/* Two 4-byte pushes follow; presumably consumed by the ARG_* macros -- confirm in norm_args.h. */
+#define FRAME_OFFSET 8
+	PUSH_L  ( ESI )
+	PUSH_L  ( EDI )
+
+	MOV_L	( ARG_IN, ESI )				/* ptr to source vector (V4F_* layout) */
+	MOV_L	( ARG_DEST, EDI )			/* ptr to dest vector (V4F_* layout) */
+
+	MOV_L	( ARG_MAT, EDX )			/* ptr to matrix */
+	MOV_L	( REGOFF(MATRIX_INV, EDX), EDX)		/* matrix->inv */
+
+	MOV_L	( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
+
+	TEST_L	( ECX, ECX )
+	JZ( LLBL(K_G3TNNRR_finish) )			/* count was zero; go to finish */
+
+	MOV_L	( STRIDE, EAX )				/* source stride in bytes */
+	MOV_L	( ECX, REGOFF(V4F_COUNT, EDI) )		/* set dest-count */
+
+	IMUL_L( CONST(16), ECX )			/* count *= 16 (dest element size) */
+	MOV_L( REGOFF(V4F_START, ESI), ESI )		/* ptr to first source vertex */
+
+	MOV_L( REGOFF(V4F_START, EDI), EDI )		/* ptr to first dest vertex */
+	ADD_L( EDI, ECX ) 				/* ECX = one past the last dest element */
+
+	/* Loop-invariant setup: XMM0 = m5 | m0, XMM1 (low) = m10. */
+ALIGNTEXT32
+	MOVSS( M(0), XMM0 )				/* m0 */
+	MOVSS( M(5), XMM1 )				/* m5 */
+	UNPCKLPS( XMM1, XMM0 )				/* m5 | m0 */
+	MOVSS( M(10), XMM1 )				/* m10 */
+
+ALIGNTEXT32
+LLBL(K_G3TNNRR_top):
+	MOVLPS( S(0), XMM2 )				/* uy | ux */
+	MULPS( XMM0, XMM2 )				/* uy*m5 | ux*m0 */
+	MOVLPS( XMM2, D(0) )				/* ->D(1) | D(0) */
+
+	MOVSS( S(2), XMM2 )				/* uz */
+	MULSS( XMM1, XMM2 )				/* uz*m10 */
+	MOVSS( XMM2, D(2) )				/* ->D(2) */
+
+	/* The _skip label is not jumped to anywhere in this routine. */
+LLBL(K_G3TNNRR_skip):
+	ADD_L	( CONST(16), EDI )			/* next dest element */
+	ADD_L	( EAX, ESI )				/* next source element */
+	CMP_L	( ECX, EDI )				/* reached the end of dest? */
+	JNE	( LLBL(K_G3TNNRR_top) )
+
+LLBL(K_G3TNNRR_finish):
+	POP_L	( EDI )
+	POP_L	( ESI )
+	RET
+#undef FRAME_OFFSET
+#endif
+	
+#if defined (__ELF__) && defined (__linux__)
+	.section .note.GNU-stack,"",%progbits
+#endif
--- a/src/arch/x86/sse_xform1.S
+++ b/src/arch/x86/sse_xform1.S
@ -0,0 +1,446 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version:  3.5
+ *
+ * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/** TODO:
+  * - insert PREFETCH instructions to avoid cache-misses !
+  * - some more optimizations are possible...
+  * - for 40-50% more performance in the SSE-functions, the
+  *   data (trans-matrix, src_vert, dst_vert) needs to be 16byte aligned !
+  */
+
+#ifdef USE_SSE_ASM
+#include "assyntax.h"
+#include "matypes.h"
+#include "xform_args.h"
+
+   SEG_TEXT
+
+/*
+ * Operand shorthands used by every routine in this file: the i-th 32-bit
+ * element relative to ESI (current source element), EDI (current dest
+ * element) and EDX (matrix).
+ */
+#define S(i) 	REGOFF(i * 4, ESI)
+#define D(i) 	REGOFF(i * 4, EDI)
+#define M(i) 	REGOFF(i * 4, EDX)
+
+
+/*
+ * _mesa_sse_transform_points1_general
+ *
+ * For every source element (only its first component ox is read) writes
+ * four floats:
+ *    out[k] = ox * m[k] + m[12 + k]      for k = 0..3
+ *
+ * Dest bookkeeping: flags |= VEC_SIZE_4, size = 4, count = source count.
+ * The source advances by the source stride, the destination by a fixed
+ * 16 bytes.  With a zero count nothing is written.
+ *
+ * The matrix rows are loaded with MOVAPS, so M(0) and M(12) must be
+ * 16-byte aligned; the result is stored with MOVUPS (no alignment
+ * requirement on the destination).
+ *
+ * ESI/EDI are saved and restored; EAX, ECX, EDX and XMM0-XMM2 are
+ * overwritten.
+ */
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points1_general)
+HIDDEN( _mesa_sse_transform_points1_general )
+GLNAME( _mesa_sse_transform_points1_general ):
+
+/* Two 4-byte pushes follow; the explicit +8 on the OFFSET_* loads accounts for them. */
+#define FRAME_OFFSET 8
+    PUSH_L    ( ESI )
+    PUSH_L    ( EDI )
+
+    MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) 	/* ptr to source GLvector4f */
+    MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) 	/* ptr to dest GLvector4f */
+
+    MOV_L( ARG_MATRIX, EDX ) 			/* ptr to matrix */
+    MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
+
+    CMP_L( CONST(0), ECX )			/* count == 0 ? */
+    JE( LLBL(K_GTP1GR_finish) )			/* yes -> nothing to do. */
+
+    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* stride */
+    OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */
+
+    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
+    MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */
+
+    SHL_L( CONST(4), ECX ) 			/* count *= 16 */
+    MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */
+
+    MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
+    ADD_L( EDI, ECX ) 				/* ECX = one past the last dest element */
+
+
+ALIGNTEXT32
+    MOVAPS( M(0), XMM0 )			/* m3  | m2  | m1  | m0  */
+    MOVAPS( M(12), XMM1 )			/* m15 | m14 | m13 | m12 */
+
+ALIGNTEXT32
+LLBL(K_GTP1GR_top):
+    MOVSS( S(0), XMM2 )				/* ox */
+    SHUFPS( CONST(0x0), XMM2, XMM2 )		/* ox | ox | ox | ox */
+    MULPS( XMM0, XMM2 )				/* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
+    ADDPS( XMM1, XMM2 )				/* + | + | + | + */
+    MOVUPS( XMM2, D(0) )			/* store all four results */
+
+    /* The _skip label is not jumped to anywhere in this routine. */
+LLBL(K_GTP1GR_skip):
+    ADD_L     ( CONST(16), EDI )
+    ADD_L     ( EAX, ESI )
+    CMP_L     ( ECX, EDI )
+    JNE       ( LLBL(K_GTP1GR_top) )
+
+LLBL(K_GTP1GR_finish):
+    POP_L     ( EDI )
+    POP_L     ( ESI )
+    RET
+#undef FRAME_OFFSET
+
+
+
+/*
+ * _mesa_sse_transform_points1_identity
+ *
+ * Copies the first 32-bit component of every source element to the
+ * corresponding dest element (a plain integer move through EDX; no SSE
+ * register is used and the matrix argument is not read).
+ *
+ * Dest bookkeeping: flags |= VEC_SIZE_1, size = 1, count = source count.
+ * These fields are updated before the start pointers are compared, so they
+ * are written even when source and dest data start at the same address and
+ * the copy loop is skipped.  With a zero count nothing is written.
+ *
+ * ESI/EDI are saved and restored; EAX, ECX and EDX are overwritten.
+ */
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points1_identity)
+HIDDEN(_mesa_sse_transform_points1_identity)
+GLNAME( _mesa_sse_transform_points1_identity ):
+
+/* Two 4-byte pushes follow; the explicit +8 on the OFFSET_* loads accounts for them. */
+#define FRAME_OFFSET 8
+    PUSH_L    ( ESI )
+    PUSH_L    ( EDI )
+
+    MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) 	/* ptr to source GLvector4f */
+    MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) 	/* ptr to dest GLvector4f */
+
+    MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
+
+    TEST_L( ECX, ECX)
+    JZ( LLBL(K_GTP1IR_finish) ) 		/* count was zero; go to finish */
+
+    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* stride */
+    OR_L( CONST(VEC_SIZE_1), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */
+
+    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
+    MOV_L( CONST(1), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */
+
+    SHL_L( CONST(4), ECX ) 			/* count *= 16 */
+    MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */
+
+    MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
+    ADD_L( EDI, ECX ) 				/* ECX = one past the last dest element */
+
+    CMP_L( ESI, EDI )				/* same start address? */
+    JE( LLBL(K_GTP1IR_finish) )			/* yes -> no copy needed */
+
+
+ALIGNTEXT32
+LLBL(K_GTP1IR_top):
+    MOV_L( S(0), EDX )				/* copy component 0 bit-for-bit */
+    MOV_L( EDX, D(0) )
+
+    /* The _skip label is not jumped to anywhere in this routine. */
+LLBL(K_GTP1IR_skip):
+    ADD_L     ( CONST(16), EDI )
+    ADD_L     ( EAX, ESI )
+    CMP_L     ( ECX, EDI )
+    JNE       ( LLBL(K_GTP1IR_top) )
+
+LLBL(K_GTP1IR_finish):
+    POP_L     ( EDI )
+    POP_L     ( ESI )
+    RET
+#undef FRAME_OFFSET
+
+
+
+/*
+ * _mesa_sse_transform_points1_3d_no_rot
+ *
+ * For every source element (only its first component ox is read) writes:
+ *    out[0] = ox * m[0] + m[12]
+ *    out[1] = m[13]
+ *    out[2] = m[14]
+ *
+ * Dest bookkeeping: flags |= VEC_SIZE_3, size = 3, count = source count.
+ * The source advances by the source stride, the destination by a fixed
+ * 16 bytes.  With a zero count nothing is written.
+ *
+ * ESI/EDI are saved and restored; EAX, ECX, EDX and XMM0-XMM4 are
+ * overwritten.
+ */
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points1_3d_no_rot)
+HIDDEN(_mesa_sse_transform_points1_3d_no_rot)
+GLNAME(_mesa_sse_transform_points1_3d_no_rot):
+
+/* Two 4-byte pushes follow; the explicit +8 on the OFFSET_* loads accounts for them. */
+#define FRAME_OFFSET 8
+    PUSH_L( ESI )
+    PUSH_L( EDI )
+
+    MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI )	/* ptr to source GLvector4f */
+    MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) 	/* ptr to dest GLvector4f */
+
+    MOV_L( ARG_MATRIX, EDX ) 			/* ptr to matrix */
+    MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
+
+    TEST_L( ECX, ECX)
+    JZ( LLBL(K_GTP13DNRR_finish) ) 		/* count was zero; go to finish */
+
+    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* stride */
+    OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */
+
+    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
+    MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */
+
+    SHL_L( CONST(4), ECX ) 			/* count *= 16 */
+    MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */
+
+    MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
+    ADD_L( EDI, ECX ) 				/* ECX = one past the last dest element */
+
+
+ALIGNTEXT32
+    MOVSS( M(0), XMM0 )				/* m0 */
+    MOVSS( M(12), XMM1 )			/* m12 */
+    MOVSS( M(13), XMM2 )			/* m13 */
+    MOVSS( M(14), XMM3 )			/* m14 */
+
+ALIGNTEXT32
+LLBL(K_GTP13DNRR_top):
+    MOVSS( S(0), XMM4 )				/* ox */
+    MULSS( XMM0, XMM4 )				/* ox*m0 */
+    ADDSS( XMM1, XMM4 )				/* ox*m0+m12 */
+    MOVSS( XMM4, D(0) )
+
+    MOVSS( XMM2, D(1) )				/* m13 -> D(1) */
+    MOVSS( XMM3, D(2) )				/* m14 -> D(2) */
+
+    /* The _skip label is not jumped to anywhere in this routine. */
+LLBL(K_GTP13DNRR_skip):
+    ADD_L    ( CONST(16), EDI )
+    ADD_L    ( EAX, ESI )
+    CMP_L    ( ECX, EDI )
+    JNE      ( LLBL(K_GTP13DNRR_top) )
+
+LLBL(K_GTP13DNRR_finish):
+    POP_L    ( EDI )
+    POP_L    ( ESI )
+    RET
+#undef FRAME_OFFSET
+
+
+
+/*
+ * _mesa_sse_transform_points1_perspective
+ *
+ * For every source element (only its first component ox is read) writes:
+ *    out[0] = ox * m[0]
+ *    out[1] = 0
+ *    out[2] = m[14]
+ *    out[3] = 0
+ *
+ * Dest bookkeeping: flags |= VEC_SIZE_4, size = 4, count = source count.
+ * The source advances by the source stride, the destination by a fixed
+ * 16 bytes.  With a zero count nothing is written.
+ *
+ * ESI/EDI are saved and restored; EAX, ECX, EDX and XMM0-XMM3 are
+ * overwritten.
+ */
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points1_perspective)
+HIDDEN(_mesa_sse_transform_points1_perspective)
+GLNAME(_mesa_sse_transform_points1_perspective):
+
+/* Two 4-byte pushes follow; the explicit +8 on the OFFSET_* loads accounts for them. */
+#define FRAME_OFFSET 8
+    PUSH_L   ( ESI )
+    PUSH_L   ( EDI )
+
+    MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) 	/* ptr to source GLvector4f */
+    MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) 	/* ptr to dest GLvector4f */
+
+    MOV_L( ARG_MATRIX, EDX ) 			/* ptr to matrix */
+    MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
+
+    TEST_L( ECX, ECX)
+    JZ( LLBL(K_GTP13PR_finish) )		/* count was zero; go to finish */
+
+    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* stride */
+    OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */
+
+    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
+    MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */
+
+    SHL_L( CONST(4), ECX ) 			/* count *= 16 */
+    MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */
+
+    MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
+    ADD_L( EDI, ECX ) 				/* ECX = one past the last dest element */
+
+
+ALIGNTEXT32
+    XORPS( XMM0, XMM0 )				/* 0 | 0 | 0 | 0 */
+    MOVSS( M(0), XMM1 )				/* m0 */
+    MOVSS( M(14), XMM2 )			/* m14 */
+
+ALIGNTEXT32
+LLBL(K_GTP13PR_top):
+    MOVSS( S(0), XMM3 )				/* ox */
+    MULSS( XMM1, XMM3 )				/* ox*m0 */
+    MOVSS( XMM3, D(0) )				/* ox*m0->D(0) */
+    MOVSS( XMM2, D(2) )				/* m14->D(2) */
+
+    MOVSS( XMM0, D(1) )				/* 0->D(1) */
+    MOVSS( XMM0, D(3) )				/* 0->D(3) */
+
+    /* The _skip label is not jumped to anywhere in this routine. */
+LLBL(K_GTP13PR_skip):
+    ADD_L( CONST(16), EDI )
+    ADD_L( EAX, ESI )
+    CMP_L( ECX, EDI )
+    JNE( LLBL(K_GTP13PR_top) )
+
+LLBL(K_GTP13PR_finish):
+    POP_L    ( EDI )
+    POP_L    ( ESI )
+    RET
+#undef FRAME_OFFSET
+
+
+/*
+ * _mesa_sse_transform_points1_2d
+ *
+ * For every source element (only its first component ox is read) writes:
+ *    out[0] = ox * m[0] + m[12]
+ *    out[1] = ox * m[1] + m[13]
+ *
+ * Dest bookkeeping: flags |= VEC_SIZE_2, size = 2, count = source count.
+ * The source advances by the source stride, the destination by a fixed
+ * 16 bytes.  With a zero count nothing is written.
+ *
+ * Only the low two lanes of XMM0/XMM1 are loaded (MOVLPS) and only the low
+ * two lanes of the result are stored; the upper lanes carry whatever the
+ * registers held on entry and are ignored.
+ *
+ * ESI/EDI are saved and restored; EAX, ECX, EDX and XMM0-XMM2 are
+ * overwritten.
+ */
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points1_2d)
+HIDDEN(_mesa_sse_transform_points1_2d)
+GLNAME(_mesa_sse_transform_points1_2d):
+
+/* Two 4-byte pushes follow; the explicit +8 on the OFFSET_* loads accounts for them. */
+#define FRAME_OFFSET 8
+    PUSH_L( ESI )
+    PUSH_L( EDI )
+
+    MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) 	/* ptr to source GLvector4f */
+    MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) 	/* ptr to dest GLvector4f */
+
+    MOV_L( ARG_MATRIX, EDX ) 			/* ptr to matrix */
+    MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
+
+    TEST_L( ECX, ECX)
+    JZ( LLBL(K_GTP13P2DR_finish) ) 		/* count was zero; go to finish */
+
+    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* stride */
+    OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */
+
+    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
+    MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */
+
+    SHL_L( CONST(4), ECX ) 			/* count *= 16 */
+    MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */
+
+    MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
+    ADD_L( EDI, ECX ) 				/* ECX = one past the last dest element */
+
+ALIGNTEXT32
+    MOVLPS( M(0), XMM0 )			/* m1  | m0  */
+    MOVLPS( M(12), XMM1 )			/* m13 | m12 */
+
+ALIGNTEXT32
+LLBL(K_GTP13P2DR_top):
+    MOVSS( S(0), XMM2 )				/* ox */
+    SHUFPS( CONST(0x0), XMM2, XMM2 )		/* ox | ox | ox | ox */
+    MULPS( XMM0, XMM2 )				/* - | - | ox*m1 | ox*m0 */
+    ADDPS( XMM1, XMM2 )				/* - | - | ox*m1+m13 | ox*m0+m12 */
+    MOVLPS( XMM2, D(0) )			/* ->D(1) | ->D(0) */
+
+    /* The _skip label is not jumped to anywhere in this routine. */
+LLBL(K_GTP13P2DR_skip):
+    ADD_L    ( CONST(16), EDI )
+    ADD_L    ( EAX, ESI )
+    CMP_L    ( ECX, EDI )
+    JNE      ( LLBL(K_GTP13P2DR_top) )
+
+LLBL(K_GTP13P2DR_finish):
+    POP_L    ( EDI )
+    POP_L    ( ESI )
+    RET
+#undef FRAME_OFFSET
+
+
+/*
+ * _mesa_sse_transform_points1_2d_no_rot
+ *
+ * For every source element (only its first component ox is read) writes:
+ *    out[0] = ox * m[0] + m[12]
+ *    out[1] = m[13]
+ *
+ * Dest bookkeeping: flags |= VEC_SIZE_2, size = 2, count = source count.
+ * The source advances by the source stride, the destination by a fixed
+ * 16 bytes.  With a zero count nothing is written.
+ *
+ * ESI/EDI are saved and restored; EAX, ECX, EDX and XMM0-XMM3 are
+ * overwritten.
+ */
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points1_2d_no_rot)
+HIDDEN(_mesa_sse_transform_points1_2d_no_rot)
+GLNAME(_mesa_sse_transform_points1_2d_no_rot):
+
+/* Two 4-byte pushes follow; the explicit +8 on the OFFSET_* loads accounts for them. */
+#define FRAME_OFFSET 8
+	PUSH_L( ESI )
+	PUSH_L( EDI )
+
+	MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) 	/* ptr to source GLvector4f */
+	MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) 	/* ptr to dest GLvector4f */
+
+	MOV_L( ARG_MATRIX, EDX ) 		/* ptr to matrix */
+	MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
+
+	TEST_L( ECX, ECX)
+	JZ( LLBL(K_GTP13P2DNRR_finish) ) 	/* count was zero; go to finish */
+
+	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* stride */
+	OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */
+
+	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
+	MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */
+
+	SHL_L( CONST(4), ECX ) 			/* count *= 16 */
+	MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */
+
+	MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
+	ADD_L( EDI, ECX ) 			/* ECX = one past the last dest element */
+
+ALIGNTEXT32
+	MOVSS( M(0), XMM0 )			/* m0 */
+	MOVSS( M(12), XMM1 )			/* m12 */
+	MOVSS( M(13), XMM2 )			/* m13 */
+
+ALIGNTEXT32
+LLBL(K_GTP13P2DNRR_top):
+	MOVSS( S(0), XMM3 )			/* ox */
+	MULSS( XMM0, XMM3 )			/* ox*m0 */
+	ADDSS( XMM1, XMM3 )			/* ox*m0+m12 */
+	MOVSS( XMM3, D(0) )			/* ->D(0) */
+	MOVSS( XMM2, D(1) )			/* m13->D(1) */
+
+	/* The _skip label is not jumped to anywhere in this routine. */
+LLBL(K_GTP13P2DNRR_skip):
+	ADD_L( CONST(16), EDI )
+	ADD_L( EAX, ESI )
+	CMP_L( ECX, EDI )
+	JNE( LLBL(K_GTP13P2DNRR_top) )
+
+LLBL(K_GTP13P2DNRR_finish):
+	POP_L( EDI )
+	POP_L( ESI )
+	RET
+#undef FRAME_OFFSET
+
+
+
+/*
+ * _mesa_sse_transform_points1_3d
+ *
+ * For every source element (only its first component ox is read) writes:
+ *    out[k] = ox * m[k] + m[12 + k]      for k = 0..2
+ * The fourth lane is computed but never stored.
+ *
+ * Dest bookkeeping: flags |= VEC_SIZE_3, size = 3, count = source count.
+ * The source advances by the source stride, the destination by a fixed
+ * 16 bytes.  With a zero count nothing is written.
+ *
+ * The matrix rows are loaded with MOVAPS, so M(0) and M(12) must be
+ * 16-byte aligned.
+ *
+ * ESI/EDI are saved and restored; EAX, ECX, EDX and XMM0-XMM2 are
+ * overwritten.
+ */
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points1_3d)
+HIDDEN(_mesa_sse_transform_points1_3d)
+GLNAME(_mesa_sse_transform_points1_3d):
+
+/* Two 4-byte pushes follow; the explicit +8 on the OFFSET_* loads accounts for them. */
+#define FRAME_OFFSET 8
+	PUSH_L( ESI )
+	PUSH_L( EDI )
+
+	MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) 	/* ptr to source GLvector4f */
+	MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) 	/* ptr to dest GLvector4f */
+
+	MOV_L( ARG_MATRIX, EDX ) 		/* ptr to matrix */
+	MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
+
+	TEST_L( ECX, ECX)
+	JZ( LLBL(K_GTP13P3DR_finish) ) 	/* count was zero; go to finish */
+
+	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* stride */
+	OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */
+
+	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
+	MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */
+
+	SHL_L( CONST(4), ECX ) 			/* count *= 16 */
+	MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */
+
+	MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
+	ADD_L( EDI, ECX ) 			/* ECX = one past the last dest element */
+
+
+ALIGNTEXT32
+	MOVAPS( M(0), XMM0 )			/* m3  | m2  | m1  |  m0 */
+	MOVAPS( M(12), XMM1 )			/* m15 | m14 | m13 | m12 */
+
+ALIGNTEXT32
+LLBL(K_GTP13P3DR_top):
+	MOVSS( S(0), XMM2 )			/* ox */
+	SHUFPS( CONST(0x0), XMM2, XMM2 )	/* ox | ox | ox | ox */
+	MULPS( XMM0, XMM2 )			/* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
+	ADDPS( XMM1, XMM2 )			/* +m15  | +m14  | +m13  | +m12  */
+	MOVLPS( XMM2, D(0) )			/*   -   |   -   | ->D(1)| ->D(0)*/
+	/* Bring lane 2 down to lane 0 so it can be stored with MOVSS. */
+	UNPCKHPS( XMM2, XMM2 )			/* ox*m3+m15 | ox*m3+m15 | ox*m2+m14 | ox*m2+m14 */
+	MOVSS( XMM2, D(2) )			/* ->D(2) */
+
+	/* The _skip label is not jumped to anywhere in this routine. */
+LLBL(K_GTP13P3DR_skip):
+	ADD_L( CONST(16), EDI )
+	ADD_L( EAX, ESI )
+	CMP_L( ECX, EDI )
+	JNE( LLBL(K_GTP13P3DR_top) )
+
+LLBL(K_GTP13P3DR_finish):
+	POP_L( EDI )
+	POP_L( ESI )
+	RET
+#undef FRAME_OFFSET
+#endif
+	
+#if defined (__ELF__) && defined (__linux__)
+	.section .note.GNU-stack,"",%progbits
+#endif
--- a/src/arch/x86/sse_xform2.S
+++ b/src/arch/x86/sse_xform2.S
@ -0,0 +1,466 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version:  3.5
+ *
+ * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/** TODO:
+  * - insert PREFETCH instructions to avoid cache-misses !
+  * - some more optimizations are possible...
+  * - for 40-50% more performance in the SSE-functions, the
+  *   data (trans-matrix, src_vert, dst_vert) needs to be 16byte aligned !
+  */
+
+#ifdef USE_SSE_ASM
+#include "assyntax.h"
+#include "matypes.h"
+#include "xform_args.h"
+
+   SEG_TEXT
+
+/*
+ * Operand shorthands used by every routine in this file: the i-th 32-bit
+ * element relative to ESI (current source element), EDI (current dest
+ * element) and EDX (matrix).
+ */
+#define S(i) 	REGOFF(i * 4, ESI)
+#define D(i) 	REGOFF(i * 4, EDI)
+#define M(i) 	REGOFF(i * 4, EDX)
+
+
+/*
+ * _mesa_sse_transform_points2_general
+ *
+ * For every source element (components ox, oy are read) writes four
+ * floats:
+ *    out[k] = ox * m[k] + oy * m[4 + k] + m[12 + k]      for k = 0..3
+ *
+ * Dest bookkeeping: flags |= VEC_SIZE_4, size = 4, count = source count.
+ * The source advances by the source stride, the destination by a fixed
+ * 16 bytes.  With a zero count nothing is written.
+ *
+ * The matrix rows are loaded with MOVAPS, so M(0), M(4) and M(12) must be
+ * 16-byte aligned.
+ * NOTE(review): the result is also stored with MOVAPS, which requires a
+ * 16-byte aligned destination, whereas
+ * _mesa_sse_transform_points1_general stores with MOVUPS.  Confirm that
+ * dest alignment is guaranteed by the callers.
+ *
+ * ESI/EDI are saved and restored; EAX, ECX, EDX and XMM0-XMM4 are
+ * overwritten.
+ */
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points2_general)
+HIDDEN (_mesa_sse_transform_points2_general)
+GLNAME( _mesa_sse_transform_points2_general ):
+
+/* Two 4-byte pushes follow; the explicit +8 on the OFFSET_* loads accounts for them. */
+#define FRAME_OFFSET 8
+    PUSH_L    ( ESI )
+    PUSH_L    ( EDI )
+
+    MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) 	/* ptr to source GLvector4f */
+    MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) 	/* ptr to dest GLvector4f */
+
+    MOV_L( ARG_MATRIX, EDX ) 			/* ptr to matrix */
+    MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
+
+    TEST_L( ECX, ECX )
+    JZ( LLBL(K_GTP2GR_finish) )			/* count was zero; go to finish */
+
+    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* stride */
+    OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */
+
+    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
+    MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */
+
+    SHL_L( CONST(4), ECX ) 			/* count *= 16 */
+    MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */
+
+    MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
+    ADD_L( EDI, ECX ) 				/* ECX = one past the last dest element */
+
+ALIGNTEXT32
+    MOVAPS( M(0), XMM0 )			/* m3  | m2  | m1  | m0 */
+    MOVAPS( M(4), XMM1 )			/* m7  | m6  | m5  | m4 */
+    MOVAPS( M(12), XMM2 )			/* m15 | m14 | m13 | m12 */
+
+ALIGNTEXT32
+LLBL(K_GTP2GR_top):
+    MOVSS( S(0), XMM3 )				/* ox */
+    SHUFPS( CONST(0x0), XMM3, XMM3 )		/* ox | ox | ox | ox */
+    MULPS( XMM0, XMM3 )				/* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
+    MOVSS( S(1), XMM4 )				/* oy */
+    SHUFPS( CONST(0x0), XMM4, XMM4 )		/* oy | oy | oy | oy */
+    MULPS( XMM1, XMM4 )				/* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
+
+    ADDPS( XMM4, XMM3 )				/* ox terms + oy terms */
+    ADDPS( XMM2, XMM3 )				/* + m15 | + m14 | + m13 | + m12 */
+    MOVAPS( XMM3, D(0) )			/* aligned store of all four results */
+
+    /* The _skip label is not jumped to anywhere in this routine. */
+LLBL(K_GTP2GR_skip):
+    ADD_L     ( CONST(16), EDI )
+    ADD_L     ( EAX, ESI )
+    CMP_L     ( ECX, EDI )
+    JNE       ( LLBL(K_GTP2GR_top) )
+
+LLBL(K_GTP2GR_finish):
+    POP_L     ( EDI )
+    POP_L     ( ESI )
+    RET
+#undef FRAME_OFFSET
+
+
+/*
+ * _mesa_sse_transform_points2_identity
+ *
+ * Copies the first two 32-bit components of every source element to the
+ * corresponding dest element (plain integer moves through EDX; no SSE
+ * register is used and the matrix argument is not read).
+ *
+ * Dest bookkeeping: flags |= VEC_SIZE_2, size = 2, count = source count.
+ * These fields are updated before the start pointers are compared, so they
+ * are written even when source and dest data start at the same address and
+ * the copy loop is skipped.  With a zero count nothing is written.
+ *
+ * ESI/EDI are saved and restored; EAX, ECX and EDX are overwritten.
+ */
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points2_identity)
+HIDDEN(_mesa_sse_transform_points2_identity)
+GLNAME( _mesa_sse_transform_points2_identity ):
+
+/* Two 4-byte pushes follow; the explicit +8 on the OFFSET_* loads accounts for them. */
+#define FRAME_OFFSET 8
+    PUSH_L    ( ESI )
+    PUSH_L    ( EDI )
+
+    MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+    MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) 	/* ptr to dest GLvector4f */
+
+    MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
+
+    TEST_L( ECX, ECX)
+    JZ( LLBL(K_GTP2IR_finish) )			/* count was zero; go to finish */
+
+    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* stride */
+    OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */
+
+    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
+    MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */
+
+    SHL_L( CONST(4), ECX ) 			/* count *= 16 */
+    MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */
+
+    MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
+    ADD_L( EDI, ECX ) 				/* ECX = one past the last dest element */
+
+    CMP_L( ESI, EDI )				/* same start address? */
+    JE( LLBL(K_GTP2IR_finish) )			/* yes -> no copy needed */
+
+
+ALIGNTEXT32
+LLBL(K_GTP2IR_top):
+    MOV_L     ( S(0), EDX )			/* copy component 0 bit-for-bit */
+    MOV_L     ( EDX, D(0) )
+    MOV_L     ( S(1), EDX )			/* copy component 1 bit-for-bit */
+    MOV_L     ( EDX, D(1) )
+
+    /* The _skip label is not jumped to anywhere in this routine. */
+LLBL(K_GTP2IR_skip):
+    ADD_L     ( CONST(16), EDI )
+    ADD_L     ( EAX, ESI )
+    CMP_L     ( ECX, EDI )
+    JNE       ( LLBL(K_GTP2IR_top) )
+
+LLBL(K_GTP2IR_finish):
+    POP_L     ( EDI )
+    POP_L     ( ESI )
+    RET
+#undef FRAME_OFFSET
+
+
+/*
+ * _mesa_sse_transform_points2_3d_no_rot
+ *
+ * For every source element (components ox, oy are read) writes:
+ *    out[0] = ox * m[0] + m[12]
+ *    out[1] = oy * m[5] + m[13]
+ *    out[2] = m[14]
+ *
+ * Dest bookkeeping: flags |= VEC_SIZE_3, size = 3, count = source count.
+ * The source advances by the source stride, the destination by a fixed
+ * 16 bytes.  With a zero count nothing is written.
+ *
+ * ESI/EDI are saved and restored; EAX, ECX, EDX and XMM0-XMM3 are
+ * overwritten.
+ */
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points2_3d_no_rot)
+HIDDEN(_mesa_sse_transform_points2_3d_no_rot)
+GLNAME(_mesa_sse_transform_points2_3d_no_rot):
+
+/* Two 4-byte pushes follow; the explicit +8 on the OFFSET_* loads accounts for them. */
+#define FRAME_OFFSET 8
+    PUSH_L( ESI )
+    PUSH_L( EDI )
+
+    MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) 	/* ptr to source GLvector4f */
+    MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) 	/* ptr to dest GLvector4f */
+
+    MOV_L( ARG_MATRIX, EDX ) 			/* ptr to matrix */
+    MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
+
+    TEST_L( ECX, ECX)
+    JZ( LLBL(K_GTP23DNRR_finish) ) 		/* count was zero; go to finish */
+
+    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* stride */
+    OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */
+
+    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
+    MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */
+
+    SHL_L( CONST(4), ECX ) 			/* count *= 16 */
+    MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */
+
+    MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
+    ADD_L( EDI, ECX ) 				/* ECX = one past the last dest element */
+
+    /* MOVLPS in the loop only replaces the low two lanes of XMM0, so the
+     * upper lanes are zeroed once here. */
+    XORPS( XMM0, XMM0 )                         /* clean the working register */
+
+ALIGNTEXT32
+    MOVSS    ( M(0), XMM1 )			/* - | - |  -  | m0  */
+    MOVSS    ( M(5), XMM2 )			/* - | - |  -  | m5  */
+    UNPCKLPS ( XMM2, XMM1 )			/* - | - | m5  | m0  */
+    MOVLPS   ( M(12), XMM2 )			/* - | - | m13 | m12 */
+    MOVSS    ( M(14), XMM3 )			/* - | - |  -  | m14 */
+
+ALIGNTEXT32
+LLBL(K_GTP23DNRR_top):
+    MOVLPS   ( S(0), XMM0 )			/* - | - |  oy   | ox */
+    MULPS    ( XMM1, XMM0 )			/* - | - | oy*m5 | ox*m0 */
+    ADDPS    ( XMM2, XMM0 )			/* - | - | +m13  | +m12 */
+    MOVLPS   ( XMM0, D(0) )			/* -> D(1) | -> D(0) */
+
+    MOVSS    ( XMM3, D(2) )			/* -> D(2) */
+
+    /* The _skip label is not jumped to anywhere in this routine. */
+LLBL(K_GTP23DNRR_skip):
+    ADD_L    ( CONST(16), EDI )
+    ADD_L    ( EAX, ESI )
+    CMP_L    ( ECX, EDI )
+    JNE      ( LLBL(K_GTP23DNRR_top) )
+
+LLBL(K_GTP23DNRR_finish):
+    POP_L    ( EDI )
+    POP_L    ( ESI )
+    RET
+#undef FRAME_OFFSET
+
+
+/*
+ * _mesa_sse_transform_points2_perspective
+ *
+ * For every source element (components ox, oy are read) writes:
+ *    out[0] = ox * m[0]
+ *    out[1] = oy * m[5]
+ *    out[2] = m[14]
+ *    out[3] = 0
+ *
+ * Dest bookkeeping: flags |= VEC_SIZE_4, size = 4, count = source count.
+ * The source advances by the source stride, the destination by a fixed
+ * 16 bytes.  With a zero count nothing is written.
+ *
+ * ESI/EDI are saved and restored; EAX, ECX, EDX and XMM0-XMM4 are
+ * overwritten.
+ */
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points2_perspective)
+HIDDEN(_mesa_sse_transform_points2_perspective)
+GLNAME(_mesa_sse_transform_points2_perspective):
+
+/* Two 4-byte pushes follow; the explicit +8 on the OFFSET_* loads accounts for them. */
+#define FRAME_OFFSET 8
+    PUSH_L   ( ESI )
+    PUSH_L   ( EDI )
+
+    MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI )	/* ptr to source GLvector4f */
+    MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) 	/* ptr to dest GLvector4f */
+
+    MOV_L( ARG_MATRIX, EDX ) 			/* ptr to matrix */
+    MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
+
+    TEST_L( ECX, ECX)
+    JZ( LLBL(K_GTP23PR_finish) )		/* count was zero; go to finish */
+
+    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* stride */
+    OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */
+
+    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
+    MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */
+
+    SHL_L( CONST(4), ECX ) 			/* count *= 16 */
+    MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */
+
+    MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
+    ADD_L( EDI, ECX ) 				/* ECX = one past the last dest element */
+
+ALIGNTEXT32
+    MOVSS    ( M(0), XMM1 )			/* -  | -  |  -  | m0  */
+    MOVSS    ( M(5), XMM2 )			/* -  | -  |  -  | m5  */
+    UNPCKLPS ( XMM2, XMM1 )			/* -  | -  | m5  | m0  */
+    MOVSS    ( M(14), XMM3 )			/* m14 */
+    XORPS    ( XMM0, XMM0 )			/* 0 | 0 | 0 | 0 */
+
+ALIGNTEXT32
+LLBL(K_GTP23PR_top):
+    MOVLPS( S(0), XMM4 )			/* oy | ox */
+    MULPS( XMM1, XMM4 )				/* oy*m5 | ox*m0 */
+    MOVLPS( XMM4, D(0) )			/* ->D(1) | ->D(0) */
+    MOVSS( XMM3, D(2) )				/* m14 ->D(2) */
+    MOVSS( XMM0, D(3) )				/* 0 ->D(3) */
+
+    /* The _skip label is not jumped to anywhere in this routine. */
+LLBL(K_GTP23PR_skip):
+    ADD_L( CONST(16), EDI )
+    ADD_L( EAX, ESI )
+    CMP_L( ECX, EDI )
+    JNE( LLBL(K_GTP23PR_top) )
+
+LLBL(K_GTP23PR_finish):
+    POP_L    ( EDI )
+    POP_L    ( ESI )
+    RET
+#undef FRAME_OFFSET
+
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points2_2d)
+HIDDEN(_mesa_sse_transform_points2_2d)
+GLNAME(_mesa_sse_transform_points2_2d):
+/*
+ * Reads the first two floats (ox, oy) of every source vertex and writes two
+ * floats per destination vertex:
+ *     D(0) = ox*m0 + oy*m4 + m12
+ *     D(1) = ox*m1 + oy*m5 + m13
+ * D(2) and D(3) are not written.  The destination vector receives the
+ * source count, size 2 and the VEC_SIZE_2 flag bits.  When the source count
+ * is zero the routine returns without touching the destination.
+ * NOTE(review): the +8 on the stack offsets presumably skips the two
+ * registers pushed on entry (it matches FRAME_OFFSET) - confirm against
+ * xform_args.h.
+ */
+#define FRAME_OFFSET 8
+    PUSH_L( ESI )				/* saved here, restored at finish */
+    PUSH_L( EDI )				/* saved here, restored at finish */
+
+    MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) 	/* ptr to source GLvector4f */
+    MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) 	/* ptr to dest GLvector4f */
+
+    MOV_L( ARG_MATRIX, EDX ) 			/* ptr to matrix */
+    MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
+
+    TEST_L( ECX, ECX)				/* count == 0 ? */
+    JZ( LLBL(K_GTP23P2DR_finish) ) 		/* count was zero; go to finish */
+
+    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* stride */
+    OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */
+
+    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
+    MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */
+
+    SHL_L( CONST(4), ECX ) 			/* count *= 16 */
+    MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */
+
+    MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
+    ADD_L( EDI, ECX ) 				/* count += dest ptr; ECX = end-of-dest ptr */
+
+ALIGNTEXT32
+    MOVLPS( M(0), XMM0 )			/* m1  | m0 */
+    MOVLPS( M(4), XMM1 )			/* m5  | m4 */
+    MOVLPS( M(12), XMM2 )			/* m13 | m12 */
+
+ALIGNTEXT32
+LLBL(K_GTP23P2DR_top):
+    MOVSS( S(0), XMM3 )				/* ox */
+    SHUFPS( CONST(0x0), XMM3, XMM3 )		/* ox | ox */
+    MULPS( XMM0, XMM3 )				/* ox*m1 | ox*m0 */
+
+    MOVSS( S(1), XMM4 )				/* oy */
+    SHUFPS( CONST(0x0), XMM4, XMM4 )		/* oy | oy */
+    MULPS( XMM1, XMM4 )				/* oy*m5 | oy*m4 */
+
+    ADDPS( XMM4, XMM3 )				/* ox*m1+oy*m5 | ox*m0+oy*m4 */
+    ADDPS( XMM2, XMM3 )				/* +m13 | +m12 */
+    MOVLPS( XMM3, D(0) )			/* ->D(1) | ->D(0) */
+
+LLBL(K_GTP23P2DR_skip):
+    ADD_L    ( CONST(16), EDI )			/* next dest vertex (16 bytes each) */
+    ADD_L    ( EAX, ESI )			/* next source vertex (+= stride) */
+    CMP_L    ( ECX, EDI )			/* reached end-of-dest ptr ? */
+    JNE      ( LLBL(K_GTP23P2DR_top) )		/* no -> transform next vertex */
+
+LLBL(K_GTP23P2DR_finish):
+    POP_L    ( EDI )
+    POP_L    ( ESI )
+    RET
+#undef FRAME_OFFSET
+
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points2_2d_no_rot)
+HIDDEN(_mesa_sse_transform_points2_2d_no_rot)
+GLNAME(_mesa_sse_transform_points2_2d_no_rot):
+/*
+ * Reads the first two floats (ox, oy) of every source vertex and writes two
+ * floats per destination vertex (scale plus translate only):
+ *     D(0) = ox*m0 + m12
+ *     D(1) = oy*m5 + m13
+ * D(2) and D(3) are not written.  The destination vector receives the
+ * source count, size 2 and the VEC_SIZE_2 flag bits.  When the source count
+ * is zero the routine returns without touching the destination.
+ * NOTE(review): the +8 on the stack offsets presumably skips the two
+ * registers pushed on entry (it matches FRAME_OFFSET) - confirm against
+ * xform_args.h.
+ */
+#define FRAME_OFFSET 8
+	PUSH_L( ESI )				/* saved here, restored at finish */
+	PUSH_L( EDI )				/* saved here, restored at finish */
+
+	MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) 	/* ptr to source GLvector4f */
+	MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) 	/* ptr to dest GLvector4f */
+
+	MOV_L( ARG_MATRIX, EDX ) 		/* ptr to matrix */
+	MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
+
+	TEST_L( ECX, ECX)			/* count == 0 ? */
+	JZ( LLBL(K_GTP23P2DNRR_finish) ) 	/* count was zero; go to finish */
+
+	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* stride */
+	OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */
+
+	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
+	MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */
+
+	SHL_L( CONST(4), ECX ) 			/* count *= 16 */
+	MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */
+
+	MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
+	ADD_L( EDI, ECX ) 			/* count += dest ptr; ECX = end-of-dest ptr */
+
+ALIGNTEXT32
+	MOVSS    ( M(0), XMM1 )			/* m0 */
+	MOVSS    ( M(5), XMM2 )			/* m5 */
+	UNPCKLPS ( XMM2, XMM1 )			/* m5 | m0 */
+	MOVLPS   ( M(12), XMM2 )		/* m13 | m12 */
+
+ALIGNTEXT32
+LLBL(K_GTP23P2DNRR_top):
+	MOVLPS( S(0), XMM0 )			/* oy | ox */
+	MULPS( XMM1, XMM0 )			/* oy*m5 | ox*m0 */
+	ADDPS( XMM2, XMM0 )			/* +m13 | +m12 */
+	MOVLPS( XMM0, D(0) )			/* ->D(1) | ->D(0) */
+
+LLBL(K_GTP23P2DNRR_skip):
+	ADD_L( CONST(16), EDI )			/* next dest vertex (16 bytes each) */
+	ADD_L( EAX, ESI )			/* next source vertex (+= stride) */
+	CMP_L( ECX, EDI )			/* reached end-of-dest ptr ? */
+	JNE( LLBL(K_GTP23P2DNRR_top) )		/* no -> transform next vertex */
+
+LLBL(K_GTP23P2DNRR_finish):
+	POP_L( EDI )
+	POP_L( ESI )
+	RET
+#undef FRAME_OFFSET
+
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points2_3d)
+HIDDEN(_mesa_sse_transform_points2_3d)
+GLNAME(_mesa_sse_transform_points2_3d):
+/*
+ * Reads the first two floats (ox, oy) of every source vertex and writes
+ * three floats per destination vertex:
+ *     D(0) = ox*m0 + oy*m4 + m12
+ *     D(1) = ox*m1 + oy*m5 + m13
+ *     D(2) = ox*m2 + oy*m6 + m14
+ * D(3) is not written.  The destination vector receives the source count,
+ * size 3 and the VEC_SIZE_3 flag bits.  When the source count is zero the
+ * routine returns without touching the destination.
+ * The matrix rows are fetched with MOVAPS, so the matrix must be 16-byte
+ * aligned.
+ * NOTE(review): the +8 on the stack offsets presumably skips the two
+ * registers pushed on entry (it matches FRAME_OFFSET) - confirm against
+ * xform_args.h.
+ */
+#define FRAME_OFFSET 8
+	PUSH_L( ESI )				/* saved here, restored at finish */
+	PUSH_L( EDI )				/* saved here, restored at finish */
+
+	MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) 	/* ptr to source GLvector4f */
+	MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) 	/* ptr to dest GLvector4f */
+
+	MOV_L( ARG_MATRIX, EDX ) 		/* ptr to matrix */
+	MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
+
+	TEST_L( ECX, ECX)			/* count == 0 ? */
+	JZ( LLBL(K_GTP23P3DR_finish) ) 	/* count was zero; go to finish */
+
+	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* stride */
+	OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */
+
+	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
+	MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */
+
+	SHL_L( CONST(4), ECX ) 			/* count *= 16 */
+	MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */
+
+	MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
+	ADD_L( EDI, ECX ) 			/* count += dest ptr; ECX = end-of-dest ptr */
+
+ALIGNTEXT32
+	MOVAPS( M(0), XMM0 )			/* m3  | m2  | m1  | m0   (top lane unused) */
+	MOVAPS( M(4), XMM1 )			/* m7  | m6  | m5  | m4   (top lane unused) */
+	MOVAPS( M(12), XMM2 )			/* m15 | m14 | m13 | m12  (top lane unused) */
+
+ALIGNTEXT32
+LLBL(K_GTP23P3DR_top):
+	MOVSS( S(0), XMM3 )			/* ox */
+	SHUFPS( CONST(0x0), XMM3, XMM3 )	/* ox | ox | ox */
+	MULPS( XMM0, XMM3 )			/* ox*m2 | ox*m1 | ox*m0 */
+
+	MOVSS( S(1), XMM4 )			/* oy */
+	SHUFPS( CONST(0x0), XMM4, XMM4 )	/* oy | oy | oy */
+	MULPS( XMM1, XMM4 )			/* oy*m6 | oy*m5 | oy*m4 */
+
+	ADDPS( XMM4, XMM3 )			/* ox*m[2..0] + oy*m[6..4] */
+	ADDPS( XMM2, XMM3 )			/* + m14 | + m13 | + m12 */
+
+	MOVLPS( XMM3, D(0) )			/* ->D(1) | ->D(0) */
+	UNPCKHPS( XMM3, XMM3 )			/* bring lane 2 down into lane 0 */
+	MOVSS( XMM3, D(2) )			/* ->D(2) */
+
+LLBL(K_GTP23P3DR_skip):
+	ADD_L( CONST(16), EDI )			/* next dest vertex (16 bytes each) */
+	ADD_L( EAX, ESI )			/* next source vertex (+= stride) */
+	CMP_L( ECX, EDI )			/* reached end-of-dest ptr ? */
+	JNE( LLBL(K_GTP23P3DR_top) )		/* no -> transform next vertex */
+
+LLBL(K_GTP23P3DR_finish):
+	POP_L( EDI )
+	POP_L( ESI )
+	RET
+#undef FRAME_OFFSET
+#endif
+	
+#if defined (__ELF__) && defined (__linux__)
+	.section .note.GNU-stack,"",%progbits
+#endif
--- a/src/arch/x86/sse_xform3.S
+++ b/src/arch/x86/sse_xform3.S
@ -0,0 +1,512 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version:  3.5
+ *
+ * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/** TODO:
+  * - insert PREFETCH instructions to avoid cache-misses !
+  * - some more optimizations are possible...
+  * - for 40-50% more performance in the SSE-functions, the
+  *   data (trans-matrix, src_vert, dst_vert) needs to be 16byte aligned !
+  */
+
+#ifdef USE_SSE_ASM
+#include "assyntax.h"
+#include "matypes.h"
+#include "xform_args.h"
+
+   SEG_TEXT
+
+#define S(i) 	REGOFF(i * 4, ESI)	/* i-th 4-byte element at ESI (source vertex ptr in the routines below) */
+#define D(i) 	REGOFF(i * 4, EDI)	/* i-th 4-byte element at EDI (dest vertex ptr in the routines below) */
+#define M(i) 	REGOFF(i * 4, EDX)	/* i-th 4-byte element at EDX (matrix ptr in the routines below) */
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points3_general)
+HIDDEN(_mesa_sse_transform_points3_general)
+GLNAME( _mesa_sse_transform_points3_general ):
+/*
+ * Reads the first three floats (ox, oy, oz) of every source vertex and
+ * writes four floats per destination vertex:
+ *     D(i) = ox*m(i) + oy*m(4+i) + oz*m(8+i) + m(12+i)      for i = 0..3
+ * The destination vector receives the source count, size 4 and the
+ * VEC_SIZE_4 flag bits.  When the source count is zero the routine returns
+ * without touching the destination.
+ * Both the matrix loads and the per-vertex store use MOVAPS, so the matrix
+ * and every destination vertex must be 16-byte aligned.
+ * NOTE(review): the +8 on the stack offsets presumably skips the two
+ * registers pushed on entry (it matches FRAME_OFFSET) - confirm against
+ * xform_args.h.
+ */
+#define FRAME_OFFSET 8
+    PUSH_L    ( ESI )				/* saved here, restored at finish */
+    PUSH_L    ( EDI )				/* saved here, restored at finish */
+
+    MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) 	/* ptr to source GLvector4f */
+    MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) 	/* ptr to dest GLvector4f */
+
+    MOV_L( ARG_MATRIX, EDX ) 			/* ptr to matrix */
+    MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
+
+    CMP_L     ( CONST(0), ECX )			/* count == 0 ? */
+    JE        ( LLBL(K_GTPGR_finish) )		/* yes -> nothing to do. */
+
+    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* stride */
+    OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */
+
+    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
+    MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */
+
+    SHL_L( CONST(4), ECX ) 			/* count *= 16 */
+    MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */
+
+    MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
+    ADD_L( EDI, ECX ) 				/* count += dest ptr; ECX = end-of-dest ptr */
+
+
+ALIGNTEXT32
+    MOVAPS    ( REGOFF(0, EDX), XMM0 )	/* m3  | m2  | m1  | m0  */
+    MOVAPS    ( REGOFF(16, EDX), XMM1 )	/* m7  | m6  | m5  | m4  */
+    MOVAPS    ( REGOFF(32, EDX), XMM2 )	/* m11 | m10 | m9  | m8  */
+    MOVAPS    ( REGOFF(48, EDX), XMM3 )	/* m15 | m14 | m13 | m12 */
+
+
+ALIGNTEXT32
+LLBL(K_GTPGR_top):
+    MOVSS     ( REGOFF(0, ESI), XMM4 )		/*    |    |    | ox */
+    SHUFPS    ( CONST(0x0), XMM4, XMM4 )	/* ox | ox | ox | ox */
+    MOVSS     ( REGOFF(4, ESI), XMM5 )		/*    |    |    | oy */
+    SHUFPS    ( CONST(0x0), XMM5, XMM5 )	/* oy | oy | oy | oy */
+    MOVSS     ( REGOFF(8, ESI), XMM6 )		/*    |    |    | oz */
+    SHUFPS    ( CONST(0x0), XMM6, XMM6 )	/* oz | oz | oz | oz */
+
+    MULPS     ( XMM0, XMM4 )		/* m3*ox  | m2*ox  | m1*ox | m0*ox */
+    MULPS     ( XMM1, XMM5 )		/* m7*oy  | m6*oy  | m5*oy | m4*oy */
+    MULPS     ( XMM2, XMM6 )		/* m11*oz | m10*oz | m9*oz | m8*oz */
+
+    ADDPS     ( XMM5, XMM4 )		/* ox terms + oy terms */
+    ADDPS     ( XMM6, XMM4 )		/* + oz terms */
+    ADDPS     ( XMM3, XMM4 )		/* + m15 | + m14 | + m13 | + m12 */
+
+    MOVAPS    ( XMM4, REGOFF(0, EDI) )	/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
+
+LLBL(K_GTPGR_skip):
+    ADD_L     ( CONST(16), EDI )		/* next dest vertex (16 bytes each) */
+    ADD_L     ( EAX, ESI )			/* next source vertex (+= stride) */
+    CMP_L     ( ECX, EDI )			/* reached end-of-dest ptr ? */
+    JNE       ( LLBL(K_GTPGR_top) )		/* no -> transform next vertex */
+
+LLBL(K_GTPGR_finish):
+    POP_L     ( EDI )
+    POP_L     ( ESI )
+    RET
+#undef FRAME_OFFSET
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points3_identity)
+HIDDEN(_mesa_sse_transform_points3_identity)
+GLNAME( _mesa_sse_transform_points3_identity ):
+/*
+ * Copies the first three floats of every source vertex to the matching
+ * destination vertex: D(0..2) = S(0..2).  D(3) is not written and no matrix
+ * pointer is loaded.
+ * The destination vector receives the source count, size 3 and the
+ * VEC_SIZE_3 flag bits.  These header updates happen before the
+ * "source data == dest data" test, so they are applied even when the copy
+ * itself is skipped.  When the source count is zero the routine returns
+ * without touching the destination.
+ * NOTE(review): the +8 on the stack offsets presumably skips the two
+ * registers pushed on entry (it matches FRAME_OFFSET) - confirm against
+ * xform_args.h.
+ */
+#define FRAME_OFFSET 8
+    PUSH_L    ( ESI )				/* saved here, restored at finish */
+    PUSH_L    ( EDI )				/* saved here, restored at finish */
+
+    MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) 	/* ptr to source GLvector4f */
+    MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) 	/* ptr to dest GLvector4f */
+
+    MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
+
+    TEST_L( ECX, ECX)				/* count == 0 ? */
+    JZ( LLBL(K_GTPIR_finish) ) 			/* count was zero; go to finish */
+
+    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* stride */
+    OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */
+
+    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
+    MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */
+
+    SHL_L( CONST(4), ECX ) 			/* count *= 16 */
+    MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */
+
+    MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
+    ADD_L( EDI, ECX ) 				/* count += dest ptr; ECX = end-of-dest ptr */
+
+    CMP_L( ESI, EDI )				/* source data == dest data ? */
+    JE( LLBL(K_GTPIR_finish) )			/* yes -> nothing to copy */
+
+
+ALIGNTEXT32
+LLBL(K_GTPIR_top):
+    MOVLPS    ( S(0), XMM0 )			/* S(1) | S(0) */
+    MOVLPS    ( XMM0, D(0) )			/* ->D(1) | ->D(0) */
+    MOVSS     ( S(2), XMM0 )			/* S(2) */
+    MOVSS     ( XMM0, D(2) )			/* ->D(2) */
+
+LLBL(K_GTPIR_skip):
+    ADD_L     ( CONST(16), EDI )		/* next dest vertex (16 bytes each) */
+    ADD_L     ( EAX, ESI )			/* next source vertex (+= stride) */
+    CMP_L     ( ECX, EDI )			/* reached end-of-dest ptr ? */
+    JNE       ( LLBL(K_GTPIR_top) )		/* no -> copy next vertex */
+
+LLBL(K_GTPIR_finish):
+    POP_L     ( EDI )
+    POP_L     ( ESI )
+    RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points3_3d_no_rot)
+HIDDEN(_mesa_sse_transform_points3_3d_no_rot)
+GLNAME(_mesa_sse_transform_points3_3d_no_rot):
+/*
+ * Reads the first three floats (ox, oy, oz) of every source vertex and
+ * writes three floats per destination vertex (scale plus translate only):
+ *     D(0) = ox*m0  + m12
+ *     D(1) = oy*m5  + m13
+ *     D(2) = oz*m10 + m14
+ * D(3) is not written.  The destination vector receives the source count,
+ * size 3 and the VEC_SIZE_3 flag bits.  When the source count is zero the
+ * routine returns without touching the destination.
+ * NOTE(review): the +8 on the stack offsets presumably skips the two
+ * registers pushed on entry (it matches FRAME_OFFSET) - confirm against
+ * xform_args.h.
+ */
+#define FRAME_OFFSET 8
+    PUSH_L( ESI )				/* saved here, restored at finish */
+    PUSH_L( EDI )				/* saved here, restored at finish */
+
+    MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) 	/* ptr to source GLvector4f */
+    MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) 	/* ptr to dest GLvector4f */
+
+
+    MOV_L( ARG_MATRIX, EDX ) 			/* ptr to matrix */
+    MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
+
+    TEST_L( ECX, ECX)				/* count == 0 ? */
+    JZ( LLBL(K_GTP3DNRR_finish) ) 		/* count was zero; go to finish */
+
+    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* stride */
+    OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */
+
+    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
+    MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */
+
+    SHL_L( CONST(4), ECX ) 			/* count *= 16 */
+    MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */
+
+    MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
+    ADD_L( EDI, ECX ) 				/* count += dest ptr; ECX = end-of-dest ptr */
+
+    XORPS( XMM0, XMM0 )                         /* clean the working register */
+
+ALIGNTEXT32
+    MOVSS    ( M(0), XMM1 )			/* - | - |  -  | m0  */
+    MOVSS    ( M(5), XMM2 )			/* - | - |  -  | m5  */
+    UNPCKLPS ( XMM2, XMM1 )			/* - | - | m5  | m0  */
+    MOVLPS   ( M(12), XMM2 )			/* - | - | m13 | m12 */
+    MOVSS    ( M(10), XMM3 )			/* - | - |  -  | m10 */
+    MOVSS    ( M(14), XMM4 )			/* - | - |  -  | m14 */
+
+ALIGNTEXT32
+LLBL(K_GTP3DNRR_top):
+
+    MOVLPS   ( S(0), XMM0 )			/* - | - |  s1   | s0 */
+    MULPS    ( XMM1, XMM0 )			/* - | - | s1*m5 | s0*m0 */
+    ADDPS    ( XMM2, XMM0 )			/* - | - | +m13  | +m12 */
+    MOVLPS   ( XMM0, D(0) )			/* -> D(1) | -> D(0) */
+
+    MOVSS    ( S(2), XMM0 )			/* sz */
+    MULSS    ( XMM3, XMM0 )			/* sz*m10 */
+    ADDSS    ( XMM4, XMM0 )			/* +m14 */
+    MOVSS    ( XMM0, D(2) )			/* -> D(2) */
+
+LLBL(K_GTP3DNRR_skip):
+    ADD_L    ( CONST(16), EDI )			/* next dest vertex (16 bytes each) */
+    ADD_L    ( EAX, ESI )			/* next source vertex (+= stride) */
+    CMP_L    ( ECX, EDI )			/* reached end-of-dest ptr ? */
+    JNE      ( LLBL(K_GTP3DNRR_top) )		/* no -> transform next vertex */
+
+LLBL(K_GTP3DNRR_finish):
+    POP_L    ( EDI )
+    POP_L    ( ESI )
+    RET
+#undef FRAME_OFFSET
+
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points3_perspective)
+HIDDEN(_mesa_sse_transform_points3_perspective)
+GLNAME(_mesa_sse_transform_points3_perspective):
+/*
+ * Reads the first three floats (ox, oy, oz) of every source vertex and
+ * writes four floats per destination vertex:
+ *     D(0) = ox*m0  + oz*m8
+ *     D(1) = oy*m5  + oz*m9
+ *     D(2) = oz*m10 + m14
+ *     D(3) = -oz
+ * Only m0, m5, m8, m9, m10 and m14 are read.  The destination vector
+ * receives the source count, size 4 and the VEC_SIZE_4 flag bits.  When the
+ * source count is zero the routine returns without touching the destination.
+ * NOTE(review): the +8 on the stack offsets presumably skips the two
+ * registers pushed on entry (it matches FRAME_OFFSET) - confirm against
+ * xform_args.h.
+ */
+#define FRAME_OFFSET 8
+    PUSH_L   ( ESI )				/* saved here, restored at finish */
+    PUSH_L   ( EDI )				/* saved here, restored at finish */
+
+    MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) 	/* ptr to source GLvector4f */
+    MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) 	/* ptr to dest GLvector4f */
+
+    MOV_L( ARG_MATRIX, EDX ) 			/* ptr to matrix */
+    MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
+
+    TEST_L( ECX, ECX)				/* count == 0 ? */
+    JZ( LLBL(K_GTP3PR_finish) )			/* count was zero; go to finish */
+
+    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* stride */
+    OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */
+
+    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
+    MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */
+
+    SHL_L( CONST(4), ECX ) 			/* count *= 16 */
+    MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */
+
+    MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
+    ADD_L( EDI, ECX ) 				/* count += dest ptr; ECX = end-of-dest ptr */
+
+ALIGNTEXT32
+    MOVSS    ( M(0), XMM1 )			/* -  | -  |  -  | m0  */
+    MOVSS    ( M(5), XMM2 )			/* -  | -  |  -  | m5  */
+    UNPCKLPS ( XMM2, XMM1 )			/* -  | -  | m5  | m0  */
+    MOVLPS   ( M(8), XMM2 )			/* -  | -  | m9  | m8  */
+    MOVSS    ( M(10), XMM3 )			/* m10 */
+    MOVSS    ( M(14), XMM4 )			/* m14 */
+    XORPS    ( XMM6, XMM6 )			/* 0 */
+
+ALIGNTEXT32
+LLBL(K_GTP3PR_top):
+    MOVLPS   ( S(0), XMM0 )			/* oy | ox */
+    MULPS    ( XMM1, XMM0 )			/* oy*m5 | ox*m0 */
+    MOVSS    ( S(2), XMM5 )			/* oz */
+    SHUFPS   ( CONST(0x0), XMM5, XMM5 )		/* oz | oz */
+    MULPS    ( XMM2, XMM5 )			/* oz*m9 | oz*m8 */
+    ADDPS    ( XMM5, XMM0 )			/* oy*m5+oz*m9 | ox*m0+oz*m8 */
+    MOVLPS   ( XMM0, D(0) )			/* ->D(1) | ->D(0) */
+
+    MOVSS    ( S(2), XMM0 )			/* oz */
+    MULSS    ( XMM3, XMM0 )			/* oz*m10 */
+    ADDSS    ( XMM4, XMM0 )			/* +m14 */
+    MOVSS    ( XMM0, D(2) )			/* ->D(2) */
+
+    MOVSS    ( S(2), XMM0 )			/* oz */
+    MOVSS    ( XMM6, XMM5 )			/* 0 (low lane only) */
+    SUBPS    ( XMM0, XMM5 )			/* -oz in the low lane; packed op, upper lanes unused */
+    MOVSS    ( XMM5, D(3) )			/* ->D(3) */
+
+LLBL(K_GTP3PR_skip):
+    ADD_L( CONST(16), EDI )			/* next dest vertex (16 bytes each) */
+    ADD_L( EAX, ESI )				/* next source vertex (+= stride) */
+    CMP_L( ECX, EDI )				/* reached end-of-dest ptr ? */
+    JNE( LLBL(K_GTP3PR_top) )			/* no -> transform next vertex */
+
+LLBL(K_GTP3PR_finish):
+    POP_L    ( EDI )
+    POP_L    ( ESI )
+    RET
+#undef FRAME_OFFSET
+
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points3_2d)
+HIDDEN(_mesa_sse_transform_points3_2d)
+GLNAME(_mesa_sse_transform_points3_2d):
+/*
+ * Reads the first three floats (ox, oy, oz) of every source vertex and
+ * writes three floats per destination vertex:
+ *     D(0) = ox*m0 + oy*m4 + m12
+ *     D(1) = ox*m1 + oy*m5 + m13
+ *     D(2) = oz                     (copied unchanged)
+ * D(3) is not written.  The destination vector receives the source count,
+ * size 3 and the VEC_SIZE_3 flag bits.  When the source count is zero the
+ * routine returns without touching the destination.
+ * NOTE(review): the +8 on the stack offsets presumably skips the two
+ * registers pushed on entry (it matches FRAME_OFFSET) - confirm against
+ * xform_args.h.
+ */
+#define FRAME_OFFSET 8
+    PUSH_L( ESI )				/* saved here, restored at finish */
+    PUSH_L( EDI )				/* saved here, restored at finish */
+
+    MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) 	/* ptr to source GLvector4f */
+    MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) 	/* ptr to dest GLvector4f */
+
+    MOV_L( ARG_MATRIX, EDX ) 			/* ptr to matrix */
+    MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
+
+    TEST_L( ECX, ECX)				/* count == 0 ? */
+    JZ( LLBL(K_GTP3P2DR_finish) ) 		/* count was zero; go to finish */
+
+    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* stride */
+    OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */
+
+    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
+    MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */
+
+    SHL_L( CONST(4), ECX ) 			/* count *= 16 */
+    MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */
+
+    MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
+    ADD_L( EDI, ECX ) 				/* count += dest ptr; ECX = end-of-dest ptr */
+
+ALIGNTEXT32
+    MOVLPS( M(0), XMM0 )			/* m1  | m0 */
+    MOVLPS( M(4), XMM1 )			/* m5  | m4 */
+    MOVLPS( M(12), XMM2 )			/* m13 | m12 */
+
+ALIGNTEXT32
+LLBL(K_GTP3P2DR_top):
+    MOVSS    ( S(0), XMM3 )			/* ox */
+    SHUFPS   ( CONST(0x0), XMM3, XMM3 )		/* ox | ox */
+    MULPS    ( XMM0, XMM3 )			/* ox*m1 | ox*m0 */
+    MOVSS    ( S(1), XMM4 )			/* oy */
+    SHUFPS   ( CONST(0x0), XMM4, XMM4 )		/* oy | oy */
+    MULPS    ( XMM1, XMM4 )			/* oy*m5 | oy*m4 */
+
+    ADDPS    ( XMM4, XMM3 )			/* ox*m1+oy*m5 | ox*m0+oy*m4 */
+    ADDPS    ( XMM2, XMM3 )			/* +m13 | +m12 */
+    MOVLPS   ( XMM3, D(0) )			/* ->D(1) | ->D(0) */
+
+    MOVSS    ( S(2), XMM3 )			/* oz */
+    MOVSS    ( XMM3, D(2) )			/* ->D(2), copied unchanged */
+
+LLBL(K_GTP3P2DR_skip):
+    ADD_L    ( CONST(16), EDI )			/* next dest vertex (16 bytes each) */
+    ADD_L    ( EAX, ESI )			/* next source vertex (+= stride) */
+    CMP_L    ( ECX, EDI )			/* reached end-of-dest ptr ? */
+    JNE      ( LLBL(K_GTP3P2DR_top) )		/* no -> transform next vertex */
+
+LLBL(K_GTP3P2DR_finish):
+    POP_L    ( EDI )
+    POP_L    ( ESI )
+    RET
+#undef FRAME_OFFSET
+
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points3_2d_no_rot)
+HIDDEN(_mesa_sse_transform_points3_2d_no_rot)
+GLNAME(_mesa_sse_transform_points3_2d_no_rot):
+/*
+ * Reads the first three floats (ox, oy, oz) of every source vertex and
+ * writes three floats per destination vertex (scale plus translate in x/y):
+ *     D(0) = ox*m0 + m12
+ *     D(1) = oy*m5 + m13
+ *     D(2) = oz                     (copied unchanged)
+ * D(3) is not written.  The destination vector receives the source count,
+ * size 3 and the VEC_SIZE_3 flag bits.  When the source count is zero the
+ * routine returns without touching the destination.
+ * NOTE(review): the +8 on the stack offsets presumably skips the two
+ * registers pushed on entry (it matches FRAME_OFFSET) - confirm against
+ * xform_args.h.
+ */
+#define FRAME_OFFSET 8
+	PUSH_L( ESI )				/* saved here, restored at finish */
+	PUSH_L( EDI )				/* saved here, restored at finish */
+
+	MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) 	/* ptr to source GLvector4f */
+	MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) 	/* ptr to dest GLvector4f */
+
+	MOV_L( ARG_MATRIX, EDX ) 		/* ptr to matrix */
+	MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
+
+	TEST_L( ECX, ECX)			/* count == 0 ? */
+	JZ( LLBL(K_GTP3P2DNRR_finish) ) 	/* count was zero; go to finish */
+
+	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* stride */
+	OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */
+
+	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
+	MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */
+
+	SHL_L( CONST(4), ECX ) 			/* count *= 16 */
+	MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */
+
+	MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
+	ADD_L( EDI, ECX ) 			/* count += dest ptr; ECX = end-of-dest ptr */
+
+ALIGNTEXT32
+	MOVSS    ( M(0), XMM1 )			/* m0 */
+	MOVSS    ( M(5), XMM2 )			/* m5 */
+	UNPCKLPS ( XMM2, XMM1 )			/* m5 | m0 */
+	MOVLPS   ( M(12), XMM2 )		/* m13 | m12 */
+
+ALIGNTEXT32
+LLBL(K_GTP3P2DNRR_top):
+	MOVLPS( S(0), XMM0 )			/* oy | ox */
+	MULPS( XMM1, XMM0 )			/* oy*m5 | ox*m0 */
+	ADDPS( XMM2, XMM0 )			/* +m13 | +m12 */
+	MOVLPS( XMM0, D(0) )			/* ->D(1) | ->D(0) */
+
+	MOVSS( S(2), XMM0 )			/* oz */
+	MOVSS( XMM0, D(2) )			/* ->D(2), copied unchanged */
+
+LLBL(K_GTP3P2DNRR_skip):
+	ADD_L( CONST(16), EDI )			/* next dest vertex (16 bytes each) */
+	ADD_L( EAX, ESI )			/* next source vertex (+= stride) */
+	CMP_L( ECX, EDI )			/* reached end-of-dest ptr ? */
+	JNE( LLBL(K_GTP3P2DNRR_top) )		/* no -> transform next vertex */
+
+LLBL(K_GTP3P2DNRR_finish):
+	POP_L( EDI )
+	POP_L( ESI )
+	RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points3_3d)
+HIDDEN(_mesa_sse_transform_points3_3d)
+GLNAME(_mesa_sse_transform_points3_3d):
+/*
+ * Reads the first three floats (ox, oy, oz) of every source vertex and
+ * writes three floats per destination vertex:
+ *     D(i) = ox*m(i) + oy*m(4+i) + oz*m(8+i) + m(12+i)      for i = 0..2
+ * D(3) is not written.  The destination vector receives the source count,
+ * size 3 and the VEC_SIZE_3 flag bits.  When the source count is zero the
+ * routine returns without touching the destination.
+ * The matrix rows are fetched with MOVAPS, so the matrix must be 16-byte
+ * aligned.
+ * NOTE(review): the +8 on the stack offsets presumably skips the two
+ * registers pushed on entry (it matches FRAME_OFFSET) - confirm against
+ * xform_args.h.
+ */
+#define FRAME_OFFSET 8
+	PUSH_L( ESI )				/* saved here, restored at finish */
+	PUSH_L( EDI )				/* saved here, restored at finish */
+
+	MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) 	/* ptr to source GLvector4f */
+	MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) 	/* ptr to dest GLvector4f */
+
+
+	MOV_L( ARG_MATRIX, EDX ) 		/* ptr to matrix */
+	MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
+
+	TEST_L( ECX, ECX)			/* count == 0 ? */
+	JZ( LLBL(K_GTP3P3DR_finish) ) 	/* count was zero; go to finish */
+
+	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* stride */
+	OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */
+
+	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
+	MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */
+
+	SHL_L( CONST(4), ECX ) 			/* count *= 16 */
+	MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */
+
+	MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
+	ADD_L( EDI, ECX ) 			/* count += dest ptr; ECX = end-of-dest ptr */
+
+
+ALIGNTEXT32
+	MOVAPS( M(0), XMM0 )			/* m3  | m2  | m1  | m0   (top lane unused) */
+	MOVAPS( M(4), XMM1 )			/* m7  | m6  | m5  | m4   (top lane unused) */
+	MOVAPS( M(8), XMM2 )			/* m11 | m10 | m9  | m8   (top lane unused) */
+	MOVAPS( M(12), XMM3 )			/* m15 | m14 | m13 | m12  (top lane unused) */
+
+ALIGNTEXT32
+LLBL(K_GTP3P3DR_top):
+	MOVSS( S(0), XMM4 )			/* ox */
+	SHUFPS( CONST(0x0), XMM4, XMM4 )	/* ox | ox | ox */
+	MULPS( XMM0, XMM4 )			/* ox*m2 | ox*m1 | ox*m0 */
+
+	MOVSS( S(1), XMM5 )			/* oy */
+	SHUFPS( CONST(0x0), XMM5, XMM5 )	/* oy | oy | oy */
+	MULPS( XMM1, XMM5 )			/* oy*m6 | oy*m5 | oy*m4 */
+
+	MOVSS( S(2), XMM6 )			/* oz */
+	SHUFPS( CONST(0x0), XMM6, XMM6 )	/* oz | oz | oz */
+	MULPS( XMM2, XMM6 )			/* oz*m10 | oz*m9 | oz*m8 */
+
+	ADDPS( XMM5, XMM4 )			/* + | + | + */
+	ADDPS( XMM6, XMM4 )			/* + | + | + */
+	ADDPS( XMM3, XMM4 )			/* + | + | + */
+
+	MOVLPS( XMM4, D(0) )			/* => D(1) | => D(0) */
+	UNPCKHPS( XMM4, XMM4 )			/* bring lane 2 down into lane 0 */
+	MOVSS( XMM4, D(2) )			/* => D(2) */
+
+LLBL(K_GTP3P3DR_skip):
+	ADD_L( CONST(16), EDI )			/* next dest vertex (16 bytes each) */
+	ADD_L( EAX, ESI )			/* next source vertex (+= stride) */
+	CMP_L( ECX, EDI )			/* reached end-of-dest ptr ? */
+	JNE( LLBL(K_GTP3P3DR_top) )		/* no -> transform next vertex */
+
+LLBL(K_GTP3P3DR_finish):
+	POP_L( EDI )
+	POP_L( ESI )
+	RET
+#undef FRAME_OFFSET
+#endif
+	
+#if defined (__ELF__) && defined (__linux__)
+	.section .note.GNU-stack,"",%progbits
+#endif
--- a/src/arch/x86/sse_xform4.S
+++ b/src/arch/x86/sse_xform4.S
@ -0,0 +1,235 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version:  3.5
+ *
+ * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifdef USE_SSE_ASM
+#include "assyntax.h"
+#include "matypes.h"
+#include "xform_args.h"
+
+	SEG_TEXT
+
+#define FRAME_OFFSET	8	/* defined once for every routine in this file; NOTE(review): presumably consumed by the ARG_* macros of xform_args.h - confirm */
+
+#define SRC(i)		REGOFF(i * 4, ESI)	/* i-th 4-byte element at ESI (source vertex ptr in the routines below) */
+#define DST(i)		REGOFF(i * 4, EDI)	/* i-th 4-byte element at EDI (dest vertex ptr in the routines below) */
+#define MAT(i)		REGOFF(i * 4, EDX)	/* i-th 4-byte element at EDX (matrix ptr in the routines below) */
+
+#define SELECT(r0, r1, r2, r3)	CONST( r0 * 64 + r1 * 16 + r2 * 4 + r3 )	/* packs four 2-bit fields into one immediate, r0 highest; presumably a SHUFPS selector */
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_sse_transform_points4_general )
+HIDDEN(_mesa_sse_transform_points4_general)
+GLNAME( _mesa_sse_transform_points4_general ):
+/*
+ * Reads four floats (ox, oy, oz, ow) of every source vertex and writes four
+ * floats per destination vertex:
+ *     D(i) = ox*m(i) + oy*m(4+i) + oz*m(8+i) + ow*m(12+i)    for i = 0..3
+ * The destination vector receives the source count, size 4 and the
+ * VEC_SIZE_4 flag bits.  When the source count is zero the routine returns
+ * without touching the destination.
+ * ECX is used directly as a down-counter of vertices.  Both the matrix
+ * loads and the per-vertex store use MOVAPS, so the matrix and every
+ * destination vertex must be 16-byte aligned.
+ */
+	PUSH_L( ESI )				/* saved here, restored at done */
+	PUSH_L( EDI )				/* saved here, restored at done */
+
+	MOV_L( ARG_SOURCE, ESI )		/* ptr to source GLvector4f */
+	MOV_L( ARG_DEST, EDI )			/* ptr to dest GLvector4f */
+
+	MOV_L( ARG_MATRIX, EDX )		/* ptr to matrix */
+	MOV_L( REGOFF(V4F_COUNT, ESI), ECX )	/* source count */
+
+	TEST_L( ECX, ECX )			/* verify non-zero count */
+	JE( LLBL( sse_general_done ) )		/* count was zero; go to done */
+
+	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )	/* stride */
+	OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )	/* set dest count */
+	MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )/* set dest size */
+
+	MOV_L( REGOFF(V4F_START, ESI), ESI )	/* ptr to first source vertex */
+	MOV_L( REGOFF(V4F_START, EDI), EDI )	/* ptr to first dest vertex */
+
+	PREFETCHT0( REGIND(ESI) )		/* prefetch hint for the first source vertex */
+
+	MOVAPS( MAT(0), XMM4 )			/* m3  | m2  | m1  | m0  */
+	MOVAPS( MAT(4), XMM5 )			/* m7  | m6  | m5  | m4  */
+	MOVAPS( MAT(8), XMM6 )			/* m11 | m10 | m9  | m8  */
+	MOVAPS( MAT(12), XMM7 )			/* m15 | m14 | m13 | m12 */
+
+ALIGNTEXT16
+LLBL( sse_general_loop ):
+
+	MOVSS( SRC(0), XMM0 )			/* ox */
+	SHUFPS( CONST(0x0), XMM0, XMM0 )	/* ox | ox | ox | ox */
+	MULPS( XMM4, XMM0 )			/* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
+
+	MOVSS( SRC(1), XMM1 )			/* oy */
+	SHUFPS( CONST(0x0), XMM1, XMM1 )	/* oy | oy | oy | oy */
+	MULPS( XMM5, XMM1 )			/* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
+
+	MOVSS( SRC(2), XMM2 )			/* oz */
+	SHUFPS( CONST(0x0), XMM2, XMM2 )	/* oz | oz | oz | oz */
+	MULPS( XMM6, XMM2 )			/* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
+
+	MOVSS( SRC(3), XMM3 )			/* ow */
+	SHUFPS( CONST(0x0), XMM3, XMM3 )	/* ow | ow | ow | ow */
+	MULPS( XMM7, XMM3 )			/* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
+
+	ADDPS( XMM1, XMM0 )			/* ox*m3+oy*m7 | ... */
+	ADDPS( XMM2, XMM0 )			/* ox*m3+oy*m7+oz*m11 | ... */
+	ADDPS( XMM3, XMM0 )			/* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
+	MOVAPS( XMM0, DST(0) )			/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
+
+	ADD_L( CONST(16), EDI )			/* next dest vertex (16 bytes each) */
+	ADD_L( EAX, ESI )			/* next source vertex (+= stride) */
+
+	DEC_L( ECX )				/* one vertex fewer to go */
+	JNZ( LLBL( sse_general_loop ) )		/* loop until the count reaches zero */
+
+LLBL( sse_general_done ):
+
+	POP_L( EDI )
+	POP_L( ESI )
+	RET
+
+
+
+
+ALIGNTEXT4
+GLOBL GLNAME( _mesa_sse_transform_points4_3d )
+HIDDEN(_mesa_sse_transform_points4_3d)
+GLNAME( _mesa_sse_transform_points4_3d ):
+/*
+ * Reads four floats (ox, oy, oz, ow) of every source vertex.  A full
+ * four-lane product is computed and stored,
+ *     D(i) = ox*m(i) + oy*m(4+i) + oz*m(8+i) + ow*m(12+i)    for i = 0..3
+ * and D(3) is then overwritten with the unmodified source ow.
+ * The destination vector receives the source count, size 3 and the
+ * VEC_SIZE_3 flag bits (even though D(3) is written).  When the source
+ * count is zero the routine returns without touching the destination.
+ * Both the matrix loads and the per-vertex store use MOVAPS, so the matrix
+ * and every destination vertex must be 16-byte aligned.
+ */
+	PUSH_L( ESI )				/* saved here, restored at finish */
+	PUSH_L( EDI )				/* saved here, restored at finish */
+
+	MOV_L( ARG_SOURCE, ESI )		/* ptr to source GLvector4f */
+	MOV_L( ARG_DEST, EDI )			/* ptr to dest GLvector4f */
+
+	MOV_L( ARG_MATRIX, EDX )		/* ptr to matrix */
+	MOV_L( REGOFF(V4F_COUNT, ESI), ECX )	/* source count */
+
+	TEST_L( ECX, ECX)			/* count == 0 ? */
+	JZ( LLBL(K_GTP43P3DR_finish) )		/* count was zero; go to finish */
+
+	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )	/* stride */
+	OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )	/* set dest count */
+	MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )/* set dest size */
+
+	SHL_L( CONST(4), ECX )			/* count *= 16 */
+	MOV_L( REGOFF(V4F_START, ESI), ESI )	/* ptr to first source vertex */
+
+	MOV_L( REGOFF(V4F_START, EDI), EDI )	/* ptr to first dest vertex */
+	ADD_L( EDI, ECX )			/* count += dest ptr; ECX = end-of-dest ptr */
+
+	MOVAPS( MAT(0), XMM0 )			/* m3  | m2  | m1  |  m0 */
+	MOVAPS( MAT(4), XMM1 )			/* m7  | m6  | m5  |  m4 */
+	MOVAPS( MAT(8), XMM2 )			/* m11 | m10 | m9  |  m8 */
+	MOVAPS( MAT(12), XMM3 )			/* m15 | m14 | m13 | m12 */
+
+ALIGNTEXT32
+LLBL( K_GTP43P3DR_top ):
+	MOVSS( SRC(0), XMM4 )			/* ox */
+	SHUFPS( CONST(0x0), XMM4, XMM4 )	/* ox | ox | ox | ox */
+	MULPS( XMM0, XMM4 )			/* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
+
+	MOVSS( SRC(1), XMM5 )			/* oy */
+	SHUFPS( CONST(0x0), XMM5, XMM5 )	/* oy | oy | oy | oy */
+	MULPS( XMM1, XMM5 )			/* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
+
+	MOVSS( SRC(2), XMM6 )			/* oz */
+	SHUFPS( CONST(0x0), XMM6, XMM6 )	/* oz | oz | oz | oz */
+	MULPS( XMM2, XMM6 )			/* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
+
+	MOVSS( SRC(3), XMM7 )			/* ow */
+	SHUFPS( CONST(0x0), XMM7, XMM7 )	/* ow | ow | ow | ow */
+	MULPS( XMM3, XMM7 )			/* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
+
+	ADDPS( XMM5, XMM4 )			/* ox*m3+oy*m7 | ... */
+	ADDPS( XMM6, XMM4 )			/* ox*m3+oy*m7+oz*m11 | ... */
+	ADDPS( XMM7, XMM4 )			/* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
+	MOVAPS( XMM4, DST(0) )			/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
+
+	MOVSS( SRC(3), XMM4 )			/* ow */
+	MOVSS( XMM4, DST(3) )			/* ->D(3), replacing the value stored just above */
+
+LLBL( K_GTP43P3DR_skip ):
+	ADD_L( CONST(16), EDI )			/* next dest vertex (16 bytes each) */
+	ADD_L( EAX, ESI )			/* next source vertex (+= stride) */
+	CMP_L( ECX, EDI )			/* reached end-of-dest ptr ? */
+	JNE( LLBL(K_GTP43P3DR_top) )		/* no -> transform next vertex */
+
+LLBL( K_GTP43P3DR_finish ):
+	POP_L( EDI )
+	POP_L( ESI )
+	RET
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_sse_transform_points4_identity )
+HIDDEN(_mesa_sse_transform_points4_identity)
+GLNAME( _mesa_sse_transform_points4_identity ):
+/*
+ * Copies 16 bytes (four floats) from every source vertex to the matching
+ * destination vertex.  The destination vector receives the source count,
+ * size 4 and the VEC_SIZE_4 flag bits.  When the source count is zero the
+ * routine returns without touching the destination.
+ * ECX is used directly as a down-counter of vertices.  Both the load and
+ * the store use MOVAPS, so every source vertex address (start plus any
+ * multiple of the stride) and every destination vertex must be 16-byte
+ * aligned.  There is no "source data == dest data" shortcut here.
+ * The matrix pointer is loaded into EDX but is not read by this routine.
+ */
+	PUSH_L( ESI )				/* saved here, restored at done */
+	PUSH_L( EDI )				/* saved here, restored at done */
+
+	MOV_L( ARG_SOURCE, ESI )		/* ptr to source GLvector4f */
+	MOV_L( ARG_DEST, EDI )			/* ptr to dest GLvector4f */
+
+	MOV_L( ARG_MATRIX, EDX )		/* ptr to matrix (unused below) */
+	MOV_L( REGOFF(V4F_COUNT, ESI), ECX )	/* source count */
+
+	TEST_L( ECX, ECX )			/* verify non-zero count */
+	JE( LLBL( sse_identity_done ) )		/* count was zero; go to done */
+
+	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )	/* stride */
+	OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )	/* set dest count */
+	MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )/* set dest size */
+
+	MOV_L( REGOFF(V4F_START, ESI), ESI )	/* ptr to first source vertex */
+	MOV_L( REGOFF(V4F_START, EDI), EDI )	/* ptr to first dest vertex */
+
+ALIGNTEXT16
+LLBL( sse_identity_loop ):
+
+	PREFETCHNTA( REGOFF(32, ESI) )		/* prefetch hint, 32 bytes past the current source vertex */
+
+	MOVAPS( REGIND(ESI), XMM0 )		/* S(3) | S(2) | S(1) | S(0) */
+	ADD_L( EAX, ESI )			/* next source vertex (+= stride) */
+
+	MOVAPS( XMM0, REGIND(EDI) )		/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
+	ADD_L( CONST(16), EDI )			/* next dest vertex (16 bytes each) */
+
+	DEC_L( ECX )				/* one vertex fewer to go */
+	JNZ( LLBL( sse_identity_loop ) )	/* loop until the count reaches zero */
+
+LLBL( sse_identity_done ):
+
+	POP_L( EDI )
+	POP_L( ESI )
+	RET
+#endif
+	
+#if defined (__ELF__) && defined (__linux__)
+	.section .note.GNU-stack,"",%progbits
+#endif
--- a/src/arch/x86/x86_cliptest.S
+++ b/src/arch/x86/x86_cliptest.S
@ -0,0 +1,407 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version:  3.5
+ *
+ * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * NOTE: Avoid using spaces in between '(' ')' and arguments, especially
+ * with macros like CONST, LLBL that expand to CONCAT(...).  Putting spaces
+ * in there will break the build on some platforms.
+ */
+
+#include "assyntax.h"
+#include "matypes.h"
+#include "clip_args.h"
+
+/*
+ * Shorthand memory operands for the four consecutive 32-bit components
+ * (byte offsets 0, 4, 8, 12) addressed through ESI (source vertex),
+ * EDI (destination vertex) and EDX.
+ * NOTE(review): the MATn operands are not referenced by the cliptest
+ * routines in this file, where EDX holds the clipmask pointer.
+ */
+#define SRC0		REGOFF(0, ESI)
+#define SRC1		REGOFF(4, ESI)
+#define SRC2		REGOFF(8, ESI)
+#define SRC3		REGOFF(12, ESI)
+#define DST0		REGOFF(0, EDI)
+#define DST1		REGOFF(4, EDI)
+#define DST2		REGOFF(8, EDI)
+#define DST3		REGOFF(12, EDI)
+#define MAT0		REGOFF(0, EDX)
+#define MAT1		REGOFF(4, EDX)
+#define MAT2		REGOFF(8, EDX)
+#define MAT3		REGOFF(12, EDX)
+
+
+/*
+ * Table for clip test.
+ *
+ * The table is indexed by a 7-bit value that the cliptest loops build in
+ * ECX, one bit per test, from the raw 32-bit patterns of the four source
+ * components S(0)..S(3) (SRC0..SRC3):
+ *
+ * 	bit6 = S(3) < 0   (sign bit of S(3) set)
+ * 	bit5 = S(2) < 0
+ * 	bit4 = abs(S(2)) > abs(S(3))
+ * 	bit3 = S(1) < 0
+ * 	bit2 = abs(S(1)) > abs(S(3))
+ * 	bit1 = S(0) < 0
+ * 	bit0 = abs(S(0)) > abs(S(3))
+ *
+ * Each of the 128 entries is the clip mask byte that gets stored for a
+ * vertex with that combination of test results; only the low six bits
+ * (0x01..0x20) are ever set.
+ */
+
+	SEG_DATA
+
+clip_table:
+	D_BYTE 0x00, 0x01, 0x00, 0x02, 0x04, 0x05, 0x04, 0x06
+	D_BYTE 0x00, 0x01, 0x00, 0x02, 0x08, 0x09, 0x08, 0x0a
+	D_BYTE 0x20, 0x21, 0x20, 0x22, 0x24, 0x25, 0x24, 0x26
+	D_BYTE 0x20, 0x21, 0x20, 0x22, 0x28, 0x29, 0x28, 0x2a
+	D_BYTE 0x00, 0x01, 0x00, 0x02, 0x04, 0x05, 0x04, 0x06
+	D_BYTE 0x00, 0x01, 0x00, 0x02, 0x08, 0x09, 0x08, 0x0a
+	D_BYTE 0x10, 0x11, 0x10, 0x12, 0x14, 0x15, 0x14, 0x16
+	D_BYTE 0x10, 0x11, 0x10, 0x12, 0x18, 0x19, 0x18, 0x1a
+	D_BYTE 0x3f, 0x3d, 0x3f, 0x3e, 0x37, 0x35, 0x37, 0x36
+	D_BYTE 0x3f, 0x3d, 0x3f, 0x3e, 0x3b, 0x39, 0x3b, 0x3a
+	D_BYTE 0x2f, 0x2d, 0x2f, 0x2e, 0x27, 0x25, 0x27, 0x26
+	D_BYTE 0x2f, 0x2d, 0x2f, 0x2e, 0x2b, 0x29, 0x2b, 0x2a
+	D_BYTE 0x3f, 0x3d, 0x3f, 0x3e, 0x37, 0x35, 0x37, 0x36
+	D_BYTE 0x3f, 0x3d, 0x3f, 0x3e, 0x3b, 0x39, 0x3b, 0x3a
+	D_BYTE 0x1f, 0x1d, 0x1f, 0x1e, 0x17, 0x15, 0x17, 0x16
+	D_BYTE 0x1f, 0x1d, 0x1f, 0x1e, 0x1b, 0x19, 0x1b, 0x1a
+
+
+	SEG_TEXT
+
+/*
+ * _mesa_x86_cliptest_points4
+ *
+ * For every source vertex: build a 7-bit index from the sign and
+ * magnitude comparisons of its four components, look the clip mask up in
+ * clip_table, store it in clipmask[] and fold it into the running OR/AND
+ * masks.  A vertex whose mask is zero is projected (x/w, y/w, z/w, 1/w);
+ * any other vertex gets (0, 0, 0, 1.0f) written to the destination.
+ * The destination vector's flags, size (4) and count are updated, the
+ * final masks are written back through the OR/AND pointers, and the
+ * destination vector pointer is returned in EAX.
+ * When the source count is zero the loop and the mask write-back are
+ * skipped.
+ *
+ * Register usage inside the loop:
+ *
+ *   AL:  ormask
+ *   AH:  andmask
+ *   EBX: temp0
+ *   ECX: temp1
+ *   EDX: clipmask[]
+ *   ESI: clip[]
+ *   EDI: proj[]
+ *   EBP: temp2
+ *
+ * The ARG_SOURCE and ARG_CLIP stack slots are reused as scratch storage
+ * for the source stride and the end-of-clipmask pointer.
+ */
+
+#if defined(__ELF__) && defined(__PIC__) && defined(GNU_ASSEMBLER) && !defined(ELFPIC)
+#define ELFPIC
+#endif
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_cliptest_points4 )
+HIDDEN(_mesa_x86_cliptest_points4)
+GLNAME( _mesa_x86_cliptest_points4 ):
+
+/*
+ * Bytes pushed below the return address: four saved registers, plus the
+ * clip_table pointer in the ELFPIC case.  Presumably consumed by the
+ * ARG_* macros from clip_args.h.
+ */
+#ifdef ELFPIC
+#define FRAME_OFFSET 20
+#else
+#define FRAME_OFFSET 16
+#endif
+	PUSH_L( ESI )
+	PUSH_L( EDI )
+	PUSH_L( EBP )
+	PUSH_L( EBX )
+
+#ifdef ELFPIC
+	/* store pointer to clip_table on stack */
+	CALL( LLBL(ctp4_get_eip) )
+	ADD_L( CONST(_GLOBAL_OFFSET_TABLE_), EBX )
+	MOV_L( REGOFF(clip_table@GOT, EBX), EBX )
+	PUSH_L( EBX )
+	JMP( LLBL(ctp4_clip_table_ready) )
+
+LLBL(ctp4_get_eip):
+	/* store eip in ebx */
+	MOV_L( REGIND(ESP), EBX )
+	RET
+
+LLBL(ctp4_clip_table_ready):
+#endif
+
+	MOV_L( ARG_SOURCE, ESI )
+	MOV_L( ARG_DEST, EDI )
+
+	MOV_L( ARG_CLIP, EDX )
+	MOV_L( ARG_OR, EBX )
+
+	MOV_L( ARG_AND, EBP )
+	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+
+	MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+	MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+	OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
+	MOV_L( EAX, ARG_SOURCE )	/* put stride in ARG_SOURCE */
+
+	MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
+	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+
+	MOV_L( REGOFF(V4F_START, EDI), EDI )
+	ADD_L( EDX, ECX )
+
+	MOV_L( ECX, ARG_CLIP )		/* put clipmask + count in ARG_CLIP */
+	CMP_L( ECX, EDX )
+
+	/* the MOVs below leave the flags of the CMP above untouched */
+	MOV_B( REGIND(EBX), AL )
+	MOV_B( REGIND(EBP), AH )
+
+	JZ( LLBL(ctp4_finish) )		/* count == 0: nothing to do */
+
+ALIGNTEXT16
+LLBL(ctp4_top):
+
+	/* start computing 1/S(3) early; it is discarded if the vertex is clipped */
+	FLD1				/* F3 */
+	FDIV_S( SRC3 )		/* GH: don't care about div-by-zero */
+
+	MOV_L( SRC3, EBP )
+	MOV_L( SRC2, EBX )
+
+	/*
+	 * Adding a register to itself shifts the sign bit out into the
+	 * carry flag; each ADC then shifts ECX left and appends that carry,
+	 * building the clip_table index one bit at a time.
+	 */
+	XOR_L( ECX, ECX )
+	ADD_L( EBP, EBP )	/* ebp = abs(S(3))*2 ; carry = sign of S(3) */
+
+	ADC_L( ECX, ECX )
+	ADD_L( EBX, EBX )	/* ebx = abs(S(2))*2 ; carry = sign of S(2) */
+
+	ADC_L( ECX, ECX )
+	CMP_L( EBX, EBP )	/* carry = abs(S(2))*2 > abs(S(3))*2 */
+
+	ADC_L( ECX, ECX )
+	MOV_L( SRC1, EBX )
+
+	ADD_L( EBX, EBX )	/* ebx = abs(S(1))*2 ; carry = sign of S(1) */
+
+	ADC_L( ECX, ECX )
+	CMP_L( EBX, EBP )	/* carry = abs(S(1))*2 > abs(S(3))*2 */
+
+	ADC_L( ECX, ECX )
+	MOV_L( SRC0, EBX )
+
+	ADD_L( EBX, EBX )	/* ebx = abs(S(0))*2 ; carry = sign of S(0) */
+
+	ADC_L( ECX, ECX )
+	CMP_L( EBX, EBP )	/* carry = abs(S(0))*2 > abs(S(3))*2 */
+
+	ADC_L( ECX, ECX )
+
+	/* replace the 7-bit index in ECX by its clip mask byte */
+#ifdef ELFPIC
+	MOV_L( REGIND(ESP), EBP )	/* clip_table */
+
+	MOV_B( REGBI(EBP, ECX), CL )
+#else
+	MOV_B( REGOFF(clip_table,ECX), CL )
+#endif
+
+	OR_B( CL, AL )
+	AND_B( CL, AH )
+
+	TEST_B( CL, CL )
+	MOV_B( CL, REGIND(EDX) )	/* store mask; flags from TEST survive */
+
+	JZ( LLBL(ctp4_proj) )
+
+LLBL(ctp4_noproj):
+
+	/* clipped vertex: drop 1/S(3) and write (0, 0, 0, 1.0f) */
+	FSTP( ST(0) )			/* */
+
+	MOV_L( CONST(0), DST0 )
+	MOV_L( CONST(0), DST1 )
+	MOV_L( CONST(0), DST2 )
+	MOV_L( CONST(0x3f800000), DST3 )
+
+	JMP( LLBL(ctp4_next) )
+
+LLBL(ctp4_proj):
+
+	/* unclipped vertex: scale S(0..2) by 1/S(3), store 1/S(3) in DST3 */
+	FLD_S( SRC0 )			/* F0 F3 */
+	FMUL2( ST(1), ST0 )
+
+	FLD_S( SRC1 )			/* F1 F0 F3 */
+	FMUL2( ST(2), ST0 )
+
+	FLD_S( SRC2 )			/* F2 F1 F0 F3 */
+	FMUL2( ST(3), ST0 )
+
+	FXCH( ST(2) )			/* F0 F1 F2 F3 */
+	FSTP_S( DST0 )		/* F1 F2 F3 */
+	FSTP_S( DST1 )		/* F2 F3 */
+	FSTP_S( DST2 )		/* F3 */
+	FSTP_S( DST3 )		/* */
+
+LLBL(ctp4_next):
+
+	INC_L( EDX )
+	ADD_L( CONST(16), EDI )
+
+	ADD_L( ARG_SOURCE, ESI )	/* advance by the saved source stride */
+	CMP_L( EDX, ARG_CLIP )		/* reached clipmask + count? */
+
+	JNZ( LLBL(ctp4_top) )
+
+	/* write the accumulated masks back through the caller's pointers */
+	MOV_L( ARG_OR, ECX )
+	MOV_L( ARG_AND, EDX )
+
+	MOV_B( AL, REGIND(ECX) )
+	MOV_B( AH, REGIND(EDX) )
+
+LLBL(ctp4_finish):
+
+	MOV_L( ARG_DEST, EAX )		/* return value: destination vector */
+#ifdef ELFPIC
+	POP_L( ESI )			/* discard ptr to clip_table */
+#endif
+	POP_L( EBX )
+	POP_L( EBP )
+	POP_L( EDI )
+	POP_L( ESI )
+
+	RET
+
+
+
+
+
+
+
+/*
+ * _mesa_x86_cliptest_points4_np
+ *
+ * "No projection" clip test: computes the clip mask of every source
+ * vertex exactly like the loop in _mesa_x86_cliptest_points4 (7-bit index
+ * into clip_table), stores it in clipmask[] and folds it into the running
+ * OR/AND masks, but never reads or writes a destination vector.
+ * The final masks are written back through the OR/AND pointers and the
+ * source vector pointer is returned in EAX.  When the source count is
+ * zero the loop and the mask write-back are skipped.
+ *
+ *   AL:  ormask
+ *   AH:  andmask
+ *   EBX: temp0
+ *   ECX: temp1
+ *   EDX: clipmask[]
+ *   ESI: clip[]
+ *   EDI: clipmask + count (loop end)
+ *   EBP: temp2
+ *
+ * The ARG_DEST stack slot is reused as scratch storage for the stride.
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_cliptest_points4_np )
+HIDDEN(_mesa_x86_cliptest_points4_np)
+GLNAME( _mesa_x86_cliptest_points4_np ):
+
+/*
+ * Bytes pushed below the return address: four saved registers, plus the
+ * clip_table pointer in the ELFPIC case.  Presumably consumed by the
+ * ARG_* macros from clip_args.h.
+ */
+#ifdef ELFPIC
+#define FRAME_OFFSET 20
+#else
+#define FRAME_OFFSET 16
+#endif
+	PUSH_L( ESI )
+	PUSH_L( EDI )
+	PUSH_L( EBP )
+	PUSH_L( EBX )
+
+#ifdef ELFPIC
+	/* store pointer to clip_table on stack */
+	CALL( LLBL(ctp4_np_get_eip) )
+	ADD_L( CONST(_GLOBAL_OFFSET_TABLE_), EBX )
+	MOV_L( REGOFF(clip_table@GOT, EBX), EBX )
+	PUSH_L( EBX )
+	JMP( LLBL(ctp4_np_clip_table_ready) )
+
+LLBL(ctp4_np_get_eip):
+	/* store eip in ebx */
+	MOV_L( REGIND(ESP), EBX )
+	RET
+
+LLBL(ctp4_np_clip_table_ready):
+#endif
+
+	MOV_L( ARG_SOURCE, ESI )
+	/* slot */
+
+	MOV_L( ARG_CLIP, EDX )
+	MOV_L( ARG_OR, EBX )
+
+	MOV_L( ARG_AND, EBP )
+	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+
+	MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+	MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+	MOV_L( EAX, ARG_DEST )   	/* put stride in ARG_DEST */
+	ADD_L( EDX, ECX )
+
+	MOV_L( ECX, EDI )		/* put clipmask + count in EDI */
+	CMP_L( ECX, EDX )
+
+	/* the MOVs below leave the flags of the CMP above untouched */
+	MOV_B( REGIND(EBX), AL )
+	MOV_B( REGIND(EBP), AH )
+
+	JZ( LLBL(ctp4_np_finish) )	/* count == 0: nothing to do */
+
+ALIGNTEXT16
+LLBL(ctp4_np_top):
+
+	MOV_L( SRC3, EBP )
+	MOV_L( SRC2, EBX )
+
+	/*
+	 * Adding a register to itself shifts the sign bit out into the
+	 * carry flag; each ADC then shifts ECX left and appends that carry,
+	 * building the clip_table index one bit at a time.
+	 */
+	XOR_L( ECX, ECX )
+	ADD_L( EBP, EBP )	/* ebp = abs(S(3))*2 ; carry = sign of S(3) */
+
+	ADC_L( ECX, ECX )
+	ADD_L( EBX, EBX )	/* ebx = abs(S(2))*2 ; carry = sign of S(2) */
+
+	ADC_L( ECX, ECX )
+	CMP_L( EBX, EBP )	/* carry = abs(S(2))*2 > abs(S(3))*2 */
+
+	ADC_L( ECX, ECX )
+	MOV_L( SRC1, EBX )
+
+	ADD_L( EBX, EBX )	/* ebx = abs(S(1))*2 ; carry = sign of S(1) */
+
+	ADC_L( ECX, ECX )
+	CMP_L( EBX, EBP )	/* carry = abs(S(1))*2 > abs(S(3))*2 */
+
+	ADC_L( ECX, ECX )
+	MOV_L( SRC0, EBX )
+
+	ADD_L( EBX, EBX )	/* ebx = abs(S(0))*2 ; carry = sign of S(0) */
+
+	ADC_L( ECX, ECX )
+	CMP_L( EBX, EBP )	/* carry = abs(S(0))*2 > abs(S(3))*2 */
+
+	ADC_L( ECX, ECX )
+
+	/* replace the 7-bit index in ECX by its clip mask byte */
+#ifdef ELFPIC
+	MOV_L( REGIND(ESP), EBP )	/* clip_table */
+
+	MOV_B( REGBI(EBP, ECX), CL )
+#else
+	MOV_B( REGOFF(clip_table,ECX), CL )
+#endif
+
+	OR_B( CL, AL )
+	AND_B( CL, AH )
+
+	/* NOTE(review): the flags set by this TEST are overwritten by the
+	 * INC below before any branch reads them. */
+	TEST_B( CL, CL )
+	MOV_B( CL, REGIND(EDX) )
+
+	INC_L( EDX )
+	/* slot */
+
+	ADD_L( ARG_DEST, ESI )		/* advance by the saved source stride */
+	CMP_L( EDX, EDI )		/* reached clipmask + count? */
+
+	JNZ( LLBL(ctp4_np_top) )
+
+	/* write the accumulated masks back through the caller's pointers */
+	MOV_L( ARG_OR, ECX )
+	MOV_L( ARG_AND, EDX )
+
+	MOV_B( AL, REGIND(ECX) )
+	MOV_B( AH, REGIND(EDX) )
+
+LLBL(ctp4_np_finish):
+
+	MOV_L( ARG_SOURCE, EAX )	/* return value: source vector */
+#ifdef ELFPIC
+	POP_L( ESI )			/* discard ptr to clip_table */
+#endif
+	POP_L( EBX )
+	POP_L( EBP )
+	POP_L( EDI )
+	POP_L( ESI )
+
+	RET
+	
+#if defined (__ELF__) && defined (__linux__)
+	.section .note.GNU-stack,"",%progbits
+#endif
--- a/src/arch/x86/x86_xform.c
+++ b/src/arch/x86/x86_xform.c
@ -0,0 +1,126 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version:  3.5
+ *
+ * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Intel x86 assembly code by Josh Vanderhoof
+ */
+
+#include "main/glheader.h"
+#include "main/context.h"
+#include "math/m_xform.h"
+
+#include "x86_xform.h"
+#include "common_x86_asm.h"
+
+#ifdef USE_X86_ASM
+#ifdef USE_3DNOW_ASM
+#include "3dnow.h"
+#endif
+#ifdef USE_SSE_ASM
+#include "sse.h"
+#endif
+#endif
+
+#ifdef DEBUG_MATH
+#include "math/m_debug.h"
+#endif
+
+
+#ifdef USE_X86_ASM
+/* Prototypes for the x86 transform routines of vector sizes 2, 3 and 4. */
+DECLARE_XFORM_GROUP( x86, 2 )
+DECLARE_XFORM_GROUP( x86, 3 )
+DECLARE_XFORM_GROUP( x86, 4 )
+
+
+/*
+ * Assembly clip test with projection for 4-component vectors.
+ * Installed into _mesa_clip_tab[4] below.
+ */
+extern GLvector4f * _ASMAPI
+_mesa_x86_cliptest_points4( GLvector4f *clip_vec,
+			    GLvector4f *proj_vec,
+			    GLubyte clipMask[],
+			    GLubyte *orMask,
+			    GLubyte *andMask,
+			    GLboolean viewport_z_clip );
+
+/*
+ * Assembly clip test without projection for 4-component vectors.
+ * Installed into _mesa_clip_np_tab[4] below.
+ */
+extern GLvector4f * _ASMAPI
+_mesa_x86_cliptest_points4_np( GLvector4f *clip_vec,
+			       GLvector4f *proj_vec,
+			       GLubyte clipMask[],
+			       GLubyte *orMask,
+			       GLubyte *andMask,
+			       GLboolean viewport_z_clip );
+
+/* NOTE(review): declared here but not referenced anywhere in this file. */
+extern void _ASMAPI
+_mesa_v16_x86_cliptest_points4( GLfloat *first_vert,
+				GLfloat *last_vert,
+				GLubyte *or_mask,
+				GLubyte *and_mask,
+				GLubyte *clip_mask,
+				GLboolean viewport_z_clip );
+
+/* NOTE(review): declared here but not referenced anywhere in this file. */
+extern void _ASMAPI
+_mesa_v16_x86_general_xform( GLfloat *dest,
+			     const GLfloat *m,
+			     const GLfloat *src,
+			     GLuint src_stride,
+			     GLuint count );
+#endif
+
+
+#ifdef USE_X86_ASM
+/*
+ * Point the generic dispatch tables at the plain x86 assembly routines:
+ * the two 4-component clip-test entries and the transform entries for
+ * vector sizes 2, 3 and 4.  With DEBUG_MATH defined, the freshly
+ * installed routines are run through the math self-tests.
+ */
+static void _mesa_init_x86_transform_asm( void )
+{
+   /* clip testing: only the 4-component variants exist in assembly */
+   _mesa_clip_np_tab[4] = _mesa_x86_cliptest_points4_np;
+   _mesa_clip_tab[4] = _mesa_x86_cliptest_points4;
+
+   /* point transforms, one group of matrix-type entries per vector size */
+   ASSIGN_XFORM_GROUP( x86, 4 );
+   ASSIGN_XFORM_GROUP( x86, 3 );
+   ASSIGN_XFORM_GROUP( x86, 2 );
+
+#ifdef DEBUG_MATH
+   _math_test_all_transform_functions( "x86" );
+   _math_test_all_cliptest_functions( "x86" );
+#endif
+}
+#endif
+
+
+/*
+ * Detect the CPU's features, then install every x86 assembly routine set
+ * that this build provides and the running CPU supports: plain x86
+ * first, then 3DNow!, then SSE, so later sets override earlier entries.
+ *
+ * The 3DNow! and SSE initialisers are only declared when USE_3DNOW_ASM /
+ * USE_SSE_ASM are defined (see the conditional includes at the top of
+ * this file), so the calls are guarded by the same macros to avoid
+ * implicitly declared functions in partial builds.
+ */
+void _mesa_init_all_x86_transform_asm( void )
+{
+   _mesa_get_x86_features();
+
+#ifdef USE_X86_ASM
+   if ( _mesa_x86_cpu_features ) {
+      _mesa_init_x86_transform_asm();
+   }
+
+#ifdef USE_3DNOW_ASM
+   if ( cpu_has_3dnow ) {
+      _mesa_init_3dnow_transform_asm();
+   }
+#endif
+
+#ifdef USE_SSE_ASM
+   if ( cpu_has_xmm ) {
+      _mesa_init_sse_transform_asm();
+   }
+#endif
+
+#endif
+}
--- a/src/arch/x86/x86_xform.h
+++ b/src/arch/x86/x86_xform.h
@ -0,0 +1,106 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version:  3.5
+ *
+ * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Gareth Hughes
+ */
+
+#ifndef X86_XFORM_H
+#define X86_XFORM_H
+
+
+/* =============================================================
+ * Transformation function declarations:
+ */
+
+/* Parameter list shared by every point-transform routine. */
+#define XFORM_ARGS	GLvector4f *to_vec,				\
+			const GLfloat m[16],				\
+			const GLvector4f *from_vec
+
+/*
+ * Declare the seven transform routines
+ * _mesa_<pfx>_transform_points<sz>_<kind> for one implementation prefix
+ * (pfx) and one vector size (sz).  The expansion already ends with a
+ * semicolon, so the macro is used without a trailing one.
+ */
+#define DECLARE_XFORM_GROUP( pfx, sz ) \
+extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_general( XFORM_ARGS );		\
+extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_identity( XFORM_ARGS );	\
+extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_3d_no_rot( XFORM_ARGS );	\
+extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_perspective( XFORM_ARGS );	\
+extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_2d( XFORM_ARGS );		\
+extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_2d_no_rot( XFORM_ARGS );	\
+extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_3d( XFORM_ARGS );
+
+/*
+ * Store the seven routines declared by DECLARE_XFORM_GROUP( pfx, sz )
+ * into _mesa_transform_tab[sz][...], one per MATRIX_* entry.  Expands to
+ * several statements, so it must not be used as the unbraced body of an
+ * if/else or loop.
+ */
+#define ASSIGN_XFORM_GROUP( pfx, sz )					\
+   _mesa_transform_tab[sz][MATRIX_GENERAL] =				\
+      _mesa_##pfx##_transform_points##sz##_general;			\
+   _mesa_transform_tab[sz][MATRIX_IDENTITY] =				\
+      _mesa_##pfx##_transform_points##sz##_identity;			\
+   _mesa_transform_tab[sz][MATRIX_3D_NO_ROT] =				\
+      _mesa_##pfx##_transform_points##sz##_3d_no_rot;			\
+   _mesa_transform_tab[sz][MATRIX_PERSPECTIVE] =			\
+      _mesa_##pfx##_transform_points##sz##_perspective;			\
+   _mesa_transform_tab[sz][MATRIX_2D] =					\
+      _mesa_##pfx##_transform_points##sz##_2d;				\
+   _mesa_transform_tab[sz][MATRIX_2D_NO_ROT] =				\
+      _mesa_##pfx##_transform_points##sz##_2d_no_rot;			\
+   _mesa_transform_tab[sz][MATRIX_3D] =					\
+      _mesa_##pfx##_transform_points##sz##_3d;
+
+
+/* =============================================================
+ * Normal transformation function declarations:
+ */
+
+/* Parameter list shared by every normal-transform routine. */
+#define NORM_ARGS	const GLmatrix *mat,				\
+			GLfloat scale,					\
+			const GLvector4f *in,				\
+			const GLfloat *lengths,				\
+			GLvector4f *dest
+
+/*
+ * Declare the eight normal-processing routines _mesa_<pfx>_<kind> for
+ * one implementation prefix (pfx).  The expansion already ends with a
+ * semicolon.
+ */
+#define DECLARE_NORM_GROUP( pfx ) \
+extern void _ASMAPI _mesa_##pfx##_rescale_normals( NORM_ARGS );				\
+extern void _ASMAPI _mesa_##pfx##_normalize_normals( NORM_ARGS );			\
+extern void _ASMAPI _mesa_##pfx##_transform_normals( NORM_ARGS );			\
+extern void _ASMAPI _mesa_##pfx##_transform_normals_no_rot( NORM_ARGS );		\
+extern void _ASMAPI _mesa_##pfx##_transform_rescale_normals( NORM_ARGS );		\
+extern void _ASMAPI _mesa_##pfx##_transform_rescale_normals_no_rot( NORM_ARGS );	\
+extern void _ASMAPI _mesa_##pfx##_transform_normalize_normals( NORM_ARGS );		\
+extern void _ASMAPI _mesa_##pfx##_transform_normalize_normals_no_rot( NORM_ARGS );
+
+/*
+ * Store the eight routines declared by DECLARE_NORM_GROUP( pfx ) into
+ * _mesa_normal_tab[], indexed by the NORM_* flag combinations.  Expands
+ * to several statements, so it must not be used as the unbraced body of
+ * an if/else or loop.
+ */
+#define ASSIGN_NORM_GROUP( pfx )					\
+   _mesa_normal_tab[NORM_RESCALE] =					\
+      _mesa_##pfx##_rescale_normals;					\
+   _mesa_normal_tab[NORM_NORMALIZE] =					\
+      _mesa_##pfx##_normalize_normals;					\
+   _mesa_normal_tab[NORM_TRANSFORM] =					\
+      _mesa_##pfx##_transform_normals;					\
+   _mesa_normal_tab[NORM_TRANSFORM_NO_ROT] =				\
+      _mesa_##pfx##_transform_normals_no_rot;				\
+   _mesa_normal_tab[NORM_TRANSFORM | NORM_RESCALE] =			\
+      _mesa_##pfx##_transform_rescale_normals;				\
+   _mesa_normal_tab[NORM_TRANSFORM_NO_ROT | NORM_RESCALE] =		\
+      _mesa_##pfx##_transform_rescale_normals_no_rot;			\
+   _mesa_normal_tab[NORM_TRANSFORM | NORM_NORMALIZE] =			\
+      _mesa_##pfx##_transform_normalize_normals;			\
+   _mesa_normal_tab[NORM_TRANSFORM_NO_ROT | NORM_NORMALIZE] =		\
+      _mesa_##pfx##_transform_normalize_normals_no_rot;
+
+
+#endif
--- a/src/arch/x86/x86_xform2.S
+++ b/src/arch/x86/x86_xform2.S
@ -0,0 +1,574 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version:  3.5
+ *
+ * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * NOTE: Avoid using spaces in between '(' ')' and arguments, especially
+ * with macros like CONST, LLBL that expand to CONCAT(...).  Putting spaces
+ * in there will break the build on some platforms.
+ */
+
+#include "assyntax.h"
+#include "matypes.h"
+#include "xform_args.h"
+
+	SEG_TEXT
+
+/* IEEE-754 single-precision bit patterns: 1065353216 == 0x3f800000 == 1.0f */
+#define FP_ONE		1065353216
+#define FP_ZERO		0
+
+/*
+ * Shorthand memory operands: 32-bit components at byte offset 4*n from
+ * ESI (current source vertex), EDI (current destination vertex) and
+ * EDX (the 16-element matrix).
+ */
+#define SRC0		REGOFF(0, ESI)
+#define SRC1		REGOFF(4, ESI)
+#define SRC2		REGOFF(8, ESI)
+#define SRC3		REGOFF(12, ESI)
+#define DST0		REGOFF(0, EDI)
+#define DST1		REGOFF(4, EDI)
+#define DST2		REGOFF(8, EDI)
+#define DST3		REGOFF(12, EDI)
+#define MAT0		REGOFF(0, EDX)
+#define MAT1		REGOFF(4, EDX)
+#define MAT2		REGOFF(8, EDX)
+#define MAT3		REGOFF(12, EDX)
+#define MAT4		REGOFF(16, EDX)
+#define MAT5		REGOFF(20, EDX)
+#define MAT6		REGOFF(24, EDX)
+#define MAT7		REGOFF(28, EDX)
+#define MAT8		REGOFF(32, EDX)
+#define MAT9		REGOFF(36, EDX)
+#define MAT10		REGOFF(40, EDX)
+#define MAT11		REGOFF(44, EDX)
+#define MAT12		REGOFF(48, EDX)
+#define MAT13		REGOFF(52, EDX)
+#define MAT14		REGOFF(56, EDX)
+#define MAT15		REGOFF(60, EDX)
+
+
+/*
+ * _mesa_x86_transform_points2_general
+ *
+ * Transform 2-component points by a general matrix.  For every vertex:
+ *
+ *   DST0 = SRC0*MAT0 + SRC1*MAT4 + MAT12
+ *   DST1 = SRC0*MAT1 + SRC1*MAT5 + MAT13
+ *   DST2 = SRC0*MAT2 + SRC1*MAT6 + MAT14
+ *   DST3 = SRC0*MAT3 + SRC1*MAT7 + MAT15
+ *
+ * The destination's count, size (4) and VEC_SIZE_4 flag are updated;
+ * nothing is written when the source count is zero.
+ *
+ *   EAX: source stride          ESI: current source vertex
+ *   ECX: dest start + count*16  EDI: current dest vertex
+ *   EDX: matrix
+ *
+ * The stack comments list the x87 stack from ST(0) outwards; F4..F7 are
+ * the SRC0 products / results, F0..F3 the SRC1 products.
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points2_general )
+HIDDEN(_mesa_x86_transform_points2_general)
+GLNAME( _mesa_x86_transform_points2_general ):
+
+/* bytes pushed by the prologue; presumably used by the ARG_* macros */
+#define FRAME_OFFSET 8
+	PUSH_L( ESI )
+	PUSH_L( EDI )
+
+	MOV_L( ARG_SOURCE, ESI )
+	MOV_L( ARG_DEST, EDI )
+
+	MOV_L( ARG_MATRIX, EDX )
+	MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+	TEST_L( ECX, ECX )
+	JZ( LLBL(x86_p2_gr_done) )
+
+	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+	OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
+
+	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+	MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
+
+	SHL_L( CONST(4), ECX )		/* count * 16 bytes per dest vertex */
+	MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+	MOV_L( REGOFF(V4F_START, EDI), EDI )
+	ADD_L( EDI, ECX )		/* ECX = one past the last dest vertex */
+
+ALIGNTEXT16
+LLBL(x86_p2_gr_loop):
+
+	FLD_S( SRC0 )			/* F4 */
+	FMUL_S( MAT0 )
+	FLD_S( SRC0 )			/* F5 F4 */
+	FMUL_S( MAT1 )
+	FLD_S( SRC0 )			/* F6 F5 F4 */
+	FMUL_S( MAT2 )
+	FLD_S( SRC0 )			/* F7 F6 F5 F4 */
+	FMUL_S( MAT3 )
+
+	FLD_S( SRC1 )			/* F0 F7 F6 F5 F4 */
+	FMUL_S( MAT4 )
+	FLD_S( SRC1 )			/* F1 F0 F7 F6 F5 F4 */
+	FMUL_S( MAT5 )
+	FLD_S( SRC1 )			/* F2 F1 F0 F7 F6 F5 F4 */
+	FMUL_S( MAT6 )
+	FLD_S( SRC1 )			/* F3 F2 F1 F0 F7 F6 F5 F4 */
+	FMUL_S( MAT7 )
+
+	/* fold each SRC1 product into the matching SRC0 product */
+	FXCH( ST(3) )			/* F0 F2 F1 F3 F7 F6 F5 F4 */
+	FADDP( ST0, ST(7) )		/* F2 F1 F3 F7 F6 F5 F4 */
+	FXCH( ST(1) )			/* F1 F2 F3 F7 F6 F5 F4 */
+	FADDP( ST0, ST(5) )		/* F2 F3 F7 F6 F5 F4 */
+	FADDP( ST0, ST(3) )		/* F3 F7 F6 F5 F4 */
+	FADDP( ST0, ST(1) )		/* F7 F6 F5 F4 */
+
+	/* add the translation column MAT12..MAT15 */
+	FXCH( ST(3) )			/* F4 F6 F5 F7 */
+	FADD_S( MAT12 )
+	FXCH( ST(2) )			/* F5 F6 F4 F7 */
+	FADD_S( MAT13 )
+	FXCH( ST(1) )			/* F6 F5 F4 F7 */
+	FADD_S( MAT14 )
+	FXCH( ST(3) )			/* F7 F5 F4 F6 */
+	FADD_S( MAT15 )
+
+	FXCH( ST(2) )			/* F4 F5 F7 F6 */
+	FSTP_S( DST0 )			/* F5 F7 F6 */
+	FSTP_S( DST1 )			/* F7 F6 */
+	FXCH( ST(1) )			/* F6 F7 */
+	FSTP_S( DST2 )			/* F7 */
+	FSTP_S( DST3 )			/* */
+
+LLBL(x86_p2_gr_skip):
+
+	ADD_L( CONST(16), EDI )
+	ADD_L( EAX, ESI )
+	CMP_L( ECX, EDI )
+	JNE( LLBL(x86_p2_gr_loop) )
+
+LLBL(x86_p2_gr_done):
+
+	POP_L( EDI )
+	POP_L( ESI )
+	RET
+#undef FRAME_OFFSET
+
+
+
+
+/*
+ * _mesa_x86_transform_points2_perspective
+ *
+ * Transform 2-component points by a perspective matrix.  For every
+ * vertex:
+ *
+ *   DST0 = SRC0*MAT0
+ *   DST1 = SRC1*MAT5
+ *   DST2 = MAT14	(copied as a raw 32-bit word through EBX)
+ *   DST3 = 0
+ *
+ * The destination's count, size (4) and VEC_SIZE_4 flag are updated;
+ * nothing is written when the source count is zero.
+ *
+ *   EAX: source stride          ESI: current source vertex
+ *   EBX: MAT14                  EDI: current dest vertex
+ *   ECX: dest start + count*16  EDX: matrix
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points2_perspective )
+HIDDEN(_mesa_x86_transform_points2_perspective)
+GLNAME( _mesa_x86_transform_points2_perspective ):
+
+/* bytes pushed by the prologue; presumably used by the ARG_* macros */
+#define FRAME_OFFSET 12
+	PUSH_L( ESI )
+	PUSH_L( EDI )
+	PUSH_L( EBX )
+
+	MOV_L( ARG_SOURCE, ESI )
+	MOV_L( ARG_DEST, EDI )
+
+	MOV_L( ARG_MATRIX, EDX )
+	MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+	TEST_L( ECX, ECX )
+	JZ( LLBL(x86_p2_pr_done) )
+
+	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+	OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
+
+	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+	MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
+
+	SHL_L( CONST(4), ECX )		/* count * 16 bytes per dest vertex */
+	MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+	MOV_L( REGOFF(V4F_START, EDI), EDI )
+	ADD_L( EDI, ECX )		/* ECX = one past the last dest vertex */
+
+	MOV_L( MAT14, EBX )		/* loop-invariant DST2 value */
+
+ALIGNTEXT16
+LLBL(x86_p2_pr_loop):
+
+	FLD_S( SRC0 )			/* F4 */
+	FMUL_S( MAT0 )
+
+	FLD_S( SRC1 )			/* F1 F4 */
+	FMUL_S( MAT5 )
+
+	FXCH( ST(1) )			/* F4 F1 */
+	FSTP_S( DST0   )		/* F1 */
+	FSTP_S( DST1   )		/* */
+	MOV_L( EBX, DST2 )
+	MOV_L( CONST(FP_ZERO), DST3 )
+
+LLBL(x86_p2_pr_skip):
+
+	ADD_L( CONST(16), EDI )
+	ADD_L( EAX, ESI )
+	CMP_L( ECX, EDI )
+	JNE( LLBL(x86_p2_pr_loop) )
+
+LLBL(x86_p2_pr_done):
+
+	POP_L( EBX )
+	POP_L( EDI )
+	POP_L( ESI )
+	RET
+#undef FRAME_OFFSET
+
+
+
+
+/*
+ * _mesa_x86_transform_points2_3d
+ *
+ * Transform 2-component points by a 3D matrix.  For every vertex:
+ *
+ *   DST0 = SRC0*MAT0 + SRC1*MAT4 + MAT12
+ *   DST1 = SRC0*MAT1 + SRC1*MAT5 + MAT13
+ *   DST2 = SRC0*MAT2 + SRC1*MAT6 + MAT14
+ *
+ * DST3 is not written.  The destination's count, size (3) and VEC_SIZE_3
+ * flag are updated; nothing is written when the source count is zero.
+ *
+ *   EAX: source stride          ESI: current source vertex
+ *   ECX: dest start + count*16  EDI: current dest vertex
+ *   EDX: matrix
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points2_3d )
+HIDDEN(_mesa_x86_transform_points2_3d)
+GLNAME( _mesa_x86_transform_points2_3d ):
+
+/* bytes pushed by the prologue; presumably used by the ARG_* macros */
+#define FRAME_OFFSET 8
+	PUSH_L( ESI )
+	PUSH_L( EDI )
+
+	MOV_L( ARG_SOURCE, ESI )
+	MOV_L( ARG_DEST, EDI )
+
+	MOV_L( ARG_MATRIX, EDX )
+	MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+	TEST_L( ECX, ECX )
+	JZ( LLBL(x86_p2_3dr_done) )
+
+	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+	OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
+
+	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+	MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )
+
+	SHL_L( CONST(4), ECX )		/* count * 16 bytes per dest vertex */
+	MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+	MOV_L( REGOFF(V4F_START, EDI), EDI )
+	ADD_L( EDI, ECX )		/* ECX = one past the last dest vertex */
+
+ALIGNTEXT16
+LLBL(x86_p2_3dr_loop):
+
+	FLD_S( SRC0 )			/* F4 */
+	FMUL_S( MAT0 )
+	FLD_S( SRC0 )			/* F5 F4 */
+	FMUL_S( MAT1 )
+	FLD_S( SRC0 )			/* F6 F5 F4 */
+	FMUL_S( MAT2 )
+
+	FLD_S( SRC1 )			/* F0 F6 F5 F4 */
+	FMUL_S( MAT4 )
+	FLD_S( SRC1 )			/* F1 F0 F6 F5 F4 */
+	FMUL_S( MAT5 )
+	FLD_S( SRC1 )			/* F2 F1 F0 F6 F5 F4 */
+	FMUL_S( MAT6 )
+
+	/* fold each SRC1 product into the matching SRC0 product */
+	FXCH( ST(2) )			/* F0 F1 F2 F6 F5 F4 */
+	FADDP( ST0, ST(5) )		/* F1 F2 F6 F5 F4 */
+	FADDP( ST0, ST(3) )		/* F2 F6 F5 F4 */
+	FADDP( ST0, ST(1) )		/* F6 F5 F4 */
+
+	/* add the translation terms MAT12..MAT14 */
+	FXCH( ST(2) )			/* F4 F5 F6 */
+	FADD_S( MAT12 )
+	FXCH( ST(1) )			/* F5 F4 F6 */
+	FADD_S( MAT13 )
+	FXCH( ST(2) )			/* F6 F4 F5 */
+	FADD_S( MAT14 )
+
+	FXCH( ST(1) )			/* F4 F6 F5 */
+	FSTP_S( DST0 )			/* F6 F5 */
+	FXCH( ST(1) )			/* F5 F6 */
+	FSTP_S( DST1 )			/* F6 */
+	FSTP_S( DST2 )			/* */
+
+LLBL(x86_p2_3dr_skip):
+
+	ADD_L( CONST(16), EDI )
+	ADD_L( EAX, ESI )
+	CMP_L( ECX, EDI )
+	JNE( LLBL(x86_p2_3dr_loop) )
+
+LLBL(x86_p2_3dr_done):
+
+	POP_L( EDI )
+	POP_L( ESI )
+	RET
+#undef FRAME_OFFSET
+
+
+
+
+/*
+ * _mesa_x86_transform_points2_3d_no_rot
+ *
+ * Transform 2-component points by a 3D matrix without rotation (scale
+ * and translate only).  For every vertex:
+ *
+ *   DST0 = SRC0*MAT0 + MAT12
+ *   DST1 = SRC1*MAT5 + MAT13
+ *   DST2 = MAT14	(copied as a raw 32-bit word through EBX)
+ *
+ * DST3 is not written.  The destination's count, size (3) and VEC_SIZE_3
+ * flag are updated; nothing is written when the source count is zero.
+ *
+ *   EAX: source stride          ESI: current source vertex
+ *   EBX: MAT14                  EDI: current dest vertex
+ *   ECX: dest start + count*16  EDX: matrix
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points2_3d_no_rot )
+HIDDEN(_mesa_x86_transform_points2_3d_no_rot)
+GLNAME( _mesa_x86_transform_points2_3d_no_rot ):
+
+/* bytes pushed by the prologue; presumably used by the ARG_* macros */
+#define FRAME_OFFSET 12
+	PUSH_L( ESI )
+	PUSH_L( EDI )
+	PUSH_L( EBX )
+
+	MOV_L( ARG_SOURCE, ESI )
+	MOV_L( ARG_DEST, EDI )
+
+	MOV_L( ARG_MATRIX, EDX )
+	MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+	TEST_L( ECX, ECX )
+	JZ( LLBL(x86_p2_3dnrr_done) )
+
+	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+	OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
+
+	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+	MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )
+
+	SHL_L( CONST(4), ECX )		/* count * 16 bytes per dest vertex */
+	MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+	MOV_L( REGOFF(V4F_START, EDI), EDI )
+	ADD_L( EDI, ECX )		/* ECX = one past the last dest vertex */
+
+	MOV_L( MAT14, EBX )		/* loop-invariant DST2 value */
+
+ALIGNTEXT16
+LLBL(x86_p2_3dnrr_loop):
+
+	FLD_S( SRC0 )			/* F4 */
+	FMUL_S( MAT0 )
+
+	FLD_S( SRC1 )			/* F1 F4 */
+	FMUL_S( MAT5 )
+
+	FXCH( ST(1) )			/* F4 F1 */
+	FADD_S( MAT12 )
+	FLD_S( MAT13 )		/* F5 F4 F1 */
+	FXCH( ST(2) )			/* F1 F4 F5 */
+	FADDP( ST0, ST(2) )		/* F4 F5 */
+
+	FSTP_S( DST0 )		/* F5 */
+	FSTP_S( DST1 )		/* */
+	MOV_L( EBX, DST2 )
+
+LLBL(x86_p2_3dnrr_skip):
+
+	ADD_L( CONST(16), EDI )
+	ADD_L( EAX, ESI )
+	CMP_L( ECX, EDI )
+	JNE( LLBL(x86_p2_3dnrr_loop) )
+
+LLBL(x86_p2_3dnrr_done):
+
+	POP_L( EBX )
+	POP_L( EDI )
+	POP_L( ESI )
+	RET
+#undef FRAME_OFFSET
+
+
+
+
+/*
+ * _mesa_x86_transform_points2_2d
+ *
+ * Transform 2-component points by a 2D matrix.  For every vertex:
+ *
+ *   DST0 = SRC0*MAT0 + SRC1*MAT4 + MAT12
+ *   DST1 = SRC0*MAT1 + SRC1*MAT5 + MAT13
+ *
+ * DST2 and DST3 are not written.  The destination's count, size (2) and
+ * VEC_SIZE_2 flag are updated; nothing is written when the source count
+ * is zero.
+ *
+ *   EAX: source stride          ESI: current source vertex
+ *   ECX: dest start + count*16  EDI: current dest vertex
+ *   EDX: matrix
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points2_2d )
+HIDDEN(_mesa_x86_transform_points2_2d)
+GLNAME( _mesa_x86_transform_points2_2d ):
+
+/* bytes pushed by the prologue; presumably used by the ARG_* macros */
+#define FRAME_OFFSET 8
+	PUSH_L( ESI )
+	PUSH_L( EDI )
+
+	MOV_L( ARG_SOURCE, ESI )
+	MOV_L( ARG_DEST, EDI )
+
+	MOV_L( ARG_MATRIX, EDX )
+	MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+	TEST_L( ECX, ECX )
+	JZ( LLBL(x86_p2_2dr_done) )
+
+	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+	OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) )
+
+	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+	MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) )
+
+	SHL_L( CONST(4), ECX )		/* count * 16 bytes per dest vertex */
+	MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+	MOV_L( REGOFF(V4F_START, EDI), EDI )
+	ADD_L( EDI, ECX )		/* ECX = one past the last dest vertex */
+
+ALIGNTEXT16
+LLBL(x86_p2_2dr_loop):
+
+	FLD_S( SRC0 )			/* F4 */
+	FMUL_S( MAT0 )
+	FLD_S( SRC0 )			/* F5 F4 */
+	FMUL_S( MAT1 )
+
+	FLD_S( SRC1 )			/* F0 F5 F4 */
+	FMUL_S( MAT4 )
+	FLD_S( SRC1 )			/* F1 F0 F5 F4 */
+	FMUL_S( MAT5 )
+
+	/* fold each SRC1 product into the matching SRC0 product */
+	FXCH( ST(1) )			/* F0 F1 F5 F4 */
+	FADDP( ST0, ST(3) )		/* F1 F5 F4 */
+	FADDP( ST0, ST(1) )		/* F5 F4 */
+
+	/* add the translation terms MAT12, MAT13 */
+	FXCH( ST(1) )			/* F4 F5 */
+	FADD_S( MAT12 )
+	FXCH( ST(1) )			/* F5 F4 */
+	FADD_S( MAT13 )
+
+	FXCH( ST(1) )			/* F4 F5 */
+	FSTP_S( DST0 )		/* F5 */
+	FSTP_S( DST1 )		/* */
+
+LLBL(x86_p2_2dr_skip):
+
+	ADD_L( CONST(16), EDI )
+	ADD_L( EAX, ESI )
+	CMP_L( ECX, EDI )
+	JNE( LLBL(x86_p2_2dr_loop) )
+
+LLBL(x86_p2_2dr_done):
+
+	POP_L( EDI )
+	POP_L( ESI )
+	RET
+#undef FRAME_OFFSET
+
+
+
+
+/*
+ * _mesa_x86_transform_points2_2d_no_rot
+ *
+ * Transform 2-component points by a 2D matrix without rotation (scale
+ * and translate only).  For every vertex:
+ *
+ *   DST0 = SRC0*MAT0 + MAT12
+ *   DST1 = SRC1*MAT5 + MAT13
+ *
+ * DST2 and DST3 are not written.  The destination's count, size (2) and
+ * VEC_SIZE_2 flag are updated; nothing is written when the source count
+ * is zero.
+ *
+ *   EAX: source stride          ESI: current source vertex
+ *   ECX: dest start + count*16  EDI: current dest vertex
+ *   EDX: matrix
+ *
+ * The entry point is aligned with ALIGNTEXT16 like every other routine
+ * in this file.
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points2_2d_no_rot )
+HIDDEN(_mesa_x86_transform_points2_2d_no_rot)
+GLNAME( _mesa_x86_transform_points2_2d_no_rot ):
+
+/* bytes pushed by the prologue; presumably used by the ARG_* macros */
+#define FRAME_OFFSET 8
+	PUSH_L( ESI )
+	PUSH_L( EDI )
+
+	MOV_L( ARG_SOURCE, ESI )
+	MOV_L( ARG_DEST, EDI )
+
+	MOV_L( ARG_MATRIX, EDX )
+	MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+	TEST_L( ECX, ECX )
+	JZ( LLBL(x86_p2_2dnrr_done) )
+
+	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+	OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) )
+
+	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+	MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) )
+
+	SHL_L( CONST(4), ECX )		/* count * 16 bytes per dest vertex */
+	MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+	MOV_L( REGOFF(V4F_START, EDI), EDI )
+	ADD_L( EDI, ECX )		/* ECX = one past the last dest vertex */
+
+ALIGNTEXT16
+LLBL(x86_p2_2dnrr_loop):
+
+	FLD_S( SRC0 )			/* F4 */
+	FMUL_S( MAT0 )
+
+	FLD_S( SRC1 )			/* F1 F4 */
+	FMUL_S( MAT5 )
+
+	FXCH( ST(1) )			/* F4 F1 */
+	FADD_S( MAT12 )
+	FLD_S( MAT13 )		/* F5 F4 F1 */
+	FXCH( ST(2) )			/* F1 F4 F5 */
+	FADDP( ST0, ST(2) )		/* F4 F5 */
+
+	FSTP_S( DST0   )		/* F5 */
+	FSTP_S( DST1   )		/* */
+
+LLBL(x86_p2_2dnrr_skip):
+
+	ADD_L( CONST(16), EDI )
+	ADD_L( EAX, ESI )
+	CMP_L( ECX, EDI )
+	JNE( LLBL(x86_p2_2dnrr_loop) )
+
+LLBL(x86_p2_2dnrr_done):
+
+	POP_L( EDI )
+	POP_L( ESI )
+	RET
+#undef FRAME_OFFSET
+
+
+
+
+/*
+ * _mesa_x86_transform_points2_identity
+ *
+ * Identity "transform" for 2-component points: copies SRC0 and SRC1 of
+ * every vertex to DST0 and DST1 as raw 32-bit words.  DST2 and DST3 are
+ * not written.  The destination's count, size (2) and VEC_SIZE_2 flag
+ * are updated first; the copy loop is then skipped entirely when the
+ * source and destination start pointers are equal.  Nothing is written
+ * when the source count is zero.
+ *
+ *   EAX: source stride          ESI: current source vertex
+ *   EBX: scratch (SRC0)         EDI: current dest vertex
+ *   ECX: dest start + count*16  EDX: scratch (SRC1); the matrix pointer
+ *                                    loaded into it is never read
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points2_identity )
+HIDDEN(_mesa_x86_transform_points2_identity)
+GLNAME( _mesa_x86_transform_points2_identity ):
+
+/* bytes pushed by the prologue; presumably used by the ARG_* macros */
+#define FRAME_OFFSET 12
+	PUSH_L( ESI )
+	PUSH_L( EDI )
+	PUSH_L( EBX )
+
+	MOV_L( ARG_SOURCE, ESI )
+	MOV_L( ARG_DEST, EDI )
+
+	MOV_L( ARG_MATRIX, EDX )
+	MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+	TEST_L( ECX, ECX )
+	JZ( LLBL(x86_p2_ir_done) )
+
+	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+	OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) )
+
+	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+	MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) )
+
+	SHL_L( CONST(4), ECX )		/* count * 16 bytes per dest vertex */
+	MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+	MOV_L( REGOFF(V4F_START, EDI), EDI )
+	ADD_L( EDI, ECX )		/* ECX = one past the last dest vertex */
+
+	/* in-place call: source and dest data are the same, nothing to copy */
+	CMP_L( ESI, EDI )
+	JE( LLBL(x86_p2_ir_done) )
+
+ALIGNTEXT16
+LLBL(x86_p2_ir_loop):
+
+	MOV_L( SRC0, EBX )
+	MOV_L( SRC1, EDX )
+
+	MOV_L( EBX, DST0 )
+	MOV_L( EDX, DST1 )
+
+LLBL(x86_p2_ir_skip):
+
+	ADD_L( CONST(16), EDI )
+	ADD_L( EAX, ESI )
+	CMP_L( ECX, EDI )
+	JNE( LLBL(x86_p2_ir_loop) )
+
+LLBL(x86_p2_ir_done):
+
+	POP_L( EBX )
+	POP_L( EDI )
+	POP_L( ESI )
+	RET
+#undef FRAME_OFFSET
+	
+#if defined (__ELF__) && defined (__linux__)
+	.section .note.GNU-stack,"",%progbits
+#endif
--- a/src/arch/x86/x86_xform3.S
+++ b/src/arch/x86/x86_xform3.S
@ -0,0 +1,644 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version:  3.5
+ *
+ * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * NOTE: Avoid using spaces in between '(' ')' and arguments, especially
+ * with macros like CONST, LLBL that expand to CONCAT(...).  Putting spaces
+ * in there will break the build on some platforms.
+ */
+
+#include "assyntax.h"
+#include "matypes.h"
+#include "xform_args.h"
+
+	/* SEG_TEXT comes from assyntax.h; presumably it selects the text
+	 * (code) segment for everything that follows - confirm there. */
+	SEG_TEXT
+
+/*
+ * Integer constants.  1065353216 is 0x3F800000, the IEEE-754
+ * single-precision bit pattern of 1.0f.  Neither constant is referenced
+ * by the routines in this file.
+ */
+#define FP_ONE		1065353216
+#define FP_ZERO		0
+
+/*
+ * Memory operands used by every routine below.  Each component is one
+ * 32-bit word, so component n lives at byte offset 4*n:
+ *   SRCn  word n of the current source element       (base ESI)
+ *   DSTn  word n of the current destination element  (base EDI)
+ *   MATn  word n of the 16-entry matrix              (base EDX)
+ */
+#define SRC0		REGOFF(0, ESI)
+#define SRC1		REGOFF(4, ESI)
+#define SRC2		REGOFF(8, ESI)
+#define SRC3		REGOFF(12, ESI)
+#define DST0		REGOFF(0, EDI)
+#define DST1		REGOFF(4, EDI)
+#define DST2		REGOFF(8, EDI)
+#define DST3		REGOFF(12, EDI)
+#define MAT0		REGOFF(0, EDX)
+#define MAT1		REGOFF(4, EDX)
+#define MAT2		REGOFF(8, EDX)
+#define MAT3		REGOFF(12, EDX)
+#define MAT4		REGOFF(16, EDX)
+#define MAT5		REGOFF(20, EDX)
+#define MAT6		REGOFF(24, EDX)
+#define MAT7		REGOFF(28, EDX)
+#define MAT8		REGOFF(32, EDX)
+#define MAT9		REGOFF(36, EDX)
+#define MAT10		REGOFF(40, EDX)
+#define MAT11		REGOFF(44, EDX)
+#define MAT12		REGOFF(48, EDX)
+#define MAT13		REGOFF(52, EDX)
+#define MAT14		REGOFF(56, EDX)
+#define MAT15		REGOFF(60, EDX)
+
+
+/*
+ * _mesa_x86_transform_points3_general
+ *
+ * Transform 3-component points by a general 4x4 matrix, producing
+ * 4-component results.  With x, y, z = SRC0..SRC2 and m[n] = MATn, each
+ * point becomes, for i = 0..3:
+ *
+ *     dst[i] = x*m[i] + y*m[4+i] + z*m[8+i] + m[12+i]
+ *
+ * Stack arguments (ARG_* macros): destination vector, matrix, source
+ * vector.  The destination V4F_COUNT is set to the source count, V4F_SIZE
+ * to 4 and VEC_SIZE_4 is OR-ed into V4F_FLAGS.  Returns without writing
+ * anything when the source count is zero.
+ *
+ * Loop registers: ESI source element (advanced by the source stride in
+ * EAX), EDI destination element (advanced by 16 bytes), EDX matrix,
+ * ECX end-of-destination pointer (start + 16 * count).
+ *
+ * The trailing comments in the loop list the x87 register stack, top of
+ * stack first; F4..F7 are the four running sums, F0..F3 the products
+ * being folded into them.
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points3_general )
+HIDDEN(_mesa_x86_transform_points3_general)
+GLNAME( _mesa_x86_transform_points3_general ):
+
+/* Bytes pushed below; the ARG_* macros add this to reach the arguments. */
+#define FRAME_OFFSET 8
+	PUSH_L( ESI )
+	PUSH_L( EDI )
+
+	MOV_L( ARG_SOURCE, ESI )
+	MOV_L( ARG_DEST, EDI )
+
+	MOV_L( ARG_MATRIX, EDX )
+	MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+	/* Empty source: leave the destination untouched. */
+	TEST_L( ECX, ECX )
+	JZ( LLBL(x86_p3_gr_done) )
+
+	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+	OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
+
+	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+	MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
+
+	/* ECX = destination start + 16 * count, the loop end pointer. */
+	SHL_L( CONST(4), ECX )
+	MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+	MOV_L( REGOFF(V4F_START, EDI), EDI )
+	ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL(x86_p3_gr_loop):
+
+	/* x * m[0..3] start the four sums. */
+	FLD_S( SRC0 )			/* F4 */
+	FMUL_S( MAT0 )
+	FLD_S( SRC0 )			/* F5 F4 */
+	FMUL_S( MAT1 )
+	FLD_S( SRC0 )			/* F6 F5 F4 */
+	FMUL_S( MAT2 )
+	FLD_S( SRC0 )			/* F7 F6 F5 F4 */
+	FMUL_S( MAT3 )
+
+	/* y * m[4..7], then fold each product into its sum. */
+	FLD_S( SRC1 )			/* F0 F7 F6 F5 F4 */
+	FMUL_S( MAT4 )
+	FLD_S( SRC1 )			/* F1 F0 F7 F6 F5 F4 */
+	FMUL_S( MAT5 )
+	FLD_S( SRC1 )			/* F2 F1 F0 F7 F6 F5 F4 */
+	FMUL_S( MAT6 )
+	FLD_S( SRC1 )			/* F3 F2 F1 F0 F7 F6 F5 F4 */
+	FMUL_S( MAT7 )
+
+	FXCH( ST(3) )			/* F0 F2 F1 F3 F7 F6 F5 F4 */
+	FADDP( ST0, ST(7) )		/* F2 F1 F3 F7 F6 F5 F4 */
+	FXCH( ST(1) )			/* F1 F2 F3 F7 F6 F5 F4 */
+	FADDP( ST0, ST(5) )		/* F2 F3 F7 F6 F5 F4 */
+	FADDP( ST0, ST(3) )		/* F3 F7 F6 F5 F4 */
+	FADDP( ST0, ST(1) )		/* F7 F6 F5 F4 */
+
+	/* z * m[8..11], folded in the same way. */
+	FLD_S( SRC2 )			/* F0 F7 F6 F5 F4 */
+	FMUL_S( MAT8 )
+	FLD_S( SRC2 )			/* F1 F0 F7 F6 F5 F4 */
+	FMUL_S( MAT9 )
+	FLD_S( SRC2 )			/* F2 F1 F0 F7 F6 F5 F4 */
+	FMUL_S( MAT10 )
+	FLD_S( SRC2 )			/* F3 F2 F1 F0 F7 F6 F5 F4 */
+	FMUL_S( MAT11 )
+
+	FXCH( ST(3) )			/* F0 F2 F1 F3 F7 F6 F5 F4 */
+	FADDP( ST0, ST(7) )		/* F2 F1 F3 F7 F6 F5 F4 */
+	FXCH( ST(1) )			/* F1 F2 F3 F7 F6 F5 F4 */
+	FADDP( ST0, ST(5) )		/* F2 F3 F7 F6 F5 F4 */
+	FADDP( ST0, ST(3) )		/* F3 F7 F6 F5 F4 */
+	FADDP( ST0, ST(1) )		/* F7 F6 F5 F4 */
+
+	/* Add m[12..15] directly to the sums. */
+	FXCH( ST(3) )			/* F4 F6 F5 F7 */
+	FADD_S( MAT12 )
+	FXCH( ST(2) )			/* F5 F6 F4 F7 */
+	FADD_S( MAT13 )
+	FXCH( ST(1) )			/* F6 F5 F4 F7 */
+	FADD_S( MAT14 )
+	FXCH( ST(3) )			/* F7 F5 F4 F6 */
+	FADD_S( MAT15 )
+
+	/* Store the four results; this empties the x87 stack again. */
+	FXCH( ST(2) )			/* F4 F5 F7 F6 */
+	FSTP_S( DST0 )		/* F5 F7 F6 */
+	FSTP_S( DST1 )		/* F7 F6 */
+	FXCH( ST(1) )			/* F6 F7 */
+	FSTP_S( DST2 )		/* F7 */
+	FSTP_S( DST3 )		/* */
+
+/* This label is not referenced within this routine. */
+LLBL(x86_p3_gr_skip):
+
+	ADD_L( CONST(16), EDI )
+	ADD_L( EAX, ESI )
+	CMP_L( ECX, EDI )
+	JNE( LLBL(x86_p3_gr_loop) )
+
+LLBL(x86_p3_gr_done):
+
+	POP_L( EDI )
+	POP_L( ESI )
+	RET
+#undef FRAME_OFFSET
+
+
+
+
+/*
+ * _mesa_x86_transform_points3_perspective
+ *
+ * Transform 3-component points using only the matrix entries read below,
+ * producing 4-component results.  With x, y, z = SRC0..SRC2 and
+ * m[n] = MATn:
+ *
+ *     dst[0] = x*m[0] + z*m[8]
+ *     dst[1] = y*m[5] + z*m[9]
+ *     dst[2] = z*m[10] + m[14]
+ *     dst[3] = -z   (sign bit flipped with an integer XOR)
+ *
+ * Stack arguments (ARG_* macros): destination vector, matrix, source
+ * vector.  The destination V4F_COUNT is set to the source count, V4F_SIZE
+ * to 4 and VEC_SIZE_4 is OR-ed into V4F_FLAGS.  Returns without writing
+ * anything when the source count is zero.
+ *
+ * Loop registers: ESI source element (advanced by the source stride in
+ * EAX), EDI destination element (advanced by 16 bytes), EDX matrix,
+ * ECX end-of-destination pointer (start + 16 * count), EBX scratch.
+ *
+ * The trailing comments in the loop list the x87 register stack, top of
+ * stack first.
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points3_perspective )
+HIDDEN(_mesa_x86_transform_points3_perspective)
+GLNAME( _mesa_x86_transform_points3_perspective ):
+
+/* Bytes pushed below; the ARG_* macros add this to reach the arguments. */
+#define FRAME_OFFSET 12
+	PUSH_L( ESI )
+	PUSH_L( EDI )
+	PUSH_L( EBX )
+
+	MOV_L( ARG_SOURCE, ESI )
+	MOV_L( ARG_DEST, EDI )
+
+	MOV_L( ARG_MATRIX, EDX )
+	MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+	/* Empty source: leave the destination untouched. */
+	TEST_L( ECX, ECX )
+	JZ( LLBL(x86_p3_pr_done) )
+
+	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+	OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
+
+	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+	MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
+
+	/* ECX = destination start + 16 * count, the loop end pointer. */
+	SHL_L( CONST(4), ECX )
+	MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+	MOV_L( REGOFF(V4F_START, EDI), EDI )
+	ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL(x86_p3_pr_loop):
+
+	FLD_S( SRC0 )			/* F4 */
+	FMUL_S( MAT0 )
+
+	FLD_S( SRC1 )			/* F5 F4 */
+	FMUL_S( MAT5 )
+
+	FLD_S( SRC2 )			/* F0 F5 F4 */
+	FMUL_S( MAT8 )
+	FLD_S( SRC2 )			/* F1 F0 F5 F4 */
+	FMUL_S( MAT9 )
+	FLD_S( SRC2 )			/* F2 F1 F0 F5 F4 */
+	FMUL_S( MAT10 )
+
+	FXCH( ST(2) )			/* F0 F1 F2 F5 F4 */
+	FADDP( ST0, ST(4) )		/* F1 F2 F5 F4 */
+	FADDP( ST0, ST(2) )		/* F2 F5 F4 */
+	FLD_S( MAT14 )		/* F6 F2 F5 F4 */
+	FXCH( ST(1) )			/* F2 F6 F5 F4 */
+	FADDP( ST0, ST(1) )		/* F6 F5 F4 */
+
+	/* Fetch z as raw bits before any store and negate it by toggling
+	 * bit 31 (-2147483648 is 0x80000000). */
+	MOV_L( SRC2, EBX )
+	XOR_L( CONST(-2147483648), EBX )/* change sign */
+
+	FXCH( ST(2) )			/* F4 F5 F6 */
+	FSTP_S( DST0 )		/* F5 F6 */
+	FSTP_S( DST1 )		/* F6 */
+	FSTP_S( DST2 )		/* */
+	MOV_L( EBX, DST3 )
+
+/* This label is not referenced within this routine. */
+LLBL(x86_p3_pr_skip):
+
+	ADD_L( CONST(16), EDI )
+	ADD_L( EAX, ESI )
+	CMP_L( ECX, EDI )
+	JNE( LLBL(x86_p3_pr_loop) )
+
+LLBL(x86_p3_pr_done):
+
+	POP_L( EBX )
+	POP_L( EDI )
+	POP_L( ESI )
+	RET
+#undef FRAME_OFFSET
+
+
+
+
+/*
+ * _mesa_x86_transform_points3_3d
+ *
+ * Transform 3-component points by the first three columns of the matrix,
+ * producing 3-component results.  With x, y, z = SRC0..SRC2 and
+ * m[n] = MATn, for i = 0..2:
+ *
+ *     dst[i] = x*m[i] + y*m[4+i] + z*m[8+i] + m[12+i]
+ *
+ * The fourth destination word is not written.
+ *
+ * Stack arguments (ARG_* macros): destination vector, matrix, source
+ * vector.  The destination V4F_COUNT is set to the source count, V4F_SIZE
+ * to 3 and VEC_SIZE_3 is OR-ed into V4F_FLAGS.  Returns without writing
+ * anything when the source count is zero.
+ *
+ * Loop registers: ESI source element (advanced by the source stride in
+ * EAX), EDI destination element (advanced by 16 bytes), EDX matrix,
+ * ECX end-of-destination pointer (start + 16 * count).
+ *
+ * The trailing comments in the loop list the x87 register stack, top of
+ * stack first; F4..F6 are the running sums, F0..F2 the products being
+ * folded into them.
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points3_3d )
+HIDDEN(_mesa_x86_transform_points3_3d)
+GLNAME( _mesa_x86_transform_points3_3d ):
+
+/* Bytes pushed below; the ARG_* macros add this to reach the arguments. */
+#define FRAME_OFFSET 8
+	PUSH_L( ESI )
+	PUSH_L( EDI )
+
+	MOV_L( ARG_SOURCE, ESI )
+	MOV_L( ARG_DEST, EDI )
+
+	MOV_L( ARG_MATRIX, EDX )
+	MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+	/* Empty source: leave the destination untouched. */
+	TEST_L( ECX, ECX )
+	JZ( LLBL(x86_p3_3dr_done) )
+
+	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+	OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
+
+	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+	MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )
+
+	/* ECX = destination start + 16 * count, the loop end pointer. */
+	SHL_L( CONST(4), ECX )
+	MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+	MOV_L( REGOFF(V4F_START, EDI), EDI )
+	ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL(x86_p3_3dr_loop):
+
+	/* x * m[0..2] start the three sums. */
+	FLD_S( SRC0 )			/* F4 */
+	FMUL_S( MAT0 )
+	FLD_S( SRC0 )			/* F5 F4 */
+	FMUL_S( MAT1 )
+	FLD_S( SRC0 )			/* F6 F5 F4 */
+	FMUL_S( MAT2 )
+
+	/* y * m[4..6], folded into the sums. */
+	FLD_S( SRC1 )			/* F0 F6 F5 F4 */
+	FMUL_S( MAT4 )
+	FLD_S( SRC1 )			/* F1 F0 F6 F5 F4 */
+	FMUL_S( MAT5 )
+	FLD_S( SRC1 )			/* F2 F1 F0 F6 F5 F4 */
+	FMUL_S( MAT6 )
+
+	FXCH( ST(2) )			/* F0 F1 F2 F6 F5 F4 */
+	FADDP( ST0, ST(5) )		/* F1 F2 F6 F5 F4 */
+	FADDP( ST0, ST(3) )		/* F2 F6 F5 F4 */
+	FADDP( ST0, ST(1) )		/* F6 F5 F4 */
+
+	/* z * m[8..10], folded into the sums. */
+	FLD_S( SRC2 )			/* F0 F6 F5 F4 */
+	FMUL_S( MAT8 )
+	FLD_S( SRC2 )			/* F1 F0 F6 F5 F4 */
+	FMUL_S( MAT9 )
+	FLD_S( SRC2 )			/* F2 F1 F0 F6 F5 F4 */
+	FMUL_S( MAT10 )
+
+	FXCH( ST(2) )			/* F0 F1 F2 F6 F5 F4 */
+	FADDP( ST0, ST(5) )		/* F1 F2 F6 F5 F4 */
+	FADDP( ST0, ST(3) )		/* F2 F6 F5 F4 */
+	FADDP( ST0, ST(1) )		/* F6 F5 F4 */
+
+	/* Add m[12..14] directly to the sums. */
+	FXCH( ST(2) )			/* F4 F5 F6 */
+	FADD_S( MAT12 )
+	FXCH( ST(1) )			/* F5 F4 F6 */
+	FADD_S( MAT13 )
+	FXCH( ST(2) )			/* F6 F4 F5 */
+	FADD_S( MAT14 )
+
+	FXCH( ST(1) )			/* F4 F6 F5 */
+	FSTP_S( DST0   )		/* F6 F5 */
+	FXCH( ST(1) )			/* F5 F6 */
+	FSTP_S( DST1   )		/* F6 */
+	FSTP_S( DST2   )		/* */
+
+/* This label is not referenced within this routine. */
+LLBL(x86_p3_3dr_skip):
+
+	ADD_L( CONST(16), EDI )
+	ADD_L( EAX, ESI )
+	CMP_L( ECX, EDI )
+	JNE( LLBL(x86_p3_3dr_loop) )
+
+LLBL(x86_p3_3dr_done):
+
+	POP_L( EDI )
+	POP_L( ESI )
+	RET
+#undef FRAME_OFFSET
+
+
+
+
+/*
+ * _mesa_x86_transform_points3_3d_no_rot
+ *
+ * Scale-and-translate transform of 3-component points, producing
+ * 3-component results.  With x, y, z = SRC0..SRC2 and m[n] = MATn:
+ *
+ *     dst[0] = x*m[0]  + m[12]
+ *     dst[1] = y*m[5]  + m[13]
+ *     dst[2] = z*m[10] + m[14]
+ *
+ * The fourth destination word is not written.
+ *
+ * Stack arguments (ARG_* macros): destination vector, matrix, source
+ * vector.  The destination V4F_COUNT is set to the source count, V4F_SIZE
+ * to 3 and VEC_SIZE_3 is OR-ed into V4F_FLAGS.  Returns without writing
+ * anything when the source count is zero.
+ *
+ * Loop registers: ESI source element (advanced by the source stride in
+ * EAX), EDI destination element (advanced by 16 bytes), EDX matrix,
+ * ECX end-of-destination pointer (start + 16 * count).
+ *
+ * The trailing comments in the loop list the x87 register stack, top of
+ * stack first.
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points3_3d_no_rot )
+HIDDEN(_mesa_x86_transform_points3_3d_no_rot)
+GLNAME( _mesa_x86_transform_points3_3d_no_rot ):
+
+/* Bytes pushed below; the ARG_* macros add this to reach the arguments. */
+#define FRAME_OFFSET 8
+	PUSH_L( ESI )
+	PUSH_L( EDI )
+
+	MOV_L( ARG_SOURCE, ESI )
+	MOV_L( ARG_DEST, EDI )
+
+
+	MOV_L( ARG_MATRIX, EDX )
+	MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+	/* Empty source: leave the destination untouched. */
+	TEST_L( ECX, ECX )
+	JZ( LLBL(x86_p3_3dnrr_done) )
+
+	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+	OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
+
+	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+	MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )
+
+	/* ECX = destination start + 16 * count, the loop end pointer. */
+	SHL_L( CONST(4), ECX )
+	MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+	MOV_L( REGOFF(V4F_START, EDI), EDI )
+	ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL(x86_p3_3dnrr_loop):
+
+	FLD_S( SRC0 )			/* F4 */
+	FMUL_S( MAT0 )
+
+	FLD_S( SRC1 )			/* F1 F4 */
+	FMUL_S( MAT5 )
+
+	FLD_S( SRC2 )			/* F2 F1 F4 */
+	FMUL_S( MAT10 )
+
+	/* m[13] and m[14] are loaded and the y and z products are added
+	 * onto them; m[12] is added straight to the x product. */
+	FXCH( ST(2) )			/* F4 F1 F2 */
+	FADD_S( MAT12 )
+	FLD_S( MAT13 )		/* F5 F4 F1 F2 */
+	FXCH( ST(2) )			/* F1 F4 F5 F2 */
+	FADDP( ST0, ST(2) )		/* F4 F5 F2 */
+	FLD_S( MAT14 )		/* F6 F4 F5 F2 */
+	FXCH( ST(3) )			/* F2 F4 F5 F6 */
+	FADDP( ST0, ST(3) )		/* F4 F5 F6 */
+
+	FSTP_S( DST0   )		/* F5 F6 */
+	FSTP_S( DST1   )		/* F6 */
+	FSTP_S( DST2   )		/* */
+
+/* This label is not referenced within this routine. */
+LLBL(x86_p3_3dnrr_skip):
+
+	ADD_L( CONST(16), EDI )
+	ADD_L( EAX, ESI )
+	CMP_L( ECX, EDI )
+	JNE( LLBL(x86_p3_3dnrr_loop) )
+
+LLBL(x86_p3_3dnrr_done):
+
+	POP_L( EDI )
+	POP_L( ESI )
+	RET
+#undef FRAME_OFFSET
+
+
+
+
+/*
+ * _mesa_x86_transform_points3_2d
+ *
+ * 2D transform of 3-component points, producing 3-component results.
+ * With x, y, z = SRC0..SRC2 and m[n] = MATn:
+ *
+ *     dst[0] = x*m[0] + y*m[4] + m[12]
+ *     dst[1] = x*m[1] + y*m[5] + m[13]
+ *     dst[2] = z   (copied as a raw 32-bit word)
+ *
+ * The fourth destination word is not written.
+ *
+ * Stack arguments (ARG_* macros): destination vector, matrix, source
+ * vector.  The destination V4F_COUNT is set to the source count, V4F_SIZE
+ * to 3 and VEC_SIZE_3 is OR-ed into V4F_FLAGS.  Returns without writing
+ * anything when the source count is zero.
+ *
+ * Loop registers: ESI source element (advanced by the source stride in
+ * EAX), EDI destination element (advanced by 16 bytes), EDX matrix,
+ * ECX end-of-destination pointer (start + 16 * count), EBX scratch.
+ *
+ * The trailing comments in the loop list the x87 register stack, top of
+ * stack first.
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points3_2d )
+HIDDEN(_mesa_x86_transform_points3_2d)
+GLNAME( _mesa_x86_transform_points3_2d ):
+
+/* Bytes pushed below; the ARG_* macros add this to reach the arguments. */
+#define FRAME_OFFSET 12
+	PUSH_L( ESI )
+	PUSH_L( EDI )
+	PUSH_L( EBX )
+
+	MOV_L( ARG_SOURCE, ESI )
+	MOV_L( ARG_DEST, EDI )
+
+	MOV_L( ARG_MATRIX, EDX )
+	MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+	/* Empty source: leave the destination untouched. */
+	TEST_L( ECX, ECX )
+	JZ( LLBL(x86_p3_2dr_done) )
+
+	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+	OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
+
+	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+	MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )
+
+	/* ECX = destination start + 16 * count, the loop end pointer. */
+	SHL_L( CONST(4), ECX )
+	MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+	MOV_L( REGOFF(V4F_START, EDI), EDI )
+	ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL(x86_p3_2dr_loop):
+
+	FLD_S( SRC0 )			/* F4 */
+	FMUL_S( MAT0 )
+	FLD_S( SRC0 )			/* F5 F4 */
+	FMUL_S( MAT1 )
+
+	FLD_S( SRC1 )			/* F0 F5 F4 */
+	FMUL_S( MAT4 )
+	FLD_S( SRC1 )			/* F1 F0 F5 F4 */
+	FMUL_S( MAT5 )
+
+	FXCH( ST(1) )			/* F0 F1 F5 F4 */
+	FADDP( ST0, ST(3) )		/* F1 F5 F4 */
+	FADDP( ST0, ST(1) )		/* F5 F4 */
+
+	FXCH( ST(1) )			/* F4 F5 */
+	FADD_S( MAT12 )
+	FXCH( ST(1) )			/* F5 F4 */
+	FADD_S( MAT13 )
+
+	/* z is fetched as raw bits before any destination word is stored. */
+	MOV_L( SRC2, EBX )
+
+	FXCH( ST(1) )			/* F4 F5 */
+	FSTP_S( DST0   )		/* F5 */
+	FSTP_S( DST1   )		/* */
+	MOV_L( EBX, DST2 )
+
+/* This label is not referenced within this routine. */
+LLBL(x86_p3_2dr_skip):
+
+	ADD_L( CONST(16), EDI )
+	ADD_L( EAX, ESI )
+	CMP_L( ECX, EDI )
+	JNE( LLBL(x86_p3_2dr_loop) )
+
+LLBL(x86_p3_2dr_done):
+
+	POP_L( EBX )
+	POP_L( EDI )
+	POP_L( ESI )
+	RET
+#undef FRAME_OFFSET
+
+
+
+
+/*
+ * _mesa_x86_transform_points3_2d_no_rot
+ *
+ * 2D scale-and-translate transform of 3-component points, producing
+ * 3-component results.  With x, y, z = SRC0..SRC2 and m[n] = MATn:
+ *
+ *     dst[0] = x*m[0] + m[12]
+ *     dst[1] = y*m[5] + m[13]
+ *     dst[2] = z   (copied as a raw 32-bit word)
+ *
+ * The fourth destination word is not written.
+ *
+ * Stack arguments (ARG_* macros): destination vector, matrix, source
+ * vector.  The destination V4F_COUNT is set to the source count, V4F_SIZE
+ * to 3 and VEC_SIZE_3 is OR-ed into V4F_FLAGS.  Returns without writing
+ * anything when the source count is zero.
+ *
+ * Loop registers: ESI source element (advanced by the source stride in
+ * EAX), EDI destination element (advanced by 16 bytes), EDX matrix,
+ * ECX end-of-destination pointer (start + 16 * count), EBX scratch.
+ *
+ * The trailing comments in the loop list the x87 register stack, top of
+ * stack first.
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points3_2d_no_rot )
+HIDDEN(_mesa_x86_transform_points3_2d_no_rot)
+GLNAME( _mesa_x86_transform_points3_2d_no_rot ):
+
+/* Bytes pushed below; the ARG_* macros add this to reach the arguments. */
+#define FRAME_OFFSET 12
+	PUSH_L( ESI )
+	PUSH_L( EDI )
+	PUSH_L( EBX )
+
+	MOV_L( ARG_SOURCE, ESI )
+	MOV_L( ARG_DEST, EDI )
+
+	MOV_L( ARG_MATRIX, EDX )
+	MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+	/* Empty source: leave the destination untouched. */
+	TEST_L( ECX, ECX )
+	JZ( LLBL(x86_p3_2dnrr_done) )
+
+	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+	OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
+
+	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+	MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )
+
+	/* ECX = destination start + 16 * count, the loop end pointer. */
+	SHL_L( CONST(4), ECX )
+	MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+	MOV_L( REGOFF(V4F_START, EDI), EDI )
+	ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL(x86_p3_2dnrr_loop):
+
+	FLD_S( SRC0 )			/* F4 */
+	FMUL_S( MAT0 )
+
+	FLD_S( SRC1 )			/* F1 F4 */
+	FMUL_S( MAT5 )
+
+	/* m[12] is added to the x product; m[13] is loaded and the y
+	 * product is added onto it. */
+	FXCH( ST(1) )			/* F4 F1 */
+	FADD_S( MAT12 )
+	FLD_S( MAT13 )		/* F5 F4 F1 */
+
+	FXCH( ST(2) )			/* F1 F4 F5 */
+	FADDP( ST0, ST(2) )		/* F4 F5 */
+
+	/* z is fetched as raw bits before any destination word is stored. */
+	MOV_L( SRC2, EBX )
+
+	FSTP_S( DST0 )		/* F5 */
+	FSTP_S( DST1 )		/* */
+	MOV_L( EBX, DST2 )
+
+/* This label is not referenced within this routine. */
+LLBL(x86_p3_2dnrr_skip):
+
+	ADD_L( CONST(16), EDI )
+	ADD_L( EAX, ESI )
+	CMP_L( ECX, EDI )
+	JNE( LLBL(x86_p3_2dnrr_loop) )
+
+LLBL(x86_p3_2dnrr_done):
+
+	POP_L( EBX )
+	POP_L( EDI )
+	POP_L( ESI )
+	RET
+#undef FRAME_OFFSET
+
+
+
+
+/*
+ * _mesa_x86_transform_points3_identity
+ *
+ * Identity transform for 3-component points: the first three 32-bit
+ * words of each source element are copied, bit for bit, into the
+ * destination.  The matrix argument is ignored and the fourth destination
+ * word is not written.
+ *
+ * Stack arguments (ARG_* macros): destination vector, matrix, source
+ * vector.  The destination V4F_COUNT is set to the source count, V4F_SIZE
+ * to 3 and VEC_SIZE_3 is OR-ed into V4F_FLAGS.  Returns without writing
+ * anything when the source count is zero.  The copy loop is skipped when
+ * source and destination share the same start address.
+ *
+ * Loop registers: ESI source element (advanced by the source stride in
+ * EAX), EDI destination element (advanced by 16 bytes), ECX
+ * end-of-destination pointer (start + 16 * count), EBX, EBP and EDX
+ * scratch.
+ */
+ALIGNTEXT16
+GLOBL GLNAME(_mesa_x86_transform_points3_identity)
+HIDDEN(_mesa_x86_transform_points3_identity)
+GLNAME(_mesa_x86_transform_points3_identity):
+
+/* Bytes pushed below; the ARG_* macros add this to reach the arguments. */
+#define FRAME_OFFSET 16
+	PUSH_L( ESI )
+	PUSH_L( EDI )
+	PUSH_L( EBX )
+	PUSH_L( EBP )
+
+	MOV_L( ARG_SOURCE, ESI )
+	MOV_L( ARG_DEST, EDI )
+
+	/* The matrix argument is deliberately not loaded: an identity copy
+	 * never reads it, and EDX is used as a scratch register in the loop. */
+	MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+	/* Empty source: leave the destination untouched. */
+	TEST_L( ECX, ECX )
+	JZ( LLBL(x86_p3_ir_done) )
+
+	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+	OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
+
+	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+	MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )
+
+	/* ECX = destination start + 16 * count, the loop end pointer. */
+	SHL_L( CONST(4), ECX )
+	MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+	MOV_L( REGOFF(V4F_START, EDI), EDI )
+	ADD_L( EDI, ECX )
+
+	/* Same start address for source and destination: nothing to copy. */
+	CMP_L( ESI, EDI )
+	JE( LLBL(x86_p3_ir_done) )
+
+ALIGNTEXT16
+LLBL(x86_p3_ir_loop):
+
+	/* Integer moves copy the words bit for bit; all three are read
+	 * before any is written. */
+	MOV_L( SRC0, EBX )
+	MOV_L( SRC1, EBP )
+	MOV_L( SRC2, EDX )
+
+	MOV_L( EBX, DST0 )
+	MOV_L( EBP, DST1 )
+	MOV_L( EDX, DST2 )
+
+LLBL(x86_p3_ir_skip):
+
+	ADD_L( CONST(16), EDI )
+	ADD_L( EAX, ESI )
+	CMP_L( ECX, EDI )
+	JNE( LLBL(x86_p3_ir_loop) )
+
+LLBL(x86_p3_ir_done):
+
+	POP_L( EBP )
+	POP_L( EBX )
+	POP_L( EDI )
+	POP_L( ESI )
+	RET
+#undef FRAME_OFFSET
+	
+/*
+ * On ELF/Linux builds, emit an empty .note.GNU-stack section.  By GNU
+ * toolchain convention this marks the object as not requiring an
+ * executable stack.
+ */
+#if defined (__ELF__) && defined (__linux__)
+	.section .note.GNU-stack,"",%progbits
+#endif
--- a/src/arch/x86/x86_xform4.S
+++ b/src/arch/x86/x86_xform4.S
@ -0,0 +1,677 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version:  3.5
+ *
+ * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * NOTE: Avoid using spaces in between '(' ')' and arguments, especially
+ * with macros like CONST, LLBL that expand to CONCAT(...).  Putting spaces
+ * in there will break the build on some platforms.
+ */
+
+#include "assyntax.h"
+#include "matypes.h"
+#include "xform_args.h"
+
+	/* SEG_TEXT comes from assyntax.h; presumably it selects the text
+	 * (code) segment for everything that follows - confirm there. */
+	SEG_TEXT
+
+/*
+ * Integer constants.  1065353216 is 0x3F800000, the IEEE-754
+ * single-precision bit pattern of 1.0f.  Neither constant is referenced
+ * by the routines in this file.
+ */
+#define FP_ONE		1065353216
+#define FP_ZERO		0
+
+/*
+ * Memory operands used by every routine below.  Each component is one
+ * 32-bit word, so component n lives at byte offset 4*n:
+ *   SRCn  word n of the current source element       (base ESI)
+ *   DSTn  word n of the current destination element  (base EDI)
+ *   MATn  word n of the 16-entry matrix              (base EDX)
+ */
+#define SRC0		REGOFF(0, ESI)
+#define SRC1		REGOFF(4, ESI)
+#define SRC2		REGOFF(8, ESI)
+#define SRC3		REGOFF(12, ESI)
+#define DST0		REGOFF(0, EDI)
+#define DST1		REGOFF(4, EDI)
+#define DST2		REGOFF(8, EDI)
+#define DST3		REGOFF(12, EDI)
+#define MAT0		REGOFF(0, EDX)
+#define MAT1		REGOFF(4, EDX)
+#define MAT2		REGOFF(8, EDX)
+#define MAT3		REGOFF(12, EDX)
+#define MAT4		REGOFF(16, EDX)
+#define MAT5		REGOFF(20, EDX)
+#define MAT6		REGOFF(24, EDX)
+#define MAT7		REGOFF(28, EDX)
+#define MAT8		REGOFF(32, EDX)
+#define MAT9		REGOFF(36, EDX)
+#define MAT10		REGOFF(40, EDX)
+#define MAT11		REGOFF(44, EDX)
+#define MAT12		REGOFF(48, EDX)
+#define MAT13		REGOFF(52, EDX)
+#define MAT14		REGOFF(56, EDX)
+#define MAT15		REGOFF(60, EDX)
+
+
+/*
+ * _mesa_x86_transform_points4_general
+ *
+ * Transform 4-component points by a general 4x4 matrix.  With
+ * x, y, z, w = SRC0..SRC3 and m[n] = MATn, each point becomes, for
+ * i = 0..3:
+ *
+ *     dst[i] = x*m[i] + y*m[4+i] + z*m[8+i] + w*m[12+i]
+ *
+ * Stack arguments (ARG_* macros): destination vector, matrix, source
+ * vector.  The destination V4F_COUNT is set to the source count, V4F_SIZE
+ * to 4 and VEC_SIZE_4 is OR-ed into V4F_FLAGS.  Returns without writing
+ * anything when the source count is zero.
+ *
+ * Loop registers: ESI source element (advanced by the source stride in
+ * EAX), EDI destination element (advanced by 16 bytes), EDX matrix,
+ * ECX end-of-destination pointer (start + 16 * count).
+ *
+ * The trailing comments in the loop list the x87 register stack, top of
+ * stack first; F4..F7 are the four running sums, F0..F3 the products
+ * being folded into them.
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points4_general )
+HIDDEN(_mesa_x86_transform_points4_general)
+GLNAME( _mesa_x86_transform_points4_general ):
+
+/* Bytes pushed below; the ARG_* macros add this to reach the arguments. */
+#define FRAME_OFFSET 8
+	PUSH_L( ESI )
+	PUSH_L( EDI )
+
+	MOV_L( ARG_SOURCE, ESI )
+	MOV_L( ARG_DEST, EDI )
+
+	MOV_L( ARG_MATRIX, EDX )
+	MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+	/* Empty source: leave the destination untouched. */
+	TEST_L( ECX, ECX )
+	JZ( LLBL(x86_p4_gr_done) )
+
+	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+	OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
+
+	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+	MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
+
+	/* ECX = destination start + 16 * count, the loop end pointer. */
+	SHL_L( CONST(4), ECX )
+	MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+	MOV_L( REGOFF(V4F_START, EDI), EDI )
+	ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL(x86_p4_gr_loop):
+
+	/* x * m[0..3] start the four sums. */
+	FLD_S( SRC0 )			/* F4 */
+	FMUL_S( MAT0 )
+	FLD_S( SRC0 )			/* F5 F4 */
+	FMUL_S( MAT1 )
+	FLD_S( SRC0 )			/* F6 F5 F4 */
+	FMUL_S( MAT2 )
+	FLD_S( SRC0 )			/* F7 F6 F5 F4 */
+	FMUL_S( MAT3 )
+
+	/* y * m[4..7], then fold each product into its sum. */
+	FLD_S( SRC1 )			/* F0 F7 F6 F5 F4 */
+	FMUL_S( MAT4 )
+	FLD_S( SRC1 )			/* F1 F0 F7 F6 F5 F4 */
+	FMUL_S( MAT5 )
+	FLD_S( SRC1 )			/* F2 F1 F0 F7 F6 F5 F4 */
+	FMUL_S( MAT6 )
+	FLD_S( SRC1 )			/* F3 F2 F1 F0 F7 F6 F5 F4 */
+	FMUL_S( MAT7 )
+
+	FXCH( ST(3) )			/* F0 F2 F1 F3 F7 F6 F5 F4 */
+	FADDP( ST0, ST(7) )		/* F2 F1 F3 F7 F6 F5 F4 */
+	FXCH( ST(1) )			/* F1 F2 F3 F7 F6 F5 F4 */
+	FADDP( ST0, ST(5) )		/* F2 F3 F7 F6 F5 F4 */
+	FADDP( ST0, ST(3) )		/* F3 F7 F6 F5 F4 */
+	FADDP( ST0, ST(1) )		/* F7 F6 F5 F4 */
+
+	/* z * m[8..11], folded in the same way. */
+	FLD_S( SRC2 )			/* F0 F7 F6 F5 F4 */
+	FMUL_S( MAT8 )
+	FLD_S( SRC2 )			/* F1 F0 F7 F6 F5 F4 */
+	FMUL_S( MAT9 )
+	FLD_S( SRC2 )			/* F2 F1 F0 F7 F6 F5 F4 */
+	FMUL_S( MAT10 )
+	FLD_S( SRC2 )			/* F3 F2 F1 F0 F7 F6 F5 F4 */
+	FMUL_S( MAT11 )
+
+	FXCH( ST(3) )			/* F0 F2 F1 F3 F7 F6 F5 F4 */
+	FADDP( ST0, ST(7) )		/* F2 F1 F3 F7 F6 F5 F4 */
+	FXCH( ST(1) )			/* F1 F2 F3 F7 F6 F5 F4 */
+	FADDP( ST0, ST(5) )		/* F2 F3 F7 F6 F5 F4 */
+	FADDP( ST0, ST(3) )		/* F3 F7 F6 F5 F4 */
+	FADDP( ST0, ST(1) )		/* F7 F6 F5 F4 */
+
+	/* w * m[12..15], folded in the same way. */
+	FLD_S( SRC3 )			/* F0 F7 F6 F5 F4 */
+	FMUL_S( MAT12 )
+	FLD_S( SRC3 )			/* F1 F0 F7 F6 F5 F4 */
+	FMUL_S( MAT13 )
+	FLD_S( SRC3 )			/* F2 F1 F0 F7 F6 F5 F4 */
+	FMUL_S( MAT14 )
+	FLD_S( SRC3 )			/* F3 F2 F1 F0 F7 F6 F5 F4 */
+	FMUL_S( MAT15 )
+
+	FXCH( ST(3) )			/* F0 F2 F1 F3 F7 F6 F5 F4 */
+	FADDP( ST0, ST(7) )		/* F2 F1 F3 F7 F6 F5 F4 */
+	FXCH( ST(1) )			/* F1 F2 F3 F7 F6 F5 F4 */
+	FADDP( ST0, ST(5) )		/* F2 F3 F7 F6 F5 F4 */
+	FADDP( ST0, ST(3) )		/* F3 F7 F6 F5 F4 */
+	FADDP( ST0, ST(1) )		/* F7 F6 F5 F4 */
+
+	/* Store the four results; this empties the x87 stack again. */
+	FXCH( ST(3) )			/* F4 F6 F5 F7 */
+	FSTP_S( DST0 )		/* F6 F5 F7 */
+	FXCH( ST(1) )			/* F5 F6 F7 */
+	FSTP_S( DST1 )		/* F6 F7 */
+	FSTP_S( DST2 )		/* F7 */
+	FSTP_S( DST3 )		/* */
+
+/* This label is not referenced within this routine. */
+LLBL(x86_p4_gr_skip):
+
+	ADD_L( CONST(16), EDI )
+	ADD_L( EAX, ESI )
+	CMP_L( ECX, EDI )
+	JNE( LLBL(x86_p4_gr_loop) )
+
+LLBL(x86_p4_gr_done):
+
+	POP_L( EDI )
+	POP_L( ESI )
+	RET
+#undef FRAME_OFFSET
+
+
+
+
+/*
+ * _mesa_x86_transform_points4_perspective
+ *
+ * Transform 4-component points using only the matrix entries read below.
+ * With x, y, z, w = SRC0..SRC3 and m[n] = MATn:
+ *
+ *     dst[0] = x*m[0] + z*m[8]
+ *     dst[1] = y*m[5] + z*m[9]
+ *     dst[2] = z*m[10] + w*m[14]
+ *     dst[3] = -z   (sign bit flipped with an integer XOR)
+ *
+ * Stack arguments (ARG_* macros): destination vector, matrix, source
+ * vector.  The destination V4F_COUNT is set to the source count, V4F_SIZE
+ * to 4 and VEC_SIZE_4 is OR-ed into V4F_FLAGS.  Returns without writing
+ * anything when the source count is zero.
+ *
+ * Loop registers: ESI source element (advanced by the source stride in
+ * EAX), EDI destination element (advanced by 16 bytes), EDX matrix,
+ * ECX end-of-destination pointer (start + 16 * count), EBX scratch.
+ *
+ * The trailing comments in the loop list the x87 register stack, top of
+ * stack first.
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points4_perspective )
+HIDDEN(_mesa_x86_transform_points4_perspective)
+GLNAME( _mesa_x86_transform_points4_perspective ):
+
+/* Bytes pushed below; the ARG_* macros add this to reach the arguments. */
+#define FRAME_OFFSET 12
+	PUSH_L( ESI )
+	PUSH_L( EDI )
+	PUSH_L( EBX )
+
+	MOV_L( ARG_SOURCE, ESI )
+	MOV_L( ARG_DEST, EDI )
+
+	MOV_L( ARG_MATRIX, EDX )
+	MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+	/* Empty source: leave the destination untouched. */
+	TEST_L( ECX, ECX )
+	JZ( LLBL(x86_p4_pr_done) )
+
+	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+	OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
+
+	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+	MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
+
+	/* ECX = destination start + 16 * count, the loop end pointer. */
+	SHL_L( CONST(4), ECX )
+	MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+	MOV_L( REGOFF(V4F_START, EDI), EDI )
+	ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL(x86_p4_pr_loop):
+
+	FLD_S( SRC0 )			/* F4 */
+	FMUL_S( MAT0 )
+
+	FLD_S( SRC1 )			/* F5 F4 */
+	FMUL_S( MAT5 )
+
+	FLD_S( SRC2 )			/* F0 F5 F4 */
+	FMUL_S( MAT8 )
+	FLD_S( SRC2 )			/* F1 F0 F5 F4 */
+	FMUL_S( MAT9 )
+	FLD_S( SRC2 )			/* F6 F1 F0 F5 F4 */
+	FMUL_S( MAT10 )
+
+	FXCH( ST(2) )			/* F0 F1 F6 F5 F4 */
+	FADDP( ST0, ST(4) )		/* F1 F6 F5 F4 */
+	FADDP( ST0, ST(2) )		/* F6 F5 F4 */
+
+	FLD_S( SRC3 )			/* F2 F6 F5 F4 */
+	FMUL_S( MAT14 )
+
+	FADDP( ST0, ST(1) )		/* F6 F5 F4 */
+
+	/* Fetch z as raw bits before any store and negate it by toggling
+	 * bit 31 (-2147483648 is 0x80000000). */
+	MOV_L( SRC2, EBX )
+	XOR_L( CONST(-2147483648), EBX )/* change sign */
+
+	FXCH( ST(2) )			/* F4 F5 F6 */
+	FSTP_S( DST0 )		/* F5 F6 */
+	FSTP_S( DST1 )		/* F6 */
+	FSTP_S( DST2 )		/* */
+	MOV_L( EBX, DST3 )
+
+/* This label is not referenced within this routine. */
+LLBL(x86_p4_pr_skip):
+
+	ADD_L( CONST(16), EDI )
+	ADD_L( EAX, ESI )
+	CMP_L( ECX, EDI )
+	JNE( LLBL(x86_p4_pr_loop) )
+
+LLBL(x86_p4_pr_done):
+
+	POP_L( EBX )
+	POP_L( EDI )
+	POP_L( ESI )
+	RET
+#undef FRAME_OFFSET
+
+
+
+
+/*
+ * _mesa_x86_transform_points4_3d
+ *
+ * Transform 4-component points by the first three columns of the matrix.
+ * With x, y, z, w = SRC0..SRC3 and m[n] = MATn, for i = 0..2:
+ *
+ *     dst[i] = x*m[i] + y*m[4+i] + z*m[8+i] + w*m[12+i]
+ *     dst[3] = w   (copied as a raw 32-bit word)
+ *
+ * Stack arguments (ARG_* macros): destination vector, matrix, source
+ * vector.  The destination V4F_COUNT is set to the source count, V4F_SIZE
+ * to 4 and VEC_SIZE_4 is OR-ed into V4F_FLAGS.  Returns without writing
+ * anything when the source count is zero.
+ *
+ * Loop registers: ESI source element (advanced by the source stride in
+ * EAX), EDI destination element (advanced by 16 bytes), EDX matrix,
+ * ECX end-of-destination pointer (start + 16 * count), EBX scratch.
+ *
+ * The trailing comments in the loop list the x87 register stack, top of
+ * stack first; F4..F6 are the running sums, F0..F2 the products being
+ * folded into them.
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points4_3d )
+HIDDEN(_mesa_x86_transform_points4_3d)
+GLNAME( _mesa_x86_transform_points4_3d ):
+
+/* Bytes pushed below; the ARG_* macros add this to reach the arguments. */
+#define FRAME_OFFSET 12
+	PUSH_L( ESI )
+	PUSH_L( EDI )
+	PUSH_L( EBX )
+
+	MOV_L( ARG_SOURCE, ESI )
+	MOV_L( ARG_DEST, EDI )
+
+	MOV_L( ARG_MATRIX, EDX )
+	MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+	/* Empty source: leave the destination untouched. */
+	TEST_L( ECX, ECX )
+	JZ( LLBL(x86_p4_3dr_done) )
+
+	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+	OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
+
+	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+	MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
+
+	/* ECX = destination start + 16 * count, the loop end pointer. */
+	SHL_L( CONST(4), ECX )
+	MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+	MOV_L( REGOFF(V4F_START, EDI), EDI )
+	ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL(x86_p4_3dr_loop):
+
+	/* x * m[0..2] start the three sums. */
+	FLD_S( SRC0 )			/* F4 */
+	FMUL_S( MAT0 )
+	FLD_S( SRC0 )			/* F5 F4 */
+	FMUL_S( MAT1 )
+	FLD_S( SRC0 )			/* F6 F5 F4 */
+	FMUL_S( MAT2 )
+
+	/* y * m[4..6], folded into the sums. */
+	FLD_S( SRC1 )			/* F0 F6 F5 F4 */
+	FMUL_S( MAT4 )
+	FLD_S( SRC1 )			/* F1 F0 F6 F5 F4 */
+	FMUL_S( MAT5 )
+	FLD_S( SRC1 )			/* F2 F1 F0 F6 F5 F4 */
+	FMUL_S( MAT6 )
+
+	FXCH( ST(2) )			/* F0 F1 F2 F6 F5 F4 */
+	FADDP( ST0, ST(5) )		/* F1 F2 F6 F5 F4 */
+	FADDP( ST0, ST(3) )		/* F2 F6 F5 F4 */
+	FADDP( ST0, ST(1) )		/* F6 F5 F4 */
+
+	/* z * m[8..10], folded into the sums. */
+	FLD_S( SRC2 )			/* F0 F6 F5 F4 */
+	FMUL_S( MAT8 )
+	FLD_S( SRC2 )			/* F1 F0 F6 F5 F4 */
+	FMUL_S( MAT9 )
+	FLD_S( SRC2 )			/* F2 F1 F0 F6 F5 F4 */
+	FMUL_S( MAT10 )
+
+	FXCH( ST(2) )			/* F0 F1 F2 F6 F5 F4 */
+	FADDP( ST0, ST(5) )		/* F1 F2 F6 F5 F4 */
+	FADDP( ST0, ST(3) )		/* F2 F6 F5 F4 */
+	FADDP( ST0, ST(1) )		/* F6 F5 F4 */
+
+	/* w * m[12..14], folded into the sums. */
+	FLD_S( SRC3 )			/* F0 F6 F5 F4 */
+	FMUL_S( MAT12 )
+	FLD_S( SRC3 )			/* F1 F0 F6 F5 F4 */
+	FMUL_S( MAT13 )
+	FLD_S( SRC3 )			/* F2 F1 F0 F6 F5 F4 */
+	FMUL_S( MAT14 )
+
+	FXCH( ST(2) )			/* F0 F1 F2 F6 F5 F4 */
+	FADDP( ST0, ST(5) )		/* F1 F2 F6 F5 F4 */
+	FADDP( ST0, ST(3) )		/* F2 F6 F5 F4 */
+	FADDP( ST0, ST(1) )		/* F6 F5 F4 */
+
+	/* w is fetched as raw bits before any destination word is stored. */
+	MOV_L( SRC3, EBX )
+
+	FXCH( ST(2) )			/* F4 F5 F6 */
+	FSTP_S( DST0 )		/* F5 F6 */
+	FSTP_S( DST1 )		/* F6 */
+	FSTP_S( DST2 )		/* */
+	MOV_L( EBX, DST3 )
+
+/* This label is not referenced within this routine. */
+LLBL(x86_p4_3dr_skip):
+
+	ADD_L( CONST(16), EDI )
+	ADD_L( EAX, ESI )
+	CMP_L( ECX, EDI )
+	JNE( LLBL(x86_p4_3dr_loop) )
+
+LLBL(x86_p4_3dr_done):
+
+	POP_L( EBX )
+	POP_L( EDI )
+	POP_L( ESI )
+	RET
+#undef FRAME_OFFSET
+
+
+
+
+/*
+ * _mesa_x86_transform_points4_3d_no_rot
+ *
+ * Scale-and-translate transform of 4-component points.  With
+ * x, y, z, w = SRC0..SRC3 and m[n] = MATn:
+ *
+ *     dst[0] = x*m[0]  + w*m[12]
+ *     dst[1] = y*m[5]  + w*m[13]
+ *     dst[2] = z*m[10] + w*m[14]
+ *     dst[3] = w   (copied as a raw 32-bit word)
+ *
+ * Stack arguments (ARG_* macros): destination vector, matrix, source
+ * vector.  The destination V4F_COUNT is set to the source count, V4F_SIZE
+ * to 4 and VEC_SIZE_4 is OR-ed into V4F_FLAGS.  Returns without writing
+ * anything when the source count is zero.
+ *
+ * Loop registers: ESI source element (advanced by the source stride in
+ * EAX), EDI destination element (advanced by 16 bytes), EDX matrix,
+ * ECX end-of-destination pointer (start + 16 * count), EBX scratch.
+ *
+ * The trailing comments in the loop list the x87 register stack, top of
+ * stack first.
+ */
+ALIGNTEXT16
+GLOBL GLNAME(_mesa_x86_transform_points4_3d_no_rot)
+HIDDEN(_mesa_x86_transform_points4_3d_no_rot)
+GLNAME(_mesa_x86_transform_points4_3d_no_rot):
+
+/* Bytes pushed below; the ARG_* macros add this to reach the arguments. */
+#define FRAME_OFFSET 12
+	PUSH_L( ESI )
+	PUSH_L( EDI )
+	PUSH_L( EBX )
+
+	MOV_L( ARG_SOURCE, ESI )
+	MOV_L( ARG_DEST, EDI )
+
+	MOV_L( ARG_MATRIX, EDX )
+	MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+	/* Empty source: leave the destination untouched. */
+	TEST_L( ECX, ECX )
+	JZ( LLBL(x86_p4_3dnrr_done) )
+
+	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+	OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
+
+	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+	MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
+
+	/* ECX = destination start + 16 * count, the loop end pointer. */
+	SHL_L( CONST(4), ECX )
+	MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+	MOV_L( REGOFF(V4F_START, EDI), EDI )
+	ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL(x86_p4_3dnrr_loop):
+
+	FLD_S( SRC0 )			/* F4 */
+	FMUL_S( MAT0 )
+
+	FLD_S( SRC1 )			/* F5 F4 */
+	FMUL_S( MAT5 )
+
+	FLD_S( SRC2 )			/* F6 F5 F4 */
+	FMUL_S( MAT10 )
+
+	/* w * m[12..14], folded into the three products above. */
+	FLD_S( SRC3 )			/* F0 F6 F5 F4 */
+	FMUL_S( MAT12 )
+	FLD_S( SRC3 )			/* F1 F0 F6 F5 F4 */
+	FMUL_S( MAT13 )
+	FLD_S( SRC3 )			/* F2 F1 F0 F6 F5 F4 */
+	FMUL_S( MAT14 )
+
+	FXCH( ST(2) )			/* F0 F1 F2 F6 F5 F4 */
+	FADDP( ST0, ST(5) )		/* F1 F2 F6 F5 F4 */
+	FADDP( ST0, ST(3) )		/* F2 F6 F5 F4 */
+	FADDP( ST0, ST(1) )		/* F6 F5 F4 */
+
+	/* w is fetched as raw bits before any destination word is stored. */
+	MOV_L( SRC3, EBX )
+
+	FXCH( ST(2) )			/* F4 F5 F6 */
+	FSTP_S( DST0   )		/* F5 F6 */
+	FSTP_S( DST1   )		/* F6 */
+	FSTP_S( DST2   )		/* */
+	MOV_L( EBX, DST3 )
+
+/* This label is not referenced within this routine. */
+LLBL(x86_p4_3dnrr_skip):
+
+	ADD_L( CONST(16), EDI )
+	ADD_L( EAX, ESI )
+	CMP_L( ECX, EDI )
+	JNE( LLBL(x86_p4_3dnrr_loop) )
+
+LLBL(x86_p4_3dnrr_done):
+
+	POP_L( EBX )
+	POP_L( EDI )
+	POP_L( ESI )
+	RET
+#undef FRAME_OFFSET
+
+
+
+
+/*
+ * _mesa_x86_transform_points4_2d
+ *
+ * 2D transform of 4-component points.  With x, y, z, w = SRC0..SRC3 and
+ * m[n] = MATn:
+ *
+ *     dst[0] = x*m[0] + y*m[4] + w*m[12]
+ *     dst[1] = x*m[1] + y*m[5] + w*m[13]
+ *     dst[2] = z   (copied as a raw 32-bit word)
+ *     dst[3] = w   (copied as a raw 32-bit word)
+ *
+ * Stack arguments (ARG_* macros): destination vector, matrix, source
+ * vector.  The destination V4F_COUNT is set to the source count, V4F_SIZE
+ * to 4 and VEC_SIZE_4 is OR-ed into V4F_FLAGS.  Returns without writing
+ * anything when the source count is zero.
+ *
+ * Loop registers: ESI source element (advanced by the source stride in
+ * EAX), EDI destination element (advanced by 16 bytes), EDX matrix,
+ * ECX end-of-destination pointer (start + 16 * count), EBX and EBP
+ * scratch.
+ *
+ * The trailing comments in the loop list the x87 register stack, top of
+ * stack first.
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points4_2d )
+HIDDEN(_mesa_x86_transform_points4_2d)
+GLNAME( _mesa_x86_transform_points4_2d ):
+
+/* Bytes pushed below; the ARG_* macros add this to reach the arguments. */
+#define FRAME_OFFSET 16
+	PUSH_L( ESI )
+	PUSH_L( EDI )
+	PUSH_L( EBX )
+	PUSH_L( EBP )
+
+	MOV_L( ARG_SOURCE, ESI )
+	MOV_L( ARG_DEST, EDI )
+
+	MOV_L( ARG_MATRIX, EDX )
+	MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+	/* Empty source: leave the destination untouched. */
+	TEST_L( ECX, ECX )
+	JZ( LLBL(x86_p4_2dr_done) )
+
+	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+	OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
+
+	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+	MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
+
+	/* ECX = destination start + 16 * count, the loop end pointer. */
+	SHL_L( CONST(4), ECX )
+	MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+	MOV_L( REGOFF(V4F_START, EDI), EDI )
+	ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL(x86_p4_2dr_loop):
+
+	FLD_S( SRC0 )			/* F4 */
+	FMUL_S( MAT0 )
+	FLD_S( SRC0 )			/* F5 F4 */
+	FMUL_S( MAT1 )
+
+	FLD_S( SRC1 )			/* F0 F5 F4 */
+	FMUL_S( MAT4 )
+	FLD_S( SRC1 )			/* F1 F0 F5 F4 */
+	FMUL_S( MAT5 )
+
+	FXCH( ST(1) )			/* F0 F1 F5 F4 */
+	FADDP( ST0, ST(3) )		/* F1 F5 F4 */
+	FADDP( ST0, ST(1) )		/* F5 F4 */
+
+	FLD_S( SRC3 )			/* F0 F5 F4 */
+	FMUL_S( MAT12 )
+	FLD_S( SRC3 )			/* F1 F0 F5 F4 */
+	FMUL_S( MAT13 )
+
+	FXCH( ST(1) )			/* F0 F1 F5 F4 */
+	FADDP( ST0, ST(3) )		/* F1 F5 F4 */
+	FADDP( ST0, ST(1) )		/* F5 F4 */
+
+	/* z and w are fetched as raw bits before any destination word is
+	 * stored. */
+	MOV_L( SRC2, EBX )
+	MOV_L( SRC3, EBP )
+
+	FXCH( ST(1) )			/* F4 F5 */
+	FSTP_S( DST0 )		/* F5 */
+	FSTP_S( DST1 )		/* */
+	MOV_L( EBX, DST2 )
+	MOV_L( EBP, DST3 )
+
+/* This label is not referenced within this routine. */
+LLBL(x86_p4_2dr_skip):
+
+	ADD_L( CONST(16), EDI )
+	ADD_L( EAX, ESI )
+	CMP_L( ECX, EDI )
+	JNE( LLBL(x86_p4_2dr_loop) )
+
+LLBL(x86_p4_2dr_done):
+
+	POP_L( EBP )
+	POP_L( EBX )
+	POP_L( EDI )
+	POP_L( ESI )
+	RET
+#undef FRAME_OFFSET
+
+
+
+
+/*
+ * _mesa_x86_transform_points4_2d_no_rot
+ *
+ * 2D scale-and-translate transform of 4-component points.  With
+ * x, y, z, w = SRC0..SRC3 and m[n] = MATn:
+ *
+ *     dst[0] = x*m[0] + w*m[12]
+ *     dst[1] = y*m[5] + w*m[13]
+ *     dst[2] = z   (copied as a raw 32-bit word)
+ *     dst[3] = w   (copied as a raw 32-bit word)
+ *
+ * Stack arguments (ARG_* macros): destination vector, matrix, source
+ * vector.  The destination V4F_COUNT is set to the source count, V4F_SIZE
+ * to 4 and VEC_SIZE_4 is OR-ed into V4F_FLAGS.  Returns without writing
+ * anything when the source count is zero.
+ *
+ * Loop registers: ESI source element (advanced by the source stride in
+ * EAX), EDI destination element (advanced by 16 bytes), EDX matrix,
+ * ECX end-of-destination pointer (start + 16 * count), EBX and EBP
+ * scratch.
+ *
+ * The trailing comments in the loop list the x87 register stack, top of
+ * stack first.
+ */
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points4_2d_no_rot )
+HIDDEN(_mesa_x86_transform_points4_2d_no_rot)
+GLNAME( _mesa_x86_transform_points4_2d_no_rot ):
+
+/* Bytes pushed below; the ARG_* macros add this to reach the arguments. */
+#define FRAME_OFFSET 16
+	PUSH_L( ESI )
+	PUSH_L( EDI )
+	PUSH_L( EBX )
+	PUSH_L( EBP )
+
+	MOV_L( ARG_SOURCE, ESI )
+	MOV_L( ARG_DEST, EDI )
+
+	MOV_L( ARG_MATRIX, EDX )
+	MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+	/* Empty source: leave the destination untouched. */
+	TEST_L( ECX, ECX )
+	JZ( LLBL(x86_p4_2dnrr_done) )
+
+	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+	OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
+
+	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+	MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
+
+	/* ECX = destination start + 16 * count, the loop end pointer. */
+	SHL_L( CONST(4), ECX )
+	MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+	MOV_L( REGOFF(V4F_START, EDI), EDI )
+	ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL(x86_p4_2dnrr_loop):
+
+	FLD_S( SRC0 )			/* F4 */
+	FMUL_S( MAT0 )
+
+	FLD_S( SRC1 )			/* F5 F4 */
+	FMUL_S( MAT5 )
+
+	FLD_S( SRC3 )			/* F0 F5 F4 */
+	FMUL_S( MAT12 )
+	FLD_S( SRC3 )			/* F1 F0 F5 F4 */
+	FMUL_S( MAT13 )
+
+	FXCH( ST(1) )			/* F0 F1 F5 F4 */
+	FADDP( ST0, ST(3) )		/* F1 F5 F4 */
+	FADDP( ST0, ST(1) )		/* F5 F4 */
+
+	/* z and w are fetched as raw bits before any destination word is
+	 * stored. */
+	MOV_L( SRC2, EBX )
+	MOV_L( SRC3, EBP )
+
+	FXCH( ST(1) )			/* F4 F5 */
+	FSTP_S( DST0   )		/* F5 */
+	FSTP_S( DST1   )		/* */
+	MOV_L( EBX, DST2 )
+	MOV_L( EBP, DST3 )
+
+/* This label is not referenced within this routine. */
+LLBL(x86_p4_2dnrr_skip):
+
+	ADD_L( CONST(16), EDI )
+	ADD_L( EAX, ESI )
+	CMP_L( ECX, EDI )
+	JNE( LLBL(x86_p4_2dnrr_loop) )
+
+LLBL(x86_p4_2dnrr_done):
+
+	POP_L( EBP )
+	POP_L( EBX )
+	POP_L( EDI )
+	POP_L( ESI )
+	RET
+#undef FRAME_OFFSET
+
+
+
+
+/*
+ * _mesa_x86_transform_points4_identity
+ *
+ * Identity transform for 4-component points: all four 32-bit words of
+ * each source element are copied, bit for bit, into the destination.
+ * The matrix argument is ignored.
+ *
+ * Stack arguments (ARG_* macros): destination vector, matrix, source
+ * vector.  The destination V4F_COUNT is set to the source count, V4F_SIZE
+ * to 4 and VEC_SIZE_4 is OR-ed into V4F_FLAGS.  Returns without writing
+ * anything when the source count is zero.  The copy loop is skipped when
+ * source and destination share the same start address.
+ *
+ * Loop registers: ESI source element (advanced by the source stride in
+ * EAX), EDI destination element (advanced by 16 bytes), ECX
+ * end-of-destination pointer (start + 16 * count), EBX and EDX scratch.
+ */
+ALIGNTEXT16
+GLOBL GLNAME(_mesa_x86_transform_points4_identity)
+HIDDEN(_mesa_x86_transform_points4_identity)
+GLNAME(_mesa_x86_transform_points4_identity):
+
+/* Bytes pushed below; the ARG_* macros add this to reach the arguments. */
+#define FRAME_OFFSET 12
+	PUSH_L( ESI )
+	PUSH_L( EDI )
+	PUSH_L( EBX )
+
+	MOV_L( ARG_SOURCE, ESI )
+	MOV_L( ARG_DEST, EDI )
+
+	/* The matrix argument is deliberately not loaded: an identity copy
+	 * never reads it, and EDX is used as a scratch register in the loop. */
+	MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+	/* Empty source: leave the destination untouched. */
+	TEST_L( ECX, ECX )
+	JZ( LLBL(x86_p4_ir_done) )
+
+	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+	OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
+
+	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+	MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
+
+	/* ECX = destination start + 16 * count, the loop end pointer. */
+	SHL_L( CONST(4), ECX )
+	MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+	MOV_L( REGOFF(V4F_START, EDI), EDI )
+	ADD_L( EDI, ECX )
+
+	/* Same start address for source and destination: nothing to copy. */
+	CMP_L( ESI, EDI )
+	JE( LLBL(x86_p4_ir_done) )
+
+ALIGNTEXT16
+LLBL(x86_p4_ir_loop):
+
+	/* Copy the element two words at a time through EBX and EDX. */
+	MOV_L( SRC0, EBX )
+	MOV_L( SRC1, EDX )
+
+	MOV_L( EBX, DST0 )
+	MOV_L( EDX, DST1 )
+
+	MOV_L( SRC2, EBX )
+	MOV_L( SRC3, EDX )
+
+	MOV_L( EBX, DST2 )
+	MOV_L( EDX, DST3 )
+
+LLBL(x86_p4_ir_skip):
+
+	ADD_L( CONST(16), EDI )
+	ADD_L( EAX, ESI )
+	CMP_L( ECX, EDI )
+	JNE( LLBL(x86_p4_ir_loop) )
+
+LLBL(x86_p4_ir_done):
+
+	POP_L( EBX )
+	POP_L( EDI )
+	POP_L( ESI )
+	RET
+#undef FRAME_OFFSET
+	
+/*
+ * On ELF/Linux builds, emit an empty .note.GNU-stack section.  By GNU
+ * toolchain convention this marks the object as not requiring an
+ * executable stack.
+ */
+#if defined (__ELF__) && defined (__linux__)
+	.section .note.GNU-stack,"",%progbits
+#endif
--- a/src/arch/x86/xform_args.h
+++ b/src/arch/x86/xform_args.h
@ -0,0 +1,51 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version:  3.5
+ *
+ * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Transform function interface for assembly code.  Simply define
+ * FRAME_OFFSET to the number of bytes pushed onto the stack before
+ * using the ARG_* argument macros.
+ *
+ * Gareth Hughes
+ */
+
+#ifndef __XFORM_ARGS_H__
+#define __XFORM_ARGS_H__
+
+/* Offsets for transform_func arguments
+ *
+ * typedef void (*transform_func)( GLvector4f *to_vec,
+ *				   const GLfloat m[16],
+ *				   const GLvector4f *from_vec );
+ *
+ * The OFFSET_* values are byte offsets from ESP as it stands on entry to
+ * the routine, one 4-byte slot per argument in declaration order.
+ * Offset 0 is presumably the return address pushed by the call -
+ * confirm against the calling convention used by the callers.
+ */
+#define OFFSET_DEST	4
+#define OFFSET_MATRIX	8
+#define OFFSET_SOURCE	12
+
+/*
+ * Stack operands for the three arguments.  FRAME_OFFSET is not defined
+ * here: it is expanded where ARG_* is used, so each routine must define
+ * it as the number of bytes it has pushed since entry before using
+ * these macros.
+ */
+#define ARG_DEST	REGOFF(FRAME_OFFSET+OFFSET_DEST, ESP)
+#define ARG_MATRIX 	REGOFF(FRAME_OFFSET+OFFSET_MATRIX, ESP)
+#define ARG_SOURCE 	REGOFF(FRAME_OFFSET+OFFSET_SOURCE, ESP)
+
+#endif /* __XFORM_ARGS_H__ */