99% done Windows ARM32 port

2025-07-03 04:27:58 +00:00 · 2023-05-19 20:15:23 +03:00 · 2023-05-19 20:15:23 +03:00 · ee04c66aa5
commit ee04c66aa5
parent bee760067e
37 changed files with 591 additions and 136 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,4 @@
+*~
 *.mak
 *.mak.vpc_crc
 *.vpc_crc
--- a/common/sse2neon.h
+++ b/common/sse2neon.h
@ -89,9 +89,6 @@
 #define _sse2neon_likely(x) __builtin_expect(!!(x), 1)
 #define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0)
 #elif defined(_MSC_VER)
-#if _MSVC_TRADITIONAL
-#error Using the traditional MSVC preprocessor is not supported! Use /Zc:preprocessor instead.
-#endif
 #ifndef FORCE_INLINE
 #define FORCE_INLINE static inline
 #endif
@ -184,6 +181,10 @@
    } while (0)
 #endif

+#ifdef _M_ARM
+#define vst1q_lane_s64(a, b, c)
+#endif
+
 /* Memory barriers
 * __atomic_thread_fence does not include a compiler barrier; instead,
 * the barrier is part of __atomic_load/__atomic_store's "volatile-like"
@ -202,8 +203,12 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
 #elif defined(__GNUC__) || defined(__clang__)
    __atomic_thread_fence(__ATOMIC_SEQ_CST);
 #else /* MSVC */
+#ifdef _M_ARM
+    __dmb(_ARM_BARRIER_ISH);
+#else
    __dmb(_ARM64_BARRIER_ISH);
 #endif
+#endif
 }

 /* Architecture-specific build options */
@ -268,7 +273,7 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
 * we have to perform syscall instead.
 */
 #if (!defined(__aarch64__) && !defined(_M_ARM64))
-#include <sys/time.h>
+#include <time.h>
 #endif

 /* "__has_builtin" can be used to query support for built-in functions
@ -574,10 +579,10 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
 /* Backwards compatibility for compilers with lack of specific type support */

 // Older gcc does not define vld1q_u8_x4 type
-#if defined(__GNUC__) && !defined(__clang__) &&                        \
+#if defined(_M_ARM) || (defined(__GNUC__) && !defined(__clang__) &&    \
    ((__GNUC__ <= 12 && defined(__arm__)) ||                           \
     (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
-     (__GNUC__ <= 9 && defined(__aarch64__)))
+     (__GNUC__ <= 9 && defined(__aarch64__))))
 FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
 {
    uint8x16x4_t ret;
@ -610,6 +615,9 @@ FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
 }
 #endif

+#if defined(_M_ARM)
+#pragma message("TODO: Windows ARM32: Port many SSE2NEON functions")
+#else
 #if !defined(__aarch64__) && !defined(_M_ARM64)
 /* emulate vaddvq u8 variant */
 FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
@ -645,6 +653,7 @@ FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
    return vaddvq_u16(a);
 }
 #endif
+#endif

 /* Function Naming Conventions
 * The naming convention of SSE intrinsics is straightforward. A generic SSE
@ -1765,6 +1774,7 @@ FORCE_INLINE void _mm_free(void *addr)
 }
 #endif

+#ifndef _M_ARM
 FORCE_INLINE uint64_t _sse2neon_get_fpcr()
 {
    uint64_t value;
@ -1808,6 +1818,7 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode()

    return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF;
 }
+#endif

 // Macro: Get the rounding mode bits from the MXCSR control and status register.
 // The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
@ -1826,6 +1837,8 @@ FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()

 #if defined(__aarch64__) || defined(_M_ARM64)
    r.value = _sse2neon_get_fpcr();
+#elif defined(_M_ARM)
+    r.value = _MoveFromCoprocessor(10,7, 1,0,0);
 #else
    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
 #endif
@ -2247,7 +2260,7 @@ FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
 FORCE_INLINE void _mm_prefetch(char const *p, int i)
 {
    (void) i;
-#if defined(_MSC_VER)
+#ifdef _M_ARM64
    switch (i) {
    case _MM_HINT_NTA:
        __prefetch2(p, 1);
@ -2262,6 +2275,8 @@ FORCE_INLINE void _mm_prefetch(char const *p, int i)
        __prefetch2(p, 4);
        break;
    }
+#elif defined(_M_ARM)
+    // TODO
 #else
    switch (i) {
    case _MM_HINT_NTA:
@ -2348,6 +2363,7 @@ FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
        vset_lane_u16((int) vget_lane_u64(t, 0), vdup_n_u16(0), 0));
 }

+#ifndef _M_ARM
 // Macro: Set the flush zero bits of the MXCSR control and status register to
 // the value in unsigned 32-bit integer a. The flush zero may contain any of the
 // following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
@ -2379,6 +2395,7 @@ FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
 #endif
 }
+#endif

 // Set packed single-precision (32-bit) floating-point elements in dst with the
 // supplied values.
@ -2404,6 +2421,7 @@ FORCE_INLINE __m128 _mm_set_ps1(float _w)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
 FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
 {
+#ifndef _M_ARM
    union {
        fpcr_bitfield field;
 #if defined(__aarch64__) || defined(_M_ARM64)
@ -2442,6 +2460,7 @@ FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
 #else
    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
 #endif
+#endif
 }

 // Copy single-precision (32-bit) floating-point element a to the lower element
@ -3206,6 +3225,7 @@ FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
    return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
 }

+#ifndef _M_ARM
 // Compare packed double-precision (64-bit) floating-point elements in a and b
 // for greater-than-or-equal, and store the results in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd
@ -3247,6 +3267,7 @@ FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
    return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
 }
+#endif

 // Compare packed signed 16-bit integers in a and b for greater-than, and store
 // the results in dst.
@ -3275,6 +3296,7 @@ FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
        vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
 }

+#ifndef _M_ARM
 // Compare packed double-precision (64-bit) floating-point elements in a and b
 // for greater-than, and store the results in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd
@ -3358,6 +3380,7 @@ FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
    return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
 }
+#endif

 // Compare packed signed 16-bit integers in a and b for less-than, and store the
 // results in dst. Note: This intrinsic emits the pcmpgtw instruction with the
@ -3389,6 +3412,7 @@ FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
        vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
 }

+#ifndef _M_ARM
 // Compare packed double-precision (64-bit) floating-point elements in a and b
 // for less-than, and store the results in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd
@ -3429,6 +3453,7 @@ FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
    return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
 }
+#endif

 // Compare packed double-precision (64-bit) floating-point elements in a and b
 // for not-equal, and store the results in dst.
@ -3456,6 +3481,7 @@ FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
    return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
 }

+#ifndef _M_ARM
 // Compare packed double-precision (64-bit) floating-point elements in a and b
 // for not-greater-than-or-equal, and store the results in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd
@ -3756,6 +3782,7 @@ FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
    return (*(double *) &a0 < *(double *) &b0);
 #endif
 }
+#endif

 // Compare the lower double-precision (64-bit) floating-point element in a and b
 // for equality, and return the boolean result (0 or 1).
@ -4401,6 +4428,7 @@ FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
        vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
 }

+#ifndef _M_ARM
 // Compare packed double-precision (64-bit) floating-point elements in a and b,
 // and store packed maximum values in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd
@ -4487,6 +4515,7 @@ FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
    return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
 }
+#endif

 // Compare the lower double-precision (64-bit) floating-point elements in a and
 // b, store the minimum value in the lower element of dst, and copy the upper
@ -4793,7 +4822,11 @@ FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
 FORCE_INLINE void _mm_pause()
 {
 #if defined(_MSC_VER)
+#ifdef _M_ARM
+    __isb(_ARM_BARRIER_SY);
+#else
    __isb(_ARM64_BARRIER_SY);
+#endif
 #else
    __asm__ __volatile__("isb\n");
 #endif
@ -7622,6 +7655,7 @@ FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
 }

 /* SSE4.2 */
+#ifndef _M_ARM

 const static uint16_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask16b[8] = {
    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
@ -8463,9 +8497,11 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
    return crc;
 }

+#endif
+
 /* AES */

-#if !defined(__ARM_FEATURE_CRYPTO) && !defined(_M_ARM64)
+#if !defined(__ARM_FEATURE_CRYPTO) && !defined(_M_ARM64) && !defined(_M_ARM)
 /* clang-format off */
 #define SSE2NEON_AES_SBOX(w)                                           \
    {                                                                  \
@ -8913,6 +8949,7 @@ FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
 #undef SSE2NEON_MULTIPLY
 #endif

+#elif defined(_M_ARM)
 #else /* __ARM_FEATURE_CRYPTO */
 // Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
 // AESMC and then manually applying the real key as an xor operation. This
@ -9034,6 +9071,7 @@ FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
    }
 }

+#ifndef _M_ARM
 FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode()
 {
    union {
@ -9053,6 +9091,7 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode()

    return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF;
 }
+#endif

 // Count the number of bits set to 1 in unsigned 32-bit integer a, and
 // return that count in dst.
@ -9113,6 +9152,7 @@ FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
 #endif
 }

+#ifndef _M_ARM
 FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
 {
    // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
@ -9140,6 +9180,7 @@ FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r));        /* write */
 #endif
 }
+#endif

 // Return the current 64-bit value of the processor's time-stamp counter.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc
@ -9161,6 +9202,9 @@ FORCE_INLINE uint64_t _rdtsc(void)
 #endif

    return val;
+#elif defined(_M_ARM)
+	uint32_t val = _MoveFromCoprocessor(15,0, 9,13,0);
+	return ((uint64_t)val) << 6;
 #else
    uint32_t pmccntr, pmuseren, pmcntenset;
    // Read the user mode Performance Monitoring Unit (PMU)
--- a/engine/cmodel.cpp
+++ b/engine/cmodel.cpp
@ -862,7 +862,7 @@ BOX TRACING

 // Custom SIMD implementation for box brushes

-const fltx4 Four_DistEpsilons={DIST_EPSILON,DIST_EPSILON,DIST_EPSILON,DIST_EPSILON};
+const fltx4 Four_DistEpsilons=FLTX4(DIST_EPSILON,DIST_EPSILON,DIST_EPSILON,DIST_EPSILON);
 const int32 ALIGN16 g_CubeFaceIndex0[4] ALIGN16_POST = {0,1,2,-1};
 const int32 ALIGN16 g_CubeFaceIndex1[4] ALIGN16_POST = {3,4,5,-1};
 bool IntersectRayWithBoxBrush( TraceInfo_t *pTraceInfo, const cbrush_t *pBrush, cboxbrush_t *pBox )
@ -1572,7 +1572,7 @@ void FASTCALL CM_TraceToLeaf( TraceInfo_t * RESTRICT pTraceInfo, int ndxLeaf, fl
 			fltx4 traceStart = LoadUnaligned3SIMD(pTraceInfo->m_start.Base());
 			fltx4 traceDelta = LoadUnaligned3SIMD(pTraceInfo->m_delta.Base());
 			fltx4 traceInvDelta = LoadUnaligned3SIMD(pTraceInfo->m_invDelta.Base());
-			static const fltx4 vecEpsilon = {DISPCOLL_DIST_EPSILON,DISPCOLL_DIST_EPSILON,DISPCOLL_DIST_EPSILON,DISPCOLL_DIST_EPSILON};
+			static const fltx4 vecEpsilon = FLTX4(DISPCOLL_DIST_EPSILON,DISPCOLL_DIST_EPSILON,DISPCOLL_DIST_EPSILON,DISPCOLL_DIST_EPSILON);
 			// only used in !IS_POINT version:
 			fltx4 extents;
 			if (!IS_POINT)
--- a/engine/l_studio.cpp
+++ b/engine/l_studio.cpp
@ -40,7 +40,7 @@
 #include "materialsystem/materialsystem_config.h"
 #include "materialsystem/itexture.h"
 #include "IHammer.h"
-#if defined( _WIN32 ) && !defined( _X360 )
+#if defined( _WIN32 ) && !defined( _X360 ) && !defined(_M_ARM)
 #include <xmmintrin.h>
 #endif
 #include "staticpropmgr.h"
--- a/engine/sys_engine.cpp
+++ b/engine/sys_engine.cpp
@ -104,7 +104,7 @@ extern ConVar host_timer_spin_ms;
 extern float host_nexttick;
 extern IVEngineClient *engineClient;

-#ifdef WIN32
+#if defined(_WIN32) && !defined(_M_ARM)
 static void cpu_frequency_monitoring_callback( IConVar *var, const char *pOldValue, float flOldValue )
 {
 	// Set the specified interval for CPU frequency monitoring
--- a/game/client/detailobjectsystem.cpp
+++ b/game/client/detailobjectsystem.cpp
@ -2122,8 +2122,8 @@ int CDetailObjectSystem::SortSpritesBackToFront( int nLeaf, const Vector &viewOr
 #else
 #define MANTISSA_LSB_OFFSET 0
 #endif
-static fltx4 Four_MagicNumbers={ MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER };
-static fltx4 Four_255s={ 255.0, 255.0, 255.0, 255.0 };
+static fltx4 Four_MagicNumbers=FLTX4( MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER );
+static fltx4 Four_255s=FLTX4( 255.0, 255.0, 255.0, 255.0 );

 static ALIGN16 int32 And255Mask[4] ALIGN16_POST = {0xff,0xff,0xff,0xff};
 #define PIXMASK ( * ( reinterpret_cast< fltx4 *>( &And255Mask ) ) )
--- a/inputsystem/inputsystem.cpp
+++ b/inputsystem/inputsystem.cpp
@ -167,8 +167,10 @@ InitReturnVal_t CInputSystem::Init()

 	joy_xcontroller_found.SetValue( 0 );
 	
+#ifdef USE_SDL
 	if( !m_bConsoleTextMode )
 		InitializeTouch();
+#endif
 	
 	if ( IsPC() && !m_bConsoleTextMode )
 	{
@ -975,7 +977,9 @@ void CInputSystem::SetPrimaryUserId( int userId )
 //-----------------------------------------------------------------------------
 void CInputSystem::SetRumble( float fLeftMotor, float fRightMotor, int userId )
 {
+#ifdef USE_SDL
 	SetXDeviceRumble( fLeftMotor, fRightMotor, userId );
+#endif
 }


--- a/inputsystem/inputsystem.h
+++ b/inputsystem/inputsystem.h
@ -145,10 +145,16 @@ public:

 	struct JoystickInfo_t
 	{
+#ifdef USE_SDL
 		void *m_pDevice;  // Really an SDL_GameController*, NULL if not present.
 		void *m_pHaptic;  // Really an SDL_Haptic*
 		float m_fCurrentRumble;
 		bool m_bRumbleEnabled;
+#elif defined(_WIN32)
+		JOYINFOEX m_JoyInfoEx;
+#else
+#error
+#endif
 		int m_nButtonCount;
 		int m_nAxisFlags;
 		int m_nDeviceId;
@ -271,6 +277,9 @@ public:

 	//Added called and set to true when binding input and set to false once bound
 	void SetNovintPure( bool bPure );
+#ifndef USE_SDL
+	unsigned int AxisValue( JoystickAxis_t axis, JOYINFOEX& ji );
+#endif
 #else
 	void SetNovintPure( bool bPure ) {} // to satify the IInput virtual interface	
 #endif
--- a/inputsystem/joystick_win32.cpp
+++ b/inputsystem/joystick_win32.cpp
@ -0,0 +1,370 @@
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: PC Joystick implementation for inputsystem.dll
+//
+//===========================================================================//
+
+/* For force feedback testing. */
+#include "inputsystem.h"
+#include "tier1/convar.h"
+#include "tier0/icommandline.h"
+
+//-----------------------------------------------------------------------------
+// Joystick helpers
+//-----------------------------------------------------------------------------
+#define JOY_POVFWDRIGHT		( ( JOY_POVFORWARD + JOY_POVRIGHT ) >> 1 )  // 4500
+#define JOY_POVRIGHTBACK	( ( JOY_POVRIGHT + JOY_POVBACKWARD ) >> 1 ) // 13500
+#define JOY_POVFBACKLEFT	( ( JOY_POVBACKWARD + JOY_POVLEFT ) >> 1 ) // 22500
+#define JOY_POVLEFTFWD		( ( JOY_POVLEFT + JOY_POVFORWARD ) >> 1 ) // 31500
+
+ConVar joy_wwhack1( "joy_wingmanwarrior_centerhack", "0", FCVAR_ARCHIVE, "Wingman warrior centering hack." );
+ConVar joy_axisbutton_threshold( "joy_axisbutton_threshold", "0.3", FCVAR_ARCHIVE, "Analog axis range before a button press is registered." );
+
+//-----------------------------------------------------------------------------
+// Initialize all joysticks 
+//-----------------------------------------------------------------------------
+void CInputSystem::InitializeJoysticks( void ) 
+{  
+ 	// assume no joystick
+	m_nJoystickCount = 0; 
+
+	// abort startup if user requests no joystick
+	if ( CommandLine()->FindParm("-nojoy" ) ) 
+		return; 
+ 
+	// verify joystick driver is present
+	int nMaxJoysticks = joyGetNumDevs();
+	if ( nMaxJoysticks > MAX_JOYSTICKS )
+	{
+		nMaxJoysticks = MAX_JOYSTICKS; 
+	}
+	else if ( nMaxJoysticks <= 0 )
+	{
+		DevMsg( 1, "joystick not found -- driver not present\n");
+		return;
+	}
+
+	// cycle through the joysticks looking for valid ones
+	MMRESULT	mmr;
+	for ( int i=0; i < nMaxJoysticks; i++ )
+	{
+		JOYINFOEX ji;
+		Q_memset( &ji, 0, sizeof( ji ) );
+		ji.dwSize = sizeof(ji);
+		ji.dwFlags = JOY_RETURNCENTERED;
+		mmr = joyGetPosEx( i, &ji );
+		if ( mmr != JOYERR_NOERROR )
+			continue;
+
+		// get the capabilities of the selected joystick
+		// abort startup if command fails
+		JOYCAPS		jc;
+		Q_memset( &jc, 0, sizeof( jc ) );
+		mmr = joyGetDevCaps( i, &jc, sizeof( jc ) );
+		if ( mmr != JOYERR_NOERROR )
+			continue;
+
+		JoystickInfo_t &info = m_pJoystickInfo[m_nJoystickCount];
+		info.m_nDeviceId = i;
+		info.m_JoyInfoEx = ji;
+		info.m_nButtonCount = (int)jc.wNumButtons;
+		info.m_bHasPOVControl = ( jc.wCaps & JOYCAPS_HASPOV ) ? true : false;
+		info.m_bDiagonalPOVControlEnabled = false;
+		info.m_nFlags = JOY_RETURNCENTERED | JOY_RETURNBUTTONS | JOY_RETURNX | JOY_RETURNY;
+		info.m_nAxisFlags = 0;
+		if ( jc.wNumAxes >= 2 )
+		{
+			info.m_nAxisFlags |= 0x3;
+		}
+		if ( info.m_bHasPOVControl )
+		{
+			info.m_nFlags |= JOY_RETURNPOV;
+		}
+		if ( jc.wCaps & JOYCAPS_HASZ )
+		{
+			info.m_nFlags |= JOY_RETURNZ;
+			info.m_nAxisFlags |= 0x4;
+		}
+		if ( jc.wCaps & JOYCAPS_HASR )
+		{
+			info.m_nFlags |= JOY_RETURNR;
+			info.m_nAxisFlags |= 0x8;
+		}
+		if ( jc.wCaps & JOYCAPS_HASU )
+		{
+			info.m_nFlags |= JOY_RETURNU;
+			info.m_nAxisFlags |= 0x10;
+		}
+		if ( jc.wCaps & JOYCAPS_HASV )
+		{
+			info.m_nFlags |= JOY_RETURNV;
+			info.m_nAxisFlags |= 0x20;
+		}
+		info.m_nLastPolledButtons = 0;
+		info.m_nLastPolledAxisButtons = 0;
+		info.m_nLastPolledPOVState = 0;
+		memset( info.m_pLastPolledAxes, 0, sizeof(info.m_pLastPolledAxes) );
+		++m_nJoystickCount;
+
+		EnableJoystickInput( i, true );
+	} 
+}
+
+void CInputSystem::ShutdownJoysticks()
+{
+}
+
+//-----------------------------------------------------------------------------
+//	Process the event
+//-----------------------------------------------------------------------------
+void CInputSystem::JoystickButtonEvent( ButtonCode_t button, int sample )
+{
+	// package the key
+	if ( sample )
+	{
+		PostButtonPressedEvent( IE_ButtonPressed, m_nLastSampleTick, button, button );
+	}
+	else
+	{
+		PostButtonReleasedEvent( IE_ButtonReleased, m_nLastSampleTick, button, button );
+	}
+}
+
+
+//-----------------------------------------------------------------------------
+// Update the joystick button state
+//-----------------------------------------------------------------------------
+void CInputSystem::UpdateJoystickButtonState( int nJoystick )
+{
+	JoystickInfo_t &info = m_pJoystickInfo[nJoystick];
+	JOYINFOEX& ji = info.m_JoyInfoEx;
+
+	// Standard joystick buttons
+	unsigned int buttons = ji.dwButtons ^ info.m_nLastPolledButtons;
+	if ( buttons )
+	{
+		for ( int j = 0 ; j < info.m_nButtonCount ; ++j )
+		{
+			int mask = buttons & ( 1 << j );
+			if ( !mask )
+				continue;
+
+			ButtonCode_t code = (ButtonCode_t)JOYSTICK_BUTTON( nJoystick, j );
+			if ( mask & ji.dwButtons )
+			{
+				// down event
+				JoystickButtonEvent( code, MAX_BUTTONSAMPLE );
+			}
+			else
+			{
+				// up event
+				JoystickButtonEvent( code, 0 );
+			}
+		}
+
+		info.m_nLastPolledButtons = (unsigned int)ji.dwButtons;
+	}
+
+	// Analog axis buttons
+	const float minValue = joy_axisbutton_threshold.GetFloat() * MAX_BUTTONSAMPLE;
+	for ( int j = 0 ; j < MAX_JOYSTICK_AXES; ++j )
+	{
+		if ( ( info.m_nAxisFlags & (1 << j) ) == 0 )
+			continue;
+
+		// Positive side of the axis
+		int mask = ( 1 << (j << 1) );
+		ButtonCode_t code = JOYSTICK_AXIS_BUTTON( nJoystick, (j << 1) );
+		float value = GetAnalogValue( JOYSTICK_AXIS( nJoystick, j ) );
+
+		if ( value > minValue && !(info.m_nLastPolledAxisButtons & mask) )
+		{
+			info.m_nLastPolledAxisButtons |= mask;
+			JoystickButtonEvent( code, MAX_BUTTONSAMPLE );
+		}
+		if ( value <= minValue && (info.m_nLastPolledAxisButtons & mask) )
+		{
+			info.m_nLastPolledAxisButtons &= ~mask;
+			JoystickButtonEvent( code, 0 );
+		}
+
+		// Negative side of the axis
+		mask <<= 1;
+		code = (ButtonCode_t)( code + 1 );
+		if ( value < -minValue && !(info.m_nLastPolledAxisButtons & mask) )
+		{
+			info.m_nLastPolledAxisButtons |= mask;
+			JoystickButtonEvent( code, MAX_BUTTONSAMPLE );
+		}
+		if ( value >= -minValue && (info.m_nLastPolledAxisButtons & mask) )
+		{
+			info.m_nLastPolledAxisButtons &= ~mask;
+			JoystickButtonEvent( code, 0 );
+		}
+	}
+}
+
+
+//-----------------------------------------------------------------------------
+// Purpose: Get raw joystick sample along axis
+//-----------------------------------------------------------------------------
+unsigned int CInputSystem::AxisValue( JoystickAxis_t axis, JOYINFOEX& ji )
+{
+	switch (axis)
+	{
+	case JOY_AXIS_X:
+		return (unsigned int)ji.dwXpos;
+	case JOY_AXIS_Y:
+		return (unsigned int)ji.dwYpos;
+	case JOY_AXIS_Z:
+		return (unsigned int)ji.dwZpos;
+	case JOY_AXIS_R:
+		return (unsigned int)ji.dwRpos;
+	case JOY_AXIS_U:
+		return (unsigned int)ji.dwUpos;
+	case JOY_AXIS_V:
+		return (unsigned int)ji.dwVpos;
+	}
+	// FIX: need to do some kind of error
+	return (unsigned int)ji.dwXpos;
+}
+
+
+//-----------------------------------------------------------------------------
+// Update the joystick POV control
+//-----------------------------------------------------------------------------
+void CInputSystem::UpdateJoystickPOVControl( int nJoystick )
+{
+	JoystickInfo_t &info = m_pJoystickInfo[nJoystick];
+	JOYINFOEX& ji = info.m_JoyInfoEx;
+
+	if ( !info.m_bHasPOVControl )
+		return;
+
+	// convert POV information into 4 bits of state information
+	// this avoids any potential problems related to moving from one
+	// direction to another without going through the center position
+	unsigned int povstate = 0;
+
+	if ( ji.dwPOV != JOY_POVCENTERED )
+	{
+		if (ji.dwPOV == JOY_POVFORWARD)  // 0
+		{
+			povstate |= 0x01;
+		}
+		if (ji.dwPOV == JOY_POVRIGHT)  // 9000
+		{
+			povstate |= 0x02;
+		}
+		if (ji.dwPOV == JOY_POVBACKWARD) // 18000
+		{
+			povstate |= 0x04;
+		}
+		if (ji.dwPOV == JOY_POVLEFT)  // 27000
+		{
+			povstate |= 0x08;
+		}
+
+		// Deal with diagonals if user wants them
+		if ( info.m_bDiagonalPOVControlEnabled )
+		{
+			if (ji.dwPOV == JOY_POVFWDRIGHT)  // 4500
+			{
+				povstate |= ( 0x01 | 0x02 );
+			}
+			if (ji.dwPOV == JOY_POVRIGHTBACK)  // 13500
+			{
+				povstate |= ( 0x02 | 0x04 );
+			}
+			if (ji.dwPOV == JOY_POVFBACKLEFT) // 22500
+			{
+				povstate |= ( 0x04 | 0x08 );
+			}
+			if (ji.dwPOV == JOY_POVLEFTFWD)  // 31500
+			{
+				povstate |= ( 0x08 | 0x01 );
+			}
+		}
+	}
+
+	// determine which bits have changed and key an auxillary event for each change
+	unsigned int buttons = povstate ^ info.m_nLastPolledPOVState;
+	if ( buttons )
+	{
+		for ( int i = 0; i < JOYSTICK_POV_BUTTON_COUNT; ++i )
+		{
+			unsigned int mask = buttons & ( 1 << i );
+			if ( !mask )
+				continue;
+
+			ButtonCode_t code = (ButtonCode_t)JOYSTICK_POV_BUTTON( nJoystick, i );
+
+			if ( mask & povstate )
+			{
+				// Keydown on POV buttons
+				JoystickButtonEvent( code, MAX_BUTTONSAMPLE );
+			}
+			else
+			{
+				// KeyUp on POV buttons
+				JoystickButtonEvent( code, 0 );
+			}
+		}
+
+		// Latch old values
+		info.m_nLastPolledPOVState = povstate;
+	}
+}
+
+
+//-----------------------------------------------------------------------------
+// Purpose: Sample the joystick
+//-----------------------------------------------------------------------------
+void CInputSystem::PollJoystick( void )
+{
+	if ( !m_JoysticksEnabled.IsAnyFlagSet() )
+		return;
+
+	InputState_t &state = m_InputState[ m_bIsPolling ];
+	for ( int i = 0; i < m_nJoystickCount; ++i )
+	{
+		if ( !m_JoysticksEnabled.IsFlagSet( 1 << i ) )
+			continue;
+
+		JoystickInfo_t &info = m_pJoystickInfo[i];
+		JOYINFOEX& ji = info.m_JoyInfoEx;
+		Q_memset( &ji, 0, sizeof( ji ) );
+		ji.dwSize = sizeof( ji );
+		ji.dwFlags = (DWORD)info.m_nFlags;
+
+		if ( joyGetPosEx( info.m_nDeviceId, &ji ) != JOYERR_NOERROR )
+			continue;
+
+		// This hack fixes a bug in the Logitech WingMan Warrior DirectInput Driver
+		// rather than having 32768 be the zero point, they have the zero point at 32668
+		// go figure -- anyway, now we get the full resolution out of the device
+		if ( joy_wwhack1.GetBool() )
+		{
+			ji.dwUpos += 100;
+		}
+
+		// Poll joystick axes
+		for ( int j = 0; j < MAX_JOYSTICK_AXES; ++j )
+		{
+			if ( ( info.m_nAxisFlags & ( 1 << j ) ) == 0 )
+				continue;
+
+			AnalogCode_t code = JOYSTICK_AXIS( i, j );
+			int nValue = AxisValue( (JoystickAxis_t)j, ji ) - MAX_BUTTONSAMPLE;
+			state.m_pAnalogDelta[ code ] = nValue - state.m_pAnalogValue[ code ];
+			state.m_pAnalogValue[ code ] = nValue;
+			if ( state.m_pAnalogDelta[ code ] != 0 )
+			{
+				PostEvent( IE_AnalogValueChanged, m_nLastSampleTick, code, state.m_pAnalogValue[ code ], state.m_pAnalogDelta[ code ] );
+			}
+		}
+
+		UpdateJoystickButtonState( i );
+		UpdateJoystickPOVControl( i );
+	}
+}
--- a/inputsystem/wscript
+++ b/inputsystem/wscript
@ -18,8 +18,6 @@ def configure(conf):
 def build(bld):
 	source = [
 		'inputsystem.cpp',
-		'joystick_sdl.cpp',
-		'touch_sdl.cpp',
 		'key_translation.cpp',
 		'steamcontroller.cpp',
 		'../public/tier0/memoverride.cpp'
@ -41,6 +39,12 @@ def build(bld):

 	libs = ['tier0','tier1','tier2','vstdlib','SDL2','steam_api']

+	if bld.options.SDL:
+		source += ['joystick_sdl.cpp', 'touch_sdl.cpp']
+	elif bld.env.DEST_OS == 'win32':
+		source += ['joystick_win32.cpp']
+		libs += ['WINMM']
+
 	if bld.env.DEST_OS == 'win32':
 		libs += ['USER32']

--- a/materialsystem/colorspace.h
+++ b/materialsystem/colorspace.h
@ -287,7 +287,7 @@ namespace ColorSpace
 	{
 		// preload 3.0f onto the returns so that we don't need to multiply the bumpAverage by it
 		// straight away (eg, reschedule this dependent op)
-		static const fltx4 vThree = { 3.0f, 3.0f, 3.0f, 0.0f };
+		static const fltx4 vThree = FLTX4( 3.0f, 3.0f, 3.0f, 0.0f );
 		fltx4 retValBump1 = MulSIMD( vThree, linearBumpColor1);
 		fltx4 retValBump2 = MulSIMD( vThree, linearBumpColor2);
 		fltx4 retValBump3 = MulSIMD( vThree, linearBumpColor3);
--- a/mathlib/3dnow.cpp
+++ b/mathlib/3dnow.cpp
@ -16,7 +16,7 @@
 // memdbgon must be the last include file in a .cpp file!!!
 #include "tier0/memdbgon.h"

-#if !defined(COMPILER_MSVC64) && !defined(LINUX) && !defined(COMPILER_CLANG)
+#if defined(_M_IX86) && !defined(LINUX) && !defined(COMPILER_CLANG)
 // Implement for 64-bit Windows if needed.
 // Clang hits "fatal error: error in backend:" and other errors when trying
 // to compile the inline assembly below. 3DNow support is highly unlikely to
--- a/mathlib/mathlib_base.cpp
+++ b/mathlib/mathlib_base.cpp
@ -3258,7 +3258,7 @@ void MathLib_Init( float gamma, float texGamma, float brightness, int overbright

 	// SSE Generally performs better than 3DNow when present, so this is placed 
 	// first to allow SSE to override these settings.
-#if !defined( OSX ) && !defined( PLATFORM_WINDOWS_PC64 ) && !defined(LINUX) && !defined(PLATFORM_BSD)
+#ifdef _M_IX86
 	if ( bAllow3DNow && pi.m_b3DNow )
 	{
 		s_b3DNowEnabled = true;
@ -3291,7 +3291,7 @@ void MathLib_Init( float gamma, float texGamma, float brightness, int overbright
 		pfRSqrt = _SSE_RSqrtAccurate;
 		pfRSqrtFast = _SSE_RSqrtFast;
 #endif
-#ifdef PLATFORM_WINDOWS_PC32
+#ifdef _M_IX86
 		pfFastSinCos = _SSE_SinCos;
 		pfFastCos = _SSE_cos;
 #endif
@ -3304,7 +3304,7 @@ void MathLib_Init( float gamma, float texGamma, float brightness, int overbright
 	if ( bAllowSSE2 && pi.m_bSSE2 )
 	{
 		s_bSSE2Enabled = true;
-#ifdef PLATFORM_WINDOWS_PC32
+#ifdef _M_IX86
 		pfFastSinCos = _SSE2_SinCos;
 		pfFastCos = _SSE2_cos;
 #endif
--- a/mathlib/sse.cpp
+++ b/mathlib/sse.cpp
@ -91,7 +91,7 @@ float _SSE_Sqrt(float x)
 {
 	Assert( s_bMathlibInitialized );
 	float	root = 0.f;
-#ifdef _WIN32
+#if defined(_WIN32) && !defined(_M_ARM)
 	_asm
 	{
 		sqrtss		xmm0, x
@ -122,7 +122,7 @@ float _SSE_RSqrtAccurate(float x)
 }
 #else

-#ifdef POSIX
+#if POSIX || defined(_M_ARM)
 const __m128  f3  = _mm_set_ss(3.0f);  // 3 as SSE value
 const __m128  f05 = _mm_set_ss(0.5f);  // 0.5 as SSE value
 #endif
@ -131,7 +131,7 @@ const __m128  f05 = _mm_set_ss(0.5f);  // 0.5 as SSE value
 float _SSE_RSqrtAccurate(float a)
 {

-#ifdef _WIN32
+#if defined(_WIN32) && !defined(_M_ARM)
 	float x;
 	float half = 0.5f;
 	float three = 3.f;
@ -153,7 +153,7 @@ float _SSE_RSqrtAccurate(float a)
 	}

 	return x;
-#elif POSIX	
+#elif POSIX || defined(_M_ARM)
 	__m128  xx = _mm_load_ss( &a );
    __m128  xr = _mm_rsqrt_ss( xx );
    __m128  xt;
@ -764,7 +764,7 @@ float _SSE_cos( float x )
 //-----------------------------------------------------------------------------
 // SSE2 implementations of optimized routines:
 //-----------------------------------------------------------------------------
-#ifdef PLATFORM_WINDOWS_PC32
+#if defined(PLATFORM_WINDOWS_PC32) && !defined(_M_ARM)
 void _SSE2_SinCos(float x, float* s, float* c)  // any x
 {
 #ifdef _WIN32
@ -850,9 +850,7 @@ void _SSE2_SinCos(float x, float* s, float* c)  // any x
 	#error "Not Implemented"
 #endif
 }
-#endif // PLATFORM_WINDOWS_PC32

-#ifdef PLATFORM_WINDOWS_PC32
 float _SSE2_cos(float x)  
 {
 #ifdef _WIN32
@ -970,9 +968,7 @@ void VectorTransformSSE(const float *in1, const matrix3x4_t& in2, float *out1)
 	#error "Not Implemented"
 #endif
 }
-#endif

-#if 0
 void VectorRotateSSE( const float *in1, const matrix3x4_t& in2, float *out1 )
 {
 	Assert( s_bMathlibInitialized );
@ -1026,9 +1022,7 @@ void VectorRotateSSE( const float *in1, const matrix3x4_t& in2, float *out1 )
 	#error "Not Implemented"
 #endif
 }
-#endif

-#ifdef _WIN32
 void _declspec(naked) _SSE_VectorMA( const float *start, float scale, const float *direction, float *dest )
 {
 	// FIXME: This don't work!! It will overwrite memory in the write to dest
@ -1057,7 +1051,6 @@ void _declspec(naked) _SSE_VectorMA( const float *start, float scale, const floa
 #endif
 	}
 }
-#endif

 #ifdef _WIN32
 #ifdef PFN_VECTORMA
@ -1101,7 +1094,6 @@ float (__cdecl *pfVectorMA)(Vector& v) = _VectorMA;
 //   NJS: (Nov 1 2002) -NOT- faster.  may time a couple cycles faster in a single function like 
 //   this, but when inlined, and instruction scheduled, the C version is faster.  
 //   Verified this via VTune
-/*
 vec_t DotProduct (const vec_t *a, const vec_t *c)
 {
 	vec_t temp;
@ -1124,6 +1116,6 @@ vec_t DotProduct (const vec_t *a, const vec_t *c)
 		ret
 	}
 }
-*/
+#endif

 #endif // COMPILER_MSVC64 
--- a/mathlib/sseconst.cpp
+++ b/mathlib/sseconst.cpp
@ -7,35 +7,35 @@
 #include "mathlib/ssemath.h"
 #include "mathlib/ssequaternion.h"

-const fltx4 Four_PointFives={0.5,0.5,0.5,0.5};
+const fltx4 Four_PointFives=FLTX4(0.5,0.5,0.5,0.5);
 #ifndef _X360
-const fltx4 Four_Zeros={0.0,0.0,0.0,0.0};
-const fltx4 Four_Ones={1.0,1.0,1.0,1.0};
+const fltx4 Four_Zeros=FLTX4(0.0,0.0,0.0,0.0);
+const fltx4 Four_Ones=FLTX4(1.0,1.0,1.0,1.0);
 #endif
-const fltx4 Four_Twos={2.0,2.0,2.0,2.0};
-const fltx4 Four_Threes={3.0,3.0,3.0,3.0};
-const fltx4 Four_Fours={4.0,4.0,4.0,4.0};
-const fltx4 Four_Origin={0,0,0,1};
-const fltx4 Four_NegativeOnes={-1,-1,-1,-1};
+const fltx4 Four_Twos=FLTX4(2.0,2.0,2.0,2.0);
+const fltx4 Four_Threes=FLTX4(3.0,3.0,3.0,3.0);
+const fltx4 Four_Fours=FLTX4(4.0,4.0,4.0,4.0);
+const fltx4 Four_Origin=FLTX4(0,0,0,1);
+const fltx4 Four_NegativeOnes=FLTX4(-1,-1,-1,-1);

-const fltx4 Four_2ToThe21s={ (float) (1<<21), (float) (1<<21), (float) (1<<21), (float)(1<<21) };
-const fltx4 Four_2ToThe22s={ (float) (1<<22), (float) (1<<22), (float) (1<<22), (float)(1<<22) };
-const fltx4 Four_2ToThe23s={ (float) (1<<23), (float) (1<<23), (float) (1<<23), (float)(1<<23) };
-const fltx4 Four_2ToThe24s={ (float) (1<<24), (float) (1<<24), (float) (1<<24), (float)(1<<24) };
+const fltx4 Four_2ToThe21s=FLTX4( (float) (1<<21), (float) (1<<21), (float) (1<<21), (float)(1<<21) );
+const fltx4 Four_2ToThe22s=FLTX4( (float) (1<<22), (float) (1<<22), (float) (1<<22), (float)(1<<22) );
+const fltx4 Four_2ToThe23s=FLTX4( (float) (1<<23), (float) (1<<23), (float) (1<<23), (float)(1<<23) );
+const fltx4 Four_2ToThe24s=FLTX4( (float) (1<<24), (float) (1<<24), (float) (1<<24), (float)(1<<24) );

-const fltx4 Four_Point225s={ .225, .225, .225, .225 };
-const fltx4 Four_Epsilons={FLT_EPSILON,FLT_EPSILON,FLT_EPSILON,FLT_EPSILON};
+const fltx4 Four_Point225s=FLTX4( .225, .225, .225, .225 );
+const fltx4 Four_Epsilons=FLTX4(FLT_EPSILON,FLT_EPSILON,FLT_EPSILON,FLT_EPSILON);

-const fltx4 Four_FLT_MAX={FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
-const fltx4 Four_Negative_FLT_MAX={-FLT_MAX,-FLT_MAX,-FLT_MAX,-FLT_MAX};
-const fltx4 g_SIMD_0123 = { 0., 1., 2., 3. };
+const fltx4 Four_FLT_MAX=FLTX4(FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX);
+const fltx4 Four_Negative_FLT_MAX=FLTX4(-FLT_MAX,-FLT_MAX,-FLT_MAX,-FLT_MAX);
+const fltx4 g_SIMD_0123 = FLTX4( 0., 1., 2., 3. );

 const fltx4 g_QuatMultRowSign[4] =
 {
-	{  1.0f,  1.0f, -1.0f, 1.0f },
-	{ -1.0f,  1.0f,  1.0f, 1.0f },
-	{  1.0f, -1.0f,  1.0f, 1.0f },
-	{ -1.0f, -1.0f, -1.0f, 1.0f }
+	FLTX4(  1.0f,  1.0f, -1.0f, 1.0f ),
+	FLTX4( -1.0f,  1.0f,  1.0f, 1.0f ),
+	FLTX4(  1.0f, -1.0f,  1.0f, 1.0f ),
+	FLTX4( -1.0f, -1.0f, -1.0f, 1.0f )
 };

 const uint32 ALIGN16 g_SIMD_clear_signmask[4] ALIGN16_POST = {0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff};
--- a/mathlib/ssenoise.cpp
+++ b/mathlib/ssenoise.cpp
@ -20,7 +20,7 @@

 #define MAGIC_NUMBER (1<<15)								// gives 8 bits of fraction

-static fltx4 Four_MagicNumbers = { MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER };
+static fltx4 Four_MagicNumbers = FLTX4( MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER );


 static ALIGN16 int32 idx_mask[4]= {0xffff, 0xffff, 0xffff, 0xffff};
--- a/public/materialsystem/imesh.h
+++ b/public/materialsystem/imesh.h
@ -1220,7 +1220,7 @@ inline void CVertexBuilder::FastVertexSSE( const ModelVertexDX7_t &vertex )
 	Assert( m_CompressionType == VERTEX_COMPRESSION_NONE ); // FIXME: support compressed verts if needed
 	Assert( m_nCurrentVertex < m_nMaxVertexCount );

-#if defined( _WIN32 ) && !defined( _X360 ) && !defined( PLATFORM_64BITS )
+#if defined( _WIN32 ) && !defined( _X360 ) && !defined( PLATFORM_64BITS ) && !defined(_M_ARM)
 	const void *pRead = &vertex;
 	void *pCurrPos = m_pCurrPosition;
 	__asm
@ -1236,7 +1236,7 @@ inline void CVertexBuilder::FastVertexSSE( const ModelVertexDX7_t &vertex )
 			movntps [edi + 16], xmm1
 			movntps [edi + 32], xmm2
 	}
-#elif defined(GNUC) || defined(PLATFORM_WINDOWS_PC64)
+#elif defined(GNUC) || defined(_WIN32)
 	const char *pRead = (char *)&vertex;
 	char *pCurrPos = (char *)m_pCurrPosition;
 	__m128 m1 = _mm_load_ps( (float *)pRead );
@ -1267,7 +1267,7 @@ inline void CVertexBuilder::Fast4VerticesSSE(
 	Assert( m_CompressionType == VERTEX_COMPRESSION_NONE ); // FIXME: support compressed verts if needed
 	Assert( m_nCurrentVertex < m_nMaxVertexCount-3 );

-#if defined( _WIN32 ) && !defined( _X360 ) && !defined( PLATFORM_64BITS )
+#if defined( _WIN32 ) && !defined( _X360 ) && !defined( PLATFORM_64BITS ) && !defined(_M_ARM)
 	void *pCurrPos = m_pCurrPosition;
 	__asm
 	{
@ -1309,7 +1309,7 @@ inline void CVertexBuilder::Fast4VerticesSSE(
 			movntps [edi + 80+96], xmm5

 	}
-#elif defined(__arm__) || defined(PLATFORM_WINDOWS_PC64)
+#elif defined(__arm__) || defined(_WIN32)
 	const void *pReadA = &vtx_a;
 	const void *pReadB = &vtx_b;
 	const void *pReadC = &vtx_c;
@ -1430,7 +1430,7 @@ inline void CVertexBuilder::FastVertexSSE( const ModelVertexDX8_t &vertex )
 	Assert( m_CompressionType == VERTEX_COMPRESSION_NONE ); // FIXME: support compressed verts if needed
 	Assert( m_nCurrentVertex < m_nMaxVertexCount );

-#if defined( _WIN32 ) && !defined( _X360 ) && !defined( PLATFORM_64BITS )
+#if defined( _WIN32 ) && !defined( _X360 ) && !defined( PLATFORM_64BITS ) && !defined(_M_ARM)
 	const void *pRead = &vertex;
 	void *pCurrPos = m_pCurrPosition;
 	__asm
@ -1448,21 +1448,10 @@ inline void CVertexBuilder::FastVertexSSE( const ModelVertexDX8_t &vertex )
 			movntps [edi + 32], xmm2
 			movntps [edi + 48], xmm3
 	}
-#elif defined(GNUC) || defined(PLATFORM_WINDOWS_PC64)
+#elif defined(GNUC) || defined(_WIN32)
 	const void *pRead = &vertex;
 	void *pCurrPos = m_pCurrPosition;

-/*	__asm__ __volatile__ (
-						  "movaps (%0), %%xmm0\n"
-						  "movaps 16(%0), %%xmm1\n"
-						  "movaps 32(%0), %%xmm2\n"
-						  "movaps 48(%0), %%xmm3\n"
-						  "movntps %%xmm0, (%1)\n"
-						  "movntps %%xmm1, 16(%1)\n"
-						  "movntps %%xmm2, 32(%1)\n"
-						  "movntps %%xmm3, 48(%1)\n"
-						  :: "r" (pRead), "r" (pCurrPos) : "memory"); */
-
 	__m128 m1 = _mm_load_ps( (float *)pRead );
 	__m128 m2 = _mm_load_ps( (float *)((intp)pRead + 16) );
 	__m128 m3 = _mm_load_ps( (float *)((intp)pRead + 32) );
--- a/public/mathlib/mathlib.h
+++ b/public/mathlib/mathlib.h
@ -405,6 +405,9 @@ void inline SinCos( float radians, float *sine, float *cosine )
 {
 #if defined( _X360 )
 	XMScalarSinCos( sine, cosine, radians );
+#elif defined( PLATFORM_WINDOWS_PC64 ) || defined(_M_ARM)
+	*sine = sin( radians );
+	*cosine = cos( radians );
 #elif defined( PLATFORM_WINDOWS_PC32 )
 	_asm
 	{
@ -417,11 +420,8 @@ void inline SinCos( float radians, float *sine, float *cosine )
 		fstp DWORD PTR [edx]
 		fstp DWORD PTR [eax]
 	}
-#elif defined( PLATFORM_WINDOWS_PC64 )
-	*sine = sin( radians );
-	*cosine = cos( radians );
 #elif defined( OSX )
-    __sincosf(radians, sine, cosine);
+	__sincosf(radians, sine, cosine);
 #elif defined( POSIX )
 	sincosf(radians, sine, cosine);
 #endif
--- a/public/mathlib/simdvectormatrix.h
+++ b/public/mathlib/simdvectormatrix.h
@ -135,7 +135,8 @@ public:
 		Assert( m_pData );

 		static FourVectors value{Four_Zeros, Four_Zeros, Four_Zeros};
-		memutils::set( m_pData, value, m_nHeight*m_nPaddedWidth );
+		for (size_t n = m_nHeight * m_nPaddedWidth; n; n--)
+			*(m_pData+n) = value;
 	}

 	void RaiseToPower( float power );
--- a/public/mathlib/ssemath.h
+++ b/public/mathlib/ssemath.h
@ -63,6 +63,12 @@ typedef __m128 fltx4;
 typedef __m128 i32x4;
 typedef __m128 u32x4;

+#ifdef _M_ARM
+#define FLTX4(w, x, y, z) {(w) + (unsigned long long(x) << 32), (y) + (unsigned long long(z) << 32)}
+#else
+#define FLTX4(w, x, y, z) {w, x, y, z}
+#endif
+
 #endif

 // The FLTX4 type is a fltx4 used as a parameter to a function.
@ -1828,7 +1834,7 @@ FORCEINLINE fltx4 ReplicateX4( float flValue )
 FORCEINLINE float SubFloat( const fltx4 & a, int idx )
 {
 	// NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
-#ifndef POSIX
+#if defined(_WIN32) && !defined(_M_ARM)
 	return a.m128_f32[ idx ];
 #else
 	return (reinterpret_cast<float const *>(&a))[idx];
@ -1837,7 +1843,7 @@ FORCEINLINE float SubFloat( const fltx4 & a, int idx )

 FORCEINLINE float & SubFloat( fltx4 & a, int idx )
 {
-#ifndef POSIX
+#if defined(_WIN32) && !defined(_M_ARM)
 	return a.m128_f32[ idx ];
 #else
 	return (reinterpret_cast<float *>(&a))[idx];
@ -1851,8 +1857,8 @@ FORCEINLINE uint32 SubFloatConvertToInt( const fltx4 & a, int idx )

 FORCEINLINE uint32 SubInt( const fltx4 & a, int idx )
 {
-#ifndef POSIX
-	return a.m128_u32[idx];
+#if defined(_WIN32) && !defined(_M_ARM)
+	return a.m128_u32[ idx ];
 #else
 	return (reinterpret_cast<uint32 const *>(&a))[idx];
 #endif
@ -1860,8 +1866,8 @@ FORCEINLINE uint32 SubInt( const fltx4 & a, int idx )

 FORCEINLINE uint32 & SubInt( fltx4 & a, int idx )
 {
-#ifndef POSIX
-	return a.m128_u32[idx];
+#if defined(_WIN32) && !defined(_M_ARM)
+	return a.m128_u32[ idx ];
 #else
 	return (reinterpret_cast<uint32 *>(&a))[idx];
 #endif
--- a/public/mathlib/vmatrix.h
+++ b/public/mathlib/vmatrix.h
@ -43,7 +43,7 @@ struct cplane_t;
 	#define M_PI		3.14159265358979323846	// matches value in gcc v2 math.h
 #endif

-class alignas(16) VMatrix
+class VMatrix
 {
 public:

--- a/public/shaderapi/ishaderapi.h
+++ b/public/shaderapi/ishaderapi.h
@ -175,7 +175,9 @@ public:
 	virtual void TexMagFilter( ShaderTexFilterMode_t texFilterMode ) = 0;
 	virtual void TexWrap( ShaderTexCoordComponent_t coord, ShaderTexWrapMode_t wrapMode ) = 0;

+#ifndef SHADERAPIDX10
 	virtual void CopyRenderTargetToTexture( ShaderAPITextureHandle_t textureHandle ) = 0;
+#endif

 	// Binds a particular material to render with
 	virtual void Bind( IMaterial* pMaterial ) = 0;
@ -612,6 +614,7 @@ public:
 	//extended clear buffers function with alpha independent from color
 	virtual void ClearBuffersObeyStencilEx( bool bClearColor, bool bClearAlpha, bool bClearDepth ) = 0;

+#ifndef SHADERAPIDX10
 	// Allows copying a render target to another texture by specifying them both.
 	virtual void CopyRenderTargetToScratchTexture( ShaderAPITextureHandle_t srcRt, ShaderAPITextureHandle_t dstTex, Rect_t *pSrcRect = NULL, Rect_t *pDstRect = NULL ) = 0;

@ -627,6 +630,7 @@ public:
 	
 	virtual void CopyTextureToTexture( ShaderAPITextureHandle_t srcTex, ShaderAPITextureHandle_t dstTex ) = 0;
 	
+#endif
 };


--- a/public/tier0/commonmacros.h
+++ b/public/tier0/commonmacros.h
@ -68,7 +68,7 @@ inline bool IsPowerOfTwo( T value )

 // From crtdefs.h
 #if !defined(UNALIGNED)
-#if defined(_M_IA64) || defined(_M_AMD64)
+#if defined(_M_AMD64) || defined(_M_ARM)
 #define UNALIGNED __unaligned
 #else
 #define UNALIGNED
--- a/public/tier0/platform.h
+++ b/public/tier0/platform.h
@ -852,7 +852,9 @@ static FORCEINLINE double fsel(double fComparand, double fValGE, double fLT)
 //-----------------------------------------------------------------------------
 //#define CHECK_FLOAT_EXCEPTIONS		1

-#if !defined( _X360 )
+#if defined (__arm__) || defined (__aarch64__)
+	inline void SetupFPUControlWord() {}
+#elif !defined( _X360 )
 #if defined( _MSC_VER )

 	#if defined( PLATFORM_WINDOWS_PC64 )
@ -898,8 +900,6 @@ static FORCEINLINE double fsel(double fComparand, double fValGE, double fLT)

 		#endif
 	#endif
-#elif defined (__arm__) || defined (__aarch64__)
-	inline void SetupFPUControlWord() {}
 #else
 	inline void SetupFPUControlWord()
 	{
@ -1025,7 +1025,7 @@ inline T QWordSwapC( T dw )
 		return output;
 	}

-#elif defined( _MSC_VER ) && !defined( PLATFORM_WINDOWS_PC64 )
+#elif defined( _MSC_VER ) && !defined( PLATFORM_WINDOWS_PC64 ) && !defined(_M_ARM)

 	#define WordSwap  WordSwapAsm
 	#define DWordSwap DWordSwapAsm
@ -1229,8 +1229,18 @@ PLATFORM_INTERFACE time_t			Plat_timegm( struct tm *timeptr );
 PLATFORM_INTERFACE struct tm *		Plat_localtime( const time_t *timep, struct tm *result );

 #if defined( _WIN32 ) && defined( _MSC_VER ) && ( _MSC_VER >= 1400 )
+#ifdef _M_X64
 	extern "C" unsigned __int64 __rdtsc();
 	#pragma intrinsic(__rdtsc)
+#else
+#include <intrin.h>
+#define MSVC_ARM_SYSREG(op0, op1, crn, crm, op2) \
+        ( ((op0 & 1) << 14) | \
+          ((op1 & 7) << 11) | \
+          ((crn & 15) << 7) | \
+          ((crm & 15) << 3) | \
+          ((op2 & 7) << 0) )
+#endif
 #endif

 inline uint64 Plat_Rdtsc()
@ -1241,15 +1251,15 @@ inline uint64 Plat_Rdtsc()
 	return t.tv_sec * 1000000000ULL + t.tv_nsec;
 #elif defined( _X360 )
 	return ( uint64 )__mftb32();
-#elif defined( _WIN64 )
+#elif defined( _M_IX86 )
+	_asm rdtsc
+#elif defined( _M_ARM )
+	uint32 val = _MoveFromCoprocessor(15,0, 9,13,0);
+	return ((uint64)val) << 6;
+#elif defined( _M_ARM64 ) || defined( _M_ARM64EC )
+	return _ReadStatusReg(MSVC_ARM_SYSREG(3,3, 9,12,5));
+#elif defined( COMPILER_MSVC )
 	return ( uint64 )__rdtsc();
-#elif defined( _WIN32 )
-  #if defined( _MSC_VER ) && ( _MSC_VER >= 1400 )
-	return ( uint64 )__rdtsc();
-  #else
-    __asm rdtsc;
-	__asm ret;
-  #endif
 #elif defined( __i386__ )
 	uint64 val;
 	__asm__ __volatile__ ( "rdtsc" : "=A" (val) );
--- a/public/tier0/threadtools.h
+++ b/public/tier0/threadtools.h
@ -241,6 +241,8 @@ inline void ThreadPause()
 	_mm_pause();
 #elif defined( COMPILER_MSVC32 )
 	__asm pause;
+#elif defined(_M_ARM)
+	__yield();
 #elif defined( COMPILER_MSVCX360 )
 	YieldProcessor(); 
 	__asm { or r0,r0,r0 } 
@ -445,7 +447,7 @@ PLATFORM_INTERFACE bool ThreadInterlockedAssignIf64( volatile int64 *pDest, int6

 PLATFORM_INTERFACE int64 ThreadInterlockedExchange64( int64 volatile *, int64 value ) NOINLINE;

-#ifdef COMPILER_MSVC32
+#if COMPILER_MSVC32 || _M_ARM
 PLATFORM_INTERFACE int64 ThreadInterlockedIncrement64( int64 volatile * ) NOINLINE;
 PLATFORM_INTERFACE int64 ThreadInterlockedDecrement64( int64 volatile * ) NOINLINE;
 PLATFORM_INTERFACE int64 ThreadInterlockedExchangeAdd64( int64 volatile *, int64 value ) NOINLINE;
--- a/studiorender/r_studiodraw.cpp
+++ b/studiorender/r_studiodraw.cpp
@ -657,7 +657,7 @@ static matrix3x4_t *ComputeSkinMatrix( mstudioboneweight_t &boneweights, matrix3
 static matrix3x4_t *ComputeSkinMatrixSSE( mstudioboneweight_t &boneweights, matrix3x4_t *pPoseToWorld, matrix3x4_t &result )
 {
 	// NOTE: pPoseToWorld, being cache aligned, doesn't need explicit initialization
-#if defined( _WIN32 ) && !defined( _X360 ) && !defined( PLATFORM_64BITS )
+#if defined( _WIN32 ) && !defined( _X360 ) && defined(_M_IX86)
 	switch( boneweights.numbones )
 	{
 	default:
@ -866,11 +866,9 @@ static matrix3x4_t *ComputeSkinMatrixSSE( mstudioboneweight_t &boneweights, matr
 		return &result;
 #endif
 	}
-#elif POSIX || PLATFORM_WINDOWS_PC64
+#elif POSIX || _WIN32
 // #warning "ComputeSkinMatrixSSE C implementation only"
 	return ComputeSkinMatrix( boneweights, pPoseToWorld, result );
-#elif defined( _X360 )
-	return ComputeSkinMatrix( boneweights, pPoseToWorld, result );
 #else
 	#error
 #endif
@ -909,7 +907,7 @@ inline void CStudioRender::R_ComputeLightAtPoint3( const Vector &pos, const Vect

 // define SPECIAL_SSE_MESH_PROCESSOR to enable code which contains a special optimized SSE lighting loop, significantly
 // improving software vertex processing performace.
-#if defined( _WIN32 ) && !defined( _X360 )
+#if defined( _WIN32 ) && !defined( _X360 ) && !defined(_M_ARM)
 #define SPECIAL_SSE_MESH_PROCESSOR
 #endif

--- a/studiorender/r_studiolight.cpp
+++ b/studiorender/r_studiolight.cpp
@ -66,7 +66,7 @@ void R_LightAmbient_4D( const Vector& normal, Vector4D* pLightBoxColor, Vector &
 	VectorMA( lv, normal[2]*normal[2], normal[2] > 0.f ? pLightBoxColor[4].AsVector3D() : pLightBoxColor[5].AsVector3D(), lv );
 }

-#if defined( _WIN32 ) && !defined( _X360 )
+#if defined( _WIN32 ) && !defined( _X360 ) && !defined(_M_ARM)
 void R_LightAmbient_4D( const FourVectors& normal, Vector4D* pLightBoxColor, FourVectors &lv )
 {
 //	VPROF( "R_LightAmbient" );
--- a/studiorender/r_studiolight.h
+++ b/studiorender/r_studiolight.h
@ -13,7 +13,7 @@

 #include "tier0/platform.h"

-#if defined( _WIN32 ) && !defined( _X360 )
+#if defined( _WIN32 ) && !defined( _X360 ) && !defined(_M_ARM)
 #include <xmmintrin.h>
 #endif

@ -40,7 +40,7 @@ float FASTCALL R_WorldLightDistanceFalloff( const LightDesc_t *wl, const Vector&
 // Copies lighting state into a buffer, returns number of lights copied
 int CopyLocalLightingState( int nMaxLights, LightDesc_t *pDest, int nLightCount, const LightDesc_t *pSrc );

-#if defined( _WIN32 ) && !defined( _X360 )
+#if defined( _WIN32 ) && !defined( _X360 ) && !defined(_M_ARM)
 // SSE optimized versions
 void R_LightAmbient_4D( const FourVectors& normal, Vector4D* pLightBoxColor, FourVectors &lv );
 __m128 FASTCALL R_WorldLightDistanceFalloff( const LightDesc_t *wl, const FourVectors& delta );
--- a/studiorender/studiorender.h
+++ b/studiorender/studiorender.h
@ -22,7 +22,7 @@
 #include "flexrenderdata.h"
 #include "mathlib/compressed_vector.h"
 #include "r_studiolight.h"
-#if defined( _WIN32 ) && !defined( _X360 )
+#if defined( _WIN32 ) && !defined( _X360 ) && !defined(_M_ARM)
 #include <xmmintrin.h>
 #endif
 #include "tier0/dbg.h"
--- a/tier0/PMELib.cpp
+++ b/tier0/PMELib.cpp
@ -6,7 +6,7 @@
 //
 //===========================================================================//

-#ifdef _WIN32
+#if defined(_WIN32) && !defined(_M_ARM)
 #include <windows.h>

 #pragma warning( disable : 4530 )   // warning: exception handler -GX option
--- a/tier0/cpumonitoring.cpp
+++ b/tier0/cpumonitoring.cpp
@ -23,7 +23,7 @@
 #include "pch_tier0.h"
 #include "tier0/cpumonitoring.h"

-#ifdef PLATFORM_WINDOWS_PC32
+#ifdef _M_IX86
 #include "tier0/threadtools.h"
 #define NOMINMAX
 #undef min
--- a/tier0/threadtools.cpp
+++ b/tier0/threadtools.cpp
@ -1740,7 +1740,7 @@ bool ThreadInterlockedAssignIf( int32 volatile *pDest, int32 value, int32 comper
 {
 	Assert( (size_t)pDest % 4 == 0 );

-#if !(defined(_WIN64) || defined (_X360))
+#if !(defined(_WIN64) || defined (_X360) || defined(_M_ARM))
 	__asm 
 	{
 		mov	eax,comperand
@ -1773,7 +1773,7 @@ void *ThreadInterlockedCompareExchangePointer( void * volatile *pDest, void *val
 bool ThreadInterlockedAssignPointerIf( void * volatile *pDest, void *value, void *comperand )
 {
 	Assert( (size_t)pDest % 4 == 0 );
-#if !(defined(_WIN64) || defined (_X360))
+#if !(defined(_WIN64) || defined (_X360) || defined(_M_ARM))
 	__asm 
 	{
 		mov	eax,comperand
@ -1807,13 +1807,19 @@ int64 ThreadInterlockedCompareExchange64( int64 volatile *pDest, int64 value, in
 		lock CMPXCHG8B [esi];			
 	}
 }
+#elif defined(_M_ARM)
+int64 ThreadInterlockedCompareExchange64( int64 volatile *pDest, int64 value, int64 comperand )
+{
+	Assert( (size_t)pDest % 8 == 0 );
+	return InterlockedCompareExchange64( pDest, value, comperand );
+}
 #endif

 bool ThreadInterlockedAssignIf64(volatile int64 *pDest, int64 value, int64 comperand ) 
 {
 	Assert( (size_t)pDest % 8 == 0 );

-#if defined(_X360) || defined(_WIN64)
+#if defined(_X360) || defined(_WIN64) || defined(_M_ARM)
 	return ( ThreadInterlockedCompareExchange64( pDest, value, comperand ) == comperand ); 
 #else
 	__asm
--- a/tier0/wscript
+++ b/tier0/wscript
@ -22,8 +22,8 @@ def build(bld):
 		'assert_dialog.cpp',
 		'commandline.cpp',
 		'cpu.cpp',
-		'cpumonitoring.cpp',
 		'cpu_usage.cpp',
+		'cpumonitoring.cpp',
 		'dbg.cpp',
 		'dynfunction.cpp',
 		'fasttimer.cpp',
@ -54,10 +54,13 @@ def build(bld):
 			'assert_dialog.rc',
 			#'etwprof.cpp',			[$WINDOWS]
 			'platform.cpp',
-			'pme.cpp',
 			'vcrmode.cpp',
 			'win32consoleio.cpp'
 		]
+		if bld.env.DEST_CPU == 'arm':
+			source += ['pme_posix.cpp']
+		else:
+			source += ['pme.cpp']
 		if bld.env.DEST_CPU == 'amd64':
 			source += [
 				'InterlockedCompareExchange128.masm'
--- a/tier1/processor_detect.cpp
+++ b/tier1/processor_detect.cpp
@ -6,7 +6,7 @@
 // $NoKeywords: $
 //=============================================================================//

-#if defined( _X360 ) || defined( WIN64 )
+#if defined( _X360 ) || defined( WIN64 ) || defined(_M_ARM)

 bool CheckMMXTechnology(void) { return false; }
 bool CheckSSETechnology(void) { return false; }
--- a/vphysics/trace.cpp
+++ b/vphysics/trace.cpp
@ -453,7 +453,7 @@ private:
 #ifdef WIN32
 static const 
 #endif
-fltx4 g_IVPToHLDir = { 1.0f, -1.0f, 1.0f, 1.0f };
+fltx4 g_IVPToHLDir = FLTX4( 1.0f, -1.0f, 1.0f, 1.0f );

 //static const fltx4 g_IVPToHLPosition = { IVP2HL(1.0f), -IVP2HL(1.0f), IVP2HL(1.0f), IVP2HL(1.0f) };

@ -680,7 +680,7 @@ bool CTraceIVP::BuildLeafmapCache( const leafmap_t * RESTRICT pLeafmap )
 #endif
 }

-static const fltx4 g_IndexBase = {0,1,2,3};
+static const fltx4 g_IndexBase =FLTX4(0,1,2,3);
 int CTraceIVP::SupportMapCached( const Vector &dir, Vector *pOut ) const
 {
 	VPROF("SupportMapCached");
--- a/vstdlib/coroutine.cpp
+++ b/vstdlib/coroutine.cpp
@ -218,7 +218,11 @@ extern "C" byte *GetStackPtr64();
 #define GetStackPtr( pStackPtr)		byte *pStackPtr = GetStackPtr64();
 #else
 #ifdef WIN32
-#define GetStackPtr( pStackPtr )	byte *pStackPtr;	__asm mov pStackPtr, esp	
+# ifdef _M_ARM
+#  define GetStackPtr( pStackPtr )	byte x; byte *pStackPtr = &x
+# else
+#  define GetStackPtr( pStackPtr )	byte *pStackPtr;	__asm mov pStackPtr, esp	
+# endif
 #elif defined(GNUC)
 // Apple's version of gcc/g++ doesn't return the expected value using the intrinsic, so 
 // do it the old fashioned way - this will also use asm on linux (since we don't compile
@ -649,7 +653,7 @@ bool Internal_Coroutine_Continue( HCoroutine hCoroutine, const char *pchDebugMsg
 	bool bInCoroutineAlready = GCoroutineMgr().IsAnyCoroutineActive();

 #ifdef _WIN32
-#ifndef _WIN64
+#if !defined( _WIN64 ) && !defined( _M_ARM )
 	// make sure nobody has a try/catch block and then yielded
 	// because we hate that and we will crash
 	uint32 topofexceptionchain;
@ -897,7 +901,7 @@ void Coroutine_YieldToMain()
 	CoroutineDbgMsg( g_fmtstr.sprintf( "Coroutine_YieldToMain() %s#%x -> %s#%x\n", coroutine.m_pchName, coroutine.m_hCoroutine, coroutinePrev.m_pchName, coroutinePrev.m_hCoroutine ) );

 #ifdef _WIN32
-#ifndef _WIN64
+#if !defined( _WIN64 ) && !defined( _M_ARM )
 	// make sure nobody has a try/catch block and then yielded
 	// because we hate that and we will crash
 	uint32 topofexceptionchain;
--- a/36
+++ b/36
@ -1,5 +1,6 @@
 #! /usr/bin/env python
 # encoding: utf-8
+# vim: noexpandtab
 # nillerusr

 from __future__ import print_function
@ -222,6 +223,8 @@ def define_platform(conf):
 			'_ALLOW_MSC_VER_MISMATCH',
 			'NO_X360_XDK'
 		])
+		if conf.env.DEST_CPU == 'arm':
+			conf.env.append_unique('DEFINES', ['__arm__=1'])
 	elif conf.env.DEST_OS == 'darwin':
 		conf.env.append_unique('DEFINES', [
 			'OSX=1', '_OSX=1',
@ -384,14 +387,16 @@ def check_deps(conf):
 		conf.check(lib='opus', uselib_store='OPUS')

 	if conf.env.DEST_OS == 'win32':
-		conf.check(lib='libz', uselib_store='ZLIB', define_name='USE_ZLIB')
-		# conf.check(lib='nvtc', uselib_store='NVTC')
-		# conf.check(lib='ati_compress_mt_vc10', uselib_store='ATI_COMPRESS_MT_VC10')
-		conf.check(lib='SDL2', uselib_store='SDL2')
-		conf.check(lib='libjpeg', uselib_store='JPEG', define_name='HAVE_JPEG')
-		conf.check(lib='libpng', uselib_store='PNG', define_name='HAVE_PNG')
-		conf.check(lib='d3dx9', uselib_store='D3DX9')
-		conf.check(lib='d3d9', uselib_store='D3D9')
+		if conf.env.DEST_CPU == 'arm':
+			conf.check(lib='d3d9', uselib_store='D3D9')
+			conf.check(lib='d3dcompiler', uselib_store='D3DCOMPILER')
+		else:
+			conf.check(lib='libz', uselib_store='ZLIB', define_name='USE_ZLIB')
+			conf.check(lib='SDL2', uselib_store='SDL2')
+			conf.check(lib='libjpeg', uselib_store='JPEG', define_name='HAVE_JPEG')
+			conf.check(lib='libpng', uselib_store='PNG', define_name='HAVE_PNG')
+			conf.check(lib='d3dx9', uselib_store='D3DX9')
+			conf.check(lib='d3d9', uselib_store='D3D9')
 		conf.check(lib='dsound', uselib_store='DSOUND')
 		conf.check(lib='dxguid', uselib_store='DXGUID')
 		if conf.options.OPUS:
@ -405,10 +410,12 @@ def configure(conf):
 	# Force XP compability, all build targets should add
 	# subsystem=bld.env.MSVC_SUBSYSTEM
 	# TODO: wrapper around bld.stlib, bld.shlib and so on?
-	conf.env.MSVC_SUBSYSTEM = 'WINDOWS,5.01'
-	conf.env.MSVC_TARGETS = ['x86'] # explicitly request x86 target for MSVC
-	if conf.options.ALLOW64:
-		conf.env.MSVC_TARGETS = ['x64']
+	conf.env.MSVC_TARGETS = ['x86' if not conf.options.ALLOW64 else 'x64']
+	if conf.env.MSVC_TARGETS[0] == 'x86':
+		conf.env.MSVC_SUBSYSTEM = 'WINDOWS,5.01'
+	else:
+		conf.env.MSVC_SUBSYSTEM = 'WINDOWS'
+
 	if sys.platform == 'win32':
 		conf.load('msvc_pdb_ext msdev msvs')
 	conf.load('subproject xcompile compiler_c compiler_cxx gitversion clang_compilation_database strip_on_install_v2 waf_unit_test enforce_pic')
@ -504,7 +511,6 @@ def configure(conf):
 	else:
 		cflags += [
 			'/I'+os.path.abspath('.')+'/thirdparty/SDL',
-			'/arch:SSE' if conf.env.DEST_CPU == 'x86' else '/arch:AVX',
 			'/GF',
 			'/Gy',
 			'/fp:fast',
@ -514,6 +520,8 @@ def configure(conf):
 			'/TP',
 			'/EHsc'
 		]
+		if conf.env.DEST_CPU != 'arm':
+			cflags += ['/arch:SSE' if conf.env.DEST_CPU == 'x86' else '/arch:AVX']

 		if conf.options.BUILD_TYPE == 'debug':
 			linkflags += [
@ -593,7 +601,7 @@ def configure(conf):
 def build(bld):
 	os.environ["CCACHE_DIR"] = os.path.abspath('.ccache/'+bld.env.COMPILER_CC+'/'+bld.env.DEST_OS+'/'+bld.env.DEST_CPU)

-	if bld.env.DEST_OS in ['win32', 'android']:
+	if bld.env.SDL and bld.env.DEST_OS in ['win32', 'android']:
 		sdl_name = 'SDL2.dll' if bld.env.DEST_OS == 'win32' else 'libSDL2.so'
 		sdl_path = os.path.join('lib', bld.env.DEST_OS, bld.env.DEST_CPU, sdl_name)
 		bld.install_files(bld.env.LIBDIR, [sdl_path])