From: Michael R. Crusoe <michael.crusoe@gmail.com>
Subject: Enable building on non-x86 architectures and on CPUs without SSE4.1

Replace the hand-rolled SSE/NEON intrinsic dispatch (raw <xmmintrin.h>,
<smmintrin.h>, <immintrin.h> includes plus sse2neon) with the SIMD
Everywhere (SIMDe) portability headers, using
SIMDE_ENABLE_NATIVE_ALIASES so the existing _mm_* calls compile
unchanged. Drop the CMake-time SSE4.1/AVX2 feature check that aborted
configuration with FATAL_ERROR, the CpuInfo runtime CPU check, and the
hardcoded -m64 flag, all of which blocked non-x86 builds. Also add an
EXE_SUFFIX cache variable so differently-optimized mmseqs executables
can be installed side by side.
--- mmseqs2.orig/src/CMakeLists.txt
+++ mmseqs2/src/CMakeLists.txt
@@ -133,21 +133,6 @@
     append_target_property(mmseqs-framework LINK_FLAGS -msse4.1)
 elseif (HAVE_NEON)
     target_compile_definitions(mmseqs-framework PUBLIC -DSSE=1 -DNEON=1)
-else ()
-    include(CheckSSEFeatures)
-    append_target_property(mmseqs-framework COMPILE_FLAGS ${SSE_FLAGS})
-    append_target_property(mmseqs-framework LINK_FLAGS ${SSE_FLAGS})
-    if (HAVE_AVX2_EXTENSIONS)
-        target_compile_definitions(mmseqs-framework PUBLIC -DAVX2=1)
-        # debugging
-        #   list(APPEND MMSEQS_DEFINITIONS -DSSE=1)
-    else ()
-        if (HAVE_SSE4_1_EXTENSIONS)
-            target_compile_definitions(mmseqs-framework PUBLIC -DSSE=1)
-        else ()
-            message(FATAL_ERROR "At least SSE4.2 is needed to compile!")
-        endif (HAVE_SSE4_1_EXTENSIONS)
-    endif (HAVE_AVX2_EXTENSIONS)
 endif ()
 
 target_link_libraries(mmseqs-framework tinyexpr zstd microtar)
@@ -261,10 +246,10 @@
     add_subdirectory(version)
     set(mmseqs_source_files mmseqs.cpp)
 
-    add_executable(mmseqs ${mmseqs_source_files})
-    mmseqs_setup_derived_target(mmseqs)
-    target_link_libraries(mmseqs version)
-    install(TARGETS mmseqs DESTINATION bin)
+    add_executable(mmseqs${EXE_SUFFIX} ${mmseqs_source_files})
+    mmseqs_setup_derived_target(mmseqs${EXE_SUFFIX})
+    target_link_libraries(mmseqs${EXE_SUFFIX} version)
+    install(TARGETS mmseqs${EXE_SUFFIX} DESTINATION bin)
 
     if (HAVE_TESTS)
         add_subdirectory(test)
--- mmseqs2.orig/lib/simd/simd.h
+++ mmseqs2/lib/simd/simd.h
@@ -50,11 +50,8 @@
 #define SSE
 #endif
 
-#ifdef NEON
-#include "sse2neon.h"
-#else
-#include <xmmintrin.h>
-#endif
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include <simde/x86/sse.h>
 
 #ifdef AVX512
 #include <zmmintrin.h.h> // AVX512
@@ -164,7 +161,7 @@
 // integer support  (usable with AVX2)
 #ifndef SIMD_INT
 #define SIMD_INT
-#include <immintrin.h> // AVX
+#include <simde/x86/avx2.h>
 #define ALIGN_INT           AVX2_ALIGN_INT
 #define VECSIZE_INT         AVX2_VECSIZE_INT
 //function header
@@ -231,7 +228,7 @@
 #endif //AVX2
 
 #ifdef AVX
-#include <immintrin.h> // AVX
+#include <simde/x86/avx.h>
 // double support (usable with AVX1)
 #ifndef SIMD_DOUBLE
 #define SIMD_DOUBLE
@@ -284,11 +281,9 @@
 #endif //AVX_SUPPORT
 
 
-#ifdef SSE
+#include <simde/x86/sse4.1.h>
 uint16_t simd_hmax16(const __m128i buffer);
 uint8_t simd_hmax8(const __m128i buffer);
-#ifndef NEON
-#include <smmintrin.h>  //SSE4.1
 // double support
 #ifndef SIMD_DOUBLE
 #define SIMD_DOUBLE
@@ -311,7 +306,6 @@
 #define simdf64_andnot(x,y) _mm_andnot_pd(x,y)
 #define simdf64_xor(x,y)    _mm_xor_pd(x,y)
 #endif //SIMD_DOUBLE
-#endif
 
 // float support
 #ifndef SIMD_FLOAT
@@ -395,40 +389,7 @@
 #define simdi32_i2f(x) 	    _mm_cvtepi32_ps(x)  // convert integer to s.p. float
 #define simdi_i2fcast(x)    _mm_castsi128_ps(x)
 #endif //SIMD_INT
-#endif //SSE
-
-#ifdef NEON
-inline uint16_t simd_hmax16(const __m128i buffer) {
-    uint16x4_t tmp;
-    tmp = vmax_u16(vget_low_u16(vreinterpretq_u16_m128i(buffer)), vget_high_u16(vreinterpretq_u16_m128i(buffer)));
-    tmp = vpmax_u16(tmp, tmp);
-    tmp = vpmax_u16(tmp, tmp);
-    return vget_lane_u16(tmp, 0);
-}
-
-inline uint8_t simd_hmax8(const __m128i buffer) {
-    uint8x8_t tmp;
-    tmp = vmax_u8(vget_low_u8(vreinterpretq_u8_m128i(buffer)), vget_high_u8(vreinterpretq_u8_m128i(buffer)));
-    tmp = vpmax_u8(tmp, tmp);
-    tmp = vpmax_u8(tmp, tmp);
-    tmp = vpmax_u8(tmp, tmp);
-    return vget_lane_u8(tmp, 0);
-}
-#if 0
-template <typename F>
-inline F simd_hmax(const F * in, unsigned int n);
-
-inline uint16_t simd_hmax16(const __m128i buffer) {
-    SIMDVec* tmp = (SIMDVec*)&buffer;
-    return simd_hmax<uint16_t>((uint16_t*)tmp->m128_u16, 8);
-}
 
-inline uint8_t simd_hmax8(const __m128i buffer) {
-    SIMDVec* tmp = (SIMDVec*)&buffer;
-    return simd_hmax<uint8_t>((uint8_t*)tmp->m128_u8, 16);
-}
-#endif
-#else
 inline uint16_t simd_hmax16(const __m128i buffer)
 {
     __m128i tmp1 = _mm_subs_epu16(_mm_set1_epi16((short)65535), buffer);
@@ -443,7 +404,6 @@
     __m128i tmp3 = _mm_minpos_epu16(tmp2);
     return (int8_t)(255 -(int8_t) _mm_cvtsi128_si32(tmp3));
 }
-#endif
 
 #ifdef AVX2
 inline uint16_t simd_hmax16_avx(const __m256i buffer){
@@ -488,7 +448,6 @@
     return 0;
 }
 #else
-#ifdef SSE
 inline unsigned short extract_epi16(__m128i v, int pos) {
     switch(pos){
         case 0: return _mm_extract_epi16(v, 0);
@@ -503,7 +462,6 @@
     return 0;
 }
 #endif
-#endif
 
 
 /* horizontal max */
@@ -608,7 +566,6 @@
 //
 //
 //TODO fix this
-#ifdef SSE
     float __attribute__((aligned(16))) res;
     __m128 P; // query 128bit SSE2 register holding 4 floats
     __m128 R;// result
@@ -637,7 +594,6 @@
     R = _mm_add_ps(R,P);
     _mm_store_ss(&res, R);
     return res;
-#endif
 //#endif
     return tj[0] * qi[0] + tj[1] * qi[1] + tj[2] * qi[2] + tj[3] * qi[3]
             + tj[4] * qi[4] + tj[5] * qi[5] + tj[6] * qi[6] + tj[7] * qi[7]
--- mmseqs2.orig/src/commons/itoa.h
+++ mmseqs2/src/commons/itoa.h
@@ -25,7 +25,8 @@
 #ifdef NEON
 #include "sse2neon.h"
 #else
-#include <emmintrin.h>
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include <simde/x86/sse2.h>
 #endif
 
 #include <stdint.h>
--- mmseqs2.orig/CMakeLists.txt
+++ mmseqs2/CMakeLists.txt
@@ -8,6 +8,7 @@
 set(HAVE_SANITIZER 0 CACHE BOOL "Have Sanitizers")
 set(INSTALL_UTIL 1 CACHE BOOL "Install util scripts")
 set(VERSION_OVERRIDE "" CACHE STRING "Override version string in help and usage messages")
+set(EXE_SUFFIX "" CACHE STRING "Suffix to add to executable names")
 
 #Sanitizers
 if (${HAVE_SANITIZER})
@@ -40,9 +41,6 @@
 
 # set flags
 set(MMSEQS_CXX_FLAGS "-std=c++0x")
-if (NOT ${HAVE_NEON})
-    set(MMSEQS_CXX_FLAGS "${MMSEQS_CXX_FLAGS} -m64")
-endif ()
 
 # Compiler-specific features
 if (CMAKE_COMPILER_IS_CLANG)
--- mmseqs2.orig/src/prefiltering/UngappedAlignment.cpp
+++ mmseqs2/src/prefiltering/UngappedAlignment.cpp
@@ -72,11 +72,9 @@
     simd_int vMaxScore     = simdi_setzero();
     const simd_int vBias   = simdi8_set(bias);
 #ifndef AVX2
-    #ifdef SSE
     const simd_int sixten  = simdi8_set(16);
     const simd_int fiveten = simdi8_set(15);
 #endif
-#endif
     for(unsigned int pos = 0; pos < seqLen; pos++){
         simd_int template01 = simdi_load((simd_int *)&dbSeq[pos*VECSIZE_INT*4]);
 #ifdef AVX2
@@ -85,7 +83,7 @@
         //        __m256i score_vec_8bit = _mm256_shuffle_epi8(score_matrix_vec01, template01);
         //        __m256i lookup_mask01  = _mm256_cmpgt_epi8(sixten, template01); // 16 > t
         //        score_vec_8bit = _mm256_and_si256(score_vec_8bit, lookup_mask01);
-#elif defined(SSE)
+#else
         // each position has 32 byte
         // 20 scores and 12 zeros
         // load score 0 - 15
@@ -96,16 +94,8 @@
         // _mm_shuffle_epi8
         // for i ... 16
         //   score01[i] = score_matrix_vec01[template01[i]%16]
-#ifdef NEON
-        __m128i score01 =vreinterpretq_m128i_u8(vqtbl1q_u8(vreinterpretq_u8_m128i(score_matrix_vec01),vreinterpretq_u8_m128i(template01)));
-#else
         __m128i score01 =_mm_shuffle_epi8(score_matrix_vec01,template01);
-#endif
-#ifdef NEON
-        __m128i score16 =vreinterpretq_m128i_u8(vqtbl1q_u8(vreinterpretq_u8_m128i(score_matrix_vec16),vreinterpretq_u8_m128i(template01)));
-#else
         __m128i score16 =_mm_shuffle_epi8(score_matrix_vec16,template01);
-#endif
         // t[i] < 16 => 0 - 15
         // example: template01: 02 15 12 18 < 16 16 16 16 => FF FF FF 00
         __m128i lookup_mask01 = _mm_cmplt_epi8(template01, sixten);
@@ -292,7 +282,7 @@
     EXTRACT_AVX(24);  EXTRACT_AVX(25);  EXTRACT_AVX(26);  EXTRACT_AVX(27);
     EXTRACT_AVX(28);  EXTRACT_AVX(29);  EXTRACT_AVX(30);  EXTRACT_AVX(31);
 #undef EXTRACT_AVX
-#elif defined(SSE)
+#else
     #define EXTRACT_SSE(i) score_arr[i] = _mm_extract_epi8(score, i)
     EXTRACT_SSE(0);  EXTRACT_SSE(1);   EXTRACT_SSE(2);  EXTRACT_SSE(3);
     EXTRACT_SSE(4);  EXTRACT_SSE(5);   EXTRACT_SSE(6);  EXTRACT_SSE(7);
--- mmseqs2.orig/src/commons/Application.cpp
+++ mmseqs2/src/commons/Application.cpp
@@ -4,10 +4,6 @@
 #include "DistanceCalculator.h"
 #include "Timer.h"
 
-#ifndef NEON
-#include <CpuInfo.h>
-#endif
-
 #include <iomanip>
 
 extern const char *binary_name;
@@ -24,30 +20,6 @@
 extern std::vector<Categories> categories;
 
 void checkCpu() {
-#ifndef NEON
-    CpuInfo info;
-    if (info.HW_x64 == false) {
-        Debug(Debug::ERROR) << "64-bit system is required to run MMseqs2.\n";
-        EXIT(EXIT_FAILURE);
-    }
-#ifdef SEE
-    if(info.HW_SSE41 == false) {
-        Debug(Debug::ERROR) << "SSE4.1 is required to run MMseqs2.\n";
-        EXIT(EXIT_FAILURE);
-    }
-#endif
-#ifdef AVX2
-    if (info.HW_AVX2 == false) {
-        Debug(Debug::ERROR) << "Your machine does not support AVX2.\n";
-        if (info.HW_SSE41 == true) {
-            Debug(Debug::ERROR) << "Please recompile with SSE4.1: cmake -DHAVE_SSE4_1=1 \n";
-        } else {
-            Debug(Debug::ERROR) << "SSE4.1 is the minimum requirement to run MMseqs2.\n";
-        }
-        EXIT(EXIT_FAILURE);
-    }
-#endif
-#endif
 }
 
 Command *getCommandByName(const char *s) {
--- mmseqs2.orig/lib/ksw2/ksw2_extz2_sse.cpp
+++ mmseqs2/lib/ksw2/ksw2_extz2_sse.cpp
@@ -31,24 +31,8 @@
 #include <assert.h>
 #include "ksw2.h"
 
-#ifdef NEON
-#include "sse2neon.h"
-#define __SSE2__
-#define KSW_SSE2_ONLY
-#endif
-
-#ifdef __SSE2__
-#ifndef NEON
-#include <emmintrin.h>
-#endif
-
-#ifdef KSW_SSE2_ONLY
-#undef __SSE4_1__
-#endif
-
-#ifdef __SSE4_1__
-#include <smmintrin.h>
-#endif
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include <simde/x86/sse4.1.h>
 
 #ifdef KSW_CPU_DISPATCH
 #ifdef __SSE4_1__
@@ -165,11 +149,7 @@
 				st = _mm_loadu_si128((__m128i*)&qrr[t]);
 				mask = _mm_or_si128(_mm_cmpeq_epi8(sq, m1_), _mm_cmpeq_epi8(st, m1_));
 				tmp = _mm_cmpeq_epi8(sq, st);
-#ifdef __SSE4_1__
 				tmp = _mm_blendv_epi8(sc_mis_, sc_mch_, tmp);
-#else
-				tmp = _mm_or_si128(_mm_andnot_si128(tmp, sc_mis_), _mm_and_si128(tmp, sc_mch_));
-#endif
 				tmp = _mm_andnot_si128(mask, tmp);
 				_mm_storeu_si128((__m128i*)((uint8_t*)s + t), tmp);
 			}
@@ -181,27 +161,14 @@
 		x1_ = _mm_cvtsi32_si128(x1);
 		v1_ = _mm_cvtsi32_si128(v1);
 		st_ = st / 16, en_ = en / 16;
-		assert(en_ - st_ + 1 <= n_col_);
 		if (!with_cigar) { // score only
 			for (t = st_; t <= en_; ++t) {
 				__m128i z, a, b, xt1, vt1, ut, tmp;
 				__dp_code_block1;
-#ifdef __SSE4_1__
 				z = _mm_max_epi8(z, a);                          // z = z > a? z : a (signed)
-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8()
-				z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_));  // z = z > 0? z : 0;
-				z = _mm_max_epu8(z, a);                          // z = max(z, a); this works because both are non-negative
-#endif
 				__dp_code_block2;
-#ifdef __SSE4_1__
 				_mm_store_si128(&x[t], _mm_max_epi8(a, zero_));
 				_mm_store_si128(&y[t], _mm_max_epi8(b, zero_));
-#else
-				tmp = _mm_cmpgt_epi8(a, zero_);
-				_mm_store_si128(&x[t], _mm_and_si128(a, tmp));
-				tmp = _mm_cmpgt_epi8(b, zero_);
-				_mm_store_si128(&y[t], _mm_and_si128(b, tmp));
-#endif
 			}
 		} else if (!(flag&KSW_EZ_RIGHT)) { // gap left-alignment
 			__m128i *pr = p + r * n_col_ - st_;
@@ -210,16 +177,9 @@
 				__m128i d, z, a, b, xt1, vt1, ut, tmp;
 				__dp_code_block1;
 				d = _mm_and_si128(_mm_cmpgt_epi8(a, z), flag1_); // d = a > z? 1 : 0
-#ifdef __SSE4_1__
 				z = _mm_max_epi8(z, a);                          // z = z > a? z : a (signed)
 				tmp = _mm_cmpgt_epi8(b, z);
 				d = _mm_blendv_epi8(d, flag2_, tmp);             // d = b > z? 2 : d
-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
-				z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_));  // z = z > 0? z : 0;
-				z = _mm_max_epu8(z, a);                          // z = max(z, a); this works because both are non-negative
-				tmp = _mm_cmpgt_epi8(b, z);
-				d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, flag2_)); // d = b > z? 2 : d; emulating blendv
-#endif
 				__dp_code_block2;
 				tmp = _mm_cmpgt_epi8(a, zero_);
 				_mm_store_si128(&x[t], _mm_and_si128(tmp, a));
@@ -236,16 +196,9 @@
 				__m128i d, z, a, b, xt1, vt1, ut, tmp;
 				__dp_code_block1;
 				d = _mm_andnot_si128(_mm_cmpgt_epi8(z, a), flag1_); // d = z > a? 0 : 1
-#ifdef __SSE4_1__
 				z = _mm_max_epi8(z, a);                          // z = z > a? z : a (signed)
 				tmp = _mm_cmpgt_epi8(z, b);
 				d = _mm_blendv_epi8(flag2_, d, tmp);             // d = z > b? d : 2
-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
-				z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_));  // z = z > 0? z : 0;
-				z = _mm_max_epu8(z, a);                          // z = max(z, a); this works because both are non-negative
-				tmp = _mm_cmpgt_epi8(z, b);
-				d = _mm_or_si128(_mm_andnot_si128(tmp, flag2_), _mm_and_si128(tmp, d)); // d = z > b? d : 2; emulating blendv
-#endif
 				__dp_code_block2;
 				tmp = _mm_cmpgt_epi8(zero_, a);
 				_mm_store_si128(&x[t], _mm_andnot_si128(tmp, a));
@@ -276,13 +229,8 @@
 					_mm_storeu_si128((__m128i*)&H[t], H1);
 					t_ = _mm_set1_epi32(t);
 					tmp = _mm_cmpgt_epi32(H1, max_H_);
-#ifdef __SSE4_1__
 					max_H_ = _mm_blendv_epi8(max_H_, H1, tmp);
 					max_t_ = _mm_blendv_epi8(max_t_, t_, tmp);
-#else
-					max_H_ = _mm_or_si128(_mm_and_si128(tmp, H1), _mm_andnot_si128(tmp, max_H_));
-					max_t_ = _mm_or_si128(_mm_and_si128(tmp, t_), _mm_andnot_si128(tmp, max_t_));
-#endif
 				}
 				_mm_storeu_si128((__m128i*)HH, max_H_);
 				_mm_storeu_si128((__m128i*)tt, max_t_);
@@ -334,4 +282,3 @@
 		kfree(km, mem2); kfree(km, off);
 	}
 }
-#endif // __SSE2__
--- mmseqs2.orig/src/commons/Util.cpp
+++ mmseqs2/src/commons/Util.cpp
@@ -620,18 +620,10 @@
 #undef c
 
     // use _mm_shuffle_epi8 to look up reverse complement
-#ifdef NEON
-    kmer1 = vreinterpretq_m128i_u8(vqtbl1q_u8(vreinterpretq_u8_m128i(lookup1),vreinterpretq_u8_m128i(kmer1)));
-#else
     kmer1 =_mm_shuffle_epi8(lookup1, kmer1);
-#endif
 
 
-#ifdef NEON
-    kmer2 = vreinterpretq_m128i_u8(vqtbl1q_u8(vreinterpretq_u8_m128i(lookup2),vreinterpretq_u8_m128i(kmer2)));
-#else
     kmer2 = _mm_shuffle_epi8(lookup2, kmer2);
-#endif
 
 
     // _mm_or_si128: bitwise OR
@@ -639,11 +631,7 @@
 
     // set upper 8 bytes to 0 and revert order of lower 8 bytes
 
-#ifdef NEON
-    x = vreinterpretq_m128i_u8(vqtbl1q_u8(vreinterpretq_u8_m128i(x),vreinterpretq_u8_m128i(upper)));
-#else
     x = _mm_shuffle_epi8(x, upper);
-#endif
 
     // shift out the unused nucleotide positions (1 <= k <=32 )
     // broadcast 128 bit to 64 bit
