From: Michael R. Crusoe <michael.crusoe@gmail.com>
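Subject: Enable building on non-x86_64 by using SIMDe

Include SIMDe's AVX2 header (with native aliases enabled) instead of
<immintrin.h>, and drop the __AVX2__/__SSE4_1__ preprocessor guards together
with the SSE4.1 code path and the stub fallbacks, so that the AVX2
implementation is compiled unconditionally.
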
--- spoa.orig/src/simd_alignment_engine.cpp
+++ spoa/src/simd_alignment_engine.cpp
@@ -9,7 +9,8 @@
 #include <limits>
 
 extern "C" {
-    #include <immintrin.h> // AVX2 and lower
+    #define SIMDE_ENABLE_NATIVE_ALIASES
+    #include <simde/x86/avx2.h> // AVX2 and lower
 }
 
 #include "spoa/graph.hpp"
@@ -43,8 +44,6 @@
 template<typename T>
 struct InstructionSet;
 
-#if defined(__AVX2__)
-
 constexpr std::uint32_t kRegisterSize = 256;
 using __mxxxi = __m256i;
 
@@ -139,99 +138,6 @@
     }
 };
 
-#elif defined(__SSE4_1__)
-
-constexpr std::uint32_t kRegisterSize = 128;
-using __mxxxi = __m128i;
-
-inline __mxxxi _mmxxx_load_si(__mxxxi const* mem_addr) {
-    return _mm_load_si128(mem_addr);
-}
-
-inline void _mmxxx_store_si(__mxxxi* mem_addr, const __mxxxi& a) {
-    _mm_store_si128(mem_addr, a);
-}
-
-inline __mxxxi _mmxxx_or_si(const __mxxxi& a, const __mxxxi& b) {
-    return _mm_or_si128(a, b);
-}
-
-#define _mmxxx_slli_si(a, n) \
-    _mm_slli_si128(a, n)
-
-#define _mmxxx_srli_si(a, n) \
-    _mm_srli_si128(a, n)
-
-template<>
-struct InstructionSet<std::int16_t> {
-    using type = std::int16_t;
-    static constexpr std::uint32_t kNumVar = kRegisterSize / (8 * sizeof(type));
-    static constexpr std::uint32_t kLogNumVar = 3;
-    static constexpr std::uint32_t kLSS = 2;
-    static constexpr std::uint32_t kRSS = 14;
-    static inline __mxxxi _mmxxx_add_epi(const __mxxxi& a, const __mxxxi& b) {
-        return _mm_add_epi16(a, b);
-    }
-    static inline __mxxxi _mmxxx_sub_epi(const __mxxxi& a, const __mxxxi& b) {
-        return _mm_sub_epi16(a, b);
-    }
-    static inline __mxxxi _mmxxx_min_epi(const __mxxxi& a, const __mxxxi& b) {
-        return _mm_min_epi16(a, b);
-    }
-    static inline __mxxxi _mmxxx_max_epi(const __mxxxi& a, const __mxxxi& b) {
-        return _mm_max_epi16(a, b);
-    }
-    static inline __mxxxi _mmxxx_set1_epi(type a) {
-        return _mm_set1_epi16(a);
-    }
-    static inline void _mmxxx_prefix_max(__mxxxi& a, const __mxxxi* masks,
-        const __mxxxi* penalties) {
-
-        a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[0], _mmxxx_slli_si(
-            _mmxxx_add_epi(a, penalties[0]), 2)));
-        a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[1], _mmxxx_slli_si(
-            _mmxxx_add_epi(a, penalties[1]), 4)));
-        a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[2], _mmxxx_slli_si(
-            _mmxxx_add_epi(a, penalties[2]), 8)));
-    }
-};
-
-template<>
-struct InstructionSet<std::int32_t> {
-    using type = std::int32_t;
-    static constexpr std::uint32_t kNumVar = kRegisterSize / (8 * sizeof(type));
-    static constexpr std::uint32_t kLogNumVar = 2;
-    static constexpr std::uint32_t kLSS = 4;
-    static constexpr std::uint32_t kRSS = 12;
-    static inline __mxxxi _mmxxx_add_epi(const __mxxxi& a, const __mxxxi& b) {
-        return _mm_add_epi32(a, b);
-    }
-    static inline __mxxxi _mmxxx_sub_epi(const __mxxxi& a, const __mxxxi& b) {
-        return _mm_sub_epi32(a, b);
-    }
-    static inline __mxxxi _mmxxx_min_epi(const __mxxxi& a, const __mxxxi& b) {
-        return _mm_min_epi32(a, b);
-    }
-    static inline __mxxxi _mmxxx_max_epi(const __mxxxi& a, const __mxxxi& b) {
-        return _mm_max_epi32(a, b);
-    }
-    static inline __mxxxi _mmxxx_set1_epi(type a) {
-        return _mm_set1_epi32(a);
-    }
-    static inline void _mmxxx_prefix_max(__mxxxi& a, const __mxxxi* masks,
-        const __mxxxi* penalties) {
-
-        a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[0], _mmxxx_slli_si(
-            _mmxxx_add_epi(a, penalties[0]), 4)));
-        a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[1], _mmxxx_slli_si(
-            _mmxxx_add_epi(a, penalties[1]), 8)));
-    }
-};
-
-#endif
-
-#if defined(__AVX2__) || defined(__SSE4_1__)
-
 template<typename T>
 void _mmxxx_print(const __mxxxi& a) {
 
@@ -288,28 +194,16 @@
     return -1;
 }
 
-#endif
-
 std::unique_ptr<AlignmentEngine> createSimdAlignmentEngine(AlignmentType type,
     AlignmentSubtype subtype, std::int8_t m, std::int8_t n, std::int8_t g,
     std::int8_t e, std::int8_t q, std::int8_t c) {
 
-#if defined(__AVX2__) || defined(__SSE4_1__)
-
     return std::unique_ptr<AlignmentEngine>(new SimdAlignmentEngine(type,
         subtype, m, n, g, e, q, c));
-
-#else
-
-    return nullptr;
-
-#endif
 }
 
 struct SimdAlignmentEngine::Implementation {
 
-#if defined(__AVX2__) || defined(__SSE4_1__)
-
     std::vector<std::uint32_t> node_id_to_rank;
 
     std::unique_ptr<__mxxxi[]> sequence_profile_storage;
@@ -342,7 +236,6 @@
             penalties(nullptr) {
     }
 
-#endif
 };
 
 SimdAlignmentEngine::SimdAlignmentEngine(AlignmentType type,
@@ -358,8 +251,6 @@
 void SimdAlignmentEngine::prealloc(std::uint32_t max_sequence_size,
     std::uint32_t alphabet_size) {
 
-#if defined(__AVX2__) || defined(__SSE4_1__)
-
     std::uint32_t longest_path = max_sequence_size * (alphabet_size + 1) + 1 +
         InstructionSet<std::int16_t>::kNumVar;
 
@@ -374,14 +265,11 @@
             alphabet_size * max_sequence_size, alphabet_size);
     }
 
-#endif
 }
 
 void SimdAlignmentEngine::realloc(std::uint32_t matrix_width,
     std::uint32_t matrix_height, std::uint32_t num_codes) {
 
-#if defined(__AVX2__) || defined(__SSE4_1__)
-
     if (pimpl_->node_id_to_rank.size() < matrix_height - 1) {
         pimpl_->node_id_to_rank.resize(matrix_height - 1, 0);
     }
@@ -453,7 +341,6 @@
         pimpl_->penalties_storage = std::unique_ptr<__mxxxi[]>(storage);
     }
 
-#endif
 }
 
 template<typename T>
@@ -461,8 +348,6 @@
     const std::unique_ptr<Graph>& graph, std::uint32_t normal_matrix_width,
     std::uint32_t matrix_width, std::uint32_t matrix_height) noexcept {
 
-#if defined(__AVX2__) || defined(__SSE4_1__)
-
     std::int32_t padding_penatly = -1 * std::max(std::max(abs(m_), abs(n_)),
         std::max(abs(g_), abs(q_)));
 
@@ -643,7 +528,6 @@
             break;
     }
 
-#endif
 }
 
 Alignment SimdAlignmentEngine::align(const char* sequence,
@@ -653,8 +537,6 @@
         return Alignment();
     }
 
-#if defined(__AVX2__) || defined(__SSE4_1__)
-
     std::uint32_t longest_path = graph->nodes().size() + 1 + sequence_size +
         InstructionSet<std::int16_t>::kNumVar;
 
@@ -680,19 +562,12 @@
 
     return Alignment();
 
-#else
-
-    return Alignment();
-
-#endif
 }
 
 template<typename T>
 Alignment SimdAlignmentEngine::linear(const char* sequence,
     std::uint32_t sequence_size, const std::unique_ptr<Graph>& graph) noexcept {
 
-#if defined(__AVX2__) || defined(__SSE4_1__)
-
     std::uint32_t normal_matrix_width = sequence_size;
     std::uint32_t matrix_width = (sequence_size + (sequence_size % T::kNumVar == 0 ?
         0 : T::kNumVar - sequence_size % T::kNumVar)) / T::kNumVar;
@@ -1027,19 +902,12 @@
     std::reverse(alignment.begin(), alignment.end());
     return alignment;
 
-#else
-
-    return Alignment();
-
-#endif
 }
 
 template<typename T>
 Alignment SimdAlignmentEngine::affine(const char* sequence,
     std::uint32_t sequence_size, const std::unique_ptr<Graph>& graph) noexcept {
 
-#if defined(__AVX2__) || defined(__SSE4_1__)
-
     std::uint32_t normal_matrix_width = sequence_size;
     std::uint32_t matrix_width = (sequence_size + (sequence_size % T::kNumVar == 0 ?
         0 : T::kNumVar - sequence_size % T::kNumVar)) / T::kNumVar;
@@ -1457,19 +1325,12 @@
     std::reverse(alignment.begin(), alignment.end());
     return alignment;
 
-#else
-
-    return Alignment();
-
-#endif
 }
 
 template<typename T>
 Alignment SimdAlignmentEngine::convex(const char* sequence,
     std::uint32_t sequence_size, const std::unique_ptr<Graph>& graph) noexcept {
 
-#if defined(__AVX2__) || defined(__SSE4_1__)
-
     std::uint32_t normal_matrix_width = sequence_size;
     std::uint32_t matrix_width = (sequence_size + (sequence_size % T::kNumVar == 0 ?
         0 : T::kNumVar - sequence_size % T::kNumVar)) / T::kNumVar;
@@ -1962,11 +1823,6 @@
 
     std::reverse(alignment.begin(), alignment.end());
     return alignment;
-#else
-
-    return Alignment();
-
-#endif
 }
 
 }
