From: Michael R. Crusoe <michael.crusoe@gmail.com>
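Subject: Enable building on non-x86_64 by using SIMDe

Replace <immintrin.h> with SIMDe's portable AVX2 implementation
(simde/x86/avx2.h), drop the separate SSE4.1 code path, and remove the
__AVX2__/__SSE4_1__ preprocessor guards (and their nullptr / empty
Alignment fallbacks) so the SIMD alignment engine is always compiled.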
--- spoa.orig/src/simd_alignment_engine.cpp
+++ spoa/src/simd_alignment_engine.cpp
@@ -9,7 +9,7 @@
 #include <limits>
 
 extern "C" {
-    #include <immintrin.h> // AVX2 and lower
+    #include "simde/x86/avx2.h" // AVX2 and lower
 }
 
 #include "spoa/graph.hpp"
@@ -43,31 +43,29 @@
 template<typename T>
 struct InstructionSet;
 
-#if defined(__AVX2__)
-
 constexpr std::uint32_t kRegisterSize = 256;
-using __mxxxi = __m256i;
+using __mxxxi = simde__m256i;
 
 inline __mxxxi _mmxxx_load_si(__mxxxi const* mem_addr) {
-    return _mm256_load_si256(mem_addr);
+    return simde_mm256_load_si256(mem_addr);
 }
 
 inline void _mmxxx_store_si(__mxxxi* mem_addr, const __mxxxi& a) {
-    _mm256_store_si256(mem_addr, a);
+    simde_mm256_store_si256(mem_addr, a);
 }
 
 inline __mxxxi _mmxxx_or_si(const __mxxxi& a, const __mxxxi& b) {
-    return _mm256_or_si256(a, b);
+    return simde_mm256_or_si256(a, b);
 }
 
 #define _mmxxx_slli_si(a, n) n < 16 ? \
-    _mm256_alignr_epi8(a, _mm256_permute2x128_si256(a, a, \
-        _MM_SHUFFLE(0, 0, 2, 0)), 16 - n) : \
-    _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 0, 2, 0))
+    simde_mm256_alignr_epi8(a, simde_mm256_permute2x128_si256(a, a, \
+        SIMDE_MM_SHUFFLE(0, 0, 2, 0)), 16 - n) : \
+    simde_mm256_permute2x128_si256(a, a, SIMDE_MM_SHUFFLE(0, 0, 2, 0))
 
 #define _mmxxx_srli_si(a, n) \
-    _mm256_srli_si256(_mm256_permute2x128_si256(a, a, \
-        _MM_SHUFFLE(2, 0, 0, 1)), n - 16)
+    simde_mm256_srli_si256(simde_mm256_permute2x128_si256(a, a, \
+        SIMDE_MM_SHUFFLE(2, 0, 0, 1)), n - 16)
 
 template<>
 struct InstructionSet<std::int16_t> {
@@ -77,19 +75,19 @@
     static constexpr std::uint32_t kLSS = 2; // Left Shift Size
     static constexpr std::uint32_t kRSS = 30; // Right Shift Size
     static inline __mxxxi _mmxxx_add_epi(const __mxxxi& a, const __mxxxi& b) {
-        return _mm256_add_epi16(a, b);
+        return simde_mm256_add_epi16(a, b);
     }
     static inline __mxxxi _mmxxx_sub_epi(const __mxxxi& a, const __mxxxi& b) {
-        return _mm256_sub_epi16(a, b);
+        return simde_mm256_sub_epi16(a, b);
     }
     static inline __mxxxi _mmxxx_min_epi(const __mxxxi& a, const __mxxxi& b) {
-        return _mm256_min_epi16(a, b);
+        return simde_mm256_min_epi16(a, b);
     }
     static inline __mxxxi _mmxxx_max_epi(const __mxxxi& a, const __mxxxi& b) {
-        return _mm256_max_epi16(a, b);
+        return simde_mm256_max_epi16(a, b);
     }
     static inline __mxxxi _mmxxx_set1_epi(type a) {
-        return _mm256_set1_epi16(a);
+        return simde_mm256_set1_epi16(a);
     }
     static inline void _mmxxx_prefix_max(__mxxxi& a, const __mxxxi* masks,
         const __mxxxi* penalties) {
@@ -113,19 +111,19 @@
     static constexpr std::uint32_t kLSS = 4;
     static constexpr std::uint32_t kRSS = 28;
     static inline __mxxxi _mmxxx_add_epi(const __mxxxi& a, const __mxxxi& b) {
-        return _mm256_add_epi32(a, b);
+        return simde_mm256_add_epi32(a, b);
     }
     static inline __mxxxi _mmxxx_sub_epi(const __mxxxi& a, const __mxxxi& b) {
-        return _mm256_sub_epi32(a, b);
+        return simde_mm256_sub_epi32(a, b);
     }
     static inline __mxxxi _mmxxx_min_epi(const __mxxxi& a, const __mxxxi& b) {
-        return _mm256_min_epi32(a, b);
+        return simde_mm256_min_epi32(a, b);
     }
     static inline __mxxxi _mmxxx_max_epi(const __mxxxi& a, const __mxxxi& b) {
-        return _mm256_max_epi32(a, b);
+        return simde_mm256_max_epi32(a, b);
     }
     static inline __mxxxi _mmxxx_set1_epi(type a) {
-        return _mm256_set1_epi32(a);
+        return simde_mm256_set1_epi32(a);
     }
     static inline void _mmxxx_prefix_max(__mxxxi& a, const __mxxxi* masks,
         const __mxxxi* penalties) {
@@ -139,99 +137,6 @@
     }
 };
 
-#elif defined(__SSE4_1__)
-
-constexpr std::uint32_t kRegisterSize = 128;
-using __mxxxi = __m128i;
-
-inline __mxxxi _mmxxx_load_si(__mxxxi const* mem_addr) {
-    return _mm_load_si128(mem_addr);
-}
-
-inline void _mmxxx_store_si(__mxxxi* mem_addr, const __mxxxi& a) {
-    _mm_store_si128(mem_addr, a);
-}
-
-inline __mxxxi _mmxxx_or_si(const __mxxxi& a, const __mxxxi& b) {
-    return _mm_or_si128(a, b);
-}
-
-#define _mmxxx_slli_si(a, n) \
-    _mm_slli_si128(a, n)
-
-#define _mmxxx_srli_si(a, n) \
-    _mm_srli_si128(a, n)
-
-template<>
-struct InstructionSet<std::int16_t> {
-    using type = std::int16_t;
-    static constexpr std::uint32_t kNumVar = kRegisterSize / (8 * sizeof(type));
-    static constexpr std::uint32_t kLogNumVar = 3;
-    static constexpr std::uint32_t kLSS = 2;
-    static constexpr std::uint32_t kRSS = 14;
-    static inline __mxxxi _mmxxx_add_epi(const __mxxxi& a, const __mxxxi& b) {
-        return _mm_add_epi16(a, b);
-    }
-    static inline __mxxxi _mmxxx_sub_epi(const __mxxxi& a, const __mxxxi& b) {
-        return _mm_sub_epi16(a, b);
-    }
-    static inline __mxxxi _mmxxx_min_epi(const __mxxxi& a, const __mxxxi& b) {
-        return _mm_min_epi16(a, b);
-    }
-    static inline __mxxxi _mmxxx_max_epi(const __mxxxi& a, const __mxxxi& b) {
-        return _mm_max_epi16(a, b);
-    }
-    static inline __mxxxi _mmxxx_set1_epi(type a) {
-        return _mm_set1_epi16(a);
-    }
-    static inline void _mmxxx_prefix_max(__mxxxi& a, const __mxxxi* masks,
-        const __mxxxi* penalties) {
-
-        a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[0], _mmxxx_slli_si(
-            _mmxxx_add_epi(a, penalties[0]), 2)));
-        a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[1], _mmxxx_slli_si(
-            _mmxxx_add_epi(a, penalties[1]), 4)));
-        a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[2], _mmxxx_slli_si(
-            _mmxxx_add_epi(a, penalties[2]), 8)));
-    }
-};
-
-template<>
-struct InstructionSet<std::int32_t> {
-    using type = std::int32_t;
-    static constexpr std::uint32_t kNumVar = kRegisterSize / (8 * sizeof(type));
-    static constexpr std::uint32_t kLogNumVar = 2;
-    static constexpr std::uint32_t kLSS = 4;
-    static constexpr std::uint32_t kRSS = 12;
-    static inline __mxxxi _mmxxx_add_epi(const __mxxxi& a, const __mxxxi& b) {
-        return _mm_add_epi32(a, b);
-    }
-    static inline __mxxxi _mmxxx_sub_epi(const __mxxxi& a, const __mxxxi& b) {
-        return _mm_sub_epi32(a, b);
-    }
-    static inline __mxxxi _mmxxx_min_epi(const __mxxxi& a, const __mxxxi& b) {
-        return _mm_min_epi32(a, b);
-    }
-    static inline __mxxxi _mmxxx_max_epi(const __mxxxi& a, const __mxxxi& b) {
-        return _mm_max_epi32(a, b);
-    }
-    static inline __mxxxi _mmxxx_set1_epi(type a) {
-        return _mm_set1_epi32(a);
-    }
-    static inline void _mmxxx_prefix_max(__mxxxi& a, const __mxxxi* masks,
-        const __mxxxi* penalties) {
-
-        a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[0], _mmxxx_slli_si(
-            _mmxxx_add_epi(a, penalties[0]), 4)));
-        a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[1], _mmxxx_slli_si(
-            _mmxxx_add_epi(a, penalties[1]), 8)));
-    }
-};
-
-#endif
-
-#if defined(__AVX2__) || defined(__SSE4_1__)
-
 template<typename T>
 void _mmxxx_print(const __mxxxi& a) {
 
@@ -288,28 +193,16 @@
     return -1;
 }
 
-#endif
-
 std::unique_ptr<AlignmentEngine> createSimdAlignmentEngine(AlignmentType type,
     AlignmentSubtype subtype, std::int8_t m, std::int8_t n, std::int8_t g,
     std::int8_t e, std::int8_t q, std::int8_t c) {
 
-#if defined(__AVX2__) || defined(__SSE4_1__)
-
     return std::unique_ptr<AlignmentEngine>(new SimdAlignmentEngine(type,
         subtype, m, n, g, e, q, c));
-
-#else
-
-    return nullptr;
-
-#endif
 }
 
 struct SimdAlignmentEngine::Implementation {
 
-#if defined(__AVX2__) || defined(__SSE4_1__)
-
     std::vector<std::uint32_t> node_id_to_rank;
 
     std::unique_ptr<__mxxxi[]> sequence_profile_storage;
@@ -342,7 +235,6 @@
             penalties(nullptr) {
     }
 
-#endif
 };
 
 SimdAlignmentEngine::SimdAlignmentEngine(AlignmentType type,
@@ -358,8 +250,6 @@
 void SimdAlignmentEngine::prealloc(std::uint32_t max_sequence_size,
     std::uint32_t alphabet_size) {
 
-#if defined(__AVX2__) || defined(__SSE4_1__)
-
     std::uint32_t longest_path = max_sequence_size * (alphabet_size + 1) + 1 +
         InstructionSet<std::int16_t>::kNumVar;
 
@@ -374,14 +264,11 @@
             alphabet_size * max_sequence_size, alphabet_size);
     }
 
-#endif
 }
 
 void SimdAlignmentEngine::realloc(std::uint32_t matrix_width,
     std::uint32_t matrix_height, std::uint32_t num_codes) {
 
-#if defined(__AVX2__) || defined(__SSE4_1__)
-
     if (pimpl_->node_id_to_rank.size() < matrix_height - 1) {
         pimpl_->node_id_to_rank.resize(matrix_height - 1, 0);
     }
@@ -453,7 +340,6 @@
         pimpl_->penalties_storage = std::unique_ptr<__mxxxi[]>(storage);
     }
 
-#endif
 }
 
 template<typename T>
@@ -461,8 +347,6 @@
     const std::unique_ptr<Graph>& graph, std::uint32_t normal_matrix_width,
     std::uint32_t matrix_width, std::uint32_t matrix_height) noexcept {
 
-#if defined(__AVX2__) || defined(__SSE4_1__)
-
     std::int32_t padding_penatly = -1 * std::max(std::max(abs(m_), abs(n_)),
         std::max(abs(g_), abs(q_)));
 
@@ -643,7 +527,6 @@
             break;
     }
 
-#endif
 }
 
 Alignment SimdAlignmentEngine::align(const char* sequence,
@@ -653,8 +536,6 @@
         return Alignment();
     }
 
-#if defined(__AVX2__) || defined(__SSE4_1__)
-
     std::uint32_t longest_path = graph->nodes().size() + 1 + sequence_size +
         InstructionSet<std::int16_t>::kNumVar;
 
@@ -680,19 +561,12 @@
 
     return Alignment();
 
-#else
-
-    return Alignment();
-
-#endif
 }
 
 template<typename T>
 Alignment SimdAlignmentEngine::linear(const char* sequence,
     std::uint32_t sequence_size, const std::unique_ptr<Graph>& graph) noexcept {
 
-#if defined(__AVX2__) || defined(__SSE4_1__)
-
     std::uint32_t normal_matrix_width = sequence_size;
     std::uint32_t matrix_width = (sequence_size + (sequence_size % T::kNumVar == 0 ?
         0 : T::kNumVar - sequence_size % T::kNumVar)) / T::kNumVar;
@@ -1027,19 +901,12 @@
     std::reverse(alignment.begin(), alignment.end());
     return alignment;
 
-#else
-
-    return Alignment();
-
-#endif
 }
 
 template<typename T>
 Alignment SimdAlignmentEngine::affine(const char* sequence,
     std::uint32_t sequence_size, const std::unique_ptr<Graph>& graph) noexcept {
 
-#if defined(__AVX2__) || defined(__SSE4_1__)
-
     std::uint32_t normal_matrix_width = sequence_size;
     std::uint32_t matrix_width = (sequence_size + (sequence_size % T::kNumVar == 0 ?
         0 : T::kNumVar - sequence_size % T::kNumVar)) / T::kNumVar;
@@ -1457,19 +1324,12 @@
     std::reverse(alignment.begin(), alignment.end());
     return alignment;
 
-#else
-
-    return Alignment();
-
-#endif
 }
 
 template<typename T>
 Alignment SimdAlignmentEngine::convex(const char* sequence,
     std::uint32_t sequence_size, const std::unique_ptr<Graph>& graph) noexcept {
 
-#if defined(__AVX2__) || defined(__SSE4_1__)
-
     std::uint32_t normal_matrix_width = sequence_size;
     std::uint32_t matrix_width = (sequence_size + (sequence_size % T::kNumVar == 0 ?
         0 : T::kNumVar - sequence_size % T::kNumVar)) / T::kNumVar;
@@ -1962,11 +1822,6 @@
 
     std::reverse(alignment.begin(), alignment.end());
     return alignment;
-#else
-
-    return Alignment();
-
-#endif
 }
 
 }
