From: Michael R. Crusoe <michael.crusoe@gmail.com>
Subject: Use the SIMD Everywhere (SIMDe) header-only library

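Always use the "AVX2" code path: SIMDe automatically provides portable
equivalents of the AVX2 intrinsics on targets without native AVX2 support.

The hard-coded -msse4 compiler flag is dropped accordingly, and the
makefiles gain an optional SFX program-name suffix; suffixed programs are
installed in libexecdir instead of bindir.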

--- a/src/mcf_simd.hh
+++ b/src/mcf_simd.hh
@@ -4,14 +4,11 @@
 #ifndef MCF_SIMD_HH
 #define MCF_SIMD_HH
 
-#if defined __SSE4_1__
-#include <immintrin.h>
-#endif
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include "simde/x86/avx2.h"
 
 namespace mcf {
 
-#if defined __AVX2__
-
 typedef __m256i SimdInt;
 
 const int simdBytes = 32;
@@ -129,133 +126,6 @@
   return _mm256_shuffle_epi8(items, choices);
 }
 
-#elif defined __SSE4_1__
-
-typedef __m128i SimdInt;
-
-const int simdBytes = 16;
-
-static inline SimdInt simdZero() {
-  return _mm_setzero_si128();
-}
-
-static inline SimdInt simdOnes() {
-  return _mm_set1_epi32(-1);
-}
-
-static inline SimdInt simdLoad(const void *p) {
-  return _mm_loadu_si128((const SimdInt *)p);
-}
-
-static inline void simdStore(void *p, SimdInt x) {
-  _mm_storeu_si128((SimdInt *)p, x);
-}
-
-static inline SimdInt simdOr(SimdInt x, SimdInt y) {
-  return _mm_or_si128(x, y);
-}
-
-static inline SimdInt simdBlend(SimdInt x, SimdInt y, SimdInt mask) {
-  return _mm_blendv_epi8(x, y, mask);  // SSE4.1
-}
-
-const int simdLen = 4;
-
-static inline SimdInt simdSet(int i3, int i2, int i1, int i0) {
-  return _mm_set_epi32(i3, i2, i1, i0);
-}
-
-static inline SimdInt simdSet1(char iF, char iE, char iD, char iC,
-			       char iB, char iA, char i9, char i8,
-			       char i7, char i6, char i5, char i4,
-			       char i3, char i2, char i1, char i0) {
-  return _mm_set_epi8(iF, iE, iD, iC, iB, iA, i9, i8,
-		      i7, i6, i5, i4, i3, i2, i1, i0);
-}
-
-static inline SimdInt simdFill(int x) {
-  return _mm_set1_epi32(x);
-}
-
-static inline SimdInt simdFill1(char x) {
-  return _mm_set1_epi8(x);
-}
-
-static inline SimdInt simdGt(SimdInt x, SimdInt y) {
-  return _mm_cmpgt_epi32(x, y);
-}
-
-static inline SimdInt simdGe1(SimdInt x, SimdInt y) {
-  return _mm_cmpeq_epi8(_mm_min_epu8(x, y), y);
-}
-
-static inline SimdInt simdAdd(SimdInt x, SimdInt y) {
-  return _mm_add_epi32(x, y);
-}
-
-static inline SimdInt simdAdd1(SimdInt x, SimdInt y) {
-  return _mm_add_epi8(x, y);
-}
-
-static inline SimdInt simdAdds1(SimdInt x, SimdInt y) {
-  return _mm_adds_epu8(x, y);
-}
-
-static inline SimdInt simdSub(SimdInt x, SimdInt y) {
-  return _mm_sub_epi32(x, y);
-}
-
-static inline SimdInt simdSub1(SimdInt x, SimdInt y) {
-  return _mm_sub_epi8(x, y);
-}
-
-static inline SimdInt simdLeft(SimdInt x, int bits) {
-  return _mm_slli_epi32(x, bits);
-}
-
-static inline SimdInt simdMax(SimdInt x, SimdInt y) {
-  return _mm_max_epi32(x, y);  // SSE4.1
-}
-
-static inline SimdInt simdMin1(SimdInt x, SimdInt y) {
-  return _mm_min_epu8(x, y);
-}
-
-static inline int simdHorizontalMax(SimdInt x) {
-  x = simdMax(x, _mm_shuffle_epi32(x, 0x4E));
-  x = simdMax(x, _mm_shuffle_epi32(x, 0xB1));
-  return _mm_cvtsi128_si32(x);
-}
-
-static inline int simdHorizontalMin1(SimdInt x) {
-  x = _mm_min_epu8(x, _mm_srli_epi16(x, 8));
-  x = _mm_minpos_epu16(x);  // SSE4.1
-  return _mm_extract_epi16(x, 0);
-}
-
-static inline SimdInt simdChoose1(SimdInt items, SimdInt choices) {
-  return _mm_shuffle_epi8(items, choices);  // SSSE3
-}
-
-#else
-
-typedef int SimdInt;
-const int simdBytes = 1;
-const int simdLen = 1;
-static inline int simdZero() { return 0; }
-static inline int simdSet(int x) { return x; }
-static inline int simdFill(int x) { return x; }
-static inline int simdLoad(const int *p) { return *p; }
-static inline void simdStore(int *p, int x) { *p = x; }
-static inline int simdGt(int x, int y) { return x > y; }
-static inline int simdAdd(int x, int y) { return x + y; }
-static inline int simdSub(int x, int y) { return x - y; }
-static inline int simdMax(int x, int y) { return x > y ? x : y; }
-static inline int simdBlend(int x, int y, int mask) { return mask ? y : x; }
-static inline int simdHorizontalMax(int a) { return a; }
-
-#endif
-
 }
 
 #endif
--- a/src/GappedXdropAligner.cc
+++ b/src/GappedXdropAligner.cc
@@ -142,17 +142,13 @@
     if (isAffine) {
       for (int i = 0; i < numCells; i += simdLen) {
 	SimdInt s = simdSet(
-#ifdef __SSE4_1__
-#ifdef __AVX2__
 			    s1[7][s2[7]],
 			    s1[6][s2[6]],
 			    s1[5][s2[5]],
 			    s1[4][s2[4]],
-#endif
 			    s1[3][s2[3]],
 			    s1[2][s2[2]],
 			    s1[1][s2[1]],
-#endif
 			    s1[0][s2[0]]);
 	SimdInt x = simdLoad(x2+i);
 	SimdInt y = simdSub(simdLoad(y1+i), mDelGrowCost);
--- a/src/GappedXdropAlignerPssm.cc
+++ b/src/GappedXdropAlignerPssm.cc
@@ -93,17 +93,13 @@
     if (isAffine) {
       for (int i = 0; i < numCells; i += simdLen) {
 	SimdInt s = simdSet(
-#ifdef __SSE4_1__
-#ifdef __AVX2__
 			    s2[-7][s1[7]],
 			    s2[-6][s1[6]],
 			    s2[-5][s1[5]],
 			    s2[-4][s1[4]],
-#endif
 			    s2[-3][s1[3]],
 			    s2[-2][s1[2]],
 			    s2[-1][s1[1]],
-#endif
 			    s2[-0][s1[0]]);
 	SimdInt x = simdLoad(x2+i);
 	SimdInt y = simdSub(simdLoad(y1+i), mDelGrowCost);
--- a/makefile
+++ b/makefile
@@ -1,16 +1,28 @@
-CXXFLAGS = -msse4 -O3 -std=c++11 -pthread -DHAS_CXX_THREADS
+CXXFLAGS += -O3 -std=c++11 -pthread -DHAS_CXX_THREADS
 all:
 	@cd src && $(MAKE) CXXFLAGS="$(CXXFLAGS)"
 
-progs = src/lastdb src/lastal src/last-split src/last-merge-batches	\
-src/last-pair-probs src/lastdb8 src/lastal8 src/last-split8
+# SFX: optional suffix appended to each program name.  Suffixed programs
+# are installed in $(libexecdir) rather than $(bindir).
+SFX :=
+progs = src/lastdb$(SFX) src/lastal$(SFX) src/last-split$(SFX) \
+	src/last-merge-batches$(SFX) src/last-pair-probs$(SFX) src/lastdb8$(SFX) \
+	src/lastal8$(SFX) src/last-split8$(SFX)
 
 prefix = /usr/local
 exec_prefix = $(prefix)
 bindir = $(exec_prefix)/bin
+libexecdir = $(exec_prefix)/libexec
+
 install: all
 	mkdir -p $(bindir)
-	cp $(progs) scripts/* $(bindir)
+	cp scripts/* $(bindir)
+ifeq (,$(SFX))
+	cp $(progs) $(bindir)
+else
+	mkdir -p $(libexecdir)
+	cp $(progs) $(libexecdir)
+endif
 
 clean:
 	@cd src && $(MAKE) clean
--- a/src/makefile
+++ b/src/makefile
@@ -1,6 +1,5 @@
-CXXFLAGS = -O3 -Wall -Wextra -Wcast-qual -Wswitch-enum -Wundef	\
+CXXFLAGS += -O3 -Wall -Wextra -Wcast-qual -Wswitch-enum -Wundef	\
 -Wcast-align -pedantic -g
-CXXFLAGS += -msse4
 CXXFLAGS += -std=c++11
 CXXFLAGS += -pthread -DHAS_CXX_THREADS
 # -Wconversion
@@ -58,8 +57,10 @@
 
 MBOBJ = last-merge-batches.o
 
-ALL = lastdb lastal last-split last-merge-batches last-pair-probs	\
-lastdb8 lastal8 last-split8
+SFX :=
+
+ALL = lastdb$(SFX) lastal$(SFX) last-split$(SFX) last-merge-batches$(SFX) \
+      last-pair-probs$(SFX) lastdb8$(SFX) lastal8$(SFX) last-split8$(SFX)
 
 indexObj8 = $(indexObj4:.o=.o8)
 alignObj8 = $(alignObj4:.o=.o8)
@@ -68,33 +69,33 @@
 all: $(ALL)
 
 indexAllObj4 = $(indexObj0) $(indexObj4)
-lastdb: $(indexAllObj4)
+lastdb$(SFX): $(indexAllObj4)
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS) -o $@ $(indexAllObj4) -lz
 
 indexAllObj8 = $(indexObj0) $(indexObj8)
-lastdb8: $(indexAllObj8)
+lastdb8$(SFX): $(indexAllObj8)
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS) -o $@ $(indexAllObj8) -lz
 
 alignAllObj4 = $(alignObj0) $(alignObj4)
-lastal: $(alignAllObj4)
+lastal$(SFX): $(alignAllObj4)
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS) -o $@ $(alignAllObj4) -lz
 
 alignAllObj8 = $(alignObj0) $(alignObj8)
-lastal8: $(alignAllObj8)
+lastal8$(SFX): $(alignAllObj8)
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS) -o $@ $(alignAllObj8) -lz
 
 splitAllObj4 = $(splitObj0) $(splitObj4)
-last-split: $(splitAllObj4)
+last-split$(SFX): $(splitAllObj4)
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS) -o $@ $(splitAllObj4)
 
 splitAllObj8 = $(splitObj0) $(splitObj8)
-last-split8: $(splitAllObj8)
+last-split8$(SFX): $(splitAllObj8)
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS) -o $@ $(splitAllObj8)
 
-last-pair-probs: $(PPOBJ)
+last-pair-probs$(SFX): $(PPOBJ)
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS) -o $@ $(PPOBJ) -lz
 
-last-merge-batches: $(MBOBJ)
+last-merge-batches$(SFX): $(MBOBJ)
 	$(CC) $(CPPFLAGS) $(CFLAGS) $(LDFLAGS) -o $@ $(MBOBJ)
 
 .SUFFIXES:
--- a/src/GappedXdropAlignerDna.cc
+++ b/src/GappedXdropAlignerDna.cc
@@ -4,8 +4,6 @@
 #include "GappedXdropAligner.hh"
 #include "GappedXdropAlignerInl.hh"
 
-#if defined __SSE4_1__
-
 //#include <iostream>  // for debugging
 
 namespace cbrc {
@@ -43,12 +41,10 @@
 
   const SimdInt scorer4x4 =
     simdSet1(
-#ifdef __AVX2__
 		 scorer[3][3], scorer[3][2], scorer[3][1], scorer[3][0],
 		 scorer[2][3], scorer[2][2], scorer[2][1], scorer[2][0],
 		 scorer[1][3], scorer[1][2], scorer[1][1], scorer[1][0],
 		 scorer[0][3], scorer[0][2], scorer[0][1], scorer[0][0],
-#endif
 		 scorer[3][3], scorer[3][2], scorer[3][1], scorer[3][0],
 		 scorer[2][3], scorer[2][2], scorer[2][1], scorer[2][0],
 		 scorer[1][3], scorer[1][2], scorer[1][1], scorer[1][0],
@@ -128,8 +124,6 @@
 
       for (int i = 0; i < numCells; i += simdBytes) {
 	SimdInt s = simdSet1(
-#ifdef __SSE4_1__
-#ifdef __AVX2__
 			     scorer[s1[31]][s2[31]],
 			     scorer[s1[30]][s2[30]],
 			     scorer[s1[29]][s2[29]],
@@ -146,7 +140,6 @@
 			     scorer[s1[18]][s2[18]],
 			     scorer[s1[17]][s2[17]],
 			     scorer[s1[16]][s2[16]],
-#endif
 			     scorer[s1[15]][s2[15]],
 			     scorer[s1[14]][s2[14]],
 			     scorer[s1[13]][s2[13]],
@@ -162,7 +155,6 @@
 			     scorer[s1[3]][s2[3]],
 			     scorer[s1[2]][s2[2]],
 			     scorer[s1[1]][s2[1]],
-#endif
 			     scorer[s1[0]][s2[0]]);
 
 	SimdInt x = simdAdds1(simdLoad(x2+i), mScoreRise12);
@@ -278,5 +270,3 @@
 }
 
 }
-
-#endif
--- a/src/Alignment.cc
+++ b/src/Alignment.cc
@@ -357,13 +357,11 @@
 				  del.openCost, del.growCost,
 				  ins.openCost, ins.growCost,
 				  gap.pairCost, gap.isAffine, maxDrop, smMax)
-#if defined __SSE4_1__
     : isSimdMatrix ? aligner.alignDna(seq1 + start1, seq2 + start2,
 				      isForward, sm,
 				      del.openCost, del.growCost,
 				      ins.openCost, ins.growCost,
 				      maxDrop, smMax, alph.numbersToUppercase)
-#endif
     :           aligner.align(seq1 + start1, seq2 + start2,
 			      isForward, globality, sm,
 			      del.openCost, del.growCost,
@@ -383,14 +381,12 @@
       while( greedyAligner.getNextChunk( end1, end2, size ) )
 	chunks.push_back( SegmentPair( end1 - size, end2 - size, size ) );
     }
-#if defined __SSE4_1__
     else if (isSimdMatrix && !pssm2 && !sm2qual) {
       while (aligner.getNextChunkDna(end1, end2, size,
 				     del.openCost, del.growCost,
 				     ins.openCost, ins.growCost))
 	chunks.push_back(SegmentPair(end1 - size, end2 - size, size));
     }
-#endif
     else {
       while( aligner.getNextChunk( end1, end2, size,
 				   del.openCost, del.growCost,
--- a/src/GappedXdropAligner.hh
+++ b/src/GappedXdropAligner.hh
@@ -352,7 +352,6 @@
   void initFrame();
 
   // Everything below here is for alignDna & getNextChunkDna
-#if defined __SSE4_1__
   std::vector<TinyScore> xTinyScores;
   std::vector<TinyScore> yTinyScores;
   std::vector<TinyScore> zTinyScores;
@@ -402,7 +401,6 @@
     while (*x2 != target) ++x2;
     bestSeq1position = x2 - x2beg + seq1beg;
   }
-#endif
 };
 
 }
