From: Michael R. Crusoe <michael.crusoe@gmail.com>
Description: Use the SIMDe header-only library for portability beyond x86.
 Replace the direct include of <emmintrin.h> and all x86 SSE2 intrinsics
 (__m128i, _mm_*) with their SIMDe equivalents (simde__m128i, simde_mm_*),
 so the SIMD alignment code also builds on non-x86 architectures.
--- hisat2.orig/aligner_bt.cpp
+++ hisat2/aligner_bt.cpp
@@ -626,7 +626,7 @@
 	const TAlScore sc_rfe = prob_.sc_->refGapExtend();
 	const bool local = !prob_.sc_->monotone;
 	const CpQuad *qup = NULL;
-	const __m128i *qlf = NULL;
+	const simde__m128i *qlf = NULL;
 	size_t per = prob_.cper_->per_;
 	ASSERT_ONLY(size_t nrow = prob_.cper_->nrow());
 	size_t ncol = prob_.cper_->ncol();
@@ -637,7 +637,7 @@
 		qup = prob_.cper_->qrows_.ptr() + (ncol * (ydiv-1)) + xdiv * per;
 	}
 	if(!left) {
-		// Set up the column pointers to point to the first __m128i word in the
+		// Set up the column pointers to point to the first simde__m128i word in the
 		// relevant column
 		size_t off = (niter << 2) * (xdiv-1);
 		qlf = prob_.cper_->qcols_.ptr() + off;
--- hisat2.orig/aligner_sw.h
+++ hisat2/aligner_sw.h
@@ -70,7 +70,7 @@
 #include <iostream>
 #include <limits>
 #include "threading.h"
-#include <emmintrin.h>
+#include "debian/include/simde/x86/sse2.h"
 #include "aligner_sw_common.h"
 #include "aligner_sw_nuc.h"
 #include "ds.h"
--- hisat2.orig/aligner_swsse.cpp
+++ hisat2/aligner_swsse.cpp
@@ -23,7 +23,7 @@
 
 /**
  * Given a number of rows (nrow), a number of columns (ncol), and the
- * number of words to fit inside a single __m128i vector, initialize the
+ * number of words to fit inside a single simde__m128i vector, initialize the
  * matrix buffer to accomodate the needed configuration of vectors.
  */
 void SSEMatrix::init(
--- hisat2.orig/aligner_swsse.h
+++ hisat2/aligner_swsse.h
@@ -83,7 +83,7 @@
  *
  * Matrix memory is laid out as follows:
  *
- * - Elements (individual cell scores) are packed into __m128i vectors
+ * - Elements (individual cell scores) are packed into simde__m128i vectors
  * - Vectors are packed into quartets, quartet elements correspond to: a vector
  *   from E, one from F, one from H, and one that's "reserved"
  * - Quartets are packed into columns, where the number of quartets is
@@ -110,7 +110,7 @@
 	/**
 	 * Return a pointer to the matrix buffer.
 	 */
-	inline __m128i *ptr() {
+	inline simde__m128i *ptr() {
 		assert(inited_);
 		return matbuf_.ptr();
 	}
@@ -119,7 +119,7 @@
 	 * Return a pointer to the E vector at the given row and column.  Note:
 	 * here row refers to rows of vectors, not rows of elements.
 	 */
-	inline __m128i* evec(size_t row, size_t col) {
+	inline simde__m128i* evec(size_t row, size_t col) {
 		assert_lt(row, nvecrow_);
 		assert_lt(col, nveccol_);
 		size_t elt = row * rowstride() + col * colstride() + E;
@@ -131,7 +131,7 @@
 	 * Like evec, but it's allowed to ask for a pointer to one column after the
 	 * final one.
 	 */
-	inline __m128i* evecUnsafe(size_t row, size_t col) {
+	inline simde__m128i* evecUnsafe(size_t row, size_t col) {
 		assert_lt(row, nvecrow_);
 		assert_leq(col, nveccol_);
 		size_t elt = row * rowstride() + col * colstride() + E;
@@ -143,7 +143,7 @@
 	 * Return a pointer to the F vector at the given row and column.  Note:
 	 * here row refers to rows of vectors, not rows of elements.
 	 */
-	inline __m128i* fvec(size_t row, size_t col) {
+	inline simde__m128i* fvec(size_t row, size_t col) {
 		assert_lt(row, nvecrow_);
 		assert_lt(col, nveccol_);
 		size_t elt = row * rowstride() + col * colstride() + F;
@@ -155,7 +155,7 @@
 	 * Return a pointer to the H vector at the given row and column.  Note:
 	 * here row refers to rows of vectors, not rows of elements.
 	 */
-	inline __m128i* hvec(size_t row, size_t col) {
+	inline simde__m128i* hvec(size_t row, size_t col) {
 		assert_lt(row, nvecrow_);
 		assert_lt(col, nveccol_);
 		size_t elt = row * rowstride() + col * colstride() + H;
@@ -167,7 +167,7 @@
 	 * Return a pointer to the TMP vector at the given row and column.  Note:
 	 * here row refers to rows of vectors, not rows of elements.
 	 */
-	inline __m128i* tmpvec(size_t row, size_t col) {
+	inline simde__m128i* tmpvec(size_t row, size_t col) {
 		assert_lt(row, nvecrow_);
 		assert_lt(col, nveccol_);
 		size_t elt = row * rowstride() + col * colstride() + TMP;
@@ -179,7 +179,7 @@
 	 * Like tmpvec, but it's allowed to ask for a pointer to one column after
 	 * the final one.
 	 */
-	inline __m128i* tmpvecUnsafe(size_t row, size_t col) {
+	inline simde__m128i* tmpvecUnsafe(size_t row, size_t col) {
 		assert_lt(row, nvecrow_);
 		assert_leq(col, nveccol_);
 		size_t elt = row * rowstride() + col * colstride() + TMP;
@@ -189,7 +189,7 @@
 	
 	/**
 	 * Given a number of rows (nrow), a number of columns (ncol), and the
-	 * number of words to fit inside a single __m128i vector, initialize the
+	 * number of words to fit inside a single simde__m128i vector, initialize the
 	 * matrix buffer to accomodate the needed configuration of vectors.
 	 */
 	void init(
@@ -198,13 +198,13 @@
 		size_t wperv);
 	
 	/**
-	 * Return the number of __m128i's you need to skip over to get from one
+	 * Return the number of simde__m128i's you need to skip over to get from one
 	 * cell to the cell one column over from it.
 	 */
 	inline size_t colstride() const { return colstride_; }
 
 	/**
-	 * Return the number of __m128i's you need to skip over to get from one
+	 * Return the number of simde__m128i's you need to skip over to get from one
 	 * cell to the cell one row down from it.
 	 */
 	inline size_t rowstride() const { return rowstride_; }
--- hisat2.orig/aligner_swsse_ee_i16.cpp
+++ hisat2/aligner_swsse_ee_i16.cpp
@@ -85,7 +85,7 @@
 	// const size_t len = rd->length();
     const size_t len = dpRows();
 	const size_t seglen = (len + (NWORDS_PER_REG-1)) / NWORDS_PER_REG;
-	// How many __m128i's are needed
+	// How many simde__m128i's are needed
 	size_t n128s =
 		64 +                    // slack bytes, for alignment?
 		(seglen * ALPHA_SIZE)   // query profile data
@@ -228,51 +228,26 @@
 #ifdef NDEBUG
 
 #define assert_all_eq0(x)
-#define assert_all_gt(x, y)
-#define assert_all_gt_lo(x)
 #define assert_all_lt(x, y)
-#define assert_all_lt_hi(x)
 
 #else
 
 #define assert_all_eq0(x) { \
-	__m128i z = _mm_setzero_si128(); \
-	__m128i tmp = _mm_setzero_si128(); \
-	z = _mm_xor_si128(z, z); \
-	tmp = _mm_cmpeq_epi16(x, z); \
-	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
-}
-
-#define assert_all_gt(x, y) { \
-	__m128i tmp = _mm_cmpgt_epi16(x, y); \
-	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
-}
-
-#define assert_all_gt_lo(x) { \
-	__m128i z = _mm_setzero_si128(); \
-	__m128i tmp = _mm_setzero_si128(); \
-	z = _mm_xor_si128(z, z); \
-	tmp = _mm_cmpgt_epi16(x, z); \
-	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
+	simde__m128i z = simde_mm_setzero_si128(); \
+	simde__m128i tmp = simde_mm_setzero_si128(); \
+	z = simde_mm_xor_si128(z, z); \
+	tmp = simde_mm_cmpeq_epi16(x, z); \
+	assert_eq(0xffff, simde_mm_movemask_epi8(tmp)); \
 }
 
 #define assert_all_lt(x, y) { \
-	__m128i tmp = _mm_cmplt_epi16(x, y); \
-	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
+	simde__m128i tmp = simde_mm_cmplt_epi16(x, y); \
+	assert_eq(0xffff, simde_mm_movemask_epi8(tmp)); \
 }
 
 #define assert_all_leq(x, y) { \
-	__m128i tmp = _mm_cmpgt_epi16(x, y); \
-	assert_eq(0x0000, _mm_movemask_epi8(tmp)); \
-}
-
-#define assert_all_lt_hi(x) { \
-	__m128i z = _mm_setzero_si128(); \
-	__m128i tmp = _mm_setzero_si128(); \
-	z = _mm_cmpeq_epi16(z, z); \
-	z = _mm_srli_epi16(z, 1); \
-	tmp = _mm_cmplt_epi16(x, z); \
-	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
+	simde__m128i tmp = simde_mm_cmpgt_epi16(x, y); \
+	assert_eq(0x0000, simde_mm_movemask_epi8(tmp)); \
 }
 #endif
 
@@ -322,8 +297,8 @@
 	// we'll call "left" and "right".
 	d.vecbuf_.resize(4 * 2 * iter);
 	d.vecbuf_.zero();
-	__m128i *vbuf_l = d.vecbuf_.ptr();
-	__m128i *vbuf_r = d.vecbuf_.ptr() + (4 * iter);
+	simde__m128i *vbuf_l = d.vecbuf_.ptr();
+	simde__m128i *vbuf_r = d.vecbuf_.ptr() + (4 * iter);
 	
 	// This is the data structure that holds candidate cells per diagonal.
 	const size_t ndiags = rff_ - rfi_ + dpRows() - 1;
@@ -356,91 +331,91 @@
 	// Much of the implmentation below is adapted from Michael's code.
 
 	// Set all elts to reference gap open penalty
-	__m128i rfgapo   = _mm_setzero_si128();
-	__m128i rfgape   = _mm_setzero_si128();
-	__m128i rdgapo   = _mm_setzero_si128();
-	__m128i rdgape   = _mm_setzero_si128();
-	__m128i vlo      = _mm_setzero_si128();
-	__m128i vhi      = _mm_setzero_si128();
-	__m128i vhilsw   = _mm_setzero_si128();
-	__m128i vlolsw   = _mm_setzero_si128();
-	__m128i ve       = _mm_setzero_si128();
-	__m128i vf       = _mm_setzero_si128();
-	__m128i vh       = _mm_setzero_si128();
-	__m128i vhd      = _mm_setzero_si128();
-	__m128i vhdtmp   = _mm_setzero_si128();
-	__m128i vtmp     = _mm_setzero_si128();
+	simde__m128i rfgapo   = simde_mm_setzero_si128();
+	simde__m128i rfgape   = simde_mm_setzero_si128();
+	simde__m128i rdgapo   = simde_mm_setzero_si128();
+	simde__m128i rdgape   = simde_mm_setzero_si128();
+	simde__m128i vlo      = simde_mm_setzero_si128();
+	simde__m128i vhi      = simde_mm_setzero_si128();
+	simde__m128i vhilsw   = simde_mm_setzero_si128();
+	simde__m128i vlolsw   = simde_mm_setzero_si128();
+	simde__m128i ve       = simde_mm_setzero_si128();
+	simde__m128i vf       = simde_mm_setzero_si128();
+	simde__m128i vh       = simde_mm_setzero_si128();
+	simde__m128i vhd      = simde_mm_setzero_si128();
+	simde__m128i vhdtmp   = simde_mm_setzero_si128();
+	simde__m128i vtmp     = simde_mm_setzero_si128();
 
 	assert_gt(sc_->refGapOpen(), 0);
 	assert_leq(sc_->refGapOpen(), MAX_I16);
-	rfgapo = _mm_insert_epi16(rfgapo, sc_->refGapOpen(), 0);
-	rfgapo = _mm_shufflelo_epi16(rfgapo, 0);
-	rfgapo = _mm_shuffle_epi32(rfgapo, 0);
+	rfgapo = simde_mm_insert_epi16(rfgapo, sc_->refGapOpen(), 0);
+	rfgapo = simde_mm_shufflelo_epi16(rfgapo, 0);
+	rfgapo = simde_mm_shuffle_epi32(rfgapo, 0);
 	
 	// Set all elts to reference gap extension penalty
 	assert_gt(sc_->refGapExtend(), 0);
 	assert_leq(sc_->refGapExtend(), MAX_I16);
 	assert_leq(sc_->refGapExtend(), sc_->refGapOpen());
-	rfgape = _mm_insert_epi16(rfgape, sc_->refGapExtend(), 0);
-	rfgape = _mm_shufflelo_epi16(rfgape, 0);
-	rfgape = _mm_shuffle_epi32(rfgape, 0);
+	rfgape = simde_mm_insert_epi16(rfgape, sc_->refGapExtend(), 0);
+	rfgape = simde_mm_shufflelo_epi16(rfgape, 0);
+	rfgape = simde_mm_shuffle_epi32(rfgape, 0);
 
 	// Set all elts to read gap open penalty
 	assert_gt(sc_->readGapOpen(), 0);
 	assert_leq(sc_->readGapOpen(), MAX_I16);
-	rdgapo = _mm_insert_epi16(rdgapo, sc_->readGapOpen(), 0);
-	rdgapo = _mm_shufflelo_epi16(rdgapo, 0);
-	rdgapo = _mm_shuffle_epi32(rdgapo, 0);
+	rdgapo = simde_mm_insert_epi16(rdgapo, sc_->readGapOpen(), 0);
+	rdgapo = simde_mm_shufflelo_epi16(rdgapo, 0);
+	rdgapo = simde_mm_shuffle_epi32(rdgapo, 0);
 	
 	// Set all elts to read gap extension penalty
 	assert_gt(sc_->readGapExtend(), 0);
 	assert_leq(sc_->readGapExtend(), MAX_I16);
 	assert_leq(sc_->readGapExtend(), sc_->readGapOpen());
-	rdgape = _mm_insert_epi16(rdgape, sc_->readGapExtend(), 0);
-	rdgape = _mm_shufflelo_epi16(rdgape, 0);
-	rdgape = _mm_shuffle_epi32(rdgape, 0);
+	rdgape = simde_mm_insert_epi16(rdgape, sc_->readGapExtend(), 0);
+	rdgape = simde_mm_shufflelo_epi16(rdgape, 0);
+	rdgape = simde_mm_shuffle_epi32(rdgape, 0);
 
 	// Set all elts to 0x8000 (min value for signed 16-bit)
-	vlo = _mm_cmpeq_epi16(vlo, vlo);             // all elts = 0xffff
-	vlo = _mm_slli_epi16(vlo, NBITS_PER_WORD-1); // all elts = 0x8000
+	vlo = simde_mm_cmpeq_epi16(vlo, vlo);             // all elts = 0xffff
+	vlo = simde_mm_slli_epi16(vlo, NBITS_PER_WORD-1); // all elts = 0x8000
 	
 	// Set all elts to 0x7fff (max value for signed 16-bit)
-	vhi = _mm_cmpeq_epi16(vhi, vhi);             // all elts = 0xffff
-	vhi = _mm_srli_epi16(vhi, 1);                // all elts = 0x7fff
+	vhi = simde_mm_cmpeq_epi16(vhi, vhi);             // all elts = 0xffff
+	vhi = simde_mm_srli_epi16(vhi, 1);                // all elts = 0x7fff
 	
 	// vlolsw: topmost (least sig) word set to 0x8000, all other words=0
-	vlolsw = _mm_shuffle_epi32(vlo, 0);
-	vlolsw = _mm_srli_si128(vlolsw, NBYTES_PER_REG - NBYTES_PER_WORD);
+	vlolsw = simde_mm_shuffle_epi32(vlo, 0);
+	vlolsw = simde_mm_srli_si128(vlolsw, NBYTES_PER_REG - NBYTES_PER_WORD);
 	
 	// vhilsw: topmost (least sig) word set to 0x7fff, all other words=0
-	vhilsw = _mm_shuffle_epi32(vhi, 0);
-	vhilsw = _mm_srli_si128(vhilsw, NBYTES_PER_REG - NBYTES_PER_WORD);
+	vhilsw = simde_mm_shuffle_epi32(vhi, 0);
+	vhilsw = simde_mm_srli_si128(vhilsw, NBYTES_PER_REG - NBYTES_PER_WORD);
 	
-	// Points to a long vector of __m128i where each element is a block of
+	// Points to a long vector of simde__m128i where each element is a block of
 	// contiguous cells in the E, F or H matrix.  If the index % 3 == 0, then
 	// the block of cells is from the E matrix.  If index % 3 == 1, they're
 	// from the F matrix.  If index % 3 == 2, then they're from the H matrix.
 	// Blocks of cells are organized in the same interleaved manner as they are
 	// calculated by the Farrar algorithm.
-	const __m128i *pvScore; // points into the query profile
+	const simde__m128i *pvScore; // points into the query profile
 
 	const size_t colstride = ROWSTRIDE_2COL * iter;
 	
 	// Initialize the H and E vectors in the first matrix column
-	__m128i *pvELeft = vbuf_l + 0; __m128i *pvERight = vbuf_r + 0;
-	/* __m128i *pvFLeft = vbuf_l + 1; */ __m128i *pvFRight = vbuf_r + 1;
-	__m128i *pvHLeft = vbuf_l + 2; __m128i *pvHRight = vbuf_r + 2;
+	simde__m128i *pvELeft = vbuf_l + 0; simde__m128i *pvERight = vbuf_r + 0;
+	/* simde__m128i *pvFLeft = vbuf_l + 1; */ simde__m128i *pvFRight = vbuf_r + 1;
+	simde__m128i *pvHLeft = vbuf_l + 2; simde__m128i *pvHRight = vbuf_r + 2;
 	
 	// Maximum score in final row
 	bool found = false;
 	TCScore lrmax = MIN_I16;
 	
 	for(size_t i = 0; i < iter; i++) {
-		_mm_store_si128(pvERight, vlo); pvERight += ROWSTRIDE_2COL;
+		simde_mm_store_si128(pvERight, vlo); pvERight += ROWSTRIDE_2COL;
 		// Could initialize Hs to high or low.  If high, cells in the lower
 		// triangle will have somewhat more legitiate scores, but still won't
 		// be exhaustively scored.
-		_mm_store_si128(pvHRight, vlo); pvHRight += ROWSTRIDE_2COL;
+		simde_mm_store_si128(pvHRight, vlo); pvHRight += ROWSTRIDE_2COL;
 	}
 	
 	assert_gt(sc_->gapbar, 0);
@@ -476,49 +451,49 @@
 		pvScore = d.profbuf_.ptr() + off; // even elts = query profile, odd = gap barrier
 		
 		// Set all cells to low value
-		vf = _mm_cmpeq_epi16(vf, vf);
-		vf = _mm_slli_epi16(vf, NBITS_PER_WORD-1);
-		vf = _mm_or_si128(vf, vlolsw);
+		vf = simde_mm_cmpeq_epi16(vf, vf);
+		vf = simde_mm_slli_epi16(vf, NBITS_PER_WORD-1);
+		vf = simde_mm_or_si128(vf, vlolsw);
 		
 		// Load H vector from the final row of the previous column
-		vh = _mm_load_si128(pvHLeft + colstride - ROWSTRIDE_2COL);
+		vh = simde_mm_load_si128(pvHLeft + colstride - ROWSTRIDE_2COL);
 		// Shift 2 bytes down so that topmost (least sig) cell gets 0
-		vh = _mm_slli_si128(vh, NBYTES_PER_WORD);
+		vh = simde_mm_slli_si128(vh, NBYTES_PER_WORD);
 		// Fill topmost (least sig) cell with high value
-		vh = _mm_or_si128(vh, vhilsw);
+		vh = simde_mm_or_si128(vh, vhilsw);
 		
 		// For each character in the reference text:
 		size_t j;
 		for(j = 0; j < iter; j++) {
 			// Load cells from E, calculated previously
-			ve = _mm_load_si128(pvELeft);
-			vhd = _mm_load_si128(pvHLeft);
+			ve = simde_mm_load_si128(pvELeft);
+			vhd = simde_mm_load_si128(pvHLeft);
 			assert_all_lt(ve, vhi);
 			pvELeft += ROWSTRIDE_2COL;
 			
 			// Store cells in F, calculated previously
-			vf = _mm_adds_epi16(vf, pvScore[1]); // veto some ref gap extensions
-			vf = _mm_adds_epi16(vf, pvScore[1]); // veto some ref gap extensions
-			_mm_store_si128(pvFRight, vf);
+			vf = simde_mm_adds_epi16(vf, pvScore[1]); // veto some ref gap extensions
+			vf = simde_mm_adds_epi16(vf, pvScore[1]); // veto some ref gap extensions
+			simde_mm_store_si128(pvFRight, vf);
 			pvFRight += ROWSTRIDE_2COL;
 			
 			// Factor in query profile (matches and mismatches)
-			vh = _mm_adds_epi16(vh, pvScore[0]);
+			vh = simde_mm_adds_epi16(vh, pvScore[0]);
 			
 			// Update H, factoring in E and F
-			vh = _mm_max_epi16(vh, vf);
+			vh = simde_mm_max_epi16(vh, vf);
 			
 			// Update vE value
 			vhdtmp = vhd;
-			vhd = _mm_subs_epi16(vhd, rdgapo);
-			vhd = _mm_adds_epi16(vhd, pvScore[1]); // veto some read gap opens
-			vhd = _mm_adds_epi16(vhd, pvScore[1]); // veto some read gap opens
-			ve = _mm_subs_epi16(ve, rdgape);
-			ve = _mm_max_epi16(ve, vhd);
-			vh = _mm_max_epi16(vh, ve);
+			vhd = simde_mm_subs_epi16(vhd, rdgapo);
+			vhd = simde_mm_adds_epi16(vhd, pvScore[1]); // veto some read gap opens
+			vhd = simde_mm_adds_epi16(vhd, pvScore[1]); // veto some read gap opens
+			ve = simde_mm_subs_epi16(ve, rdgape);
+			ve = simde_mm_max_epi16(ve, vhd);
+			vh = simde_mm_max_epi16(vh, ve);
 
 			// Save the new vH values
-			_mm_store_si128(pvHRight, vh);
+			simde_mm_store_si128(pvHRight, vh);
 			pvHRight += ROWSTRIDE_2COL;
 			vtmp = vh;
 			assert_all_lt(ve, vhi);
@@ -528,49 +503,49 @@
 			pvHLeft += ROWSTRIDE_2COL;
 
 			// Save E values
-			_mm_store_si128(pvERight, ve);
+			simde_mm_store_si128(pvERight, ve);
 			pvERight += ROWSTRIDE_2COL;
 			
 			// Update vf value
-			vtmp = _mm_subs_epi16(vtmp, rfgapo);
-			vf = _mm_subs_epi16(vf, rfgape);
+			vtmp = simde_mm_subs_epi16(vtmp, rfgapo);
+			vf = simde_mm_subs_epi16(vf, rfgape);
 			assert_all_lt(vf, vhi);
-			vf = _mm_max_epi16(vf, vtmp);
+			vf = simde_mm_max_epi16(vf, vtmp);
 			
 			pvScore += 2; // move on to next query profile / gap veto
 		}
 		// pvHStore, pvELoad, pvEStore have all rolled over to the next column
 		pvFRight -= colstride; // reset to start of column
-		vtmp = _mm_load_si128(pvFRight);
+		vtmp = simde_mm_load_si128(pvFRight);
 		
 		pvHRight -= colstride; // reset to start of column
-		vh = _mm_load_si128(pvHRight);
+		vh = simde_mm_load_si128(pvHRight);
 		
 		pvScore = d.profbuf_.ptr() + off + 1; // reset veto vector
 		
 		// vf from last row gets shifted down by one to overlay the first row
 		// rfgape has already been subtracted from it.
-		vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
-		vf = _mm_or_si128(vf, vlolsw);
+		vf = simde_mm_slli_si128(vf, NBYTES_PER_WORD);
+		vf = simde_mm_or_si128(vf, vlolsw);
 		
-		vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
-		vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
-		vf = _mm_max_epi16(vtmp, vf);
-		vtmp = _mm_cmpgt_epi16(vf, vtmp);
-		int cmp = _mm_movemask_epi8(vtmp);
+		vf = simde_mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
+		vf = simde_mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
+		vf = simde_mm_max_epi16(vtmp, vf);
+		vtmp = simde_mm_cmpgt_epi16(vf, vtmp);
+		int cmp = simde_mm_movemask_epi8(vtmp);
 		
 		// If any element of vtmp is greater than H - gap-open...
 		j = 0;
 		while(cmp != 0x0000) {
 			// Store this vf
-			_mm_store_si128(pvFRight, vf);
+			simde_mm_store_si128(pvFRight, vf);
 			pvFRight += ROWSTRIDE_2COL;
 			
 			// Update vh w/r/t new vf
-			vh = _mm_max_epi16(vh, vf);
+			vh = simde_mm_max_epi16(vh, vf);
 			
 			// Save vH values
-			_mm_store_si128(pvHRight, vh);
+			simde_mm_store_si128(pvHRight, vh);
 			pvHRight += ROWSTRIDE_2COL;
 			
 			pvScore += 2;
@@ -578,31 +553,31 @@
 			assert_lt(j, iter);
 			if(++j == iter) {
 				pvFRight -= colstride;
-				vtmp = _mm_load_si128(pvFRight);   // load next vf ASAP
+				vtmp = simde_mm_load_si128(pvFRight);   // load next vf ASAP
 				pvHRight -= colstride;
-				vh = _mm_load_si128(pvHRight);     // load next vh ASAP
+				vh = simde_mm_load_si128(pvHRight);     // load next vh ASAP
 				pvScore = d.profbuf_.ptr() + off + 1;
 				j = 0;
-				vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
-				vf = _mm_or_si128(vf, vlolsw);
+				vf = simde_mm_slli_si128(vf, NBYTES_PER_WORD);
+				vf = simde_mm_or_si128(vf, vlolsw);
 			} else {
-				vtmp = _mm_load_si128(pvFRight);   // load next vf ASAP
-				vh = _mm_load_si128(pvHRight);     // load next vh ASAP
+				vtmp = simde_mm_load_si128(pvFRight);   // load next vf ASAP
+				vh = simde_mm_load_si128(pvHRight);     // load next vh ASAP
 			}
 			
 			// Update F with another gap extension
-			vf = _mm_subs_epi16(vf, rfgape);
-			vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
-			vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
-			vf = _mm_max_epi16(vtmp, vf);
-			vtmp = _mm_cmpgt_epi16(vf, vtmp);
-			cmp = _mm_movemask_epi8(vtmp);
+			vf = simde_mm_subs_epi16(vf, rfgape);
+			vf = simde_mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
+			vf = simde_mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
+			vf = simde_mm_max_epi16(vtmp, vf);
+			vtmp = simde_mm_cmpgt_epi16(vf, vtmp);
+			cmp = simde_mm_movemask_epi8(vtmp);
 			nfixup++;
 		}
 
 		
 		// Check in the last row for the maximum so far
-		__m128i *vtmp = vbuf_r + 2 /* H */ + (d.lastIter_ * ROWSTRIDE_2COL);
+		simde__m128i *vtmp = vbuf_r + 2 /* H */ + (d.lastIter_ * ROWSTRIDE_2COL);
 		// Note: we may not want to extract from the final row
 		TCScore lr = ((TCScore*)(vtmp))[d.lastWord_];
 		found = true;
@@ -625,9 +600,9 @@
 		// Save some elements to checkpoints
 		if(checkpoint) {
 			
-			__m128i *pvE = vbuf_r + 0;
-			__m128i *pvF = vbuf_r + 1;
-			__m128i *pvH = vbuf_r + 2;
+			simde__m128i *pvE = vbuf_r + 0;
+			simde__m128i *pvF = vbuf_r + 1;
+			simde__m128i *pvH = vbuf_r + 2;
 			size_t coli = i - rfi_;
 			if(coli < cper_.locol_) cper_.locol_ = coli;
 			if(coli > cper_.hicol_) cper_.hicol_ = coli;
@@ -727,16 +702,16 @@
 					assert_gt(coli, 0);
 					size_t wordspercol = cper_.niter_ * ROWSTRIDE_2COL;
 					size_t coloff = (coli >> cper_.perpow2_) * wordspercol;
-					__m128i *dst = cper_.qcols_.ptr() + coloff;
-					memcpy(dst, vbuf_r, sizeof(__m128i) * wordspercol);
+					simde__m128i *dst = cper_.qcols_.ptr() + coloff;
+					memcpy(dst, vbuf_r, sizeof(simde__m128i) * wordspercol);
 				}
 			}
 			if(cper_.debug_) {
 				// Save the column using memcpys
 				size_t wordspercol = cper_.niter_ * ROWSTRIDE_2COL;
 				size_t coloff = coli * wordspercol;
-				__m128i *dst = cper_.qcolsD_.ptr() + coloff;
-				memcpy(dst, vbuf_r, sizeof(__m128i) * wordspercol);
+				simde__m128i *dst = cper_.qcolsD_.ptr() + coloff;
+				memcpy(dst, vbuf_r, sizeof(simde__m128i) * wordspercol);
 			}
 		}
 	}
@@ -822,104 +797,104 @@
 	// Much of the implmentation below is adapted from Michael's code.
 
 	// Set all elts to reference gap open penalty
-	__m128i rfgapo   = _mm_setzero_si128();
-	__m128i rfgape   = _mm_setzero_si128();
-	__m128i rdgapo   = _mm_setzero_si128();
-	__m128i rdgape   = _mm_setzero_si128();
-	__m128i vlo      = _mm_setzero_si128();
-	__m128i vhi      = _mm_setzero_si128();
-	__m128i vhilsw   = _mm_setzero_si128();
-	__m128i vlolsw   = _mm_setzero_si128();
-	__m128i ve       = _mm_setzero_si128();
-	__m128i vf       = _mm_setzero_si128();
-	__m128i vh       = _mm_setzero_si128();
+	simde__m128i rfgapo   = simde_mm_setzero_si128();
+	simde__m128i rfgape   = simde_mm_setzero_si128();
+	simde__m128i rdgapo   = simde_mm_setzero_si128();
+	simde__m128i rdgape   = simde_mm_setzero_si128();
+	simde__m128i vlo      = simde_mm_setzero_si128();
+	simde__m128i vhi      = simde_mm_setzero_si128();
+	simde__m128i vhilsw   = simde_mm_setzero_si128();
+	simde__m128i vlolsw   = simde_mm_setzero_si128();
+	simde__m128i ve       = simde_mm_setzero_si128();
+	simde__m128i vf       = simde_mm_setzero_si128();
+	simde__m128i vh       = simde_mm_setzero_si128();
 #if 0
-	__m128i vhd      = _mm_setzero_si128();
-	__m128i vhdtmp   = _mm_setzero_si128();
+	simde__m128i vhd      = simde_mm_setzero_si128();
+	simde__m128i vhdtmp   = simde_mm_setzero_si128();
 #endif
-	__m128i vtmp     = _mm_setzero_si128();
+	simde__m128i vtmp     = simde_mm_setzero_si128();
 
 	assert_gt(sc_->refGapOpen(), 0);
 	assert_leq(sc_->refGapOpen(), MAX_I16);
-	rfgapo = _mm_insert_epi16(rfgapo, sc_->refGapOpen(), 0);
-	rfgapo = _mm_shufflelo_epi16(rfgapo, 0);
-	rfgapo = _mm_shuffle_epi32(rfgapo, 0);
+	rfgapo = simde_mm_insert_epi16(rfgapo, sc_->refGapOpen(), 0);
+	rfgapo = simde_mm_shufflelo_epi16(rfgapo, 0);
+	rfgapo = simde_mm_shuffle_epi32(rfgapo, 0);
 	
 	// Set all elts to reference gap extension penalty
 	assert_gt(sc_->refGapExtend(), 0);
 	assert_leq(sc_->refGapExtend(), MAX_I16);
 	assert_leq(sc_->refGapExtend(), sc_->refGapOpen());
-	rfgape = _mm_insert_epi16(rfgape, sc_->refGapExtend(), 0);
-	rfgape = _mm_shufflelo_epi16(rfgape, 0);
-	rfgape = _mm_shuffle_epi32(rfgape, 0);
+	rfgape = simde_mm_insert_epi16(rfgape, sc_->refGapExtend(), 0);
+	rfgape = simde_mm_shufflelo_epi16(rfgape, 0);
+	rfgape = simde_mm_shuffle_epi32(rfgape, 0);
 
 	// Set all elts to read gap open penalty
 	assert_gt(sc_->readGapOpen(), 0);
 	assert_leq(sc_->readGapOpen(), MAX_I16);
-	rdgapo = _mm_insert_epi16(rdgapo, sc_->readGapOpen(), 0);
-	rdgapo = _mm_shufflelo_epi16(rdgapo, 0);
-	rdgapo = _mm_shuffle_epi32(rdgapo, 0);
+	rdgapo = simde_mm_insert_epi16(rdgapo, sc_->readGapOpen(), 0);
+	rdgapo = simde_mm_shufflelo_epi16(rdgapo, 0);
+	rdgapo = simde_mm_shuffle_epi32(rdgapo, 0);
 	
 	// Set all elts to read gap extension penalty
 	assert_gt(sc_->readGapExtend(), 0);
 	assert_leq(sc_->readGapExtend(), MAX_I16);
 	assert_leq(sc_->readGapExtend(), sc_->readGapOpen());
-	rdgape = _mm_insert_epi16(rdgape, sc_->readGapExtend(), 0);
-	rdgape = _mm_shufflelo_epi16(rdgape, 0);
-	rdgape = _mm_shuffle_epi32(rdgape, 0);
+	rdgape = simde_mm_insert_epi16(rdgape, sc_->readGapExtend(), 0);
+	rdgape = simde_mm_shufflelo_epi16(rdgape, 0);
+	rdgape = simde_mm_shuffle_epi32(rdgape, 0);
 
 	// Set all elts to 0x8000 (min value for signed 16-bit)
-	vlo = _mm_cmpeq_epi16(vlo, vlo);             // all elts = 0xffff
-	vlo = _mm_slli_epi16(vlo, NBITS_PER_WORD-1); // all elts = 0x8000
+	vlo = simde_mm_cmpeq_epi16(vlo, vlo);             // all elts = 0xffff
+	vlo = simde_mm_slli_epi16(vlo, NBITS_PER_WORD-1); // all elts = 0x8000
 	
 	// Set all elts to 0x7fff (max value for signed 16-bit)
-	vhi = _mm_cmpeq_epi16(vhi, vhi);             // all elts = 0xffff
-	vhi = _mm_srli_epi16(vhi, 1);                // all elts = 0x7fff
+	vhi = simde_mm_cmpeq_epi16(vhi, vhi);             // all elts = 0xffff
+	vhi = simde_mm_srli_epi16(vhi, 1);                // all elts = 0x7fff
 	
 	// vlolsw: topmost (least sig) word set to 0x8000, all other words=0
-	vlolsw = _mm_shuffle_epi32(vlo, 0);
-	vlolsw = _mm_srli_si128(vlolsw, NBYTES_PER_REG - NBYTES_PER_WORD);
+	vlolsw = simde_mm_shuffle_epi32(vlo, 0);
+	vlolsw = simde_mm_srli_si128(vlolsw, NBYTES_PER_REG - NBYTES_PER_WORD);
 	
 	// vhilsw: topmost (least sig) word set to 0x7fff, all other words=0
-	vhilsw = _mm_shuffle_epi32(vhi, 0);
-	vhilsw = _mm_srli_si128(vhilsw, NBYTES_PER_REG - NBYTES_PER_WORD);
+	vhilsw = simde_mm_shuffle_epi32(vhi, 0);
+	vhilsw = simde_mm_srli_si128(vhilsw, NBYTES_PER_REG - NBYTES_PER_WORD);
 	
-	// Points to a long vector of __m128i where each element is a block of
+	// Points to a long vector of simde__m128i where each element is a block of
 	// contiguous cells in the E, F or H matrix.  If the index % 3 == 0, then
 	// the block of cells is from the E matrix.  If index % 3 == 1, they're
 	// from the F matrix.  If index % 3 == 2, then they're from the H matrix.
 	// Blocks of cells are organized in the same interleaved manner as they are
 	// calculated by the Farrar algorithm.
-	const __m128i *pvScore; // points into the query profile
+	const simde__m128i *pvScore; // points into the query profile
 
 	d.mat_.init(dpRows(), rff_ - rfi_, NWORDS_PER_REG);
 	const size_t colstride = d.mat_.colstride();
 	assert_eq(ROWSTRIDE, colstride / iter);
 	
 	// Initialize the H and E vectors in the first matrix column
-	__m128i *pvHTmp = d.mat_.tmpvec(0, 0);
-	__m128i *pvETmp = d.mat_.evec(0, 0);
+	simde__m128i *pvHTmp = d.mat_.tmpvec(0, 0);
+	simde__m128i *pvETmp = d.mat_.evec(0, 0);
 	
 	// Maximum score in final row
 	bool found = false;
 	TCScore lrmax = MIN_I16;
 	
 	for(size_t i = 0; i < iter; i++) {
-		_mm_store_si128(pvETmp, vlo);
+		simde_mm_store_si128(pvETmp, vlo);
 		// Could initialize Hs to high or low.  If high, cells in the lower
 		// triangle will have somewhat more legitiate scores, but still won't
 		// be exhaustively scored.
-		_mm_store_si128(pvHTmp, vlo);
+		simde_mm_store_si128(pvHTmp, vlo);
 		pvETmp += ROWSTRIDE;
 		pvHTmp += ROWSTRIDE;
 	}
 	// These are swapped just before the innermost loop
-	__m128i *pvHStore = d.mat_.hvec(0, 0);
-	__m128i *pvHLoad  = d.mat_.tmpvec(0, 0);
-	__m128i *pvELoad  = d.mat_.evec(0, 0);
-	__m128i *pvEStore = d.mat_.evecUnsafe(0, 1);
-	__m128i *pvFStore = d.mat_.fvec(0, 0);
-	__m128i *pvFTmp   = NULL;
+	simde__m128i *pvHStore = d.mat_.hvec(0, 0);
+	simde__m128i *pvHLoad  = d.mat_.tmpvec(0, 0);
+	simde__m128i *pvELoad  = d.mat_.evec(0, 0);
+	simde__m128i *pvEStore = d.mat_.evecUnsafe(0, 1);
+	simde__m128i *pvFStore = d.mat_.fvec(0, 0);
+	simde__m128i *pvFTmp   = NULL;
 	
 	assert_gt(sc_->gapbar, 0);
 	size_t nfixup = 0;
@@ -950,60 +925,60 @@
 		pvScore = d.profbuf_.ptr() + off; // even elts = query profile, odd = gap barrier
 		
 		// Set all cells to low value
-		vf = _mm_cmpeq_epi16(vf, vf);
-		vf = _mm_slli_epi16(vf, NBITS_PER_WORD-1);
-		vf = _mm_or_si128(vf, vlolsw);
+		vf = simde_mm_cmpeq_epi16(vf, vf);
+		vf = simde_mm_slli_epi16(vf, NBITS_PER_WORD-1);
+		vf = simde_mm_or_si128(vf, vlolsw);
 		
 		// Load H vector from the final row of the previous column
-		vh = _mm_load_si128(pvHLoad + colstride - ROWSTRIDE);
+		vh = simde_mm_load_si128(pvHLoad + colstride - ROWSTRIDE);
 		// Shift 2 bytes down so that topmost (least sig) cell gets 0
-		vh = _mm_slli_si128(vh, NBYTES_PER_WORD);
+		vh = simde_mm_slli_si128(vh, NBYTES_PER_WORD);
 		// Fill topmost (least sig) cell with high value
-		vh = _mm_or_si128(vh, vhilsw);
+		vh = simde_mm_or_si128(vh, vhilsw);
 		
 		// For each character in the reference text:
 		size_t j;
 		for(j = 0; j < iter; j++) {
 			// Load cells from E, calculated previously
-			ve = _mm_load_si128(pvELoad);
+			ve = simde_mm_load_si128(pvELoad);
 #if 0
-			vhd = _mm_load_si128(pvHLoad);
+			vhd = simde_mm_load_si128(pvHLoad);
 #endif
 			assert_all_lt(ve, vhi);
 			pvELoad += ROWSTRIDE;
 			
 			// Store cells in F, calculated previously
-			vf = _mm_adds_epi16(vf, pvScore[1]); // veto some ref gap extensions
-			vf = _mm_adds_epi16(vf, pvScore[1]); // veto some ref gap extensions
-			_mm_store_si128(pvFStore, vf);
+			vf = simde_mm_adds_epi16(vf, pvScore[1]); // veto some ref gap extensions
+			vf = simde_mm_adds_epi16(vf, pvScore[1]); // veto some ref gap extensions
+			simde_mm_store_si128(pvFStore, vf);
 			pvFStore += ROWSTRIDE;
 			
 			// Factor in query profile (matches and mismatches)
-			vh = _mm_adds_epi16(vh, pvScore[0]);
+			vh = simde_mm_adds_epi16(vh, pvScore[0]);
 			
 			// Update H, factoring in E and F
-			vh = _mm_max_epi16(vh, ve);
-			vh = _mm_max_epi16(vh, vf);
+			vh = simde_mm_max_epi16(vh, ve);
+			vh = simde_mm_max_epi16(vh, vf);
 			
 			// Save the new vH values
-			_mm_store_si128(pvHStore, vh);
+			simde_mm_store_si128(pvHStore, vh);
 			pvHStore += ROWSTRIDE;
 			
 			// Update vE value
 			vtmp = vh;
 #if 0
 			vhdtmp = vhd;
-			vhd = _mm_subs_epi16(vhd, rdgapo);
-			vhd = _mm_adds_epi16(vhd, pvScore[1]); // veto some read gap opens
-			vhd = _mm_adds_epi16(vhd, pvScore[1]); // veto some read gap opens
-			ve = _mm_subs_epi16(ve, rdgape);
-			ve = _mm_max_epi16(ve, vhd);
+			vhd = simde_mm_subs_epi16(vhd, rdgapo);
+			vhd = simde_mm_adds_epi16(vhd, pvScore[1]); // veto some read gap opens
+			vhd = simde_mm_adds_epi16(vhd, pvScore[1]); // veto some read gap opens
+			ve = simde_mm_subs_epi16(ve, rdgape);
+			ve = simde_mm_max_epi16(ve, vhd);
 #else
-			vh = _mm_subs_epi16(vh, rdgapo);
-			vh = _mm_adds_epi16(vh, pvScore[1]); // veto some read gap opens
-			vh = _mm_adds_epi16(vh, pvScore[1]); // veto some read gap opens
-			ve = _mm_subs_epi16(ve, rdgape);
-			ve = _mm_max_epi16(ve, vh);
+			vh = simde_mm_subs_epi16(vh, rdgapo);
+			vh = simde_mm_adds_epi16(vh, pvScore[1]); // veto some read gap opens
+			vh = simde_mm_adds_epi16(vh, pvScore[1]); // veto some read gap opens
+			ve = simde_mm_subs_epi16(ve, rdgape);
+			ve = simde_mm_max_epi16(ve, vh);
 #endif
 			assert_all_lt(ve, vhi);
 			
@@ -1011,34 +986,34 @@
 #if 0
 			vh = vhdtmp;
 #else
-			vh = _mm_load_si128(pvHLoad);
+			vh = simde_mm_load_si128(pvHLoad);
 #endif
 			pvHLoad += ROWSTRIDE;
 			
 			// Save E values
-			_mm_store_si128(pvEStore, ve);
+			simde_mm_store_si128(pvEStore, ve);
 			pvEStore += ROWSTRIDE;
 			
 			// Update vf value
-			vtmp = _mm_subs_epi16(vtmp, rfgapo);
-			vf = _mm_subs_epi16(vf, rfgape);
+			vtmp = simde_mm_subs_epi16(vtmp, rfgapo);
+			vf = simde_mm_subs_epi16(vf, rfgape);
 			assert_all_lt(vf, vhi);
-			vf = _mm_max_epi16(vf, vtmp);
+			vf = simde_mm_max_epi16(vf, vtmp);
 			
 			pvScore += 2; // move on to next query profile / gap veto
 		}
 		// pvHStore, pvELoad, pvEStore have all rolled over to the next column
 		pvFTmp = pvFStore;
 		pvFStore -= colstride; // reset to start of column
-		vtmp = _mm_load_si128(pvFStore);
+		vtmp = simde_mm_load_si128(pvFStore);
 		
 		pvHStore -= colstride; // reset to start of column
-		vh = _mm_load_si128(pvHStore);
+		vh = simde_mm_load_si128(pvHStore);
 		
 #if 0
 #else
 		pvEStore -= colstride; // reset to start of column
-		ve = _mm_load_si128(pvEStore);
+		ve = simde_mm_load_si128(pvEStore);
 #endif
 		
 		pvHLoad = pvHStore;    // new pvHLoad = pvHStore
@@ -1046,37 +1021,37 @@
 		
 		// vf from last row gets shifted down by one to overlay the first row
 		// rfgape has already been subtracted from it.
-		vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
-		vf = _mm_or_si128(vf, vlolsw);
+		vf = simde_mm_slli_si128(vf, NBYTES_PER_WORD);
+		vf = simde_mm_or_si128(vf, vlolsw);
 		
-		vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
-		vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
-		vf = _mm_max_epi16(vtmp, vf);
-		vtmp = _mm_cmpgt_epi16(vf, vtmp);
-		int cmp = _mm_movemask_epi8(vtmp);
+		vf = simde_mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
+		vf = simde_mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
+		vf = simde_mm_max_epi16(vtmp, vf);
+		vtmp = simde_mm_cmpgt_epi16(vf, vtmp);
+		int cmp = simde_mm_movemask_epi8(vtmp);
 		
 		// If any element of vtmp is greater than H - gap-open...
 		j = 0;
 		while(cmp != 0x0000) {
 			// Store this vf
-			_mm_store_si128(pvFStore, vf);
+			simde_mm_store_si128(pvFStore, vf);
 			pvFStore += ROWSTRIDE;
 			
 			// Update vh w/r/t new vf
-			vh = _mm_max_epi16(vh, vf);
+			vh = simde_mm_max_epi16(vh, vf);
 			
 			// Save vH values
-			_mm_store_si128(pvHStore, vh);
+			simde_mm_store_si128(pvHStore, vh);
 			pvHStore += ROWSTRIDE;
 			
 			// Update E in case it can be improved using our new vh
 #if 0
 #else
-			vh = _mm_subs_epi16(vh, rdgapo);
-			vh = _mm_adds_epi16(vh, *pvScore); // veto some read gap opens
-			vh = _mm_adds_epi16(vh, *pvScore); // veto some read gap opens
-			ve = _mm_max_epi16(ve, vh);
-			_mm_store_si128(pvEStore, ve);
+			vh = simde_mm_subs_epi16(vh, rdgapo);
+			vh = simde_mm_adds_epi16(vh, *pvScore); // veto some read gap opens
+			vh = simde_mm_adds_epi16(vh, *pvScore); // veto some read gap opens
+			ve = simde_mm_max_epi16(ve, vh);
+			simde_mm_store_si128(pvEStore, ve);
 			pvEStore += ROWSTRIDE;
 #endif
 			pvScore += 2;
@@ -1084,34 +1059,34 @@
 			assert_lt(j, iter);
 			if(++j == iter) {
 				pvFStore -= colstride;
-				vtmp = _mm_load_si128(pvFStore);   // load next vf ASAP
+				vtmp = simde_mm_load_si128(pvFStore);   // load next vf ASAP
 				pvHStore -= colstride;
-				vh = _mm_load_si128(pvHStore);     // load next vh ASAP
+				vh = simde_mm_load_si128(pvHStore);     // load next vh ASAP
 #if 0
 #else
 				pvEStore -= colstride;
-				ve = _mm_load_si128(pvEStore);     // load next ve ASAP
+				ve = simde_mm_load_si128(pvEStore);     // load next ve ASAP
 #endif
 				pvScore = d.profbuf_.ptr() + off + 1;
 				j = 0;
-				vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
-				vf = _mm_or_si128(vf, vlolsw);
+				vf = simde_mm_slli_si128(vf, NBYTES_PER_WORD);
+				vf = simde_mm_or_si128(vf, vlolsw);
 			} else {
-				vtmp = _mm_load_si128(pvFStore);   // load next vf ASAP
-				vh = _mm_load_si128(pvHStore);     // load next vh ASAP
+				vtmp = simde_mm_load_si128(pvFStore);   // load next vf ASAP
+				vh = simde_mm_load_si128(pvHStore);     // load next vh ASAP
 #if 0
 #else
-				ve = _mm_load_si128(pvEStore);     // load next vh ASAP
+				ve = simde_mm_load_si128(pvEStore);     // load next ve ASAP
 #endif
 			}
 			
 			// Update F with another gap extension
-			vf = _mm_subs_epi16(vf, rfgape);
-			vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
-			vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
-			vf = _mm_max_epi16(vtmp, vf);
-			vtmp = _mm_cmpgt_epi16(vf, vtmp);
-			cmp = _mm_movemask_epi8(vtmp);
+			vf = simde_mm_subs_epi16(vf, rfgape);
+			vf = simde_mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
+			vf = simde_mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
+			vf = simde_mm_max_epi16(vtmp, vf);
+			vtmp = simde_mm_cmpgt_epi16(vf, vtmp);
+			cmp = simde_mm_movemask_epi8(vtmp);
 			nfixup++;
 		}
 
@@ -1132,7 +1107,7 @@
 		}
 #endif
 		
-		__m128i *vtmp = d.mat_.hvec(d.lastIter_, i-rfi_);
+		simde__m128i *vtmp = d.mat_.hvec(d.lastIter_, i-rfi_);
 		// Note: we may not want to extract from the final row
 		TCScore lr = ((TCScore*)(vtmp))[d.lastWord_];
 		found = true;
@@ -1228,7 +1203,7 @@
 	assert(!d.profbuf_.empty());
 	const size_t colstride = d.mat_.colstride();
 	ASSERT_ONLY(bool sawbest = false);
-	__m128i *pvH = d.mat_.hvec(d.lastIter_, 0);
+	simde__m128i *pvH = d.mat_.hvec(d.lastIter_, 0);
 	for(size_t j = 0; j < ncol; j++) {
 		TAlScore sc = (TAlScore)(((TCScore*)pvH)[d.lastWord_] - 0x7fff);
 		assert_leq(sc, best);
@@ -1354,7 +1329,7 @@
 	size_t rowelt, rowvec, eltvec;
 	size_t left_rowelt, up_rowelt, upleft_rowelt;
 	size_t left_rowvec, up_rowvec, upleft_rowvec;
-	__m128i *cur_vec, *left_vec, *up_vec, *upleft_vec;
+	simde__m128i *cur_vec, *left_vec, *up_vec, *upleft_vec;
 	NEW_ROW_COL(row, col);
 	while((int)row >= 0) {
 		met.btcell++;
--- hisat2.orig/aligner_swsse_ee_u8.cpp
+++ hisat2/aligner_swsse_ee_u8.cpp
@@ -85,7 +85,7 @@
 	// const size_t len = rd->length();
     const size_t len = dpRows();
 	const size_t seglen = (len + (NWORDS_PER_REG-1)) / NWORDS_PER_REG;
-	// How many __m128i's are needed
+	// How many simde__m128i's are needed
 	size_t n128s =
 		64 +                    // slack bytes, for alignment?
 		(seglen * ALPHA_SIZE)   // query profile data
@@ -231,49 +231,24 @@
 #ifdef NDEBUG
 
 #define assert_all_eq0(x)
-#define assert_all_gt(x, y)
-#define assert_all_gt_lo(x)
 #define assert_all_lt(x, y)
-#define assert_all_lt_hi(x)
 
 #else
 
 #define assert_all_eq0(x) { \
-	__m128i z = _mm_setzero_si128(); \
-	__m128i tmp = _mm_setzero_si128(); \
-	z = _mm_xor_si128(z, z); \
-	tmp = _mm_cmpeq_epi16(x, z); \
-	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
-}
-
-#define assert_all_gt(x, y) { \
-	__m128i tmp = _mm_cmpgt_epu8(x, y); \
-	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
-}
-
-#define assert_all_gt_lo(x) { \
-	__m128i z = _mm_setzero_si128(); \
-	__m128i tmp = _mm_setzero_si128(); \
-	z = _mm_xor_si128(z, z); \
-	tmp = _mm_cmpgt_epu8(x, z); \
-	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
+	simde__m128i z = simde_mm_setzero_si128(); \
+	simde__m128i tmp = simde_mm_setzero_si128(); \
+	z = simde_mm_xor_si128(z, z); \
+	tmp = simde_mm_cmpeq_epi16(x, z); \
+	assert_eq(0xffff, simde_mm_movemask_epi8(tmp)); \
 }
 
 #define assert_all_lt(x, y) { \
-	__m128i z = _mm_setzero_si128(); \
-	z = _mm_xor_si128(z, z); \
-	__m128i tmp = _mm_subs_epu8(y, x); \
-	tmp = _mm_cmpeq_epi16(tmp, z); \
-	assert_eq(0x0000, _mm_movemask_epi8(tmp)); \
-}
-
-#define assert_all_lt_hi(x) { \
-	__m128i z = _mm_setzero_si128(); \
-	__m128i tmp = _mm_setzero_si128(); \
-	z = _mm_cmpeq_epu8(z, z); \
-	z = _mm_srli_epu8(z, 1); \
-	tmp = _mm_cmplt_epu8(x, z); \
-	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
+	simde__m128i z = simde_mm_setzero_si128(); \
+	z = simde_mm_xor_si128(z, z); \
+	simde__m128i tmp = simde_mm_subs_epu8(y, x); \
+	tmp = simde_mm_cmpeq_epi16(tmp, z); \
+	assert_eq(0x0000, simde_mm_movemask_epi8(tmp)); \
 }
 #endif
 
@@ -325,8 +300,8 @@
 	// we'll call "left" and "right".
 	d.vecbuf_.resize(4 * 2 * iter);
 	d.vecbuf_.zero();
-	__m128i *vbuf_l = d.vecbuf_.ptr();
-	__m128i *vbuf_r = d.vecbuf_.ptr() + (4 * iter);
+	simde__m128i *vbuf_l = d.vecbuf_.ptr();
+	simde__m128i *vbuf_r = d.vecbuf_.ptr() + (4 * iter);
 
 	// This is the data structure that holds candidate cells per diagonal.
 	const size_t ndiags = rff_ - rfi_ + dpRows() - 1;
@@ -359,86 +334,86 @@
 	// Much of the implmentation below is adapted from Michael's code.
 
 	// Set all elts to reference gap open penalty
-	__m128i rfgapo   = _mm_setzero_si128();
-	__m128i rfgape   = _mm_setzero_si128();
-	__m128i rdgapo   = _mm_setzero_si128();
-	__m128i rdgape   = _mm_setzero_si128();
-	__m128i vlo      = _mm_setzero_si128();
-	__m128i vhi      = _mm_setzero_si128();
-	__m128i ve       = _mm_setzero_si128();
-	__m128i vf       = _mm_setzero_si128();
-	__m128i vh       = _mm_setzero_si128();
-	__m128i vhd      = _mm_setzero_si128();
-	__m128i vhdtmp   = _mm_setzero_si128();
-	__m128i vtmp     = _mm_setzero_si128();
-	__m128i vzero    = _mm_setzero_si128();
-	__m128i vhilsw   = _mm_setzero_si128();
+	simde__m128i rfgapo   = simde_mm_setzero_si128();
+	simde__m128i rfgape   = simde_mm_setzero_si128();
+	simde__m128i rdgapo   = simde_mm_setzero_si128();
+	simde__m128i rdgape   = simde_mm_setzero_si128();
+	simde__m128i vlo      = simde_mm_setzero_si128();
+	simde__m128i vhi      = simde_mm_setzero_si128();
+	simde__m128i ve       = simde_mm_setzero_si128();
+	simde__m128i vf       = simde_mm_setzero_si128();
+	simde__m128i vh       = simde_mm_setzero_si128();
+	simde__m128i vhd      = simde_mm_setzero_si128();
+	simde__m128i vhdtmp   = simde_mm_setzero_si128();
+	simde__m128i vtmp     = simde_mm_setzero_si128();
+	simde__m128i vzero    = simde_mm_setzero_si128();
+	simde__m128i vhilsw   = simde_mm_setzero_si128();
 
 	assert_gt(sc_->refGapOpen(), 0);
 	assert_leq(sc_->refGapOpen(), MAX_U8);
 	dup = (sc_->refGapOpen() << 8) | (sc_->refGapOpen() & 0x00ff);
-	rfgapo = _mm_insert_epi16(rfgapo, dup, 0);
-	rfgapo = _mm_shufflelo_epi16(rfgapo, 0);
-	rfgapo = _mm_shuffle_epi32(rfgapo, 0);
+	rfgapo = simde_mm_insert_epi16(rfgapo, dup, 0);
+	rfgapo = simde_mm_shufflelo_epi16(rfgapo, 0);
+	rfgapo = simde_mm_shuffle_epi32(rfgapo, 0);
 	
 	// Set all elts to reference gap extension penalty
 	assert_gt(sc_->refGapExtend(), 0);
 	assert_leq(sc_->refGapExtend(), MAX_U8);
 	assert_leq(sc_->refGapExtend(), sc_->refGapOpen());
 	dup = (sc_->refGapExtend() << 8) | (sc_->refGapExtend() & 0x00ff);
-	rfgape = _mm_insert_epi16(rfgape, dup, 0);
-	rfgape = _mm_shufflelo_epi16(rfgape, 0);
-	rfgape = _mm_shuffle_epi32(rfgape, 0);
+	rfgape = simde_mm_insert_epi16(rfgape, dup, 0);
+	rfgape = simde_mm_shufflelo_epi16(rfgape, 0);
+	rfgape = simde_mm_shuffle_epi32(rfgape, 0);
 
 	// Set all elts to read gap open penalty
 	assert_gt(sc_->readGapOpen(), 0);
 	assert_leq(sc_->readGapOpen(), MAX_U8);
 	dup = (sc_->readGapOpen() << 8) | (sc_->readGapOpen() & 0x00ff);
-	rdgapo = _mm_insert_epi16(rdgapo, dup, 0);
-	rdgapo = _mm_shufflelo_epi16(rdgapo, 0);
-	rdgapo = _mm_shuffle_epi32(rdgapo, 0);
+	rdgapo = simde_mm_insert_epi16(rdgapo, dup, 0);
+	rdgapo = simde_mm_shufflelo_epi16(rdgapo, 0);
+	rdgapo = simde_mm_shuffle_epi32(rdgapo, 0);
 	
 	// Set all elts to read gap extension penalty
 	assert_gt(sc_->readGapExtend(), 0);
 	assert_leq(sc_->readGapExtend(), MAX_U8);
 	assert_leq(sc_->readGapExtend(), sc_->readGapOpen());
 	dup = (sc_->readGapExtend() << 8) | (sc_->readGapExtend() & 0x00ff);
-	rdgape = _mm_insert_epi16(rdgape, dup, 0);
-	rdgape = _mm_shufflelo_epi16(rdgape, 0);
-	rdgape = _mm_shuffle_epi32(rdgape, 0);
+	rdgape = simde_mm_insert_epi16(rdgape, dup, 0);
+	rdgape = simde_mm_shufflelo_epi16(rdgape, 0);
+	rdgape = simde_mm_shuffle_epi32(rdgape, 0);
 	
-	vhi = _mm_cmpeq_epi16(vhi, vhi); // all elts = 0xffff
-	vlo = _mm_xor_si128(vlo, vlo);   // all elts = 0
+	vhi = simde_mm_cmpeq_epi16(vhi, vhi); // all elts = 0xffff
+	vlo = simde_mm_xor_si128(vlo, vlo);   // all elts = 0
 	
 	// vhilsw: topmost (least sig) word set to 0x7fff, all other words=0
-	vhilsw = _mm_shuffle_epi32(vhi, 0);
-	vhilsw = _mm_srli_si128(vhilsw, NBYTES_PER_REG - NBYTES_PER_WORD);
+	vhilsw = simde_mm_shuffle_epi32(vhi, 0);
+	vhilsw = simde_mm_srli_si128(vhilsw, NBYTES_PER_REG - NBYTES_PER_WORD);
 	
-	// Points to a long vector of __m128i where each element is a block of
+	// Points to a long vector of simde__m128i where each element is a block of
 	// contiguous cells in the E, F or H matrix.  If the index % 3 == 0, then
 	// the block of cells is from the E matrix.  If index % 3 == 1, they're
 	// from the F matrix.  If index % 3 == 2, then they're from the H matrix.
 	// Blocks of cells are organized in the same interleaved manner as they are
 	// calculated by the Farrar algorithm.
-	const __m128i *pvScore; // points into the query profile
+	const simde__m128i *pvScore; // points into the query profile
 
 	const size_t colstride = ROWSTRIDE_2COL * iter;
 	
 	// Initialize the H and E vectors in the first matrix column
-	__m128i *pvELeft = vbuf_l + 0; __m128i *pvERight = vbuf_r + 0;
-	/* __m128i *pvFLeft = vbuf_l + 1; */ __m128i *pvFRight = vbuf_r + 1;
-	__m128i *pvHLeft = vbuf_l + 2; __m128i *pvHRight = vbuf_r + 2;
+	simde__m128i *pvELeft = vbuf_l + 0; simde__m128i *pvERight = vbuf_r + 0;
+	/* simde__m128i *pvFLeft = vbuf_l + 1; */ simde__m128i *pvFRight = vbuf_r + 1;
+	simde__m128i *pvHLeft = vbuf_l + 2; simde__m128i *pvHRight = vbuf_r + 2;
 	
 	// Maximum score in final row
 	bool found = false;
 	TCScore lrmax = MIN_U8;
 	
 	for(size_t i = 0; i < iter; i++) {
-		_mm_store_si128(pvERight, vlo); pvERight += ROWSTRIDE_2COL;
+		simde_mm_store_si128(pvERight, vlo); pvERight += ROWSTRIDE_2COL;
 		// Could initialize Hs to high or low.  If high, cells in the lower
 		// triangle will have somewhat more legitiate scores, but still won't
 		// be exhaustively scored.
-		_mm_store_si128(pvHRight, vlo); pvHRight += ROWSTRIDE_2COL;
+		simde_mm_store_si128(pvHRight, vlo); pvHRight += ROWSTRIDE_2COL;
 	}
 	
 	assert_gt(sc_->gapbar, 0);
@@ -474,45 +449,45 @@
 		pvScore = d.profbuf_.ptr() + off; // even elts = query profile, odd = gap barrier
 		
 		// Set all cells to low value
-		vf = _mm_xor_si128(vf, vf);
+		vf = simde_mm_xor_si128(vf, vf);
 
 		// Load H vector from the final row of the previous column
-		vh = _mm_load_si128(pvHLeft + colstride - ROWSTRIDE_2COL);
+		vh = simde_mm_load_si128(pvHLeft + colstride - ROWSTRIDE_2COL);
 		// Shift 2 bytes down so that topmost (least sig) cell gets 0
-		vh = _mm_slli_si128(vh, NBYTES_PER_WORD);
+		vh = simde_mm_slli_si128(vh, NBYTES_PER_WORD);
 		// Fill topmost (least sig) cell with high value
-		vh = _mm_or_si128(vh, vhilsw);
+		vh = simde_mm_or_si128(vh, vhilsw);
 		
 		// For each character in the reference text:
 		size_t j;
 		for(j = 0; j < iter; j++) {
 			// Load cells from E, calculated previously
-			ve = _mm_load_si128(pvELeft);
-			vhd = _mm_load_si128(pvHLeft);
+			ve = simde_mm_load_si128(pvELeft);
+			vhd = simde_mm_load_si128(pvHLeft);
 			assert_all_lt(ve, vhi);
 			pvELeft += ROWSTRIDE_2COL;
 			
 			// Store cells in F, calculated previously
-			vf = _mm_subs_epu8(vf, pvScore[1]); // veto some ref gap extensions
-			_mm_store_si128(pvFRight, vf);
+			vf = simde_mm_subs_epu8(vf, pvScore[1]); // veto some ref gap extensions
+			simde_mm_store_si128(pvFRight, vf);
 			pvFRight += ROWSTRIDE_2COL;
 			
 			// Factor in query profile (matches and mismatches)
-			vh = _mm_subs_epu8(vh, pvScore[0]);
+			vh = simde_mm_subs_epu8(vh, pvScore[0]);
 			
 			// Update H, factoring in E and F
-			vh = _mm_max_epu8(vh, vf);
+			vh = simde_mm_max_epu8(vh, vf);
 			
 			// Update vE value
 			vhdtmp = vhd;
-			vhd = _mm_subs_epu8(vhd, rdgapo);
-			vhd = _mm_subs_epu8(vhd, pvScore[1]); // veto some read gap opens
-			ve = _mm_subs_epu8(ve, rdgape);
-			ve = _mm_max_epu8(ve, vhd);
-			vh = _mm_max_epu8(vh, ve);
+			vhd = simde_mm_subs_epu8(vhd, rdgapo);
+			vhd = simde_mm_subs_epu8(vhd, pvScore[1]); // veto some read gap opens
+			ve = simde_mm_subs_epu8(ve, rdgape);
+			ve = simde_mm_max_epu8(ve, vhd);
+			vh = simde_mm_max_epu8(vh, ve);
 			
 			// Save the new vH values
-			_mm_store_si128(pvHRight, vh);
+			simde_mm_store_si128(pvHRight, vh);
 			pvHRight += ROWSTRIDE_2COL;
 			vtmp = vh;
 			assert_all_lt(ve, vhi);
@@ -522,49 +497,49 @@
 			pvHLeft += ROWSTRIDE_2COL;
 
 			// Save E values
-			_mm_store_si128(pvERight, ve);
+			simde_mm_store_si128(pvERight, ve);
 			pvERight += ROWSTRIDE_2COL;
 			
 			// Update vf value
-			vtmp = _mm_subs_epu8(vtmp, rfgapo);
+			vtmp = simde_mm_subs_epu8(vtmp, rfgapo);
 
-			vf = _mm_subs_epu8(vf, rfgape);
+			vf = simde_mm_subs_epu8(vf, rfgape);
 			assert_all_lt(vf, vhi);
-			vf = _mm_max_epu8(vf, vtmp);
+			vf = simde_mm_max_epu8(vf, vtmp);
 			
 			pvScore += 2; // move on to next query profile / gap veto
 		}
 		// pvHStore, pvELoad, pvEStore have all rolled over to the next column
 		pvFRight -= colstride; // reset to start of column
-		vtmp = _mm_load_si128(pvFRight);
+		vtmp = simde_mm_load_si128(pvFRight);
 		
 		pvHRight -= colstride; // reset to start of column
-		vh = _mm_load_si128(pvHRight);
+		vh = simde_mm_load_si128(pvHRight);
 		
 		pvScore = d.profbuf_.ptr() + off + 1; // reset veto vector
 		
 		// vf from last row gets shifted down by one to overlay the first row
 		// rfgape has already been subtracted from it.
-		vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
+		vf = simde_mm_slli_si128(vf, NBYTES_PER_WORD);
 		
-		vf = _mm_subs_epu8(vf, *pvScore); // veto some ref gap extensions
-		vf = _mm_max_epu8(vtmp, vf);
-		vtmp = _mm_subs_epu8(vf, vtmp);
-		vtmp = _mm_cmpeq_epi8(vtmp, vzero);
-		int cmp = _mm_movemask_epi8(vtmp);
+		vf = simde_mm_subs_epu8(vf, *pvScore); // veto some ref gap extensions
+		vf = simde_mm_max_epu8(vtmp, vf);
+		vtmp = simde_mm_subs_epu8(vf, vtmp);
+		vtmp = simde_mm_cmpeq_epi8(vtmp, vzero);
+		int cmp = simde_mm_movemask_epi8(vtmp);
 		
 		// If any element of vtmp is greater than H - gap-open...
 		j = 0;
 		while(cmp != 0xffff) {
 			// Store this vf
-			_mm_store_si128(pvFRight, vf);
+			simde_mm_store_si128(pvFRight, vf);
 			pvFRight += ROWSTRIDE_2COL;
 			
 			// Update vh w/r/t new vf
-			vh = _mm_max_epu8(vh, vf);
+			vh = simde_mm_max_epu8(vh, vf);
 			
 			// Save vH values
-			_mm_store_si128(pvHRight, vh);
+			simde_mm_store_si128(pvHRight, vh);
 			pvHRight += ROWSTRIDE_2COL;
 			
 			pvScore += 2;
@@ -572,29 +547,29 @@
 			assert_lt(j, iter);
 			if(++j == iter) {
 				pvFRight -= colstride;
-				vtmp = _mm_load_si128(pvFRight);   // load next vf ASAP
+				vtmp = simde_mm_load_si128(pvFRight);   // load next vf ASAP
 				pvHRight -= colstride;
-				vh = _mm_load_si128(pvHRight);     // load next vh ASAP
+				vh = simde_mm_load_si128(pvHRight);     // load next vh ASAP
 				pvScore = d.profbuf_.ptr() + off + 1;
 				j = 0;
-				vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
+				vf = simde_mm_slli_si128(vf, NBYTES_PER_WORD);
 			} else {
-				vtmp = _mm_load_si128(pvFRight);   // load next vf ASAP
-				vh = _mm_load_si128(pvHRight);     // load next vh ASAP
+				vtmp = simde_mm_load_si128(pvFRight);   // load next vf ASAP
+				vh = simde_mm_load_si128(pvHRight);     // load next vh ASAP
 			}
 			
 			// Update F with another gap extension
-			vf = _mm_subs_epu8(vf, rfgape);
-			vf = _mm_subs_epu8(vf, *pvScore); // veto some ref gap extensions
-			vf = _mm_max_epu8(vtmp, vf);
-			vtmp = _mm_subs_epu8(vf, vtmp);
-			vtmp = _mm_cmpeq_epi8(vtmp, vzero);
-			cmp = _mm_movemask_epi8(vtmp);
+			vf = simde_mm_subs_epu8(vf, rfgape);
+			vf = simde_mm_subs_epu8(vf, *pvScore); // veto some ref gap extensions
+			vf = simde_mm_max_epu8(vtmp, vf);
+			vtmp = simde_mm_subs_epu8(vf, vtmp);
+			vtmp = simde_mm_cmpeq_epi8(vtmp, vzero);
+			cmp = simde_mm_movemask_epi8(vtmp);
 			nfixup++;
 		}
 		
 		// Check in the last row for the maximum so far
-		__m128i *vtmp = vbuf_r + 2 /* H */ + (d.lastIter_ * ROWSTRIDE_2COL);
+		simde__m128i *vtmp = vbuf_r + 2 /* H */ + (d.lastIter_ * ROWSTRIDE_2COL);
 		// Note: we may not want to extract from the final row
 		TCScore lr = ((TCScore*)(vtmp))[d.lastWord_];
 		found = true;
@@ -617,9 +592,9 @@
 		// Save some elements to checkpoints
 		if(checkpoint) {
 			
-			__m128i *pvE = vbuf_r + 0;
-			__m128i *pvF = vbuf_r + 1;
-			__m128i *pvH = vbuf_r + 2;
+			simde__m128i *pvE = vbuf_r + 0;
+			simde__m128i *pvF = vbuf_r + 1;
+			simde__m128i *pvH = vbuf_r + 2;
 			size_t coli = i - rfi_;
 			if(coli < cper_.locol_) cper_.locol_ = coli;
 			if(coli > cper_.hicol_) cper_.hicol_ = coli;
@@ -725,16 +700,16 @@
 					assert_gt(coli, 0);
 					size_t wordspercol = cper_.niter_ * ROWSTRIDE_2COL;
 					size_t coloff = (coli >> cper_.perpow2_) * wordspercol;
-					__m128i *dst = cper_.qcols_.ptr() + coloff;
-					memcpy(dst, vbuf_r, sizeof(__m128i) * wordspercol);
+					simde__m128i *dst = cper_.qcols_.ptr() + coloff;
+					memcpy(dst, vbuf_r, sizeof(simde__m128i) * wordspercol);
 				}
 			}
 			if(cper_.debug_) {
 				// Save the column using memcpys
 				size_t wordspercol = cper_.niter_ * ROWSTRIDE_2COL;
 				size_t coloff = coli * wordspercol;
-				__m128i *dst = cper_.qcolsD_.ptr() + coloff;
-				memcpy(dst, vbuf_r, sizeof(__m128i) * wordspercol);
+				simde__m128i *dst = cper_.qcolsD_.ptr() + coloff;
+				memcpy(dst, vbuf_r, sizeof(simde__m128i) * wordspercol);
 			}
 		}
 	}
@@ -822,70 +797,70 @@
 	// Much of the implmentation below is adapted from Michael's code.
 
 	// Set all elts to reference gap open penalty
-	__m128i rfgapo   = _mm_setzero_si128();
-	__m128i rfgape   = _mm_setzero_si128();
-	__m128i rdgapo   = _mm_setzero_si128();
-	__m128i rdgape   = _mm_setzero_si128();
-	__m128i vlo      = _mm_setzero_si128();
-	__m128i vhi      = _mm_setzero_si128();
-	__m128i ve       = _mm_setzero_si128();
-	__m128i vf       = _mm_setzero_si128();
-	__m128i vh       = _mm_setzero_si128();
+	simde__m128i rfgapo   = simde_mm_setzero_si128();
+	simde__m128i rfgape   = simde_mm_setzero_si128();
+	simde__m128i rdgapo   = simde_mm_setzero_si128();
+	simde__m128i rdgape   = simde_mm_setzero_si128();
+	simde__m128i vlo      = simde_mm_setzero_si128();
+	simde__m128i vhi      = simde_mm_setzero_si128();
+	simde__m128i ve       = simde_mm_setzero_si128();
+	simde__m128i vf       = simde_mm_setzero_si128();
+	simde__m128i vh       = simde_mm_setzero_si128();
 #if 0
-	__m128i vhd      = _mm_setzero_si128();
-	__m128i vhdtmp   = _mm_setzero_si128();
+	simde__m128i vhd      = simde_mm_setzero_si128();
+	simde__m128i vhdtmp   = simde_mm_setzero_si128();
 #endif
-	__m128i vtmp     = _mm_setzero_si128();
-	__m128i vzero    = _mm_setzero_si128();
-	__m128i vhilsw   = _mm_setzero_si128();
+	simde__m128i vtmp     = simde_mm_setzero_si128();
+	simde__m128i vzero    = simde_mm_setzero_si128();
+	simde__m128i vhilsw   = simde_mm_setzero_si128();
 
 	assert_gt(sc_->refGapOpen(), 0);
 	assert_leq(sc_->refGapOpen(), MAX_U8);
 	dup = (sc_->refGapOpen() << 8) | (sc_->refGapOpen() & 0x00ff);
-	rfgapo = _mm_insert_epi16(rfgapo, dup, 0);
-	rfgapo = _mm_shufflelo_epi16(rfgapo, 0);
-	rfgapo = _mm_shuffle_epi32(rfgapo, 0);
+	rfgapo = simde_mm_insert_epi16(rfgapo, dup, 0);
+	rfgapo = simde_mm_shufflelo_epi16(rfgapo, 0);
+	rfgapo = simde_mm_shuffle_epi32(rfgapo, 0);
 	
 	// Set all elts to reference gap extension penalty
 	assert_gt(sc_->refGapExtend(), 0);
 	assert_leq(sc_->refGapExtend(), MAX_U8);
 	assert_leq(sc_->refGapExtend(), sc_->refGapOpen());
 	dup = (sc_->refGapExtend() << 8) | (sc_->refGapExtend() & 0x00ff);
-	rfgape = _mm_insert_epi16(rfgape, dup, 0);
-	rfgape = _mm_shufflelo_epi16(rfgape, 0);
-	rfgape = _mm_shuffle_epi32(rfgape, 0);
+	rfgape = simde_mm_insert_epi16(rfgape, dup, 0);
+	rfgape = simde_mm_shufflelo_epi16(rfgape, 0);
+	rfgape = simde_mm_shuffle_epi32(rfgape, 0);
 
 	// Set all elts to read gap open penalty
 	assert_gt(sc_->readGapOpen(), 0);
 	assert_leq(sc_->readGapOpen(), MAX_U8);
 	dup = (sc_->readGapOpen() << 8) | (sc_->readGapOpen() & 0x00ff);
-	rdgapo = _mm_insert_epi16(rdgapo, dup, 0);
-	rdgapo = _mm_shufflelo_epi16(rdgapo, 0);
-	rdgapo = _mm_shuffle_epi32(rdgapo, 0);
+	rdgapo = simde_mm_insert_epi16(rdgapo, dup, 0);
+	rdgapo = simde_mm_shufflelo_epi16(rdgapo, 0);
+	rdgapo = simde_mm_shuffle_epi32(rdgapo, 0);
 	
 	// Set all elts to read gap extension penalty
 	assert_gt(sc_->readGapExtend(), 0);
 	assert_leq(sc_->readGapExtend(), MAX_U8);
 	assert_leq(sc_->readGapExtend(), sc_->readGapOpen());
 	dup = (sc_->readGapExtend() << 8) | (sc_->readGapExtend() & 0x00ff);
-	rdgape = _mm_insert_epi16(rdgape, dup, 0);
-	rdgape = _mm_shufflelo_epi16(rdgape, 0);
-	rdgape = _mm_shuffle_epi32(rdgape, 0);
+	rdgape = simde_mm_insert_epi16(rdgape, dup, 0);
+	rdgape = simde_mm_shufflelo_epi16(rdgape, 0);
+	rdgape = simde_mm_shuffle_epi32(rdgape, 0);
 	
-	vhi = _mm_cmpeq_epi16(vhi, vhi); // all elts = 0xffff
-	vlo = _mm_xor_si128(vlo, vlo);   // all elts = 0
+	vhi = simde_mm_cmpeq_epi16(vhi, vhi); // all elts = 0xffff
+	vlo = simde_mm_xor_si128(vlo, vlo);   // all elts = 0
 	
 	// vhilsw: topmost (least sig) word set to 0x7fff, all other words=0
-	vhilsw = _mm_shuffle_epi32(vhi, 0);
-	vhilsw = _mm_srli_si128(vhilsw, NBYTES_PER_REG - NBYTES_PER_WORD);
+	vhilsw = simde_mm_shuffle_epi32(vhi, 0);
+	vhilsw = simde_mm_srli_si128(vhilsw, NBYTES_PER_REG - NBYTES_PER_WORD);
 	
-	// Points to a long vector of __m128i where each element is a block of
+	// Points to a long vector of simde__m128i where each element is a block of
 	// contiguous cells in the E, F or H matrix.  If the index % 3 == 0, then
 	// the block of cells is from the E matrix.  If index % 3 == 1, they're
 	// from the F matrix.  If index % 3 == 2, then they're from the H matrix.
 	// Blocks of cells are organized in the same interleaved manner as they are
 	// calculated by the Farrar algorithm.
-	const __m128i *pvScore; // points into the query profile
+	const simde__m128i *pvScore; // points into the query profile
 
 	d.mat_.init(dpRows(), rff_ - rfi_, NWORDS_PER_REG);
 	const size_t colstride = d.mat_.colstride();
@@ -893,26 +868,26 @@
 	assert_eq(ROWSTRIDE, colstride / iter);
 	
 	// Initialize the H and E vectors in the first matrix column
-	__m128i *pvHTmp = d.mat_.tmpvec(0, 0);
-	__m128i *pvETmp = d.mat_.evec(0, 0);
+	simde__m128i *pvHTmp = d.mat_.tmpvec(0, 0);
+	simde__m128i *pvETmp = d.mat_.evec(0, 0);
 	
 	// Maximum score in final row
 	bool found = false;
 	TCScore lrmax = MIN_U8;
 	
 	for(size_t i = 0; i < iter; i++) {
-		_mm_store_si128(pvETmp, vlo);
-		_mm_store_si128(pvHTmp, vlo); // start high in end-to-end mode
+		simde_mm_store_si128(pvETmp, vlo);
+		simde_mm_store_si128(pvHTmp, vlo); // start low; u8 end-to-end kernel scores by subtracting penalties
 		pvETmp += ROWSTRIDE;
 		pvHTmp += ROWSTRIDE;
 	}
 	// These are swapped just before the innermost loop
-	__m128i *pvHStore = d.mat_.hvec(0, 0);
-	__m128i *pvHLoad  = d.mat_.tmpvec(0, 0);
-	__m128i *pvELoad  = d.mat_.evec(0, 0);
-	__m128i *pvEStore = d.mat_.evecUnsafe(0, 1);
-	__m128i *pvFStore = d.mat_.fvec(0, 0);
-	__m128i *pvFTmp   = NULL;
+	simde__m128i *pvHStore = d.mat_.hvec(0, 0);
+	simde__m128i *pvHLoad  = d.mat_.tmpvec(0, 0);
+	simde__m128i *pvELoad  = d.mat_.evec(0, 0);
+	simde__m128i *pvEStore = d.mat_.evecUnsafe(0, 1);
+	simde__m128i *pvFStore = d.mat_.fvec(0, 0);
+	simde__m128i *pvFTmp   = NULL;
 	
 	assert_gt(sc_->gapbar, 0);
 	size_t nfixup = 0;
@@ -943,55 +918,55 @@
 		pvScore = d.profbuf_.ptr() + off; // even elts = query profile, odd = gap barrier
 		
 		// Set all cells to low value
-		vf = _mm_xor_si128(vf, vf);
+		vf = simde_mm_xor_si128(vf, vf);
 
 		// Load H vector from the final row of the previous column
-		vh = _mm_load_si128(pvHLoad + colstride - ROWSTRIDE);
+		vh = simde_mm_load_si128(pvHLoad + colstride - ROWSTRIDE);
 		// Shift 2 bytes down so that topmost (least sig) cell gets 0
-		vh = _mm_slli_si128(vh, NBYTES_PER_WORD);
+		vh = simde_mm_slli_si128(vh, NBYTES_PER_WORD);
 		// Fill topmost (least sig) cell with high value
-		vh = _mm_or_si128(vh, vhilsw);
+		vh = simde_mm_or_si128(vh, vhilsw);
 		
 		// For each character in the reference text:
 		size_t j;
 		for(j = 0; j < iter; j++) {
 			// Load cells from E, calculated previously
-			ve = _mm_load_si128(pvELoad);
+			ve = simde_mm_load_si128(pvELoad);
 #if 0
-			vhd = _mm_load_si128(pvHLoad);
+			vhd = simde_mm_load_si128(pvHLoad);
 #endif
 			assert_all_lt(ve, vhi);
 			pvELoad += ROWSTRIDE;
 			
 			// Store cells in F, calculated previously
-			vf = _mm_subs_epu8(vf, pvScore[1]); // veto some ref gap extensions
-			_mm_store_si128(pvFStore, vf);
+			vf = simde_mm_subs_epu8(vf, pvScore[1]); // veto some ref gap extensions
+			simde_mm_store_si128(pvFStore, vf);
 			pvFStore += ROWSTRIDE;
 			
 			// Factor in query profile (matches and mismatches)
-			vh = _mm_subs_epu8(vh, pvScore[0]);
+			vh = simde_mm_subs_epu8(vh, pvScore[0]);
 			
 			// Update H, factoring in E and F
-			vh = _mm_max_epu8(vh, ve);
-			vh = _mm_max_epu8(vh, vf);
+			vh = simde_mm_max_epu8(vh, ve);
+			vh = simde_mm_max_epu8(vh, vf);
 			
 			// Save the new vH values
-			_mm_store_si128(pvHStore, vh);
+			simde_mm_store_si128(pvHStore, vh);
 			pvHStore += ROWSTRIDE;
 			
 			// Update vE value
 			vtmp = vh;
 #if 0
 			vhdtmp = vhd;
-			vhd = _mm_subs_epu8(vhd, rdgapo);
-			vhd = _mm_subs_epu8(vhd, pvScore[1]); // veto some read gap opens
-			ve = _mm_subs_epu8(ve, rdgape);
-			ve = _mm_max_epu8(ve, vhd);
+			vhd = simde_mm_subs_epu8(vhd, rdgapo);
+			vhd = simde_mm_subs_epu8(vhd, pvScore[1]); // veto some read gap opens
+			ve = simde_mm_subs_epu8(ve, rdgape);
+			ve = simde_mm_max_epu8(ve, vhd);
 #else
-			vh = _mm_subs_epu8(vh, rdgapo);
-			vh = _mm_subs_epu8(vh, pvScore[1]); // veto some read gap opens
-			ve = _mm_subs_epu8(ve, rdgape);
-			ve = _mm_max_epu8(ve, vh);
+			vh = simde_mm_subs_epu8(vh, rdgapo);
+			vh = simde_mm_subs_epu8(vh, pvScore[1]); // veto some read gap opens
+			ve = simde_mm_subs_epu8(ve, rdgape);
+			ve = simde_mm_max_epu8(ve, vh);
 #endif
 			assert_all_lt(ve, vhi);
 			
@@ -999,34 +974,34 @@
 #if 0
 			vh = vhdtmp;
 #else
-			vh = _mm_load_si128(pvHLoad);
+			vh = simde_mm_load_si128(pvHLoad);
 #endif
 			pvHLoad += ROWSTRIDE;
 			
 			// Save E values
-			_mm_store_si128(pvEStore, ve);
+			simde_mm_store_si128(pvEStore, ve);
 			pvEStore += ROWSTRIDE;
 			
 			// Update vf value
-			vtmp = _mm_subs_epu8(vtmp, rfgapo);
-			vf = _mm_subs_epu8(vf, rfgape);
+			vtmp = simde_mm_subs_epu8(vtmp, rfgapo);
+			vf = simde_mm_subs_epu8(vf, rfgape);
 			assert_all_lt(vf, vhi);
-			vf = _mm_max_epu8(vf, vtmp);
+			vf = simde_mm_max_epu8(vf, vtmp);
 			
 			pvScore += 2; // move on to next query profile / gap veto
 		}
 		// pvHStore, pvELoad, pvEStore have all rolled over to the next column
 		pvFTmp = pvFStore;
 		pvFStore -= colstride; // reset to start of column
-		vtmp = _mm_load_si128(pvFStore);
+		vtmp = simde_mm_load_si128(pvFStore);
 		
 		pvHStore -= colstride; // reset to start of column
-		vh = _mm_load_si128(pvHStore);
+		vh = simde_mm_load_si128(pvHStore);
 		
 #if 0
 #else
 		pvEStore -= colstride; // reset to start of column
-		ve = _mm_load_si128(pvEStore);
+		ve = simde_mm_load_si128(pvEStore);
 #endif
 		
 		pvHLoad = pvHStore;    // new pvHLoad = pvHStore
@@ -1034,35 +1009,35 @@
 		
 		// vf from last row gets shifted down by one to overlay the first row
 		// rfgape has already been subtracted from it.
-		vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
+		vf = simde_mm_slli_si128(vf, NBYTES_PER_WORD);
 		
-		vf = _mm_subs_epu8(vf, *pvScore); // veto some ref gap extensions
-		vf = _mm_max_epu8(vtmp, vf);
-		vtmp = _mm_subs_epu8(vf, vtmp);
-		vtmp = _mm_cmpeq_epi8(vtmp, vzero);
-		int cmp = _mm_movemask_epi8(vtmp);
+		vf = simde_mm_subs_epu8(vf, *pvScore); // veto some ref gap extensions
+		vf = simde_mm_max_epu8(vtmp, vf);
+		vtmp = simde_mm_subs_epu8(vf, vtmp);
+		vtmp = simde_mm_cmpeq_epi8(vtmp, vzero);
+		int cmp = simde_mm_movemask_epi8(vtmp);
 		
 		// If any element of vtmp is greater than H - gap-open...
 		j = 0;
 		while(cmp != 0xffff) {
 			// Store this vf
-			_mm_store_si128(pvFStore, vf);
+			simde_mm_store_si128(pvFStore, vf);
 			pvFStore += ROWSTRIDE;
 			
 			// Update vh w/r/t new vf
-			vh = _mm_max_epu8(vh, vf);
+			vh = simde_mm_max_epu8(vh, vf);
 			
 			// Save vH values
-			_mm_store_si128(pvHStore, vh);
+			simde_mm_store_si128(pvHStore, vh);
 			pvHStore += ROWSTRIDE;
 			
 			// Update E in case it can be improved using our new vh
 #if 0
 #else
-			vh = _mm_subs_epu8(vh, rdgapo);
-			vh = _mm_subs_epu8(vh, *pvScore); // veto some read gap opens
-			ve = _mm_max_epu8(ve, vh);
-			_mm_store_si128(pvEStore, ve);
+			vh = simde_mm_subs_epu8(vh, rdgapo);
+			vh = simde_mm_subs_epu8(vh, *pvScore); // veto some read gap opens
+			ve = simde_mm_max_epu8(ve, vh);
+			simde_mm_store_si128(pvEStore, ve);
 			pvEStore += ROWSTRIDE;
 #endif
 			pvScore += 2;
@@ -1070,33 +1045,33 @@
 			assert_lt(j, iter);
 			if(++j == iter) {
 				pvFStore -= colstride;
-				vtmp = _mm_load_si128(pvFStore);   // load next vf ASAP
+				vtmp = simde_mm_load_si128(pvFStore);   // load next vf ASAP
 				pvHStore -= colstride;
-				vh = _mm_load_si128(pvHStore);     // load next vh ASAP
+				vh = simde_mm_load_si128(pvHStore);     // load next vh ASAP
 #if 0
 #else
 				pvEStore -= colstride;
-				ve = _mm_load_si128(pvEStore);     // load next ve ASAP
+				ve = simde_mm_load_si128(pvEStore);     // load next ve ASAP
 #endif
 				pvScore = d.profbuf_.ptr() + off + 1;
 				j = 0;
-				vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
+				vf = simde_mm_slli_si128(vf, NBYTES_PER_WORD);
 			} else {
-				vtmp = _mm_load_si128(pvFStore);   // load next vf ASAP
-				vh = _mm_load_si128(pvHStore);     // load next vh ASAP
+				vtmp = simde_mm_load_si128(pvFStore);   // load next vf ASAP
+				vh = simde_mm_load_si128(pvHStore);     // load next vh ASAP
 #if 0
 #else
-				ve = _mm_load_si128(pvEStore);     // load next vh ASAP
+				ve = simde_mm_load_si128(pvEStore);     // load next ve ASAP
 #endif
 			}
 			
 			// Update F with another gap extension
-			vf = _mm_subs_epu8(vf, rfgape);
-			vf = _mm_subs_epu8(vf, *pvScore); // veto some ref gap extensions
-			vf = _mm_max_epu8(vtmp, vf);
-			vtmp = _mm_subs_epu8(vf, vtmp);
-			vtmp = _mm_cmpeq_epi8(vtmp, vzero);
-			cmp = _mm_movemask_epi8(vtmp);
+			vf = simde_mm_subs_epu8(vf, rfgape);
+			vf = simde_mm_subs_epu8(vf, *pvScore); // veto some ref gap extensions
+			vf = simde_mm_max_epu8(vtmp, vf);
+			vtmp = simde_mm_subs_epu8(vf, vtmp);
+			vtmp = simde_mm_cmpeq_epi8(vtmp, vzero);
+			cmp = simde_mm_movemask_epi8(vtmp);
 			nfixup++;
 		}
 		
@@ -1117,7 +1092,7 @@
 		}
 #endif
 		
-		__m128i *vtmp = d.mat_.hvec(d.lastIter_, i-rfi_);
+		simde__m128i *vtmp = d.mat_.hvec(d.lastIter_, i-rfi_);
 		// Note: we may not want to extract from the final row
 		TCScore lr = ((TCScore*)(vtmp))[d.lastWord_];
 		found = true;
@@ -1213,7 +1188,7 @@
 	assert(!d.profbuf_.empty());
 	const size_t colstride = d.mat_.colstride();
 	ASSERT_ONLY(bool sawbest = false);
-	__m128i *pvH = d.mat_.hvec(d.lastIter_, 0);
+	simde__m128i *pvH = d.mat_.hvec(d.lastIter_, 0);
 	for(size_t j = 0; j < ncol; j++) {
 		TAlScore sc = (TAlScore)(((TCScore*)pvH)[d.lastWord_] - 0xff);
 		assert_leq(sc, best);
@@ -1339,7 +1314,7 @@
 	size_t rowelt, rowvec, eltvec;
 	size_t left_rowelt, up_rowelt, upleft_rowelt;
 	size_t left_rowvec, up_rowvec, upleft_rowvec;
-	__m128i *cur_vec, *left_vec, *up_vec, *upleft_vec;
+	simde__m128i *cur_vec, *left_vec, *up_vec, *upleft_vec;
 	NEW_ROW_COL(row, col);
 	while((int)row >= 0) {
 		met.btcell++;
--- hisat2.orig/aligner_swsse_loc_i16.cpp
+++ hisat2/aligner_swsse_loc_i16.cpp
@@ -83,7 +83,7 @@
 	const BTString* qu = fw ? qufw_ : qurc_;
 	const size_t len = rd->length();
 	const size_t seglen = (len + (NWORDS_PER_REG-1)) / NWORDS_PER_REG;
-	// How many __m128i's are needed
+	// How many simde__m128i's are needed
 	size_t n128s =
 		64 +                    // slack bytes, for alignment?
 		(seglen * ALPHA_SIZE)   // query profile data
@@ -226,51 +226,26 @@
 #ifdef NDEBUG
 
 #define assert_all_eq0(x)
-#define assert_all_gt(x, y)
-#define assert_all_gt_lo(x)
 #define assert_all_lt(x, y)
-#define assert_all_lt_hi(x)
 
 #else
 
 #define assert_all_eq0(x) { \
-	__m128i z = _mm_setzero_si128(); \
-	__m128i tmp = _mm_setzero_si128(); \
-	z = _mm_xor_si128(z, z); \
-	tmp = _mm_cmpeq_epi16(x, z); \
-	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
-}
-
-#define assert_all_gt(x, y) { \
-	__m128i tmp = _mm_cmpgt_epi16(x, y); \
-	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
-}
-
-#define assert_all_gt_lo(x) { \
-	__m128i z = _mm_setzero_si128(); \
-	__m128i tmp = _mm_setzero_si128(); \
-	z = _mm_xor_si128(z, z); \
-	tmp = _mm_cmpgt_epi16(x, z); \
-	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
+	simde__m128i z = simde_mm_setzero_si128(); \
+	simde__m128i tmp = simde_mm_setzero_si128(); \
+	z = simde_mm_xor_si128(z, z); \
+	tmp = simde_mm_cmpeq_epi16(x, z); \
+	assert_eq(0xffff, simde_mm_movemask_epi8(tmp)); \
 }
 
 #define assert_all_lt(x, y) { \
-	__m128i tmp = _mm_cmplt_epi16(x, y); \
-	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
+	simde__m128i tmp = simde_mm_cmplt_epi16(x, y); \
+	assert_eq(0xffff, simde_mm_movemask_epi8(tmp)); \
 }
 
 #define assert_all_leq(x, y) { \
-	__m128i tmp = _mm_cmpgt_epi16(x, y); \
-	assert_eq(0x0000, _mm_movemask_epi8(tmp)); \
-}
-
-#define assert_all_lt_hi(x) { \
-	__m128i z = _mm_setzero_si128(); \
-	__m128i tmp = _mm_setzero_si128(); \
-	z = _mm_cmpeq_epi16(z, z); \
-	z = _mm_srli_epi16(z, 1); \
-	tmp = _mm_cmplt_epi16(x, z); \
-	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
+	simde__m128i tmp = simde_mm_cmpgt_epi16(x, y); \
+	assert_eq(0x0000, simde_mm_movemask_epi8(tmp)); \
 }
 #endif
 
@@ -322,8 +297,8 @@
 	// we'll call "left" and "right".
 	d.vecbuf_.resize(ROWSTRIDE_2COL * iter * 2);
 	d.vecbuf_.zero();
-	__m128i *vbuf_l = d.vecbuf_.ptr();
-	__m128i *vbuf_r = d.vecbuf_.ptr() + (ROWSTRIDE_2COL * iter);
+	simde__m128i *vbuf_l = d.vecbuf_.ptr();
+	simde__m128i *vbuf_r = d.vecbuf_.ptr() + (ROWSTRIDE_2COL * iter);
 	
 	// This is the data structure that holds candidate cells per diagonal.
 	const size_t ndiags = rff_ - rfi_ + dpRows() - 1;
@@ -356,95 +331,95 @@
 	// Much of the implmentation below is adapted from Michael's code.
 
 	// Set all elts to reference gap open penalty
-	__m128i rfgapo   = _mm_setzero_si128();
-	__m128i rfgape   = _mm_setzero_si128();
-	__m128i rdgapo   = _mm_setzero_si128();
-	__m128i rdgape   = _mm_setzero_si128();
-	__m128i vlo      = _mm_setzero_si128();
-	__m128i vhi      = _mm_setzero_si128();
-	__m128i vlolsw   = _mm_setzero_si128();
-	__m128i vmax     = _mm_setzero_si128();
-	__m128i vcolmax  = _mm_setzero_si128();
-	__m128i vmaxtmp  = _mm_setzero_si128();
-	__m128i ve       = _mm_setzero_si128();
-	__m128i vf       = _mm_setzero_si128();
-	__m128i vh       = _mm_setzero_si128();
-	__m128i vhd      = _mm_setzero_si128();
-	__m128i vhdtmp   = _mm_setzero_si128();
-	__m128i vtmp     = _mm_setzero_si128();
-	__m128i vzero    = _mm_setzero_si128();
-	__m128i vminsc   = _mm_setzero_si128();
+	simde__m128i rfgapo   = simde_mm_setzero_si128();
+	simde__m128i rfgape   = simde_mm_setzero_si128();
+	simde__m128i rdgapo   = simde_mm_setzero_si128();
+	simde__m128i rdgape   = simde_mm_setzero_si128();
+	simde__m128i vlo      = simde_mm_setzero_si128();
+	simde__m128i vhi      = simde_mm_setzero_si128();
+	simde__m128i vlolsw   = simde_mm_setzero_si128();
+	simde__m128i vmax     = simde_mm_setzero_si128();
+	simde__m128i vcolmax  = simde_mm_setzero_si128();
+	simde__m128i vmaxtmp  = simde_mm_setzero_si128();
+	simde__m128i ve       = simde_mm_setzero_si128();
+	simde__m128i vf       = simde_mm_setzero_si128();
+	simde__m128i vh       = simde_mm_setzero_si128();
+	simde__m128i vhd      = simde_mm_setzero_si128();
+	simde__m128i vhdtmp   = simde_mm_setzero_si128();
+	simde__m128i vtmp     = simde_mm_setzero_si128();
+	simde__m128i vzero    = simde_mm_setzero_si128();
+	simde__m128i vminsc   = simde_mm_setzero_si128();
 
 	assert_gt(sc_->refGapOpen(), 0);
 	assert_leq(sc_->refGapOpen(), MAX_I16);
-	rfgapo = _mm_insert_epi16(rfgapo, sc_->refGapOpen(), 0);
-	rfgapo = _mm_shufflelo_epi16(rfgapo, 0);
-	rfgapo = _mm_shuffle_epi32(rfgapo, 0);
+	rfgapo = simde_mm_insert_epi16(rfgapo, sc_->refGapOpen(), 0);
+	rfgapo = simde_mm_shufflelo_epi16(rfgapo, 0);
+	rfgapo = simde_mm_shuffle_epi32(rfgapo, 0);
 	
 	// Set all elts to reference gap extension penalty
 	assert_gt(sc_->refGapExtend(), 0);
 	assert_leq(sc_->refGapExtend(), MAX_I16);
 	assert_leq(sc_->refGapExtend(), sc_->refGapOpen());
-	rfgape = _mm_insert_epi16(rfgape, sc_->refGapExtend(), 0);
-	rfgape = _mm_shufflelo_epi16(rfgape, 0);
-	rfgape = _mm_shuffle_epi32(rfgape, 0);
+	rfgape = simde_mm_insert_epi16(rfgape, sc_->refGapExtend(), 0);
+	rfgape = simde_mm_shufflelo_epi16(rfgape, 0);
+	rfgape = simde_mm_shuffle_epi32(rfgape, 0);
 
 	// Set all elts to read gap open penalty
 	assert_gt(sc_->readGapOpen(), 0);
 	assert_leq(sc_->readGapOpen(), MAX_I16);
-	rdgapo = _mm_insert_epi16(rdgapo, sc_->readGapOpen(), 0);
-	rdgapo = _mm_shufflelo_epi16(rdgapo, 0);
-	rdgapo = _mm_shuffle_epi32(rdgapo, 0);
+	rdgapo = simde_mm_insert_epi16(rdgapo, sc_->readGapOpen(), 0);
+	rdgapo = simde_mm_shufflelo_epi16(rdgapo, 0);
+	rdgapo = simde_mm_shuffle_epi32(rdgapo, 0);
 	
 	// Set all elts to read gap extension penalty
 	assert_gt(sc_->readGapExtend(), 0);
 	assert_leq(sc_->readGapExtend(), MAX_I16);
 	assert_leq(sc_->readGapExtend(), sc_->readGapOpen());
-	rdgape = _mm_insert_epi16(rdgape, sc_->readGapExtend(), 0);
-	rdgape = _mm_shufflelo_epi16(rdgape, 0);
-	rdgape = _mm_shuffle_epi32(rdgape, 0);
+	rdgape = simde_mm_insert_epi16(rdgape, sc_->readGapExtend(), 0);
+	rdgape = simde_mm_shufflelo_epi16(rdgape, 0);
+	rdgape = simde_mm_shuffle_epi32(rdgape, 0);
 	
 	// Set all elts to minimum score threshold.  Actually, to 1 less than the
 	// threshold so we can use gt instead of geq.
-	vminsc = _mm_insert_epi16(vminsc, (int)minsc_-1, 0);
-	vminsc = _mm_shufflelo_epi16(vminsc, 0);
-	vminsc = _mm_shuffle_epi32(vminsc, 0);
+	vminsc = simde_mm_insert_epi16(vminsc, (int)minsc_-1, 0);
+	vminsc = simde_mm_shufflelo_epi16(vminsc, 0);
+	vminsc = simde_mm_shuffle_epi32(vminsc, 0);
 
 	// Set all elts to 0x8000 (min value for signed 16-bit)
-	vlo = _mm_cmpeq_epi16(vlo, vlo);             // all elts = 0xffff
-	vlo = _mm_slli_epi16(vlo, NBITS_PER_WORD-1); // all elts = 0x8000
+	vlo = simde_mm_cmpeq_epi16(vlo, vlo);             // all elts = 0xffff
+	vlo = simde_mm_slli_epi16(vlo, NBITS_PER_WORD-1); // all elts = 0x8000
 	
 	// Set all elts to 0x7fff (max value for signed 16-bit)
-	vhi = _mm_cmpeq_epi16(vhi, vhi);             // all elts = 0xffff
-	vhi = _mm_srli_epi16(vhi, 1);                // all elts = 0x7fff
+	vhi = simde_mm_cmpeq_epi16(vhi, vhi);             // all elts = 0xffff
+	vhi = simde_mm_srli_epi16(vhi, 1);                // all elts = 0x7fff
 	
 	// Set all elts to 0x8000 (min value for signed 16-bit)
 	vmax = vlo;
 	
 	// vlolsw: topmost (least sig) word set to 0x8000, all other words=0
-	vlolsw = _mm_shuffle_epi32(vlo, 0);
-	vlolsw = _mm_srli_si128(vlolsw, NBYTES_PER_REG - NBYTES_PER_WORD);
+	vlolsw = simde_mm_shuffle_epi32(vlo, 0);
+	vlolsw = simde_mm_srli_si128(vlolsw, NBYTES_PER_REG - NBYTES_PER_WORD);
 	
-	// Points to a long vector of __m128i where each element is a block of
+	// Points to a long vector of simde__m128i where each element is a block of
 	// contiguous cells in the E, F or H matrix.  If the index % 3 == 0, then
 	// the block of cells is from the E matrix.  If index % 3 == 1, they're
 	// from the F matrix.  If index % 3 == 2, then they're from the H matrix.
 	// Blocks of cells are organized in the same interleaved manner as they are
 	// calculated by the Farrar algorithm.
-	const __m128i *pvScore; // points into the query profile
+	const simde__m128i *pvScore; // points into the query profile
 
 	const size_t colstride = ROWSTRIDE_2COL * iter;
 	
 	// Initialize the H and E vectors in the first matrix column
-	__m128i *pvELeft = vbuf_l + 0; __m128i *pvERight = vbuf_r + 0;
-	//__m128i *pvFLeft = vbuf_l + 1;
-	__m128i *pvFRight = vbuf_r + 1;
-	__m128i *pvHLeft = vbuf_l + 2; __m128i *pvHRight = vbuf_r + 2;
+	simde__m128i *pvELeft = vbuf_l + 0; simde__m128i *pvERight = vbuf_r + 0;
+	//simde__m128i *pvFLeft = vbuf_l + 1;
+	simde__m128i *pvFRight = vbuf_r + 1;
+	simde__m128i *pvHLeft = vbuf_l + 2; simde__m128i *pvHRight = vbuf_r + 2;
 	
 	for(size_t i = 0; i < iter; i++) {
 		// start low in local mode
-		_mm_store_si128(pvERight, vlo); pvERight += ROWSTRIDE_2COL;
-		_mm_store_si128(pvHRight, vlo); pvHRight += ROWSTRIDE_2COL;
+		simde_mm_store_si128(pvERight, vlo); pvERight += ROWSTRIDE_2COL;
+		simde_mm_store_si128(pvHRight, vlo); pvHRight += ROWSTRIDE_2COL;
 		// Note: right and left are going to be swapped as soon as we enter
 		// the outer loop below
 	}
@@ -490,47 +465,47 @@
 		// current iter's?  The way we currently do it, seems like it will
 		// almost always require at least one fixup loop iter (to recalculate
 		// this topmost F).
-		vh = _mm_load_si128(pvHLeft + colstride - ROWSTRIDE_2COL);
+		vh = simde_mm_load_si128(pvHLeft + colstride - ROWSTRIDE_2COL);
 		
 		// Set all F cells to low value
-		vf = _mm_cmpeq_epi16(vf, vf);
-		vf = _mm_slli_epi16(vf, NBITS_PER_WORD-1);
-		vf = _mm_or_si128(vf, vlolsw);
+		vf = simde_mm_cmpeq_epi16(vf, vf);
+		vf = simde_mm_slli_epi16(vf, NBITS_PER_WORD-1);
+		vf = simde_mm_or_si128(vf, vlolsw);
 		// vf now contains the vertical contribution
 
 		// Store cells in F, calculated previously
 		// No need to veto ref gap extensions, they're all 0x8000s
-		_mm_store_si128(pvFRight, vf);
+		simde_mm_store_si128(pvFRight, vf);
 		pvFRight += ROWSTRIDE_2COL;
 		
 		// Shift down so that topmost (least sig) cell gets 0
-		vh = _mm_slli_si128(vh, NBYTES_PER_WORD);
+		vh = simde_mm_slli_si128(vh, NBYTES_PER_WORD);
 		// Fill topmost (least sig) cell with low value
-		vh = _mm_or_si128(vh, vlolsw);
+		vh = simde_mm_or_si128(vh, vlolsw);
 		
 		// We pull out one loop iteration to make it easier to veto values in the top row
 		
 		// Load cells from E, calculated previously
-		ve = _mm_load_si128(pvELeft);
-		vhd = _mm_load_si128(pvHLeft);
+		ve = simde_mm_load_si128(pvELeft);
+		vhd = simde_mm_load_si128(pvHLeft);
 		assert_all_lt(ve, vhi);
 		pvELeft += ROWSTRIDE_2COL;
 		// ve now contains the horizontal contribution
 		
 		// Factor in query profile (matches and mismatches)
-		vh = _mm_adds_epi16(vh, pvScore[0]);
+		vh = simde_mm_adds_epi16(vh, pvScore[0]);
 		// vh now contains the diagonal contribution
 		
 		// Update vE value
 		vhdtmp = vhd;
-		vhd = _mm_subs_epi16(vhd, rdgapo);
-		vhd = _mm_adds_epi16(vhd, pvScore[1]); // veto some read gap opens
-		vhd = _mm_adds_epi16(vhd, pvScore[1]); // veto some read gap opens
-		ve = _mm_subs_epi16(ve, rdgape);
-		ve = _mm_max_epi16(ve, vhd);
+		vhd = simde_mm_subs_epi16(vhd, rdgapo);
+		vhd = simde_mm_adds_epi16(vhd, pvScore[1]); // veto some read gap opens
+		vhd = simde_mm_adds_epi16(vhd, pvScore[1]); // veto some read gap opens
+		ve = simde_mm_subs_epi16(ve, rdgape);
+		ve = simde_mm_max_epi16(ve, vhd);
 
 		// Update H, factoring in E and F
-		vh = _mm_max_epi16(vh, ve);
+		vh = simde_mm_max_epi16(vh, ve);
 		// F won't change anything!
 
 		vf = vh;
@@ -539,7 +514,7 @@
 		vcolmax = vh;
 		
 		// Save the new vH values
-		_mm_store_si128(pvHRight, vh);
+		simde_mm_store_si128(pvHRight, vh);
 
 		assert_all_lt(ve, vhi);
 
@@ -550,11 +525,11 @@
 		pvHLeft += ROWSTRIDE_2COL;
 		
 		// Save E values
-		_mm_store_si128(pvERight, ve);
+		simde_mm_store_si128(pvERight, ve);
 		pvERight += ROWSTRIDE_2COL;
 		
 		// Update vf value
-		vf = _mm_subs_epi16(vf, rfgapo);
+		vf = simde_mm_subs_epi16(vf, rfgapo);
 		assert_all_lt(vf, vhi);
 		
 		pvScore += 2; // move on to next query profile
@@ -563,37 +538,37 @@
 		size_t j;
 		for(j = 1; j < iter; j++) {
 			// Load cells from E, calculated previously
-			ve = _mm_load_si128(pvELeft);
-			vhd = _mm_load_si128(pvHLeft);
+			ve = simde_mm_load_si128(pvELeft);
+			vhd = simde_mm_load_si128(pvHLeft);
 			assert_all_lt(ve, vhi);
 			pvELeft += ROWSTRIDE_2COL;
 			
 			// Store cells in F, calculated previously
-			vf = _mm_adds_epi16(vf, pvScore[1]); // veto some ref gap extensions
-			vf = _mm_adds_epi16(vf, pvScore[1]); // veto some ref gap extensions
-			_mm_store_si128(pvFRight, vf);
+			vf = simde_mm_adds_epi16(vf, pvScore[1]); // veto some ref gap extensions
+			vf = simde_mm_adds_epi16(vf, pvScore[1]); // veto some ref gap extensions
+			simde_mm_store_si128(pvFRight, vf);
 			pvFRight += ROWSTRIDE_2COL;
 			
 			// Factor in query profile (matches and mismatches)
-			vh = _mm_adds_epi16(vh, pvScore[0]);
-			vh = _mm_max_epi16(vh, vf);
+			vh = simde_mm_adds_epi16(vh, pvScore[0]);
+			vh = simde_mm_max_epi16(vh, vf);
 			
 			// Update vE value
 			vhdtmp = vhd;
-			vhd = _mm_subs_epi16(vhd, rdgapo);
-			vhd = _mm_adds_epi16(vhd, pvScore[1]); // veto some read gap opens
-			vhd = _mm_adds_epi16(vhd, pvScore[1]); // veto some read gap opens
-			ve = _mm_subs_epi16(ve, rdgape);
-			ve = _mm_max_epi16(ve, vhd);
+			vhd = simde_mm_subs_epi16(vhd, rdgapo);
+			vhd = simde_mm_adds_epi16(vhd, pvScore[1]); // veto some read gap opens
+			vhd = simde_mm_adds_epi16(vhd, pvScore[1]); // veto some read gap opens
+			ve = simde_mm_subs_epi16(ve, rdgape);
+			ve = simde_mm_max_epi16(ve, vhd);
 			
-			vh = _mm_max_epi16(vh, ve);
+			vh = simde_mm_max_epi16(vh, ve);
 			vtmp = vh;
 			
 			// Update highest score encountered this far
-			vcolmax = _mm_max_epi16(vcolmax, vh);
+			vcolmax = simde_mm_max_epi16(vcolmax, vh);
 			
 			// Save the new vH values
-			_mm_store_si128(pvHRight, vh);
+			simde_mm_store_si128(pvHRight, vh);
 
 			vh = vhdtmp;
 
@@ -602,78 +577,78 @@
 			pvHLeft += ROWSTRIDE_2COL;
 			
 			// Save E values
-			_mm_store_si128(pvERight, ve);
+			simde_mm_store_si128(pvERight, ve);
 			pvERight += ROWSTRIDE_2COL;
 			
 			// Update vf value
-			vtmp = _mm_subs_epi16(vtmp, rfgapo);
-			vf = _mm_subs_epi16(vf, rfgape);
+			vtmp = simde_mm_subs_epi16(vtmp, rfgapo);
+			vf = simde_mm_subs_epi16(vf, rfgape);
 			assert_all_lt(vf, vhi);
-			vf = _mm_max_epi16(vf, vtmp);
+			vf = simde_mm_max_epi16(vf, vtmp);
 			
 			pvScore += 2; // move on to next query profile / gap veto
 		}
 		// pvHStore, pvELoad, pvEStore have all rolled over to the next column
 		pvFRight -= colstride; // reset to start of column
-		vtmp = _mm_load_si128(pvFRight);
+		vtmp = simde_mm_load_si128(pvFRight);
 		
 		pvHRight -= colstride; // reset to start of column
-		vh = _mm_load_si128(pvHRight);
+		vh = simde_mm_load_si128(pvHRight);
 		
 		pvScore = d.profbuf_.ptr() + off + 1; // reset veto vector
 		
 		// vf from last row gets shifted down by one to overlay the first row
 		// rfgape has already been subtracted from it.
-		vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
-		vf = _mm_or_si128(vf, vlolsw);
+		vf = simde_mm_slli_si128(vf, NBYTES_PER_WORD);
+		vf = simde_mm_or_si128(vf, vlolsw);
 		
-		vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
-		vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
-		vf = _mm_max_epi16(vtmp, vf);
-		vtmp = _mm_cmpgt_epi16(vf, vtmp);
-		int cmp = _mm_movemask_epi8(vtmp);
+		vf = simde_mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
+		vf = simde_mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
+		vf = simde_mm_max_epi16(vtmp, vf);
+		vtmp = simde_mm_cmpgt_epi16(vf, vtmp);
+		int cmp = simde_mm_movemask_epi8(vtmp);
 		
 		// If any element of vtmp is greater than H - gap-open...
 		j = 0;
 		while(cmp != 0x0000) {
 			// Store this vf
-			_mm_store_si128(pvFRight, vf);
+			simde_mm_store_si128(pvFRight, vf);
 			pvFRight += ROWSTRIDE_2COL;
 			
 			// Update vh w/r/t new vf
-			vh = _mm_max_epi16(vh, vf);
+			vh = simde_mm_max_epi16(vh, vf);
 			
 			// Save vH values
-			_mm_store_si128(pvHRight, vh);
+			simde_mm_store_si128(pvHRight, vh);
 			pvHRight += ROWSTRIDE_2COL;
 			
 			// Update highest score encountered so far.
-			vcolmax = _mm_max_epi16(vcolmax, vh);
+			vcolmax = simde_mm_max_epi16(vcolmax, vh);
 
 			pvScore += 2;
 			
 			assert_lt(j, iter);
 			if(++j == iter) {
 				pvFRight -= colstride;
-				vtmp = _mm_load_si128(pvFRight);   // load next vf ASAP
+				vtmp = simde_mm_load_si128(pvFRight);   // load next vf ASAP
 				pvHRight -= colstride;
-				vh = _mm_load_si128(pvHRight);     // load next vh ASAP
+				vh = simde_mm_load_si128(pvHRight);     // load next vh ASAP
 				pvScore = d.profbuf_.ptr() + off + 1;
 				j = 0;
-				vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
-				vf = _mm_or_si128(vf, vlolsw);
+				vf = simde_mm_slli_si128(vf, NBYTES_PER_WORD);
+				vf = simde_mm_or_si128(vf, vlolsw);
 			} else {
-				vtmp = _mm_load_si128(pvFRight);   // load next vf ASAP
-				vh = _mm_load_si128(pvHRight);     // load next vh ASAP
+				vtmp = simde_mm_load_si128(pvFRight);   // load next vf ASAP
+				vh = simde_mm_load_si128(pvHRight);     // load next vh ASAP
 			}
 			
 			// Update F with another gap extension
-			vf = _mm_subs_epi16(vf, rfgape);
-			vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
-			vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
-			vf = _mm_max_epi16(vtmp, vf);
-			vtmp = _mm_cmpgt_epi16(vf, vtmp);
-			cmp = _mm_movemask_epi8(vtmp);
+			vf = simde_mm_subs_epi16(vf, rfgape);
+			vf = simde_mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
+			vf = simde_mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
+			vf = simde_mm_max_epi16(vtmp, vf);
+			vtmp = simde_mm_cmpgt_epi16(vf, vtmp);
+			cmp = simde_mm_movemask_epi8(vtmp);
 			nfixup++;
 		}
 
@@ -689,9 +664,9 @@
 			assert_lt(lastoff, MAX_SIZE_T);
 			pvScore = d.profbuf_.ptr() + lastoff; // even elts = query profile, odd = gap barrier
 			for(size_t k = 0; k < iter; k++) {
-				vh = _mm_load_si128(pvHLeft);
-				vtmp = _mm_cmpgt_epi16(pvScore[0], vzero);
-				int cmp = _mm_movemask_epi8(vtmp);
+				vh = simde_mm_load_si128(pvHLeft);
+				vtmp = simde_mm_cmpgt_epi16(pvScore[0], vzero);
+				int cmp = simde_mm_movemask_epi8(vtmp);
 				if(cmp != 0) {
 					// At least one candidate in this mask.  Now iterate
 					// through vm/vh to evaluate individual cells.
@@ -720,9 +695,9 @@
 		// Save some elements to checkpoints
 		if(checkpoint) {
 			
-			__m128i *pvE = vbuf_r + 0;
-			__m128i *pvF = vbuf_r + 1;
-			__m128i *pvH = vbuf_r + 2;
+			simde__m128i *pvE = vbuf_r + 0;
+			simde__m128i *pvF = vbuf_r + 1;
+			simde__m128i *pvH = vbuf_r + 2;
 			size_t coli = i - rfi_;
 			if(coli < cper_.locol_) cper_.locol_ = coli;
 			if(coli > cper_.hicol_) cper_.hicol_ = coli;
@@ -822,30 +797,30 @@
 					assert_gt(coli, 0);
 					size_t wordspercol = cper_.niter_ * ROWSTRIDE_2COL;
 					size_t coloff = (coli >> cper_.perpow2_) * wordspercol;
-					__m128i *dst = cper_.qcols_.ptr() + coloff;
-					memcpy(dst, vbuf_r, sizeof(__m128i) * wordspercol);
+					simde__m128i *dst = cper_.qcols_.ptr() + coloff;
+					memcpy(dst, vbuf_r, sizeof(simde__m128i) * wordspercol);
 				}
 			}
 			if(cper_.debug_) {
 				// Save the column using memcpys
 				size_t wordspercol = cper_.niter_ * ROWSTRIDE_2COL;
 				size_t coloff = coli * wordspercol;
-				__m128i *dst = cper_.qcolsD_.ptr() + coloff;
-				memcpy(dst, vbuf_r, sizeof(__m128i) * wordspercol);
+				simde__m128i *dst = cper_.qcolsD_.ptr() + coloff;
+				memcpy(dst, vbuf_r, sizeof(simde__m128i) * wordspercol);
 			}
 		}
 
-		vmax = _mm_max_epi16(vmax, vcolmax);
+		vmax = simde_mm_max_epi16(vmax, vcolmax);
 		{
 			// Get single largest score in this column
 			vmaxtmp = vcolmax;
-			vtmp = _mm_srli_si128(vmaxtmp, 8);
-			vmaxtmp = _mm_max_epi16(vmaxtmp, vtmp);
-			vtmp = _mm_srli_si128(vmaxtmp, 4);
-			vmaxtmp = _mm_max_epi16(vmaxtmp, vtmp);
-			vtmp = _mm_srli_si128(vmaxtmp, 2);
-			vmaxtmp = _mm_max_epi16(vmaxtmp, vtmp);
-			int16_t ret = _mm_extract_epi16(vmaxtmp, 0);
+			vtmp = simde_mm_srli_si128(vmaxtmp, 8);
+			vmaxtmp = simde_mm_max_epi16(vmaxtmp, vtmp);
+			vtmp = simde_mm_srli_si128(vmaxtmp, 4);
+			vmaxtmp = simde_mm_max_epi16(vmaxtmp, vtmp);
+			vtmp = simde_mm_srli_si128(vmaxtmp, 2);
+			vmaxtmp = simde_mm_max_epi16(vmaxtmp, vtmp);
+			int16_t ret = simde_mm_extract_epi16(vmaxtmp, 0);
 			TAlScore score = (TAlScore)(ret + 0x8000);
 			if(ret == MIN_I16) {
 				score = MIN_I64;
@@ -878,9 +853,9 @@
 		assert_lt(lastoff, MAX_SIZE_T);
 		pvScore = d.profbuf_.ptr() + lastoff; // even elts = query profile, odd = gap barrier
 		for(size_t k = 0; k < iter; k++) {
-			vh = _mm_load_si128(pvHLeft);
-			vtmp = _mm_cmpgt_epi16(pvScore[0], vzero);
-			int cmp = _mm_movemask_epi8(vtmp);
+			vh = simde_mm_load_si128(pvHLeft);
+			vtmp = simde_mm_cmpgt_epi16(pvScore[0], vzero);
+			int cmp = simde_mm_movemask_epi8(vtmp);
 			if(cmp != 0) {
 				// At least one candidate in this mask.  Now iterate
 				// through vm/vh to evaluate individual cells.
@@ -907,13 +882,13 @@
 	}
 
 	// Find largest score in vmax
-	vtmp = _mm_srli_si128(vmax, 8);
-	vmax = _mm_max_epi16(vmax, vtmp);
-	vtmp = _mm_srli_si128(vmax, 4);
-	vmax = _mm_max_epi16(vmax, vtmp);
-	vtmp = _mm_srli_si128(vmax, 2);
-	vmax = _mm_max_epi16(vmax, vtmp);
-	int16_t ret = _mm_extract_epi16(vmax, 0);
+	vtmp = simde_mm_srli_si128(vmax, 8);
+	vmax = simde_mm_max_epi16(vmax, vtmp);
+	vtmp = simde_mm_srli_si128(vmax, 4);
+	vmax = simde_mm_max_epi16(vmax, vtmp);
+	vtmp = simde_mm_srli_si128(vmax, 2);
+	vmax = simde_mm_max_epi16(vmax, vtmp);
+	int16_t ret = simde_mm_extract_epi16(vmax, 0);
 
 	// Update metrics
 	if(!debug) {
@@ -996,72 +971,72 @@
 	// Much of the implmentation below is adapted from Michael's code.
 
 	// Set all elts to reference gap open penalty
-	__m128i rfgapo   = _mm_setzero_si128();
-	__m128i rfgape   = _mm_setzero_si128();
-	__m128i rdgapo   = _mm_setzero_si128();
-	__m128i rdgape   = _mm_setzero_si128();
-	__m128i vlo      = _mm_setzero_si128();
-	__m128i vhi      = _mm_setzero_si128();
-	__m128i vlolsw   = _mm_setzero_si128();
-	__m128i vmax     = _mm_setzero_si128();
-	__m128i vcolmax  = _mm_setzero_si128();
-	__m128i vmaxtmp  = _mm_setzero_si128();
-	__m128i ve       = _mm_setzero_si128();
-	__m128i vf       = _mm_setzero_si128();
-	__m128i vh       = _mm_setzero_si128();
-	__m128i vtmp     = _mm_setzero_si128();
+	simde__m128i rfgapo   = simde_mm_setzero_si128();
+	simde__m128i rfgape   = simde_mm_setzero_si128();
+	simde__m128i rdgapo   = simde_mm_setzero_si128();
+	simde__m128i rdgape   = simde_mm_setzero_si128();
+	simde__m128i vlo      = simde_mm_setzero_si128();
+	simde__m128i vhi      = simde_mm_setzero_si128();
+	simde__m128i vlolsw   = simde_mm_setzero_si128();
+	simde__m128i vmax     = simde_mm_setzero_si128();
+	simde__m128i vcolmax  = simde_mm_setzero_si128();
+	simde__m128i vmaxtmp  = simde_mm_setzero_si128();
+	simde__m128i ve       = simde_mm_setzero_si128();
+	simde__m128i vf       = simde_mm_setzero_si128();
+	simde__m128i vh       = simde_mm_setzero_si128();
+	simde__m128i vtmp     = simde_mm_setzero_si128();
 
 	assert_gt(sc_->refGapOpen(), 0);
 	assert_leq(sc_->refGapOpen(), MAX_I16);
-	rfgapo = _mm_insert_epi16(rfgapo, sc_->refGapOpen(), 0);
-	rfgapo = _mm_shufflelo_epi16(rfgapo, 0);
-	rfgapo = _mm_shuffle_epi32(rfgapo, 0);
+	rfgapo = simde_mm_insert_epi16(rfgapo, sc_->refGapOpen(), 0);
+	rfgapo = simde_mm_shufflelo_epi16(rfgapo, 0);
+	rfgapo = simde_mm_shuffle_epi32(rfgapo, 0);
 	
 	// Set all elts to reference gap extension penalty
 	assert_gt(sc_->refGapExtend(), 0);
 	assert_leq(sc_->refGapExtend(), MAX_I16);
 	assert_leq(sc_->refGapExtend(), sc_->refGapOpen());
-	rfgape = _mm_insert_epi16(rfgape, sc_->refGapExtend(), 0);
-	rfgape = _mm_shufflelo_epi16(rfgape, 0);
-	rfgape = _mm_shuffle_epi32(rfgape, 0);
+	rfgape = simde_mm_insert_epi16(rfgape, sc_->refGapExtend(), 0);
+	rfgape = simde_mm_shufflelo_epi16(rfgape, 0);
+	rfgape = simde_mm_shuffle_epi32(rfgape, 0);
 
 	// Set all elts to read gap open penalty
 	assert_gt(sc_->readGapOpen(), 0);
 	assert_leq(sc_->readGapOpen(), MAX_I16);
-	rdgapo = _mm_insert_epi16(rdgapo, sc_->readGapOpen(), 0);
-	rdgapo = _mm_shufflelo_epi16(rdgapo, 0);
-	rdgapo = _mm_shuffle_epi32(rdgapo, 0);
+	rdgapo = simde_mm_insert_epi16(rdgapo, sc_->readGapOpen(), 0);
+	rdgapo = simde_mm_shufflelo_epi16(rdgapo, 0);
+	rdgapo = simde_mm_shuffle_epi32(rdgapo, 0);
 	
 	// Set all elts to read gap extension penalty
 	assert_gt(sc_->readGapExtend(), 0);
 	assert_leq(sc_->readGapExtend(), MAX_I16);
 	assert_leq(sc_->readGapExtend(), sc_->readGapOpen());
-	rdgape = _mm_insert_epi16(rdgape, sc_->readGapExtend(), 0);
-	rdgape = _mm_shufflelo_epi16(rdgape, 0);
-	rdgape = _mm_shuffle_epi32(rdgape, 0);
+	rdgape = simde_mm_insert_epi16(rdgape, sc_->readGapExtend(), 0);
+	rdgape = simde_mm_shufflelo_epi16(rdgape, 0);
+	rdgape = simde_mm_shuffle_epi32(rdgape, 0);
 
 	// Set all elts to 0x8000 (min value for signed 16-bit)
-	vlo = _mm_cmpeq_epi16(vlo, vlo);             // all elts = 0xffff
-	vlo = _mm_slli_epi16(vlo, NBITS_PER_WORD-1); // all elts = 0x8000
+	vlo = simde_mm_cmpeq_epi16(vlo, vlo);             // all elts = 0xffff
+	vlo = simde_mm_slli_epi16(vlo, NBITS_PER_WORD-1); // all elts = 0x8000
 	
 	// Set all elts to 0x7fff (max value for signed 16-bit)
-	vhi = _mm_cmpeq_epi16(vhi, vhi);             // all elts = 0xffff
-	vhi = _mm_srli_epi16(vhi, 1);                // all elts = 0x7fff
+	vhi = simde_mm_cmpeq_epi16(vhi, vhi);             // all elts = 0xffff
+	vhi = simde_mm_srli_epi16(vhi, 1);                // all elts = 0x7fff
 	
 	// Set all elts to 0x8000 (min value for signed 16-bit)
 	vmax = vlo;
 	
 	// vlolsw: topmost (least sig) word set to 0x8000, all other words=0
-	vlolsw = _mm_shuffle_epi32(vlo, 0);
-	vlolsw = _mm_srli_si128(vlolsw, NBYTES_PER_REG - NBYTES_PER_WORD);
+	vlolsw = simde_mm_shuffle_epi32(vlo, 0);
+	vlolsw = simde_mm_srli_si128(vlolsw, NBYTES_PER_REG - NBYTES_PER_WORD);
 	
-	// Points to a long vector of __m128i where each element is a block of
+	// Points to a long vector of simde__m128i where each element is a block of
 	// contiguous cells in the E, F or H matrix.  If the index % 3 == 0, then
 	// the block of cells is from the E matrix.  If index % 3 == 1, they're
 	// from the F matrix.  If index % 3 == 2, then they're from the H matrix.
 	// Blocks of cells are organized in the same interleaved manner as they are
 	// calculated by the Farrar algorithm.
-	const __m128i *pvScore; // points into the query profile
+	const simde__m128i *pvScore; // points into the query profile
 
 	d.mat_.init(dpRows(), rff_ - rfi_, NWORDS_PER_REG);
 	const size_t colstride = d.mat_.colstride();
@@ -1069,22 +1044,22 @@
 	assert_eq(ROWSTRIDE, colstride / iter);
 	
 	// Initialize the H and E vectors in the first matrix column
-	__m128i *pvHTmp = d.mat_.tmpvec(0, 0);
-	__m128i *pvETmp = d.mat_.evec(0, 0);
+	simde__m128i *pvHTmp = d.mat_.tmpvec(0, 0);
+	simde__m128i *pvETmp = d.mat_.evec(0, 0);
 	
 	for(size_t i = 0; i < iter; i++) {
-		_mm_store_si128(pvETmp, vlo);
-		_mm_store_si128(pvHTmp, vlo); // start low in local mode
+		simde_mm_store_si128(pvETmp, vlo);
+		simde_mm_store_si128(pvHTmp, vlo); // start low in local mode
 		pvETmp += ROWSTRIDE;
 		pvHTmp += ROWSTRIDE;
 	}
 	// These are swapped just before the innermost loop
-	__m128i *pvHStore = d.mat_.hvec(0, 0);
-	__m128i *pvHLoad  = d.mat_.tmpvec(0, 0);
-	__m128i *pvELoad  = d.mat_.evec(0, 0);
-	__m128i *pvEStore = d.mat_.evecUnsafe(0, 1);
-	__m128i *pvFStore = d.mat_.fvec(0, 0);
-	__m128i *pvFTmp   = NULL;
+	simde__m128i *pvHStore = d.mat_.hvec(0, 0);
+	simde__m128i *pvHLoad  = d.mat_.tmpvec(0, 0);
+	simde__m128i *pvELoad  = d.mat_.evec(0, 0);
+	simde__m128i *pvEStore = d.mat_.evecUnsafe(0, 1);
+	simde__m128i *pvFStore = d.mat_.fvec(0, 0);
+	simde__m128i *pvFTmp   = NULL;
 	
 	assert_gt(sc_->gapbar, 0);
 	size_t nfixup = 0;
@@ -1116,69 +1091,69 @@
 		pvScore = d.profbuf_.ptr() + off; // even elts = query profile, odd = gap barrier
 		
 		// Load H vector from the final row of the previous column
-		vh = _mm_load_si128(pvHLoad + colstride - ROWSTRIDE);
+		vh = simde_mm_load_si128(pvHLoad + colstride - ROWSTRIDE);
 		
 		// Set all F cells to low value
-		vf = _mm_cmpeq_epi16(vf, vf);
-		vf = _mm_slli_epi16(vf, NBITS_PER_WORD-1);
-		vf = _mm_or_si128(vf, vlolsw);
+		vf = simde_mm_cmpeq_epi16(vf, vf);
+		vf = simde_mm_slli_epi16(vf, NBITS_PER_WORD-1);
+		vf = simde_mm_or_si128(vf, vlolsw);
 		// vf now contains the vertical contribution
 
 		// Store cells in F, calculated previously
 		// No need to veto ref gap extensions, they're all 0x8000s
-		_mm_store_si128(pvFStore, vf);
+		simde_mm_store_si128(pvFStore, vf);
 		pvFStore += ROWSTRIDE;
 		
 		// Shift down so that topmost (least sig) cell gets 0
-		vh = _mm_slli_si128(vh, NBYTES_PER_WORD);
+		vh = simde_mm_slli_si128(vh, NBYTES_PER_WORD);
 		// Fill topmost (least sig) cell with low value
-		vh = _mm_or_si128(vh, vlolsw);
+		vh = simde_mm_or_si128(vh, vlolsw);
 		
 		// We pull out one loop iteration to make it easier to veto values in the top row
 		
 		// Load cells from E, calculated previously
-		ve = _mm_load_si128(pvELoad);
+		ve = simde_mm_load_si128(pvELoad);
 		assert_all_lt(ve, vhi);
 		pvELoad += ROWSTRIDE;
 		// ve now contains the horizontal contribution
 		
 		// Factor in query profile (matches and mismatches)
-		vh = _mm_adds_epi16(vh, pvScore[0]);
+		vh = simde_mm_adds_epi16(vh, pvScore[0]);
 		// vh now contains the diagonal contribution
 		
 		// Update H, factoring in E and F
-		vtmp = _mm_max_epi16(vh, ve);
+		vtmp = simde_mm_max_epi16(vh, ve);
 		// F won't change anything!
 		
 		vh = vtmp;
 		
 		// Update highest score so far
 		vcolmax = vlo;
-		vcolmax = _mm_max_epi16(vcolmax, vh);
+		vcolmax = simde_mm_max_epi16(vcolmax, vh);
 		
 		// Save the new vH values
-		_mm_store_si128(pvHStore, vh);
+		simde_mm_store_si128(pvHStore, vh);
 		pvHStore += ROWSTRIDE;
 		
 		// Update vE value
 		vf = vh;
-		vh = _mm_subs_epi16(vh, rdgapo);
-		vh = _mm_adds_epi16(vh, pvScore[1]); // veto some read gap opens
-		vh = _mm_adds_epi16(vh, pvScore[1]); // veto some read gap opens
-		ve = _mm_subs_epi16(ve, rdgape);
-		ve = _mm_max_epi16(ve, vh);
+		vh = simde_mm_subs_epi16(vh, rdgapo);
+		vh = simde_mm_adds_epi16(vh, pvScore[1]); // veto some read gap opens
+		vh = simde_mm_adds_epi16(vh, pvScore[1]); // veto some read gap opens
+		ve = simde_mm_subs_epi16(ve, rdgape);
+		ve = simde_mm_max_epi16(ve, vh);
 		assert_all_lt(ve, vhi);
 		
 		// Load the next h value
-		vh = _mm_load_si128(pvHLoad);
+		vh = simde_mm_load_si128(pvHLoad);
 		pvHLoad += ROWSTRIDE;
 		
 		// Save E values
-		_mm_store_si128(pvEStore, ve);
+		simde_mm_store_si128(pvEStore, ve);
 		pvEStore += ROWSTRIDE;
 		
 		// Update vf value
-		vf = _mm_subs_epi16(vf, rfgapo);
+		vf = simde_mm_subs_epi16(vf, rfgapo);
 		assert_all_lt(vf, vhi);
 		
 		pvScore += 2; // move on to next query profile
@@ -1187,131 +1162,131 @@
 		size_t j;
 		for(j = 1; j < iter; j++) {
 			// Load cells from E, calculated previously
-			ve = _mm_load_si128(pvELoad);
+			ve = simde_mm_load_si128(pvELoad);
 			assert_all_lt(ve, vhi);
 			pvELoad += ROWSTRIDE;
 			
 			// Store cells in F, calculated previously
-			vf = _mm_adds_epi16(vf, pvScore[1]); // veto some ref gap extensions
-			vf = _mm_adds_epi16(vf, pvScore[1]); // veto some ref gap extensions
-			_mm_store_si128(pvFStore, vf);
+			vf = simde_mm_adds_epi16(vf, pvScore[1]); // veto some ref gap extensions
+			vf = simde_mm_adds_epi16(vf, pvScore[1]); // veto some ref gap extensions
+			simde_mm_store_si128(pvFStore, vf);
 			pvFStore += ROWSTRIDE;
 			
 			// Factor in query profile (matches and mismatches)
-			vh = _mm_adds_epi16(vh, pvScore[0]);
+			vh = simde_mm_adds_epi16(vh, pvScore[0]);
 			
 			// Update H, factoring in E and F
-			vh = _mm_max_epi16(vh, ve);
-			vh = _mm_max_epi16(vh, vf);
+			vh = simde_mm_max_epi16(vh, ve);
+			vh = simde_mm_max_epi16(vh, vf);
 			
 			// Update highest score encountered this far
-			vcolmax = _mm_max_epi16(vcolmax, vh);
+			vcolmax = simde_mm_max_epi16(vcolmax, vh);
 			
 			// Save the new vH values
-			_mm_store_si128(pvHStore, vh);
+			simde_mm_store_si128(pvHStore, vh);
 			pvHStore += ROWSTRIDE;
 			
 			// Update vE value
 			vtmp = vh;
-			vh = _mm_subs_epi16(vh, rdgapo);
-			vh = _mm_adds_epi16(vh, pvScore[1]); // veto some read gap opens
-			vh = _mm_adds_epi16(vh, pvScore[1]); // veto some read gap opens
-			ve = _mm_subs_epi16(ve, rdgape);
-			ve = _mm_max_epi16(ve, vh);
+			vh = simde_mm_subs_epi16(vh, rdgapo);
+			vh = simde_mm_adds_epi16(vh, pvScore[1]); // veto some read gap opens
+			vh = simde_mm_adds_epi16(vh, pvScore[1]); // veto some read gap opens
+			ve = simde_mm_subs_epi16(ve, rdgape);
+			ve = simde_mm_max_epi16(ve, vh);
 			assert_all_lt(ve, vhi);
 			
 			// Load the next h value
-			vh = _mm_load_si128(pvHLoad);
+			vh = simde_mm_load_si128(pvHLoad);
 			pvHLoad += ROWSTRIDE;
 			
 			// Save E values
-			_mm_store_si128(pvEStore, ve);
+			simde_mm_store_si128(pvEStore, ve);
 			pvEStore += ROWSTRIDE;
 			
 			// Update vf value
-			vtmp = _mm_subs_epi16(vtmp, rfgapo);
-			vf = _mm_subs_epi16(vf, rfgape);
+			vtmp = simde_mm_subs_epi16(vtmp, rfgapo);
+			vf = simde_mm_subs_epi16(vf, rfgape);
 			assert_all_lt(vf, vhi);
-			vf = _mm_max_epi16(vf, vtmp);
+			vf = simde_mm_max_epi16(vf, vtmp);
 			
 			pvScore += 2; // move on to next query profile / gap veto
 		}
 		// pvHStore, pvELoad, pvEStore have all rolled over to the next column
 		pvFTmp = pvFStore;
 		pvFStore -= colstride; // reset to start of column
-		vtmp = _mm_load_si128(pvFStore);
+		vtmp = simde_mm_load_si128(pvFStore);
 		
 		pvHStore -= colstride; // reset to start of column
-		vh = _mm_load_si128(pvHStore);
+		vh = simde_mm_load_si128(pvHStore);
 		
 		pvEStore -= colstride; // reset to start of column
-		ve = _mm_load_si128(pvEStore);
+		ve = simde_mm_load_si128(pvEStore);
 		
 		pvHLoad = pvHStore;    // new pvHLoad = pvHStore
 		pvScore = d.profbuf_.ptr() + off + 1; // reset veto vector
 		
 		// vf from last row gets shifted down by one to overlay the first row
 		// rfgape has already been subtracted from it.
-		vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
-		vf = _mm_or_si128(vf, vlolsw);
+		vf = simde_mm_slli_si128(vf, NBYTES_PER_WORD);
+		vf = simde_mm_or_si128(vf, vlolsw);
 		
-		vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
-		vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
-		vf = _mm_max_epi16(vtmp, vf);
-		vtmp = _mm_cmpgt_epi16(vf, vtmp);
-		int cmp = _mm_movemask_epi8(vtmp);
+		vf = simde_mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
+		vf = simde_mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
+		vf = simde_mm_max_epi16(vtmp, vf);
+		vtmp = simde_mm_cmpgt_epi16(vf, vtmp);
+		int cmp = simde_mm_movemask_epi8(vtmp);
 		
 		// If any element of vtmp is greater than H - gap-open...
 		j = 0;
 		while(cmp != 0x0000) {
 			// Store this vf
-			_mm_store_si128(pvFStore, vf);
+			simde_mm_store_si128(pvFStore, vf);
 			pvFStore += ROWSTRIDE;
 			
 			// Update vh w/r/t new vf
-			vh = _mm_max_epi16(vh, vf);
+			vh = simde_mm_max_epi16(vh, vf);
 			
 			// Save vH values
-			_mm_store_si128(pvHStore, vh);
+			simde_mm_store_si128(pvHStore, vh);
 			pvHStore += ROWSTRIDE;
 			
 			// Update highest score encountered this far
-			vcolmax = _mm_max_epi16(vcolmax, vh);
+			vcolmax = simde_mm_max_epi16(vcolmax, vh);
 			
 			// Update E in case it can be improved using our new vh
-			vh = _mm_subs_epi16(vh, rdgapo);
-			vh = _mm_adds_epi16(vh, *pvScore); // veto some read gap opens
-			vh = _mm_adds_epi16(vh, *pvScore); // veto some read gap opens
-			ve = _mm_max_epi16(ve, vh);
-			_mm_store_si128(pvEStore, ve);
+			vh = simde_mm_subs_epi16(vh, rdgapo);
+			vh = simde_mm_adds_epi16(vh, *pvScore); // veto some read gap opens
+			vh = simde_mm_adds_epi16(vh, *pvScore); // veto some read gap opens
+			ve = simde_mm_max_epi16(ve, vh);
+			simde_mm_store_si128(pvEStore, ve);
 			pvEStore += ROWSTRIDE;
 			pvScore += 2;
 			
 			assert_lt(j, iter);
 			if(++j == iter) {
 				pvFStore -= colstride;
-				vtmp = _mm_load_si128(pvFStore);   // load next vf ASAP
+				vtmp = simde_mm_load_si128(pvFStore);   // load next vf ASAP
 				pvHStore -= colstride;
-				vh = _mm_load_si128(pvHStore);     // load next vh ASAP
+				vh = simde_mm_load_si128(pvHStore);     // load next vh ASAP
 				pvEStore -= colstride;
-				ve = _mm_load_si128(pvEStore);     // load next ve ASAP
+				ve = simde_mm_load_si128(pvEStore);     // load next ve ASAP
 				pvScore = d.profbuf_.ptr() + off + 1;
 				j = 0;
-				vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
-				vf = _mm_or_si128(vf, vlolsw);
+				vf = simde_mm_slli_si128(vf, NBYTES_PER_WORD);
+				vf = simde_mm_or_si128(vf, vlolsw);
 			} else {
-				vtmp = _mm_load_si128(pvFStore);   // load next vf ASAP
-				vh = _mm_load_si128(pvHStore);     // load next vh ASAP
-				ve = _mm_load_si128(pvEStore);     // load next vh ASAP
+				vtmp = simde_mm_load_si128(pvFStore);   // load next vf ASAP
+				vh = simde_mm_load_si128(pvHStore);     // load next vh ASAP
+				ve = simde_mm_load_si128(pvEStore);     // load next ve ASAP
 			}
 			
 			// Update F with another gap extension
-			vf = _mm_subs_epi16(vf, rfgape);
-			vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
-			vf = _mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
-			vf = _mm_max_epi16(vtmp, vf);
-			vtmp = _mm_cmpgt_epi16(vf, vtmp);
-			cmp = _mm_movemask_epi8(vtmp);
+			vf = simde_mm_subs_epi16(vf, rfgape);
+			vf = simde_mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
+			vf = simde_mm_adds_epi16(vf, *pvScore); // veto some ref gap extensions
+			vf = simde_mm_max_epi16(vtmp, vf);
+			vtmp = simde_mm_cmpgt_epi16(vf, vtmp);
+			cmp = simde_mm_movemask_epi8(vtmp);
 			nfixup++;
 		}
 		
@@ -1333,19 +1308,19 @@
 #endif
 
 		// Store column maximum vector in first element of tmp
-		vmax = _mm_max_epi16(vmax, vcolmax);
-		_mm_store_si128(d.mat_.tmpvec(0, i - rfi_), vcolmax);
+		vmax = simde_mm_max_epi16(vmax, vcolmax);
+		simde_mm_store_si128(d.mat_.tmpvec(0, i - rfi_), vcolmax);
 
 		{
 			// Get single largest score in this column
 			vmaxtmp = vcolmax;
-			vtmp = _mm_srli_si128(vmaxtmp, 8);
-			vmaxtmp = _mm_max_epi16(vmaxtmp, vtmp);
-			vtmp = _mm_srli_si128(vmaxtmp, 4);
-			vmaxtmp = _mm_max_epi16(vmaxtmp, vtmp);
-			vtmp = _mm_srli_si128(vmaxtmp, 2);
-			vmaxtmp = _mm_max_epi16(vmaxtmp, vtmp);
-			int16_t ret = _mm_extract_epi16(vmaxtmp, 0);
+			vtmp = simde_mm_srli_si128(vmaxtmp, 8);
+			vmaxtmp = simde_mm_max_epi16(vmaxtmp, vtmp);
+			vtmp = simde_mm_srli_si128(vmaxtmp, 4);
+			vmaxtmp = simde_mm_max_epi16(vmaxtmp, vtmp);
+			vtmp = simde_mm_srli_si128(vmaxtmp, 2);
+			vmaxtmp = simde_mm_max_epi16(vmaxtmp, vtmp);
+			int16_t ret = simde_mm_extract_epi16(vmaxtmp, 0);
 			TAlScore score = (TAlScore)(ret + 0x8000);
 			
 			if(score < minsc_) {
@@ -1370,13 +1345,13 @@
 	}
 
 	// Find largest score in vmax
-	vtmp = _mm_srli_si128(vmax, 8);
-	vmax = _mm_max_epi16(vmax, vtmp);
-	vtmp = _mm_srli_si128(vmax, 4);
-	vmax = _mm_max_epi16(vmax, vtmp);
-	vtmp = _mm_srli_si128(vmax, 2);
-	vmax = _mm_max_epi16(vmax, vtmp);
-	int16_t ret = _mm_extract_epi16(vmax, 0);
+	vtmp = simde_mm_srli_si128(vmax, 8);
+	vmax = simde_mm_max_epi16(vmax, vtmp);
+	vtmp = simde_mm_srli_si128(vmax, 4);
+	vmax = simde_mm_max_epi16(vmax, vtmp);
+	vtmp = simde_mm_srli_si128(vmax, 2);
+	vmax = simde_mm_max_epi16(vmax, vtmp);
+	int16_t ret = simde_mm_extract_epi16(vmax, 0);
 
 	// Update metrics
 	if(!debug) {
@@ -1495,21 +1470,21 @@
 		size_t nrow_hi = nrow;
 		// First, check if there is a cell in this column with a score
 		// above the score threshold
-		__m128i vmax = *d.mat_.tmpvec(0, j);
-		__m128i vtmp = _mm_srli_si128(vmax, 8);
-		vmax = _mm_max_epi16(vmax, vtmp);
-		vtmp = _mm_srli_si128(vmax, 4);
-		vmax = _mm_max_epi16(vmax, vtmp);
-		vtmp = _mm_srli_si128(vmax, 2);
-		vmax = _mm_max_epi16(vmax, vtmp);
-		TAlScore score = (TAlScore)((int16_t)_mm_extract_epi16(vmax, 0) + 0x8000);
+		simde__m128i vmax = *d.mat_.tmpvec(0, j);
+		simde__m128i vtmp = simde_mm_srli_si128(vmax, 8);
+		vmax = simde_mm_max_epi16(vmax, vtmp);
+		vtmp = simde_mm_srli_si128(vmax, 4);
+		vmax = simde_mm_max_epi16(vmax, vtmp);
+		vtmp = simde_mm_srli_si128(vmax, 2);
+		vmax = simde_mm_max_epi16(vmax, vtmp);
+		TAlScore score = (TAlScore)((int16_t)simde_mm_extract_epi16(vmax, 0) + 0x8000);
 		assert_geq(score, 0);
 #ifndef NDEBUG
 		{
 			// Start in upper vector row and move down
 			TAlScore max = 0;
 			vmax = *d.mat_.tmpvec(0, j);
-			__m128i *pvH = d.mat_.hvec(0, j);
+			simde__m128i *pvH = d.mat_.hvec(0, j);
 			for(size_t i = 0; i < iter; i++) {
 				for(size_t k = 0; k < NWORDS_PER_REG; k++) {
 					TAlScore sc = (TAlScore)(((TCScore*)pvH)[k] + 0x8000);
@@ -1529,11 +1504,11 @@
 			continue;
 		}
 		// Get pointer to first cell in column to examine:
-		__m128i *pvHorig = d.mat_.hvec(0, j);
-		__m128i *pvH     = pvHorig;
+		simde__m128i *pvHorig = d.mat_.hvec(0, j);
+		simde__m128i *pvH     = pvHorig;
 		// Get pointer to the vector in the following column that corresponds
 		// to the cells diagonally down and to the right from the cells in pvH
-		__m128i *pvHSucc = (j < ncol-1) ? d.mat_.hvec(0, j+1) : NULL;
+		simde__m128i *pvHSucc = (j < ncol-1) ? d.mat_.hvec(0, j+1) : NULL;
 		// Start in upper vector row and move down
 		for(size_t i = 0; i < iter; i++) {
 			if(pvHSucc != NULL) {
@@ -1700,7 +1675,7 @@
 	size_t rowelt, rowvec, eltvec;
 	size_t left_rowelt, up_rowelt, upleft_rowelt;
 	size_t left_rowvec, up_rowvec, upleft_rowvec;
-	__m128i *cur_vec, *left_vec, *up_vec, *upleft_vec;
+	simde__m128i *cur_vec, *left_vec, *up_vec, *upleft_vec;
 	const size_t gbar = sc_->gapbar;
 	NEW_ROW_COL(row, col);
 	// If 'backEliminate' is true, then every time we visit a cell, we remove
--- hisat2.orig/aligner_swsse_loc_u8.cpp
+++ hisat2/aligner_swsse_loc_u8.cpp
@@ -83,7 +83,7 @@
 	const BTString* qu = fw ? qufw_ : qurc_;
 	const size_t len = rd->length();
 	const size_t seglen = (len + (NWORDS_PER_REG-1)) / NWORDS_PER_REG;
-	// How many __m128i's are needed
+	// How many simde__m128i's are needed
 	size_t n128s =
 		64 +                    // slack bytes, for alignment?
 		(seglen * ALPHA_SIZE)   // query profile data
@@ -240,49 +240,24 @@
 #ifdef NDEBUG
 
 #define assert_all_eq0(x)
-#define assert_all_gt(x, y)
-#define assert_all_gt_lo(x)
 #define assert_all_lt(x, y)
-#define assert_all_lt_hi(x)
 
 #else
 
 #define assert_all_eq0(x) { \
-	__m128i z = _mm_setzero_si128(); \
-	__m128i tmp = _mm_setzero_si128(); \
-	z = _mm_xor_si128(z, z); \
-	tmp = _mm_cmpeq_epi16(x, z); \
-	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
-}
-
-#define assert_all_gt(x, y) { \
-	__m128i tmp = _mm_cmpgt_epu8(x, y); \
-	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
-}
-
-#define assert_all_gt_lo(x) { \
-	__m128i z = _mm_setzero_si128(); \
-	__m128i tmp = _mm_setzero_si128(); \
-	z = _mm_xor_si128(z, z); \
-	tmp = _mm_cmpgt_epu8(x, z); \
-	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
+	simde__m128i z = simde_mm_setzero_si128(); \
+	simde__m128i tmp = simde_mm_setzero_si128(); \
+	z = simde_mm_xor_si128(z, z); \
+	tmp = simde_mm_cmpeq_epi16(x, z); \
+	assert_eq(0xffff, simde_mm_movemask_epi8(tmp)); \
 }
 
 #define assert_all_lt(x, y) { \
-	__m128i z = _mm_setzero_si128(); \
-	z = _mm_xor_si128(z, z); \
-	__m128i tmp = _mm_subs_epu8(y, x); \
-	tmp = _mm_cmpeq_epi16(tmp, z); \
-	assert_eq(0x0000, _mm_movemask_epi8(tmp)); \
-}
-
-#define assert_all_lt_hi(x) { \
-	__m128i z = _mm_setzero_si128(); \
-	__m128i tmp = _mm_setzero_si128(); \
-	z = _mm_cmpeq_epu8(z, z); \
-	z = _mm_srli_epu8(z, 1); \
-	tmp = _mm_cmplt_epu8(x, z); \
-	assert_eq(0xffff, _mm_movemask_epi8(tmp)); \
+	simde__m128i z = simde_mm_setzero_si128(); \
+	z = simde_mm_xor_si128(z, z); \
+	simde__m128i tmp = simde_mm_subs_epu8(y, x); \
+	tmp = simde_mm_cmpeq_epi16(tmp, z); \
+	assert_eq(0x0000, simde_mm_movemask_epi8(tmp)); \
 }
 #endif
 
@@ -335,8 +310,8 @@
 	// we'll call "left" and "right".
 	d.vecbuf_.resize(ROWSTRIDE_2COL * iter * 2);
 	d.vecbuf_.zero();
-	__m128i *vbuf_l = d.vecbuf_.ptr();
-	__m128i *vbuf_r = d.vecbuf_.ptr() + (ROWSTRIDE_2COL * iter);
+	simde__m128i *vbuf_l = d.vecbuf_.ptr();
+	simde__m128i *vbuf_r = d.vecbuf_.ptr() + (ROWSTRIDE_2COL * iter);
 	
 	// This is the data structure that holds candidate cells per diagonal.
 	const size_t ndiags = rff_ - rfi_ + dpRows() - 1;
@@ -369,101 +344,101 @@
 	// Much of the implmentation below is adapted from Michael's code.
 
 	// Set all elts to reference gap open penalty
-	__m128i rfgapo   = _mm_setzero_si128();
-	__m128i rfgape   = _mm_setzero_si128();
-	__m128i rdgapo   = _mm_setzero_si128();
-	__m128i rdgape   = _mm_setzero_si128();
-	__m128i vlo      = _mm_setzero_si128();
-	__m128i vhi      = _mm_setzero_si128();
-	__m128i vmax     = _mm_setzero_si128();
-	__m128i vcolmax  = _mm_setzero_si128();
-	__m128i vmaxtmp  = _mm_setzero_si128();
-	__m128i ve       = _mm_setzero_si128();
-	__m128i vf       = _mm_setzero_si128();
-	__m128i vh       = _mm_setzero_si128();
-	__m128i vhd      = _mm_setzero_si128();
-	__m128i vhdtmp   = _mm_setzero_si128();
-	__m128i vtmp     = _mm_setzero_si128();
-	__m128i vzero    = _mm_setzero_si128();
-	__m128i vbias    = _mm_setzero_si128();
-	__m128i vbiasm1  = _mm_setzero_si128();
-	__m128i vminsc   = _mm_setzero_si128();
+	simde__m128i rfgapo   = simde_mm_setzero_si128();
+	simde__m128i rfgape   = simde_mm_setzero_si128();
+	simde__m128i rdgapo   = simde_mm_setzero_si128();
+	simde__m128i rdgape   = simde_mm_setzero_si128();
+	simde__m128i vlo      = simde_mm_setzero_si128();
+	simde__m128i vhi      = simde_mm_setzero_si128();
+	simde__m128i vmax     = simde_mm_setzero_si128();
+	simde__m128i vcolmax  = simde_mm_setzero_si128();
+	simde__m128i vmaxtmp  = simde_mm_setzero_si128();
+	simde__m128i ve       = simde_mm_setzero_si128();
+	simde__m128i vf       = simde_mm_setzero_si128();
+	simde__m128i vh       = simde_mm_setzero_si128();
+	simde__m128i vhd      = simde_mm_setzero_si128();
+	simde__m128i vhdtmp   = simde_mm_setzero_si128();
+	simde__m128i vtmp     = simde_mm_setzero_si128();
+	simde__m128i vzero    = simde_mm_setzero_si128();
+	simde__m128i vbias    = simde_mm_setzero_si128();
+	simde__m128i vbiasm1  = simde_mm_setzero_si128();
+	simde__m128i vminsc   = simde_mm_setzero_si128();
 
 	int dup;
 
 	assert_gt(sc_->refGapOpen(), 0);
 	assert_leq(sc_->refGapOpen(), MAX_U8);
 	dup = (sc_->refGapOpen() << 8) | (sc_->refGapOpen() & 0x00ff);
-	rfgapo = _mm_insert_epi16(rfgapo, dup, 0);
-	rfgapo = _mm_shufflelo_epi16(rfgapo, 0);
-	rfgapo = _mm_shuffle_epi32(rfgapo, 0);
+	rfgapo = simde_mm_insert_epi16(rfgapo, dup, 0);
+	rfgapo = simde_mm_shufflelo_epi16(rfgapo, 0);
+	rfgapo = simde_mm_shuffle_epi32(rfgapo, 0);
 	
 	// Set all elts to reference gap extension penalty
 	assert_gt(sc_->refGapExtend(), 0);
 	assert_leq(sc_->refGapExtend(), MAX_U8);
 	assert_leq(sc_->refGapExtend(), sc_->refGapOpen());
 	dup = (sc_->refGapExtend() << 8) | (sc_->refGapExtend() & 0x00ff);
-	rfgape = _mm_insert_epi16(rfgape, dup, 0);
-	rfgape = _mm_shufflelo_epi16(rfgape, 0);
-	rfgape = _mm_shuffle_epi32(rfgape, 0);
+	rfgape = simde_mm_insert_epi16(rfgape, dup, 0);
+	rfgape = simde_mm_shufflelo_epi16(rfgape, 0);
+	rfgape = simde_mm_shuffle_epi32(rfgape, 0);
 
 	// Set all elts to read gap open penalty
 	assert_gt(sc_->readGapOpen(), 0);
 	assert_leq(sc_->readGapOpen(), MAX_U8);
 	dup = (sc_->readGapOpen() << 8) | (sc_->readGapOpen() & 0x00ff);
-	rdgapo = _mm_insert_epi16(rdgapo, dup, 0);
-	rdgapo = _mm_shufflelo_epi16(rdgapo, 0);
-	rdgapo = _mm_shuffle_epi32(rdgapo, 0);
+	rdgapo = simde_mm_insert_epi16(rdgapo, dup, 0);
+	rdgapo = simde_mm_shufflelo_epi16(rdgapo, 0);
+	rdgapo = simde_mm_shuffle_epi32(rdgapo, 0);
 	
 	// Set all elts to read gap extension penalty
 	assert_gt(sc_->readGapExtend(), 0);
 	assert_leq(sc_->readGapExtend(), MAX_U8);
 	assert_leq(sc_->readGapExtend(), sc_->readGapOpen());
 	dup = (sc_->readGapExtend() << 8) | (sc_->readGapExtend() & 0x00ff);
-	rdgape = _mm_insert_epi16(rdgape, dup, 0);
-	rdgape = _mm_shufflelo_epi16(rdgape, 0);
-	rdgape = _mm_shuffle_epi32(rdgape, 0);
+	rdgape = simde_mm_insert_epi16(rdgape, dup, 0);
+	rdgape = simde_mm_shufflelo_epi16(rdgape, 0);
+	rdgape = simde_mm_shuffle_epi32(rdgape, 0);
 	
 	// Set all elts to minimum score threshold.  Actually, to 1 less than the
 	// threshold so we can use gt instead of geq.
 	dup = (((int)minsc_ - 1) << 8) | (((int)minsc_ - 1) & 0x00ff);
-	vminsc = _mm_insert_epi16(vminsc, dup, 0);
-	vminsc = _mm_shufflelo_epi16(vminsc, 0);
-	vminsc = _mm_shuffle_epi32(vminsc, 0);
+	vminsc = simde_mm_insert_epi16(vminsc, dup, 0);
+	vminsc = simde_mm_shufflelo_epi16(vminsc, 0);
+	vminsc = simde_mm_shuffle_epi32(vminsc, 0);
 
 	dup = ((d.bias_ - 1) << 8) | ((d.bias_ - 1) & 0x00ff);
-	vbiasm1 = _mm_insert_epi16(vbiasm1, dup, 0);
-	vbiasm1 = _mm_shufflelo_epi16(vbiasm1, 0);
-	vbiasm1 = _mm_shuffle_epi32(vbiasm1, 0);
-	vhi = _mm_cmpeq_epi16(vhi, vhi); // all elts = 0xffff
-	vlo = _mm_xor_si128(vlo, vlo);   // all elts = 0
+	vbiasm1 = simde_mm_insert_epi16(vbiasm1, dup, 0);
+	vbiasm1 = simde_mm_shufflelo_epi16(vbiasm1, 0);
+	vbiasm1 = simde_mm_shuffle_epi32(vbiasm1, 0);
+	vhi = simde_mm_cmpeq_epi16(vhi, vhi); // all elts = 0xffff
+	vlo = simde_mm_xor_si128(vlo, vlo);   // all elts = 0
 	vmax = vlo;
 	
 	// Make a vector of bias offsets
 	dup = (d.bias_ << 8) | (d.bias_ & 0x00ff);
-	vbias = _mm_insert_epi16(vbias, dup, 0);
-	vbias = _mm_shufflelo_epi16(vbias, 0);
-	vbias = _mm_shuffle_epi32(vbias, 0);
+	vbias = simde_mm_insert_epi16(vbias, dup, 0);
+	vbias = simde_mm_shufflelo_epi16(vbias, 0);
+	vbias = simde_mm_shuffle_epi32(vbias, 0);
 	
-	// Points to a long vector of __m128i where each element is a block of
+	// Points to a long vector of simde__m128i where each element is a block of
 	// contiguous cells in the E, F or H matrix.  If the index % 3 == 0, then
 	// the block of cells is from the E matrix.  If index % 3 == 1, they're
 	// from the F matrix.  If index % 3 == 2, then they're from the H matrix.
 	// Blocks of cells are organized in the same interleaved manner as they are
 	// calculated by the Farrar algorithm.
-	const __m128i *pvScore; // points into the query profile
+	const simde__m128i *pvScore; // points into the query profile
 
 	const size_t colstride = ROWSTRIDE_2COL * iter;
 	
 	// Initialize the H and E vectors in the first matrix column
-	__m128i *pvELeft = vbuf_l + 0; __m128i *pvERight = vbuf_r + 0;
-	/* __m128i *pvFLeft = vbuf_l + 1; */ __m128i *pvFRight = vbuf_r + 1;
-	__m128i *pvHLeft = vbuf_l + 2; __m128i *pvHRight = vbuf_r + 2;
+	simde__m128i *pvELeft = vbuf_l + 0; simde__m128i *pvERight = vbuf_r + 0;
+	/* simde__m128i *pvFLeft = vbuf_l + 1; */ simde__m128i *pvFRight = vbuf_r + 1;
+	simde__m128i *pvHLeft = vbuf_l + 2; simde__m128i *pvHRight = vbuf_r + 2;
 	
 	for(size_t i = 0; i < iter; i++) {
 		// start low in local mode
-		_mm_store_si128(pvERight, vlo); pvERight += ROWSTRIDE_2COL;
-		_mm_store_si128(pvHRight, vlo); pvHRight += ROWSTRIDE_2COL;
+		simde_mm_store_si128(pvERight, vlo); pvERight += ROWSTRIDE_2COL;
+		simde_mm_store_si128(pvHRight, vlo); pvHRight += ROWSTRIDE_2COL;
 	}
 	
 	assert_gt(sc_->gapbar, 0);
@@ -507,48 +482,48 @@
 		// current iter's?  The way we currently do it, seems like it will
 		// almost always require at least one fixup loop iter (to recalculate
 		// this topmost F).
-		vh = _mm_load_si128(pvHLeft + colstride - ROWSTRIDE_2COL);
+		vh = simde_mm_load_si128(pvHLeft + colstride - ROWSTRIDE_2COL);
 		
 		// Set all cells to low value
-		vf = _mm_xor_si128(vf, vf);
+		vf = simde_mm_xor_si128(vf, vf);
 		// vf now contains the vertical contribution
 
 		// Store cells in F, calculated previously
 		// No need to veto ref gap extensions, they're all 0x00s
-		_mm_store_si128(pvFRight, vf);
+		simde_mm_store_si128(pvFRight, vf);
 		pvFRight += ROWSTRIDE_2COL;
 		
 		// Shift down so that topmost (least sig) cell gets 0
-		vh = _mm_slli_si128(vh, NBYTES_PER_WORD);
+		vh = simde_mm_slli_si128(vh, NBYTES_PER_WORD);
 		
 		// We pull out one loop iteration to make it easier to veto values in the top row
 		
 		// Load cells from E, calculated previously
-		ve = _mm_load_si128(pvELeft);
-		vhd = _mm_load_si128(pvHLeft);
+		ve = simde_mm_load_si128(pvELeft);
+		vhd = simde_mm_load_si128(pvHLeft);
 		assert_all_lt(ve, vhi);
 		pvELeft += ROWSTRIDE_2COL;
 		// ve now contains the horizontal contribution
 		
 		// Factor in query profile (matches and mismatches)
-		vh = _mm_adds_epu8(vh, pvScore[0]);
-		vh = _mm_subs_epu8(vh, vbias);
+		vh = simde_mm_adds_epu8(vh, pvScore[0]);
+		vh = simde_mm_subs_epu8(vh, vbias);
 		// vh now contains the diagonal contribution
 
 		vhdtmp = vhd;
-		vhd = _mm_subs_epu8(vhd, rdgapo);
-		vhd = _mm_subs_epu8(vhd, pvScore[1]); // veto some read gap opens
-		ve = _mm_subs_epu8(ve, rdgape);
-		ve = _mm_max_epu8(ve, vhd);
+		vhd = simde_mm_subs_epu8(vhd, rdgapo);
+		vhd = simde_mm_subs_epu8(vhd, pvScore[1]); // veto some read gap opens
+		ve = simde_mm_subs_epu8(ve, rdgape);
+		ve = simde_mm_max_epu8(ve, vhd);
 
-		vh = _mm_max_epu8(vh, ve);
+		vh = simde_mm_max_epu8(vh, ve);
 		vf = vh;
 
 		// Update highest score so far
 		vcolmax = vh;
 		
 		// Save the new vH values
-		_mm_store_si128(pvHRight, vh);
+		simde_mm_store_si128(pvHRight, vh);
 
 		vh = vhdtmp;
 		assert_all_lt(ve, vhi);
@@ -556,11 +531,11 @@
 		pvHLeft += ROWSTRIDE_2COL;
 		
 		// Save E values
-		_mm_store_si128(pvERight, ve);
+		simde_mm_store_si128(pvERight, ve);
 		pvERight += ROWSTRIDE_2COL;
 		
 		// Update vf value
-		vf = _mm_subs_epu8(vf, rfgapo);
+		vf = simde_mm_subs_epu8(vf, rfgapo);
 		assert_all_lt(vf, vhi);
 		
 		pvScore += 2; // move on to next query profile
@@ -569,37 +544,37 @@
 		size_t j;
 		for(j = 1; j < iter; j++) {
 			// Load cells from E, calculated previously
-			ve = _mm_load_si128(pvELeft);
-			vhd = _mm_load_si128(pvHLeft);
+			ve = simde_mm_load_si128(pvELeft);
+			vhd = simde_mm_load_si128(pvHLeft);
 			assert_all_lt(ve, vhi);
 			pvELeft += ROWSTRIDE_2COL;
 			
 			// Store cells in F, calculated previously
-			vf = _mm_subs_epu8(vf, pvScore[1]); // veto some ref gap extensions
-			_mm_store_si128(pvFRight, vf);
+			vf = simde_mm_subs_epu8(vf, pvScore[1]); // veto some ref gap extensions
+			simde_mm_store_si128(pvFRight, vf);
 			pvFRight += ROWSTRIDE_2COL;
 			
 			// Factor in query profile (matches and mismatches)
-			vh = _mm_adds_epu8(vh, pvScore[0]);
-			vh = _mm_subs_epu8(vh, vbias);
+			vh = simde_mm_adds_epu8(vh, pvScore[0]);
+			vh = simde_mm_subs_epu8(vh, vbias);
 			
 			// Update H, factoring in E and F
-			vh = _mm_max_epu8(vh, vf);
+			vh = simde_mm_max_epu8(vh, vf);
 
 			vhdtmp = vhd;
-			vhd = _mm_subs_epu8(vhd, rdgapo);
-			vhd = _mm_subs_epu8(vhd, pvScore[1]); // veto some read gap opens
-			ve = _mm_subs_epu8(ve, rdgape);
-			ve = _mm_max_epu8(ve, vhd);
+			vhd = simde_mm_subs_epu8(vhd, rdgapo);
+			vhd = simde_mm_subs_epu8(vhd, pvScore[1]); // veto some read gap opens
+			ve = simde_mm_subs_epu8(ve, rdgape);
+			ve = simde_mm_max_epu8(ve, vhd);
 			
-			vh = _mm_max_epu8(vh, ve);
+			vh = simde_mm_max_epu8(vh, ve);
 			vtmp = vh;
 			
 			// Update highest score encountered this far
-			vcolmax = _mm_max_epu8(vcolmax, vh);
+			vcolmax = simde_mm_max_epu8(vcolmax, vh);
 			
 			// Save the new vH values
-			_mm_store_si128(pvHRight, vh);
+			simde_mm_store_si128(pvHRight, vh);
 
 			vh = vhdtmp;
 
@@ -608,79 +583,79 @@
 			pvHLeft += ROWSTRIDE_2COL;
 			
 			// Save E values
-			_mm_store_si128(pvERight, ve);
+			simde_mm_store_si128(pvERight, ve);
 			pvERight += ROWSTRIDE_2COL;
 			
 			// Update vf value
-			vtmp = _mm_subs_epu8(vtmp, rfgapo);
-			vf = _mm_subs_epu8(vf, rfgape);
+			vtmp = simde_mm_subs_epu8(vtmp, rfgapo);
+			vf = simde_mm_subs_epu8(vf, rfgape);
 			assert_all_lt(vf, vhi);
-			vf = _mm_max_epu8(vf, vtmp);
+			vf = simde_mm_max_epu8(vf, vtmp);
 			
 			pvScore += 2; // move on to next query profile / gap veto
 		}
 		// pvHStore, pvELoad, pvEStore have all rolled over to the next column
 		pvFRight -= colstride; // reset to start of column
-		vtmp = _mm_load_si128(pvFRight);
+		vtmp = simde_mm_load_si128(pvFRight);
 		
 		pvHRight -= colstride; // reset to start of column
-		vh = _mm_load_si128(pvHRight);
+		vh = simde_mm_load_si128(pvHRight);
 		
 		pvScore = d.profbuf_.ptr() + off + 1; // reset veto vector
 		
 		// vf from last row gets shifted down by one to overlay the first row
 		// rfgape has already been subtracted from it.
-		vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
+		vf = simde_mm_slli_si128(vf, NBYTES_PER_WORD);
 		
-		vf = _mm_subs_epu8(vf, *pvScore); // veto some ref gap extensions
-		vf = _mm_max_epu8(vtmp, vf);
+		vf = simde_mm_subs_epu8(vf, *pvScore); // veto some ref gap extensions
+		vf = simde_mm_max_epu8(vtmp, vf);
 		// TODO: We're testing whether F changed.  Can't we just assume that F
 		// did change and instead check whether H changed?  Might save us from
 		// entering the fixup loop.
-		vtmp = _mm_subs_epu8(vf, vtmp);
-		vtmp = _mm_cmpeq_epi8(vtmp, vzero);
-		int cmp = _mm_movemask_epi8(vtmp);
+		vtmp = simde_mm_subs_epu8(vf, vtmp);
+		vtmp = simde_mm_cmpeq_epi8(vtmp, vzero);
+		int cmp = simde_mm_movemask_epi8(vtmp);
 		
 		// If any element of vtmp is greater than H - gap-open...
 		j = 0;
 		while(cmp != 0xffff) {
 			// Store this vf
-			_mm_store_si128(pvFRight, vf);
+			simde_mm_store_si128(pvFRight, vf);
 			pvFRight += ROWSTRIDE_2COL;
 			
 			// Update vh w/r/t new vf
-			vh = _mm_max_epu8(vh, vf);
+			vh = simde_mm_max_epu8(vh, vf);
 			
 			// Save vH values
-			_mm_store_si128(pvHRight, vh);
+			simde_mm_store_si128(pvHRight, vh);
 			pvHRight += ROWSTRIDE_2COL;
 			
 			// Update highest score encountered so far.
-			vcolmax = _mm_max_epu8(vcolmax, vh);
+			vcolmax = simde_mm_max_epu8(vcolmax, vh);
 
 			pvScore += 2;
 			
 			assert_lt(j, iter);
 			if(++j == iter) {
 				pvFRight -= colstride;
-				vtmp = _mm_load_si128(pvFRight);   // load next vf ASAP
+				vtmp = simde_mm_load_si128(pvFRight);   // load next vf ASAP
 				pvHRight -= colstride;
-				vh = _mm_load_si128(pvHRight);     // load next vh ASAP
+				vh = simde_mm_load_si128(pvHRight);     // load next vh ASAP
 				pvScore = d.profbuf_.ptr() + off + 1;
 				j = 0;
-				vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
+				vf = simde_mm_slli_si128(vf, NBYTES_PER_WORD);
 			} else {
-				vtmp = _mm_load_si128(pvFRight);   // load next vf ASAP
-				vh = _mm_load_si128(pvHRight);     // load next vh ASAP
+				vtmp = simde_mm_load_si128(pvFRight);   // load next vf ASAP
+				vh = simde_mm_load_si128(pvHRight);     // load next vh ASAP
 			}
 			
 			// Update F with another gap extension
-			vf = _mm_subs_epu8(vf, rfgape);
-			vf = _mm_subs_epu8(vf, *pvScore); // veto some ref gap extensions
-			vf = _mm_max_epu8(vtmp, vf);
-			vtmp = _mm_subs_epu8(vf, vtmp);
-			vtmp = _mm_cmpeq_epi8(vtmp, vzero);
-			cmp = _mm_movemask_epi8(vtmp);
+			vf = simde_mm_subs_epu8(vf, rfgape);
+			vf = simde_mm_subs_epu8(vf, *pvScore); // veto some ref gap extensions
+			vf = simde_mm_max_epu8(vtmp, vf);
+			vtmp = simde_mm_subs_epu8(vf, vtmp);
+			vtmp = simde_mm_cmpeq_epi8(vtmp, vzero);
+			cmp = simde_mm_movemask_epi8(vtmp);
 			nfixup++;
 		}
 
@@ -696,9 +671,9 @@
 			assert_lt(lastoff, MAX_SIZE_T);
 			pvScore = d.profbuf_.ptr() + lastoff; // even elts = query profile, odd = gap barrier
 			for(size_t k = 0; k < iter; k++) {
-				vh = _mm_load_si128(pvHLeft);
-				vtmp = _mm_cmpgt_epi8(pvScore[0], vbiasm1);
-				int cmp = _mm_movemask_epi8(vtmp);
+				vh = simde_mm_load_si128(pvHLeft);
+				vtmp = simde_mm_cmpgt_epi8(pvScore[0], vbiasm1);
+				int cmp = simde_mm_movemask_epi8(vtmp);
 				if(cmp != 0xffff) {
 					// At least one candidate in this mask.  Now iterate
 					// through vm/vh to evaluate individual cells.
@@ -726,9 +701,9 @@
 		// Save some elements to checkpoints
 		if(checkpoint) {
 			
-			__m128i *pvE = vbuf_r + 0;
-			__m128i *pvF = vbuf_r + 1;
-			__m128i *pvH = vbuf_r + 2;
+			simde__m128i *pvE = vbuf_r + 0;
+			simde__m128i *pvF = vbuf_r + 1;
+			simde__m128i *pvH = vbuf_r + 2;
 			size_t coli = i - rfi_;
 			if(coli < cper_.locol_) cper_.locol_ = coli;
 			if(coli > cper_.hicol_) cper_.hicol_ = coli;
@@ -824,34 +799,34 @@
 					assert_gt(coli, 0);
 					size_t wordspercol = cper_.niter_ * ROWSTRIDE_2COL;
 					size_t coloff = (coli >> cper_.perpow2_) * wordspercol;
-					__m128i *dst = cper_.qcols_.ptr() + coloff;
-					memcpy(dst, vbuf_r, sizeof(__m128i) * wordspercol);
+					simde__m128i *dst = cper_.qcols_.ptr() + coloff;
+					memcpy(dst, vbuf_r, sizeof(simde__m128i) * wordspercol);
 				}
 			}
 			if(cper_.debug_) {
 				// Save the column using memcpys
 				size_t wordspercol = cper_.niter_ * ROWSTRIDE_2COL;
 				size_t coloff = coli * wordspercol;
-				__m128i *dst = cper_.qcolsD_.ptr() + coloff;
-				memcpy(dst, vbuf_r, sizeof(__m128i) * wordspercol);
+				simde__m128i *dst = cper_.qcolsD_.ptr() + coloff;
+				memcpy(dst, vbuf_r, sizeof(simde__m128i) * wordspercol);
 			}
 		}
 
 		// Store column maximum vector in first element of tmp
-		vmax = _mm_max_epu8(vmax, vcolmax);
+		vmax = simde_mm_max_epu8(vmax, vcolmax);
 
 		{
 			// Get single largest score in this column
 			vmaxtmp = vcolmax;
-			vtmp = _mm_srli_si128(vmaxtmp, 8);
-			vmaxtmp = _mm_max_epu8(vmaxtmp, vtmp);
-			vtmp = _mm_srli_si128(vmaxtmp, 4);
-			vmaxtmp = _mm_max_epu8(vmaxtmp, vtmp);
-			vtmp = _mm_srli_si128(vmaxtmp, 2);
-			vmaxtmp = _mm_max_epu8(vmaxtmp, vtmp);
-			vtmp = _mm_srli_si128(vmaxtmp, 1);
-			vmaxtmp = _mm_max_epu8(vmaxtmp, vtmp);
-			int score = _mm_extract_epi16(vmaxtmp, 0);
+			vtmp = simde_mm_srli_si128(vmaxtmp, 8);
+			vmaxtmp = simde_mm_max_epu8(vmaxtmp, vtmp);
+			vtmp = simde_mm_srli_si128(vmaxtmp, 4);
+			vmaxtmp = simde_mm_max_epu8(vmaxtmp, vtmp);
+			vtmp = simde_mm_srli_si128(vmaxtmp, 2);
+			vmaxtmp = simde_mm_max_epu8(vmaxtmp, vtmp);
+			vtmp = simde_mm_srli_si128(vmaxtmp, 1);
+			vmaxtmp = simde_mm_max_epu8(vmaxtmp, vtmp);
+			int score = simde_mm_extract_epi16(vmaxtmp, 0);
 			score = score & 0x00ff;
 
 			// Could we have saturated?
@@ -888,9 +863,9 @@
 		assert_lt(lastoff, MAX_SIZE_T);
 		pvScore = d.profbuf_.ptr() + lastoff; // even elts = query profile, odd = gap barrier
 		for(size_t k = 0; k < iter; k++) {
-			vh = _mm_load_si128(pvHLeft);
-			vtmp = _mm_cmpgt_epi8(pvScore[0], vbiasm1);
-			int cmp = _mm_movemask_epi8(vtmp);
+			vh = simde_mm_load_si128(pvHLeft);
+			vtmp = simde_mm_cmpgt_epi8(pvScore[0], vbiasm1);
+			int cmp = simde_mm_movemask_epi8(vtmp);
 			if(cmp != 0xffff) {
 				// At least one candidate in this mask.  Now iterate
 				// through vm/vh to evaluate individual cells.
@@ -916,14 +891,14 @@
 	}
 
 	// Find largest score in vmax
-	vtmp = _mm_srli_si128(vmax, 8);
-	vmax = _mm_max_epu8(vmax, vtmp);
-	vtmp = _mm_srli_si128(vmax, 4);
-	vmax = _mm_max_epu8(vmax, vtmp);
-	vtmp = _mm_srli_si128(vmax, 2);
-	vmax = _mm_max_epu8(vmax, vtmp);
-	vtmp = _mm_srli_si128(vmax, 1);
-	vmax = _mm_max_epu8(vmax, vtmp);
+	vtmp = simde_mm_srli_si128(vmax, 8);
+	vmax = simde_mm_max_epu8(vmax, vtmp);
+	vtmp = simde_mm_srli_si128(vmax, 4);
+	vmax = simde_mm_max_epu8(vmax, vtmp);
+	vtmp = simde_mm_srli_si128(vmax, 2);
+	vmax = simde_mm_max_epu8(vmax, vtmp);
+	vtmp = simde_mm_srli_si128(vmax, 1);
+	vmax = simde_mm_max_epu8(vmax, vtmp);
 	
 	// Update metrics
 	if(!debug) {
@@ -934,7 +909,7 @@
 		met.fixup += nfixup;                    // DP fixup loop iters
 	}
 	
-	int score = _mm_extract_epi16(vmax, 0);
+	int score = simde_mm_extract_epi16(vmax, 0);
 	score = score & 0x00ff;
 
 	flag = 0;
@@ -1005,72 +980,72 @@
 	// Much of the implmentation below is adapted from Michael's code.
 
 	// Set all elts to reference gap open penalty
-	__m128i rfgapo   = _mm_setzero_si128();
-	__m128i rfgape   = _mm_setzero_si128();
-	__m128i rdgapo   = _mm_setzero_si128();
-	__m128i rdgape   = _mm_setzero_si128();
-	__m128i vlo      = _mm_setzero_si128();
-	__m128i vhi      = _mm_setzero_si128();
-	__m128i vmax     = _mm_setzero_si128();
-	__m128i vcolmax  = _mm_setzero_si128();
-	__m128i vmaxtmp  = _mm_setzero_si128();
-	__m128i ve       = _mm_setzero_si128();
-	__m128i vf       = _mm_setzero_si128();
-	__m128i vh       = _mm_setzero_si128();
-	__m128i vtmp     = _mm_setzero_si128();
-	__m128i vzero    = _mm_setzero_si128();
-	__m128i vbias    = _mm_setzero_si128();
+	simde__m128i rfgapo   = simde_mm_setzero_si128();
+	simde__m128i rfgape   = simde_mm_setzero_si128();
+	simde__m128i rdgapo   = simde_mm_setzero_si128();
+	simde__m128i rdgape   = simde_mm_setzero_si128();
+	simde__m128i vlo      = simde_mm_setzero_si128();
+	simde__m128i vhi      = simde_mm_setzero_si128();
+	simde__m128i vmax     = simde_mm_setzero_si128();
+	simde__m128i vcolmax  = simde_mm_setzero_si128();
+	simde__m128i vmaxtmp  = simde_mm_setzero_si128();
+	simde__m128i ve       = simde_mm_setzero_si128();
+	simde__m128i vf       = simde_mm_setzero_si128();
+	simde__m128i vh       = simde_mm_setzero_si128();
+	simde__m128i vtmp     = simde_mm_setzero_si128();
+	simde__m128i vzero    = simde_mm_setzero_si128();
+	simde__m128i vbias    = simde_mm_setzero_si128();
 
 	assert_gt(sc_->refGapOpen(), 0);
 	assert_leq(sc_->refGapOpen(), MAX_U8);
 	dup = (sc_->refGapOpen() << 8) | (sc_->refGapOpen() & 0x00ff);
-	rfgapo = _mm_insert_epi16(rfgapo, dup, 0);
-	rfgapo = _mm_shufflelo_epi16(rfgapo, 0);
-	rfgapo = _mm_shuffle_epi32(rfgapo, 0);
+	rfgapo = simde_mm_insert_epi16(rfgapo, dup, 0);
+	rfgapo = simde_mm_shufflelo_epi16(rfgapo, 0);
+	rfgapo = simde_mm_shuffle_epi32(rfgapo, 0);
 	
 	// Set all elts to reference gap extension penalty
 	assert_gt(sc_->refGapExtend(), 0);
 	assert_leq(sc_->refGapExtend(), MAX_U8);
 	assert_leq(sc_->refGapExtend(), sc_->refGapOpen());
 	dup = (sc_->refGapExtend() << 8) | (sc_->refGapExtend() & 0x00ff);
-	rfgape = _mm_insert_epi16(rfgape, dup, 0);
-	rfgape = _mm_shufflelo_epi16(rfgape, 0);
-	rfgape = _mm_shuffle_epi32(rfgape, 0);
+	rfgape = simde_mm_insert_epi16(rfgape, dup, 0);
+	rfgape = simde_mm_shufflelo_epi16(rfgape, 0);
+	rfgape = simde_mm_shuffle_epi32(rfgape, 0);
 
 	// Set all elts to read gap open penalty
 	assert_gt(sc_->readGapOpen(), 0);
 	assert_leq(sc_->readGapOpen(), MAX_U8);
 	dup = (sc_->readGapOpen() << 8) | (sc_->readGapOpen() & 0x00ff);
-	rdgapo = _mm_insert_epi16(rdgapo, dup, 0);
-	rdgapo = _mm_shufflelo_epi16(rdgapo, 0);
-	rdgapo = _mm_shuffle_epi32(rdgapo, 0);
+	rdgapo = simde_mm_insert_epi16(rdgapo, dup, 0);
+	rdgapo = simde_mm_shufflelo_epi16(rdgapo, 0);
+	rdgapo = simde_mm_shuffle_epi32(rdgapo, 0);
 	
 	// Set all elts to read gap extension penalty
 	assert_gt(sc_->readGapExtend(), 0);
 	assert_leq(sc_->readGapExtend(), MAX_U8);
 	assert_leq(sc_->readGapExtend(), sc_->readGapOpen());
 	dup = (sc_->readGapExtend() << 8) | (sc_->readGapExtend() & 0x00ff);
-	rdgape = _mm_insert_epi16(rdgape, dup, 0);
-	rdgape = _mm_shufflelo_epi16(rdgape, 0);
-	rdgape = _mm_shuffle_epi32(rdgape, 0);
+	rdgape = simde_mm_insert_epi16(rdgape, dup, 0);
+	rdgape = simde_mm_shufflelo_epi16(rdgape, 0);
+	rdgape = simde_mm_shuffle_epi32(rdgape, 0);
 	
-	vhi = _mm_cmpeq_epi16(vhi, vhi); // all elts = 0xffff
-	vlo = _mm_xor_si128(vlo, vlo);   // all elts = 0
+	vhi = simde_mm_cmpeq_epi16(vhi, vhi); // all elts = 0xffff
+	vlo = simde_mm_xor_si128(vlo, vlo);   // all elts = 0
 	vmax = vlo;
 	
 	// Make a vector of bias offsets
 	dup = (d.bias_ << 8) | (d.bias_ & 0x00ff);
-	vbias = _mm_insert_epi16(vbias, dup, 0);
-	vbias = _mm_shufflelo_epi16(vbias, 0);
-	vbias = _mm_shuffle_epi32(vbias, 0);
+	vbias = simde_mm_insert_epi16(vbias, dup, 0);
+	vbias = simde_mm_shufflelo_epi16(vbias, 0);
+	vbias = simde_mm_shuffle_epi32(vbias, 0);
 	
-	// Points to a long vector of __m128i where each element is a block of
+	// Points to a long vector of simde__m128i where each element is a block of
 	// contiguous cells in the E, F or H matrix.  If the index % 3 == 0, then
 	// the block of cells is from the E matrix.  If index % 3 == 1, they're
 	// from the F matrix.  If index % 3 == 2, then they're from the H matrix.
 	// Blocks of cells are organized in the same interleaved manner as they are
 	// calculated by the Farrar algorithm.
-	const __m128i *pvScore; // points into the query profile
+	const simde__m128i *pvScore; // points into the query profile
 
 	d.mat_.init(dpRows(), rff_ - rfi_, NWORDS_PER_REG);
 	const size_t colstride = d.mat_.colstride();
@@ -1078,22 +1053,22 @@
 	assert_eq(ROWSTRIDE, colstride / iter);
 	
 	// Initialize the H and E vectors in the first matrix column
-	__m128i *pvHTmp = d.mat_.tmpvec(0, 0);
-	__m128i *pvETmp = d.mat_.evec(0, 0);
+	simde__m128i *pvHTmp = d.mat_.tmpvec(0, 0);
+	simde__m128i *pvETmp = d.mat_.evec(0, 0);
 	
 	for(size_t i = 0; i < iter; i++) {
-		_mm_store_si128(pvETmp, vlo);
-		_mm_store_si128(pvHTmp, vlo); // start low in local mode
+		simde_mm_store_si128(pvETmp, vlo);
+		simde_mm_store_si128(pvHTmp, vlo); // start low in local mode
 		pvETmp += ROWSTRIDE;
 		pvHTmp += ROWSTRIDE;
 	}
 	// These are swapped just before the innermost loop
-	__m128i *pvHStore = d.mat_.hvec(0, 0);
-	__m128i *pvHLoad  = d.mat_.tmpvec(0, 0);
-	__m128i *pvELoad  = d.mat_.evec(0, 0);
-	__m128i *pvEStore = d.mat_.evecUnsafe(0, 1);
-	__m128i *pvFStore = d.mat_.fvec(0, 0);
-	__m128i *pvFTmp   = NULL;
+	simde__m128i *pvHStore = d.mat_.hvec(0, 0);
+	simde__m128i *pvHLoad  = d.mat_.tmpvec(0, 0);
+	simde__m128i *pvELoad  = d.mat_.evec(0, 0);
+	simde__m128i *pvEStore = d.mat_.evecUnsafe(0, 1);
+	simde__m128i *pvFStore = d.mat_.fvec(0, 0);
+	simde__m128i *pvFTmp   = NULL;
 	
 	assert_gt(sc_->gapbar, 0);
 	size_t nfixup = 0;
@@ -1125,60 +1100,60 @@
 		pvScore = d.profbuf_.ptr() + off; // even elts = query profile, odd = gap barrier
 		
 		// Load H vector from the final row of the previous column
-		vh = _mm_load_si128(pvHLoad + colstride - ROWSTRIDE);
+		vh = simde_mm_load_si128(pvHLoad + colstride - ROWSTRIDE);
 		
 		// Set all cells to low value
-		vf = _mm_xor_si128(vf, vf);
+		vf = simde_mm_xor_si128(vf, vf);
 		
 		// Store cells in F, calculated previously
 		// No need to veto ref gap extensions, they're all 0x00s
-		_mm_store_si128(pvFStore, vf);
+		simde_mm_store_si128(pvFStore, vf);
 		pvFStore += ROWSTRIDE;
 		
 		// Shift down so that topmost (least sig) cell gets 0
-		vh = _mm_slli_si128(vh, NBYTES_PER_WORD);
+		vh = simde_mm_slli_si128(vh, NBYTES_PER_WORD);
 		
 		// We pull out one loop iteration to make it easier to veto values in the top row
 		
 		// Load cells from E, calculated previously
-		ve = _mm_load_si128(pvELoad);
+		ve = simde_mm_load_si128(pvELoad);
 		assert_all_lt(ve, vhi);
 		pvELoad += ROWSTRIDE;
 		
 		// Factor in query profile (matches and mismatches)
-		vh = _mm_adds_epu8(vh, pvScore[0]);
-		vh = _mm_subs_epu8(vh, vbias);
+		vh = simde_mm_adds_epu8(vh, pvScore[0]);
+		vh = simde_mm_subs_epu8(vh, vbias);
 		
 		// Update H, factoring in E and F
-		vh = _mm_max_epu8(vh, ve);
-		vh = _mm_max_epu8(vh, vf);
+		vh = simde_mm_max_epu8(vh, ve);
+		vh = simde_mm_max_epu8(vh, vf);
 		
 		// Update highest score so far
-		vcolmax = _mm_xor_si128(vcolmax, vcolmax);
-		vcolmax = _mm_max_epu8(vcolmax, vh);
+		vcolmax = simde_mm_xor_si128(vcolmax, vcolmax);
+		vcolmax = simde_mm_max_epu8(vcolmax, vh);
 		
 		// Save the new vH values
-		_mm_store_si128(pvHStore, vh);
+		simde_mm_store_si128(pvHStore, vh);
 		pvHStore += ROWSTRIDE;
 		
 		// Update vE value
 		vf = vh;
-		vh = _mm_subs_epu8(vh, rdgapo);
-		vh = _mm_subs_epu8(vh, pvScore[1]); // veto some read gap opens
-		ve = _mm_subs_epu8(ve, rdgape);
-		ve = _mm_max_epu8(ve, vh);
+		vh = simde_mm_subs_epu8(vh, rdgapo);
+		vh = simde_mm_subs_epu8(vh, pvScore[1]); // veto some read gap opens
+		ve = simde_mm_subs_epu8(ve, rdgape);
+		ve = simde_mm_max_epu8(ve, vh);
 		assert_all_lt(ve, vhi);
 		
 		// Load the next h value
-		vh = _mm_load_si128(pvHLoad);
+		vh = simde_mm_load_si128(pvHLoad);
 		pvHLoad += ROWSTRIDE;
 		
 		// Save E values
-		_mm_store_si128(pvEStore, ve);
+		simde_mm_store_si128(pvEStore, ve);
 		pvEStore += ROWSTRIDE;
 		
 		// Update vf value
-		vf = _mm_subs_epu8(vf, rfgapo);
+		vf = simde_mm_subs_epu8(vf, rfgapo);
 		assert_all_lt(vf, vhi);
 		
 		pvScore += 2; // move on to next query profile
@@ -1187,127 +1162,127 @@
 		size_t j;
 		for(j = 1; j < iter; j++) {
 			// Load cells from E, calculated previously
-			ve = _mm_load_si128(pvELoad);
+			ve = simde_mm_load_si128(pvELoad);
 			assert_all_lt(ve, vhi);
 			pvELoad += ROWSTRIDE;
 			
 			// Store cells in F, calculated previously
-			vf = _mm_subs_epu8(vf, pvScore[1]); // veto some ref gap extensions
-			_mm_store_si128(pvFStore, vf);
+			vf = simde_mm_subs_epu8(vf, pvScore[1]); // veto some ref gap extensions
+			simde_mm_store_si128(pvFStore, vf);
 			pvFStore += ROWSTRIDE;
 			
 			// Factor in query profile (matches and mismatches)
-			vh = _mm_adds_epu8(vh, pvScore[0]);
-			vh = _mm_subs_epu8(vh, vbias);
+			vh = simde_mm_adds_epu8(vh, pvScore[0]);
+			vh = simde_mm_subs_epu8(vh, vbias);
 			
 			// Update H, factoring in E and F
-			vh = _mm_max_epu8(vh, ve);
-			vh = _mm_max_epu8(vh, vf);
+			vh = simde_mm_max_epu8(vh, ve);
+			vh = simde_mm_max_epu8(vh, vf);
 			
 			// Update highest score encountered this far
-			vcolmax = _mm_max_epu8(vcolmax, vh);
+			vcolmax = simde_mm_max_epu8(vcolmax, vh);
 			
 			// Save the new vH values
-			_mm_store_si128(pvHStore, vh);
+			simde_mm_store_si128(pvHStore, vh);
 			pvHStore += ROWSTRIDE;
 			
 			// Update vE value
 			vtmp = vh;
-			vh = _mm_subs_epu8(vh, rdgapo);
-			vh = _mm_subs_epu8(vh, pvScore[1]); // veto some read gap opens
-			ve = _mm_subs_epu8(ve, rdgape);
-			ve = _mm_max_epu8(ve, vh);
+			vh = simde_mm_subs_epu8(vh, rdgapo);
+			vh = simde_mm_subs_epu8(vh, pvScore[1]); // veto some read gap opens
+			ve = simde_mm_subs_epu8(ve, rdgape);
+			ve = simde_mm_max_epu8(ve, vh);
 			assert_all_lt(ve, vhi);
 			
 			// Load the next h value
-			vh = _mm_load_si128(pvHLoad);
+			vh = simde_mm_load_si128(pvHLoad);
 			pvHLoad += ROWSTRIDE;
 			
 			// Save E values
-			_mm_store_si128(pvEStore, ve);
+			simde_mm_store_si128(pvEStore, ve);
 			pvEStore += ROWSTRIDE;
 			
 			// Update vf value
-			vtmp = _mm_subs_epu8(vtmp, rfgapo);
-			vf = _mm_subs_epu8(vf, rfgape);
+			vtmp = simde_mm_subs_epu8(vtmp, rfgapo);
+			vf = simde_mm_subs_epu8(vf, rfgape);
 			assert_all_lt(vf, vhi);
-			vf = _mm_max_epu8(vf, vtmp);
+			vf = simde_mm_max_epu8(vf, vtmp);
 			
 			pvScore += 2; // move on to next query profile / gap veto
 		}
 		// pvHStore, pvELoad, pvEStore have all rolled over to the next column
 		pvFTmp = pvFStore;
 		pvFStore -= colstride; // reset to start of column
-		vtmp = _mm_load_si128(pvFStore);
+		vtmp = simde_mm_load_si128(pvFStore);
 		
 		pvHStore -= colstride; // reset to start of column
-		vh = _mm_load_si128(pvHStore);
+		vh = simde_mm_load_si128(pvHStore);
 		
 		pvEStore -= colstride; // reset to start of column
-		ve = _mm_load_si128(pvEStore);
+		ve = simde_mm_load_si128(pvEStore);
 		
 		pvHLoad = pvHStore;    // new pvHLoad = pvHStore
 		pvScore = d.profbuf_.ptr() + off + 1; // reset veto vector
 		
 		// vf from last row gets shifted down by one to overlay the first row
 		// rfgape has already been subtracted from it.
-		vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
+		vf = simde_mm_slli_si128(vf, NBYTES_PER_WORD);
 		
-		vf = _mm_subs_epu8(vf, *pvScore); // veto some ref gap extensions
-		vf = _mm_max_epu8(vtmp, vf);
-		vtmp = _mm_subs_epu8(vf, vtmp);
-		vtmp = _mm_cmpeq_epi8(vtmp, vzero);
-		int cmp = _mm_movemask_epi8(vtmp);
+		vf = simde_mm_subs_epu8(vf, *pvScore); // veto some ref gap extensions
+		vf = simde_mm_max_epu8(vtmp, vf);
+		vtmp = simde_mm_subs_epu8(vf, vtmp);
+		vtmp = simde_mm_cmpeq_epi8(vtmp, vzero);
+		int cmp = simde_mm_movemask_epi8(vtmp);
 		
 		// If any element of vtmp is greater than H - gap-open...
 		j = 0;
 		while(cmp != 0xffff) {
 			// Store this vf
-			_mm_store_si128(pvFStore, vf);
+			simde_mm_store_si128(pvFStore, vf);
 			pvFStore += ROWSTRIDE;
 			
 			// Update vh w/r/t new vf
-			vh = _mm_max_epu8(vh, vf);
+			vh = simde_mm_max_epu8(vh, vf);
 			
 			// Save vH values
-			_mm_store_si128(pvHStore, vh);
+			simde_mm_store_si128(pvHStore, vh);
 			pvHStore += ROWSTRIDE;
 			
 			// Update highest score encountered this far
-			vcolmax = _mm_max_epu8(vcolmax, vh);
+			vcolmax = simde_mm_max_epu8(vcolmax, vh);
 			
 			// Update E in case it can be improved using our new vh
-			vh = _mm_subs_epu8(vh, rdgapo);
-			vh = _mm_subs_epu8(vh, *pvScore); // veto some read gap opens
-			ve = _mm_max_epu8(ve, vh);
-			_mm_store_si128(pvEStore, ve);
+			vh = simde_mm_subs_epu8(vh, rdgapo);
+			vh = simde_mm_subs_epu8(vh, *pvScore); // veto some read gap opens
+			ve = simde_mm_max_epu8(ve, vh);
+			simde_mm_store_si128(pvEStore, ve);
 			pvEStore += ROWSTRIDE;
 			pvScore += 2;
 			
 			assert_lt(j, iter);
 			if(++j == iter) {
 				pvFStore -= colstride;
-				vtmp = _mm_load_si128(pvFStore);   // load next vf ASAP
+				vtmp = simde_mm_load_si128(pvFStore);   // load next vf ASAP
 				pvHStore -= colstride;
-				vh = _mm_load_si128(pvHStore);     // load next vh ASAP
+				vh = simde_mm_load_si128(pvHStore);     // load next vh ASAP
 				pvEStore -= colstride;
-				ve = _mm_load_si128(pvEStore);     // load next ve ASAP
+				ve = simde_mm_load_si128(pvEStore);     // load next ve ASAP
 				pvScore = d.profbuf_.ptr() + off + 1;
 				j = 0;
-				vf = _mm_slli_si128(vf, NBYTES_PER_WORD);
+				vf = simde_mm_slli_si128(vf, NBYTES_PER_WORD);
 			} else {
-				vtmp = _mm_load_si128(pvFStore);   // load next vf ASAP
-				vh = _mm_load_si128(pvHStore);     // load next vh ASAP
-				ve = _mm_load_si128(pvEStore);     // load next vh ASAP
+				vtmp = simde_mm_load_si128(pvFStore);   // load next vf ASAP
+				vh = simde_mm_load_si128(pvHStore);     // load next vh ASAP
+				ve = simde_mm_load_si128(pvEStore);     // load next ve ASAP
 			}
 			
 			// Update F with another gap extension
-			vf = _mm_subs_epu8(vf, rfgape);
-			vf = _mm_subs_epu8(vf, *pvScore); // veto some ref gap extensions
-			vf = _mm_max_epu8(vtmp, vf);
-			vtmp = _mm_subs_epu8(vf, vtmp);
-			vtmp = _mm_cmpeq_epi8(vtmp, vzero);
-			cmp = _mm_movemask_epi8(vtmp);
+			vf = simde_mm_subs_epu8(vf, rfgape);
+			vf = simde_mm_subs_epu8(vf, *pvScore); // veto some ref gap extensions
+			vf = simde_mm_max_epu8(vtmp, vf);
+			vtmp = simde_mm_subs_epu8(vf, vtmp);
+			vtmp = simde_mm_cmpeq_epi8(vtmp, vzero);
+			cmp = simde_mm_movemask_epi8(vtmp);
 			nfixup++;
 		}
 
@@ -1329,21 +1304,21 @@
 #endif
 
 		// Store column maximum vector in first element of tmp
-		vmax = _mm_max_epu8(vmax, vcolmax);
-		_mm_store_si128(d.mat_.tmpvec(0, i - rfi_), vcolmax);
+		vmax = simde_mm_max_epu8(vmax, vcolmax);
+		simde_mm_store_si128(d.mat_.tmpvec(0, i - rfi_), vcolmax);
 
 		{
 			// Get single largest score in this column
 			vmaxtmp = vcolmax;
-			vtmp = _mm_srli_si128(vmaxtmp, 8);
-			vmaxtmp = _mm_max_epu8(vmaxtmp, vtmp);
-			vtmp = _mm_srli_si128(vmaxtmp, 4);
-			vmaxtmp = _mm_max_epu8(vmaxtmp, vtmp);
-			vtmp = _mm_srli_si128(vmaxtmp, 2);
-			vmaxtmp = _mm_max_epu8(vmaxtmp, vtmp);
-			vtmp = _mm_srli_si128(vmaxtmp, 1);
-			vmaxtmp = _mm_max_epu8(vmaxtmp, vtmp);
-			int score = _mm_extract_epi16(vmaxtmp, 0);
+			vtmp = simde_mm_srli_si128(vmaxtmp, 8);
+			vmaxtmp = simde_mm_max_epu8(vmaxtmp, vtmp);
+			vtmp = simde_mm_srli_si128(vmaxtmp, 4);
+			vmaxtmp = simde_mm_max_epu8(vmaxtmp, vtmp);
+			vtmp = simde_mm_srli_si128(vmaxtmp, 2);
+			vmaxtmp = simde_mm_max_epu8(vmaxtmp, vtmp);
+			vtmp = simde_mm_srli_si128(vmaxtmp, 1);
+			vmaxtmp = simde_mm_max_epu8(vmaxtmp, vtmp);
+			int score = simde_mm_extract_epi16(vmaxtmp, 0);
 			score = score & 0x00ff;
 
 			// Could we have saturated?
@@ -1375,14 +1350,14 @@
 	}
 
 	// Find largest score in vmax
-	vtmp = _mm_srli_si128(vmax, 8);
-	vmax = _mm_max_epu8(vmax, vtmp);
-	vtmp = _mm_srli_si128(vmax, 4);
-	vmax = _mm_max_epu8(vmax, vtmp);
-	vtmp = _mm_srli_si128(vmax, 2);
-	vmax = _mm_max_epu8(vmax, vtmp);
-	vtmp = _mm_srli_si128(vmax, 1);
-	vmax = _mm_max_epu8(vmax, vtmp);
+	vtmp = simde_mm_srli_si128(vmax, 8);
+	vmax = simde_mm_max_epu8(vmax, vtmp);
+	vtmp = simde_mm_srli_si128(vmax, 4);
+	vmax = simde_mm_max_epu8(vmax, vtmp);
+	vtmp = simde_mm_srli_si128(vmax, 2);
+	vmax = simde_mm_max_epu8(vmax, vtmp);
+	vtmp = simde_mm_srli_si128(vmax, 1);
+	vmax = simde_mm_max_epu8(vmax, vtmp);
 	
 	// Update metrics
 	if(!debug) {
@@ -1393,7 +1368,7 @@
 		met.fixup += nfixup;                    // DP fixup loop iters
 	}
 	
-	int score = _mm_extract_epi16(vmax, 0);
+	int score = simde_mm_extract_epi16(vmax, 0);
 	score = score & 0x00ff;
 
 	flag = 0;
@@ -1496,22 +1471,22 @@
 		size_t nrow_hi = nrow;
 		// First, check if there is a cell in this column with a score
 		// above the score threshold
-		__m128i vmax = *d.mat_.tmpvec(0, j);
-		__m128i vtmp = _mm_srli_si128(vmax, 8);
-		vmax = _mm_max_epu8(vmax, vtmp);
-		vtmp = _mm_srli_si128(vmax, 4);
-		vmax = _mm_max_epu8(vmax, vtmp);
-		vtmp = _mm_srli_si128(vmax, 2);
-		vmax = _mm_max_epu8(vmax, vtmp);
-		vtmp = _mm_srli_si128(vmax, 1);
-		vmax = _mm_max_epu8(vmax, vtmp);
-		int score = _mm_extract_epi16(vmax, 0);
+		simde__m128i vmax = *d.mat_.tmpvec(0, j);
+		simde__m128i vtmp = simde_mm_srli_si128(vmax, 8);
+		vmax = simde_mm_max_epu8(vmax, vtmp);
+		vtmp = simde_mm_srli_si128(vmax, 4);
+		vmax = simde_mm_max_epu8(vmax, vtmp);
+		vtmp = simde_mm_srli_si128(vmax, 2);
+		vmax = simde_mm_max_epu8(vmax, vtmp);
+		vtmp = simde_mm_srli_si128(vmax, 1);
+		vmax = simde_mm_max_epu8(vmax, vtmp);
+		int score = simde_mm_extract_epi16(vmax, 0);
 		score = score & 0x00ff;
 #ifndef NDEBUG
 		{
 			// Start in upper vector row and move down
 			TAlScore max = 0;
-			__m128i *pvH = d.mat_.hvec(0, j);
+			simde__m128i *pvH = d.mat_.hvec(0, j);
 			for(size_t i = 0; i < iter; i++) {
 				for(size_t k = 0; k < NWORDS_PER_REG; k++) {
 					TAlScore sc = (TAlScore)((TCScore*)pvH)[k];
@@ -1529,11 +1504,11 @@
 			continue;
 		}
 		// Get pointer to first cell in column to examine:
-		__m128i *pvHorig = d.mat_.hvec(0, j);
-		__m128i *pvH     = pvHorig;
+		simde__m128i *pvHorig = d.mat_.hvec(0, j);
+		simde__m128i *pvH     = pvHorig;
 		// Get pointer to the vector in the following column that corresponds
 		// to the cells diagonally down and to the right from the cells in pvH
-		__m128i *pvHSucc = (j < ncol-1) ? d.mat_.hvec(0, j+1) : NULL;
+		simde__m128i *pvHSucc = (j < ncol-1) ? d.mat_.hvec(0, j+1) : NULL;
 		// Start in upper vector row and move down
 		for(size_t i = 0; i < iter; i++) {
 			if(pvHSucc != NULL) {
@@ -1709,7 +1684,7 @@
 	size_t rowelt, rowvec, eltvec;
 	size_t left_rowelt, up_rowelt, upleft_rowelt;
 	size_t left_rowvec, up_rowvec, upleft_rowvec;
-	__m128i *cur_vec, *left_vec, *up_vec, *upleft_vec;
+	simde__m128i *cur_vec, *left_vec, *up_vec, *upleft_vec;
 	NEW_ROW_COL(row, col);
 	while((int)row >= 0) {
 		met.btcell++;
--- hisat2.orig/sse_util.cpp
+++ hisat2/sse_util.cpp
@@ -25,9 +25,9 @@
  * Given a column of filled-in cells, save the checkpointed cells in cs_.
  */
 void Checkpointer::commitCol(
-	__m128i *pvH,
-	__m128i *pvE,
-	__m128i *pvF,
+	simde__m128i *pvH,
+	simde__m128i *pvE,
+	simde__m128i *pvF,
 	size_t coli)
 {
 }
--- hisat2.orig/sse_util.h
+++ hisat2/sse_util.h
@@ -24,7 +24,7 @@
 #include "ds.h"
 #include "limit.h"
 #include <iostream>
-#include <emmintrin.h>
+#include "debian/include/simde/x86/sse2.h"
 
 class EList_m128i {
 public:
@@ -103,7 +103,7 @@
 	 */
 	void zero() {
 		if(cur_ > 0) {
-			memset(list_, 0, cur_ * sizeof(__m128i));
+			memset(list_, 0, cur_ * sizeof(simde__m128i));
 		}
 	}
 
@@ -148,7 +148,7 @@
 	/**
 	 * Return a reference to the ith element.
 	 */
-	inline __m128i& operator[](size_t i) {
+	inline simde__m128i& operator[](size_t i) {
 		assert_lt(i, cur_);
 		return list_[i];
 	}
@@ -156,7 +156,7 @@
 	/**
 	 * Return a reference to the ith element.
 	 */
-	inline __m128i operator[](size_t i) const {
+	inline simde__m128i operator[](size_t i) const {
 		assert_lt(i, cur_);
 		return list_[i];
 	}
@@ -164,26 +164,26 @@
 	/**
 	 * Return a reference to the ith element.
 	 */
-	inline __m128i& get(size_t i) {
+	inline simde__m128i& get(size_t i) {
 		return operator[](i);
 	}
 	
 	/**
 	 * Return a reference to the ith element.
 	 */
-	inline __m128i get(size_t i) const {
+	inline simde__m128i get(size_t i) const {
 		return operator[](i);
 	}
 
 	/**
 	 * Return a pointer to the beginning of the buffer.
 	 */
-	__m128i *ptr() { return list_; }
+	simde__m128i *ptr() { return list_; }
 
 	/**
 	 * Return a const pointer to the beginning of the buffer.
 	 */
-	const __m128i *ptr() const { return list_; }
+	const simde__m128i *ptr() const { return list_; }
 
 	/**
 	 * Return memory category.
@@ -214,21 +214,21 @@
 	 * Allocate a T array of length sz_ and store in list_.  Also,
 	 * tally into the global memory tally.
 	 */
-	__m128i *alloc(size_t sz) {
-		__m128i* last_alloc_;
+	simde__m128i *alloc(size_t sz) {
+		simde__m128i* last_alloc_;
 		try {
-			last_alloc_ = new __m128i[sz + 2];
+			last_alloc_ = new simde__m128i[sz + 2];
 		} catch(std::bad_alloc& e) {
-			std::cerr << "Error: Out of memory allocating " << sz << " __m128i's for DP matrix: '" << e.what() << "'" << std::endl;
+			std::cerr << "Error: Out of memory allocating " << sz << " simde__m128i's for DP matrix: '" << e.what() << "'" << std::endl;
 			throw e;
 		}
-		__m128i* tmp = last_alloc_;
+		simde__m128i* tmp = last_alloc_;
 		size_t tmpint = (size_t)tmp;
 		// Align it!
 		if((tmpint & 0xf) != 0) {
 			tmpint += 15;
 			tmpint &= (~0xf);
-			tmp = reinterpret_cast<__m128i*>(tmpint);
+			tmp = reinterpret_cast<simde__m128i*>(tmpint);
 		}
 		assert_eq(0, (tmpint & 0xf)); // should be 16-byte aligned
 		assert(tmp != NULL);
@@ -267,7 +267,7 @@
 	 */
 	void expandCopyExact(size_t newsz) {
 		if(newsz <= sz_) return;
-		__m128i* tmp = alloc(newsz);
+		simde__m128i* tmp = alloc(newsz);
 		assert(tmp != NULL);
 		size_t cur = cur_;
 		if(list_ != NULL) {
@@ -303,7 +303,7 @@
 		assert(list_ != NULL);
 		assert_gt(newsz, 0);
 		free();
-		__m128i* tmp = alloc(newsz);
+		simde__m128i* tmp = alloc(newsz);
 		assert(tmp != NULL);
 		list_ = tmp;
 		sz_ = newsz;
@@ -311,8 +311,8 @@
 	}
 
 	int      cat_;        // memory category, for accounting purposes
-	__m128i* last_alloc_; // what new[] originally returns
-	__m128i *list_;       // list ptr, aligned version of what new[] returns
+	simde__m128i* last_alloc_; // what new[] originally returns
+	simde__m128i *list_;       // list ptr, aligned version of what new[] returns
 	size_t   sz_;         // capacity
 	size_t   cur_;        // occupancy (AKA size)
 };
@@ -399,7 +399,7 @@
 	 */
 	int64_t debugCell(size_t row, size_t col, int hef) const {
 		assert(debug_);
-		const __m128i* ptr = qcolsD_.ptr() + hef;
+		const simde__m128i* ptr = qcolsD_.ptr() + hef;
 		// Fast forward to appropriate column
 		ptr += ((col * niter_) << 2);
 		size_t mod = row % niter_; // which m128i
@@ -477,7 +477,7 @@
 		// It must be in a checkpointed column
 		assert_eq(lomask_, (col & lomask_));
 		// Fast forward to appropriate column
-		const __m128i* ptr = qcols_.ptr() + hef;
+		const simde__m128i* ptr = qcols_.ptr() + hef;
 		ptr += (((col >> perpow2_) * niter_) << 2);
 		size_t mod = row % niter_; // which m128i
 		size_t div = row / niter_; // offset into m128i
@@ -507,7 +507,7 @@
 	/**
 	 * Given a column of filled-in cells, save the checkpointed cells in cs_.
 	 */
-	void commitCol(__m128i *pvH, __m128i *pvE, __m128i *pvF, size_t coli);
+	void commitCol(simde__m128i *pvH, simde__m128i *pvE, simde__m128i *pvF, size_t coli);
 	
 	/**
 	 * Reset the state of the Checkpointer.
@@ -564,7 +564,7 @@
 	
 	// We store columns in this way to reduce overhead of populating them
 	bool          is8_;     // true -> fill used 8-bit cells
-	size_t        niter_;   // # __m128i words per column
+	size_t        niter_;   // # simde__m128i words per column
 	EList_m128i   qcols_;   // checkpoint E/F/H values for select columns
 	
 	bool          debug_;   // get debug checkpoints? (i.e. fill qcolsD_?)
--- hisat2.orig/Makefile
+++ hisat2/Makefile
@@ -55,7 +55,7 @@
 	MACOS = 1
 endif
 
-EXTRA_FLAGS += -DPOPCNT_CAPABILITY
+#EXTRA_FLAGS += -DPOPCNT_CAPABILITY
 INC += -I third_party
 
 MM_DEF = 
