https://github.com/Tarsnap/tarsnap/commit/ca40c06f290fb8298dc2e583303d45b58878f37b
https://github.com/Tarsnap/tarsnap/commit/4af6d8350ab53d0f1f3104ce3d9072c2d5f9ef7a

From 4af6d8350ab53d0f1f3104ce3d9072c2d5f9ef7a Mon Sep 17 00:00:00 2001
From: Graham Percival <gperciva@tarsnap.com>
Date: Fri, 1 Apr 2022 16:58:43 -0700
Subject: [PATCH] scrypt: Fix strict aliasing

The original scrypt code treated its data as blobs of bytes, accessing
them in whatever manner was convenient from time to time:

* as 32-bit words or vectors thereof for the purpose of the Salsa20/8 core
* in machine-word-sized chunks (aka. size_t) for block copy and xor operations
* as 32-bit words for the Integerify function.

This worked fine at the time, but newer compilers apply strict aliasing rules
which allow them to assume that e.g. data accessed as a uint32_t is not the
same as data accessed as a size_t, resulting in miscompilation.

Note that in recent versions of scrypt (after 2015-07-18; versions 1.2.0 and
later) such miscompilation should be detected by the built-in runtime testing.

To avoid aliasing problems, the generic scrypt code now operates on uint32_t
throughout while the SSE2-enabled scrypt code operates on __m128i throughout.

Experimentally, we found that memcpy() speeds up blkcpy() in the plain C
case, but slowed it down in the _sse2.c case (probably because memcpy
can make use of vector instructions internally, but cannot assume that
it will always have a multiple of 16 bytes, as we do).
--- a/lib/crypto/crypto_scrypt_smix.c
+++ b/lib/crypto/crypto_scrypt_smix.c
@@ -27,39 +27,32 @@
  * online backup system.
  */
 #include <stdint.h>
+#include <string.h>
 
 #include "sysendian.h"
 
 #include "crypto_scrypt_smix.h"
 
-static void blkcpy(void *, const void *, size_t);
-static void blkxor(void *, const void *, size_t);
+static void blkcpy(uint32_t *, const uint32_t *, size_t);
+static void blkxor(uint32_t *, const uint32_t *, size_t);
 static void salsa20_8(uint32_t[16]);
 static void blockmix_salsa8(const uint32_t *, uint32_t *, uint32_t *, size_t);
-static uint64_t integerify(const void *, size_t);
+static uint64_t integerify(const uint32_t *, size_t);
 
 static void
-blkcpy(void * dest, const void * src, size_t len)
+blkcpy(uint32_t * dest, const uint32_t * src, size_t len)
 {
-	size_t * D = dest;
-	const size_t * S = src;
-	size_t L = len / sizeof(size_t);
-	size_t i;
 
-	for (i = 0; i < L; i++)
-		D[i] = S[i];
+	memcpy(dest, src, len);
 }
 
 static void
-blkxor(void * dest, const void * src, size_t len)
+blkxor(uint32_t * dest, const uint32_t * src, size_t len)
 {
-	size_t * D = dest;
-	const size_t * S = src;
-	size_t L = len / sizeof(size_t);
 	size_t i;
 
-	for (i = 0; i < L; i++)
-		D[i] ^= S[i];
+	for (i = 0; i < len / 4; i++)
+		dest[i] ^= src[i];
 }
 
 /**
@@ -145,9 +138,9 @@ blockmix_salsa8(const uint32_t * Bin, uint32_t * Bout, uint32_t * X, size_t r)
  * Return the result of parsing B_{2r-1} as a little-endian integer.
  */
 static uint64_t
-integerify(const void * B, size_t r)
+integerify(const uint32_t * B, size_t r)
 {
-	const uint32_t * X = (const void *)((uintptr_t)(B) + (2 * r - 1) * 64);
+	const uint32_t * X = B + (2 * r - 1) * 16;
 
 	return (((uint64_t)(X[1]) << 32) + X[0]);
 }
--- a/lib/crypto/crypto_scrypt_smix_sse2.c
+++ b/lib/crypto/crypto_scrypt_smix_sse2.c
@@ -36,34 +36,30 @@
 
 #include "crypto_scrypt_smix_sse2.h"
 
-static void blkcpy(void *, const void *, size_t);
-static void blkxor(void *, const void *, size_t);
+static void blkcpy(__m128i *, const __m128i *, size_t);
+static void blkxor(__m128i *, const __m128i *, size_t);
 static void salsa20_8(__m128i[4]);
 static void blockmix_salsa8(const __m128i *, __m128i *, __m128i *, size_t);
-static uint64_t integerify(const void *, size_t);
+static uint64_t integerify(const __m128i *, size_t);
 
 static void
-blkcpy(void * dest, const void * src, size_t len)
+blkcpy(__m128i * dest, const __m128i * src, size_t len)
 {
-	__m128i * D = dest;
-	const __m128i * S = src;
 	size_t L = len / 16;
 	size_t i;
 
 	for (i = 0; i < L; i++)
-		D[i] = S[i];
+		dest[i] = src[i];
 }
 
 static void
-blkxor(void * dest, const void * src, size_t len)
+blkxor(__m128i * dest, const __m128i * src, size_t len)
 {
-	__m128i * D = dest;
-	const __m128i * S = src;
 	size_t L = len / 16;
 	size_t i;
 
 	for (i = 0; i < L; i++)
-		D[i] = _mm_xor_si128(D[i], S[i]);
+		dest[i] = _mm_xor_si128(dest[i], src[i]);
 }
 
 /**
@@ -168,11 +164,18 @@ blockmix_salsa8(const __m128i * Bin, __m128i * Bout, __m128i * X, size_t r)
  * Note that B's layout is permuted compared to the generic implementation.
  */
 static uint64_t
-integerify(const void * B, size_t r)
+integerify(const __m128i * B, size_t r)
 {
-	const uint32_t * X = (const void *)((uintptr_t)(B) + (2 * r - 1) * 64);
+	const __m128i * X = B + (2*r - 1) * 4;
+	uint32_t X0, X13;
 
-	return (((uint64_t)(X[13]) << 32) + X[0]);
+	/* Get the first 32-bit element in X[0]. */
+	X0 = (uint32_t)_mm_cvtsi128_si32(X[0]);
+
+	/* Get the second 32-bit element in X[3]. */
+	X13 = (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(X[3], 4));
+
+	return (((uint64_t)(X13) << 32) + X0);
 }
 
 /**
