From 086cf9075087adfe08f50752077a29c964ba1dab Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Tue, 14 Feb 2017 13:43:27 -0800
Subject: crypto: gf128mul - fix some comments

Fix incorrect references to GF(128) instead of GF(2^128), as these are
two entirely different fields, and fix a few other incorrect comments.

Cc: Alex Cope <alexcope@google.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/gf128mul.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'crypto/gf128mul.c')

diff --git a/crypto/gf128mul.c b/crypto/gf128mul.c
index 72015fee..d9e3eecc 100644
--- a/crypto/gf128mul.c
+++ b/crypto/gf128mul.c
@@ -44,7 +44,7 @@
  ---------------------------------------------------------------------------
  Issue 31/01/2006
 
- This file provides fast multiplication in GF(128) as required by several
+ This file provides fast multiplication in GF(2^128) as required by several
  cryptographic authentication modes
 */
 
@@ -116,9 +116,10 @@
 static const u16 gf128mul_table_lle[256] = gf128mul_dat(xda_lle);
 static const u16 gf128mul_table_bbe[256] = gf128mul_dat(xda_bbe);
 
-/* These functions multiply a field element by x, by x^4 and by x^8
- * in the polynomial field representation. It uses 32-bit word operations
- * to gain speed but compensates for machine endianess and hence works
+/*
+ * The following functions multiply a field element by x or by x^8 in
+ * the polynomial field representation.  They use 64-bit word operations
+ * to gain speed but compensate for machine endianness and hence work
  * correctly on both styles of machine.
  */
 
@@ -251,7 +252,7 @@ EXPORT_SYMBOL(gf128mul_bbe);
 
 /*      This version uses 64k bytes of table space.
     A 16 byte buffer has to be multiplied by a 16 byte key
-    value in GF(128).  If we consider a GF(128) value in
+    value in GF(2^128).  If we consider a GF(2^128) value in
     the buffer's lowest byte, we can construct a table of
     the 256 16 byte values that result from the 256 values
     of this byte.  This requires 4096 bytes. But we also
@@ -330,7 +331,7 @@ EXPORT_SYMBOL(gf128mul_64k_bbe);
 
 /*      This version uses 4k bytes of table space.
     A 16 byte buffer has to be multiplied by a 16 byte key
-    value in GF(128).  If we consider a GF(128) value in a
+    value in GF(2^128).  If we consider a GF(2^128) value in a
     single byte, we can construct a table of the 256 16 byte
     values that result from the 256 values of this byte.
     This requires 4096 bytes. If we take the highest byte in
-- 
cgit v1.2.3


From 192de53897d18aea01ca546267ef1b1b1c38c673 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Tue, 14 Feb 2017 13:43:28 -0800
Subject: crypto: gf128mul - remove xx() macro

The xx() macro serves no purpose and can be removed.

Cc: Alex Cope <alexcope@google.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/gf128mul.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

(limited to 'crypto/gf128mul.c')

diff --git a/crypto/gf128mul.c b/crypto/gf128mul.c
index d9e3eecc..c050cf6f 100644
--- a/crypto/gf128mul.c
+++ b/crypto/gf128mul.c
@@ -97,20 +97,18 @@
     the table above
 */
 
-#define xx(p, q)	0x##p##q
-
 #define xda_bbe(i) ( \
-	(i & 0x80 ? xx(43, 80) : 0) ^ (i & 0x40 ? xx(21, c0) : 0) ^ \
-	(i & 0x20 ? xx(10, e0) : 0) ^ (i & 0x10 ? xx(08, 70) : 0) ^ \
-	(i & 0x08 ? xx(04, 38) : 0) ^ (i & 0x04 ? xx(02, 1c) : 0) ^ \
-	(i & 0x02 ? xx(01, 0e) : 0) ^ (i & 0x01 ? xx(00, 87) : 0) \
+	(i & 0x80 ? 0x4380 : 0) ^ (i & 0x40 ? 0x21c0 : 0) ^ \
+	(i & 0x20 ? 0x10e0 : 0) ^ (i & 0x10 ? 0x0870 : 0) ^ \
+	(i & 0x08 ? 0x0438 : 0) ^ (i & 0x04 ? 0x021c : 0) ^ \
+	(i & 0x02 ? 0x010e : 0) ^ (i & 0x01 ? 0x0087 : 0) \
 )
 
 #define xda_lle(i) ( \
-	(i & 0x80 ? xx(e1, 00) : 0) ^ (i & 0x40 ? xx(70, 80) : 0) ^ \
-	(i & 0x20 ? xx(38, 40) : 0) ^ (i & 0x10 ? xx(1c, 20) : 0) ^ \
-	(i & 0x08 ? xx(0e, 10) : 0) ^ (i & 0x04 ? xx(07, 08) : 0) ^ \
-	(i & 0x02 ? xx(03, 84) : 0) ^ (i & 0x01 ? xx(01, c2) : 0) \
+	(i & 0x80 ? 0xe100 : 0) ^ (i & 0x40 ? 0x7080 : 0) ^ \
+	(i & 0x20 ? 0x3840 : 0) ^ (i & 0x10 ? 0x1c20 : 0) ^ \
+	(i & 0x08 ? 0x0e10 : 0) ^ (i & 0x04 ? 0x0708 : 0) ^ \
+	(i & 0x02 ? 0x0384 : 0) ^ (i & 0x01 ? 0x01c2 : 0) \
 )
 
 static const u16 gf128mul_table_lle[256] = gf128mul_dat(xda_lle);
-- 
cgit v1.2.3


From 18f9f789aceb0dd0efd137f6821d91f372d043f5 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Tue, 14 Feb 2017 13:43:29 -0800
Subject: crypto: gf128mul - rename the byte overflow tables

Though the GF(2^128) byte overflow tables were named the "lle" and "bbe"
tables, they are not actually tied to these element formats
specifically, but rather to particular a "bit endianness".  For example,
the bbe table is actually used for both bbe and ble multiplication.
Therefore, rename the tables to "le" and "be" and update the comment to
explain this.

Cc: Alex Cope <alexcope@google.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/gf128mul.c | 49 ++++++++++++++++++++++++++++++++-----------------
 1 file changed, 32 insertions(+), 17 deletions(-)

(limited to 'crypto/gf128mul.c')

diff --git a/crypto/gf128mul.c b/crypto/gf128mul.c
index c050cf6f..1fde1c79 100644
--- a/crypto/gf128mul.c
+++ b/crypto/gf128mul.c
@@ -88,31 +88,46 @@
 	q(0xf8), q(0xf9), q(0xfa), q(0xfb), q(0xfc), q(0xfd), q(0xfe), q(0xff) \
 }
 
-/*	Given the value i in 0..255 as the byte overflow when a field element
-    in GHASH is multiplied by x^8, this function will return the values that
-    are generated in the lo 16-bit word of the field value by applying the
-    modular polynomial. The values lo_byte and hi_byte are returned via the
-    macro xp_fun(lo_byte, hi_byte) so that the values can be assembled into
-    memory as required by a suitable definition of this macro operating on
-    the table above
-*/
+/*
+ * Given a value i in 0..255 as the byte overflow when a field element
+ * in GF(2^128) is multiplied by x^8, the following macro returns the
+ * 16-bit value that must be XOR-ed into the low-degree end of the
+ * product to reduce it modulo the polynomial x^128 + x^7 + x^2 + x + 1.
+ *
+ * There are two versions of the macro, and hence two tables: one for
+ * the "be" convention where the highest-order bit is the coefficient of
+ * the highest-degree polynomial term, and one for the "le" convention
+ * where the highest-order bit is the coefficient of the lowest-degree
+ * polynomial term.  In both cases the values are stored in CPU byte
+ * endianness such that the coefficients are ordered consistently across
+ * bytes, i.e. in the "be" table bits 15..0 of the stored value
+ * correspond to the coefficients of x^15..x^0, and in the "le" table
+ * bits 15..0 correspond to the coefficients of x^0..x^15.
+ *
+ * Therefore, provided that the appropriate byte endianness conversions
+ * are done by the multiplication functions (and these must be in place
+ * anyway to support both little endian and big endian CPUs), the "be"
+ * table can be used for multiplications of both "bbe" and "ble"
+ * elements, and the "le" table can be used for multiplications of both
+ * "lle" and "lbe" elements.
+ */
 
-#define xda_bbe(i) ( \
+#define xda_be(i) ( \
 	(i & 0x80 ? 0x4380 : 0) ^ (i & 0x40 ? 0x21c0 : 0) ^ \
 	(i & 0x20 ? 0x10e0 : 0) ^ (i & 0x10 ? 0x0870 : 0) ^ \
 	(i & 0x08 ? 0x0438 : 0) ^ (i & 0x04 ? 0x021c : 0) ^ \
 	(i & 0x02 ? 0x010e : 0) ^ (i & 0x01 ? 0x0087 : 0) \
 )
 
-#define xda_lle(i) ( \
+#define xda_le(i) ( \
 	(i & 0x80 ? 0xe100 : 0) ^ (i & 0x40 ? 0x7080 : 0) ^ \
 	(i & 0x20 ? 0x3840 : 0) ^ (i & 0x10 ? 0x1c20 : 0) ^ \
 	(i & 0x08 ? 0x0e10 : 0) ^ (i & 0x04 ? 0x0708 : 0) ^ \
 	(i & 0x02 ? 0x0384 : 0) ^ (i & 0x01 ? 0x01c2 : 0) \
 )
 
-static const u16 gf128mul_table_lle[256] = gf128mul_dat(xda_lle);
-static const u16 gf128mul_table_bbe[256] = gf128mul_dat(xda_bbe);
+static const u16 gf128mul_table_le[256] = gf128mul_dat(xda_le);
+static const u16 gf128mul_table_be[256] = gf128mul_dat(xda_be);
 
 /*
  * The following functions multiply a field element by x or by x^8 in
@@ -125,7 +140,7 @@ static void gf128mul_x_lle(be128 *r, const be128 *x)
 {
 	u64 a = be64_to_cpu(x->a);
 	u64 b = be64_to_cpu(x->b);
-	u64 _tt = gf128mul_table_lle[(b << 7) & 0xff];
+	u64 _tt = gf128mul_table_le[(b << 7) & 0xff];
 
 	r->b = cpu_to_be64((b >> 1) | (a << 63));
 	r->a = cpu_to_be64((a >> 1) ^ (_tt << 48));
@@ -135,7 +150,7 @@ static void gf128mul_x_bbe(be128 *r, const be128 *x)
 {
 	u64 a = be64_to_cpu(x->a);
 	u64 b = be64_to_cpu(x->b);
-	u64 _tt = gf128mul_table_bbe[a >> 63];
+	u64 _tt = gf128mul_table_be[a >> 63];
 
 	r->a = cpu_to_be64((a << 1) | (b >> 63));
 	r->b = cpu_to_be64((b << 1) ^ _tt);
@@ -145,7 +160,7 @@ void gf128mul_x_ble(be128 *r, const be128 *x)
 {
 	u64 a = le64_to_cpu(x->a);
 	u64 b = le64_to_cpu(x->b);
-	u64 _tt = gf128mul_table_bbe[b >> 63];
+	u64 _tt = gf128mul_table_be[b >> 63];
 
 	r->a = cpu_to_le64((a << 1) ^ _tt);
 	r->b = cpu_to_le64((b << 1) | (a >> 63));
@@ -156,7 +171,7 @@ static void gf128mul_x8_lle(be128 *x)
 {
 	u64 a = be64_to_cpu(x->a);
 	u64 b = be64_to_cpu(x->b);
-	u64 _tt = gf128mul_table_lle[b & 0xff];
+	u64 _tt = gf128mul_table_le[b & 0xff];
 
 	x->b = cpu_to_be64((b >> 8) | (a << 56));
 	x->a = cpu_to_be64((a >> 8) ^ (_tt << 48));
@@ -166,7 +181,7 @@ static void gf128mul_x8_bbe(be128 *x)
 {
 	u64 a = be64_to_cpu(x->a);
 	u64 b = be64_to_cpu(x->b);
-	u64 _tt = gf128mul_table_bbe[a >> 56];
+	u64 _tt = gf128mul_table_be[a >> 56];
 
 	x->a = cpu_to_be64((a << 8) | (b >> 56));
 	x->b = cpu_to_be64((b << 8) ^ _tt);
-- 
cgit v1.2.3


From e61411a03b44b036c309b833396cf105191baa9b Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Tue, 14 Feb 2017 13:43:30 -0800
Subject: crypto: gf128mul - constify 4k and 64k multiplication tables

Constify the multiplication tables passed to the 4k and 64k
multiplication functions, as they are not modified by these functions.

Cc: Alex Cope <alexcope@google.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/gf128mul.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'crypto/gf128mul.c')

diff --git a/crypto/gf128mul.c b/crypto/gf128mul.c
index 1fde1c79..04facc06 100644
--- a/crypto/gf128mul.c
+++ b/crypto/gf128mul.c
@@ -329,7 +329,7 @@ void gf128mul_free_64k(struct gf128mul_64k *t)
 }
 EXPORT_SYMBOL(gf128mul_free_64k);
 
-void gf128mul_64k_bbe(be128 *a, struct gf128mul_64k *t)
+void gf128mul_64k_bbe(be128 *a, const struct gf128mul_64k *t)
 {
 	u8 *ap = (u8 *)a;
 	be128 r[1];
@@ -402,7 +402,7 @@ out:
 }
 EXPORT_SYMBOL(gf128mul_init_4k_bbe);
 
-void gf128mul_4k_lle(be128 *a, struct gf128mul_4k *t)
+void gf128mul_4k_lle(be128 *a, const struct gf128mul_4k *t)
 {
 	u8 *ap = (u8 *)a;
 	be128 r[1];
@@ -417,7 +417,7 @@ void gf128mul_4k_lle(be128 *a, struct gf128mul_4k *t)
 }
 EXPORT_SYMBOL(gf128mul_4k_lle);
 
-void gf128mul_4k_bbe(be128 *a, struct gf128mul_4k *t)
+void gf128mul_4k_bbe(be128 *a, const struct gf128mul_4k *t)
 {
 	u8 *ap = (u8 *)a;
 	be128 r[1];
-- 
cgit v1.2.3


From 2eefd4ca30d2450a94472b6fd9ffb3dacb893a77 Mon Sep 17 00:00:00 2001
From: Ondrej Mosnáček <omosnacek@gmail.com>
Date: Sun, 2 Apr 2017 21:19:13 +0200
Subject: crypto: gf128mul - define gf128mul_x_* in gf128mul.h

The gf128mul_x_ble function is currently defined in gf128mul.c, because
it depends on the gf128mul_table_be multiplication table.

However, since the function is very small and only uses two values from
the table, it is better for it to be defined as inline function in
gf128mul.h. That way, the function can be inlined by the compiler for
better performance.

For consistency, the other gf128mul_x_* functions are also moved to the
header file. In addition, the code is rewritten to be constant-time.

After this change, the speed of the generic 'xts(aes)' implementation
increased from ~225 MiB/s to ~235 MiB/s (measured using 'cryptsetup
benchmark -c aes-xts-plain64' on an Intel system with CRYPTO_AES_X86_64
and CRYPTO_AES_NI_INTEL disabled).

Signed-off-by: Ondrej Mosnacek <omosnacek@gmail.com>
Reviewd-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/gf128mul.c | 33 +--------------------------------
 1 file changed, 1 insertion(+), 32 deletions(-)

(limited to 'crypto/gf128mul.c')

diff --git a/crypto/gf128mul.c b/crypto/gf128mul.c
index 04facc06..dc012129 100644
--- a/crypto/gf128mul.c
+++ b/crypto/gf128mul.c
@@ -130,43 +130,12 @@ static const u16 gf128mul_table_le[256] = gf128mul_dat(xda_le);
 static const u16 gf128mul_table_be[256] = gf128mul_dat(xda_be);
 
 /*
- * The following functions multiply a field element by x or by x^8 in
+ * The following functions multiply a field element by x^8 in
  * the polynomial field representation.  They use 64-bit word operations
  * to gain speed but compensate for machine endianness and hence work
  * correctly on both styles of machine.
  */
 
-static void gf128mul_x_lle(be128 *r, const be128 *x)
-{
-	u64 a = be64_to_cpu(x->a);
-	u64 b = be64_to_cpu(x->b);
-	u64 _tt = gf128mul_table_le[(b << 7) & 0xff];
-
-	r->b = cpu_to_be64((b >> 1) | (a << 63));
-	r->a = cpu_to_be64((a >> 1) ^ (_tt << 48));
-}
-
-static void gf128mul_x_bbe(be128 *r, const be128 *x)
-{
-	u64 a = be64_to_cpu(x->a);
-	u64 b = be64_to_cpu(x->b);
-	u64 _tt = gf128mul_table_be[a >> 63];
-
-	r->a = cpu_to_be64((a << 1) | (b >> 63));
-	r->b = cpu_to_be64((b << 1) ^ _tt);
-}
-
-void gf128mul_x_ble(be128 *r, const be128 *x)
-{
-	u64 a = le64_to_cpu(x->a);
-	u64 b = le64_to_cpu(x->b);
-	u64 _tt = gf128mul_table_be[b >> 63];
-
-	r->a = cpu_to_le64((a << 1) ^ _tt);
-	r->b = cpu_to_le64((b << 1) | (a >> 63));
-}
-EXPORT_SYMBOL(gf128mul_x_ble);
-
 static void gf128mul_x8_lle(be128 *x)
 {
 	u64 a = be64_to_cpu(x->a);
-- 
cgit v1.2.3