Use the assembly version of salsa20_xmm6 by default, if possible

icc produces good code from the intrinsics-based translation,
clang produces okay code, but gcc doesn't perform very well at the moment.

It's a bummer to have a 3rd implementation, but salsa20 is used quite
a lot in the library, so it deserves special attention.

If the assembly code cannot be assembled, fall back to the reduced
version of the intrinsics-based translation.

So the final library always contains at most two implementations.
This commit is contained in:
Frank Denis 2017-02-27 00:05:37 +01:00
parent 7d29c0fbd7
commit e3b9907429
5 changed files with 81 additions and 22 deletions

View File

@ -118,6 +118,13 @@ libsodium_la_SOURCES += \
crypto_scalarmult/curve25519/ref10/x25519_ref10.h
endif
if HAVE_AMD64_ASM
libsodium_la_SOURCES += \
crypto_stream/salsa20/xmm6/salsa20_xmm6-asm.S \
crypto_stream/salsa20/xmm6/salsa20_xmm6.c \
crypto_stream/salsa20/xmm6/salsa20_xmm6.h
endif
noinst_HEADERS = \
crypto_scalarmult/curve25519/sandy2x/consts.S \
crypto_scalarmult/curve25519/sandy2x/fe51_mul.S \
@ -206,12 +213,16 @@ libsse2_la_CPPFLAGS = $(libsodium_la_CPPFLAGS) \
libsse2_la_SOURCES = \
crypto_pwhash/scryptsalsa208sha256/sse/pwhash_scryptsalsa208sha256_sse.c \
crypto_onetimeauth/poly1305/sse2/poly1305_sse2.c \
crypto_onetimeauth/poly1305/sse2/poly1305_sse2.h \
crypto_onetimeauth/poly1305/sse2/poly1305_sse2.h
if !HAVE_AMD64_ASM
libsse2_la_SOURCES += \
crypto_stream/salsa20/xmm6int/salsa20_xmm6int-sse2.c \
crypto_stream/salsa20/xmm6int/salsa20_xmm6int-sse2.h \
crypto_stream/salsa20/xmm6int/u0.h \
crypto_stream/salsa20/xmm6int/u1.h \
crypto_stream/salsa20/xmm6int/u4.h
endif
libssse3_la_LDFLAGS = $(libsodium_la_LDFLAGS)
libssse3_la_CPPFLAGS = $(libsodium_la_CPPFLAGS) \
@ -245,4 +256,7 @@ libavx2_la_SOURCES = \
crypto_stream/chacha20/dolbeau/u8.h \
crypto_stream/salsa20/xmm6int/salsa20_xmm6int-avx2.c \
crypto_stream/salsa20/xmm6int/salsa20_xmm6int-avx2.h \
crypto_stream/salsa20/xmm6int/u0.h \
crypto_stream/salsa20/xmm6int/u1.h \
crypto_stream/salsa20/xmm6int/u4.h \
crypto_stream/salsa20/xmm6int/u8.h

View File

@ -4,8 +4,12 @@
#include "runtime.h"
#include "stream_salsa20.h"
#include "ref/salsa20_ref.h"
#ifdef HAVE_EMMINTRIN_H
#ifdef HAVE_AMD64_ASM
# include "xmm6/salsa20_xmm6.h"
#else
# include "ref/salsa20_ref.h"
#endif
#if !defined(HAVE_AMD64_ASM) && defined(HAVE_EMMINTRIN_H)
# include "xmm6int/salsa20_xmm6int-sse2.h"
#endif
#if defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && \
@ -13,9 +17,9 @@
# include "xmm6int/salsa20_xmm6int-avx2.h"
#endif
#if defined(HAVE_EMMINTRIN_H) && defined(__x86_64__)
#if HAVE_AMD64_ASM
static const crypto_stream_salsa20_implementation *implementation =
&crypto_stream_salsa20_xmm6int_sse2_implementation;
&crypto_stream_salsa20_xmm6_implementation;
#else
static const crypto_stream_salsa20_implementation *implementation =
&crypto_stream_salsa20_ref_implementation;
@ -66,8 +70,8 @@ crypto_stream_salsa20_keygen(unsigned char k[crypto_stream_salsa20_KEYBYTES])
int
_crypto_stream_salsa20_pick_best_implementation(void)
{
#if defined(HAVE_EMMINTRIN_H) && defined(__x86_64__)
implementation = &crypto_stream_salsa20_xmm6int_sse2_implementation;
#ifdef HAVE_AMD64_ASM
implementation = &crypto_stream_salsa20_xmm6_implementation;
#else
implementation = &crypto_stream_salsa20_ref_implementation;
#endif
@ -79,7 +83,7 @@ _crypto_stream_salsa20_pick_best_implementation(void)
return 0;
}
#endif
#ifdef HAVE_EMMINTRIN_H
#if !defined(HAVE_AMD64_ASM) && defined(HAVE_EMMINTRIN_H)
if (sodium_runtime_has_sse2()) {
implementation = &crypto_stream_salsa20_xmm6int_sse2_implementation;
return 0;

View File

@ -3,14 +3,18 @@
.text
.p2align 5
.globl crypto_stream_salsa20
.globl _crypto_stream_salsa20
#ifdef __ELF__
.type crypto_stream_salsa20, @function
.type _crypto_stream_salsa20, @function
#ifdef ASM_HIDE_SYMBOL
ASM_HIDE_SYMBOL stream_salsa20_xmm6
ASM_HIDE_SYMBOL _stream_salsa20_xmm6
#endif
crypto_stream_salsa20:
_crypto_stream_salsa20:
.globl stream_salsa20_xmm6
.globl _stream_salsa20_xmm6
#ifdef __ELF__
.type stream_salsa20_xmm6, @function
.type _stream_salsa20_xmm6, @function
#endif
stream_salsa20_xmm6:
_stream_salsa20_xmm6:
mov %rsp,%r11
and $31,%r11
add $512,%r11
@ -39,14 +43,18 @@ jmp ._start
.text
.p2align 5
.globl crypto_stream_salsa20_xor_ic
.globl _crypto_stream_salsa20_xor_ic
#ifdef __ELF__
.type crypto_stream_salsa20_xor_ic, @function
.type _crypto_stream_salsa20_xor_ic, @function
#ifdef ASM_HIDE_SYMBOL
ASM_HIDE_SYMBOL stream_salsa20_xmm6_xor_ic
ASM_HIDE_SYMBOL _stream_salsa20_xmm6_xor_ic
#endif
crypto_stream_salsa20_xor_ic:
_crypto_stream_salsa20_xor_ic:
.globl stream_salsa20_xmm6_xor_ic
.globl _stream_salsa20_xmm6_xor_ic
#ifdef __ELF__
.type stream_salsa20_xmm6_xor_ic, @function
.type _stream_salsa20_xmm6_xor_ic, @function
#endif
stream_salsa20_xmm6_xor_ic:
_stream_salsa20_xmm6_xor_ic:
mov %rsp,%r11
and $31,%r11

View File

@ -0,0 +1,25 @@
#include <stdint.h>
#include "utils.h"
#include "../stream_salsa20.h"
#include "salsa20_xmm6.h"
#ifdef HAVE_AMD64_ASM

/*
 * Prototypes for the two salsa20 routines. They are only declared here;
 * their definitions live outside this translation unit (the accompanying
 * assembly source exports symbols with these names).
 */
int stream_salsa20_xmm6(unsigned char *c, unsigned long long clen,
                        const unsigned char *n, const unsigned char *k);

int stream_salsa20_xmm6_xor_ic(unsigned char *c, const unsigned char *m,
                               unsigned long long mlen, const unsigned char *n,
                               uint64_t ic, const unsigned char *k);

/*
 * Implementation descriptor binding the two routines above to the
 * `stream` and `stream_xor_ic` slots, so that the salsa20 front end can
 * select them through its `implementation` pointer.
 */
struct crypto_stream_salsa20_implementation
    crypto_stream_salsa20_xmm6_implementation = {
        SODIUM_C99(.stream =) stream_salsa20_xmm6,
        SODIUM_C99(.stream_xor_ic =) stream_salsa20_xmm6_xor_ic
    };

#endif /* HAVE_AMD64_ASM */

View File

@ -0,0 +1,8 @@
#ifndef salsa20_xmm6_H
#define salsa20_xmm6_H

#include <stdint.h>

#include "../stream_salsa20.h"
#include "crypto_stream_salsa20.h"

/*
 * Implementation descriptor for the xmm6 salsa20 variant.
 * Only declared here; the object itself is defined in a separate
 * translation unit.
 */
extern struct crypto_stream_salsa20_implementation
    crypto_stream_salsa20_xmm6_implementation;

#endif /* salsa20_xmm6_H */