Revamp the salsa20 implementations and structure

- Factorize core_salsa{20,2012,208}
- Add support for multiple salsa20 implementations
- Replace the assembly SSE2 implementation with its equivalent using intrinsics
Frank Denis 2017-02-26 16:49:15 +01:00
parent d203d87d2f
commit 9294e2e699
27 changed files with 1465 additions and 1553 deletions

View File

@@ -102,6 +102,7 @@ AC_ARG_ENABLE(minimal,
[
AS_IF([test "x$enableval" = "xyes"], [
enable_minimal="yes"
AC_DEFINE([MINIMAL], [1], [Define for a minimal build, without deprecated functions and functions that high-level APIs depend on])
], [
enable_minimal="no"
])

View File

@@ -16,8 +16,7 @@ libsodium_la_SOURCES = \
crypto_core/curve25519/ref10/curve25519_ref10.c \
crypto_core/hsalsa20/ref2/core_hsalsa20_ref2.c \
crypto_core/hsalsa20/core_hsalsa20.c \
crypto_core/salsa20/ref/core_salsa20_ref.c \
crypto_core/salsa20/core_salsa20.c \
crypto_core/salsa/ref/core_salsa_ref.c \
crypto_generichash/crypto_generichash.c \
crypto_generichash/blake2b/generichash_blake2.c \
crypto_generichash/blake2b/ref/blake2.h \
@@ -81,6 +80,7 @@ libsodium_la_SOURCES = \
crypto_stream/chacha20/ref/stream_chacha20_ref.c \
crypto_stream/crypto_stream.c \
crypto_stream/salsa20/stream_salsa20.c \
crypto_stream/salsa20/ref/stream_salsa20_ref.c \
crypto_stream/xsalsa20/stream_xsalsa20.c \
crypto_verify/sodium/verify.c \
include/sodium/private/common.h \
@@ -141,24 +141,11 @@ libsodium_la_SOURCES += \
crypto_scalarmult/curve25519/sandy2x/sandy2x.S
endif
if HAVE_AMD64_ASM
libsodium_la_SOURCES += \
crypto_stream/salsa20/amd64_xmm6/stream_salsa20_amd64_xmm6.S
else
libsodium_la_SOURCES += \
crypto_stream/salsa20/ref/stream_salsa20_ref.c \
crypto_stream/salsa20/ref/xor_salsa20_ref.c
endif
if !MINIMAL
libsodium_la_SOURCES += \
crypto_aead/xchacha20poly1305/sodium/aead_xchacha20poly1305.c \
crypto_box/curve25519xchacha20poly1305/box_curve25519xchacha20poly1305.c \
crypto_core/hchacha20/core_hchacha20.c \
crypto_core/salsa2012/core_salsa2012.c \
crypto_core/salsa2012/ref/core_salsa2012_ref.c \
crypto_core/salsa208/core_salsa208.c \
crypto_core/salsa208/ref/core_salsa208_ref.c \
crypto_secretbox/xchacha20poly1305/secretbox_xchacha20poly1305.c \
crypto_shorthash/siphash24/shorthash_siphashx24.c \
crypto_shorthash/siphash24/ref/shorthash_siphashx24_ref.c \
@@ -174,10 +161,8 @@ libsodium_la_SOURCES += \
crypto_stream/aes128ctr/nacl/xor_afternm_aes128ctr.c \
crypto_stream/aes128ctr/stream_aes128ctr.c \
crypto_stream/salsa2012/ref/stream_salsa2012_ref.c \
crypto_stream/salsa2012/ref/xor_salsa2012.c \
crypto_stream/salsa2012/stream_salsa2012.c \
crypto_stream/salsa208/ref/stream_salsa208_ref.c \
crypto_stream/salsa208/ref/xor_salsa208.c \
crypto_stream/salsa208/stream_salsa208.c \
crypto_stream/xchacha20/stream_xchacha20.c
endif
@@ -219,7 +204,10 @@ libsse2_la_CPPFLAGS = $(libsodium_la_CPPFLAGS) \
libsse2_la_SOURCES = \
crypto_pwhash/scryptsalsa208sha256/sse/pwhash_scryptsalsa208sha256_sse.c \
crypto_onetimeauth/poly1305/sse2/poly1305_sse2.c \
crypto_onetimeauth/poly1305/sse2/poly1305_sse2.h
crypto_onetimeauth/poly1305/sse2/poly1305_sse2.h \
crypto_stream/salsa20/xmm6int/stream_salsa20_xmm6int.c \
crypto_stream/salsa20/xmm6int/u1.h \
crypto_stream/salsa20/xmm6int/u4.h
libssse3_la_LDFLAGS = $(libsodium_la_LDFLAGS)
libssse3_la_CPPFLAGS = $(libsodium_la_CPPFLAGS) \

View File

@@ -0,0 +1,195 @@
#include <stdint.h>
#include <stdlib.h>
#include "crypto_core_salsa20.h"
#include "crypto_core_salsa2012.h"
#include "crypto_core_salsa208.h"
#include "private/common.h"
static void
crypto_core_salsa(unsigned char *out, const unsigned char *in,
const unsigned char *k, const unsigned char *c,
const int rounds)
{
uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14,
x15;
uint32_t j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14,
j15;
int i;
x0 = 0x61707865;
x5 = 0x3320646e;
x10 = 0x79622d32;
x15 = 0x6b206574;
if (c != NULL) {
j0 = x0 = LOAD32_LE(c + 0);
j5 = x5 = LOAD32_LE(c + 4);
j10 = x10 = LOAD32_LE(c + 8);
j15 = x15 = LOAD32_LE(c + 12);
}
j1 = x1 = LOAD32_LE(k + 0);
j2 = x2 = LOAD32_LE(k + 4);
j3 = x3 = LOAD32_LE(k + 8);
j4 = x4 = LOAD32_LE(k + 12);
j11 = x11 = LOAD32_LE(k + 16);
j12 = x12 = LOAD32_LE(k + 20);
j13 = x13 = LOAD32_LE(k + 24);
j14 = x14 = LOAD32_LE(k + 28);
j6 = x6 = LOAD32_LE(in + 0);
j7 = x7 = LOAD32_LE(in + 4);
j8 = x8 = LOAD32_LE(in + 8);
j9 = x9 = LOAD32_LE(in + 12);
for (i = 0; i < rounds; i += 2) {
x4 ^= ROTL32(x0 + x12, 7);
x8 ^= ROTL32(x4 + x0, 9);
x12 ^= ROTL32(x8 + x4, 13);
x0 ^= ROTL32(x12 + x8, 18);
x9 ^= ROTL32(x5 + x1, 7);
x13 ^= ROTL32(x9 + x5, 9);
x1 ^= ROTL32(x13 + x9, 13);
x5 ^= ROTL32(x1 + x13, 18);
x14 ^= ROTL32(x10 + x6, 7);
x2 ^= ROTL32(x14 + x10, 9);
x6 ^= ROTL32(x2 + x14, 13);
x10 ^= ROTL32(x6 + x2, 18);
x3 ^= ROTL32(x15 + x11, 7);
x7 ^= ROTL32(x3 + x15, 9);
x11 ^= ROTL32(x7 + x3, 13);
x15 ^= ROTL32(x11 + x7, 18);
x1 ^= ROTL32(x0 + x3, 7);
x2 ^= ROTL32(x1 + x0, 9);
x3 ^= ROTL32(x2 + x1, 13);
x0 ^= ROTL32(x3 + x2, 18);
x6 ^= ROTL32(x5 + x4, 7);
x7 ^= ROTL32(x6 + x5, 9);
x4 ^= ROTL32(x7 + x6, 13);
x5 ^= ROTL32(x4 + x7, 18);
x11 ^= ROTL32(x10 + x9, 7);
x8 ^= ROTL32(x11 + x10, 9);
x9 ^= ROTL32(x8 + x11, 13);
x10 ^= ROTL32(x9 + x8, 18);
x12 ^= ROTL32(x15 + x14, 7);
x13 ^= ROTL32(x12 + x15, 9);
x14 ^= ROTL32(x13 + x12, 13);
x15 ^= ROTL32(x14 + x13, 18);
}
STORE32_LE(out + 0, x0 + j0);
STORE32_LE(out + 4, x1 + j1);
STORE32_LE(out + 8, x2 + j2);
STORE32_LE(out + 12, x3 + j3);
STORE32_LE(out + 16, x4 + j4);
STORE32_LE(out + 20, x5 + j5);
STORE32_LE(out + 24, x6 + j6);
STORE32_LE(out + 28, x7 + j7);
STORE32_LE(out + 32, x8 + j8);
STORE32_LE(out + 36, x9 + j9);
STORE32_LE(out + 40, x10 + j10);
STORE32_LE(out + 44, x11 + j11);
STORE32_LE(out + 48, x12 + j12);
STORE32_LE(out + 52, x13 + j13);
STORE32_LE(out + 56, x14 + j14);
STORE32_LE(out + 60, x15 + j15);
}
int
crypto_core_salsa20(unsigned char *out, const unsigned char *in,
const unsigned char *k, const unsigned char *c)
{
crypto_core_salsa(out, in, k, c, 20);
return 0;
}
size_t
crypto_core_salsa20_outputbytes(void)
{
return crypto_core_salsa20_OUTPUTBYTES;
}
size_t
crypto_core_salsa20_inputbytes(void)
{
return crypto_core_salsa20_INPUTBYTES;
}
size_t
crypto_core_salsa20_keybytes(void)
{
return crypto_core_salsa20_KEYBYTES;
}
size_t
crypto_core_salsa20_constbytes(void)
{
return crypto_core_salsa20_CONSTBYTES;
}
#ifndef MINIMAL
int
crypto_core_salsa2012(unsigned char *out, const unsigned char *in,
const unsigned char *k, const unsigned char *c)
{
crypto_core_salsa(out, in, k, c, 12);
return 0;
}
size_t
crypto_core_salsa2012_outputbytes(void)
{
return crypto_core_salsa2012_OUTPUTBYTES;
}
size_t
crypto_core_salsa2012_inputbytes(void)
{
return crypto_core_salsa2012_INPUTBYTES;
}
size_t
crypto_core_salsa2012_keybytes(void)
{
return crypto_core_salsa2012_KEYBYTES;
}
size_t
crypto_core_salsa2012_constbytes(void)
{
return crypto_core_salsa2012_CONSTBYTES;
}
int
crypto_core_salsa208(unsigned char *out, const unsigned char *in,
const unsigned char *k, const unsigned char *c)
{
crypto_core_salsa(out, in, k, c, 8);
return 0;
}
size_t
crypto_core_salsa208_outputbytes(void)
{
return crypto_core_salsa208_OUTPUTBYTES;
}
size_t
crypto_core_salsa208_inputbytes(void)
{
return crypto_core_salsa208_INPUTBYTES;
}
size_t
crypto_core_salsa208_keybytes(void)
{
return crypto_core_salsa208_KEYBYTES;
}
size_t
crypto_core_salsa208_constbytes(void)
{
return crypto_core_salsa208_CONSTBYTES;
}
#endif
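
Since the three variants now share a single core, the public wrappers above differ only in the round count they pass. A minimal sketch of driving the 20-round core through the public API, assuming a libsodium build is installed; as the factorized code shows, passing c = NULL selects the standard "expand 32-byte k" constants:

#include <stdio.h>
#include <sodium.h>

int
main(void)
{
    unsigned char k[crypto_core_salsa20_KEYBYTES]    = { 0 }; /* 32-byte key */
    unsigned char in[crypto_core_salsa20_INPUTBYTES] = { 0 }; /* nonce + block counter */
    unsigned char block[crypto_core_salsa20_OUTPUTBYTES];     /* 64-byte keystream block */
    char          hex[2 * sizeof block + 1];

    if (sodium_init() < 0) {
        return 1;
    }
    crypto_core_salsa20(block, in, k, NULL); /* NULL c => default constants */
    sodium_bin2hex(hex, sizeof hex, block, sizeof block);
    printf("%s\n", hex);
    return 0;
}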

View File

@@ -1,21 +0,0 @@
#include "crypto_core_salsa20.h"
size_t
crypto_core_salsa20_outputbytes(void) {
return crypto_core_salsa20_OUTPUTBYTES;
}
size_t
crypto_core_salsa20_inputbytes(void) {
return crypto_core_salsa20_INPUTBYTES;
}
size_t
crypto_core_salsa20_keybytes(void) {
return crypto_core_salsa20_KEYBYTES;
}
size_t
crypto_core_salsa20_constbytes(void) {
return crypto_core_salsa20_CONSTBYTES;
}

View File

@@ -1,122 +0,0 @@
/*
version 20080912
D. J. Bernstein
Public domain.
*/
#include <stdint.h>
#include <stdlib.h>
#include "crypto_core_salsa20.h"
#include "private/common.h"
#define ROUNDS 20
#define U32C(v) (v##U)
int
crypto_core_salsa20(unsigned char *out,
const unsigned char *in,
const unsigned char *k,
const unsigned char *c)
{
uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8,
x9, x10, x11, x12, x13, x14, x15;
uint32_t j0, j1, j2, j3, j4, j5, j6, j7, j8,
j9, j10, j11, j12, j13, j14, j15;
int i;
if (c == NULL) {
j0 = x0 = U32C(0x61707865);
j5 = x5 = U32C(0x3320646e);
j10 = x10 = U32C(0x79622d32);
j15 = x15 = U32C(0x6b206574);
} else {
j0 = x0 = LOAD32_LE(c + 0);
j5 = x5 = LOAD32_LE(c + 4);
j10 = x10 = LOAD32_LE(c + 8);
j15 = x15 = LOAD32_LE(c + 12);
}
j1 = x1 = LOAD32_LE(k + 0);
j2 = x2 = LOAD32_LE(k + 4);
j3 = x3 = LOAD32_LE(k + 8);
j4 = x4 = LOAD32_LE(k + 12);
j6 = x6 = LOAD32_LE(in + 0);
j7 = x7 = LOAD32_LE(in + 4);
j8 = x8 = LOAD32_LE(in + 8);
j9 = x9 = LOAD32_LE(in + 12);
j11 = x11 = LOAD32_LE(k + 16);
j12 = x12 = LOAD32_LE(k + 20);
j13 = x13 = LOAD32_LE(k + 24);
j14 = x14 = LOAD32_LE(k + 28);
for (i = ROUNDS; i > 0; i -= 2) {
x4 ^= ROTL32(x0 + x12, 7);
x8 ^= ROTL32(x4 + x0, 9);
x12 ^= ROTL32(x8 + x4, 13);
x0 ^= ROTL32(x12 + x8, 18);
x9 ^= ROTL32(x5 + x1, 7);
x13 ^= ROTL32(x9 + x5, 9);
x1 ^= ROTL32(x13 + x9, 13);
x5 ^= ROTL32(x1 + x13, 18);
x14 ^= ROTL32(x10 + x6, 7);
x2 ^= ROTL32(x14 + x10, 9);
x6 ^= ROTL32(x2 + x14, 13);
x10 ^= ROTL32(x6 + x2, 18);
x3 ^= ROTL32(x15 + x11, 7);
x7 ^= ROTL32(x3 + x15, 9);
x11 ^= ROTL32(x7 + x3, 13);
x15 ^= ROTL32(x11 + x7, 18);
x1 ^= ROTL32(x0 + x3, 7);
x2 ^= ROTL32(x1 + x0, 9);
x3 ^= ROTL32(x2 + x1, 13);
x0 ^= ROTL32(x3 + x2, 18);
x6 ^= ROTL32(x5 + x4, 7);
x7 ^= ROTL32(x6 + x5, 9);
x4 ^= ROTL32(x7 + x6, 13);
x5 ^= ROTL32(x4 + x7, 18);
x11 ^= ROTL32(x10 + x9, 7);
x8 ^= ROTL32(x11 + x10, 9);
x9 ^= ROTL32(x8 + x11, 13);
x10 ^= ROTL32(x9 + x8, 18);
x12 ^= ROTL32(x15 + x14, 7);
x13 ^= ROTL32(x12 + x15, 9);
x14 ^= ROTL32(x13 + x12, 13);
x15 ^= ROTL32(x14 + x13, 18);
}
x0 += j0;
x1 += j1;
x2 += j2;
x3 += j3;
x4 += j4;
x5 += j5;
x6 += j6;
x7 += j7;
x8 += j8;
x9 += j9;
x10 += j10;
x11 += j11;
x12 += j12;
x13 += j13;
x14 += j14;
x15 += j15;
STORE32_LE(out + 0, x0);
STORE32_LE(out + 4, x1);
STORE32_LE(out + 8, x2);
STORE32_LE(out + 12, x3);
STORE32_LE(out + 16, x4);
STORE32_LE(out + 20, x5);
STORE32_LE(out + 24, x6);
STORE32_LE(out + 28, x7);
STORE32_LE(out + 32, x8);
STORE32_LE(out + 36, x9);
STORE32_LE(out + 40, x10);
STORE32_LE(out + 44, x11);
STORE32_LE(out + 48, x12);
STORE32_LE(out + 52, x13);
STORE32_LE(out + 56, x14);
STORE32_LE(out + 60, x15);
return 0;
}

View File

@@ -1,21 +0,0 @@
#include "crypto_core_salsa2012.h"
size_t
crypto_core_salsa2012_outputbytes(void) {
return crypto_core_salsa2012_OUTPUTBYTES;
}
size_t
crypto_core_salsa2012_inputbytes(void) {
return crypto_core_salsa2012_INPUTBYTES;
}
size_t
crypto_core_salsa2012_keybytes(void) {
return crypto_core_salsa2012_KEYBYTES;
}
size_t
crypto_core_salsa2012_constbytes(void) {
return crypto_core_salsa2012_CONSTBYTES;
}

View File

@@ -1,122 +0,0 @@
/*
version 20080913
D. J. Bernstein
Public domain.
*/
#include <stdint.h>
#include <stdlib.h>
#include "crypto_core_salsa2012.h"
#include "private/common.h"
#define ROUNDS 12
#define U32C(v) (v##U)
int
crypto_core_salsa2012(unsigned char *out,
const unsigned char *in,
const unsigned char *k,
const unsigned char *c)
{
uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8,
x9, x10, x11, x12, x13, x14, x15;
uint32_t j0, j1, j2, j3, j4, j5, j6, j7, j8,
j9, j10, j11, j12, j13, j14, j15;
int i;
if (c == NULL) {
j0 = x0 = U32C(0x61707865);
j5 = x5 = U32C(0x3320646e);
j10 = x10 = U32C(0x79622d32);
j15 = x15 = U32C(0x6b206574);
} else {
j0 = x0 = LOAD32_LE(c + 0);
j5 = x5 = LOAD32_LE(c + 4);
j10 = x10 = LOAD32_LE(c + 8);
j15 = x15 = LOAD32_LE(c + 12);
}
j1 = x1 = LOAD32_LE(k + 0);
j2 = x2 = LOAD32_LE(k + 4);
j3 = x3 = LOAD32_LE(k + 8);
j4 = x4 = LOAD32_LE(k + 12);
j6 = x6 = LOAD32_LE(in + 0);
j7 = x7 = LOAD32_LE(in + 4);
j8 = x8 = LOAD32_LE(in + 8);
j9 = x9 = LOAD32_LE(in + 12);
j11 = x11 = LOAD32_LE(k + 16);
j12 = x12 = LOAD32_LE(k + 20);
j13 = x13 = LOAD32_LE(k + 24);
j14 = x14 = LOAD32_LE(k + 28);
for (i = ROUNDS; i > 0; i -= 2) {
x4 ^= ROTL32(x0 + x12, 7);
x8 ^= ROTL32(x4 + x0, 9);
x12 ^= ROTL32(x8 + x4, 13);
x0 ^= ROTL32(x12 + x8, 18);
x9 ^= ROTL32(x5 + x1, 7);
x13 ^= ROTL32(x9 + x5, 9);
x1 ^= ROTL32(x13 + x9, 13);
x5 ^= ROTL32(x1 + x13, 18);
x14 ^= ROTL32(x10 + x6, 7);
x2 ^= ROTL32(x14 + x10, 9);
x6 ^= ROTL32(x2 + x14, 13);
x10 ^= ROTL32(x6 + x2, 18);
x3 ^= ROTL32(x15 + x11, 7);
x7 ^= ROTL32(x3 + x15, 9);
x11 ^= ROTL32(x7 + x3, 13);
x15 ^= ROTL32(x11 + x7, 18);
x1 ^= ROTL32(x0 + x3, 7);
x2 ^= ROTL32(x1 + x0, 9);
x3 ^= ROTL32(x2 + x1, 13);
x0 ^= ROTL32(x3 + x2, 18);
x6 ^= ROTL32(x5 + x4, 7);
x7 ^= ROTL32(x6 + x5, 9);
x4 ^= ROTL32(x7 + x6, 13);
x5 ^= ROTL32(x4 + x7, 18);
x11 ^= ROTL32(x10 + x9, 7);
x8 ^= ROTL32(x11 + x10, 9);
x9 ^= ROTL32(x8 + x11, 13);
x10 ^= ROTL32(x9 + x8, 18);
x12 ^= ROTL32(x15 + x14, 7);
x13 ^= ROTL32(x12 + x15, 9);
x14 ^= ROTL32(x13 + x12, 13);
x15 ^= ROTL32(x14 + x13, 18);
}
x0 += j0;
x1 += j1;
x2 += j2;
x3 += j3;
x4 += j4;
x5 += j5;
x6 += j6;
x7 += j7;
x8 += j8;
x9 += j9;
x10 += j10;
x11 += j11;
x12 += j12;
x13 += j13;
x14 += j14;
x15 += j15;
STORE32_LE(out + 0, x0);
STORE32_LE(out + 4, x1);
STORE32_LE(out + 8, x2);
STORE32_LE(out + 12, x3);
STORE32_LE(out + 16, x4);
STORE32_LE(out + 20, x5);
STORE32_LE(out + 24, x6);
STORE32_LE(out + 28, x7);
STORE32_LE(out + 32, x8);
STORE32_LE(out + 36, x9);
STORE32_LE(out + 40, x10);
STORE32_LE(out + 44, x11);
STORE32_LE(out + 48, x12);
STORE32_LE(out + 52, x13);
STORE32_LE(out + 56, x14);
STORE32_LE(out + 60, x15);
return 0;
}

View File

@@ -1,21 +0,0 @@
#include "crypto_core_salsa208.h"
size_t
crypto_core_salsa208_outputbytes(void) {
return crypto_core_salsa208_OUTPUTBYTES;
}
size_t
crypto_core_salsa208_inputbytes(void) {
return crypto_core_salsa208_INPUTBYTES;
}
size_t
crypto_core_salsa208_keybytes(void) {
return crypto_core_salsa208_KEYBYTES;
}
size_t
crypto_core_salsa208_constbytes(void) {
return crypto_core_salsa208_CONSTBYTES;
}

View File

@@ -1,122 +0,0 @@
/*
version 20080913
D. J. Bernstein
Public domain.
*/
#include <stdint.h>
#include <stdlib.h>
#include "crypto_core_salsa208.h"
#include "private/common.h"
#define ROUNDS 8
#define U32C(v) (v##U)
int
crypto_core_salsa208(unsigned char *out,
const unsigned char *in,
const unsigned char *k,
const unsigned char *c)
{
uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8,
x9, x10, x11, x12, x13, x14, x15;
uint32_t j0, j1, j2, j3, j4, j5, j6, j7, j8,
j9, j10, j11, j12, j13, j14, j15;
int i;
if (c == NULL) {
j0 = x0 = U32C(0x61707865);
j5 = x5 = U32C(0x3320646e);
j10 = x10 = U32C(0x79622d32);
j15 = x15 = U32C(0x6b206574);
} else {
j0 = x0 = LOAD32_LE(c + 0);
j5 = x5 = LOAD32_LE(c + 4);
j10 = x10 = LOAD32_LE(c + 8);
j15 = x15 = LOAD32_LE(c + 12);
}
j1 = x1 = LOAD32_LE(k + 0);
j2 = x2 = LOAD32_LE(k + 4);
j3 = x3 = LOAD32_LE(k + 8);
j4 = x4 = LOAD32_LE(k + 12);
j6 = x6 = LOAD32_LE(in + 0);
j7 = x7 = LOAD32_LE(in + 4);
j8 = x8 = LOAD32_LE(in + 8);
j9 = x9 = LOAD32_LE(in + 12);
j11 = x11 = LOAD32_LE(k + 16);
j12 = x12 = LOAD32_LE(k + 20);
j13 = x13 = LOAD32_LE(k + 24);
j14 = x14 = LOAD32_LE(k + 28);
for (i = ROUNDS; i > 0; i -= 2) {
x4 ^= ROTL32(x0 + x12, 7);
x8 ^= ROTL32(x4 + x0, 9);
x12 ^= ROTL32(x8 + x4, 13);
x0 ^= ROTL32(x12 + x8, 18);
x9 ^= ROTL32(x5 + x1, 7);
x13 ^= ROTL32(x9 + x5, 9);
x1 ^= ROTL32(x13 + x9, 13);
x5 ^= ROTL32(x1 + x13, 18);
x14 ^= ROTL32(x10 + x6, 7);
x2 ^= ROTL32(x14 + x10, 9);
x6 ^= ROTL32(x2 + x14, 13);
x10 ^= ROTL32(x6 + x2, 18);
x3 ^= ROTL32(x15 + x11, 7);
x7 ^= ROTL32(x3 + x15, 9);
x11 ^= ROTL32(x7 + x3, 13);
x15 ^= ROTL32(x11 + x7, 18);
x1 ^= ROTL32(x0 + x3, 7);
x2 ^= ROTL32(x1 + x0, 9);
x3 ^= ROTL32(x2 + x1, 13);
x0 ^= ROTL32(x3 + x2, 18);
x6 ^= ROTL32(x5 + x4, 7);
x7 ^= ROTL32(x6 + x5, 9);
x4 ^= ROTL32(x7 + x6, 13);
x5 ^= ROTL32(x4 + x7, 18);
x11 ^= ROTL32(x10 + x9, 7);
x8 ^= ROTL32(x11 + x10, 9);
x9 ^= ROTL32(x8 + x11, 13);
x10 ^= ROTL32(x9 + x8, 18);
x12 ^= ROTL32(x15 + x14, 7);
x13 ^= ROTL32(x12 + x15, 9);
x14 ^= ROTL32(x13 + x12, 13);
x15 ^= ROTL32(x14 + x13, 18);
}
x0 += j0;
x1 += j1;
x2 += j2;
x3 += j3;
x4 += j4;
x5 += j5;
x6 += j6;
x7 += j7;
x8 += j8;
x9 += j9;
x10 += j10;
x11 += j11;
x12 += j12;
x13 += j13;
x14 += j14;
x15 += j15;
STORE32_LE(out + 0, x0);
STORE32_LE(out + 4, x1);
STORE32_LE(out + 8, x2);
STORE32_LE(out + 12, x3);
STORE32_LE(out + 16, x4);
STORE32_LE(out + 20, x5);
STORE32_LE(out + 24, x6);
STORE32_LE(out + 28, x7);
STORE32_LE(out + 32, x8);
STORE32_LE(out + 36, x9);
STORE32_LE(out + 40, x10);
STORE32_LE(out + 44, x11);
STORE32_LE(out + 48, x12);
STORE32_LE(out + 52, x13);
STORE32_LE(out + 56, x14);
STORE32_LE(out + 60, x15);
return 0;
}

View File

@@ -77,8 +77,8 @@ chacha_ietf_ivsetup(chacha_ctx *ctx, const uint8_t *iv, const uint8_t *counter)
}
static void
chacha_encrypt_bytes(chacha_ctx *ctx, const uint8_t *m, uint8_t *c,
unsigned long long bytes)
chacha20_encrypt_bytes(chacha_ctx *ctx, const uint8_t *m, uint8_t *c,
unsigned long long bytes)
{
uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14,
x15;
@@ -235,7 +235,7 @@ stream_ref(unsigned char *c, unsigned long long clen, const unsigned char *n,
chacha_keysetup(&ctx, k);
chacha_ivsetup(&ctx, n, NULL);
memset(c, 0, clen);
chacha_encrypt_bytes(&ctx, c, c, clen);
chacha20_encrypt_bytes(&ctx, c, c, clen);
sodium_memzero(&ctx, sizeof ctx);
return 0;
@@ -254,7 +254,7 @@ stream_ietf_ref(unsigned char *c, unsigned long long clen,
chacha_keysetup(&ctx, k);
chacha_ietf_ivsetup(&ctx, n, NULL);
memset(c, 0, clen);
chacha_encrypt_bytes(&ctx, c, c, clen);
chacha20_encrypt_bytes(&ctx, c, c, clen);
sodium_memzero(&ctx, sizeof ctx);
return 0;
@@ -279,7 +279,7 @@ stream_ref_xor_ic(unsigned char *c, const unsigned char *m,
STORE32_LE(&ic_bytes[4], ic_high);
chacha_keysetup(&ctx, k);
chacha_ivsetup(&ctx, n, ic_bytes);
chacha_encrypt_bytes(&ctx, m, c, mlen);
chacha20_encrypt_bytes(&ctx, m, c, mlen);
sodium_memzero(&ctx, sizeof ctx);
return 0;
@@ -299,7 +299,7 @@ stream_ietf_ref_xor_ic(unsigned char *c, const unsigned char *m,
STORE32_LE(ic_bytes, ic);
chacha_keysetup(&ctx, k);
chacha_ietf_ivsetup(&ctx, n, ic_bytes);
chacha_encrypt_bytes(&ctx, m, c, mlen);
chacha20_encrypt_bytes(&ctx, m, c, mlen);
sodium_memzero(&ctx, sizeof ctx);
return 0;

View File

@@ -129,8 +129,8 @@ chacha_keysetup(chacha_ctx *ctx, const uint8_t *k)
}
static void
chacha_encrypt_bytes(chacha_ctx *ctx, const uint8_t *in, uint8_t *out,
unsigned long long inlen)
chacha20_encrypt_bytes(chacha_ctx *ctx, const uint8_t *in, uint8_t *out,
unsigned long long inlen)
{
CRYPTO_ALIGN(16)
unsigned chacha_const[] = { 0x61707865, 0x3320646E, 0x79622D32,
@@ -258,7 +258,7 @@ stream_vec(unsigned char *c, unsigned long long clen, const unsigned char *n,
chacha_keysetup(&ctx, k);
chacha_ivsetup(&ctx, n, 0ULL);
memset(c, 0, clen);
chacha_encrypt_bytes(&ctx, c, c, clen);
chacha20_encrypt_bytes(&ctx, c, c, clen);
sodium_memzero(&ctx, sizeof ctx);
return 0;
@@ -277,7 +277,7 @@ stream_ietf_vec(unsigned char *c, unsigned long long clen,
chacha_keysetup(&ctx, k);
chacha_ietf_ivsetup(&ctx, n, 0ULL);
memset(c, 0, clen);
chacha_encrypt_bytes(&ctx, c, c, clen);
chacha20_encrypt_bytes(&ctx, c, c, clen);
sodium_memzero(&ctx, sizeof ctx);
return 0;
@@ -295,7 +295,7 @@ stream_vec_xor_ic(unsigned char *c, const unsigned char *m,
}
chacha_keysetup(&ctx, k);
chacha_ivsetup(&ctx, n, ic);
chacha_encrypt_bytes(&ctx, m, c, mlen);
chacha20_encrypt_bytes(&ctx, m, c, mlen);
sodium_memzero(&ctx, sizeof ctx);
return 0;
@@ -313,7 +313,7 @@ stream_ietf_vec_xor_ic(unsigned char *c, const unsigned char *m,
}
chacha_keysetup(&ctx, k);
chacha_ietf_ivsetup(&ctx, n, ic);
chacha_encrypt_bytes(&ctx, m, c, mlen);
chacha20_encrypt_bytes(&ctx, m, c, mlen);
sodium_memzero(&ctx, sizeof ctx);
return 0;

View File

@@ -1,952 +0,0 @@
#ifdef HAVE_AMD64_ASM
.text
.p2align 5
.globl crypto_stream_salsa20
.globl _crypto_stream_salsa20
#ifdef __ELF__
.type crypto_stream_salsa20, @function
.type _crypto_stream_salsa20, @function
#endif
crypto_stream_salsa20:
_crypto_stream_salsa20:
mov %rsp,%r11
and $31,%r11
add $512,%r11
sub %r11,%rsp
movq %r11,416(%rsp)
movq %r12,424(%rsp)
movq %r13,432(%rsp)
movq %r14,440(%rsp)
movq %r15,448(%rsp)
movq %rbx,456(%rsp)
movq %rbp,464(%rsp)
mov %rsi,%r9
mov %rdi,%rdi
mov %rdi,%rsi
mov %rdx,%rdx
mov %rcx,%r10
cmp $0,%r9
jbe ._done
mov $0,%rax
mov %r9,%rcx
rep stosb
sub %r9,%rdi
movq $0,472(%rsp)
jmp ._start
.text
.p2align 5
.globl crypto_stream_salsa20_xor_ic
.globl _crypto_stream_salsa20_xor_ic
#ifdef __ELF__
.type crypto_stream_salsa20_xor_ic, @function
.type _crypto_stream_salsa20_xor_ic, @function
#endif
crypto_stream_salsa20_xor_ic:
_crypto_stream_salsa20_xor_ic:
mov %rsp,%r11
and $31,%r11
add $512,%r11
sub %r11,%rsp
movq %r11,416(%rsp)
movq %r12,424(%rsp)
movq %r13,432(%rsp)
movq %r14,440(%rsp)
movq %r15,448(%rsp)
movq %rbx,456(%rsp)
movq %rbp,464(%rsp)
mov %rdi,%rdi
mov %rsi,%rsi
mov %r9,%r10
movq %r8,472(%rsp)
mov %rdx,%r9
mov %rcx,%rdx
cmp $0,%r9
jbe ._done
._start:
movl 20(%r10),%ecx
movl 0(%r10),%r8d
movl 0(%rdx),%eax
movl 16(%r10),%r11d
movl %ecx,64(%rsp)
movl %r8d,4+64(%rsp)
movl %eax,8+64(%rsp)
movl %r11d,12+64(%rsp)
movl 24(%r10),%r8d
movl 4(%r10),%eax
movl 4(%rdx),%edx
movq 472(%rsp),%rcx
movl %ecx,80(%rsp)
movl %r8d,4+80(%rsp)
movl %eax,8+80(%rsp)
movl %edx,12+80(%rsp)
movl 12(%r10),%edx
shr $32,%rcx
movl 28(%r10),%r8d
movl 8(%r10),%eax
movl %edx,96(%rsp)
movl %ecx,4+96(%rsp)
movl %r8d,8+96(%rsp)
movl %eax,12+96(%rsp)
mov $1634760805,%rdx
mov $857760878,%rcx
mov $2036477234,%r8
mov $1797285236,%rax
movl %edx,112(%rsp)
movl %ecx,4+112(%rsp)
movl %r8d,8+112(%rsp)
movl %eax,12+112(%rsp)
cmp $256,%r9
jb ._bytesbetween1and255
movdqa 112(%rsp),%xmm0
pshufd $0x55,%xmm0,%xmm1
pshufd $0xaa,%xmm0,%xmm2
pshufd $0xff,%xmm0,%xmm3
pshufd $0x00,%xmm0,%xmm0
movdqa %xmm1,128(%rsp)
movdqa %xmm2,144(%rsp)
movdqa %xmm3,160(%rsp)
movdqa %xmm0,176(%rsp)
movdqa 64(%rsp),%xmm0
pshufd $0xaa,%xmm0,%xmm1
pshufd $0xff,%xmm0,%xmm2
pshufd $0x00,%xmm0,%xmm3
pshufd $0x55,%xmm0,%xmm0
movdqa %xmm1,192(%rsp)
movdqa %xmm2,208(%rsp)
movdqa %xmm3,224(%rsp)
movdqa %xmm0,240(%rsp)
movdqa 80(%rsp),%xmm0
pshufd $0xff,%xmm0,%xmm1
pshufd $0x55,%xmm0,%xmm2
pshufd $0xaa,%xmm0,%xmm0
movdqa %xmm1,256(%rsp)
movdqa %xmm2,272(%rsp)
movdqa %xmm0,288(%rsp)
movdqa 96(%rsp),%xmm0
pshufd $0x00,%xmm0,%xmm1
pshufd $0xaa,%xmm0,%xmm2
pshufd $0xff,%xmm0,%xmm0
movdqa %xmm1,304(%rsp)
movdqa %xmm2,320(%rsp)
movdqa %xmm0,336(%rsp)
.p2align 4
._bytesatleast256:
movq 472(%rsp),%rdx
mov %rdx,%rcx
shr $32,%rcx
movl %edx,352(%rsp)
movl %ecx,368(%rsp)
add $1,%rdx
mov %rdx,%rcx
shr $32,%rcx
movl %edx,4+352(%rsp)
movl %ecx,4+368(%rsp)
add $1,%rdx
mov %rdx,%rcx
shr $32,%rcx
movl %edx,8+352(%rsp)
movl %ecx,8+368(%rsp)
add $1,%rdx
mov %rdx,%rcx
shr $32,%rcx
movl %edx,12+352(%rsp)
movl %ecx,12+368(%rsp)
add $1,%rdx
mov %rdx,%rcx
shr $32,%rcx
movl %edx,80(%rsp)
movl %ecx,4+96(%rsp)
movq %rdx,472(%rsp)
movq %r9,480(%rsp)
mov $20,%rdx
movdqa 128(%rsp),%xmm0
movdqa 144(%rsp),%xmm1
movdqa 160(%rsp),%xmm2
movdqa 320(%rsp),%xmm3
movdqa 336(%rsp),%xmm4
movdqa 192(%rsp),%xmm5
movdqa 208(%rsp),%xmm6
movdqa 240(%rsp),%xmm7
movdqa 256(%rsp),%xmm8
movdqa 272(%rsp),%xmm9
movdqa 288(%rsp),%xmm10
movdqa 368(%rsp),%xmm11
movdqa 176(%rsp),%xmm12
movdqa 224(%rsp),%xmm13
movdqa 304(%rsp),%xmm14
movdqa 352(%rsp),%xmm15
.p2align 4
._mainloop1:
movdqa %xmm1,384(%rsp)
movdqa %xmm2,400(%rsp)
movdqa %xmm13,%xmm1
paddd %xmm12,%xmm1
movdqa %xmm1,%xmm2
pslld $7,%xmm1
pxor %xmm1,%xmm14
psrld $25,%xmm2
pxor %xmm2,%xmm14
movdqa %xmm7,%xmm1
paddd %xmm0,%xmm1
movdqa %xmm1,%xmm2
pslld $7,%xmm1
pxor %xmm1,%xmm11
psrld $25,%xmm2
pxor %xmm2,%xmm11
movdqa %xmm12,%xmm1
paddd %xmm14,%xmm1
movdqa %xmm1,%xmm2
pslld $9,%xmm1
pxor %xmm1,%xmm15
psrld $23,%xmm2
pxor %xmm2,%xmm15
movdqa %xmm0,%xmm1
paddd %xmm11,%xmm1
movdqa %xmm1,%xmm2
pslld $9,%xmm1
pxor %xmm1,%xmm9
psrld $23,%xmm2
pxor %xmm2,%xmm9
movdqa %xmm14,%xmm1
paddd %xmm15,%xmm1
movdqa %xmm1,%xmm2
pslld $13,%xmm1
pxor %xmm1,%xmm13
psrld $19,%xmm2
pxor %xmm2,%xmm13
movdqa %xmm11,%xmm1
paddd %xmm9,%xmm1
movdqa %xmm1,%xmm2
pslld $13,%xmm1
pxor %xmm1,%xmm7
psrld $19,%xmm2
pxor %xmm2,%xmm7
movdqa %xmm15,%xmm1
paddd %xmm13,%xmm1
movdqa %xmm1,%xmm2
pslld $18,%xmm1
pxor %xmm1,%xmm12
psrld $14,%xmm2
pxor %xmm2,%xmm12
movdqa 384(%rsp),%xmm1
movdqa %xmm12,384(%rsp)
movdqa %xmm9,%xmm2
paddd %xmm7,%xmm2
movdqa %xmm2,%xmm12
pslld $18,%xmm2
pxor %xmm2,%xmm0
psrld $14,%xmm12
pxor %xmm12,%xmm0
movdqa %xmm5,%xmm2
paddd %xmm1,%xmm2
movdqa %xmm2,%xmm12
pslld $7,%xmm2
pxor %xmm2,%xmm3
psrld $25,%xmm12
pxor %xmm12,%xmm3
movdqa 400(%rsp),%xmm2
movdqa %xmm0,400(%rsp)
movdqa %xmm6,%xmm0
paddd %xmm2,%xmm0
movdqa %xmm0,%xmm12
pslld $7,%xmm0
pxor %xmm0,%xmm4
psrld $25,%xmm12
pxor %xmm12,%xmm4
movdqa %xmm1,%xmm0
paddd %xmm3,%xmm0
movdqa %xmm0,%xmm12
pslld $9,%xmm0
pxor %xmm0,%xmm10
psrld $23,%xmm12
pxor %xmm12,%xmm10
movdqa %xmm2,%xmm0
paddd %xmm4,%xmm0
movdqa %xmm0,%xmm12
pslld $9,%xmm0
pxor %xmm0,%xmm8
psrld $23,%xmm12
pxor %xmm12,%xmm8
movdqa %xmm3,%xmm0
paddd %xmm10,%xmm0
movdqa %xmm0,%xmm12
pslld $13,%xmm0
pxor %xmm0,%xmm5
psrld $19,%xmm12
pxor %xmm12,%xmm5
movdqa %xmm4,%xmm0
paddd %xmm8,%xmm0
movdqa %xmm0,%xmm12
pslld $13,%xmm0
pxor %xmm0,%xmm6
psrld $19,%xmm12
pxor %xmm12,%xmm6
movdqa %xmm10,%xmm0
paddd %xmm5,%xmm0
movdqa %xmm0,%xmm12
pslld $18,%xmm0
pxor %xmm0,%xmm1
psrld $14,%xmm12
pxor %xmm12,%xmm1
movdqa 384(%rsp),%xmm0
movdqa %xmm1,384(%rsp)
movdqa %xmm4,%xmm1
paddd %xmm0,%xmm1
movdqa %xmm1,%xmm12
pslld $7,%xmm1
pxor %xmm1,%xmm7
psrld $25,%xmm12
pxor %xmm12,%xmm7
movdqa %xmm8,%xmm1
paddd %xmm6,%xmm1
movdqa %xmm1,%xmm12
pslld $18,%xmm1
pxor %xmm1,%xmm2
psrld $14,%xmm12
pxor %xmm12,%xmm2
movdqa 400(%rsp),%xmm12
movdqa %xmm2,400(%rsp)
movdqa %xmm14,%xmm1
paddd %xmm12,%xmm1
movdqa %xmm1,%xmm2
pslld $7,%xmm1
pxor %xmm1,%xmm5
psrld $25,%xmm2
pxor %xmm2,%xmm5
movdqa %xmm0,%xmm1
paddd %xmm7,%xmm1
movdqa %xmm1,%xmm2
pslld $9,%xmm1
pxor %xmm1,%xmm10
psrld $23,%xmm2
pxor %xmm2,%xmm10
movdqa %xmm12,%xmm1
paddd %xmm5,%xmm1
movdqa %xmm1,%xmm2
pslld $9,%xmm1
pxor %xmm1,%xmm8
psrld $23,%xmm2
pxor %xmm2,%xmm8
movdqa %xmm7,%xmm1
paddd %xmm10,%xmm1
movdqa %xmm1,%xmm2
pslld $13,%xmm1
pxor %xmm1,%xmm4
psrld $19,%xmm2
pxor %xmm2,%xmm4
movdqa %xmm5,%xmm1
paddd %xmm8,%xmm1
movdqa %xmm1,%xmm2
pslld $13,%xmm1
pxor %xmm1,%xmm14
psrld $19,%xmm2
pxor %xmm2,%xmm14
movdqa %xmm10,%xmm1
paddd %xmm4,%xmm1
movdqa %xmm1,%xmm2
pslld $18,%xmm1
pxor %xmm1,%xmm0
psrld $14,%xmm2
pxor %xmm2,%xmm0
movdqa 384(%rsp),%xmm1
movdqa %xmm0,384(%rsp)
movdqa %xmm8,%xmm0
paddd %xmm14,%xmm0
movdqa %xmm0,%xmm2
pslld $18,%xmm0
pxor %xmm0,%xmm12
psrld $14,%xmm2
pxor %xmm2,%xmm12
movdqa %xmm11,%xmm0
paddd %xmm1,%xmm0
movdqa %xmm0,%xmm2
pslld $7,%xmm0
pxor %xmm0,%xmm6
psrld $25,%xmm2
pxor %xmm2,%xmm6
movdqa 400(%rsp),%xmm2
movdqa %xmm12,400(%rsp)
movdqa %xmm3,%xmm0
paddd %xmm2,%xmm0
movdqa %xmm0,%xmm12
pslld $7,%xmm0
pxor %xmm0,%xmm13
psrld $25,%xmm12
pxor %xmm12,%xmm13
movdqa %xmm1,%xmm0
paddd %xmm6,%xmm0
movdqa %xmm0,%xmm12
pslld $9,%xmm0
pxor %xmm0,%xmm15
psrld $23,%xmm12
pxor %xmm12,%xmm15
movdqa %xmm2,%xmm0
paddd %xmm13,%xmm0
movdqa %xmm0,%xmm12
pslld $9,%xmm0
pxor %xmm0,%xmm9
psrld $23,%xmm12
pxor %xmm12,%xmm9
movdqa %xmm6,%xmm0
paddd %xmm15,%xmm0
movdqa %xmm0,%xmm12
pslld $13,%xmm0
pxor %xmm0,%xmm11
psrld $19,%xmm12
pxor %xmm12,%xmm11
movdqa %xmm13,%xmm0
paddd %xmm9,%xmm0
movdqa %xmm0,%xmm12
pslld $13,%xmm0
pxor %xmm0,%xmm3
psrld $19,%xmm12
pxor %xmm12,%xmm3
movdqa %xmm15,%xmm0
paddd %xmm11,%xmm0
movdqa %xmm0,%xmm12
pslld $18,%xmm0
pxor %xmm0,%xmm1
psrld $14,%xmm12
pxor %xmm12,%xmm1
movdqa %xmm9,%xmm0
paddd %xmm3,%xmm0
movdqa %xmm0,%xmm12
pslld $18,%xmm0
pxor %xmm0,%xmm2
psrld $14,%xmm12
pxor %xmm12,%xmm2
movdqa 384(%rsp),%xmm12
movdqa 400(%rsp),%xmm0
sub $2,%rdx
ja ._mainloop1
paddd 176(%rsp),%xmm12
paddd 240(%rsp),%xmm7
paddd 288(%rsp),%xmm10
paddd 336(%rsp),%xmm4
movd %xmm12,%rdx
movd %xmm7,%rcx
movd %xmm10,%r8
movd %xmm4,%r9
pshufd $0x39,%xmm12,%xmm12
pshufd $0x39,%xmm7,%xmm7
pshufd $0x39,%xmm10,%xmm10
pshufd $0x39,%xmm4,%xmm4
xorl 0(%rsi),%edx
xorl 4(%rsi),%ecx
xorl 8(%rsi),%r8d
xorl 12(%rsi),%r9d
movl %edx,0(%rdi)
movl %ecx,4(%rdi)
movl %r8d,8(%rdi)
movl %r9d,12(%rdi)
movd %xmm12,%rdx
movd %xmm7,%rcx
movd %xmm10,%r8
movd %xmm4,%r9
pshufd $0x39,%xmm12,%xmm12
pshufd $0x39,%xmm7,%xmm7
pshufd $0x39,%xmm10,%xmm10
pshufd $0x39,%xmm4,%xmm4
xorl 64(%rsi),%edx
xorl 68(%rsi),%ecx
xorl 72(%rsi),%r8d
xorl 76(%rsi),%r9d
movl %edx,64(%rdi)
movl %ecx,68(%rdi)
movl %r8d,72(%rdi)
movl %r9d,76(%rdi)
movd %xmm12,%rdx
movd %xmm7,%rcx
movd %xmm10,%r8
movd %xmm4,%r9
pshufd $0x39,%xmm12,%xmm12
pshufd $0x39,%xmm7,%xmm7
pshufd $0x39,%xmm10,%xmm10
pshufd $0x39,%xmm4,%xmm4
xorl 128(%rsi),%edx
xorl 132(%rsi),%ecx
xorl 136(%rsi),%r8d
xorl 140(%rsi),%r9d
movl %edx,128(%rdi)
movl %ecx,132(%rdi)
movl %r8d,136(%rdi)
movl %r9d,140(%rdi)
movd %xmm12,%rdx
movd %xmm7,%rcx
movd %xmm10,%r8
movd %xmm4,%r9
xorl 192(%rsi),%edx
xorl 196(%rsi),%ecx
xorl 200(%rsi),%r8d
xorl 204(%rsi),%r9d
movl %edx,192(%rdi)
movl %ecx,196(%rdi)
movl %r8d,200(%rdi)
movl %r9d,204(%rdi)
paddd 304(%rsp),%xmm14
paddd 128(%rsp),%xmm0
paddd 192(%rsp),%xmm5
paddd 256(%rsp),%xmm8
movd %xmm14,%rdx
movd %xmm0,%rcx
movd %xmm5,%r8
movd %xmm8,%r9
pshufd $0x39,%xmm14,%xmm14
pshufd $0x39,%xmm0,%xmm0
pshufd $0x39,%xmm5,%xmm5
pshufd $0x39,%xmm8,%xmm8
xorl 16(%rsi),%edx
xorl 20(%rsi),%ecx
xorl 24(%rsi),%r8d
xorl 28(%rsi),%r9d
movl %edx,16(%rdi)
movl %ecx,20(%rdi)
movl %r8d,24(%rdi)
movl %r9d,28(%rdi)
movd %xmm14,%rdx
movd %xmm0,%rcx
movd %xmm5,%r8
movd %xmm8,%r9
pshufd $0x39,%xmm14,%xmm14
pshufd $0x39,%xmm0,%xmm0
pshufd $0x39,%xmm5,%xmm5
pshufd $0x39,%xmm8,%xmm8
xorl 80(%rsi),%edx
xorl 84(%rsi),%ecx
xorl 88(%rsi),%r8d
xorl 92(%rsi),%r9d
movl %edx,80(%rdi)
movl %ecx,84(%rdi)
movl %r8d,88(%rdi)
movl %r9d,92(%rdi)
movd %xmm14,%rdx
movd %xmm0,%rcx
movd %xmm5,%r8
movd %xmm8,%r9
pshufd $0x39,%xmm14,%xmm14
pshufd $0x39,%xmm0,%xmm0
pshufd $0x39,%xmm5,%xmm5
pshufd $0x39,%xmm8,%xmm8
xorl 144(%rsi),%edx
xorl 148(%rsi),%ecx
xorl 152(%rsi),%r8d
xorl 156(%rsi),%r9d
movl %edx,144(%rdi)
movl %ecx,148(%rdi)
movl %r8d,152(%rdi)
movl %r9d,156(%rdi)
movd %xmm14,%rdx
movd %xmm0,%rcx
movd %xmm5,%r8
movd %xmm8,%r9
xorl 208(%rsi),%edx
xorl 212(%rsi),%ecx
xorl 216(%rsi),%r8d
xorl 220(%rsi),%r9d
movl %edx,208(%rdi)
movl %ecx,212(%rdi)
movl %r8d,216(%rdi)
movl %r9d,220(%rdi)
paddd 352(%rsp),%xmm15
paddd 368(%rsp),%xmm11
paddd 144(%rsp),%xmm1
paddd 208(%rsp),%xmm6
movd %xmm15,%rdx
movd %xmm11,%rcx
movd %xmm1,%r8
movd %xmm6,%r9
pshufd $0x39,%xmm15,%xmm15
pshufd $0x39,%xmm11,%xmm11
pshufd $0x39,%xmm1,%xmm1
pshufd $0x39,%xmm6,%xmm6
xorl 32(%rsi),%edx
xorl 36(%rsi),%ecx
xorl 40(%rsi),%r8d
xorl 44(%rsi),%r9d
movl %edx,32(%rdi)
movl %ecx,36(%rdi)
movl %r8d,40(%rdi)
movl %r9d,44(%rdi)
movd %xmm15,%rdx
movd %xmm11,%rcx
movd %xmm1,%r8
movd %xmm6,%r9
pshufd $0x39,%xmm15,%xmm15
pshufd $0x39,%xmm11,%xmm11
pshufd $0x39,%xmm1,%xmm1
pshufd $0x39,%xmm6,%xmm6
xorl 96(%rsi),%edx
xorl 100(%rsi),%ecx
xorl 104(%rsi),%r8d
xorl 108(%rsi),%r9d
movl %edx,96(%rdi)
movl %ecx,100(%rdi)
movl %r8d,104(%rdi)
movl %r9d,108(%rdi)
movd %xmm15,%rdx
movd %xmm11,%rcx
movd %xmm1,%r8
movd %xmm6,%r9
pshufd $0x39,%xmm15,%xmm15
pshufd $0x39,%xmm11,%xmm11
pshufd $0x39,%xmm1,%xmm1
pshufd $0x39,%xmm6,%xmm6
xorl 160(%rsi),%edx
xorl 164(%rsi),%ecx
xorl 168(%rsi),%r8d
xorl 172(%rsi),%r9d
movl %edx,160(%rdi)
movl %ecx,164(%rdi)
movl %r8d,168(%rdi)
movl %r9d,172(%rdi)
movd %xmm15,%rdx
movd %xmm11,%rcx
movd %xmm1,%r8
movd %xmm6,%r9
xorl 224(%rsi),%edx
xorl 228(%rsi),%ecx
xorl 232(%rsi),%r8d
xorl 236(%rsi),%r9d
movl %edx,224(%rdi)
movl %ecx,228(%rdi)
movl %r8d,232(%rdi)
movl %r9d,236(%rdi)
paddd 224(%rsp),%xmm13
paddd 272(%rsp),%xmm9
paddd 320(%rsp),%xmm3
paddd 160(%rsp),%xmm2
movd %xmm13,%rdx
movd %xmm9,%rcx
movd %xmm3,%r8
movd %xmm2,%r9
pshufd $0x39,%xmm13,%xmm13
pshufd $0x39,%xmm9,%xmm9
pshufd $0x39,%xmm3,%xmm3
pshufd $0x39,%xmm2,%xmm2
xorl 48(%rsi),%edx
xorl 52(%rsi),%ecx
xorl 56(%rsi),%r8d
xorl 60(%rsi),%r9d
movl %edx,48(%rdi)
movl %ecx,52(%rdi)
movl %r8d,56(%rdi)
movl %r9d,60(%rdi)
movd %xmm13,%rdx
movd %xmm9,%rcx
movd %xmm3,%r8
movd %xmm2,%r9
pshufd $0x39,%xmm13,%xmm13
pshufd $0x39,%xmm9,%xmm9
pshufd $0x39,%xmm3,%xmm3
pshufd $0x39,%xmm2,%xmm2
xorl 112(%rsi),%edx
xorl 116(%rsi),%ecx
xorl 120(%rsi),%r8d
xorl 124(%rsi),%r9d
movl %edx,112(%rdi)
movl %ecx,116(%rdi)
movl %r8d,120(%rdi)
movl %r9d,124(%rdi)
movd %xmm13,%rdx
movd %xmm9,%rcx
movd %xmm3,%r8
movd %xmm2,%r9
pshufd $0x39,%xmm13,%xmm13
pshufd $0x39,%xmm9,%xmm9
pshufd $0x39,%xmm3,%xmm3
pshufd $0x39,%xmm2,%xmm2
xorl 176(%rsi),%edx
xorl 180(%rsi),%ecx
xorl 184(%rsi),%r8d
xorl 188(%rsi),%r9d
movl %edx,176(%rdi)
movl %ecx,180(%rdi)
movl %r8d,184(%rdi)
movl %r9d,188(%rdi)
movd %xmm13,%rdx
movd %xmm9,%rcx
movd %xmm3,%r8
movd %xmm2,%r9
xorl 240(%rsi),%edx
xorl 244(%rsi),%ecx
xorl 248(%rsi),%r8d
xorl 252(%rsi),%r9d
movl %edx,240(%rdi)
movl %ecx,244(%rdi)
movl %r8d,248(%rdi)
movl %r9d,252(%rdi)
movq 480(%rsp),%r9
sub $256,%r9
add $256,%rsi
add $256,%rdi
cmp $256,%r9
jae ._bytesatleast256
cmp $0,%r9
jbe ._done
._bytesbetween1and255:
cmp $64,%r9
jae ._nocopy
mov %rdi,%rdx
leaq 0(%rsp),%rdi
mov %r9,%rcx
rep movsb
leaq 0(%rsp),%rdi
leaq 0(%rsp),%rsi
._nocopy:
movq %r9,480(%rsp)
movdqa 112(%rsp),%xmm0
movdqa 64(%rsp),%xmm1
movdqa 80(%rsp),%xmm2
movdqa 96(%rsp),%xmm3
movdqa %xmm1,%xmm4
mov $20,%rcx
.p2align 4
._mainloop2:
paddd %xmm0,%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm6
pslld $7,%xmm4
psrld $25,%xmm6
pxor %xmm4,%xmm3
pxor %xmm6,%xmm3
paddd %xmm3,%xmm5
movdqa %xmm3,%xmm4
movdqa %xmm5,%xmm6
pslld $9,%xmm5
psrld $23,%xmm6
pxor %xmm5,%xmm2
pshufd $0x93,%xmm3,%xmm3
pxor %xmm6,%xmm2
paddd %xmm2,%xmm4
movdqa %xmm2,%xmm5
movdqa %xmm4,%xmm6
pslld $13,%xmm4
psrld $19,%xmm6
pxor %xmm4,%xmm1
pshufd $0x4e,%xmm2,%xmm2
pxor %xmm6,%xmm1
paddd %xmm1,%xmm5
movdqa %xmm3,%xmm4
movdqa %xmm5,%xmm6
pslld $18,%xmm5
psrld $14,%xmm6
pxor %xmm5,%xmm0
pshufd $0x39,%xmm1,%xmm1
pxor %xmm6,%xmm0
paddd %xmm0,%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm6
pslld $7,%xmm4
psrld $25,%xmm6
pxor %xmm4,%xmm1
pxor %xmm6,%xmm1
paddd %xmm1,%xmm5
movdqa %xmm1,%xmm4
movdqa %xmm5,%xmm6
pslld $9,%xmm5
psrld $23,%xmm6
pxor %xmm5,%xmm2
pshufd $0x93,%xmm1,%xmm1
pxor %xmm6,%xmm2
paddd %xmm2,%xmm4
movdqa %xmm2,%xmm5
movdqa %xmm4,%xmm6
pslld $13,%xmm4
psrld $19,%xmm6
pxor %xmm4,%xmm3
pshufd $0x4e,%xmm2,%xmm2
pxor %xmm6,%xmm3
paddd %xmm3,%xmm5
movdqa %xmm1,%xmm4
movdqa %xmm5,%xmm6
pslld $18,%xmm5
psrld $14,%xmm6
pxor %xmm5,%xmm0
pshufd $0x39,%xmm3,%xmm3
pxor %xmm6,%xmm0
paddd %xmm0,%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm6
pslld $7,%xmm4
psrld $25,%xmm6
pxor %xmm4,%xmm3
pxor %xmm6,%xmm3
paddd %xmm3,%xmm5
movdqa %xmm3,%xmm4
movdqa %xmm5,%xmm6
pslld $9,%xmm5
psrld $23,%xmm6
pxor %xmm5,%xmm2
pshufd $0x93,%xmm3,%xmm3
pxor %xmm6,%xmm2
paddd %xmm2,%xmm4
movdqa %xmm2,%xmm5
movdqa %xmm4,%xmm6
pslld $13,%xmm4
psrld $19,%xmm6
pxor %xmm4,%xmm1
pshufd $0x4e,%xmm2,%xmm2
pxor %xmm6,%xmm1
paddd %xmm1,%xmm5
movdqa %xmm3,%xmm4
movdqa %xmm5,%xmm6
pslld $18,%xmm5
psrld $14,%xmm6
pxor %xmm5,%xmm0
pshufd $0x39,%xmm1,%xmm1
pxor %xmm6,%xmm0
paddd %xmm0,%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm6
pslld $7,%xmm4
psrld $25,%xmm6
pxor %xmm4,%xmm1
pxor %xmm6,%xmm1
paddd %xmm1,%xmm5
movdqa %xmm1,%xmm4
movdqa %xmm5,%xmm6
pslld $9,%xmm5
psrld $23,%xmm6
pxor %xmm5,%xmm2
pshufd $0x93,%xmm1,%xmm1
pxor %xmm6,%xmm2
paddd %xmm2,%xmm4
movdqa %xmm2,%xmm5
movdqa %xmm4,%xmm6
pslld $13,%xmm4
psrld $19,%xmm6
pxor %xmm4,%xmm3
pshufd $0x4e,%xmm2,%xmm2
pxor %xmm6,%xmm3
sub $4,%rcx
paddd %xmm3,%xmm5
movdqa %xmm1,%xmm4
movdqa %xmm5,%xmm6
pslld $18,%xmm5
pxor %xmm7,%xmm7
psrld $14,%xmm6
pxor %xmm5,%xmm0
pshufd $0x39,%xmm3,%xmm3
pxor %xmm6,%xmm0
ja ._mainloop2
paddd 112(%rsp),%xmm0
paddd 64(%rsp),%xmm1
paddd 80(%rsp),%xmm2
paddd 96(%rsp),%xmm3
movd %xmm0,%rcx
movd %xmm1,%r8
movd %xmm2,%r9
movd %xmm3,%rax
pshufd $0x39,%xmm0,%xmm0
pshufd $0x39,%xmm1,%xmm1
pshufd $0x39,%xmm2,%xmm2
pshufd $0x39,%xmm3,%xmm3
xorl 0(%rsi),%ecx
xorl 48(%rsi),%r8d
xorl 32(%rsi),%r9d
xorl 16(%rsi),%eax
movl %ecx,0(%rdi)
movl %r8d,48(%rdi)
movl %r9d,32(%rdi)
movl %eax,16(%rdi)
movd %xmm0,%rcx
movd %xmm1,%r8
movd %xmm2,%r9
movd %xmm3,%rax
pshufd $0x39,%xmm0,%xmm0
pshufd $0x39,%xmm1,%xmm1
pshufd $0x39,%xmm2,%xmm2
pshufd $0x39,%xmm3,%xmm3
xorl 20(%rsi),%ecx
xorl 4(%rsi),%r8d
xorl 52(%rsi),%r9d
xorl 36(%rsi),%eax
movl %ecx,20(%rdi)
movl %r8d,4(%rdi)
movl %r9d,52(%rdi)
movl %eax,36(%rdi)
movd %xmm0,%rcx
movd %xmm1,%r8
movd %xmm2,%r9
movd %xmm3,%rax
pshufd $0x39,%xmm0,%xmm0
pshufd $0x39,%xmm1,%xmm1
pshufd $0x39,%xmm2,%xmm2
pshufd $0x39,%xmm3,%xmm3
xorl 40(%rsi),%ecx
xorl 24(%rsi),%r8d
xorl 8(%rsi),%r9d
xorl 56(%rsi),%eax
movl %ecx,40(%rdi)
movl %r8d,24(%rdi)
movl %r9d,8(%rdi)
movl %eax,56(%rdi)
movd %xmm0,%rcx
movd %xmm1,%r8
movd %xmm2,%r9
movd %xmm3,%rax
xorl 60(%rsi),%ecx
xorl 44(%rsi),%r8d
xorl 28(%rsi),%r9d
xorl 12(%rsi),%eax
movl %ecx,60(%rdi)
movl %r8d,44(%rdi)
movl %r9d,28(%rdi)
movl %eax,12(%rdi)
movq 480(%rsp),%r9
movq 472(%rsp),%rcx
add $1,%rcx
mov %rcx,%r8
shr $32,%r8
movl %ecx,80(%rsp)
movl %r8d,4+96(%rsp)
movq %rcx,472(%rsp)
cmp $64,%r9
ja ._bytesatleast65
jae ._bytesatleast64
mov %rdi,%rsi
mov %rdx,%rdi
mov %r9,%rcx
rep movsb
._bytesatleast64:
._done:
movq 416(%rsp),%r11
movq 424(%rsp),%r12
movq 432(%rsp),%r13
movq 440(%rsp),%r14
movq 448(%rsp),%r15
movq 456(%rsp),%rbx
movq 464(%rsp),%rbp
add %r11,%rsp
xor %rax,%rax
mov %rsi,%rdx
ret
._bytesatleast65:
sub $64,%r9
add $64,%rdi
add $64,%rsi
jmp ._bytesbetween1and255
#endif
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

View File

@@ -4,15 +4,17 @@ D. J. Bernstein
Public domain.
*/
#include <stdint.h>
#include "crypto_core_salsa20.h"
#include "crypto_stream_salsa20.h"
#include "utils.h"
#ifndef HAVE_AMD64_ASM
int
crypto_stream_salsa20(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k)
static int
stream_ref(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k)
{
unsigned char in[16];
unsigned char block[64];
@@ -34,18 +36,15 @@ crypto_stream_salsa20(unsigned char *c, unsigned long long clen,
}
while (clen >= 64) {
crypto_core_salsa20(c, in, kcopy, NULL);
u = 1;
for (i = 8; i < 16; ++i) {
u += (unsigned int)in[i];
in[i] = u;
u >>= 8;
}
clen -= 64;
c += 64;
}
if (clen) {
crypto_core_salsa20(block, in, kcopy, NULL);
for (i = 0; i < (unsigned int)clen; ++i) {
@@ -58,4 +57,61 @@ crypto_stream_salsa20(unsigned char *c, unsigned long long clen,
return 0;
}
static int
stream_ref_xor_ic(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
uint64_t ic, const unsigned char *k)
{
unsigned char in[16];
unsigned char block[64];
unsigned char kcopy[32];
unsigned int i;
unsigned int u;
if (!mlen) {
return 0;
}
for (i = 0; i < 32; ++i) {
kcopy[i] = k[i];
}
for (i = 0; i < 8; ++i) {
in[i] = n[i];
}
for (i = 8; i < 16; ++i) {
in[i] = (unsigned char)(ic & 0xff);
ic >>= 8;
}
while (mlen >= 64) {
crypto_core_salsa20(block, in, kcopy, NULL);
for (i = 0; i < 64; ++i) {
c[i] = m[i] ^ block[i];
}
u = 1;
for (i = 8; i < 16; ++i) {
u += (unsigned int)in[i];
in[i] = u;
u >>= 8;
}
mlen -= 64;
c += 64;
m += 64;
}
if (mlen) {
crypto_core_salsa20(block, in, kcopy, NULL);
for (i = 0; i < (unsigned int)mlen; ++i) {
c[i] = m[i] ^ block[i];
}
}
sodium_memzero(block, sizeof block);
sodium_memzero(kcopy, sizeof kcopy);
return 0;
}
struct crypto_stream_salsa20_implementation
crypto_stream_salsa20_ref_implementation = {
SODIUM_C99(.stream =) stream_ref,
SODIUM_C99(.stream_xor_ic =) stream_ref_xor_ic,
};
#endif
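
The u = 1 loop in both functions above increments the 64-bit block counter stored little-endian in bytes 8..15 of in, propagating the carry one byte at a time. A self-contained sketch checking it against a plain 64-bit increment across a carry boundary; increment_counter is a hypothetical extraction for illustration, not a library function:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical standalone copy of the carry loop used above. */
static void
increment_counter(unsigned char in[16])
{
    unsigned int i;
    unsigned int u = 1;

    for (i = 8; i < 16; ++i) {
        u += (unsigned int) in[i];
        in[i] = (unsigned char) u;
        u >>= 8;
    }
}

int
main(void)
{
    unsigned char in[16] = { 0 };
    uint64_t      ref = 0xfffffffeULL; /* start just below a 32-bit carry */
    uint64_t      c;
    unsigned int  i;

    for (i = 0; i < 8; i++) { /* store ref little-endian in bytes 8..15 */
        in[8 + i] = (unsigned char) (ref >> (8 * i));
    }
    for (i = 0; i < 4; i++) {
        increment_counter(in);
        ref++;
    }
    c = 0; /* read the counter back, little-endian */
    for (i = 0; i < 8; i++) {
        c |= (uint64_t) in[8 + i] << (8 * i);
    }
    printf("%s\n", c == ref ? "match" : "mismatch"); /* prints "match" */
    return 0;
}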

View File

@@ -0,0 +1,16 @@
#include <stdint.h>
#include "../stream_salsa20.h"
#include "crypto_stream_salsa20.h"
extern struct crypto_stream_salsa20_implementation
crypto_stream_salsa20_ref_implementation;
int crypto_stream_salsa20_ref(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
int crypto_stream_salsa20_ref_xor_ic(unsigned char *c, const unsigned char *m,
unsigned long long mlen,
const unsigned char *n, uint64_t ic,
const unsigned char *k);

View File

@@ -1,69 +0,0 @@
/*
version 20140420
D. J. Bernstein
Public domain.
*/
#include <stdint.h>
#include "crypto_core_salsa20.h"
#include "crypto_stream_salsa20.h"
#include "utils.h"
#ifndef HAVE_AMD64_ASM
int
crypto_stream_salsa20_xor_ic(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
uint64_t ic, const unsigned char *k)
{
unsigned char in[16];
unsigned char block[64];
unsigned char kcopy[32];
unsigned int i;
unsigned int u;
if (!mlen) {
return 0;
}
for (i = 0; i < 32; ++i) {
kcopy[i] = k[i];
}
for (i = 0; i < 8; ++i) {
in[i] = n[i];
}
for (i = 8; i < 16; ++i) {
in[i] = (unsigned char)(ic & 0xff);
ic >>= 8;
}
while (mlen >= 64) {
crypto_core_salsa20(block, in, kcopy, NULL);
for (i = 0; i < 64; ++i) {
c[i] = m[i] ^ block[i];
}
u = 1;
for (i = 8; i < 16; ++i) {
u += (unsigned int)in[i];
in[i] = u;
u >>= 8;
}
mlen -= 64;
c += 64;
m += 64;
}
if (mlen) {
crypto_core_salsa20(block, in, kcopy, NULL);
for (i = 0; i < (unsigned int)mlen; ++i) {
c[i] = m[i] ^ block[i];
}
}
sodium_memzero(block, sizeof block);
sodium_memzero(kcopy, sizeof kcopy);
return 0;
}
#endif

View File

@@ -1,5 +1,19 @@
#include "crypto_stream_salsa20.h"
#include "stream_salsa20.h"
#include "randombytes.h"
#include "runtime.h"
#include "ref/stream_salsa20_ref.h"
#ifdef HAVE_EMMINTRIN_H
# include "xmm6int/stream_salsa20_xmm6int.h"
#endif
#if defined(HAVE_EMMINTRIN_H) && defined(__x86_64__)
static const crypto_stream_salsa20_implementation *implementation =
&crypto_stream_salsa20_xmm6int_implementation;
#else
static const crypto_stream_salsa20_implementation *implementation =
&crypto_stream_salsa20_ref_implementation;
#endif
size_t
crypto_stream_salsa20_keybytes(void)
@@ -13,12 +27,28 @@ crypto_stream_salsa20_noncebytes(void)
return crypto_stream_salsa20_NONCEBYTES;
}
int
crypto_stream_salsa20(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k)
{
return implementation->stream(c, clen, n, k);
}
int
crypto_stream_salsa20_xor_ic(unsigned char *c, const unsigned char *m,
unsigned long long mlen,
const unsigned char *n, uint64_t ic,
const unsigned char *k)
{
return implementation->stream_xor_ic(c, m, mlen, n, ic, k);
}
int
crypto_stream_salsa20_xor(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *k)
{
return crypto_stream_salsa20_xor_ic(c, m, mlen, n, 0U, k);
return implementation->stream_xor_ic(c, m, mlen, n, 0U, k);
}
void
@@ -26,3 +56,20 @@ crypto_stream_salsa20_keygen(unsigned char k[crypto_stream_salsa20_KEYBYTES])
{
randombytes_buf(k, crypto_stream_salsa20_KEYBYTES);
}
int
_crypto_stream_salsa20_pick_best_implementation(void)
{
#if defined(HAVE_EMMINTRIN_H) && defined(__x86_64__)
implementation = &crypto_stream_salsa20_xmm6int_implementation;
#else
implementation = &crypto_stream_salsa20_ref_implementation;
#endif
#ifdef HAVE_EMMINTRIN_H
if (sodium_runtime_has_sse2()) {
implementation = &crypto_stream_salsa20_xmm6int_implementation;
}
#endif
return 0;
}
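
This file is the heart of the multiple-implementations support: a struct of function pointers selected once, with a compile-time default so the table is valid even before _crypto_stream_salsa20_pick_best_implementation() runs. A stripped-down sketch of the same pattern with hypothetical names; has_sse2() stands in for sodium_runtime_has_sse2():

#include <stdio.h>
#include <string.h>

typedef struct impl {
    int (*stream)(unsigned char *c, unsigned long long clen);
} impl;

static int
stream_generic(unsigned char *c, unsigned long long clen)
{
    memset(c, 0, (size_t) clen); /* placeholder for the portable code path */
    return 0;
}

static int
stream_simd(unsigned char *c, unsigned long long clen)
{
    memset(c, 0, (size_t) clen); /* placeholder for the SSE2 code path */
    return 0;
}

static const impl impl_generic = { stream_generic };
static const impl impl_simd    = { stream_simd };

static int
has_sse2(void) /* hypothetical stand-in for sodium_runtime_has_sse2() */
{
    return 0;
}

/* Safe compile-time default, upgraded once at library init. */
static const impl *implementation = &impl_generic;

static int
pick_best(void)
{
    implementation = has_sse2() ? &impl_simd : &impl_generic;
    return 0;
}

int
main(void)
{
    unsigned char buf[64];

    pick_best();
    implementation->stream(buf, sizeof buf); /* callers never branch again */
    printf("ok\n");
    return 0;
}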

View File

@@ -0,0 +1,16 @@
#ifndef stream_salsa20_H
#define stream_salsa20_H
#include <stdint.h>
typedef struct crypto_stream_salsa20_implementation {
int (*stream)(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
int (*stream_xor_ic)(unsigned char *c, const unsigned char *m,
unsigned long long mlen,
const unsigned char *n, uint64_t ic,
const unsigned char *k);
} crypto_stream_salsa20_implementation;
#endif

View File

@@ -0,0 +1,185 @@
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#if defined(HAVE_EMMINTRIN_H) || \
(defined(_MSC_VER) && \
(defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)))
# ifdef __GNUC__
# pragma GCC target("sse2")
# endif
# include <emmintrin.h>
#endif
#include "crypto_stream_salsa20.h"
#include "private/common.h"
#include "utils.h"
#include "../stream_salsa20.h"
#include "stream_salsa20_xmm6int.h"
#define ROUNDS 20
typedef struct salsa_ctx {
uint32_t input[16];
} salsa_ctx;
static const int TR[16] = {
0, 5, 10, 15, 12, 1, 6, 11, 8, 13, 2, 7, 4, 9, 14, 3
};
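/* TR maps a logical Salsa20 state index to its storage position in
 * ctx->input. The state is kept pre-transposed so that each 128-bit
 * row of the array holds one diagonal of the Salsa20 matrix:
 * {0,5,10,15}, {12,1,6,11}, {8,13,2,7} and {4,9,14,3}. u1.h and u4.h
 * can then load diag0..diag3 straight from memory; this is also why
 * the block counter (logical words 8 and 9) lives at x[8] and x[13]. */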
static void
salsa20_wordtobyte_tr(uint8_t output[64], const uint32_t input[16])
{
uint32_t x[16];
int i;
for (i = 0; i < 16; i++) {
x[TR[i]] = input[TR[i]];
}
for (i = 20; i > 0; i -= 2) {
x[TR[4]] ^= ROTL32(x[TR[0]] + x[TR[12]], 7);
x[TR[8]] ^= ROTL32(x[TR[4]] + x[TR[0]], 9);
x[TR[12]] ^= ROTL32(x[TR[8]] + x[TR[4]], 13);
x[TR[0]] ^= ROTL32(x[TR[12]] + x[TR[8]], 18);
x[TR[9]] ^= ROTL32(x[TR[5]] + x[TR[1]], 7);
x[TR[13]] ^= ROTL32(x[TR[9]] + x[TR[5]], 9);
x[TR[1]] ^= ROTL32(x[TR[13]] + x[TR[9]], 13);
x[TR[5]] ^= ROTL32(x[TR[1]] + x[TR[13]], 18);
x[TR[14]] ^= ROTL32(x[TR[10]] + x[TR[6]], 7);
x[TR[2]] ^= ROTL32(x[TR[14]] + x[TR[10]], 9);
x[TR[6]] ^= ROTL32(x[TR[2]] + x[TR[14]], 13);
x[TR[10]] ^= ROTL32(x[TR[6]] + x[TR[2]], 18);
x[TR[3]] ^= ROTL32(x[TR[15]] + x[TR[11]], 7);
x[TR[7]] ^= ROTL32(x[TR[3]] + x[TR[15]], 9);
x[TR[11]] ^= ROTL32(x[TR[7]] + x[TR[3]], 13);
x[TR[15]] ^= ROTL32(x[TR[11]] + x[TR[7]], 18);
x[TR[1]] ^= ROTL32(x[TR[0]] + x[TR[3]], 7);
x[TR[2]] ^= ROTL32(x[TR[1]] + x[TR[0]], 9);
x[TR[3]] ^= ROTL32(x[TR[2]] + x[TR[1]], 13);
x[TR[0]] ^= ROTL32(x[TR[3]] + x[TR[2]], 18);
x[TR[6]] ^= ROTL32(x[TR[5]] + x[TR[4]], 7);
x[TR[7]] ^= ROTL32(x[TR[6]] + x[TR[5]], 9);
x[TR[4]] ^= ROTL32(x[TR[7]] + x[TR[6]], 13);
x[TR[5]] ^= ROTL32(x[TR[4]] + x[TR[7]], 18);
x[TR[11]] ^= ROTL32(x[TR[10]] + x[TR[9]], 7);
x[TR[8]] ^= ROTL32(x[TR[11]] + x[TR[10]], 9);
x[TR[9]] ^= ROTL32(x[TR[8]] + x[TR[11]], 13);
x[TR[10]] ^= ROTL32(x[TR[9]] + x[TR[8]], 18);
x[TR[12]] ^= ROTL32(x[TR[15]] + x[TR[14]], 7);
x[TR[13]] ^= ROTL32(x[TR[12]] + x[TR[15]], 9);
x[TR[14]] ^= ROTL32(x[TR[13]] + x[TR[12]], 13);
x[TR[15]] ^= ROTL32(x[TR[14]] + x[TR[13]], 18);
}
for (i = 0; i < 16; i++) {
x[TR[i]] += input[TR[i]];
}
for (i = 0; i < 16; i++) {
STORE32_LE(output + 4 * i, x[TR[i]]);
}
}
static void
salsa_keysetup(salsa_ctx *ctx, const uint8_t *k)
{
ctx->input[TR[1]] = LOAD32_LE(k + 0);
ctx->input[TR[2]] = LOAD32_LE(k + 4);
ctx->input[TR[3]] = LOAD32_LE(k + 8);
ctx->input[TR[4]] = LOAD32_LE(k + 12);
ctx->input[TR[11]] = LOAD32_LE(k + 16);
ctx->input[TR[12]] = LOAD32_LE(k + 20);
ctx->input[TR[13]] = LOAD32_LE(k + 24);
ctx->input[TR[14]] = LOAD32_LE(k + 28);
ctx->input[TR[0]] = 0x61707865;
ctx->input[TR[5]] = 0x3320646e;
ctx->input[TR[10]] = 0x79622d32;
ctx->input[TR[15]] = 0x6b206574;
}
static void
salsa_ivsetup(salsa_ctx *ctx, const uint8_t *iv, const uint8_t *counter)
{
ctx->input[TR[6]] = LOAD32_LE(iv + 0);
ctx->input[TR[7]] = LOAD32_LE(iv + 4);
ctx->input[TR[8]] = counter == NULL ? 0 : LOAD32_LE(counter + 0);
ctx->input[TR[9]] = counter == NULL ? 0 : LOAD32_LE(counter + 4);
}
static void
salsa20_encrypt_bytes(salsa_ctx *ctx, const uint8_t *m, uint8_t *c,
unsigned long long bytes)
{
uint8_t partialblock[64];
uint32_t * const x = &ctx->input[0];
int i;
if (!bytes) {
return; /* LCOV_EXCL_LINE */
}
if (bytes > 64ULL * (1ULL << 32) - 64ULL) {
abort();
}
#include "u4.h"
#include "u1.h"
if (!bytes) {
return;
}
salsa20_wordtobyte_tr(partialblock, x);
for (i = 0; i < bytes; i++) {
c[i] = m[i] ^ partialblock[i];
}
}
static int
stream_ref(unsigned char *c, unsigned long long clen, const unsigned char *n,
const unsigned char *k)
{
struct salsa_ctx ctx;
if (!clen) {
return 0;
}
COMPILER_ASSERT(crypto_stream_salsa20_KEYBYTES == 256 / 8);
salsa_keysetup(&ctx, k);
salsa_ivsetup(&ctx, n, NULL);
memset(c, 0, clen);
salsa20_encrypt_bytes(&ctx, c, c, clen);
sodium_memzero(&ctx, sizeof ctx);
return 0;
}
static int
stream_ref_xor_ic(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n, uint64_t ic,
const unsigned char *k)
{
struct salsa_ctx ctx;
uint8_t ic_bytes[8];
uint32_t ic_high;
uint32_t ic_low;
if (!mlen) {
return 0;
}
ic_high = (uint32_t) (ic >> 32);
ic_low = (uint32_t) (ic);
STORE32_LE(&ic_bytes[0], ic_low);
STORE32_LE(&ic_bytes[4], ic_high);
salsa_keysetup(&ctx, k);
salsa_ivsetup(&ctx, n, ic_bytes);
salsa20_encrypt_bytes(&ctx, m, c, mlen);
sodium_memzero(&ctx, sizeof ctx);
return 0;
}
struct crypto_stream_salsa20_implementation
crypto_stream_salsa20_xmm6int_implementation = {
SODIUM_C99(.stream =) stream_ref,
SODIUM_C99(.stream_xor_ic =) stream_ref_xor_ic
};
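
Note that u4.h and u1.h (shown below) are not ordinary headers: they are loop bodies textually spliced into salsa20_encrypt_bytes() by the preprocessor, sharing its locals (x, bytes, m, c, i). The four-block path runs first and the one-block path handles what remains. A toy illustration of the technique, with a hypothetical file step4.h:

/* step4.h: a spliced loop body, not a standalone header; it uses the
 * including function's locals n and total. */
while (n >= 4) {
    total += 4; /* stand-in for processing four items at once */
    n -= 4;
}

/* main.c */
#include <stdio.h>

int
main(void)
{
    int n = 11, total = 0;

#include "step4.h" /* four-at-a-time path, like u4.h */
    total += n;    /* scalar tail, like u1.h */
    printf("%d\n", total); /* prints 11 */
    return 0;
}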

View File

@@ -0,0 +1,16 @@
#include <stdint.h>
#include "../stream_salsa20.h"
#include "crypto_stream_salsa20.h"
extern struct crypto_stream_salsa20_implementation
crypto_stream_salsa20_xmm6int_implementation;
int crypto_stream_salsa20_xmm6int(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k);
int crypto_stream_salsa20_xmm6int_xor_ic(unsigned char *c, const unsigned char *m,
unsigned long long mlen,
const unsigned char *n, uint64_t ic,
const unsigned char *k);

View File

@@ -0,0 +1,220 @@
while (bytes >= 64) {
__m128i diag0 = _mm_loadu_si128((__m128i *) (x + 0));
__m128i diag1 = _mm_loadu_si128((__m128i *) (x + 4));
__m128i diag2 = _mm_loadu_si128((__m128i *) (x + 8));
__m128i diag3 = _mm_loadu_si128((__m128i *) (x + 12));
__m128i a0;
__m128i a1;
__m128i a2;
__m128i a3;
__m128i a4;
__m128i a5;
__m128i a6;
__m128i a7;
__m128i b0;
__m128i b1;
__m128i b2;
__m128i b3;
__m128i b4;
__m128i b5;
__m128i b6;
__m128i b7;
uint32_t in8;
uint32_t in9;
a0 = diag1;
for (i = 0; i < 20; i += 4) {
a0 = _mm_add_epi32(a0, diag0);
a1 = diag0;
b0 = a0;
a0 = _mm_slli_epi32(a0, 7);
b0 = _mm_srli_epi32(b0, 25);
diag3 = _mm_xor_si128(diag3, a0);
diag3 = _mm_xor_si128(diag3, b0);
a1 = _mm_add_epi32(a1, diag3);
a2 = diag3;
b1 = a1;
a1 = _mm_slli_epi32(a1, 9);
b1 = _mm_srli_epi32(b1, 23);
diag2 = _mm_xor_si128(diag2, a1);
diag3 = _mm_shuffle_epi32(diag3, 0x93);
diag2 = _mm_xor_si128(diag2, b1);
a2 = _mm_add_epi32(a2, diag2);
a3 = diag2;
b2 = a2;
a2 = _mm_slli_epi32(a2, 13);
b2 = _mm_srli_epi32(b2, 19);
diag1 = _mm_xor_si128(diag1, a2);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag1 = _mm_xor_si128(diag1, b2);
a3 = _mm_add_epi32(a3, diag1);
a4 = diag3;
b3 = a3;
a3 = _mm_slli_epi32(a3, 18);
b3 = _mm_srli_epi32(b3, 14);
diag0 = _mm_xor_si128(diag0, a3);
diag1 = _mm_shuffle_epi32(diag1, 0x39);
diag0 = _mm_xor_si128(diag0, b3);
a4 = _mm_add_epi32(a4, diag0);
a5 = diag0;
b4 = a4;
a4 = _mm_slli_epi32(a4, 7);
b4 = _mm_srli_epi32(b4, 25);
diag1 = _mm_xor_si128(diag1, a4);
diag1 = _mm_xor_si128(diag1, b4);
a5 = _mm_add_epi32(a5, diag1);
a6 = diag1;
b5 = a5;
a5 = _mm_slli_epi32(a5, 9);
b5 = _mm_srli_epi32(b5, 23);
diag2 = _mm_xor_si128(diag2, a5);
diag1 = _mm_shuffle_epi32(diag1, 0x93);
diag2 = _mm_xor_si128(diag2, b5);
a6 = _mm_add_epi32(a6, diag2);
a7 = diag2;
b6 = a6;
a6 = _mm_slli_epi32(a6, 13);
b6 = _mm_srli_epi32(b6, 19);
diag3 = _mm_xor_si128(diag3, a6);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag3 = _mm_xor_si128(diag3, b6);
a7 = _mm_add_epi32(a7, diag3);
a0 = diag1;
b7 = a7;
a7 = _mm_slli_epi32(a7, 18);
b7 = _mm_srli_epi32(b7, 14);
diag0 = _mm_xor_si128(diag0, a7);
diag3 = _mm_shuffle_epi32(diag3, 0x39);
diag0 = _mm_xor_si128(diag0, b7);
a0 = _mm_add_epi32(a0, diag0);
a1 = diag0;
b0 = a0;
a0 = _mm_slli_epi32(a0, 7);
b0 = _mm_srli_epi32(b0, 25);
diag3 = _mm_xor_si128(diag3, a0);
diag3 = _mm_xor_si128(diag3, b0);
a1 = _mm_add_epi32(a1, diag3);
a2 = diag3;
b1 = a1;
a1 = _mm_slli_epi32(a1, 9);
b1 = _mm_srli_epi32(b1, 23);
diag2 = _mm_xor_si128(diag2, a1);
diag3 = _mm_shuffle_epi32(diag3, 0x93);
diag2 = _mm_xor_si128(diag2, b1);
a2 = _mm_add_epi32(a2, diag2);
a3 = diag2;
b2 = a2;
a2 = _mm_slli_epi32(a2, 13);
b2 = _mm_srli_epi32(b2, 19);
diag1 = _mm_xor_si128(diag1, a2);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag1 = _mm_xor_si128(diag1, b2);
a3 = _mm_add_epi32(a3, diag1);
a4 = diag3;
b3 = a3;
a3 = _mm_slli_epi32(a3, 18);
b3 = _mm_srli_epi32(b3, 14);
diag0 = _mm_xor_si128(diag0, a3);
diag1 = _mm_shuffle_epi32(diag1, 0x39);
diag0 = _mm_xor_si128(diag0, b3);
a4 = _mm_add_epi32(a4, diag0);
a5 = diag0;
b4 = a4;
a4 = _mm_slli_epi32(a4, 7);
b4 = _mm_srli_epi32(b4, 25);
diag1 = _mm_xor_si128(diag1, a4);
diag1 = _mm_xor_si128(diag1, b4);
a5 = _mm_add_epi32(a5, diag1);
a6 = diag1;
b5 = a5;
a5 = _mm_slli_epi32(a5, 9);
b5 = _mm_srli_epi32(b5, 23);
diag2 = _mm_xor_si128(diag2, a5);
diag1 = _mm_shuffle_epi32(diag1, 0x93);
diag2 = _mm_xor_si128(diag2, b5);
a6 = _mm_add_epi32(a6, diag2);
a7 = diag2;
b6 = a6;
a6 = _mm_slli_epi32(a6, 13);
b6 = _mm_srli_epi32(b6, 19);
diag3 = _mm_xor_si128(diag3, a6);
diag2 = _mm_shuffle_epi32(diag2, 0x4e);
diag3 = _mm_xor_si128(diag3, b6);
a7 = _mm_add_epi32(a7, diag3);
a0 = diag1;
b7 = a7;
a7 = _mm_slli_epi32(a7, 18);
b7 = _mm_srli_epi32(b7, 14);
diag0 = _mm_xor_si128(diag0, a7);
diag3 = _mm_shuffle_epi32(diag3, 0x39);
diag0 = _mm_xor_si128(diag0, b7);
}
diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((__m128i *) (x + 0)));
diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((__m128i *) (x + 4)));
diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((__m128i *) (x + 8)));
diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((__m128i *) (x + 12)));
#define ONEQUAD_SHUFFLE(A, B, C, D) \
do { \
uint32_t in##A = _mm_cvtsi128_si32(diag0); \
uint32_t in##B = _mm_cvtsi128_si32(diag1); \
uint32_t in##C = _mm_cvtsi128_si32(diag2); \
uint32_t in##D = _mm_cvtsi128_si32(diag3); \
diag0 = _mm_shuffle_epi32(diag0, 0x39); \
diag1 = _mm_shuffle_epi32(diag1, 0x39); \
diag2 = _mm_shuffle_epi32(diag2, 0x39); \
diag3 = _mm_shuffle_epi32(diag3, 0x39); \
in##A ^= *(uint32_t *) (m + (A * 4)); \
in##B ^= *(uint32_t *) (m + (B * 4)); \
in##C ^= *(uint32_t *) (m + (C * 4)); \
in##D ^= *(uint32_t *) (m + (D * 4)); \
*(uint32_t *) (c + (A * 4)) = in##A; \
*(uint32_t *) (c + (B * 4)) = in##B; \
*(uint32_t *) (c + (C * 4)) = in##C; \
*(uint32_t *) (c + (D * 4)) = in##D; \
} while (0)
#define ONEQUAD(A, B, C, D) ONEQUAD_SHUFFLE(A, B, C, D)
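/* Each ONEQUAD extracts the current low lane of all four diagonal
 * registers and rotates them so the next call sees the next lane; the
 * A/B/C/D word indices undo the diagonal storage order, so the
 * keystream is XORed into the message in standard Salsa20 word order. */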
ONEQUAD(0, 12, 8, 4);
ONEQUAD(5, 1, 13, 9);
ONEQUAD(10, 6, 2, 14);
ONEQUAD(15, 11, 7, 3);
#undef ONEQUAD
#undef ONEQUAD_SHUFFLE
in8 = x[8];
in9 = x[13];
in8++;
if (in8 == 0) {
in9++;
}
x[8] = in8;
x[13] = in9;
c += 64;
m += 64;
bytes -= 64;
}
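
SSE2 has no packed-rotate instruction, so every scalar ROTL32(v, n) in the loop above becomes a shift-left, a shift-right, and XORs of the two results into the target. A minimal standalone check of that idiom, assuming an SSE2-capable compiler:

#include <stdint.h>
#include <stdio.h>
#include <emmintrin.h> /* SSE2 intrinsics */

/* Rotate four packed 32-bit lanes left by a constant, using the same
 * slli/srli/xor sequence as the loop above. */
#define MM_ROTL32(v, n) \
    _mm_xor_si128(_mm_slli_epi32((v), (n)), _mm_srli_epi32((v), 32 - (n)))

int
main(void)
{
    __m128i  v = _mm_set1_epi32((int) 0x80000001U);
    uint32_t out[4];

    v = MM_ROTL32(v, 7);
    _mm_storeu_si128((__m128i *) out, v);
    printf("%08x\n", out[0]); /* 000000c0, i.e. 0x80000001 rotated left by 7 */
    return 0;
}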

View File

@@ -0,0 +1,532 @@
if (bytes >= 256) {
__m128i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14,
y15;
__m128i z0, z1, z2, z3, z4, z5, z6, z7, z8, z9, z10, z11, z12, z13, z14,
z15;
__m128i orig0, orig1, orig2, orig3, orig4, orig5, orig6, orig7, orig8,
orig9, orig10, orig11, orig12, orig13, orig14, orig15;
uint32_t in8;
uint32_t in9;
/* element broadcast immediate for _mm_shuffle_epi32 are in order:
0x00, 0x55, 0xaa, 0xff */
z0 = _mm_loadu_si128((__m128i *) (x + 0));
z5 = _mm_shuffle_epi32(z0, 0x55);
z10 = _mm_shuffle_epi32(z0, 0xaa);
z15 = _mm_shuffle_epi32(z0, 0xff);
z0 = _mm_shuffle_epi32(z0, 0x00);
z1 = _mm_loadu_si128((__m128i *) (x + 4));
z6 = _mm_shuffle_epi32(z1, 0xaa);
z11 = _mm_shuffle_epi32(z1, 0xff);
z12 = _mm_shuffle_epi32(z1, 0x00);
z1 = _mm_shuffle_epi32(z1, 0x55);
z2 = _mm_loadu_si128((__m128i *) (x + 8));
z7 = _mm_shuffle_epi32(z2, 0xff);
z13 = _mm_shuffle_epi32(z2, 0x55);
z2 = _mm_shuffle_epi32(z2, 0xaa);
/* no z8 -> first half of the nonce, will fill later */
z3 = _mm_loadu_si128((__m128i *) (x + 12));
z4 = _mm_shuffle_epi32(z3, 0x00);
z14 = _mm_shuffle_epi32(z3, 0xaa);
z3 = _mm_shuffle_epi32(z3, 0xff);
/* no z9 -> second half of the nonce, will fill later */
orig0 = z0;
orig1 = z1;
orig2 = z2;
orig3 = z3;
orig4 = z4;
orig5 = z5;
orig6 = z6;
orig7 = z7;
orig10 = z10;
orig11 = z11;
orig12 = z12;
orig13 = z13;
orig14 = z14;
orig15 = z15;
while (bytes >= 256) {
/* vector implementation for z8 and z9 */
/* not sure if it helps for only 4 blocks */
const __m128i addv8 = _mm_set_epi64x(1, 0);
const __m128i addv9 = _mm_set_epi64x(3, 2);
__m128i t8, t9;
uint64_t in89;
in8 = x[8];
in9 = x[13]; // see arrays above for the address translation
in89 = ((uint64_t) in8) | (((uint64_t) in9) << 32);
t8 = _mm_set1_epi64x(in89);
t9 = _mm_set1_epi64x(in89);
z8 = _mm_add_epi64(addv8, t8);
z9 = _mm_add_epi64(addv9, t9);
t8 = _mm_unpacklo_epi32(z8, z9);
t9 = _mm_unpackhi_epi32(z8, z9);
z8 = _mm_unpacklo_epi32(t8, t9);
z9 = _mm_unpackhi_epi32(t8, t9);
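/* After the two unpack passes, lane j of z8 holds the low 32 bits of
 * counter value in89 + j and lane j of z9 holds the high 32 bits, so
 * each of the four parallel blocks sees its own 64-bit block counter. */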
orig8 = z8;
orig9 = z9;
in89 += 4;
x[8] = in89 & 0xFFFFFFFF;
x[13] = (in89 >> 32) & 0xFFFFFFFF;
z5 = orig5;
z10 = orig10;
z15 = orig15;
z14 = orig14;
z3 = orig3;
z6 = orig6;
z11 = orig11;
z1 = orig1;
z7 = orig7;
z13 = orig13;
z2 = orig2;
z9 = orig9;
z0 = orig0;
z12 = orig12;
z4 = orig4;
z8 = orig8;
for (i = 0; i < 20; i += 2) {
/* the inner loop is a direct translation (regexp search/replace)
* from the amd64-xmm6 ASM */
__m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13,
r14, r15;
y4 = z12;
y4 = _mm_add_epi32(y4, z0);
r4 = y4;
y4 = _mm_slli_epi32(y4, 7);
z4 = _mm_xor_si128(z4, y4);
r4 = _mm_srli_epi32(r4, 25);
z4 = _mm_xor_si128(z4, r4);
y9 = z1;
y9 = _mm_add_epi32(y9, z5);
r9 = y9;
y9 = _mm_slli_epi32(y9, 7);
z9 = _mm_xor_si128(z9, y9);
r9 = _mm_srli_epi32(r9, 25);
z9 = _mm_xor_si128(z9, r9);
y8 = z0;
y8 = _mm_add_epi32(y8, z4);
r8 = y8;
y8 = _mm_slli_epi32(y8, 9);
z8 = _mm_xor_si128(z8, y8);
r8 = _mm_srli_epi32(r8, 23);
z8 = _mm_xor_si128(z8, r8);
y13 = z5;
y13 = _mm_add_epi32(y13, z9);
r13 = y13;
y13 = _mm_slli_epi32(y13, 9);
z13 = _mm_xor_si128(z13, y13);
r13 = _mm_srli_epi32(r13, 23);
z13 = _mm_xor_si128(z13, r13);
y12 = z4;
y12 = _mm_add_epi32(y12, z8);
r12 = y12;
y12 = _mm_slli_epi32(y12, 13);
z12 = _mm_xor_si128(z12, y12);
r12 = _mm_srli_epi32(r12, 19);
z12 = _mm_xor_si128(z12, r12);
y1 = z9;
y1 = _mm_add_epi32(y1, z13);
r1 = y1;
y1 = _mm_slli_epi32(y1, 13);
z1 = _mm_xor_si128(z1, y1);
r1 = _mm_srli_epi32(r1, 19);
z1 = _mm_xor_si128(z1, r1);
y0 = z8;
y0 = _mm_add_epi32(y0, z12);
r0 = y0;
y0 = _mm_slli_epi32(y0, 18);
z0 = _mm_xor_si128(z0, y0);
r0 = _mm_srli_epi32(r0, 14);
z0 = _mm_xor_si128(z0, r0);
y5 = z13;
y5 = _mm_add_epi32(y5, z1);
r5 = y5;
y5 = _mm_slli_epi32(y5, 18);
z5 = _mm_xor_si128(z5, y5);
r5 = _mm_srli_epi32(r5, 14);
z5 = _mm_xor_si128(z5, r5);
y14 = z6;
y14 = _mm_add_epi32(y14, z10);
r14 = y14;
y14 = _mm_slli_epi32(y14, 7);
z14 = _mm_xor_si128(z14, y14);
r14 = _mm_srli_epi32(r14, 25);
z14 = _mm_xor_si128(z14, r14);
y3 = z11;
y3 = _mm_add_epi32(y3, z15);
r3 = y3;
y3 = _mm_slli_epi32(y3, 7);
z3 = _mm_xor_si128(z3, y3);
r3 = _mm_srli_epi32(r3, 25);
z3 = _mm_xor_si128(z3, r3);
y2 = z10;
y2 = _mm_add_epi32(y2, z14);
r2 = y2;
y2 = _mm_slli_epi32(y2, 9);
z2 = _mm_xor_si128(z2, y2);
r2 = _mm_srli_epi32(r2, 23);
z2 = _mm_xor_si128(z2, r2);
y7 = z15;
y7 = _mm_add_epi32(y7, z3);
r7 = y7;
y7 = _mm_slli_epi32(y7, 9);
z7 = _mm_xor_si128(z7, y7);
r7 = _mm_srli_epi32(r7, 23);
z7 = _mm_xor_si128(z7, r7);
y6 = z14;
y6 = _mm_add_epi32(y6, z2);
r6 = y6;
y6 = _mm_slli_epi32(y6, 13);
z6 = _mm_xor_si128(z6, y6);
r6 = _mm_srli_epi32(r6, 19);
z6 = _mm_xor_si128(z6, r6);
y11 = z3;
y11 = _mm_add_epi32(y11, z7);
r11 = y11;
y11 = _mm_slli_epi32(y11, 13);
z11 = _mm_xor_si128(z11, y11);
r11 = _mm_srli_epi32(r11, 19);
z11 = _mm_xor_si128(z11, r11);
y10 = z2;
y10 = _mm_add_epi32(y10, z6);
r10 = y10;
y10 = _mm_slli_epi32(y10, 18);
z10 = _mm_xor_si128(z10, y10);
r10 = _mm_srli_epi32(r10, 14);
z10 = _mm_xor_si128(z10, r10);
y1 = z3;
y1 = _mm_add_epi32(y1, z0);
r1 = y1;
y1 = _mm_slli_epi32(y1, 7);
z1 = _mm_xor_si128(z1, y1);
r1 = _mm_srli_epi32(r1, 25);
z1 = _mm_xor_si128(z1, r1);
y15 = z7;
y15 = _mm_add_epi32(y15, z11);
r15 = y15;
y15 = _mm_slli_epi32(y15, 18);
z15 = _mm_xor_si128(z15, y15);
r15 = _mm_srli_epi32(r15, 14);
z15 = _mm_xor_si128(z15, r15);
y6 = z4;
y6 = _mm_add_epi32(y6, z5);
r6 = y6;
y6 = _mm_slli_epi32(y6, 7);
z6 = _mm_xor_si128(z6, y6);
r6 = _mm_srli_epi32(r6, 25);
z6 = _mm_xor_si128(z6, r6);
y2 = z0;
y2 = _mm_add_epi32(y2, z1);
r2 = y2;
y2 = _mm_slli_epi32(y2, 9);
z2 = _mm_xor_si128(z2, y2);
r2 = _mm_srli_epi32(r2, 23);
z2 = _mm_xor_si128(z2, r2);
y7 = z5;
y7 = _mm_add_epi32(y7, z6);
r7 = y7;
y7 = _mm_slli_epi32(y7, 9);
z7 = _mm_xor_si128(z7, y7);
r7 = _mm_srli_epi32(r7, 23);
z7 = _mm_xor_si128(z7, r7);
y3 = z1;
y3 = _mm_add_epi32(y3, z2);
r3 = y3;
y3 = _mm_slli_epi32(y3, 13);
z3 = _mm_xor_si128(z3, y3);
r3 = _mm_srli_epi32(r3, 19);
z3 = _mm_xor_si128(z3, r3);
y4 = z6;
y4 = _mm_add_epi32(y4, z7);
r4 = y4;
y4 = _mm_slli_epi32(y4, 13);
z4 = _mm_xor_si128(z4, y4);
r4 = _mm_srli_epi32(r4, 19);
z4 = _mm_xor_si128(z4, r4);
y0 = z2;
y0 = _mm_add_epi32(y0, z3);
r0 = y0;
y0 = _mm_slli_epi32(y0, 18);
z0 = _mm_xor_si128(z0, y0);
r0 = _mm_srli_epi32(r0, 14);
z0 = _mm_xor_si128(z0, r0);
y5 = z7;
y5 = _mm_add_epi32(y5, z4);
r5 = y5;
y5 = _mm_slli_epi32(y5, 18);
z5 = _mm_xor_si128(z5, y5);
r5 = _mm_srli_epi32(r5, 14);
z5 = _mm_xor_si128(z5, r5);
y11 = z9;
y11 = _mm_add_epi32(y11, z10);
r11 = y11;
y11 = _mm_slli_epi32(y11, 7);
z11 = _mm_xor_si128(z11, y11);
r11 = _mm_srli_epi32(r11, 25);
z11 = _mm_xor_si128(z11, r11);
y12 = z14;
y12 = _mm_add_epi32(y12, z15);
r12 = y12;
y12 = _mm_slli_epi32(y12, 7);
z12 = _mm_xor_si128(z12, y12);
r12 = _mm_srli_epi32(r12, 25);
z12 = _mm_xor_si128(z12, r12);
y8 = z10;
y8 = _mm_add_epi32(y8, z11);
r8 = y8;
y8 = _mm_slli_epi32(y8, 9);
z8 = _mm_xor_si128(z8, y8);
r8 = _mm_srli_epi32(r8, 23);
z8 = _mm_xor_si128(z8, r8);
y13 = z15;
y13 = _mm_add_epi32(y13, z12);
r13 = y13;
y13 = _mm_slli_epi32(y13, 9);
z13 = _mm_xor_si128(z13, y13);
r13 = _mm_srli_epi32(r13, 23);
z13 = _mm_xor_si128(z13, r13);
y9 = z11;
y9 = _mm_add_epi32(y9, z8);
r9 = y9;
y9 = _mm_slli_epi32(y9, 13);
z9 = _mm_xor_si128(z9, y9);
r9 = _mm_srli_epi32(r9, 19);
z9 = _mm_xor_si128(z9, r9);
y14 = z12;
y14 = _mm_add_epi32(y14, z13);
r14 = y14;
y14 = _mm_slli_epi32(y14, 13);
z14 = _mm_xor_si128(z14, y14);
r14 = _mm_srli_epi32(r14, 19);
z14 = _mm_xor_si128(z14, r14);
y10 = z8;
y10 = _mm_add_epi32(y10, z9);
r10 = y10;
y10 = _mm_slli_epi32(y10, 18);
z10 = _mm_xor_si128(z10, y10);
r10 = _mm_srli_epi32(r10, 14);
z10 = _mm_xor_si128(z10, r10);
y15 = z13;
y15 = _mm_add_epi32(y15, z14);
r15 = y15;
y15 = _mm_slli_epi32(y15, 18);
z15 = _mm_xor_si128(z15, y15);
r15 = _mm_srli_epi32(r15, 14);
z15 = _mm_xor_si128(z15, r15);
}
/* store the data; this macro replicates the original amd64-xmm6 code.
 * Each z register holds the same state word for four consecutive 64-byte
 * blocks, hence the +0/+64/+128/+192 strides; the 0x39 shuffle rotates the
 * lanes so that _mm_cvtsi128_si32 extracts word 0, then 1, 2, 3 in turn. */
#define ONEQUAD_SHUFFLE(A, B, C, D) \
z##A = _mm_add_epi32(z##A, orig##A); \
z##B = _mm_add_epi32(z##B, orig##B); \
z##C = _mm_add_epi32(z##C, orig##C); \
z##D = _mm_add_epi32(z##D, orig##D); \
in##A = _mm_cvtsi128_si32(z##A); \
in##B = _mm_cvtsi128_si32(z##B); \
in##C = _mm_cvtsi128_si32(z##C); \
in##D = _mm_cvtsi128_si32(z##D); \
z##A = _mm_shuffle_epi32(z##A, 0x39); \
z##B = _mm_shuffle_epi32(z##B, 0x39); \
z##C = _mm_shuffle_epi32(z##C, 0x39); \
z##D = _mm_shuffle_epi32(z##D, 0x39); \
in##A ^= *(uint32_t *) (m + 0); \
in##B ^= *(uint32_t *) (m + 4); \
in##C ^= *(uint32_t *) (m + 8); \
in##D ^= *(uint32_t *) (m + 12); \
*(uint32_t *) (c + 0) = in##A; \
*(uint32_t *) (c + 4) = in##B; \
*(uint32_t *) (c + 8) = in##C; \
*(uint32_t *) (c + 12) = in##D; \
in##A = _mm_cvtsi128_si32(z##A); \
in##B = _mm_cvtsi128_si32(z##B); \
in##C = _mm_cvtsi128_si32(z##C); \
in##D = _mm_cvtsi128_si32(z##D); \
z##A = _mm_shuffle_epi32(z##A, 0x39); \
z##B = _mm_shuffle_epi32(z##B, 0x39); \
z##C = _mm_shuffle_epi32(z##C, 0x39); \
z##D = _mm_shuffle_epi32(z##D, 0x39); \
in##A ^= *(uint32_t *) (m + 64); \
in##B ^= *(uint32_t *) (m + 68); \
in##C ^= *(uint32_t *) (m + 72); \
in##D ^= *(uint32_t *) (m + 76); \
*(uint32_t *) (c + 64) = in##A; \
*(uint32_t *) (c + 68) = in##B; \
*(uint32_t *) (c + 72) = in##C; \
*(uint32_t *) (c + 76) = in##D; \
in##A = _mm_cvtsi128_si32(z##A); \
in##B = _mm_cvtsi128_si32(z##B); \
in##C = _mm_cvtsi128_si32(z##C); \
in##D = _mm_cvtsi128_si32(z##D); \
z##A = _mm_shuffle_epi32(z##A, 0x39); \
z##B = _mm_shuffle_epi32(z##B, 0x39); \
z##C = _mm_shuffle_epi32(z##C, 0x39); \
z##D = _mm_shuffle_epi32(z##D, 0x39); \
in##A ^= *(uint32_t *) (m + 128); \
in##B ^= *(uint32_t *) (m + 132); \
in##C ^= *(uint32_t *) (m + 136); \
in##D ^= *(uint32_t *) (m + 140); \
*(uint32_t *) (c + 128) = in##A; \
*(uint32_t *) (c + 132) = in##B; \
*(uint32_t *) (c + 136) = in##C; \
*(uint32_t *) (c + 140) = in##D; \
in##A = _mm_cvtsi128_si32(z##A); \
in##B = _mm_cvtsi128_si32(z##B); \
in##C = _mm_cvtsi128_si32(z##C); \
in##D = _mm_cvtsi128_si32(z##D); \
in##A ^= *(uint32_t *) (m + 192); \
in##B ^= *(uint32_t *) (m + 196); \
in##C ^= *(uint32_t *) (m + 200); \
in##D ^= *(uint32_t *) (m + 204); \
*(uint32_t *) (c + 192) = in##A; \
*(uint32_t *) (c + 196) = in##B; \
*(uint32_t *) (c + 200) = in##C; \
*(uint32_t *) (c + 204) = in##D
/* store the data; this macro replaces shuffle+mov with a direct extract
 * (_mm_extract_epi32 is SSE4.1, not SSE2); not much difference */
#define ONEQUAD_EXTRACT(A, B, C, D) \
z##A = _mm_add_epi32(z##A, orig##A); \
z##B = _mm_add_epi32(z##B, orig##B); \
z##C = _mm_add_epi32(z##C, orig##C); \
z##D = _mm_add_epi32(z##D, orig##D); \
in##A = _mm_cvtsi128_si32(z##A); \
in##B = _mm_cvtsi128_si32(z##B); \
in##C = _mm_cvtsi128_si32(z##C); \
in##D = _mm_cvtsi128_si32(z##D); \
in##A ^= *(uint32_t *) (m + 0); \
in##B ^= *(uint32_t *) (m + 4); \
in##C ^= *(uint32_t *) (m + 8); \
in##D ^= *(uint32_t *) (m + 12); \
*(uint32_t *) (c + 0) = in##A; \
*(uint32_t *) (c + 4) = in##B; \
*(uint32_t *) (c + 8) = in##C; \
*(uint32_t *) (c + 12) = in##D; \
in##A = _mm_extract_epi32(z##A, 1); \
in##B = _mm_extract_epi32(z##B, 1); \
in##C = _mm_extract_epi32(z##C, 1); \
in##D = _mm_extract_epi32(z##D, 1); \
in##A ^= *(uint32_t *) (m + 64); \
in##B ^= *(uint32_t *) (m + 68); \
in##C ^= *(uint32_t *) (m + 72); \
in##D ^= *(uint32_t *) (m + 76); \
*(uint32_t *) (c + 64) = in##A; \
*(uint32_t *) (c + 68) = in##B; \
*(uint32_t *) (c + 72) = in##C; \
*(uint32_t *) (c + 76) = in##D; \
in##A = _mm_extract_epi32(z##A, 2); \
in##B = _mm_extract_epi32(z##B, 2); \
in##C = _mm_extract_epi32(z##C, 2); \
in##D = _mm_extract_epi32(z##D, 2); \
in##A ^= *(uint32_t *) (m + 128); \
in##B ^= *(uint32_t *) (m + 132); \
in##C ^= *(uint32_t *) (m + 136); \
in##D ^= *(uint32_t *) (m + 140); \
*(uint32_t *) (c + 128) = in##A; \
*(uint32_t *) (c + 132) = in##B; \
*(uint32_t *) (c + 136) = in##C; \
*(uint32_t *) (c + 140) = in##D; \
in##A = _mm_extract_epi32(z##A, 3); \
in##B = _mm_extract_epi32(z##B, 3); \
in##C = _mm_extract_epi32(z##C, 3); \
in##D = _mm_extract_epi32(z##D, 3); \
in##A ^= *(uint32_t *) (m + 192); \
in##B ^= *(uint32_t *) (m + 196); \
in##C ^= *(uint32_t *) (m + 200); \
in##D ^= *(uint32_t *) (m + 204); \
*(uint32_t *) (c + 192) = in##A; \
*(uint32_t *) (c + 196) = in##B; \
*(uint32_t *) (c + 200) = in##C; \
*(uint32_t *) (c + 204) = in##D
/* store the data; this macro first transposes the data in registers (a 4x4
 * transpose of 32-bit words via the unpack intrinsics), then stores full
 * 128-bit blocks to memory; much faster with icc */
#define ONEQUAD_TRANSPOSE(A, B, C, D) \
z##A = _mm_add_epi32(z##A, orig##A); \
z##B = _mm_add_epi32(z##B, orig##B); \
z##C = _mm_add_epi32(z##C, orig##C); \
z##D = _mm_add_epi32(z##D, orig##D); \
y##A = _mm_unpacklo_epi32(z##A, z##B); \
y##B = _mm_unpacklo_epi32(z##C, z##D); \
y##C = _mm_unpackhi_epi32(z##A, z##B); \
y##D = _mm_unpackhi_epi32(z##C, z##D); \
z##A = _mm_unpacklo_epi64(y##A, y##B); \
z##B = _mm_unpackhi_epi64(y##A, y##B); \
z##C = _mm_unpacklo_epi64(y##C, y##D); \
z##D = _mm_unpackhi_epi64(y##C, y##D); \
y##A = _mm_xor_si128(z##A, _mm_loadu_si128((__m128i *) (m + 0))); \
_mm_storeu_si128((__m128i *) (c + 0), y##A); \
y##B = _mm_xor_si128(z##B, _mm_loadu_si128((__m128i *) (m + 64))); \
_mm_storeu_si128((__m128i *) (c + 64), y##B); \
y##C = _mm_xor_si128(z##C, _mm_loadu_si128((__m128i *) (m + 128))); \
_mm_storeu_si128((__m128i *) (c + 128), y##C); \
y##D = _mm_xor_si128(z##D, _mm_loadu_si128((__m128i *) (m + 192))); \
_mm_storeu_si128((__m128i *) (c + 192), y##D)
#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D)
ONEQUAD(0, 1, 2, 3);
m += 16;
c += 16;
ONEQUAD(4, 5, 6, 7);
m += 16;
c += 16;
ONEQUAD(8, 9, 10, 11);
m += 16;
c += 16;
ONEQUAD(12, 13, 14, 15);
m -= 48;
c -= 48;
#undef ONEQUAD
#undef ONEQUAD_TRANSPOSE
#undef ONEQUAD_EXTRACT
#undef ONEQUAD_SHUFFLE
bytes -= 256;
c += 256;
m += 256;
}
}
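
The intrinsic sequences above run four Salsa20 block computations lane-parallel: each __m128i holds the same state word for four consecutive 64-byte blocks. As a reading aid, here is the scalar quarter-round from the Salsa20 specification that each group of seven intrinsics corresponds to (a sketch; the helper is illustrative, not part of the source):

#include <stdint.h>

#define ROTL32(v, k) (((v) << (k)) | ((v) >> (32 - (k))))

/* one Salsa20 quarter-round; the SSE2 code applies the same operations
 * to __m128i registers, processing four independent blocks at once */
static void
salsa20_quarter_round(uint32_t *y0, uint32_t *y1, uint32_t *y2, uint32_t *y3)
{
    *y1 ^= ROTL32(*y0 + *y3, 7);
    *y2 ^= ROTL32(*y1 + *y0, 9);
    *y3 ^= ROTL32(*y2 + *y1, 13);
    *y0 ^= ROTL32(*y3 + *y2, 18);
}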

View File

@ -4,6 +4,8 @@ D. J. Bernstein
Public domain.
*/
#include <stdint.h>
#include "crypto_core_salsa2012.h"
#include "crypto_stream_salsa2012.h"
#include "utils.h"
@ -32,18 +34,15 @@ crypto_stream_salsa2012(unsigned char *c, unsigned long long clen,
}
while (clen >= 64) {
crypto_core_salsa2012(c, in, kcopy, NULL);
u = 1;
for (i = 8; i < 16; ++i) {
u += (unsigned int)in[i];
in[i] = u;
u >>= 8;
}
clen -= 64;
c += 64;
}
if (clen) {
crypto_core_salsa2012(block, in, kcopy, NULL);
for (i = 0; i < (unsigned int)clen; ++i) {
@ -55,3 +54,53 @@ crypto_stream_salsa2012(unsigned char *c, unsigned long long clen,
return 0;
}
int
crypto_stream_salsa2012_xor(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *k)
{
unsigned char in[16];
unsigned char block[64];
unsigned char kcopy[32];
unsigned int i;
unsigned int u;
if (!mlen) {
return 0;
}
for (i = 0; i < 32; ++i) {
kcopy[i] = k[i];
}
for (i = 0; i < 8; ++i) {
in[i] = n[i];
}
for (i = 8; i < 16; ++i) {
in[i] = 0;
}
while (mlen >= 64) {
crypto_core_salsa2012(block, in, kcopy, NULL);
for (i = 0; i < 64; ++i) {
c[i] = m[i] ^ block[i];
}
u = 1;
for (i = 8; i < 16; ++i) {
u += (unsigned int)in[i];
in[i] = u;
u >>= 8;
}
mlen -= 64;
c += 64;
m += 64;
}
if (mlen) {
crypto_core_salsa2012(block, in, kcopy, NULL);
for (i = 0; i < (unsigned int)mlen; ++i) {
c[i] = m[i] ^ block[i];
}
}
sodium_memzero(block, sizeof block);
sodium_memzero(kcopy, sizeof kcopy);
return 0;
}
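
In the reference code above, the u loop is a byte-wise little-endian increment of the 64-bit block counter stored in in[8..15] (in[0..7] being the nonce); the same pattern recurs in the salsa208 variant below. A sketch of the equivalent operation on an explicit 64-bit value (illustrative helper, not from the source):

#include <stdint.h>

/* equivalent to: u = 1; for (i = 8; i < 16; ++i) { u += in[i]; in[i] = u; u >>= 8; } */
static void
increment_le64(unsigned char in[16])
{
    uint64_t ctr = 0;
    int      i;

    for (i = 0; i < 8; i++) {
        ctr |= (uint64_t) in[8 + i] << (8 * i); /* load little-endian */
    }
    ctr++;
    for (i = 0; i < 8; i++) {
        in[8 + i] = (unsigned char) (ctr >> (8 * i)); /* store little-endian */
    }
}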

View File

@ -4,10 +4,57 @@ D. J. Bernstein
Public domain.
*/
#include <stdint.h>
#include "crypto_core_salsa2012.h"
#include "crypto_stream_salsa2012.h"
#include "utils.h"
int
crypto_stream_salsa2012(unsigned char *c, unsigned long long clen,
const unsigned char *n, const unsigned char *k)
{
unsigned char in[16];
unsigned char block[64];
unsigned char kcopy[32];
unsigned int i;
unsigned int u;
if (!clen) {
return 0;
}
for (i = 0; i < 32; ++i) {
kcopy[i] = k[i];
}
for (i = 0; i < 8; ++i) {
in[i] = n[i];
}
for (i = 8; i < 16; ++i) {
in[i] = 0;
}
while (clen >= 64) {
crypto_core_salsa2012(c, in, kcopy, NULL);
u = 1;
for (i = 8; i < 16; ++i) {
u += (unsigned int)in[i];
in[i] = u;
u >>= 8;
}
clen -= 64;
c += 64;
}
if (clen) {
crypto_core_salsa2012(block, in, kcopy, NULL);
for (i = 0; i < (unsigned int)clen; ++i) {
c[i] = block[i];
}
}
sodium_memzero(block, sizeof block);
sodium_memzero(kcopy, sizeof kcopy);
return 0;
}
int
crypto_stream_salsa2012_xor(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
@ -46,7 +93,6 @@ crypto_stream_salsa2012_xor(unsigned char *c, const unsigned char *m,
c += 64;
m += 64;
}
if (mlen) {
crypto_core_salsa2012(block, in, kcopy, NULL);
for (i = 0; i < (unsigned int)mlen; ++i) {

View File

@ -4,6 +4,8 @@ D. J. Bernstein
Public domain.
*/
#include <stdint.h>
#include "crypto_core_salsa208.h"
#include "crypto_stream_salsa208.h"
#include "utils.h"
@ -32,7 +34,6 @@ crypto_stream_salsa208(unsigned char *c, unsigned long long clen,
}
while (clen >= 64) {
crypto_core_salsa208(c, in, kcopy, NULL);
u = 1;
for (i = 8; i < 16; ++i) {
u += (unsigned int)in[i];
@ -42,7 +43,6 @@ crypto_stream_salsa208(unsigned char *c, unsigned long long clen,
clen -= 64;
c += 64;
}
if (clen) {
crypto_core_salsa208(block, in, kcopy, NULL);
for (i = 0; i < (unsigned int)clen; ++i) {
@ -54,3 +54,53 @@ crypto_stream_salsa208(unsigned char *c, unsigned long long clen,
return 0;
}
int
crypto_stream_salsa208_xor(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *k)
{
unsigned char in[16];
unsigned char block[64];
unsigned char kcopy[32];
unsigned int i;
unsigned int u;
if (!mlen) {
return 0;
}
for (i = 0; i < 32; ++i) {
kcopy[i] = k[i];
}
for (i = 0; i < 8; ++i) {
in[i] = n[i];
}
for (i = 8; i < 16; ++i) {
in[i] = 0;
}
while (mlen >= 64) {
crypto_core_salsa208(block, in, kcopy, NULL);
for (i = 0; i < 64; ++i) {
c[i] = m[i] ^ block[i];
}
u = 1;
for (i = 8; i < 16; ++i) {
u += (unsigned int)in[i];
in[i] = u;
u >>= 8;
}
mlen -= 64;
c += 64;
m += 64;
}
if (mlen) {
crypto_core_salsa208(block, in, kcopy, NULL);
for (i = 0; i < (unsigned int)mlen; ++i) {
c[i] = m[i] ^ block[i];
}
}
sodium_memzero(block, sizeof block);
sodium_memzero(kcopy, sizeof kcopy);
return 0;
}
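
Since XORing with the keystream is an involution, calling the _xor function twice with the same key and nonce recovers the plaintext. A short usage sketch against the public API (randombytes_buf() and the KEYBYTES/NONCEBYTES constants are libsodium's existing API):

#include <string.h>
#include <sodium.h>

int
main(void)
{
    unsigned char key[crypto_stream_salsa208_KEYBYTES];
    unsigned char nonce[crypto_stream_salsa208_NONCEBYTES];
    unsigned char msg[] = "attack at dawn";
    unsigned char ct[sizeof msg];
    unsigned char pt[sizeof msg];

    if (sodium_init() < 0) {
        return 1;
    }
    randombytes_buf(key, sizeof key);
    randombytes_buf(nonce, sizeof nonce);

    crypto_stream_salsa208_xor(ct, msg, sizeof msg, nonce, key); /* encrypt */
    crypto_stream_salsa208_xor(pt, ct, sizeof msg, nonce, key);  /* decrypt */

    return memcmp(pt, msg, sizeof msg) != 0;
}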

View File

@ -1,60 +0,0 @@
/*
version 20140420
D. J. Bernstein
Public domain.
*/
#include "crypto_core_salsa208.h"
#include "crypto_stream_salsa208.h"
#include "utils.h"
int
crypto_stream_salsa208_xor(unsigned char *c, const unsigned char *m,
unsigned long long mlen, const unsigned char *n,
const unsigned char *k)
{
unsigned char in[16];
unsigned char block[64];
unsigned char kcopy[32];
unsigned int i;
unsigned int u;
if (!mlen) {
return 0;
}
for (i = 0; i < 32; ++i) {
kcopy[i] = k[i];
}
for (i = 0; i < 8; ++i) {
in[i] = n[i];
}
for (i = 8; i < 16; ++i) {
in[i] = 0;
}
while (mlen >= 64) {
crypto_core_salsa208(block, in, kcopy, NULL);
for (i = 0; i < 64; ++i) {
c[i] = m[i] ^ block[i];
}
u = 1;
for (i = 8; i < 16; ++i) {
u += (unsigned int)in[i];
in[i] = u;
u >>= 8;
}
mlen -= 64;
c += 64;
m += 64;
}
if (mlen) {
crypto_core_salsa208(block, in, kcopy, NULL);
for (i = 0; i < (unsigned int)mlen; ++i) {
c[i] = m[i] ^ block[i];
}
}
sodium_memzero(block, sizeof block);
sodium_memzero(kcopy, sizeof kcopy);
return 0;
}

View File

@ -46,6 +46,10 @@ int crypto_stream_salsa20_xor_ic(unsigned char *c, const unsigned char *m,
SODIUM_EXPORT
void crypto_stream_salsa20_keygen(unsigned char k[crypto_stream_salsa20_KEYBYTES]);
/* ------------------------------------------------------------------------- */
int _crypto_stream_salsa20_pick_best_implementation(void);
#ifdef __cplusplus
}
#endif
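
_crypto_stream_salsa20_pick_best_implementation, declared above and called from sodium_init in the next file, follows the runtime-dispatch pattern already used for chacha20 and poly1305. A minimal sketch of the shape such a picker can take (the vtable layout and backend symbol names are assumptions for illustration; sodium_runtime_has_sse2() is libsodium's existing CPU-feature probe):

#include <stdint.h>

#include "runtime.h"

/* hypothetical table of entry points; one instance per backend */
typedef struct salsa20_implementation {
    int (*stream)(unsigned char *c, unsigned long long clen,
                  const unsigned char *n, const unsigned char *k);
    int (*stream_xor_ic)(unsigned char *c, const unsigned char *m,
                         unsigned long long mlen, const unsigned char *n,
                         uint64_t ic, const unsigned char *k);
} salsa20_implementation;

extern const salsa20_implementation salsa20_ref_implementation;     /* assumed */
extern const salsa20_implementation salsa20_xmm6int_implementation; /* assumed */

static const salsa20_implementation *implementation =
    &salsa20_ref_implementation;

int
_crypto_stream_salsa20_pick_best_implementation(void)
{
    /* the portable reference code is the default */
    implementation = &salsa20_ref_implementation;
#ifdef HAVE_EMMINTRIN_H
    if (sodium_runtime_has_sse2()) {
        /* prefer the SSE2 intrinsics backend from xmm6int */
        implementation = &salsa20_xmm6int_implementation;
    }
#endif
    return 0;
}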

View File

@ -55,6 +55,7 @@ sodium_init(void)
_crypto_onetimeauth_poly1305_pick_best_implementation();
_crypto_scalarmult_curve25519_pick_best_implementation();
_crypto_stream_chacha20_pick_best_implementation();
_crypto_stream_salsa20_pick_best_implementation();
initialized = 1;
if (sodium_crit_leave() != 0) {
return -1;