Monero
Toggle main menu visibility
Loading...
Searching...
No Matches
external
randomx
src
blake2
blamka-round-avx2.h
Go to the documentation of this file.
1
/*
2
Copyright (c) 2018-2019, tevador <tevador@gmail.com>
3
4
All rights reserved.
5
6
Redistribution and use in source and binary forms, with or without
7
modification, are permitted provided that the following conditions are met:
8
* Redistributions of source code must retain the above copyright
9
notice, this list of conditions and the following disclaimer.
10
* Redistributions in binary form must reproduce the above copyright
11
notice, this list of conditions and the following disclaimer in the
12
documentation and/or other materials provided with the distribution.
13
* Neither the name of the copyright holder nor the
14
names of its contributors may be used to endorse or promote products
15
derived from this software without specific prior written permission.
16
17
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
21
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
23
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
24
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
25
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
*/
28
29
/* Original code from Argon2 reference source code package used under CC0 Licence
30
* https://github.com/P-H-C/phc-winner-argon2
31
* Copyright 2015
32
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
33
*/
34
35
#ifndef BLAKE_ROUND_MKA_OPT_H
36
#define BLAKE_ROUND_MKA_OPT_H
37
38
#include "
blake2-impl.h
"
39
40
#ifdef __GNUC__
41
#include <x86intrin.h>
42
#else
43
#include <intrin.h>
44
#endif
45
46
/*
 * Right-rotations applied to every 64-bit lane of a 256-bit vector.
 * Each macro yields a new vector value; the argument is not assigned to.
 */

/* Rotate by 32 bits: exchange the two 32-bit halves of each 64-bit lane. */
#define rotr32(x) _mm256_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1))

/*
 * Rotate by 24 bits (3 bytes) with a byte shuffle: result byte i of each
 * 64-bit lane is source byte (i + 3) mod 8. One row below per 64-bit lane.
 */
#define rotr24(x) \
    _mm256_shuffle_epi8((x), _mm256_setr_epi8( \
         3,  4,  5,  6,  7,  0,  1,  2, \
        11, 12, 13, 14, 15,  8,  9, 10, \
         3,  4,  5,  6,  7,  0,  1,  2, \
        11, 12, 13, 14, 15,  8,  9, 10))

/*
 * Rotate by 16 bits (2 bytes) with a byte shuffle: result byte i of each
 * 64-bit lane is source byte (i + 2) mod 8. One row below per 64-bit lane.
 */
#define rotr16(x) \
    _mm256_shuffle_epi8((x), _mm256_setr_epi8( \
         2,  3,  4,  5,  6,  7,  0,  1, \
        10, 11, 12, 13, 14, 15,  8,  9, \
         2,  3,  4,  5,  6,  7,  0,  1, \
        10, 11, 12, 13, 14, 15,  8,  9))

/*
 * Rotate by 63 bits: (x >> 63) ^ (x + x). Adding x to itself shifts it left
 * by one bit, so the two operands share no set bits and the XOR merges them.
 * Note that the argument is expanded three times.
 */
#define rotr63(x) \
    _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x)))
50
51
/*
 * G1_AVX2 - first half of the mixing step, run on two register sets:
 * (A0, B0, C0, D0) and then (A1, B1, C1, D1).
 *
 * Per 64-bit lane, with lo32() meaning the low 32 bits of a lane:
 *     A = A + B + 2 * lo32(A) * lo32(B);    D = rotr32(D ^ A);
 *     C = C + D + 2 * lo32(C) * lo32(D);    B = rotr24(B ^ C);
 *
 * Every argument is both read and assigned, so each must be a modifiable
 * __m256i lvalue. The expansion declares a local named `ml`.
 *
 * The expansion carries its own trailing ';': BLAKE2_ROUND_1 and
 * BLAKE2_ROUND_2 invoke this macro without one.
 */
#define G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        /* Register set 0. */ \
        __m256i ml = _mm256_mul_epu32(A0, B0); \
        ml = _mm256_add_epi64(ml, ml); \
        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
        D0 = rotr32(_mm256_xor_si256(D0, A0)); \
        \
        ml = _mm256_mul_epu32(C0, D0); \
        ml = _mm256_add_epi64(ml, ml); \
        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
        B0 = rotr24(_mm256_xor_si256(B0, C0)); \
        \
        /* Register set 1: identical steps. */ \
        ml = _mm256_mul_epu32(A1, B1); \
        ml = _mm256_add_epi64(ml, ml); \
        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
        D1 = rotr32(_mm256_xor_si256(D1, A1)); \
        \
        ml = _mm256_mul_epu32(C1, D1); \
        ml = _mm256_add_epi64(ml, ml); \
        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
        B1 = rotr24(_mm256_xor_si256(B1, C1)); \
    } while((void)0, 0);
79
80
/*
 * G2_AVX2 - second half of the mixing step, run on two register sets:
 * (A0, B0, C0, D0) and then (A1, B1, C1, D1).
 *
 * Per 64-bit lane, with lo32() meaning the low 32 bits of a lane:
 *     A = A + B + 2 * lo32(A) * lo32(B);    D = rotr16(D ^ A);
 *     C = C + D + 2 * lo32(C) * lo32(D);    B = rotr63(B ^ C);
 *
 * Every argument is both read and assigned, so each must be a modifiable
 * __m256i lvalue. The expansion declares a local named `ml`.
 *
 * The expansion carries its own trailing ';': BLAKE2_ROUND_1 and
 * BLAKE2_ROUND_2 invoke this macro without one.
 */
#define G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        /* Register set 0. */ \
        __m256i ml = _mm256_mul_epu32(A0, B0); \
        ml = _mm256_add_epi64(ml, ml); \
        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
        D0 = rotr16(_mm256_xor_si256(D0, A0)); \
        \
        ml = _mm256_mul_epu32(C0, D0); \
        ml = _mm256_add_epi64(ml, ml); \
        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
        /* XOR is stored first: rotr63 expands its argument three times. */ \
        B0 = _mm256_xor_si256(B0, C0); \
        B0 = rotr63(B0); \
        \
        /* Register set 1: identical steps. */ \
        ml = _mm256_mul_epu32(A1, B1); \
        ml = _mm256_add_epi64(ml, ml); \
        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
        D1 = rotr16(_mm256_xor_si256(D1, A1)); \
        \
        ml = _mm256_mul_epu32(C1, D1); \
        ml = _mm256_add_epi64(ml, ml); \
        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
        B1 = _mm256_xor_si256(B1, C1); \
        B1 = rotr63(B1); \
    } while((void)0, 0);
106
107
/*
 * DIAGONALIZE_1 - rotate the 64-bit elements inside the B, C and D registers
 * of both register sets. Result element i takes source element:
 *     B: (i + 1) mod 4      C: (i + 2) mod 4      D: (i + 3) mod 4
 *
 * A0 and A1 are accepted but never touched. Note the parameter order
 * (A0, B0, C0, D0, A1, B1, C1, D1), which differs from G1_AVX2 / G2_AVX2.
 * B*, C* and D* are assigned, so they must be modifiable __m256i lvalues.
 *
 * The expansion carries its own trailing ';': BLAKE2_ROUND_1 invokes this
 * macro without one.
 */
#define DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
        B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
        \
        C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
        C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
        \
        D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
        D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
    } while((void)0, 0);
117
118
/*
 * DIAGONALIZE_2 - shuffle 64-bit elements across the two register sets.
 *
 *   B0/B1: blend alternate 64-bit elements of B0 and B1 (mask 0xCC takes
 *          64-bit elements 1 and 3 from the second operand, 0x33 takes
 *          elements 0 and 2), then swap neighbouring 64-bit elements.
 *          The 0xCC blend is stored to B1 and the 0x33 blend to B0.
 *   C0/C1: exchanged.
 *   D0/D1: same blends as B, but the 0xCC blend is stored to D0 and the
 *          0x33 blend to D1.
 *
 * A0 and A1 are accepted but never touched. B*, C* and D* are assigned, so
 * they must be modifiable __m256i lvalues. The expansion declares locals
 * named `tmp1` and `tmp2`.
 *
 * The loop condition is spelled (void)0, 0 to match every other macro in this
 * file (presumably to avoid constant-condition warnings such as MSVC C4127).
 * The expansion carries its own trailing ';': BLAKE2_ROUND_2 invokes this
 * macro without one.
 */
#define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
        __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
        B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2, 3, 0, 1)); \
        B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2, 3, 0, 1)); \
        \
        /* Exchange C0 and C1. */ \
        tmp1 = C0; \
        C0 = C1; \
        C1 = tmp1; \
        \
        tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
        tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2, 3, 0, 1)); \
        D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2, 3, 0, 1)); \
    } while((void)0, 0);
134
135
/*
 * UNDIAGONALIZE_1 - rotate the 64-bit elements inside the B, C and D
 * registers of both register sets. Result element i takes source element:
 *     B: (i + 3) mod 4      C: (i + 2) mod 4      D: (i + 1) mod 4
 * These are the inverse rotations of the ones applied by DIAGONALIZE_1.
 *
 * A0 and A1 are accepted but never touched. Note the parameter order
 * (A0, B0, C0, D0, A1, B1, C1, D1), which differs from G1_AVX2 / G2_AVX2.
 * B*, C* and D* are assigned, so they must be modifiable __m256i lvalues.
 *
 * The expansion carries its own trailing ';': BLAKE2_ROUND_1 invokes this
 * macro without one.
 */
#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
        B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
        \
        C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
        C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
        \
        D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
        D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
    } while((void)0, 0);
145
146
/*
 * UNDIAGONALIZE_2 - shuffle 64-bit elements across the two register sets,
 * undoing the arrangement produced by DIAGONALIZE_2.
 *
 *   B0/B1: blend alternate 64-bit elements of B0 and B1 (mask 0xCC takes
 *          64-bit elements 1 and 3 from the second operand, 0x33 takes
 *          elements 0 and 2), then swap neighbouring 64-bit elements.
 *          The 0xCC blend is stored to B0 and the 0x33 blend to B1.
 *   C0/C1: exchanged.
 *   D0/D1: the 0x33 blend is stored to D0 and the 0xCC blend to D1.
 *
 * A0 and A1 are accepted but never touched. B*, C* and D* are assigned, so
 * they must be modifiable __m256i lvalues. The expansion declares locals
 * named `tmp1` and `tmp2`.
 *
 * The expansion carries its own trailing ';': BLAKE2_ROUND_2 invokes this
 * macro without one.
 */
#define UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        __m256i tmp1; \
        __m256i tmp2; \
        \
        tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
        tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
        B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2, 3, 0, 1)); \
        B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2, 3, 0, 1)); \
        \
        /* Exchange C0 and C1. */ \
        tmp1 = C0; \
        C0 = C1; \
        C1 = tmp1; \
        \
        tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
        tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \
        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2, 3, 0, 1)); \
        D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2, 3, 0, 1)); \
    } while((void)0, 0);
162
163
/*
 * BLAKE2_ROUND_1 - one full round over two register sets:
 *     G1, G2, DIAGONALIZE_1, G1, G2, UNDIAGONALIZE_1
 *
 * The sub-macros are invoked without ';' because each of them already ends
 * its expansion with one. DIAGONALIZE_1 / UNDIAGONALIZE_1 take their
 * arguments in (A0, B0, C0, D0, A1, B1, C1, D1) order, unlike G1 / G2.
 *
 * All arguments are passed on to macros that assign to them, so each must be
 * a modifiable __m256i lvalue.
 */
#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        /* First mixing pass. */ \
        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
        \
        /* Second mixing pass on the rearranged registers. */ \
        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
    } while((void)0, 0);
175
176
/*
 * BLAKE2_ROUND_2 - one full round over two register sets:
 *     G1, G2, DIAGONALIZE_2, G1, G2, UNDIAGONALIZE_2
 *
 * The sub-macros are invoked without ';' because each of them already ends
 * its expansion with one. Every sub-macro here takes its arguments in the
 * same (A0, A1, B0, B1, C0, C1, D0, D1) order.
 *
 * All arguments are passed on to macros that assign to them, so each must be
 * a modifiable __m256i lvalue.
 */
#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        /* First mixing pass. */ \
        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        /* Second mixing pass on the rearranged registers. */ \
        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    } while((void)0, 0);
188
189
#endif
/* BLAKE_ROUND_MKA_OPT_H */
blake2-impl.h
Generated on
for Monero by
1.17.0