/* mpi_sparc.c */
1 /* This Source Code Form is subject to the terms of the Mozilla Public 2 * License, v. 2.0. If a copy of the MPL was not distributed with this 3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 4 5 /* Multiplication performance enhancements for sparc v8+vis CPUs. */ 6 7 #include "mpi-priv.h" 8 #include <stddef.h> 9 #include <sys/systeminfo.h> 10 #include <strings.h> 11 12 /* In the functions below, */ 13 /* vector y must be 8-byte aligned, and n must be even */ 14 /* returns carry out of high order word of result */ 15 /* maximum n is 256 */ 16 17 /* vector x += vector y * scaler a; where y is of length n words. */ 18 extern mp_digit mul_add_inp(mp_digit *x, const mp_digit *y, int n, mp_digit a); 19 20 /* vector z = vector x + vector y * scaler a; where y is of length n words. */ 21 extern mp_digit mul_add(mp_digit *z, const mp_digit *x, const mp_digit *y, 22 int n, mp_digit a); 23 24 /* v8 versions of these functions run on any Sparc v8 CPU. */ 25 26 /* This trick works on Sparc V8 CPUs with the Workshop compilers. 
*/ 27 #define MP_MUL_DxD(a, b, Phi, Plo) \ 28 { \ 29 unsigned long long product = (unsigned long long)a * b; \ 30 Plo = (mp_digit)product; \ 31 Phi = (mp_digit)(product >> MP_DIGIT_BIT); \ 32 } 33 34 /* c = a * b */ 35 static void 36 v8_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) 37 { 38 #if !defined(MP_NO_MP_WORD) 39 mp_digit d = 0; 40 41 /* Inner product: Digits of a */ 42 while (a_len--) { 43 mp_word w = ((mp_word)b * *a++) + d; 44 *c++ = ACCUM(w); 45 d = CARRYOUT(w); 46 } 47 *c = d; 48 #else 49 mp_digit carry = 0; 50 while (a_len--) { 51 mp_digit a_i = *a++; 52 mp_digit a0b0, a1b1; 53 54 MP_MUL_DxD(a_i, b, a1b1, a0b0); 55 56 a0b0 += carry; 57 if (a0b0 < carry) 58 ++a1b1; 59 *c++ = a0b0; 60 carry = a1b1; 61 } 62 *c = carry; 63 #endif 64 } 65 66 /* c += a * b */ 67 static void 68 v8_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) 69 { 70 #if !defined(MP_NO_MP_WORD) 71 mp_digit d = 0; 72 73 /* Inner product: Digits of a */ 74 while (a_len--) { 75 mp_word w = ((mp_word)b * *a++) + *c + d; 76 *c++ = ACCUM(w); 77 d = CARRYOUT(w); 78 } 79 *c = d; 80 #else 81 mp_digit carry = 0; 82 while (a_len--) { 83 mp_digit a_i = *a++; 84 mp_digit a0b0, a1b1; 85 86 MP_MUL_DxD(a_i, b, a1b1, a0b0); 87 88 a0b0 += carry; 89 if (a0b0 < carry) 90 ++a1b1; 91 a0b0 += a_i = *c; 92 if (a0b0 < a_i) 93 ++a1b1; 94 *c++ = a0b0; 95 carry = a1b1; 96 } 97 *c = carry; 98 #endif 99 } 100 101 /* Presently, this is only used by the Montgomery arithmetic code. 
*/ 102 /* c += a * b */ 103 static void 104 v8_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) 105 { 106 #if !defined(MP_NO_MP_WORD) 107 mp_digit d = 0; 108 109 /* Inner product: Digits of a */ 110 while (a_len--) { 111 mp_word w = ((mp_word)b * *a++) + *c + d; 112 *c++ = ACCUM(w); 113 d = CARRYOUT(w); 114 } 115 116 while (d) { 117 mp_word w = (mp_word)*c + d; 118 *c++ = ACCUM(w); 119 d = CARRYOUT(w); 120 } 121 #else 122 mp_digit carry = 0; 123 while (a_len--) { 124 mp_digit a_i = *a++; 125 mp_digit a0b0, a1b1; 126 127 MP_MUL_DxD(a_i, b, a1b1, a0b0); 128 129 a0b0 += carry; 130 if (a0b0 < carry) 131 ++a1b1; 132 133 a0b0 += a_i = *c; 134 if (a0b0 < a_i) 135 ++a1b1; 136 137 *c++ = a0b0; 138 carry = a1b1; 139 } 140 while (carry) { 141 mp_digit c_i = *c; 142 carry += c_i; 143 *c++ = carry; 144 carry = carry < c_i; 145 } 146 #endif 147 } 148 149 /* These functions run only on v8plus+vis or v9+vis CPUs. */ 150 151 /* c = a * b */ 152 void 153 s_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) 154 { 155 mp_digit d; 156 mp_digit x[258]; 157 if (a_len <= 256) { 158 if (a == c || ((ptrdiff_t)a & 0x7) != 0 || (a_len & 1) != 0) { 159 mp_digit *px; 160 px = (((ptrdiff_t)x & 0x7) != 0) ? x + 1 : x; 161 memcpy(px, a, a_len * sizeof(*a)); 162 a = px; 163 if (a_len & 1) { 164 px[a_len] = 0; 165 } 166 } 167 s_mp_setz(c, a_len + 1); 168 d = mul_add_inp(c, a, a_len, b); 169 c[a_len] = d; 170 } else { 171 v8_mpv_mul_d(a, a_len, b, c); 172 } 173 } 174 175 /* c += a * b, where a is a_len words long. */ 176 void 177 s_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) 178 { 179 mp_digit d; 180 mp_digit x[258]; 181 if (a_len <= 256) { 182 if (((ptrdiff_t)a & 0x7) != 0 || (a_len & 1) != 0) { 183 mp_digit *px; 184 px = (((ptrdiff_t)x & 0x7) != 0) ? 
x + 1 : x; 185 memcpy(px, a, a_len * sizeof(*a)); 186 a = px; 187 if (a_len & 1) { 188 px[a_len] = 0; 189 } 190 } 191 d = mul_add_inp(c, a, a_len, b); 192 c[a_len] = d; 193 } else { 194 v8_mpv_mul_d_add(a, a_len, b, c); 195 } 196 } 197 198 /* c += a * b, where a is y words long. */ 199 void 200 s_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) 201 { 202 mp_digit d; 203 mp_digit x[258]; 204 if (a_len <= 256) { 205 if (((ptrdiff_t)a & 0x7) != 0 || (a_len & 1) != 0) { 206 mp_digit *px; 207 px = (((ptrdiff_t)x & 0x7) != 0) ? x + 1 : x; 208 memcpy(px, a, a_len * sizeof(*a)); 209 a = px; 210 if (a_len & 1) { 211 px[a_len] = 0; 212 } 213 } 214 d = mul_add_inp(c, a, a_len, b); 215 if (d) { 216 c += a_len; 217 do { 218 mp_digit sum = d + *c; 219 *c++ = sum; 220 d = sum < d; 221 } while (d); 222 } 223 } else { 224 v8_mpv_mul_d_add_prop(a, a_len, b, c); 225 } 226 }