From 49c96d9f54cb4ecf9c12791caada9b06e0bb3103 Mon Sep 17 00:00:00 2001 From: dignifiedquire Date: Fri, 31 Jan 2020 18:42:18 +0100 Subject: [PATCH 01/11] feat: experimental assembly support (cherry picked from commit ffc03eb7592daf97de123740b07e822bd54bcd2f) --- ff_derive/src/lib.rs | 275 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 270 insertions(+), 5 deletions(-) diff --git a/ff_derive/src/lib.rs b/ff_derive/src/lib.rs index f522c22..9ce7755 100644 --- a/ff_derive/src/lib.rs +++ b/ff_derive/src/lib.rs @@ -807,6 +807,203 @@ fn prime_field_impl( a: proc_macro2::TokenStream, b: proc_macro2::TokenStream, limbs: usize, + ) -> proc_macro2::TokenStream { + if limbs == 4 && cfg!(target_arch = "x86_64") { + mul_impl_asm4(a, b) + } else { + mul_impl_default(a, b, limbs) + } + } + + fn mul_impl_asm4( + a: proc_macro2::TokenStream, + b: proc_macro2::TokenStream, + ) -> proc_macro2::TokenStream { + // x86_64 asm for four limbs + + let mut gen = proc_macro2::TokenStream::new(); + gen.extend(quote!{ + // println!("multiply before {:?} {:?}", (#a.0).0, (#b.0).0); + // println!("foo"); + // Can remove all #b xor rax, rax; however see a minor perf hit due to false flag dependencies. + unsafe { + asm!( + "# ============ PLEASE STAY \n\ + xor rax, rax \n\ + push rbp \n\ + mov rdx, [rsi + 8*0] \n\ + mulx r9, r8, [rdi + 8*0] \n\ + mulx r10, rbx, [rdi + 8*1] \n\ + adcx r9, rbx \n\ + mulx r11, rbx, [rdi + 8*2] \n\ + adcx r10, rbx \n\ + mulx r12, rbx, [rdi + 8*3] \n\ + adcx r11, rbx \n\ + adcx r12, rax \n\ + xor rax, rax \n\ + mov rdx, [rsi + 8*1] \n\ + mulx rbx, rbp, [rdi + 8*0] \n\ + adcx r9, rbp \n\ + adox r10, rbx \n\ + mulx rbx, rbp, [rdi + 8*1] \n\ + adcx r10, rbp \n\ + adox r11, rbx \n\ + mulx rbx, rbp, [rdi + 8*2] \n\ + adcx r11, rbp \n\ + adox r12, rbx \n\ + mulx r13, rbp, [rdi + 8*3] \n\ + adcx r12, rbp \n\ + adox r13, rax \n\ + adcx r13, rax \n\ + xor rax, rax \n\ + mov rdx, [rsi + 8*2] \n\ + mulx rbx, rbp, [rdi + 8*0] \n\ + adcx r10, rbp \n\ + adox r11, rbx \n\ + mulx rbx, rbp, [rdi + 8*1] \n\ + adcx r11, rbp \n\ + adox r12, rbx \n\ + mulx rbx, rbp, [rdi + 8*2] \n\ + adcx r12, rbp \n\ + adox r13, rbx \n\ + mulx r14, rbp, [rdi + 8*3] \n\ + adcx r13, rbp \n\ + adox r14, rax \n\ + adcx r14, rax \n\ + xor rax, rax \n\ + mov rdx, [rsi + 8*3] \n\ + mulx rbx, rbp, [rdi + 8*0] \n\ + adcx r11, rbp \n\ + adox r12, rbx \n\ + mulx rbx, rbp, [rdi + 8*1] \n\ + adcx r12, rbp \n\ + adox r13, rbx \n\ + mulx rbx, rbp, [rdi + 8*2] \n\ + adcx r13, rbp \n\ + adox r14, rbx \n\ + mulx r15, rbp, [rdi + 8*3] \n\ + adcx r14, rbp \n\ + adox r15, rax \n\ + adcx r15, rax \n\ + xor rax, rax \n\ + mov rdx, -4294967297 \n\ + mulx rbp, rdx, r8 \n\ + mov rcx, 18446744069414584321 \n\ + mulx rbx, rbp, rcx \n\ + adox r8, rbp \n\ + adcx r9, rbx \n\ + mov rcx, 6034159408538082302 \n\ + mulx rbx, rbp, rcx \n\ + adox r9, rbp \n\ + adcx r10, rbx \n\ + mov rcx, 3691218898639771653 \n\ + mulx rbx, rbp, rcx \n\ + adox r10, rbp \n\ + adcx r11, rbx \n\ + mov r8, 8353516859464449352 \n\ + mulx rbx, rbp, r8 \n\ + adox r11, rbp \n\ + adcx r12, rbx \n\ + adox r12, rax \n\ + adcx r13, rax \n\ + adox r13, rax \n\ + adcx r14, rax \n\ + adox r14, rax \n\ + adcx r15, rax \n\ + adox r15, rax \n\ + mov rdx, -4294967297 \n\ + mulx rbp, rdx, r9 \n\ + mov rcx, 18446744069414584321 \n\ + mulx rbx, rbp, rcx \n\ + adox r9, rbp \n\ + adcx r10, rbx \n\ + mov rcx, 6034159408538082302 \n\ + mulx rbx, rbp, rcx \n\ + adox r10, rbp \n\ + adcx r11, rbx \n\ + mov r9, 3691218898639771653 \n\ + mulx rbx, rbp, r9 \n\ + adox r11, rbp \n\ + adcx r12, rbx \n\ + mulx rbx, rbp, r8 \n\ + adox r12, rbp \n\ + adcx r13, rbx \n\ + adox r13, rax \n\ + adcx r14, rax \n\ + adox r14, rax \n\ + adcx r15, rax \n\ + adox r15, rax \n\ + mov rdx, -4294967297 \n\ + mulx rbp, rdx, r10 \n\ + mov rcx, 18446744069414584321 \n\ + mulx rbx, rbp, rcx \n\ + adox r10, rbp \n\ + adcx r11, rbx \n\ + mov r10, 6034159408538082302 \n\ + mulx rbx, rbp, r10 \n\ + adox r11, rbp \n\ + adcx r12, rbx \n\ + mulx rbx, rbp, r9 \n\ + adox r12, rbp \n\ + adcx r13, rbx \n\ + mulx rbx, rbp, r8 \n\ + adox r13, rbp \n\ + adcx r14, rbx \n\ + adox r14, rax \n\ + adcx r15, rax \n\ + adox r15, rax \n\ + mov rdx, -4294967297 \n\ + mulx rbp, rdx, r11 \n\ + mulx rbx, rbp, rcx \n\ + adox r11, rbp \n\ + adcx r12, rbx \n\ + mulx rbx, rbp, r10 \n\ + adox r12, rbp \n\ + mov [rdi + 8*0], r12 \n\ + adcx r13, rbx \n\ + mulx rbx, rbp, r9 \n\ + adox r13, rbp \n\ + mov [rdi + 8*1], r13 \n\ + adcx r14, rbx \n\ + mulx rbx, rbp, r8 \n\ + adox r14, rbp \n\ + mov [rdi + 8*2], r14 \n\ + adcx r15, rbx \n\ + adox r15, rax \n\ + mov [rdi + 8*3], r15 \n\ + pop rbp \n\ + sub r12, rcx \n\ + sbb r13, r10 \n\ + sbb r14, r9 \n\ + sbb r15, r8 \n\ + jb .L1${:uid} \n\ + mov [rdi + 8*1], r13 \n\ + mov [rdi + 8*0], r12 \n\ + mov [rdi + 8*2], r14 \n\ + mov [rdi + 8*3], r15 \n\ + .L1${:uid}: \n" + : "=&{rdi}"(&((#a.0).0[0])) + : "{rdi}"(&((#a.0).0[0])), "{rsi}"(&((#b.0).0[0])) + : "rax", "rdx", "rbp", "rbx", "rcx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" + : "intel", "volatile" + ); + } + + // with printouts before an after this works + // println!("multiply after {:?} {:?}", (#a.0).0, (#b.0).0); + + // if (#a.0).0[0] == 0 { + // println!("low result is 0 {:?}", (#a.0).0); + // } + }); + + gen + } + + fn mul_impl_default( + a: proc_macro2::TokenStream, + b: proc_macro2::TokenStream, + limbs: usize, ) -> proc_macro2::TokenStream { let mut gen = proc_macro2::TokenStream::new(); @@ -1053,11 +1250,79 @@ fn prime_field_impl( impl<'r> ::core::ops::AddAssign<&'r #name> for #name { #[inline] fn add_assign(&mut self, other: &#name) { - // This cannot exceed the backing capacity. - self.add_nocarry(other); - - // However, it may need to be reduced. - self.reduce(); + if #limbs == 4 && cfg!(target_arch = "x86_64") { + // This cannot exceed the backing capacity. + use std::arch::x86_64::*; + use std::mem; + + unsafe { + let mut carry = _addcarry_u64( + 0, + (self.0).0[0], + (other.0).0[0], + &mut (self.0).0[0] + ); + carry = _addcarry_u64( + carry, (self.0).0[1], + (other.0).0[1], + &mut (self.0).0[1] + ); + carry = _addcarry_u64( + carry, (self.0).0[2], + (other.0).0[2], + &mut (self.0).0[2] + ); + _addcarry_u64( + carry, + (self.0).0[3], + (other.0).0[3], + &mut (self.0).0[3] + ); + + let mut s_sub: [u64; 4] = mem::uninitialized(); + + carry = _subborrow_u64( + 0, + (self.0).0[0], + MODULUS.0[0], + &mut s_sub[0] + ); + carry = _subborrow_u64( + carry, + (self.0).0[1], + MODULUS.0[1], + &mut s_sub[1] + ); + carry = _subborrow_u64( + carry, + (self.0).0[2], + MODULUS.0[2], + &mut s_sub[2] + ); + carry = _subborrow_u64( + carry, + (self.0).0[3], + MODULUS.0[3], + &mut s_sub[3] + ); + + if carry == 0 { + // Direct assign fails since size can be 4 or 6 + // Obviously code doesn't work at all for size 6 + // (self.0).0 = s_sub; + (self.0).0[0] = s_sub[0]; + (self.0).0[1] = s_sub[1]; + (self.0).0[2] = s_sub[2]; + (self.0).0[3] = s_sub[3]; + } + } + } else { + // This cannot exceed the backing capacity. + self.0.add_nocarry(&other.0); + + // However, it may need to be reduced. + self.reduce(); + } } } From 67d5fff6a74506262b489f95e3df058317f33493 Mon Sep 17 00:00:00 2001 From: dignifiedquire Date: Fri, 31 Jan 2020 23:00:03 +0100 Subject: [PATCH 02/11] switch to linked asm (cherry picked from commit 2561b7f6ec67910004f1b0c12835c580e48ab2bf) --- Cargo.toml | 3 + asm/mul_4.S | 284 +++++++++++++++++++++++++++++++++++++++++++ build.rs | 11 ++ ff_derive/src/lib.rs | 183 +--------------------------- src/asm.rs | 45 +++++++ src/lib.rs | 7 ++ 6 files changed, 356 insertions(+), 177 deletions(-) create mode 100644 asm/mul_4.S create mode 100644 build.rs create mode 100644 src/asm.rs diff --git a/Cargo.toml b/Cargo.toml index 0affa9a..3d4ae89 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,6 +20,9 @@ ff_derive = { version = "0.8", path = "ff_derive", optional = true } rand_core = { version = "0.6", default-features = false } subtle = { version = "2.2.1", default-features = false, features = ["i128"] } +[build-dependencies] +cc = "1.0.50" + [features] default = ["bits", "std"] bits = ["bitvec"] diff --git a/asm/mul_4.S b/asm/mul_4.S new file mode 100644 index 0000000..c2b8108 --- /dev/null +++ b/asm/mul_4.S @@ -0,0 +1,284 @@ +.macro mul_256 a b + xor %rax, %rax + mov 0x00\a, %rdx + mulx 0x00\b, %r8, %r9 + mulx 0x08\b, %rbx, %r10 + adcx %rbx, %r9 + mulx 0x10\b, %rbx, %r11 + adcx %rbx, %r10 + mulx 0x18\b, %rbx, %r12 + adcx %rbx, %r11 + adcx %rax, %r12 + xor %rax, %rax + mov 0x08\a, %rdx + mulx 0x00\b, %rbp, %rbx + adcx %rbp, %r9 + adox %rbx, %r10 + mulx 0x08\b, %rbp, %rbx + adcx %rbp, %r10 + adox %rbx, %r11 + mulx 0x10\b, %rbp, %rbx + adcx %rbp, %r11 + adox %rbx, %r12 + mulx 0x18\b, %rbp, %r13 + adcx %rbp, %r12 + adox %rax, %r13 + adcx %rax, %r13 + xor %rax, %rax + mov 0x10\a, %rdx + mulx 0x00\b, %rbp, %rbx + adcx %rbp, %r10 + adox %rbx, %r11 + mulx 0x08\b, %rbp, %rbx + adcx %rbp, %r11 + adox %rbx, %r12 + mulx 0x10\b, %rbp, %rbx + adcx %rbp, %r12 + adox %rbx, %r13 + mulx 0x18\b, %rbp, %r14 + adcx %rbp, %r13 + adox %rax, %r14 + adcx %rax, %r14 + xor %rax, %rax + mov 0x18\a, %rdx + mulx 0x00\b, %rbp, %rbx + adcx %rbp, %r11 + adox %rbx, %r12 + mulx 0x08\b, %rbp, %rbx + adcx %rbp, %r12 + adox %rbx, %r13 + mulx 0x10\b, %rbp, %rbx + adcx %rbp, %r13 + adox %rbx, %r14 + mulx 0x18\b, %rbp, %r15 + adcx %rbp, %r14 + adox %rax, %r15 + adcx %rax, %r15 +.endm + +.macro red_256 res + mov %rsi, 0x200(%rsp) + lea .LM(%rip), %rsi + mov %r13, 0x1e8(%rsp) + mov %r14, 0x1f0(%rsp) + mov %r15, 0x1f8(%rsp) + xor %r13, %r13 + xor %r14, %r14 + xor %r15, %r15 + xor %rax, %rax + mov 0x300(%rsi), %rdx + mulx %r8, %rdx, %rbp + mulx 0x40(%rsi), %rbp, %rbx + adox %rbp, %r8 + adcx %rbx, %r9 + mulx 0x48(%rsi), %rbp, %rbx + adox %rbp, %r9 + adcx %rbx, %r10 + mulx 0x50(%rsi), %rbp, %rbx + adox %rbp, %r10 + adcx %rbx, %r11 + mulx 0x58(%rsi), %rbp, %rbx + adox %rbp, %r11 + adcx %rbx, %r12 + adox %rax, %r12 + adcx %rax, %r13 + adox %rax, %r13 + mov 0x300(%rsi), %rdx + mulx %r9, %rdx, %rbp + mulx 0x40(%rsi), %rbp, %rbx + adox %rbp, %r9 + adcx %rbx, %r10 + mulx 0x48(%rsi), %rbp, %rbx + adox %rbp, %r10 + adcx %rbx, %r11 + mulx 0x50(%rsi), %rbp, %rbx + adox %rbp, %r11 + adcx %rbx, %r12 + mulx 0x58(%rsi), %rbp, %rbx + adox %rbp, %r12 + adcx %rbx, %r13 + adox 0x1e8(%rsp), %r13 + adcx %rax, %r14 + adox %rax, %r14 + mov 0x300(%rsi), %rdx + mulx %r10, %rdx, %rbp + mulx 0x40(%rsi), %rbp, %rbx + adox %rbp, %r10 + adcx %rbx, %r11 + mulx 0x48(%rsi), %rbp, %rbx + adox %rbp, %r11 + adcx %rbx, %r12 + mulx 0x50(%rsi), %rbp, %rbx + adox %rbp, %r12 + adcx %rbx, %r13 + mulx 0x58(%rsi), %rbp, %rbx + adox %rbp, %r13 + adcx %rbx, %r14 + adox 0x1f0(%rsp), %r14 + adcx %rax, %r15 + adox %rax, %r15 + mov 0x300(%rsi), %rdx + mulx %r11, %rdx, %rbp + mulx 0x40(%rsi), %rbp, %rbx + adox %rbp, %r11 + adcx %rbx, %r12 + mulx 0x48(%rsi), %rbp, %rbx + adox %rbp, %r12 + adcx %rbx, %r13 + mulx 0x50(%rsi), %rbp, %rbx + adox %rbp, %r13 + adcx %rbx, %r14 + mulx 0x58(%rsi), %rbp, %rbx + adox %rbp, %r14 + adcx %rbx, %r15 + adox 0x1f8(%rsp), %r15 + adcx %rax, %r8 + adox %rax, %r8 + shl $0x3, %r8 + sub 0x00(%rsi,%r8,1), %r12 + sbb 0x08(%r8,%rsi,1), %r13 + sbb 0x10(%r8,%rsi,1), %r14 + sbb 0x18(%r8,%rsi,1), %r15 + mov 0x200(%rsp), %rsi + mov %r12, 0x00\res + mov %r13, 0x08\res + mov %r14, 0x10\res + mov %r15, 0x18\res +.endm + +.macro mod_mul_256 a b res + mul_256 \a, \b + red_256 \res +.endm + + +.LM: + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0xffffffff00000001 + .quad 0x53bda402fffe5bfe + .quad 0x3339d80809a1d805 + .quad 0x73eda753299d7d48 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0xfffffffe00000002 + .quad 0xa77b4805fffcb7fd + .quad 0x6673b0101343b00a + .quad 0xe7db4ea6533afa90 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0xfffffffd00000003 + .quad 0xfb38ec08fffb13fc + .quad 0x99ad88181ce5880f + .quad 0x5bc8f5f97cd877d8 + .quad 0x0000000000000001 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0xfffffffc00000004 + .quad 0x4ef6900bfff96ffb + .quad 0xcce7602026876015 + .quad 0xcfb69d4ca675f520 + .quad 0x0000000000000001 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0xfffffffb00000005 + .quad 0xa2b4340efff7cbfa + .quad 0x2138283029381a + .quad 0x43a4449fd0137269 + .quad 0x0000000000000002 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0xfffffffa00000006 + .quad 0xf671d811fff627f9 + .quad 0x335b103039cb101f + .quad 0xb791ebf2f9b0efb1 + .quad 0x0000000000000002 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0xfffffff900000007 + .quad 0x4a2f7c14fff483f8 + .quad 0x6694e838436ce825 + .quad 0x2b7f9346234e6cf9 + .quad 0x0000000000000003 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0xffffffff00000001 + .quad 0x53bda402fffe5bfe + .quad 0x3339d80809a1d805 + .quad 0x73eda753299d7d48 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0xfffffffe00000002 + .quad 0xa77b4805fffcb7fd + .quad 0x6673b0101343b00a + .quad 0xe7db4ea6533afa90 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0xfffffffe00000002 + .quad 0xa77b4805fffcb7fd + .quad 0x6673b0101343b00a + .quad 0xe7db4ea6533afa90 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0x0000000000000000 + .quad 0xfffffffeffffffff + +#ifdef __APPLE__ +.global _mod_mul_4w +_mod_mul_4w: +#else +.global mod_mul_4w +mod_mul_4w: +#endif + // p1 = rdi + // p2 = rsi + // result = rdx + push %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + sub $8*65, %rsp + mov %rdx, %rcx // rcx = result + // p1 * p2 + mod_mul_256 (%rdi), (%rsi), (%rcx) + //mov (%rcx), %rdx + //add $100, %rdx + //mov %rdx, (%rcx) + add $8*65, %rsp + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + pop %rbp + ret diff --git a/build.rs b/build.rs new file mode 100644 index 0000000..435e0b6 --- /dev/null +++ b/build.rs @@ -0,0 +1,11 @@ +#[cfg(target_arch = "x86_64")] +extern crate cc; + +fn main() { + if cfg!(target_arch = "x86_64") { + cc::Build::new() + .flag("-c") + .file("./asm/mul_4.S") + .compile("libff-derive-crypto.a"); + } +} diff --git a/ff_derive/src/lib.rs b/ff_derive/src/lib.rs index 9ce7755..8e454eb 100644 --- a/ff_derive/src/lib.rs +++ b/ff_derive/src/lib.rs @@ -822,179 +822,8 @@ fn prime_field_impl( // x86_64 asm for four limbs let mut gen = proc_macro2::TokenStream::new(); - gen.extend(quote!{ - // println!("multiply before {:?} {:?}", (#a.0).0, (#b.0).0); - // println!("foo"); - // Can remove all #b xor rax, rax; however see a minor perf hit due to false flag dependencies. - unsafe { - asm!( - "# ============ PLEASE STAY \n\ - xor rax, rax \n\ - push rbp \n\ - mov rdx, [rsi + 8*0] \n\ - mulx r9, r8, [rdi + 8*0] \n\ - mulx r10, rbx, [rdi + 8*1] \n\ - adcx r9, rbx \n\ - mulx r11, rbx, [rdi + 8*2] \n\ - adcx r10, rbx \n\ - mulx r12, rbx, [rdi + 8*3] \n\ - adcx r11, rbx \n\ - adcx r12, rax \n\ - xor rax, rax \n\ - mov rdx, [rsi + 8*1] \n\ - mulx rbx, rbp, [rdi + 8*0] \n\ - adcx r9, rbp \n\ - adox r10, rbx \n\ - mulx rbx, rbp, [rdi + 8*1] \n\ - adcx r10, rbp \n\ - adox r11, rbx \n\ - mulx rbx, rbp, [rdi + 8*2] \n\ - adcx r11, rbp \n\ - adox r12, rbx \n\ - mulx r13, rbp, [rdi + 8*3] \n\ - adcx r12, rbp \n\ - adox r13, rax \n\ - adcx r13, rax \n\ - xor rax, rax \n\ - mov rdx, [rsi + 8*2] \n\ - mulx rbx, rbp, [rdi + 8*0] \n\ - adcx r10, rbp \n\ - adox r11, rbx \n\ - mulx rbx, rbp, [rdi + 8*1] \n\ - adcx r11, rbp \n\ - adox r12, rbx \n\ - mulx rbx, rbp, [rdi + 8*2] \n\ - adcx r12, rbp \n\ - adox r13, rbx \n\ - mulx r14, rbp, [rdi + 8*3] \n\ - adcx r13, rbp \n\ - adox r14, rax \n\ - adcx r14, rax \n\ - xor rax, rax \n\ - mov rdx, [rsi + 8*3] \n\ - mulx rbx, rbp, [rdi + 8*0] \n\ - adcx r11, rbp \n\ - adox r12, rbx \n\ - mulx rbx, rbp, [rdi + 8*1] \n\ - adcx r12, rbp \n\ - adox r13, rbx \n\ - mulx rbx, rbp, [rdi + 8*2] \n\ - adcx r13, rbp \n\ - adox r14, rbx \n\ - mulx r15, rbp, [rdi + 8*3] \n\ - adcx r14, rbp \n\ - adox r15, rax \n\ - adcx r15, rax \n\ - xor rax, rax \n\ - mov rdx, -4294967297 \n\ - mulx rbp, rdx, r8 \n\ - mov rcx, 18446744069414584321 \n\ - mulx rbx, rbp, rcx \n\ - adox r8, rbp \n\ - adcx r9, rbx \n\ - mov rcx, 6034159408538082302 \n\ - mulx rbx, rbp, rcx \n\ - adox r9, rbp \n\ - adcx r10, rbx \n\ - mov rcx, 3691218898639771653 \n\ - mulx rbx, rbp, rcx \n\ - adox r10, rbp \n\ - adcx r11, rbx \n\ - mov r8, 8353516859464449352 \n\ - mulx rbx, rbp, r8 \n\ - adox r11, rbp \n\ - adcx r12, rbx \n\ - adox r12, rax \n\ - adcx r13, rax \n\ - adox r13, rax \n\ - adcx r14, rax \n\ - adox r14, rax \n\ - adcx r15, rax \n\ - adox r15, rax \n\ - mov rdx, -4294967297 \n\ - mulx rbp, rdx, r9 \n\ - mov rcx, 18446744069414584321 \n\ - mulx rbx, rbp, rcx \n\ - adox r9, rbp \n\ - adcx r10, rbx \n\ - mov rcx, 6034159408538082302 \n\ - mulx rbx, rbp, rcx \n\ - adox r10, rbp \n\ - adcx r11, rbx \n\ - mov r9, 3691218898639771653 \n\ - mulx rbx, rbp, r9 \n\ - adox r11, rbp \n\ - adcx r12, rbx \n\ - mulx rbx, rbp, r8 \n\ - adox r12, rbp \n\ - adcx r13, rbx \n\ - adox r13, rax \n\ - adcx r14, rax \n\ - adox r14, rax \n\ - adcx r15, rax \n\ - adox r15, rax \n\ - mov rdx, -4294967297 \n\ - mulx rbp, rdx, r10 \n\ - mov rcx, 18446744069414584321 \n\ - mulx rbx, rbp, rcx \n\ - adox r10, rbp \n\ - adcx r11, rbx \n\ - mov r10, 6034159408538082302 \n\ - mulx rbx, rbp, r10 \n\ - adox r11, rbp \n\ - adcx r12, rbx \n\ - mulx rbx, rbp, r9 \n\ - adox r12, rbp \n\ - adcx r13, rbx \n\ - mulx rbx, rbp, r8 \n\ - adox r13, rbp \n\ - adcx r14, rbx \n\ - adox r14, rax \n\ - adcx r15, rax \n\ - adox r15, rax \n\ - mov rdx, -4294967297 \n\ - mulx rbp, rdx, r11 \n\ - mulx rbx, rbp, rcx \n\ - adox r11, rbp \n\ - adcx r12, rbx \n\ - mulx rbx, rbp, r10 \n\ - adox r12, rbp \n\ - mov [rdi + 8*0], r12 \n\ - adcx r13, rbx \n\ - mulx rbx, rbp, r9 \n\ - adox r13, rbp \n\ - mov [rdi + 8*1], r13 \n\ - adcx r14, rbx \n\ - mulx rbx, rbp, r8 \n\ - adox r14, rbp \n\ - mov [rdi + 8*2], r14 \n\ - adcx r15, rbx \n\ - adox r15, rax \n\ - mov [rdi + 8*3], r15 \n\ - pop rbp \n\ - sub r12, rcx \n\ - sbb r13, r10 \n\ - sbb r14, r9 \n\ - sbb r15, r8 \n\ - jb .L1${:uid} \n\ - mov [rdi + 8*1], r13 \n\ - mov [rdi + 8*0], r12 \n\ - mov [rdi + 8*2], r14 \n\ - mov [rdi + 8*3], r15 \n\ - .L1${:uid}: \n" - : "=&{rdi}"(&((#a.0).0[0])) - : "{rdi}"(&((#a.0).0[0])), "{rsi}"(&((#b.0).0[0])) - : "rax", "rdx", "rbp", "rbx", "rcx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" - : "intel", "volatile" - ); - } - - // with printouts before an after this works - // println!("multiply after {:?} {:?}", (#a.0).0, (#b.0).0); - - // if (#a.0).0[0] == 0 { - // println!("low result is 0 {:?}", (#a.0).0); - // } + gen.extend(quote! { + ::ff::mod_mul_4w_assign(&mut (#a.0).0, &(#b.0).0); }); gen @@ -1278,9 +1107,9 @@ fn prime_field_impl( (other.0).0[3], &mut (self.0).0[3] ); - + let mut s_sub: [u64; 4] = mem::uninitialized(); - + carry = _subborrow_u64( 0, (self.0).0[0], @@ -1305,7 +1134,7 @@ fn prime_field_impl( MODULUS.0[3], &mut s_sub[3] ); - + if carry == 0 { // Direct assign fails since size can be 4 or 6 // Obviously code doesn't work at all for size 6 @@ -1319,7 +1148,7 @@ fn prime_field_impl( } else { // This cannot exceed the backing capacity. self.0.add_nocarry(&other.0); - + // However, it may need to be reduced. self.reduce(); } diff --git a/src/asm.rs b/src/asm.rs new file mode 100644 index 0000000..bf01409 --- /dev/null +++ b/src/asm.rs @@ -0,0 +1,45 @@ +#[link(name = "ff-derive-crypto", kind = "static")] +extern "C" { + fn mod_mul_4w(a: &[u64; 4], b: &[u64; 4], res: &mut [u64; 4]); +} + +pub fn mod_mul_4w_assign(a: &mut [u64; 4], b: &[u64; 4]) { + let mut res = [0; 4]; + unsafe { + mod_mul_4w(&*a, b, &mut res); + } + std::mem::replace(a, res); +} + +#[cfg(test)] +mod tests { + use super::*; + + use rand_core::SeedableRng; + + #[test] + fn test_mod_mul() { + let mut x: [u64; 4] = [ + 7665858810281813592, + 16340119633057872346, + 4817051413996267933, + 2960177199463250197, + ]; + let y: [u64; 4] = [ + 12935154801682980781, + 13314970078575206070, + 2674023185838267390, + 551755778115450960, + ]; + let exp: [u64; 4] = [ + 12035708911089303301, + 16867479803567096087, + 8918020714254073494, + 3250221169924948371, + ]; + + mod_mul_4w_assign(&mut x, &y); + + assert_eq!(x[0..4], exp[0..4], "\nMod Mul error\n"); + } +} diff --git a/src/lib.rs b/src/lib.rs index ca18d3c..008bce1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -10,6 +10,12 @@ #[cfg_attr(docsrs, doc(cfg(feature = "derive")))] pub use ff_derive::PrimeField; +#[cfg(target_arch = "x86_64")] +mod asm; + +#[cfg(target_arch = "x86_64")] +pub use asm::mod_mul_4w_assign; + #[cfg(feature = "bits")] #[cfg_attr(docsrs, doc(cfg(feature = "bits")))] pub use bitvec::view::BitViewSized; @@ -18,6 +24,7 @@ pub use bitvec::view::BitViewSized; use bitvec::{array::BitArray, order::Lsb0}; use core::fmt; use core::ops::{Add, AddAssign, Mul, MulAssign, Neg, Sub, SubAssign}; + use rand_core::RngCore; use subtle::{ConditionallySelectable, CtOption}; From db32ef5a9239111f49681c7b2e6f8f8c910addf2 Mon Sep 17 00:00:00 2001 From: dignifiedquire Date: Mon, 3 Feb 2020 15:52:54 +0100 Subject: [PATCH 03/11] use fixed asm (cherry picked from commit ba0fb171afddd4ec9495d457df52b5a077f1fd60) --- asm/mul_4.S | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/asm/mul_4.S b/asm/mul_4.S index c2b8108..ffecabf 100644 --- a/asm/mul_4.S +++ b/asm/mul_4.S @@ -151,6 +151,32 @@ red_256 \res .endm +.macro red_below_m a name + mov 0x18\a, %r12 + shr $0x38, %r12 + and $0xC0, %r12 + mov 0x00\a, %r8 + sub 0x200(%rdx,%r12,1), %r8 + mov 0x08\a, %r9 + sbb 0x208(%rdx,%r12,1), %r9 + mov 0x10\a, %r10 + sbb 0x210(%rdx,%r12,1), %r10 + mov 0x18\a, %r11 + sbb 0x218(%rdx,%r12,1), %r11 + + jnc .Lred_done\name + + add 0x240(%rdx), %r8 + adc 0x248(%rdx), %r9 + adc 0x250(%rdx), %r10 + adc 0x258(%rdx), %r11 + +.Lred_done\name: + mov %r8, 0x00\a + mov %r9, 0x08\a + mov %r10, 0x10\a + mov %r11, 0x18\a +.endm .LM: .quad 0x0000000000000000 @@ -258,8 +284,8 @@ _mod_mul_4w: .global mod_mul_4w mod_mul_4w: #endif - // p1 = rdi - // p2 = rsi + // x = rdi + // y = rsi // result = rdx push %rbp push %rbx @@ -269,11 +295,12 @@ mod_mul_4w: push %r15 sub $8*65, %rsp mov %rdx, %rcx // rcx = result - // p1 * p2 + + // x * y mod_mul_256 (%rdi), (%rsi), (%rcx) - //mov (%rcx), %rdx - //add $100, %rdx - //mov %rdx, (%rcx) + lea .LM(%rip), %rdx + red_below_m (%rcx), mm + add $8*65, %rsp pop %r15 pop %r14 From da3e47cab17f4686a161e19ed17312c404119bd8 Mon Sep 17 00:00:00 2001 From: dignifiedquire Date: Wed, 5 Feb 2020 19:18:37 +0100 Subject: [PATCH 04/11] updated assembly (cherry picked from commit b3cba18751677cc92329e45fe98b9d7e1552b7b8) --- asm/mul_4.S | 222 +++++++++++-------------------------------- ff_derive/src/lib.rs | 1 + 2 files changed, 58 insertions(+), 165 deletions(-) diff --git a/asm/mul_4.S b/asm/mul_4.S index ffecabf..9220d71 100644 --- a/asm/mul_4.S +++ b/asm/mul_4.S @@ -1,3 +1,6 @@ +// A*B +// Schoolbook multiplication of four 64b limbs +// result in r8 - r15 .macro mul_256 a b xor %rax, %rax mov 0x00\a, %rdx @@ -56,225 +59,118 @@ adcx %rax, %r15 .endm -.macro red_256 res - mov %rsi, 0x200(%rsp) +// Montgomery reduction +// expects multiplication result in r8 - r15 +// See algo 14.32 from Handbook of Applied Cryptography +.macro red_256 res name + push %rsi lea .LM(%rip), %rsi - mov %r13, 0x1e8(%rsp) - mov %r14, 0x1f0(%rsp) - mov %r15, 0x1f8(%rsp) - xor %r13, %r13 - xor %r14, %r14 - xor %r15, %r15 xor %rax, %rax - mov 0x300(%rsi), %rdx + mov 0x20(%rsi), %rdx mulx %r8, %rdx, %rbp - mulx 0x40(%rsi), %rbp, %rbx + mulx 0x00(%rsi), %rbp, %rbx adox %rbp, %r8 adcx %rbx, %r9 - mulx 0x48(%rsi), %rbp, %rbx + mulx 0x08(%rsi), %rbp, %rbx adox %rbp, %r9 adcx %rbx, %r10 - mulx 0x50(%rsi), %rbp, %rbx + mulx 0x10(%rsi), %rbp, %rbx adox %rbp, %r10 adcx %rbx, %r11 - mulx 0x58(%rsi), %rbp, %rbx + mulx 0x18(%rsi), %rbp, %rbx adox %rbp, %r11 adcx %rbx, %r12 adox %rax, %r12 adcx %rax, %r13 adox %rax, %r13 - mov 0x300(%rsi), %rdx + adcx %rax, %r14 + adox %rax, %r14 + adcx %rax, %r15 + adox %rax, %r15 + mov 0x20(%rsi), %rdx mulx %r9, %rdx, %rbp - mulx 0x40(%rsi), %rbp, %rbx + mulx 0x00(%rsi), %rbp, %rbx adox %rbp, %r9 adcx %rbx, %r10 - mulx 0x48(%rsi), %rbp, %rbx + mulx 0x08(%rsi), %rbp, %rbx adox %rbp, %r10 adcx %rbx, %r11 - mulx 0x50(%rsi), %rbp, %rbx + mulx 0x10(%rsi), %rbp, %rbx adox %rbp, %r11 adcx %rbx, %r12 - mulx 0x58(%rsi), %rbp, %rbx + mulx 0x18(%rsi), %rbp, %rbx adox %rbp, %r12 adcx %rbx, %r13 - adox 0x1e8(%rsp), %r13 + adox %rax, %r13 adcx %rax, %r14 adox %rax, %r14 - mov 0x300(%rsi), %rdx + adcx %rax, %r15 + adox %rax, %r15 + mov 0x20(%rsi), %rdx mulx %r10, %rdx, %rbp - mulx 0x40(%rsi), %rbp, %rbx + mulx 0x00(%rsi), %rbp, %rbx adox %rbp, %r10 adcx %rbx, %r11 - mulx 0x48(%rsi), %rbp, %rbx + mulx 0x08(%rsi), %rbp, %rbx adox %rbp, %r11 adcx %rbx, %r12 - mulx 0x50(%rsi), %rbp, %rbx + mulx 0x10(%rsi), %rbp, %rbx adox %rbp, %r12 adcx %rbx, %r13 - mulx 0x58(%rsi), %rbp, %rbx + mulx 0x18(%rsi), %rbp, %rbx adox %rbp, %r13 adcx %rbx, %r14 - adox 0x1f0(%rsp), %r14 + adox %rax, %r14 adcx %rax, %r15 adox %rax, %r15 - mov 0x300(%rsi), %rdx + mov 0x20(%rsi), %rdx mulx %r11, %rdx, %rbp - mulx 0x40(%rsi), %rbp, %rbx + mov 0x00(%rsi), %r8 + mulx %r8, %rbp, %rbx adox %rbp, %r11 adcx %rbx, %r12 - mulx 0x48(%rsi), %rbp, %rbx + mov 0x08(%rsi), %r9 + mulx %r9, %rbp, %rbx adox %rbp, %r12 adcx %rbx, %r13 - mulx 0x50(%rsi), %rbp, %rbx + mov 0x10(%rsi), %r10 + mulx %r10, %rbp, %rbx adox %rbp, %r13 adcx %rbx, %r14 - mulx 0x58(%rsi), %rbp, %rbx + mov 0x18(%rsi), %r11 + mulx %r11, %rbp, %rbx adox %rbp, %r14 adcx %rbx, %r15 - adox 0x1f8(%rsp), %r15 - adcx %rax, %r8 - adox %rax, %r8 - shl $0x3, %r8 - sub 0x00(%rsi,%r8,1), %r12 - sbb 0x08(%r8,%rsi,1), %r13 - sbb 0x10(%r8,%rsi,1), %r14 - sbb 0x18(%r8,%rsi,1), %r15 - mov 0x200(%rsp), %rsi + adox %rax, %r15 + mov %r12, 0x00\res + mov %r13, 0x08\res + mov %r14, 0x10\res + mov %r15, 0x18\res + sub %r8, %r12 + sbb %r9, %r13 + sbb %r10, %r14 + sbb %r11, %r15 + jb .Lred_256\name mov %r12, 0x00\res mov %r13, 0x08\res mov %r14, 0x10\res mov %r15, 0x18\res +.Lred_256\name: + pop %rsi .endm -.macro mod_mul_256 a b res +.macro mod_mul_256 a b res name mul_256 \a, \b - red_256 \res -.endm - -.macro red_below_m a name - mov 0x18\a, %r12 - shr $0x38, %r12 - and $0xC0, %r12 - mov 0x00\a, %r8 - sub 0x200(%rdx,%r12,1), %r8 - mov 0x08\a, %r9 - sbb 0x208(%rdx,%r12,1), %r9 - mov 0x10\a, %r10 - sbb 0x210(%rdx,%r12,1), %r10 - mov 0x18\a, %r11 - sbb 0x218(%rdx,%r12,1), %r11 - - jnc .Lred_done\name - - add 0x240(%rdx), %r8 - adc 0x248(%rdx), %r9 - adc 0x250(%rdx), %r10 - adc 0x258(%rdx), %r11 - -.Lred_done\name: - mov %r8, 0x00\a - mov %r9, 0x08\a - mov %r10, 0x10\a - mov %r11, 0x18\a + red_256 \res, \name .endm +// BLS12-381 G1 order r used as modulus +// Montgomery constant -m^-1 mod b .LM: - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0xffffffff00000001 - .quad 0x53bda402fffe5bfe - .quad 0x3339d80809a1d805 - .quad 0x73eda753299d7d48 - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0xfffffffe00000002 - .quad 0xa77b4805fffcb7fd - .quad 0x6673b0101343b00a - .quad 0xe7db4ea6533afa90 - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0xfffffffd00000003 - .quad 0xfb38ec08fffb13fc - .quad 0x99ad88181ce5880f - .quad 0x5bc8f5f97cd877d8 - .quad 0x0000000000000001 - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0xfffffffc00000004 - .quad 0x4ef6900bfff96ffb - .quad 0xcce7602026876015 - .quad 0xcfb69d4ca675f520 - .quad 0x0000000000000001 - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0xfffffffb00000005 - .quad 0xa2b4340efff7cbfa - .quad 0x2138283029381a - .quad 0x43a4449fd0137269 - .quad 0x0000000000000002 - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0xfffffffa00000006 - .quad 0xf671d811fff627f9 - .quad 0x335b103039cb101f - .quad 0xb791ebf2f9b0efb1 - .quad 0x0000000000000002 - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0xfffffff900000007 - .quad 0x4a2f7c14fff483f8 - .quad 0x6694e838436ce825 - .quad 0x2b7f9346234e6cf9 - .quad 0x0000000000000003 - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0x0000000000000000 .quad 0xffffffff00000001 .quad 0x53bda402fffe5bfe .quad 0x3339d80809a1d805 .quad 0x73eda753299d7d48 - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0xfffffffe00000002 - .quad 0xa77b4805fffcb7fd - .quad 0x6673b0101343b00a - .quad 0xe7db4ea6533afa90 - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0xfffffffe00000002 - .quad 0xa77b4805fffcb7fd - .quad 0x6673b0101343b00a - .quad 0xe7db4ea6533afa90 - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0x0000000000000000 - .quad 0x0000000000000000 .quad 0xfffffffeffffffff #ifdef __APPLE__ @@ -293,15 +189,11 @@ mod_mul_4w: push %r13 push %r14 push %r15 - sub $8*65, %rsp mov %rdx, %rcx // rcx = result // x * y - mod_mul_256 (%rdi), (%rsi), (%rcx) - lea .LM(%rip), %rdx - red_below_m (%rcx), mm + mod_mul_256 (%rdi), (%rsi), (%rcx), mm - add $8*65, %rsp pop %r15 pop %r14 pop %r13 diff --git a/ff_derive/src/lib.rs b/ff_derive/src/lib.rs index 8e454eb..cb80423 100644 --- a/ff_derive/src/lib.rs +++ b/ff_derive/src/lib.rs @@ -821,6 +821,7 @@ fn prime_field_impl( ) -> proc_macro2::TokenStream { // x86_64 asm for four limbs + // TODO: add check for adx support let mut gen = proc_macro2::TokenStream::new(); gen.extend(quote! { ::ff::mod_mul_4w_assign(&mut (#a.0).0, &(#b.0).0); From 5a0543558a9877b12ca635afb07befd01dcff62c Mon Sep 17 00:00:00 2001 From: dignifiedquire Date: Thu, 6 Feb 2020 17:47:16 +0100 Subject: [PATCH 05/11] fixup and integrate adx check (cherry picked from commit 1cdee83ce831d486350dd5c52dbb016269179058) pick: Only the adx check. --- Cargo.toml | 1 + ff_derive/src/lib.rs | 9 +++++++-- src/asm.rs | 4 ++++ src/lib.rs | 2 +- 4 files changed, 13 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 3d4ae89..40611f4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,6 +17,7 @@ edition = "2018" bitvec = { version = "0.22", default-features = false, optional = true } byteorder = { version = "1", default-features = false, optional = true } ff_derive = { version = "0.8", path = "ff_derive", optional = true } +lazy_static = "1.4.0" rand_core = { version = "0.6", default-features = false } subtle = { version = "2.2.1", default-features = false, features = ["i128"] } diff --git a/ff_derive/src/lib.rs b/ff_derive/src/lib.rs index cb80423..d69e584 100644 --- a/ff_derive/src/lib.rs +++ b/ff_derive/src/lib.rs @@ -821,10 +821,15 @@ fn prime_field_impl( ) -> proc_macro2::TokenStream { // x86_64 asm for four limbs - // TODO: add check for adx support + let default_impl = mul_impl_default(a.clone(), b.clone(), 4); + let mut gen = proc_macro2::TokenStream::new(); gen.extend(quote! { - ::ff::mod_mul_4w_assign(&mut (#a.0).0, &(#b.0).0); + if *::ff::CPU_SUPPORTS_ADX_INSTRUCTION { + ::ff::mod_mul_4w_assign(&mut (#a.0).0, &(#b.0).0); + } else { + #default_impl + } }); gen diff --git a/src/asm.rs b/src/asm.rs index bf01409..9f21f28 100644 --- a/src/asm.rs +++ b/src/asm.rs @@ -1,3 +1,7 @@ +lazy_static::lazy_static! { + pub static ref CPU_SUPPORTS_ADX_INSTRUCTION: bool = is_x86_feature_detected!("adx"); +} + #[link(name = "ff-derive-crypto", kind = "static")] extern "C" { fn mod_mul_4w(a: &[u64; 4], b: &[u64; 4], res: &mut [u64; 4]); diff --git a/src/lib.rs b/src/lib.rs index 008bce1..0fdcdc0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -14,7 +14,7 @@ pub use ff_derive::PrimeField; mod asm; #[cfg(target_arch = "x86_64")] -pub use asm::mod_mul_4w_assign; +pub use asm::*; #[cfg(feature = "bits")] #[cfg_attr(docsrs, doc(cfg(feature = "bits")))] From b6c4d5756b6f64c311560c66f695ff90b14485d5 Mon Sep 17 00:00:00 2001 From: dignifiedquire Date: Mon, 10 Feb 2020 20:44:56 +0100 Subject: [PATCH 06/11] fix(fff_derive): only use asm when the right modulus is specified (cherry picked from commit 6c65c330b023198f927b95c2dcd5f471880d93d8) --- ff_derive/src/lib.rs | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/ff_derive/src/lib.rs b/ff_derive/src/lib.rs index d69e584..c3a3e3f 100644 --- a/ff_derive/src/lib.rs +++ b/ff_derive/src/lib.rs @@ -13,6 +13,9 @@ use std::str::FromStr; mod pow_fixed; +const BLS_381_FR_MODULUS: &str = + "52435875175126190479447740508185965837690552500527637822603658699938581184513"; + enum ReprEndianness { Big, Little, @@ -126,8 +129,9 @@ pub fn prime_field(input: proc_macro::TokenStream) -> proc_macro::TokenStream { let ast: syn::DeriveInput = syn::parse(input).unwrap(); // We're given the modulus p of the prime field - let modulus: BigUint = fetch_attr("PrimeFieldModulus", &ast.attrs) - .expect("Please supply a PrimeFieldModulus attribute") + let modulus_raw = fetch_attr("PrimeFieldModulus", &ast.attrs) + .expect("Please supply a PrimeFieldModulus attribute"); + let modulus: BigUint = modulus_raw .parse() .expect("PrimeFieldModulus should be a number"); @@ -178,6 +182,7 @@ pub fn prime_field(input: proc_macro::TokenStream) -> proc_macro::TokenStream { gen.extend(prime_field_impl( &ast.ident, &repr_ident, + &modulus_raw, &modulus, &endianness, limbs, @@ -637,6 +642,7 @@ fn prime_field_constants_and_sqrt( fn prime_field_impl( name: &syn::Ident, repr: &syn::Ident, + modulus_raw: &str, modulus: &BigUint, endianness: &ReprEndianness, limbs: usize, @@ -807,8 +813,9 @@ fn prime_field_impl( a: proc_macro2::TokenStream, b: proc_macro2::TokenStream, limbs: usize, + modulus_raw: &str, ) -> proc_macro2::TokenStream { - if limbs == 4 && cfg!(target_arch = "x86_64") { + if limbs == 4 && modulus_raw == BLS_381_FR_MODULUS && cfg!(target_arch = "x86_64") { mul_impl_asm4(a, b) } else { mul_impl_default(a, b, limbs) @@ -909,7 +916,7 @@ fn prime_field_impl( } let squaring_impl = sqr_impl(quote! {self}, limbs); - let multiply_impl = mul_impl(quote! {self}, quote! {other}, limbs); + let multiply_impl = mul_impl(quote! {self}, quote! {other}, limbs, modulus_raw); let invert_impl = inv_impl(quote! {self}, name, modulus); let montgomery_impl = mont_impl(limbs); From e0630e8c5083e814848561b3b4985687e48df091 Mon Sep 17 00:00:00 2001 From: dignifiedquire Date: Sat, 11 Apr 2020 14:29:24 +0200 Subject: [PATCH 07/11] fix: ensure to compile asm only on x86_64 (cherry picked from commit 39cb448c2574bf81e48d915e8f885d5ea062f109) --- Cargo.toml | 2 +- build.rs | 6 +- ff_derive/src/lib.rs | 206 +++++++++++++++++++++++++++---------------- 3 files changed, 131 insertions(+), 83 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 40611f4..7e736b7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,7 @@ lazy_static = "1.4.0" rand_core = { version = "0.6", default-features = false } subtle = { version = "2.2.1", default-features = false, features = ["i128"] } -[build-dependencies] +[target.'cfg(target_arch = "x86_64")'.build-dependencies] cc = "1.0.50" [features] diff --git a/build.rs b/build.rs index 435e0b6..a16f149 100644 --- a/build.rs +++ b/build.rs @@ -1,8 +1,6 @@ -#[cfg(target_arch = "x86_64")] -extern crate cc; - fn main() { - if cfg!(target_arch = "x86_64") { + let target_arch = std::env::var("CARGO_CFG_TARGET_ARCH").unwrap(); + if target_arch == "x86_64" { cc::Build::new() .flag("-c") .file("./asm/mul_4.S") diff --git a/ff_derive/src/lib.rs b/ff_derive/src/lib.rs index c3a3e3f..3ecf98b 100644 --- a/ff_derive/src/lib.rs +++ b/ff_derive/src/lib.rs @@ -815,7 +815,7 @@ fn prime_field_impl( limbs: usize, modulus_raw: &str, ) -> proc_macro2::TokenStream { - if limbs == 4 && modulus_raw == BLS_381_FR_MODULUS && cfg!(target_arch = "x86_64") { + if limbs == 4 && modulus_raw == BLS_381_FR_MODULUS { mul_impl_asm4(a, b) } else { mul_impl_default(a, b, limbs) @@ -827,14 +827,20 @@ fn prime_field_impl( b: proc_macro2::TokenStream, ) -> proc_macro2::TokenStream { // x86_64 asm for four limbs - let default_impl = mul_impl_default(a.clone(), b.clone(), 4); let mut gen = proc_macro2::TokenStream::new(); gen.extend(quote! { - if *::ff::CPU_SUPPORTS_ADX_INSTRUCTION { - ::ff::mod_mul_4w_assign(&mut (#a.0).0, &(#b.0).0); - } else { + #[cfg(target_arch = "x86_64")] + { + if *::ff::CPU_SUPPORTS_ADX_INSTRUCTION { + ::ff::mod_mul_4w_assign(&mut (#a.0).0, &(#b.0).0); + } else { + #default_impl + } + } + #[cfg(not(target_arch = "x86_64"))] + { #default_impl } }); @@ -915,9 +921,125 @@ fn prime_field_impl( } } + fn add_assign_impl( + a: proc_macro2::TokenStream, + b: proc_macro2::TokenStream, + limbs: usize, + ) -> proc_macro2::TokenStream { + if limbs == 4 { + add_assign_asm_impl(a, b, limbs) + } else { + add_assign_default_impl(a, b, limbs) + } + } + + fn add_assign_asm_impl( + a: proc_macro2::TokenStream, + b: proc_macro2::TokenStream, + limbs: usize, + ) -> proc_macro2::TokenStream { + let mut gen = proc_macro2::TokenStream::new(); + let default_impl = add_assign_default_impl(a.clone(), b.clone(), limbs); + + gen.extend(quote! { + #[cfg(target_arch = "x86_64")] + { + // This cannot exceed the backing capacity. + use std::arch::x86_64::*; + use std::mem; + + unsafe { + let mut carry = _addcarry_u64( + 0, + (#a.0).0[0], + (#b.0).0[0], + &mut (#a.0).0[0] + ); + carry = _addcarry_u64( + carry, (#a.0).0[1], + (#b.0).0[1], + &mut (#a.0).0[1] + ); + carry = _addcarry_u64( + carry, (#a.0).0[2], + (#b.0).0[2], + &mut (#a.0).0[2] + ); + _addcarry_u64( + carry, + (#a.0).0[3], + (#b.0).0[3], + &mut (#a.0).0[3] + ); + + let mut s_sub: [u64; 4] = mem::uninitialized(); + + carry = _subborrow_u64( + 0, + (#a.0).0[0], + MODULUS.0[0], + &mut s_sub[0] + ); + carry = _subborrow_u64( + carry, + (#a.0).0[1], + MODULUS.0[1], + &mut s_sub[1] + ); + carry = _subborrow_u64( + carry, + (#a.0).0[2], + MODULUS.0[2], + &mut s_sub[2] + ); + carry = _subborrow_u64( + carry, + (#a.0).0[3], + MODULUS.0[3], + &mut s_sub[3] + ); + + if carry == 0 { + // Direct assign fails since size can be 4 or 6 + // Obviously code doesn't work at all for size 6 + // (#a).0 = s_sub; + (#a.0).0[0] = s_sub[0]; + (#a.0).0[1] = s_sub[1]; + (#a.0).0[2] = s_sub[2]; + (#a.0).0[3] = s_sub[3]; + } + } + } + #[cfg(not(target_arch = "x86_64"))] + { + #default_impl + } + }); + + gen + } + + fn add_assign_default_impl( + a: proc_macro2::TokenStream, + b: proc_macro2::TokenStream, + _limbs: usize, + ) -> proc_macro2::TokenStream { + let mut gen = proc_macro2::TokenStream::new(); + + gen.extend(quote! { + // This cannot exceed the backing capacity. + #a.0.add_nocarry(&#b.0); + + // However, it may need to be reduced. + #a.reduce(); + }); + gen + } + let squaring_impl = sqr_impl(quote! {self}, limbs); let multiply_impl = mul_impl(quote! {self}, quote! {other}, limbs, modulus_raw); let invert_impl = inv_impl(quote! {self}, name, modulus); + let add_assign = add_assign_impl(quote! {self}, quote! {other}, limbs); let montgomery_impl = mont_impl(limbs); // self.0[0].ct_eq(&other.0[0]) & self.0[1].ct_eq(&other.0[1]) & ... @@ -1092,79 +1214,7 @@ fn prime_field_impl( impl<'r> ::core::ops::AddAssign<&'r #name> for #name { #[inline] fn add_assign(&mut self, other: &#name) { - if #limbs == 4 && cfg!(target_arch = "x86_64") { - // This cannot exceed the backing capacity. - use std::arch::x86_64::*; - use std::mem; - - unsafe { - let mut carry = _addcarry_u64( - 0, - (self.0).0[0], - (other.0).0[0], - &mut (self.0).0[0] - ); - carry = _addcarry_u64( - carry, (self.0).0[1], - (other.0).0[1], - &mut (self.0).0[1] - ); - carry = _addcarry_u64( - carry, (self.0).0[2], - (other.0).0[2], - &mut (self.0).0[2] - ); - _addcarry_u64( - carry, - (self.0).0[3], - (other.0).0[3], - &mut (self.0).0[3] - ); - - let mut s_sub: [u64; 4] = mem::uninitialized(); - - carry = _subborrow_u64( - 0, - (self.0).0[0], - MODULUS.0[0], - &mut s_sub[0] - ); - carry = _subborrow_u64( - carry, - (self.0).0[1], - MODULUS.0[1], - &mut s_sub[1] - ); - carry = _subborrow_u64( - carry, - (self.0).0[2], - MODULUS.0[2], - &mut s_sub[2] - ); - carry = _subborrow_u64( - carry, - (self.0).0[3], - MODULUS.0[3], - &mut s_sub[3] - ); - - if carry == 0 { - // Direct assign fails since size can be 4 or 6 - // Obviously code doesn't work at all for size 6 - // (self.0).0 = s_sub; - (self.0).0[0] = s_sub[0]; - (self.0).0[1] = s_sub[1]; - (self.0).0[2] = s_sub[2]; - (self.0).0[3] = s_sub[3]; - } - } - } else { - // This cannot exceed the backing capacity. - self.0.add_nocarry(&other.0); - - // However, it may need to be reduced. - self.reduce(); - } + #add_assign } } From 5bdc5135a4261a9f34861eaf42df9120bf8c67ef Mon Sep 17 00:00:00 2001 From: dignifiedquire Date: Wed, 8 Jul 2020 00:04:30 +0200 Subject: [PATCH 08/11] fix building on non x864_64 archs (cherry picked from commit 407cc1ad9e572f3ab3683ccb2509d775a300f9ee) --- build.rs | 2 ++ src/asm.rs | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/build.rs b/build.rs index a16f149..5418037 100644 --- a/build.rs +++ b/build.rs @@ -1,5 +1,7 @@ fn main() { let target_arch = std::env::var("CARGO_CFG_TARGET_ARCH").unwrap(); + + #[cfg(target_arch = "x86_64")] if target_arch == "x86_64" { cc::Build::new() .flag("-c") diff --git a/src/asm.rs b/src/asm.rs index 9f21f28..687d7c0 100644 --- a/src/asm.rs +++ b/src/asm.rs @@ -12,7 +12,7 @@ pub fn mod_mul_4w_assign(a: &mut [u64; 4], b: &[u64; 4]) { unsafe { mod_mul_4w(&*a, b, &mut res); } - std::mem::replace(a, res); + let _ = std::mem::replace(a, res); } #[cfg(test)] From 76f8fce2728b6ccac1126bed2d54d3108b215dda Mon Sep 17 00:00:00 2001 From: dignifiedquire Date: Wed, 8 Jul 2020 00:29:47 +0200 Subject: [PATCH 09/11] another build fix (cherry picked from commit bbe322358c33d30d34ccb32a64c6729c793b0a55) --- build.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/build.rs b/build.rs index 5418037..fc3e906 100644 --- a/build.rs +++ b/build.rs @@ -1,7 +1,7 @@ +#[cfg(target_arch = "x86_64")] fn main() { let target_arch = std::env::var("CARGO_CFG_TARGET_ARCH").unwrap(); - #[cfg(target_arch = "x86_64")] if target_arch == "x86_64" { cc::Build::new() .flag("-c") @@ -9,3 +9,6 @@ fn main() { .compile("libff-derive-crypto.a"); } } + +#[cfg(not(target_arch = "x86_64"))] +fn main() {} From 8a375b943ff5a57a0ece2885b9aff68dc4bd2702 Mon Sep 17 00:00:00 2001 From: Jack Grigg Date: Mon, 31 May 2021 17:29:19 +0100 Subject: [PATCH 10/11] asm: Use core crate where possible --- ff_derive/src/lib.rs | 4 ++-- src/asm.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ff_derive/src/lib.rs b/ff_derive/src/lib.rs index 3ecf98b..888eabf 100644 --- a/ff_derive/src/lib.rs +++ b/ff_derive/src/lib.rs @@ -945,8 +945,8 @@ fn prime_field_impl( #[cfg(target_arch = "x86_64")] { // This cannot exceed the backing capacity. - use std::arch::x86_64::*; - use std::mem; + use core::arch::x86_64::*; + use core::mem; unsafe { let mut carry = _addcarry_u64( diff --git a/src/asm.rs b/src/asm.rs index 687d7c0..1512ff6 100644 --- a/src/asm.rs +++ b/src/asm.rs @@ -12,7 +12,7 @@ pub fn mod_mul_4w_assign(a: &mut [u64; 4], b: &[u64; 4]) { unsafe { mod_mul_4w(&*a, b, &mut res); } - let _ = std::mem::replace(a, res); + let _ = core::mem::replace(a, res); } #[cfg(test)] From fd5b9824313daf41418e819a2823bee2d58331a4 Mon Sep 17 00:00:00 2001 From: Jack Grigg Date: Mon, 31 May 2021 17:34:50 +0100 Subject: [PATCH 11/11] asm: Place behind a default-off `asm` feature flag --- Cargo.toml | 3 ++- src/asm.rs | 2 +- src/lib.rs | 9 ++++++--- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 7e736b7..1b8220f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ edition = "2018" bitvec = { version = "0.22", default-features = false, optional = true } byteorder = { version = "1", default-features = false, optional = true } ff_derive = { version = "0.8", path = "ff_derive", optional = true } -lazy_static = "1.4.0" +lazy_static = { version = "1.4.0", optional = true } rand_core = { version = "0.6", default-features = false } subtle = { version = "2.2.1", default-features = false, features = ["i128"] } @@ -26,6 +26,7 @@ cc = "1.0.50" [features] default = ["bits", "std"] +asm = ["lazy_static", "std"] bits = ["bitvec"] derive = ["byteorder", "ff_derive"] std = [] diff --git a/src/asm.rs b/src/asm.rs index 1512ff6..090f833 100644 --- a/src/asm.rs +++ b/src/asm.rs @@ -1,5 +1,5 @@ lazy_static::lazy_static! { - pub static ref CPU_SUPPORTS_ADX_INSTRUCTION: bool = is_x86_feature_detected!("adx"); + pub static ref CPU_SUPPORTS_ADX_INSTRUCTION: bool = std::is_x86_feature_detected!("adx"); } #[link(name = "ff-derive-crypto", kind = "static")] diff --git a/src/lib.rs b/src/lib.rs index 0fdcdc0..a23757b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,16 +4,19 @@ #![no_std] #![cfg_attr(docsrs, feature(doc_cfg))] #![deny(broken_intra_doc_links)] -#![forbid(unsafe_code)] +#![cfg_attr(not(feature = "asm"), forbid(unsafe_code))] + +#[cfg(feature = "std")] +extern crate std; #[cfg(feature = "derive")] #[cfg_attr(docsrs, doc(cfg(feature = "derive")))] pub use ff_derive::PrimeField; -#[cfg(target_arch = "x86_64")] +#[cfg(all(feature = "asm", target_arch = "x86_64"))] mod asm; -#[cfg(target_arch = "x86_64")] +#[cfg(all(feature = "asm", target_arch = "x86_64"))] pub use asm::*; #[cfg(feature = "bits")]