#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Even though
# loops are aggressively modulo-scheduled in respect to references to
# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
# scheduling "glitch," because uprofile(1) indicates uniform sample
# distribution, as if all instruction bundles execute in 1.5 cycles.
# Meaning that it could have been even faster, yet 12 cycles is ~60%
# better than gcc-generated code and ~80% better than code generated
# by vendor compiler.
use strict;
use warnings;

# --------------------------------------------------------------------
# Register assignments.  The values are symbolic register names that
# are interpolated verbatim into the generated assembly; the trailing
# "# $n" remarks give the corresponding numeric register.
# --------------------------------------------------------------------
my $cnt      = "v0";	# $0
my $t0       = "t0";
my $t1       = "t1";
my $t2       = "t2";
my $Thi0     = "t3";	# $4
my $Tlo0     = "t4";
my $Thi1     = "t5";
my $Tlo1     = "t6";
my $rem      = "t7";	# $8
#################
my $Xi       = "a0";	# $16, input argument block
my $Htbl     = "a1";
my $inp      = "a2";
my $len      = "a3";
my $nlo      = "a4";	# $20
my $nhi      = "a5";
my $Zhi      = "t8";
my $Zlo      = "t9";
my $Xhi      = "t10";	# $24
my $Xlo      = "t11";
my $remp     = "t12";
my $rem_4bit = "AT";	# $28

# Extra registers used only by gcm_ghash_4bit to hold the current input
# block; that routine saves s0/s1 on entry and reloads them on exit.
my $inhi     = "s0";
my $inlo     = "s1";

# Accumulates the complete assembly text that is printed at the end.
my $code;

{
    # Instance counter: every expansion of loop() gets its own numeric
    # suffix on the .Looplo/.Loophi labels, so the body can be emitted
    # more than once into the same output without label clashes.
    my $N;

    # loop()
    #
    # Appends one copy of the multiplication body to $code.  Takes no
    # arguments and returns nothing useful.
    #
    # The emitted code consumes the bytes of $Xlo (byte 7 down to 0) and
    # then the bytes of $Xhi (byte 7 down to 0), one nibble at a time,
    # using the table addressed by $Htbl and the table addressed by
    # $rem_4bit.  The result is left in $Zhi:$Zlo; both callers below
    # byte-swap it before storing.  $cnt, $t0, $nlo, $nhi, $remp, $rem
    # and $Thi0/$Tlo0/$Thi1/$Tlo1 are used as scratch.
    #
    # The instruction order is hand-scheduled (see the header comment of
    # this file) and must not be rearranged.
    sub loop {
        $N++;
        $code .= <<___;
.align	4
	extbl	$Xlo,7,$nlo
	and	$nlo,0xf0,$nhi
	sll	$nlo,4,$nlo
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Zlo,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Zhi,0($nlo)

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,6(zero)
	extbl	$Xlo,6,$nlo

	ldq	$Tlo1,8($nhi)
	s8addq	$remp,$rem_4bit,$remp
	ldq	$Thi1,0($nhi)
	srl	$Zlo,4,$Zlo

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	and	$nlo,0xf0,$nhi

	xor	$Tlo1,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	xor	$Thi1,$Zhi,$Zhi
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Tlo0,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Thi0,0($nlo)

.Looplo$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xlo,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Looplo$N

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,7(zero)
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	unop

.Loophi$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Loophi$N

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo0,$Zlo,$Zlo
	xor	$Thi0,$Zhi,$Zhi

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	xor	$t0,$Zlo,$Zlo
	xor	$rem,$Zhi,$Zhi
___
        return;
    }
}

# --------------------------------------------------------------------
# gcm_gmult_4bit: single multiplication.
#
# Loads the 16-byte value at $Xi (first argument), runs one copy of the
# loop() body using the table at $Htbl (second argument), byte-swaps
# the result and stores it back to $Xi.
#
# NOTE(review): the operands of the #include directive(s) were missing
# from the text this block was restored from, which leaves a bare
# "#include" that no C preprocessor accepts.  The symbolic register
# names used throughout (v0, t0, a0, AT, ra, sp, zero, ...) need a
# regdef header; the conditional form below covers both Linux and
# non-Linux header layouts.  Confirm against the tree this file lives
# in.
# --------------------------------------------------------------------
$code = <<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif

.text

.set	noat
.set	noreorder
.globl	gcm_gmult_4bit
.align	4
.ent	gcm_gmult_4bit
gcm_gmult_4bit:
	.frame	sp,0,ra
	.prologue 0

	ldq	$Xlo,8($Xi)
	ldq	$Xhi,0($Xi)

	lda	$rem_4bit,rem_4bit
___

loop();

$code .= <<___;
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	ret	(ra)
.end	gcm_gmult_4bit
___

# --------------------------------------------------------------------
# gcm_ghash_4bit: hash a buffer.
#
# Arguments as used by the emitted code: $Xi (16-byte state, read on
# entry and written on exit), $Htbl (table), $inp (input pointer,
# advanced by 16 per iteration) and $len (byte count, decremented by 16
# per iteration; the outer loop ends when it reaches zero).
#
# Each iteration assembles one 16-byte input block from ldq_u/extql/
# extqh pairs, XORs it into the state, runs one copy of the loop() body
# and byte-swaps the result.  The loads for the next block are
# interleaved with the byte swap on the path that continues the loop.
#
# The rem_4bit table referenced by both routines is emitted at the end,
# in .rodata.
# --------------------------------------------------------------------
$code .= <<___;

.globl	gcm_ghash_4bit
.align	4
.ent	gcm_ghash_4bit
gcm_ghash_4bit:
	lda	sp,-32(sp)
	stq	ra,0(sp)
	stq	s0,8(sp)
	stq	s1,16(sp)
	.mask	0x04000600,-32
	.frame	sp,32,ra
	.prologue 0

	ldq_u	$inhi,0($inp)
	ldq_u	$Thi0,7($inp)
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)
	ldq	$Xhi,0($Xi)
	ldq	$Xlo,8($Xi)

	lda	$rem_4bit,rem_4bit

.Louter:
	extql	$inhi,$inp,$inhi
	extqh	$Thi0,$inp,$Thi0
	or	$inhi,$Thi0,$inhi
	lda	$inp,16($inp)

	extql	$inlo,$inp,$inlo
	extqh	$Tlo0,$inp,$Tlo0
	or	$inlo,$Tlo0,$inlo
	subq	$len,16,$len

	xor	$Xlo,$inlo,$Xlo
	xor	$Xhi,$inhi,$Xhi
___

loop();

$code .= <<___;
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo
	beq	$len,.Ldone

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo
	ldq_u	$inhi,0($inp)

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2
	ldq_u	$Thi0,7($inp)

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	br	zero,.Louter

.Ldone:
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi

	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	.set	noreorder
	/*ldq	ra,0(sp)*/
	ldq	s0,8(sp)
	ldq	s1,16(sp)
	lda	sp,32(sp)
	ret	(ra)
.end	gcm_ghash_4bit

.section	.rodata
.align	4
rem_4bit:
	.long	0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16
	.long	0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16
	.long	0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16
	.long	0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16
.previous

___

# --------------------------------------------------------------------
# Output.  An optional first command-line argument names the file to
# write; without it the assembly goes to the inherited STDOUT.
#
# The three-argument open keeps characters in the file name from being
# interpreted as an open mode, and both open and close are checked so
# that a failed or short write cannot silently leave a truncated
# assembly file behind for the build to pick up.
# --------------------------------------------------------------------
my $output = shift;
if ($output) {
    open STDOUT, '>', $output
        or die "can't open $output: $!";
}
print $code;
close STDOUT
    or die "error closing STDOUT: $!";