#!/usr/bin/env perl # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # Needs more work: key setup, CBC routine... # # ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with # 128-bit key, which is ~40% better than 64-bit code generated by gcc # 4.0. But these are not the ones currently used! Their "compact" # counterparts are, for security reason. ppc_AES_encrypt_compact runs # at 1/2 of ppc_AES_encrypt speed, while ppc_AES_decrypt_compact - # at 1/3 of ppc_AES_decrypt. # February 2010 # # Rescheduling instructions to favour Power6 pipeline gave 10% # performance improvement on the platform in question (and marginal # improvement even on others). It should be noted that Power6 fails # to process byte in 18 cycles, only in 23, because it fails to issue # 4 load instructions in two cycles, only in 3. As result non-compact # block subroutines are 25% slower than one would expect. Compact # functions scale better, because they have pure computational part, # which scales perfectly with clock frequency. To be specific # ppc_AES_encrypt_compact operates at 42 cycles per byte, while # ppc_AES_decrypt_compact - at 55 (in 64-bit build). $flavour = shift; if ($flavour =~ /64/) { $SIZE_T =8; $LRSAVE =2*$SIZE_T; $STU ="stdu"; $POP ="ld"; $PUSH ="std"; } elsif ($flavour =~ /32/) { $SIZE_T =4; $LRSAVE =$SIZE_T; $STU ="stwu"; $POP ="lwz"; $PUSH ="stw"; } else { die "nonsense $flavour"; } $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or die "can't locate ppc-xlate.pl"; open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; $FRAME=32*$SIZE_T; sub _data_word() { my $i; while(defined($i=shift)) { $code.=sprintf"\t.long\t0x%08x,0x%08x\n",$i,$i; } } $sp="r1"; $toc="r2"; $inp="r3"; $out="r4"; $key="r5"; $Tbl0="r3"; $Tbl1="r6"; $Tbl2="r7"; $Tbl3="r2"; $s0="r8"; $s1="r9"; $s2="r10"; $s3="r11"; $t0="r12"; $t1="r13"; $t2="r14"; $t3="r15"; $acc00="r16"; $acc01="r17"; $acc02="r18"; $acc03="r19"; $acc04="r20"; $acc05="r21"; $acc06="r22"; $acc07="r23"; $acc08="r24"; $acc09="r25"; $acc10="r26"; $acc11="r27"; $acc12="r28"; $acc13="r29"; $acc14="r30"; $acc15="r31"; # stay away from TLS pointer if ($SIZE_T==8) { die if ($t1 ne "r13"); $t1="r0"; } else { die if ($Tbl3 ne "r2"); $Tbl3=$t0; $t0="r0"; } $mask80=$Tbl2; $mask1b=$Tbl3; $code.=<<___; .machine "any" .text .align 7 LAES_Te: mflr r0 bcl 20,31,\$+4 mflr $Tbl0 ; vvvvv "distance" between . and 1st data entry addi $Tbl0,$Tbl0,`128-8` mtlr r0 blr .space `64-12*4` LAES_Td: mflr r0 bcl 20,31,\$+4 mflr $Tbl0 ; vvvvvvvv "distance" between . and 1st data entry addi $Tbl0,$Tbl0,`128-64-8+2048+256` mtlr r0 blr .space `128-64-12*4` ___ &_data_word( 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d, 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554, 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d, 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a, 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87, 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b, 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea, 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b, 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a, 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f, 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108, 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f, 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e, 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5, 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d, 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f, 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e, 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb, 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce, 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497, 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c, 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed, 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b, 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a, 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16, 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594, 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81, 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3, 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a, 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504, 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163, 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d, 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f, 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739, 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47, 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395, 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f, 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883, 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c, 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76, 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e, 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4, 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6, 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b, 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7, 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0, 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25, 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818, 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72, 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651, 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21, 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85, 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa, 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12, 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0, 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9, 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133, 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7, 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920, 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a, 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17, 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8, 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11, 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a); $code.=<<___; .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 .byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 .byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 .byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc .byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 .byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a .byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 .byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 .byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 .byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b .byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf .byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 .byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 .byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 .byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 .byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 .byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 .byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 .byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb .byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c .byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 .byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 .byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 .byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 .byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a .byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e .byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e .byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 .byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf .byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 .byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 ___ &_data_word( 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96, 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393, 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25, 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f, 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1, 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6, 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da, 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844, 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd, 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4, 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45, 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94, 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7, 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a, 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5, 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c, 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1, 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a, 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75, 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051, 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46, 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff, 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77, 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb, 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000, 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e, 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927, 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a, 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e, 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16, 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d, 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8, 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd, 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34, 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163, 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120, 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d, 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0, 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422, 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef, 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36, 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4, 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662, 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5, 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3, 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b, 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8, 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6, 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6, 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0, 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815, 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f, 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df, 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f, 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e, 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713, 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89, 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c, 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf, 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86, 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f, 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541, 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190, 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742); $code.=<<___; .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d .globl .aes_encrypt_internal .align 7 .aes_encrypt_internal: $STU $sp,-$FRAME($sp) mflr r0 $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) $PUSH r13,`$FRAME-$SIZE_T*19`($sp) $PUSH r14,`$FRAME-$SIZE_T*18`($sp) $PUSH r15,`$FRAME-$SIZE_T*17`($sp) $PUSH r16,`$FRAME-$SIZE_T*16`($sp) $PUSH r17,`$FRAME-$SIZE_T*15`($sp) $PUSH r18,`$FRAME-$SIZE_T*14`($sp) $PUSH r19,`$FRAME-$SIZE_T*13`($sp) $PUSH r20,`$FRAME-$SIZE_T*12`($sp) $PUSH r21,`$FRAME-$SIZE_T*11`($sp) $PUSH r22,`$FRAME-$SIZE_T*10`($sp) $PUSH r23,`$FRAME-$SIZE_T*9`($sp) $PUSH r24,`$FRAME-$SIZE_T*8`($sp) $PUSH r25,`$FRAME-$SIZE_T*7`($sp) $PUSH r26,`$FRAME-$SIZE_T*6`($sp) $PUSH r27,`$FRAME-$SIZE_T*5`($sp) $PUSH r28,`$FRAME-$SIZE_T*4`($sp) $PUSH r29,`$FRAME-$SIZE_T*3`($sp) $PUSH r30,`$FRAME-$SIZE_T*2`($sp) $PUSH r31,`$FRAME-$SIZE_T*1`($sp) $PUSH r0,`$FRAME+$LRSAVE`($sp) andi. $t0,$inp,3 andi. $t1,$out,3 or. $t0,$t0,$t1 bne Lenc_unaligned Lenc_unaligned_ok: lwz $s0,0($inp) lwz $s1,4($inp) lwz $s2,8($inp) lwz $s3,12($inp) bl LAES_Te bl Lppc_AES_encrypt_compact stw $s0,0($out) stw $s1,4($out) stw $s2,8($out) stw $s3,12($out) b Lenc_done Lenc_unaligned: subfic $t0,$inp,4096 subfic $t1,$out,4096 andi. $t0,$t0,4096-16 beq Lenc_xpage andi. $t1,$t1,4096-16 bne Lenc_unaligned_ok Lenc_xpage: lbz $acc00,0($inp) lbz $acc01,1($inp) lbz $acc02,2($inp) lbz $s0,3($inp) lbz $acc04,4($inp) lbz $acc05,5($inp) lbz $acc06,6($inp) lbz $s1,7($inp) lbz $acc08,8($inp) lbz $acc09,9($inp) lbz $acc10,10($inp) insrwi $s0,$acc00,8,0 lbz $s2,11($inp) insrwi $s1,$acc04,8,0 lbz $acc12,12($inp) insrwi $s0,$acc01,8,8 lbz $acc13,13($inp) insrwi $s1,$acc05,8,8 lbz $acc14,14($inp) insrwi $s0,$acc02,8,16 lbz $s3,15($inp) insrwi $s1,$acc06,8,16 insrwi $s2,$acc08,8,0 insrwi $s3,$acc12,8,0 insrwi $s2,$acc09,8,8 insrwi $s3,$acc13,8,8 insrwi $s2,$acc10,8,16 insrwi $s3,$acc14,8,16 bl LAES_Te bl Lppc_AES_encrypt_compact extrwi $acc00,$s0,8,0 extrwi $acc01,$s0,8,8 stb $acc00,0($out) extrwi $acc02,$s0,8,16 stb $acc01,1($out) stb $acc02,2($out) extrwi $acc04,$s1,8,0 stb $s0,3($out) extrwi $acc05,$s1,8,8 stb $acc04,4($out) extrwi $acc06,$s1,8,16 stb $acc05,5($out) stb $acc06,6($out) extrwi $acc08,$s2,8,0 stb $s1,7($out) extrwi $acc09,$s2,8,8 stb $acc08,8($out) extrwi $acc10,$s2,8,16 stb $acc09,9($out) stb $acc10,10($out) extrwi $acc12,$s3,8,0 stb $s2,11($out) extrwi $acc13,$s3,8,8 stb $acc12,12($out) extrwi $acc14,$s3,8,16 stb $acc13,13($out) stb $acc14,14($out) stb $s3,15($out) Lenc_done: $POP r0,`$FRAME+$LRSAVE`($sp) $POP $toc,`$FRAME-$SIZE_T*20`($sp) $POP r13,`$FRAME-$SIZE_T*19`($sp) $POP r14,`$FRAME-$SIZE_T*18`($sp) $POP r15,`$FRAME-$SIZE_T*17`($sp) $POP r16,`$FRAME-$SIZE_T*16`($sp) $POP r17,`$FRAME-$SIZE_T*15`($sp) $POP r18,`$FRAME-$SIZE_T*14`($sp) $POP r19,`$FRAME-$SIZE_T*13`($sp) $POP r20,`$FRAME-$SIZE_T*12`($sp) $POP r21,`$FRAME-$SIZE_T*11`($sp) $POP r22,`$FRAME-$SIZE_T*10`($sp) $POP r23,`$FRAME-$SIZE_T*9`($sp) $POP r24,`$FRAME-$SIZE_T*8`($sp) $POP r25,`$FRAME-$SIZE_T*7`($sp) $POP r26,`$FRAME-$SIZE_T*6`($sp) $POP r27,`$FRAME-$SIZE_T*5`($sp) $POP r28,`$FRAME-$SIZE_T*4`($sp) $POP r29,`$FRAME-$SIZE_T*3`($sp) $POP r30,`$FRAME-$SIZE_T*2`($sp) $POP r31,`$FRAME-$SIZE_T*1`($sp) mtlr r0 addi $sp,$sp,$FRAME blr .align 5 Lppc_AES_encrypt: lwz $acc00,240($key) addi $Tbl1,$Tbl0,3 lwz $t0,0($key) addi $Tbl2,$Tbl0,2 lwz $t1,4($key) addi $Tbl3,$Tbl0,1 lwz $t2,8($key) addi $acc00,$acc00,-1 lwz $t3,12($key) addi $key,$key,16 xor $s0,$s0,$t0 xor $s1,$s1,$t1 xor $s2,$s2,$t2 xor $s3,$s3,$t3 mtctr $acc00 .align 4 Lenc_loop: rlwinm $acc00,$s0,`32-24+3`,21,28 rlwinm $acc01,$s1,`32-24+3`,21,28 rlwinm $acc02,$s2,`32-24+3`,21,28 rlwinm $acc03,$s3,`32-24+3`,21,28 lwz $t0,0($key) rlwinm $acc04,$s1,`32-16+3`,21,28 lwz $t1,4($key) rlwinm $acc05,$s2,`32-16+3`,21,28 lwz $t2,8($key) rlwinm $acc06,$s3,`32-16+3`,21,28 lwz $t3,12($key) rlwinm $acc07,$s0,`32-16+3`,21,28 lwzx $acc00,$Tbl0,$acc00 rlwinm $acc08,$s2,`32-8+3`,21,28 lwzx $acc01,$Tbl0,$acc01 rlwinm $acc09,$s3,`32-8+3`,21,28 lwzx $acc02,$Tbl0,$acc02 rlwinm $acc10,$s0,`32-8+3`,21,28 lwzx $acc03,$Tbl0,$acc03 rlwinm $acc11,$s1,`32-8+3`,21,28 lwzx $acc04,$Tbl1,$acc04 rlwinm $acc12,$s3,`0+3`,21,28 lwzx $acc05,$Tbl1,$acc05 rlwinm $acc13,$s0,`0+3`,21,28 lwzx $acc06,$Tbl1,$acc06 rlwinm $acc14,$s1,`0+3`,21,28 lwzx $acc07,$Tbl1,$acc07 rlwinm $acc15,$s2,`0+3`,21,28 lwzx $acc08,$Tbl2,$acc08 xor $t0,$t0,$acc00 lwzx $acc09,$Tbl2,$acc09 xor $t1,$t1,$acc01 lwzx $acc10,$Tbl2,$acc10 xor $t2,$t2,$acc02 lwzx $acc11,$Tbl2,$acc11 xor $t3,$t3,$acc03 lwzx $acc12,$Tbl3,$acc12 xor $t0,$t0,$acc04 lwzx $acc13,$Tbl3,$acc13 xor $t1,$t1,$acc05 lwzx $acc14,$Tbl3,$acc14 xor $t2,$t2,$acc06 lwzx $acc15,$Tbl3,$acc15 xor $t3,$t3,$acc07 xor $t0,$t0,$acc08 xor $t1,$t1,$acc09 xor $t2,$t2,$acc10 xor $t3,$t3,$acc11 xor $s0,$t0,$acc12 xor $s1,$t1,$acc13 xor $s2,$t2,$acc14 xor $s3,$t3,$acc15 addi $key,$key,16 bdnz- Lenc_loop addi $Tbl2,$Tbl0,2048 nop lwz $t0,0($key) rlwinm $acc00,$s0,`32-24`,24,31 lwz $t1,4($key) rlwinm $acc01,$s1,`32-24`,24,31 lwz $t2,8($key) rlwinm $acc02,$s2,`32-24`,24,31 lwz $t3,12($key) rlwinm $acc03,$s3,`32-24`,24,31 lwz $acc08,`2048+0`($Tbl0) ! prefetch Te4 rlwinm $acc04,$s1,`32-16`,24,31 lwz $acc09,`2048+32`($Tbl0) rlwinm $acc05,$s2,`32-16`,24,31 lwz $acc10,`2048+64`($Tbl0) rlwinm $acc06,$s3,`32-16`,24,31 lwz $acc11,`2048+96`($Tbl0) rlwinm $acc07,$s0,`32-16`,24,31 lwz $acc12,`2048+128`($Tbl0) rlwinm $acc08,$s2,`32-8`,24,31 lwz $acc13,`2048+160`($Tbl0) rlwinm $acc09,$s3,`32-8`,24,31 lwz $acc14,`2048+192`($Tbl0) rlwinm $acc10,$s0,`32-8`,24,31 lwz $acc15,`2048+224`($Tbl0) rlwinm $acc11,$s1,`32-8`,24,31 lbzx $acc00,$Tbl2,$acc00 rlwinm $acc12,$s3,`0`,24,31 lbzx $acc01,$Tbl2,$acc01 rlwinm $acc13,$s0,`0`,24,31 lbzx $acc02,$Tbl2,$acc02 rlwinm $acc14,$s1,`0`,24,31 lbzx $acc03,$Tbl2,$acc03 rlwinm $acc15,$s2,`0`,24,31 lbzx $acc04,$Tbl2,$acc04 rlwinm $s0,$acc00,24,0,7 lbzx $acc05,$Tbl2,$acc05 rlwinm $s1,$acc01,24,0,7 lbzx $acc06,$Tbl2,$acc06 rlwinm $s2,$acc02,24,0,7 lbzx $acc07,$Tbl2,$acc07 rlwinm $s3,$acc03,24,0,7 lbzx $acc08,$Tbl2,$acc08 rlwimi $s0,$acc04,16,8,15 lbzx $acc09,$Tbl2,$acc09 rlwimi $s1,$acc05,16,8,15 lbzx $acc10,$Tbl2,$acc10 rlwimi $s2,$acc06,16,8,15 lbzx $acc11,$Tbl2,$acc11 rlwimi $s3,$acc07,16,8,15 lbzx $acc12,$Tbl2,$acc12 rlwimi $s0,$acc08,8,16,23 lbzx $acc13,$Tbl2,$acc13 rlwimi $s1,$acc09,8,16,23 lbzx $acc14,$Tbl2,$acc14 rlwimi $s2,$acc10,8,16,23 lbzx $acc15,$Tbl2,$acc15 rlwimi $s3,$acc11,8,16,23 or $s0,$s0,$acc12 or $s1,$s1,$acc13 or $s2,$s2,$acc14 or $s3,$s3,$acc15 xor $s0,$s0,$t0 xor $s1,$s1,$t1 xor $s2,$s2,$t2 xor $s3,$s3,$t3 blr .align 4 Lppc_AES_encrypt_compact: lwz $acc00,240($key) addi $Tbl1,$Tbl0,2048 lwz $t0,0($key) lis $mask80,0x8080 lwz $t1,4($key) lis $mask1b,0x1b1b lwz $t2,8($key) ori $mask80,$mask80,0x8080 lwz $t3,12($key) ori $mask1b,$mask1b,0x1b1b addi $key,$key,16 mtctr $acc00 .align 4 Lenc_compact_loop: xor $s0,$s0,$t0 xor $s1,$s1,$t1 rlwinm $acc00,$s0,`32-24`,24,31 xor $s2,$s2,$t2 rlwinm $acc01,$s1,`32-24`,24,31 xor $s3,$s3,$t3 rlwinm $acc02,$s2,`32-24`,24,31 rlwinm $acc03,$s3,`32-24`,24,31 rlwinm $acc04,$s1,`32-16`,24,31 rlwinm $acc05,$s2,`32-16`,24,31 rlwinm $acc06,$s3,`32-16`,24,31 rlwinm $acc07,$s0,`32-16`,24,31 lbzx $acc00,$Tbl1,$acc00 rlwinm $acc08,$s2,`32-8`,24,31 lbzx $acc01,$Tbl1,$acc01 rlwinm $acc09,$s3,`32-8`,24,31 lbzx $acc02,$Tbl1,$acc02 rlwinm $acc10,$s0,`32-8`,24,31 lbzx $acc03,$Tbl1,$acc03 rlwinm $acc11,$s1,`32-8`,24,31 lbzx $acc04,$Tbl1,$acc04 rlwinm $acc12,$s3,`0`,24,31 lbzx $acc05,$Tbl1,$acc05 rlwinm $acc13,$s0,`0`,24,31 lbzx $acc06,$Tbl1,$acc06 rlwinm $acc14,$s1,`0`,24,31 lbzx $acc07,$Tbl1,$acc07 rlwinm $acc15,$s2,`0`,24,31 lbzx $acc08,$Tbl1,$acc08 rlwinm $s0,$acc00,24,0,7 lbzx $acc09,$Tbl1,$acc09 rlwinm $s1,$acc01,24,0,7 lbzx $acc10,$Tbl1,$acc10 rlwinm $s2,$acc02,24,0,7 lbzx $acc11,$Tbl1,$acc11 rlwinm $s3,$acc03,24,0,7 lbzx $acc12,$Tbl1,$acc12 rlwimi $s0,$acc04,16,8,15 lbzx $acc13,$Tbl1,$acc13 rlwimi $s1,$acc05,16,8,15 lbzx $acc14,$Tbl1,$acc14 rlwimi $s2,$acc06,16,8,15 lbzx $acc15,$Tbl1,$acc15 rlwimi $s3,$acc07,16,8,15 rlwimi $s0,$acc08,8,16,23 rlwimi $s1,$acc09,8,16,23 rlwimi $s2,$acc10,8,16,23 rlwimi $s3,$acc11,8,16,23 lwz $t0,0($key) or $s0,$s0,$acc12 lwz $t1,4($key) or $s1,$s1,$acc13 lwz $t2,8($key) or $s2,$s2,$acc14 lwz $t3,12($key) or $s3,$s3,$acc15 addi $key,$key,16 bdz Lenc_compact_done and $acc00,$s0,$mask80 # r1=r0&0x80808080 and $acc01,$s1,$mask80 and $acc02,$s2,$mask80 and $acc03,$s3,$mask80 srwi $acc04,$acc00,7 # r1>>7 andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f srwi $acc05,$acc01,7 andc $acc09,$s1,$mask80 srwi $acc06,$acc02,7 andc $acc10,$s2,$mask80 srwi $acc07,$acc03,7 andc $acc11,$s3,$mask80 sub $acc00,$acc00,$acc04 # r1-(r1>>7) sub $acc01,$acc01,$acc05 sub $acc02,$acc02,$acc06 sub $acc03,$acc03,$acc07 add $acc08,$acc08,$acc08 # (r0&0x7f7f7f7f)<<1 add $acc09,$acc09,$acc09 add $acc10,$acc10,$acc10 add $acc11,$acc11,$acc11 and $acc00,$acc00,$mask1b # (r1-(r1>>7))&0x1b1b1b1b and $acc01,$acc01,$mask1b and $acc02,$acc02,$mask1b and $acc03,$acc03,$mask1b xor $acc00,$acc00,$acc08 # r2 xor $acc01,$acc01,$acc09 rotlwi $acc12,$s0,16 # ROTATE(r0,16) xor $acc02,$acc02,$acc10 rotlwi $acc13,$s1,16 xor $acc03,$acc03,$acc11 rotlwi $acc14,$s2,16 xor $s0,$s0,$acc00 # r0^r2 rotlwi $acc15,$s3,16 xor $s1,$s1,$acc01 rotrwi $s0,$s0,24 # ROTATE(r2^r0,24) xor $s2,$s2,$acc02 rotrwi $s1,$s1,24 xor $s3,$s3,$acc03 rotrwi $s2,$s2,24 xor $s0,$s0,$acc00 # ROTATE(r2^r0,24)^r2 rotrwi $s3,$s3,24 xor $s1,$s1,$acc01 xor $s2,$s2,$acc02 xor $s3,$s3,$acc03 rotlwi $acc08,$acc12,8 # ROTATE(r0,24) xor $s0,$s0,$acc12 # rotlwi $acc09,$acc13,8 xor $s1,$s1,$acc13 rotlwi $acc10,$acc14,8 xor $s2,$s2,$acc14 rotlwi $acc11,$acc15,8 xor $s3,$s3,$acc15 xor $s0,$s0,$acc08 # xor $s1,$s1,$acc09 xor $s2,$s2,$acc10 xor $s3,$s3,$acc11 b Lenc_compact_loop .align 4 Lenc_compact_done: xor $s0,$s0,$t0 xor $s1,$s1,$t1 xor $s2,$s2,$t2 xor $s3,$s3,$t3 blr .globl .aes_decrypt_internal .align 7 .aes_decrypt_internal: $STU $sp,-$FRAME($sp) mflr r0 $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) $PUSH r13,`$FRAME-$SIZE_T*19`($sp) $PUSH r14,`$FRAME-$SIZE_T*18`($sp) $PUSH r15,`$FRAME-$SIZE_T*17`($sp) $PUSH r16,`$FRAME-$SIZE_T*16`($sp) $PUSH r17,`$FRAME-$SIZE_T*15`($sp) $PUSH r18,`$FRAME-$SIZE_T*14`($sp) $PUSH r19,`$FRAME-$SIZE_T*13`($sp) $PUSH r20,`$FRAME-$SIZE_T*12`($sp) $PUSH r21,`$FRAME-$SIZE_T*11`($sp) $PUSH r22,`$FRAME-$SIZE_T*10`($sp) $PUSH r23,`$FRAME-$SIZE_T*9`($sp) $PUSH r24,`$FRAME-$SIZE_T*8`($sp) $PUSH r25,`$FRAME-$SIZE_T*7`($sp) $PUSH r26,`$FRAME-$SIZE_T*6`($sp) $PUSH r27,`$FRAME-$SIZE_T*5`($sp) $PUSH r28,`$FRAME-$SIZE_T*4`($sp) $PUSH r29,`$FRAME-$SIZE_T*3`($sp) $PUSH r30,`$FRAME-$SIZE_T*2`($sp) $PUSH r31,`$FRAME-$SIZE_T*1`($sp) $PUSH r0,`$FRAME+$LRSAVE`($sp) andi. $t0,$inp,3 andi. $t1,$out,3 or. $t0,$t0,$t1 bne Ldec_unaligned Ldec_unaligned_ok: lwz $s0,0($inp) lwz $s1,4($inp) lwz $s2,8($inp) lwz $s3,12($inp) bl LAES_Td bl Lppc_AES_decrypt_compact stw $s0,0($out) stw $s1,4($out) stw $s2,8($out) stw $s3,12($out) b Ldec_done Ldec_unaligned: subfic $t0,$inp,4096 subfic $t1,$out,4096 andi. $t0,$t0,4096-16 beq Ldec_xpage andi. $t1,$t1,4096-16 bne Ldec_unaligned_ok Ldec_xpage: lbz $acc00,0($inp) lbz $acc01,1($inp) lbz $acc02,2($inp) lbz $s0,3($inp) lbz $acc04,4($inp) lbz $acc05,5($inp) lbz $acc06,6($inp) lbz $s1,7($inp) lbz $acc08,8($inp) lbz $acc09,9($inp) lbz $acc10,10($inp) insrwi $s0,$acc00,8,0 lbz $s2,11($inp) insrwi $s1,$acc04,8,0 lbz $acc12,12($inp) insrwi $s0,$acc01,8,8 lbz $acc13,13($inp) insrwi $s1,$acc05,8,8 lbz $acc14,14($inp) insrwi $s0,$acc02,8,16 lbz $s3,15($inp) insrwi $s1,$acc06,8,16 insrwi $s2,$acc08,8,0 insrwi $s3,$acc12,8,0 insrwi $s2,$acc09,8,8 insrwi $s3,$acc13,8,8 insrwi $s2,$acc10,8,16 insrwi $s3,$acc14,8,16 bl LAES_Td bl Lppc_AES_decrypt_compact extrwi $acc00,$s0,8,0 extrwi $acc01,$s0,8,8 stb $acc00,0($out) extrwi $acc02,$s0,8,16 stb $acc01,1($out) stb $acc02,2($out) extrwi $acc04,$s1,8,0 stb $s0,3($out) extrwi $acc05,$s1,8,8 stb $acc04,4($out) extrwi $acc06,$s1,8,16 stb $acc05,5($out) stb $acc06,6($out) extrwi $acc08,$s2,8,0 stb $s1,7($out) extrwi $acc09,$s2,8,8 stb $acc08,8($out) extrwi $acc10,$s2,8,16 stb $acc09,9($out) stb $acc10,10($out) extrwi $acc12,$s3,8,0 stb $s2,11($out) extrwi $acc13,$s3,8,8 stb $acc12,12($out) extrwi $acc14,$s3,8,16 stb $acc13,13($out) stb $acc14,14($out) stb $s3,15($out) Ldec_done: $POP r0,`$FRAME+$LRSAVE`($sp) $POP $toc,`$FRAME-$SIZE_T*20`($sp) $POP r13,`$FRAME-$SIZE_T*19`($sp) $POP r14,`$FRAME-$SIZE_T*18`($sp) $POP r15,`$FRAME-$SIZE_T*17`($sp) $POP r16,`$FRAME-$SIZE_T*16`($sp) $POP r17,`$FRAME-$SIZE_T*15`($sp) $POP r18,`$FRAME-$SIZE_T*14`($sp) $POP r19,`$FRAME-$SIZE_T*13`($sp) $POP r20,`$FRAME-$SIZE_T*12`($sp) $POP r21,`$FRAME-$SIZE_T*11`($sp) $POP r22,`$FRAME-$SIZE_T*10`($sp) $POP r23,`$FRAME-$SIZE_T*9`($sp) $POP r24,`$FRAME-$SIZE_T*8`($sp) $POP r25,`$FRAME-$SIZE_T*7`($sp) $POP r26,`$FRAME-$SIZE_T*6`($sp) $POP r27,`$FRAME-$SIZE_T*5`($sp) $POP r28,`$FRAME-$SIZE_T*4`($sp) $POP r29,`$FRAME-$SIZE_T*3`($sp) $POP r30,`$FRAME-$SIZE_T*2`($sp) $POP r31,`$FRAME-$SIZE_T*1`($sp) mtlr r0 addi $sp,$sp,$FRAME blr .align 5 Lppc_AES_decrypt: lwz $acc00,240($key) addi $Tbl1,$Tbl0,3 lwz $t0,0($key) addi $Tbl2,$Tbl0,2 lwz $t1,4($key) addi $Tbl3,$Tbl0,1 lwz $t2,8($key) addi $acc00,$acc00,-1 lwz $t3,12($key) addi $key,$key,16 xor $s0,$s0,$t0 xor $s1,$s1,$t1 xor $s2,$s2,$t2 xor $s3,$s3,$t3 mtctr $acc00 .align 4 Ldec_loop: rlwinm $acc00,$s0,`32-24+3`,21,28 rlwinm $acc01,$s1,`32-24+3`,21,28 rlwinm $acc02,$s2,`32-24+3`,21,28 rlwinm $acc03,$s3,`32-24+3`,21,28 lwz $t0,0($key) rlwinm $acc04,$s3,`32-16+3`,21,28 lwz $t1,4($key) rlwinm $acc05,$s0,`32-16+3`,21,28 lwz $t2,8($key) rlwinm $acc06,$s1,`32-16+3`,21,28 lwz $t3,12($key) rlwinm $acc07,$s2,`32-16+3`,21,28 lwzx $acc00,$Tbl0,$acc00 rlwinm $acc08,$s2,`32-8+3`,21,28 lwzx $acc01,$Tbl0,$acc01 rlwinm $acc09,$s3,`32-8+3`,21,28 lwzx $acc02,$Tbl0,$acc02 rlwinm $acc10,$s0,`32-8+3`,21,28 lwzx $acc03,$Tbl0,$acc03 rlwinm $acc11,$s1,`32-8+3`,21,28 lwzx $acc04,$Tbl1,$acc04 rlwinm $acc12,$s1,`0+3`,21,28 lwzx $acc05,$Tbl1,$acc05 rlwinm $acc13,$s2,`0+3`,21,28 lwzx $acc06,$Tbl1,$acc06 rlwinm $acc14,$s3,`0+3`,21,28 lwzx $acc07,$Tbl1,$acc07 rlwinm $acc15,$s0,`0+3`,21,28 lwzx $acc08,$Tbl2,$acc08 xor $t0,$t0,$acc00 lwzx $acc09,$Tbl2,$acc09 xor $t1,$t1,$acc01 lwzx $acc10,$Tbl2,$acc10 xor $t2,$t2,$acc02 lwzx $acc11,$Tbl2,$acc11 xor $t3,$t3,$acc03 lwzx $acc12,$Tbl3,$acc12 xor $t0,$t0,$acc04 lwzx $acc13,$Tbl3,$acc13 xor $t1,$t1,$acc05 lwzx $acc14,$Tbl3,$acc14 xor $t2,$t2,$acc06 lwzx $acc15,$Tbl3,$acc15 xor $t3,$t3,$acc07 xor $t0,$t0,$acc08 xor $t1,$t1,$acc09 xor $t2,$t2,$acc10 xor $t3,$t3,$acc11 xor $s0,$t0,$acc12 xor $s1,$t1,$acc13 xor $s2,$t2,$acc14 xor $s3,$t3,$acc15 addi $key,$key,16 bdnz- Ldec_loop addi $Tbl2,$Tbl0,2048 nop lwz $t0,0($key) rlwinm $acc00,$s0,`32-24`,24,31 lwz $t1,4($key) rlwinm $acc01,$s1,`32-24`,24,31 lwz $t2,8($key) rlwinm $acc02,$s2,`32-24`,24,31 lwz $t3,12($key) rlwinm $acc03,$s3,`32-24`,24,31 lwz $acc08,`2048+0`($Tbl0) ! prefetch Td4 rlwinm $acc04,$s3,`32-16`,24,31 lwz $acc09,`2048+32`($Tbl0) rlwinm $acc05,$s0,`32-16`,24,31 lwz $acc10,`2048+64`($Tbl0) lbzx $acc00,$Tbl2,$acc00 lwz $acc11,`2048+96`($Tbl0) lbzx $acc01,$Tbl2,$acc01 lwz $acc12,`2048+128`($Tbl0) rlwinm $acc06,$s1,`32-16`,24,31 lwz $acc13,`2048+160`($Tbl0) rlwinm $acc07,$s2,`32-16`,24,31 lwz $acc14,`2048+192`($Tbl0) rlwinm $acc08,$s2,`32-8`,24,31 lwz $acc15,`2048+224`($Tbl0) rlwinm $acc09,$s3,`32-8`,24,31 lbzx $acc02,$Tbl2,$acc02 rlwinm $acc10,$s0,`32-8`,24,31 lbzx $acc03,$Tbl2,$acc03 rlwinm $acc11,$s1,`32-8`,24,31 lbzx $acc04,$Tbl2,$acc04 rlwinm $acc12,$s1,`0`,24,31 lbzx $acc05,$Tbl2,$acc05 rlwinm $acc13,$s2,`0`,24,31 lbzx $acc06,$Tbl2,$acc06 rlwinm $acc14,$s3,`0`,24,31 lbzx $acc07,$Tbl2,$acc07 rlwinm $acc15,$s0,`0`,24,31 lbzx $acc08,$Tbl2,$acc08 rlwinm $s0,$acc00,24,0,7 lbzx $acc09,$Tbl2,$acc09 rlwinm $s1,$acc01,24,0,7 lbzx $acc10,$Tbl2,$acc10 rlwinm $s2,$acc02,24,0,7 lbzx $acc11,$Tbl2,$acc11 rlwinm $s3,$acc03,24,0,7 lbzx $acc12,$Tbl2,$acc12 rlwimi $s0,$acc04,16,8,15 lbzx $acc13,$Tbl2,$acc13 rlwimi $s1,$acc05,16,8,15 lbzx $acc14,$Tbl2,$acc14 rlwimi $s2,$acc06,16,8,15 lbzx $acc15,$Tbl2,$acc15 rlwimi $s3,$acc07,16,8,15 rlwimi $s0,$acc08,8,16,23 rlwimi $s1,$acc09,8,16,23 rlwimi $s2,$acc10,8,16,23 rlwimi $s3,$acc11,8,16,23 or $s0,$s0,$acc12 or $s1,$s1,$acc13 or $s2,$s2,$acc14 or $s3,$s3,$acc15 xor $s0,$s0,$t0 xor $s1,$s1,$t1 xor $s2,$s2,$t2 xor $s3,$s3,$t3 blr .align 4 Lppc_AES_decrypt_compact: lwz $acc00,240($key) addi $Tbl1,$Tbl0,2048 lwz $t0,0($key) lis $mask80,0x8080 lwz $t1,4($key) lis $mask1b,0x1b1b lwz $t2,8($key) ori $mask80,$mask80,0x8080 lwz $t3,12($key) ori $mask1b,$mask1b,0x1b1b addi $key,$key,16 ___ $code.=<<___ if ($SIZE_T==8); insrdi $mask80,$mask80,32,0 insrdi $mask1b,$mask1b,32,0 ___ $code.=<<___; mtctr $acc00 .align 4 Ldec_compact_loop: xor $s0,$s0,$t0 xor $s1,$s1,$t1 rlwinm $acc00,$s0,`32-24`,24,31 xor $s2,$s2,$t2 rlwinm $acc01,$s1,`32-24`,24,31 xor $s3,$s3,$t3 rlwinm $acc02,$s2,`32-24`,24,31 rlwinm $acc03,$s3,`32-24`,24,31 rlwinm $acc04,$s3,`32-16`,24,31 rlwinm $acc05,$s0,`32-16`,24,31 rlwinm $acc06,$s1,`32-16`,24,31 rlwinm $acc07,$s2,`32-16`,24,31 lbzx $acc00,$Tbl1,$acc00 rlwinm $acc08,$s2,`32-8`,24,31 lbzx $acc01,$Tbl1,$acc01 rlwinm $acc09,$s3,`32-8`,24,31 lbzx $acc02,$Tbl1,$acc02 rlwinm $acc10,$s0,`32-8`,24,31 lbzx $acc03,$Tbl1,$acc03 rlwinm $acc11,$s1,`32-8`,24,31 lbzx $acc04,$Tbl1,$acc04 rlwinm $acc12,$s1,`0`,24,31 lbzx $acc05,$Tbl1,$acc05 rlwinm $acc13,$s2,`0`,24,31 lbzx $acc06,$Tbl1,$acc06 rlwinm $acc14,$s3,`0`,24,31 lbzx $acc07,$Tbl1,$acc07 rlwinm $acc15,$s0,`0`,24,31 lbzx $acc08,$Tbl1,$acc08 rlwinm $s0,$acc00,24,0,7 lbzx $acc09,$Tbl1,$acc09 rlwinm $s1,$acc01,24,0,7 lbzx $acc10,$Tbl1,$acc10 rlwinm $s2,$acc02,24,0,7 lbzx $acc11,$Tbl1,$acc11 rlwinm $s3,$acc03,24,0,7 lbzx $acc12,$Tbl1,$acc12 rlwimi $s0,$acc04,16,8,15 lbzx $acc13,$Tbl1,$acc13 rlwimi $s1,$acc05,16,8,15 lbzx $acc14,$Tbl1,$acc14 rlwimi $s2,$acc06,16,8,15 lbzx $acc15,$Tbl1,$acc15 rlwimi $s3,$acc07,16,8,15 rlwimi $s0,$acc08,8,16,23 rlwimi $s1,$acc09,8,16,23 rlwimi $s2,$acc10,8,16,23 rlwimi $s3,$acc11,8,16,23 lwz $t0,0($key) or $s0,$s0,$acc12 lwz $t1,4($key) or $s1,$s1,$acc13 lwz $t2,8($key) or $s2,$s2,$acc14 lwz $t3,12($key) or $s3,$s3,$acc15 addi $key,$key,16 bdz Ldec_compact_done ___ $code.=<<___ if ($SIZE_T==8); # vectorized permutation improves decrypt performance by 10% insrdi $s0,$s1,32,0 insrdi $s2,$s3,32,0 and $acc00,$s0,$mask80 # r1=r0&0x80808080 and $acc02,$s2,$mask80 srdi $acc04,$acc00,7 # r1>>7 srdi $acc06,$acc02,7 andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f andc $acc10,$s2,$mask80 sub $acc00,$acc00,$acc04 # r1-(r1>>7) sub $acc02,$acc02,$acc06 add $acc08,$acc08,$acc08 # (r0&0x7f7f7f7f)<<1 add $acc10,$acc10,$acc10 and $acc00,$acc00,$mask1b # (r1-(r1>>7))&0x1b1b1b1b and $acc02,$acc02,$mask1b xor $acc00,$acc00,$acc08 # r2 xor $acc02,$acc02,$acc10 and $acc04,$acc00,$mask80 # r1=r2&0x80808080 and $acc06,$acc02,$mask80 srdi $acc08,$acc04,7 # r1>>7 srdi $acc10,$acc06,7 andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f andc $acc14,$acc02,$mask80 sub $acc04,$acc04,$acc08 # r1-(r1>>7) sub $acc06,$acc06,$acc10 add $acc12,$acc12,$acc12 # (r2&0x7f7f7f7f)<<1 add $acc14,$acc14,$acc14 and $acc04,$acc04,$mask1b # (r1-(r1>>7))&0x1b1b1b1b and $acc06,$acc06,$mask1b xor $acc04,$acc04,$acc12 # r4 xor $acc06,$acc06,$acc14 and $acc08,$acc04,$mask80 # r1=r4&0x80808080 and $acc10,$acc06,$mask80 srdi $acc12,$acc08,7 # r1>>7 srdi $acc14,$acc10,7 sub $acc08,$acc08,$acc12 # r1-(r1>>7) sub $acc10,$acc10,$acc14 andc $acc12,$acc04,$mask80 # r4&0x7f7f7f7f andc $acc14,$acc06,$mask80 add $acc12,$acc12,$acc12 # (r4&0x7f7f7f7f)<<1 add $acc14,$acc14,$acc14 and $acc08,$acc08,$mask1b # (r1-(r1>>7))&0x1b1b1b1b and $acc10,$acc10,$mask1b xor $acc08,$acc08,$acc12 # r8 xor $acc10,$acc10,$acc14 xor $acc00,$acc00,$s0 # r2^r0 xor $acc02,$acc02,$s2 xor $acc04,$acc04,$s0 # r4^r0 xor $acc06,$acc06,$s2 extrdi $acc01,$acc00,32,0 extrdi $acc03,$acc02,32,0 extrdi $acc05,$acc04,32,0 extrdi $acc07,$acc06,32,0 extrdi $acc09,$acc08,32,0 extrdi $acc11,$acc10,32,0 ___ $code.=<<___ if ($SIZE_T==4); and $acc00,$s0,$mask80 # r1=r0&0x80808080 and $acc01,$s1,$mask80 and $acc02,$s2,$mask80 and $acc03,$s3,$mask80 srwi $acc04,$acc00,7 # r1>>7 andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f srwi $acc05,$acc01,7 andc $acc09,$s1,$mask80 srwi $acc06,$acc02,7 andc $acc10,$s2,$mask80 srwi $acc07,$acc03,7 andc $acc11,$s3,$mask80 sub $acc00,$acc00,$acc04 # r1-(r1>>7) sub $acc01,$acc01,$acc05 sub $acc02,$acc02,$acc06 sub $acc03,$acc03,$acc07 add $acc08,$acc08,$acc08 # (r0&0x7f7f7f7f)<<1 add $acc09,$acc09,$acc09 add $acc10,$acc10,$acc10 add $acc11,$acc11,$acc11 and $acc00,$acc00,$mask1b # (r1-(r1>>7))&0x1b1b1b1b and $acc01,$acc01,$mask1b and $acc02,$acc02,$mask1b and $acc03,$acc03,$mask1b xor $acc00,$acc00,$acc08 # r2 xor $acc01,$acc01,$acc09 xor $acc02,$acc02,$acc10 xor $acc03,$acc03,$acc11 and $acc04,$acc00,$mask80 # r1=r2&0x80808080 and $acc05,$acc01,$mask80 and $acc06,$acc02,$mask80 and $acc07,$acc03,$mask80 srwi $acc08,$acc04,7 # r1>>7 andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f srwi $acc09,$acc05,7 andc $acc13,$acc01,$mask80 srwi $acc10,$acc06,7 andc $acc14,$acc02,$mask80 srwi $acc11,$acc07,7 andc $acc15,$acc03,$mask80 sub $acc04,$acc04,$acc08 # r1-(r1>>7) sub $acc05,$acc05,$acc09 sub $acc06,$acc06,$acc10 sub $acc07,$acc07,$acc11 add $acc12,$acc12,$acc12 # (r2&0x7f7f7f7f)<<1 add $acc13,$acc13,$acc13 add $acc14,$acc14,$acc14 add $acc15,$acc15,$acc15 and $acc04,$acc04,$mask1b # (r1-(r1>>7))&0x1b1b1b1b and $acc05,$acc05,$mask1b and $acc06,$acc06,$mask1b and $acc07,$acc07,$mask1b xor $acc04,$acc04,$acc12 # r4 xor $acc05,$acc05,$acc13 xor $acc06,$acc06,$acc14 xor $acc07,$acc07,$acc15 and $acc08,$acc04,$mask80 # r1=r4&0x80808080 and $acc09,$acc05,$mask80 srwi $acc12,$acc08,7 # r1>>7 and $acc10,$acc06,$mask80 srwi $acc13,$acc09,7 and $acc11,$acc07,$mask80 srwi $acc14,$acc10,7 sub $acc08,$acc08,$acc12 # r1-(r1>>7) srwi $acc15,$acc11,7 sub $acc09,$acc09,$acc13 sub $acc10,$acc10,$acc14 sub $acc11,$acc11,$acc15 andc $acc12,$acc04,$mask80 # r4&0x7f7f7f7f andc $acc13,$acc05,$mask80 andc $acc14,$acc06,$mask80 andc $acc15,$acc07,$mask80 add $acc12,$acc12,$acc12 # (r4&0x7f7f7f7f)<<1 add $acc13,$acc13,$acc13 add $acc14,$acc14,$acc14 add $acc15,$acc15,$acc15 and $acc08,$acc08,$mask1b # (r1-(r1>>7))&0x1b1b1b1b and $acc09,$acc09,$mask1b and $acc10,$acc10,$mask1b and $acc11,$acc11,$mask1b xor $acc08,$acc08,$acc12 # r8 xor $acc09,$acc09,$acc13 xor $acc10,$acc10,$acc14 xor $acc11,$acc11,$acc15 xor $acc00,$acc00,$s0 # r2^r0 xor $acc01,$acc01,$s1 xor $acc02,$acc02,$s2 xor $acc03,$acc03,$s3 xor $acc04,$acc04,$s0 # r4^r0 xor $acc05,$acc05,$s1 xor $acc06,$acc06,$s2 xor $acc07,$acc07,$s3 ___ $code.=<<___; rotrwi $s0,$s0,8 # = ROTATE(r0,8) rotrwi $s1,$s1,8 xor $s0,$s0,$acc00 # ^= r2^r0 rotrwi $s2,$s2,8 xor $s1,$s1,$acc01 rotrwi $s3,$s3,8 xor $s2,$s2,$acc02 xor $s3,$s3,$acc03 xor $acc00,$acc00,$acc08 xor $acc01,$acc01,$acc09 xor $acc02,$acc02,$acc10 xor $acc03,$acc03,$acc11 xor $s0,$s0,$acc04 # ^= r4^r0 rotrwi $acc00,$acc00,24 xor $s1,$s1,$acc05 rotrwi $acc01,$acc01,24 xor $s2,$s2,$acc06 rotrwi $acc02,$acc02,24 xor $s3,$s3,$acc07 rotrwi $acc03,$acc03,24 xor $acc04,$acc04,$acc08 xor $acc05,$acc05,$acc09 xor $acc06,$acc06,$acc10 xor $acc07,$acc07,$acc11 xor $s0,$s0,$acc08 # ^= r8 [^((r4^r0)^(r2^r0)=r4^r2)] rotrwi $acc04,$acc04,16 xor $s1,$s1,$acc09 rotrwi $acc05,$acc05,16 xor $s2,$s2,$acc10 rotrwi $acc06,$acc06,16 xor $s3,$s3,$acc11 rotrwi $acc07,$acc07,16 xor $s0,$s0,$acc00 # ^= ROTATE(r8^r2^r0,24) rotrwi $acc08,$acc08,8 xor $s1,$s1,$acc01 rotrwi $acc09,$acc09,8 xor $s2,$s2,$acc02 rotrwi $acc10,$acc10,8 xor $s3,$s3,$acc03 rotrwi $acc11,$acc11,8 xor $s0,$s0,$acc04 # ^= ROTATE(r8^r4^r0,16) xor $s1,$s1,$acc05 xor $s2,$s2,$acc06 xor $s3,$s3,$acc07 xor $s0,$s0,$acc08 # ^= ROTATE(r8,8) xor $s1,$s1,$acc09 xor $s2,$s2,$acc10 xor $s3,$s3,$acc11 b Ldec_compact_loop .align 4 Ldec_compact_done: xor $s0,$s0,$t0 xor $s1,$s1,$t1 xor $s2,$s2,$t2 xor $s3,$s3,$t3 blr ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; print $code; close STDOUT;