/* Optimized PowerPC implementation of ChaCha20 cipher.
   Copyright (C) 2022 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* chacha20-ppc.c - PowerPC vector implementation of ChaCha20
   Copyright (C) 2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>

   This file is part of Libgcrypt.

   Libgcrypt is free software; you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as
   published by the Free Software Foundation; either version 2.1 of
   the License, or (at your option) any later version.

   Libgcrypt is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with this program; if not, see <https://www.gnu.org/licenses/>.
 */

#include <altivec.h>
#include <endian.h>
#include <stddef.h>
#include <stdint.h>
#include <sys/cdefs.h>

typedef vector unsigned char vector16x_u8;
typedef vector unsigned int vector4x_u32;
typedef vector unsigned long long vector2x_u64;

#if __BYTE_ORDER == __BIG_ENDIAN
static const vector16x_u8 le_bswap_const =
  { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
#endif
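
/* Rotate the four 32-bit elements of V left by IDX element positions.
   vec_sld shifts the byte concatenation of its operands, and vector
   element order differs between big- and little-endian, so the shift
   amount must be complemented on little-endian.  */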
static inline vector4x_u32
vec_rol_elems (vector4x_u32 v, unsigned int idx)
{
#if __BYTE_ORDER != __BIG_ENDIAN
  return vec_sld (v, v, (16 - (4 * idx)) & 15);
#else
  return vec_sld (v, v, (4 * idx) & 15);
#endif
}
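
/* Load and store 16 bytes as four little-endian 32-bit words regardless
   of host byte order.  ChaCha20 is defined on little-endian words, so
   on big-endian hosts each word is byte-swapped via le_bswap_const.  */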
static inline vector4x_u32
vec_load_le (unsigned long offset, const unsigned char *ptr)
{
  vector4x_u32 vec;
  vec = vec_vsx_ld (offset, (const uint32_t *) ptr);
#if __BYTE_ORDER == __BIG_ENDIAN
  vec = (vector4x_u32) vec_perm ((vector16x_u8) vec, (vector16x_u8) vec,
				 le_bswap_const);
#endif
  return vec;
}

static inline void
vec_store_le (vector4x_u32 vec, unsigned long offset, unsigned char *ptr)
{
#if __BYTE_ORDER == __BIG_ENDIAN
  vec = (vector4x_u32) vec_perm ((vector16x_u8) vec, (vector16x_u8) vec,
				 le_bswap_const);
#endif
  vec_vsx_st (vec, offset, (uint32_t *) ptr);
}
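
/* Add two vectors as a pair of 64-bit lanes; used to advance the 64-bit
   block counter held in the first two words of the counter row.  On
   big-endian, the 32-bit halves of each 64-bit lane are swapped around
   the addition so the lanes match the little-endian state layout.  */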
static inline vector4x_u32
vec_add_ctr_u64 (vector4x_u32 v, vector4x_u32 a)
{
#if __BYTE_ORDER == __BIG_ENDIAN
  static const vector16x_u8 swap32 =
    { 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11 };
  vector2x_u64 vec, add, sum;

  vec = (vector2x_u64) vec_perm ((vector16x_u8) v, (vector16x_u8) v, swap32);
  add = (vector2x_u64) vec_perm ((vector16x_u8) a, (vector16x_u8) a, swap32);
  sum = vec + add;
  return (vector4x_u32) vec_perm ((vector16x_u8) sum, (vector16x_u8) sum,
				  swap32);
#else
  return (vector4x_u32) ((vector2x_u64) (v) + (vector2x_u64) (a));
#endif
}

/**********************************************************************
  4-way chacha20
 **********************************************************************/
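
/* Rotate each 32-bit element of V1 left by the per-element amounts in
   ROLV.  The inline asm guarantees a single vrlw instruction.  */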
#define ROTATE(v1,rolv)						\
  __asm__ ("vrlw %0,%1,%2\n\t" : "=v" (v1) : "v" (v1), "v" (rolv))

#define PLUS(ds,s) \
  ((ds) += (s))

#define XOR(ds,s) \
  ((ds) ^= (s))

#define ADD_U64(v,a) \
  (v = vec_add_ctr_u64(v, a))
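
/* The transpose below moves between "word-sliced" order, where one
   vector holds the same state word of four blocks, and the per-block
   order required in memory, using vec_mergeh/vec_mergel pairs.  */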
/* 4x4 32-bit integer matrix transpose.  */
#define transpose_4x4(x0, x1, x2, x3) ({ \
	vector4x_u32 t1 = vec_mergeh(x0, x2); \
	vector4x_u32 t2 = vec_mergel(x0, x2); \
	vector4x_u32 t3 = vec_mergeh(x1, x3); \
	x3 = vec_mergel(x1, x3); \
	x0 = vec_mergeh(t1, t3); \
	x1 = vec_mergel(t1, t3); \
	x2 = vec_mergeh(t2, x3); \
	x3 = vec_mergel(t2, x3); \
      })
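
/* Two independent ChaCha20 quarter-rounds, interleaved so that their
   vector operations can issue in parallel.  The rotation amounts 16,
   12, 8 and 7 are the ones fixed by the ChaCha specification.  */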
#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2)			\
	PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2);	\
	    ROTATE(d1, rotate_16); ROTATE(d2, rotate_16);	\
	PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2);	\
	    ROTATE(b1, rotate_12); ROTATE(b2, rotate_12);	\
	PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2);	\
	    ROTATE(d1, rotate_8); ROTATE(d2, rotate_8);		\
	PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2);	\
	    ROTATE(b1, rotate_7); ROTATE(b2, rotate_7);
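
/* Compute NBLKS (assumed to be a nonzero multiple of 4) 64-byte
   ChaCha20 blocks into DST, four blocks at a time: vN holds state
   word N of four consecutive blocks.  Note that SRC is never read;
   unlike the libgcrypt original, this variant emits raw keystream and
   seems to keep the parameter only for signature compatibility.  The
   64-bit block counter in the last state row is advanced by NBLKS.  */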
unsigned int attribute_hidden
__chacha20_power8_blocks4 (uint32_t *state, uint8_t *dst, const uint8_t *src,
			   size_t nblks)
{
  vector4x_u32 counters_0123 = { 0, 1, 2, 3 };
  vector4x_u32 counter_4 = { 4, 0, 0, 0 };
  vector4x_u32 rotate_16 = { 16, 16, 16, 16 };
  vector4x_u32 rotate_12 = { 12, 12, 12, 12 };
  vector4x_u32 rotate_8 = { 8, 8, 8, 8 };
  vector4x_u32 rotate_7 = { 7, 7, 7, 7 };
  vector4x_u32 state0, state1, state2, state3;
  vector4x_u32 v0, v1, v2, v3, v4, v5, v6, v7;
  vector4x_u32 v8, v9, v10, v11, v12, v13, v14, v15;
  vector4x_u32 tmp;
  int i;

  /* Force preload of constants to vector registers.  */
  __asm__ ("": "+v" (counters_0123) :: "memory");
  __asm__ ("": "+v" (counter_4) :: "memory");
  __asm__ ("": "+v" (rotate_16) :: "memory");
  __asm__ ("": "+v" (rotate_12) :: "memory");
  __asm__ ("": "+v" (rotate_8) :: "memory");
  __asm__ ("": "+v" (rotate_7) :: "memory");

  state0 = vec_vsx_ld (0 * 16, state);
  state1 = vec_vsx_ld (1 * 16, state);
  state2 = vec_vsx_ld (2 * 16, state);
  state3 = vec_vsx_ld (3 * 16, state);

  do
    {
      v0 = vec_splat (state0, 0);
      v1 = vec_splat (state0, 1);
      v2 = vec_splat (state0, 2);
      v3 = vec_splat (state0, 3);
      v4 = vec_splat (state1, 0);
      v5 = vec_splat (state1, 1);
      v6 = vec_splat (state1, 2);
      v7 = vec_splat (state1, 3);
      v8 = vec_splat (state2, 0);
      v9 = vec_splat (state2, 1);
      v10 = vec_splat (state2, 2);
      v11 = vec_splat (state2, 3);
      v12 = vec_splat (state3, 0);
      v13 = vec_splat (state3, 1);
      v14 = vec_splat (state3, 2);
      v15 = vec_splat (state3, 3);
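
      /* Give each of the four blocks its own counter value and
	 propagate the carry into the high counter word: vec_cmplt
	 returns all-ones (-1) in every lane that wrapped around, and
	 subtracting -1 increments exactly those lanes of v13.  */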
      v12 += counters_0123;
      v13 -= vec_cmplt (v12, counters_0123);
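
      /* 20 rounds as 10 double rounds: a column round followed by a
	 diagonal round, two quarter-rounds at a time.  */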
      for (i = 20; i > 0; i -= 2)
	{
	  QUARTERROUND2 (v0, v4, v8, v12, v1, v5, v9, v13)
	  QUARTERROUND2 (v2, v6, v10, v14, v3, v7, v11, v15)
	  QUARTERROUND2 (v0, v5, v10, v15, v1, v6, v11, v12)
	  QUARTERROUND2 (v2, v7, v8, v13, v3, v4, v9, v14)
	}
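
      /* Feed-forward: add the input state back into the working state.
	 The counter words again need the per-block offsets, including
	 the same carry fix-up as above.  */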
      v0 += vec_splat (state0, 0);
      v1 += vec_splat (state0, 1);
      v2 += vec_splat (state0, 2);
      v3 += vec_splat (state0, 3);
      v4 += vec_splat (state1, 0);
      v5 += vec_splat (state1, 1);
      v6 += vec_splat (state1, 2);
      v7 += vec_splat (state1, 3);
      v8 += vec_splat (state2, 0);
      v9 += vec_splat (state2, 1);
      v10 += vec_splat (state2, 2);
      v11 += vec_splat (state2, 3);
      tmp = vec_splat (state3, 0);
      tmp += counters_0123;
      v12 += tmp;
      v13 += vec_splat (state3, 1) - vec_cmplt (tmp, counters_0123);
      v14 += vec_splat (state3, 2);
      v15 += vec_splat (state3, 3);
      ADD_U64 (state3, counter_4);
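
      /* Transpose back from word-sliced order to four contiguous
	 64-byte blocks.  */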
      transpose_4x4 (v0, v1, v2, v3);
      transpose_4x4 (v4, v5, v6, v7);
      transpose_4x4 (v8, v9, v10, v11);
      transpose_4x4 (v12, v13, v14, v15);

      vec_store_le (v0, (64 * 0 + 16 * 0), dst);
      vec_store_le (v1, (64 * 1 + 16 * 0), dst);
      vec_store_le (v2, (64 * 2 + 16 * 0), dst);
      vec_store_le (v3, (64 * 3 + 16 * 0), dst);

      vec_store_le (v4, (64 * 0 + 16 * 1), dst);
      vec_store_le (v5, (64 * 1 + 16 * 1), dst);
      vec_store_le (v6, (64 * 2 + 16 * 1), dst);
      vec_store_le (v7, (64 * 3 + 16 * 1), dst);

      vec_store_le (v8, (64 * 0 + 16 * 2), dst);
      vec_store_le (v9, (64 * 1 + 16 * 2), dst);
      vec_store_le (v10, (64 * 2 + 16 * 2), dst);
      vec_store_le (v11, (64 * 3 + 16 * 2), dst);

      vec_store_le (v12, (64 * 0 + 16 * 3), dst);
      vec_store_le (v13, (64 * 1 + 16 * 3), dst);
      vec_store_le (v14, (64 * 2 + 16 * 3), dst);
      vec_store_le (v15, (64 * 3 + 16 * 3), dst);

      src += 4*64;
      dst += 4*64;

      nblks -= 4;
    }
  while (nblks);
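
  /* Write the advanced counter row back so a subsequent call continues
     the keystream where this one left off.  */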
  vec_vsx_st (state3, 3 * 16, state);

  return 0;
}
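
/* A minimal usage sketch (hypothetical standalone caller; the names
   below are illustrative only):

     uint32_t state[16];   -- constants, key, counter and nonce words
     uint8_t out[4 * 64];

     __chacha20_power8_blocks4 (state, out, out, 4);

   OUT then holds 256 bytes of raw keystream and the 64-bit counter in
   the state has advanced by 4.  */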