// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * LoongArch SIMD XOR operations
 *
 * Copyright (C) 2023 WANG Xuerui <git@xen0n.name>
 */

#include "xor_simd.h"

/*
 * Number of bytes handled per loop iteration: one cache line (64 bytes).
 * This assumes that future popular LoongArch cores will have performance
 * characteristics similar to those of the current models.
 */
#define LINE_WIDTH 64

#ifdef CONFIG_CPU_HAS_LSX

#define LD(reg, base, offset)	\
	"vld $vr" #reg ", %[" #base "], " #offset "\n\t"
#define ST(reg, base, offset)	\
	"vst $vr" #reg ", %[" #base "], " #offset "\n\t"
#define XOR(dj, k)	"vxor.v $vr" #dj ", $vr" #dj ", $vr" #k "\n\t"

#define LD_INOUT_LINE(base)	\
	LD(0, base, 0)		\
	LD(1, base, 16)		\
	LD(2, base, 32)		\
	LD(3, base, 48)

#define LD_AND_XOR_LINE(base)	\
	LD(4, base, 0)		\
	LD(5, base, 16)		\
	LD(6, base, 32)		\
	LD(7, base, 48)		\
	XOR(0, 4)		\
	XOR(1, 5)		\
	XOR(2, 6)		\
	XOR(3, 7)

#define ST_LINE(base)		\
	ST(0, base, 0)		\
	ST(1, base, 16)		\
	ST(2, base, 32)		\
	ST(3, base, 48)

#define XOR_FUNC_NAME(nr) __xor_lsx_##nr
#include "xor_template.c"

#undef LD
#undef ST
#undef XOR
#undef LD_INOUT_LINE
#undef LD_AND_XOR_LINE
#undef ST_LINE
#undef XOR_FUNC_NAME

#endif /* CONFIG_CPU_HAS_LSX */

#ifdef CONFIG_CPU_HAS_LASX

#define LD(reg, base, offset)	\
	"xvld $xr" #reg ", %[" #base "], " #offset "\n\t"
#define ST(reg, base, offset)	\
	"xvst $xr" #reg ", %[" #base "], " #offset "\n\t"
#define XOR(dj, k)	"xvxor.v $xr" #dj ", $xr" #dj ", $xr" #k "\n\t"

#define LD_INOUT_LINE(base)	\
	LD(0, base, 0)		\
	LD(1, base, 32)

#define LD_AND_XOR_LINE(base)	\
	LD(2, base, 0)		\
	LD(3, base, 32)		\
	XOR(0, 2)		\
	XOR(1, 3)

#define ST_LINE(base)		\
	ST(0, base, 0)		\
	ST(1, base, 32)

#define XOR_FUNC_NAME(nr) __xor_lasx_##nr
#include "xor_template.c"

#undef LD
#undef ST
#undef XOR
#undef LD_INOUT_LINE
#undef LD_AND_XOR_LINE
#undef ST_LINE
#undef XOR_FUNC_NAME

#endif /* CONFIG_CPU_HAS_LASX */