/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2023 Linus Torvalds <torvalds@linux-foundation.org>
 */

#include <linux/linkage.h>
#include <asm/asm.h>
#include <asm/export.h>

/*
 * copy_user_nocache - Uncached memory copy with exception handling
 *
 * This copies from user space into kernel space, but the kernel
 * space accesses can take a machine check exception, so they too
 * need exception handling.
 *
 * Note: non-temporal stores only exist in 32-bit and 64-bit forms
 * (movnti), and we only use them on naturally aligned destinations.
 * Any smaller pieces at the start or end of the copy are done with
 * normal cached stores.
 *
 * Input:
 * rdi destination
 * rsi source
 * edx count
 *
 * Output:
 * rax uncopied bytes or 0 if successful.
 */
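/*
 * Rough C-level sketch of the flow below (illustrative only, not a
 * real kernel function; nt_store32/nt_store64 stand in for movnti):
 *
 *	align destination to 8 bytes: 1/2-byte pieces use cached
 *	  stores, the 4-byte piece uses nt_store32;
 *	while (count >= 64)
 *		copy 64 bytes as eight load + nt_store64 pairs;
 *	while (count >= 8)
 *		copy 8 bytes as one load + nt_store64 pair;
 *	if (count & 4) copy 4 bytes with nt_store32;
 *	sfence;			// order the non-temporal stores
 *	if (count & 2) copy 2 bytes with a cached store;
 *	if (count & 1) copy 1 byte with a cached store;
 *	return count;		// uncopied bytes, 0 on success
 *
 * Any access that faults bails out through the exception tables
 * below and returns the number of bytes that were not copied.
 */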
SYM_FUNC_START(__copy_user_nocache)
	/* If the destination is not 8-byte aligned, we'll have to align it */
	testb $7,%dil
	jne .Lalign

.Lis_aligned:
	cmp $64,%edx
	jb .Lquadwords

	.p2align 4,0x90
.Lunrolled:
10:	movq (%rsi),%r8
11:	movq 8(%rsi),%r9
12:	movq 16(%rsi),%r10
13:	movq 24(%rsi),%r11
20:	movnti %r8,(%rdi)
21:	movnti %r9,8(%rdi)
22:	movnti %r10,16(%rdi)
23:	movnti %r11,24(%rdi)
30:	movq 32(%rsi),%r8
31:	movq 40(%rsi),%r9
32:	movq 48(%rsi),%r10
33:	movq 56(%rsi),%r11
40:	movnti %r8,32(%rdi)
41:	movnti %r9,40(%rdi)
42:	movnti %r10,48(%rdi)
43:	movnti %r11,56(%rdi)

	addq $64,%rsi
	addq $64,%rdi
	sub $64,%edx
	cmp $64,%edx
	jae .Lunrolled

/*
 * The first set of user mode loads has been done
 * without any stores, so if any of them fault we
 * can just fall back to the non-unrolled loop.
 */
_ASM_EXTABLE_UA(10b, .Lquadwords)
_ASM_EXTABLE_UA(11b, .Lquadwords)
_ASM_EXTABLE_UA(12b, .Lquadwords)
_ASM_EXTABLE_UA(13b, .Lquadwords)

/*
 * The second set of user mode loads has been done
 * after 32 bytes were already stored to the
 * destination, so we need to take that into account
 * before falling back to the non-unrolled loop.
 */
_ASM_EXTABLE_UA(30b, .Lfixup32)
_ASM_EXTABLE_UA(31b, .Lfixup32)
_ASM_EXTABLE_UA(32b, .Lfixup32)
_ASM_EXTABLE_UA(33b, .Lfixup32)

/*
 * An exception on a write means that we're
 * done, but we need to update the count
 * depending on where in the unrolled loop
 * we were.
 */
_ASM_EXTABLE_UA(20b, .Ldone0)
_ASM_EXTABLE_UA(21b, .Ldone8)
_ASM_EXTABLE_UA(22b, .Ldone16)
_ASM_EXTABLE_UA(23b, .Ldone24)
_ASM_EXTABLE_UA(40b, .Ldone32)
_ASM_EXTABLE_UA(41b, .Ldone40)
_ASM_EXTABLE_UA(42b, .Ldone48)
_ASM_EXTABLE_UA(43b, .Ldone56)
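
/*
 * Worked example (illustrative only): if the store at label 22
 * faults, the stores at 20 and 21 have already written 16 bytes of
 * this 64-byte block, and %edx still holds the count from the top
 * of the iteration.  The fixup lands at .Ldone16, which falls
 * through .Ldone8 to .Ldone0, subtracting 8 twice, so we report
 * %edx - 16 uncopied bytes.
 */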

.Lquadwords:
	cmp $8,%edx
	jb .Llong
50:	movq (%rsi),%rax
51:	movnti %rax,(%rdi)
	addq $8,%rsi
	addq $8,%rdi
	sub $8,%edx
	jmp .Lquadwords

/*
 * If a quadword copy in the loop above fails, we
 * will not try to do any byte-wise cached accesses.
 * We will try to do one more 4-byte uncached
 * copy, though.
 */
_ASM_EXTABLE_UA(50b, .Llast4)
_ASM_EXTABLE_UA(51b, .Ldone0)
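
/*
 * Worked example (illustrative only): with 12 bytes left, a fault
 * on the 8-byte load at 50 goes to .Llast4, which still attempts a
 * 4-byte uncached copy from the same address.  If that succeeds we
 * return 12 - 4 = 8 uncopied bytes; if it faults too, .Ldone0
 * returns the full 12.
 */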

.Llong:
	test $4,%dl
	je .Lword
60:	movl (%rsi),%eax
61:	movnti %eax,(%rdi)
	addq $4,%rsi
	addq $4,%rdi
	sub $4,%edx
.Lword:
	sfence
	test $2,%dl
	je .Lbyte
70:	movw (%rsi),%ax
71:	movw %ax,(%rdi)
	addq $2,%rsi
	addq $2,%rdi
	sub $2,%edx
.Lbyte:
	test $1,%dl
	je .Ldone
80:	movb (%rsi),%al
81:	movb %al,(%rdi)
	dec %edx
.Ldone:
	mov %edx,%eax
	RET

/*
 * If we fail in the sub-quadword tail above, we
 * won't bother with any fixups. It's dead, Jim.
 * Note that there's no need for an 'sfence' for
 * any of this, since the exception will have been
 * serializing.
 */
_ASM_EXTABLE_UA(60b, .Ldone)
_ASM_EXTABLE_UA(61b, .Ldone)
_ASM_EXTABLE_UA(70b, .Ldone)
_ASM_EXTABLE_UA(71b, .Ldone)
_ASM_EXTABLE_UA(80b, .Ldone)
_ASM_EXTABLE_UA(81b, .Ldone)

/*
 * This is the "head needs aligning" case when
 * the destination isn't 8-byte aligned. The
 * 4-byte case can be done uncached, but any
 * smaller alignment is done with regular stores.
 */
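/*
 * Worked example (illustrative only): for a destination with
 * (dst & 7) == 5 we copy 1 byte (cached) and then 2 bytes (cached)
 * to reach an 8-byte boundary; for (dst & 7) == 4 a single uncached
 * 4-byte store is enough.
 */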
.Lalign:
	test $1,%dil
	je .Lalign_word
	test %edx,%edx
	je .Ldone
90:	movb (%rsi),%al
91:	movb %al,(%rdi)
	inc %rsi
	inc %rdi
	dec %edx
.Lalign_word:
	test $2,%dil
	je .Lalign_long
	cmp $2,%edx
	jb .Lbyte
92:	movw (%rsi),%ax
93:	movw %ax,(%rdi)
	addq $2,%rsi
	addq $2,%rdi
	sub $2,%edx
.Lalign_long:
	test $4,%dil
	je .Lis_aligned
	cmp $4,%edx
	jb .Lword
94:	movl (%rsi),%eax
95:	movnti %eax,(%rdi)
	addq $4,%rsi
	addq $4,%rdi
	sub $4,%edx
	jmp .Lis_aligned

/*
 * If we fail on the initial alignment accesses,
 * we're all done. Again, no point in trying to
 * do byte-by-byte probing if the 4-byte load
 * fails - we're not doing any uncached accesses
 * any more.
 */
_ASM_EXTABLE_UA(90b, .Ldone)
_ASM_EXTABLE_UA(91b, .Ldone)
_ASM_EXTABLE_UA(92b, .Ldone)
_ASM_EXTABLE_UA(93b, .Ldone)
_ASM_EXTABLE_UA(94b, .Ldone)
_ASM_EXTABLE_UA(95b, .Ldone)

/*
 * Exception table fixups for faults in the middle
 */
.Ldone56: sub $8,%edx
.Ldone48: sub $8,%edx
.Ldone40: sub $8,%edx
.Ldone32: sub $8,%edx
.Ldone24: sub $8,%edx
.Ldone16: sub $8,%edx
.Ldone8: sub $8,%edx
.Ldone0:
	mov %edx,%eax
	RET

.Lfixup32:
	addq $32,%rsi
	addq $32,%rdi
	sub $32,%edx
	jmp .Lquadwords

.Llast4:
52:	movl (%rsi),%eax
53:	movnti %eax,(%rdi)
	sfence
	sub $4,%edx
	mov %edx,%eax
	RET
_ASM_EXTABLE_UA(52b, .Ldone0)
_ASM_EXTABLE_UA(53b, .Ldone0)

SYM_FUNC_END(__copy_user_nocache)
EXPORT_SYMBOL(__copy_user_nocache)
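
/*
 * Illustrative use from C (the exact wrapper and prototype depend on
 * the kernel headers and are an assumption here):
 *
 *	unsigned long left = __copy_user_nocache(dst, src, len);
 *	if (left)
 *		... only 'len - left' bytes reached 'dst' ...
 *
 * The return value mirrors the header comment above: the number of
 * uncopied bytes, or 0 on complete success.
 */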