Mirror of https://github.com/Atmosphere-NX/Atmosphere.git, synced 2025-10-31 03:05:48 +01:00
Before the MMU is up, all reads/writes must be aligned; the optimized memcpy implementation does not guarantee that all of the reads/writes it performs are aligned. This commit splits the libc implementation into separate copies for kernel and kernel_ldr, so that only kernel uses the optimized implementation. This is safe, as the MMU is brought up before kernel begins executing.
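The same reasoning explains why a plain fallback is safe for kernel_ldr: an implementation that only issues single-byte accesses is trivially aligned regardless of the pointers and lengths it is given. A minimal sketch of such an alignment-safe memset is shown below; it is an illustration of the idea only, not the actual kernel_ldr libc code.

#include <stddef.h>
#include <stdint.h>

/*
 * Hypothetical pre-MMU fallback: every access is a single-byte store, so it
 * is aligned no matter what pointer or length is passed in. Illustrative
 * only; this is not the actual kernel_ldr implementation.
 */
void *memset_bytewise(void *dest, int ch, size_t count) {
    uint8_t *d = (uint8_t *)dest;
    while (count--) {
        *d++ = (uint8_t)ch;
    }
    return dest;
}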
		
			
				
	
	
		
173 lines · 3.8 KiB · ArmAsm
/*
 * memset - fill memory with a constant byte
 *
 * Copyright (c) 2012-2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 *
 */

#include "asmdefs.h"

#define DC_ZVA_THRESHOLD 512

#define dstin	x0
#define val	    x1
#define valw	w1
#define count	x2
#define dst	    x3
#define dstend	x4
#define zva_val	x5

ENTRY (memset)

    bfi     valw, valw,  8,  8
    bfi     valw, valw, 16, 16
    bfi     val,   val, 32, 32
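	/*
	 * The three bfi instructions above replicate the low byte of the fill
	 * value across all eight bytes of val (x1): bits [7:0] are copied into
	 * [15:8], then [15:0] into [31:16], then [31:0] into [63:32]. A fill
	 * byte of 0xAB, for example, yields val = 0xABABABABABABABAB, so each
	 * stp below writes 16 bytes of the pattern at once.
	 */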

	add	    dstend, dstin, count

	cmp 	count, 96
	b.hi	L(set_long)
	cmp	    count, 16
	b.hs	L(set_medium)

	/* Set 0..15 bytes.  */
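	/*
	 * Lengths below 16 are covered by a pair of possibly overlapping
	 * stores, one anchored at the start (dstin) and one at the end
	 * (dstend). A 13-byte request, for example, is handled by an 8-byte
	 * store at dstin and an 8-byte store at dstend-8, overlapping by 3
	 * bytes; the overlap is harmless because both write the same pattern.
	 */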
	tbz	    count, 3, 1f
	str	    val, [dstin]
	str	    val, [dstend, -8]
    ret
1:	tbz	    count, 2, 2f
	str	    valw, [dstin]
	str	    valw, [dstend, -4]
    ret
2:	cbz	    count, 3f
	strb	valw, [dstin]
	tbz	    count, 1, 3f
	strh	valw, [dstend, -2]
3:	ret

	/* Set 16..96 bytes.  */
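	/*
	 * Medium lengths are likewise written from both ends with 16-byte stp
	 * stores that may overlap in the middle: 16..63 bytes need at most
	 * four stps, while 64..96 bytes take the L(set96) path below, writing
	 * 64 bytes from the start and 32 bytes from the end.
	 */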
    .p2align 4
L(set_medium):
    stp     val, val, [dstin]
	tbnz	count, 6, L(set96)
	stp	    val, val, [dstend, -16]
	tbz	    count, 5, 1f
	stp	    val, val, [dstin, 16]
	stp	    val, val, [dstend, -32]
1:	ret

	.p2align 4
	/* Set 64..96 bytes.  Write 64 bytes from the start and
	   32 bytes from the end.  */
L(set96):
	stp	    val, val, [dstin, 16]
	stp	    val, val, [dstin, 32]
	stp	    val, val, [dstin, 48]
	stp	    val, val, [dstend, -32]
	stp	    val, val, [dstend, -16]
    ret

	.p2align 4
L(set_long):
	stp	    val, val, [dstin]
#if DC_ZVA_THRESHOLD
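	/*
	 * The cmp/ccmp pair below folds two conditions into one branch: ccmp
	 * only performs the val-against-zero compare when the first compare
	 * set carry (count >= DC_ZVA_THRESHOLD), and otherwise forces the
	 * flags to a not-equal state. The b.eq is therefore taken only for
	 * large, zero-filling calls, which are the ones that can use DC ZVA.
	 */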
	cmp	    count, DC_ZVA_THRESHOLD
	ccmp	val, 0, 0, cs
	bic	    dst, dstin, 15
	b.eq	L(zva_64)
#else
	bic	    dst, dstin, 15
#endif
	/* Small-size or non-zero memset does not use DC ZVA. */
	sub	    count, dstend, dst

	/*
	 * Adjust count and bias for the loop. By subtracting an extra 1 from
	 * count, it is easy to use the tbz instruction to check whether the
	 * loop tail count is less than 33 bytes, so as to bypass 2
	 * unnecessary stps.
	 */
	sub	    count, count, 64+16+1

#if DC_ZVA_THRESHOLD
	/* Align the loop on a 16-byte boundary; this might be friendly to the i-cache. */
	nop
#endif

1:	stp	    val, val, [dst, 16]
	stp	    val, val, [dst, 32]
	stp	    val, val, [dst, 48]
	stp	    val, val, [dst, 64]!
	subs	count, count, 64
	b.hs	1b

	tbz	    count, 5, 1f	/* Remaining count is less than 33 bytes? */
	stp	    val, val, [dst, 16]
	stp	    val, val, [dst, 32]
1:	stp	    val, val, [dstend, -32]
	stp	    val, val, [dstend, -16]
	ret

#if DC_ZVA_THRESHOLD
	.p2align 4
L(zva_64):
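	/*
	 * This path assumes a DC ZVA block size of 64 bytes; the zva_val
	 * register defined above is left unused, presumably because the
	 * generic dczid_el0 probe was dropped for a target with a known
	 * 64-byte ZVA granule.
	 */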
	stp	    val, val, [dst, 16]
	stp	    val, val, [dst, 32]
	stp	    val, val, [dst, 48]
	bic	    dst, dst, 63

	/*
	 * Previous memory writes might cross a cache line boundary and leave
	 * a cache line partially dirty. Zeroing this kind of cache line with
	 * DC ZVA incurs extra cost, for it requires loading the untouched
	 * part of the line from memory before zeroing.
	 *
	 * So, write the first 64-byte aligned block using stp to force a
	 * fully dirty cache line.
	 */
	stp	    val, val, [dst, 64]
	stp	    val, val, [dst, 80]
	stp	    val, val, [dst, 96]
	stp	    val, val, [dst, 112]

	sub	    count, dstend, dst
	/*
	 * Adjust count and bias for the loop. By subtracting an extra 1 from
	 * count, it is easy to use the tbz instruction to check whether the
	 * loop tail count is less than 33 bytes, so as to bypass 2
	 * unnecessary stps.
	 */
	sub	    count, count, 128+64+64+1
	add	    dst, dst, 128
	nop

	/* DC ZVA sets 64 bytes each time. */
1:	dc	    zva, dst
	add	    dst, dst, 64
	subs	count, count, 64
	b.hs	1b

	/*
	 * Write the last 64-byte aligned block using stp to force a fully
	 * dirty cache line.
	 */
	stp	    val, val, [dst, 0]
	stp	    val, val, [dst, 16]
	stp	    val, val, [dst, 32]
	stp	    val, val, [dst, 48]

	tbz	    count, 5, 1f	/* Remaining count is less than 33 bytes? */
	stp	    val, val, [dst, 64]
	stp	    val, val, [dst, 80]
1:	stp	    val, val, [dstend, -32]
	stp	    val, val, [dstend, -16]
	ret
#endif

END (memset)
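For reference, the sketch below models the dispatch strategy of the routine above in C: small and medium sizes are written with (possibly overlapping) stores from both ends, and large fills stream aligned 16-byte blocks, with zero fills at or above the threshold additionally eligible for 64-byte cache-line zeroing via DC ZVA. The function and helper names are hypothetical and the code is an illustrative model, not a drop-in replacement for the assembly.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustrative model only; the real routine uses overlapping str/stp stores
 * and the DC ZVA instruction rather than these helpers. */
#define MODEL_DC_ZVA_THRESHOLD 512

/* Stands in for "stp val, val, [p]": stores the 8-byte pattern twice. */
static void store16(uint8_t *p, uint64_t pattern) {
    memcpy(p, &pattern, 8);
    memcpy(p + 8, &pattern, 8);
}

void *memset_model(void *dstin, int ch, size_t count) {
    uint8_t *d = (uint8_t *)dstin;
    uint64_t pattern = 0x0101010101010101ull * (uint8_t)ch; /* byte broadcast */

    if (count <= 96) {
        /* Small/medium path: the assembly covers these with a handful of
         * stores from both ends; a byte loop is the simplest correct model. */
        for (size_t i = 0; i < count; i++) {
            d[i] = (uint8_t)ch;
        }
        return dstin;
    }

    /* Long path: head store, then aligned 16-byte blocks, then an
     * overlapping tail store. Zero fills of MODEL_DC_ZVA_THRESHOLD bytes or
     * more would zero whole 64-byte cache lines (DC ZVA) in the middle. */
    store16(d, pattern);
    uint8_t *p = (uint8_t *)((uintptr_t)(d + 16) & ~(uintptr_t)15);
    uint8_t *end = d + count;
    while (p + 16 <= end) {
        store16(p, pattern);
        p += 16;
    }
    store16(end - 16, pattern);
    return dstin;
}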