| 1 | +/* |
| 2 | + * SPDX-FileCopyrightText: 2025 Espressif Systems (Shanghai) CO LTD |
| 3 | + * |
| 4 | + * SPDX-License-Identifier: Apache-2.0 |
| 5 | + */ |
| 6 | + |
| 7 | +#include "lv_macro_memcpy.S" // Memcpy macros |
| 8 | + |
| 9 | +// This is the LVGL RGB888-to-RGB888 image blend for the ESP32-S3 processor
| 10 | + |
| 11 | + .section .text |
| 12 | + .align 4 |
| 13 | + .global lv_rgb888_blend_normal_to_rgb888_esp |
| 14 | + .type lv_rgb888_blend_normal_to_rgb888_esp,@function |
| 15 | +// The function implements the following C code:
| 16 | +// void lv_rgb888_blend_normal_to_rgb888(asm_dsc_t * dsc);
| 17 | + |
| 18 | +// Input params |
| 19 | +// |
| 20 | +// dsc - a2 |
| 21 | + |
| 22 | +// typedef struct { |
| 23 | +// uint32_t opa; l32i 0 |
| 24 | +// void * dst_buf; l32i 4 |
| 25 | +// uint32_t dst_w; l32i 8 |
| 26 | +// uint32_t dst_h; l32i 12 |
| 27 | +// uint32_t dst_stride; l32i 16 |
| 28 | +// const void * src_buf; l32i 20 |
| 29 | +// uint32_t src_stride; l32i 24 |
| 30 | +// const lv_opa_t * mask_buf; l32i 28 |
| 31 | +// uint32_t mask_stride; l32i 32 |
| 32 | +// } asm_dsc_t; |
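// For reference, a minimal C sketch of what this routine implements: a row-by-row
// RGB888-to-RGB888 copy driven by the descriptor above. The helper name
// rgb888_blend_reference is illustrative only, not an LVGL or ESP-IDF API.
//
// #include <stdint.h>
// #include <string.h>
//
// static void rgb888_blend_reference(asm_dsc_t * dsc)
// {
//     uint8_t * dst = (uint8_t *) dsc->dst_buf;
//     const uint8_t * src = (const uint8_t *) dsc->src_buf;
//     const uint32_t row_bytes = 3u * dsc->dst_w;     // 3 bytes per RGB888 pixel
//
//     for (uint32_t y = 0; y < dsc->dst_h; y++) {
//         memcpy(dst, src, row_bytes);                // plain copy, no color conversion
//         dst += dsc->dst_stride;                     // strides are given in bytes
//         src += dsc->src_stride;
//     }
// }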
| 33 | + |
| 34 | +lv_rgb888_blend_normal_to_rgb888_esp: |
| 35 | + |
| 36 | + entry a1, 32 |
| 37 | + l32i.n a3, a2, 4 // a3 - dest_buff |
| 38 | + l32i.n a4, a2, 8 // a4 - dest_w in pixels
| 39 | + l32i.n a5, a2, 12 // a5 - dest_h in pixels
| 40 | + l32i.n a6, a2, 16 // a6 - dest_stride in bytes |
| 41 | + l32i.n a7, a2, 20 // a7 - src_buff |
| 42 | + l32i.n a8, a2, 24 // a8 - src_stride in bytes |
| 43 | + movi.n a10, 0xf // 0xf alignment mask (16-byte alignment) |
| 44 | + slli a11, a4, 1 // a11 = dest_w * 2
| 45 | + add a11, a11, a4 // a11 - dest_w_bytes = 3 * dest_w (3 bytes per RGB888 pixel)
| 46 | + |
| 47 | + // No color conversion is needed here; we are copying from RGB888 to RGB888
| 48 | + |
| 49 | + // Check dest_w length |
| 50 | + bltui a4, 8, _matrix_width_check // Branch if dest_w (a4) is less than 8
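// In C terms, the setup above computes the row length in bytes and dispatches on the
// width (a sketch; the 8-pixel threshold comes from the bltui check above):
//
// uint32_t dest_w_bytes = (dest_w << 1) + dest_w;     // 3 * dest_w
// if (dest_w < 8) {
//     // simple per-pixel copy, see _matrix_width_check below
// } else {
//     // vectorized path that handles any alignment, see .outer_loop_all_unalign
// }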
| 51 | + |
| 52 | +//********************************************************************************************************************** |
| 53 | + |
| 54 | + // The most general case; it can handle all possible combinations
| 55 | + |
| 56 | + // dest_buff (a3) - any alignment |
| 57 | + // src_buff (a7) - any alignment |
| 58 | + // dest_stride (a6) - any length |
| 59 | + // src_stride (a8) - any length |
| 60 | + // dest_w (a4) - any length |
| 61 | + |
| 62 | + // Convert strides to matrix paddings |
| 63 | + sub a6, a6, a11 // dest_matrix_padding (a6) = dest_stride (a6) - dest_w_bytes (a11) |
| 64 | + sub a8, a8, a11 // src_matrix_padding (a8) = src_stride (a8) - dest_w_bytes (a11) |
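// Equivalent C for the stride-to-padding conversion (a sketch): after a row has been
// copied, the pointers only need to be advanced by the padding, because the copy itself
// has already moved them by dest_w_bytes:
//
// uint32_t dest_padding = dest_stride - dest_w_bytes;
// uint32_t src_padding  = src_stride  - dest_w_bytes;
// // ... copy one row ...
// // dst += dest_padding;   // done at the end of .outer_loop_all_unalign
// // src += src_padding;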
| 65 | + |
| 66 | + .outer_loop_all_unalign: |
| 67 | + |
| 68 | + // dest_buff alignment check |
| 69 | + and a13, a10, a3 // Alignment mask 0xf (a10) AND dest_buff pointer |
| 70 | + beqz a13, _dest_buff_aligned // Branch if a13 = 0 (if dest_buff is aligned) |
| 71 | + |
| 72 | + movi.n a14, 16 // a14 = 16 |
| 73 | + sub a13, a14, a13 // a13 = 16 - unalignment |
| 74 | + |
| 75 | + // Check modulo 8 of the unalignment a13; if set, copy 8 bytes (2 and 2/3 of RGB888 pixels)
| 76 | + // src_buff a7, dest_buff a3, unalignment a13, copy registers a14, a15 |
| 77 | + macro_memcpy_mod_8 a7, a3, a13, a15, a14, __LINE__ |
| 78 | + |
| 79 | + // Check modulo 4 of the unalignment; if set, copy 4 bytes (1 and 1/3 of RGB888 pixels)
| 80 | + // src_buff a7, dest_buff a3, unalignment a13, copy register a15 |
| 81 | + macro_memcpy_mod_4 a7, a3, a13, a15, __LINE__ |
| 82 | + |
| 83 | + // Check modulo 2 of the unalignment; if set, copy 2 bytes (2/3 of an RGB888 pixel)
| 84 | + // src_buff a7, dest_buff a3, unalignment a13, copy register a15 |
| 85 | + macro_memcpy_mod_2 a7, a3, a13, a15, __LINE__ |
| 86 | + |
| 87 | + // Check modulo 1 of the unalignment; if set, copy 1 byte (1/3 of an RGB888 pixel)
| 88 | + // src_buff a7, dest_buff a3, unalignment a13, copy register a15 |
| 89 | + macro_memcpy_mod_1 a7, a3, a13, a15, __LINE__ |
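// The four macros above implement, in effect, the following C sketch, which copies just
// enough bytes for dest_buff to reach a 16-byte boundary (copy_align_head is an
// illustrative helper, not an LVGL or ESP-IDF API):
//
// #include <stdint.h>
// #include <string.h>
//
// static uint32_t copy_align_head(uint8_t ** dst, const uint8_t ** src)
// {
//     uint32_t head = (uint32_t) ((uintptr_t) *dst & 0xFu);    // bytes past the last boundary
//     if (head == 0u) return 0u;                               // already aligned
//     head = 16u - head;                                       // a13 in the code above
//     for (uint32_t chunk = 8u; chunk != 0u; chunk >>= 1) {    // 8, 4, 2, 1 bytes
//         if (head & chunk) { memcpy(*dst, *src, chunk); *dst += chunk; *src += chunk; }
//     }
//     return head;                                             // number of bytes copied
// }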
| 90 | + |
| 91 | + _dest_buff_aligned: |
| 92 | + |
| 93 | + // Calculate modulo for non-aligned data |
| 94 | + sub a11, a11, a13 // a11 = local_dest_w_bytes (a11) = dest_w_bytes (a11) - (16 - unalignment) |
| 95 | + movi a15, 48 // a15 = 48 |
| 96 | + quou a9, a11, a15 // a9 = local_dest_w_bytes (a11) DIV 48 (a15) |
| 97 | + remu a12, a11, a15 // a12 = local_dest_w_bytes (a11) remainder after div 48 (a15) |
| 98 | + |
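// In C terms (a sketch; 'head' is the byte count copied by the alignment step above,
// and the variables mirror registers a11, a9 and a12):
//
// uint32_t loop_len           = dest_w_bytes - head;   // bytes still to copy in this row
// uint32_t main_loop_count    = loop_len / 48u;        // main-loop iterations, 16 pixels each
// uint32_t loop_len_remainder = loop_len % 48u;        // 0..47 bytes, handled after the loop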
| 99 | + ee.ld.128.usar.ip q2, a7, 16 // Preload 16 bytes from src_buff a7 to q2, get value of the SAR_BYTE, increase src_buf pointer a7 by 16 |
| 100 | + ee.ld.128.usar.ip q3, a7, 16 // Preload 16 bytes from src_buff a7 to q3, get value of the SAR_BYTE, increase src_buf pointer a7 by 16 |
| 101 | + |
| 102 | + // Run main loop which copies 48 bytes (16 RGB888 pixels) in one loop run |
| 103 | + loopnez a9, ._main_loop_all_unalign // 48 bytes (16 RGB888 pixels) in one loop |
| 104 | + ee.src.q.ld.ip q4, a7, 16, q2, q3 // Load 16 bytes from src_buff a7 to q4, concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount, increase src_buf pointer a7 by 16 |
| 105 | + ee.vst.128.ip q2, a3, 16 // Store 16 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 16 |
| 106 | + ee.src.q.ld.ip q2, a7, 16, q3, q4 // Load 16 bytes from src_buff a7 to q2, concatenate q3 and q4 and shift to q3 by the SAR_BYTE amount, increase src_buf pointer a7 by 16 |
| 107 | + ee.vst.128.ip q3, a3, 16 // Store 16 bytes from q3 to aligned dest_buff a3, increase dest_buff pointer a3 by 16 |
| 108 | + ee.src.q.ld.ip q3, a7, 16, q4, q2 // Load 16 bytes from src_buff a7 to q3, concatenate q4 and q2 and shift to q4 by the SAR_BYTE amount, increase src_buf pointer a7 by 16 |
| 109 | + ee.vst.128.ip q4, a3, 16 // Store 16 bytes from q4 to aligned dest_buff a3, increase dest_buff pointer a3 by 16 |
| 110 | + ._main_loop_all_unalign: |
| 111 | + |
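// Conceptually, the ee.ld.128.usar.ip preloads plus ee.src.q implement an unaligned-source,
// aligned-destination copy: each step extracts the 16 bytes that start SAR_BYTE bytes into a
// pair of preloaded 16-byte blocks. A plain-C sketch of that extraction (extract16 is an
// illustrative helper, not an instruction wrapper from ESP-IDF):
//
// #include <stdint.h>
// #include <string.h>
//
// static void extract16(uint8_t out[16], const uint8_t lo[16],
//                       const uint8_t hi[16], uint32_t shift /* SAR_BYTE, 0..15 */)
// {
//     memcpy(out, lo + shift, 16u - shift);      // tail of the older block
//     memcpy(out + (16u - shift), hi, shift);    // head of the newer block
// }
//
// The loop above unrolls this three times, so each iteration stores 48 bytes
// (16 RGB888 pixels) while loading only one new 16-byte block per store.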
| 112 | + // Finish the main loop outside of the loop from Q registers preloads |
| 113 | + |
| 114 | + // Check modulo 32 and modulo 8 of the loop_len_remainder a12 |
| 115 | + bbci a12, 5, _all_unalign_mod_32_check // Branch if 5-th bit of loop_len_remainder a12 is clear |
| 116 | + bbsi a12, 3, _all_unalign_mod_32_mod_8_check // Branch if 3-rd bit of loop_len_remainder a12 is set
| 117 | + |
| 118 | + // Copy 32 bytes (10 and 2/3 of RGB888 pixels) |
| 119 | + ee.src.q.ld.ip q4, a7, 0, q2, q3 // Load 16 bytes from src_buff a7 to q4, concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount, don't increase src_buf pointer a7 |
| 120 | + ee.vst.128.ip q2, a3, 16 // Store 16 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 16 |
| 121 | + ee.src.q q3, q3, q4 // Concatenate q3 and q4 and shift to q3 by the SAR_BYTE amount |
| 122 | + ee.vst.128.ip q3, a3, 16 // Store 16 bytes from q3 to aligned dest_buff a3, increase dest_buff pointer a3 by 16 |
| 123 | + j _skip_mod16 |
| 124 | + |
| 125 | + _all_unalign_mod_32_mod_8_check: |
| 126 | + // Copy 40 bytes (13 and 1/3 of RGB888 pixels) |
| 127 | + ee.src.q.ld.ip q4, a7, 16, q2, q3 // Load 16 bytes from src_buff a7 to q4, concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount, increase src_buf pointer a7 by 16 |
| 128 | + ee.vst.128.ip q2, a3, 16 // Store 16 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 16 |
| 129 | + ee.src.q.ld.ip q2, a7, 0, q3, q4 // Load 16 bytes from src_buff a7 to q2, concatenate q3 and q4 and shift to q3 by the SAR_BYTE amount, don't increase src_buf pointer a7 |
| 130 | + ee.vst.128.ip q3, a3, 16 // Store 16 bytes from q3 to aligned dest_buff a3, increase dest_buff pointer a3 by 16 |
| 131 | + ee.src.q q4, q4, q2 // Concatenate q4 and q2 and shift to q4 by the SAR_BYTE amount |
| 132 | + ee.vst.l.64.ip q4, a3, 8 // Store lower 8 bytes from q4 to aligned dest_buff a3, increase dest_buff pointer a3 by 8 |
| 133 | + addi a7, a7, -8 // Correct the src_buff pointer a7, caused by q reg preload |
| 134 | + j _skip_mod16 |
| 135 | + |
| 136 | + _all_unalign_mod_32_check: |
| 137 | + |
| 138 | + // Check modulo 16 and modulo 8 of the loop_len_remainder a12 |
| 139 | + bbci a12, 4, _all_unalign_mod_16_check // Branch if 4-th bit of loop_len_remainder a12 is clear
| 140 | + bbsi a12, 3, _all_unalign_mod_16_mod_8_check // Branch if 3-rd bit of loop_len_remainder a12 is set
| 141 | + |
| 142 | + // Copy 16 bytes (5 and 1/3 of RGB888 pixels) |
| 143 | + ee.src.q q2, q2, q3 // Concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount |
| 144 | + ee.vst.128.ip q2, a3, 16 // Store 16 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 16 |
| 145 | + addi a7, a7, -16 // Correct the src_buff pointer a7, caused by q reg preload |
| 146 | + j _skip_mod16 |
| 147 | + |
| 148 | + _all_unalign_mod_16_mod_8_check: |
| 149 | + // Copy 24 bytes (8 RGB888 pixels) |
| 150 | + ee.src.q.ld.ip q4, a7, 0, q2, q3 // Load 16 bytes from src_buff a7 to q4, concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount, don't increase src_buf pointer a7 |
| 151 | + ee.vst.128.ip q2, a3, 16 // Store 16 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 16 |
| 152 | + ee.src.q q3, q3, q4 // Concatenate q3 and q4 and shift to q3 by the SAR_BYTE amount |
| 153 | + ee.vst.l.64.ip q3, a3, 8 // Store lower 8 bytes from q3 to aligned dest_buff a3, increase dest_buff pointer a3 by 8 |
| 154 | + addi a7, a7, -8 // Correct the src_buff pointer a7, caused by q reg preload |
| 155 | + j _skip_mod16 |
| 156 | + _all_unalign_mod_16_check: |
| 157 | + |
| 158 | + bbci a12, 3, _all_unalign_mod_8_check // Branch if 3-rd bit of loop_len_remainder a12 is clear |
| 159 | + // Copy 8 bytes (2 and 2/3 of RGB888 pixels) |
| 160 | + ee.src.q q2, q2, q3 // Concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount |
| 161 | + ee.vst.l.64.ip q2, a3, 8 // Store lower 8 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 8 |
| 162 | + addi a7, a7, -24 // Correct the src_buff pointer a7, caused by q reg preload |
| 163 | + j _skip_mod16 |
| 164 | + _all_unalign_mod_8_check: |
| 165 | + |
| 166 | + addi a7, a7, -32 // Correct the src_buff pointer a7, caused by q reg preload |
| 167 | + |
| 168 | + _skip_mod16: |
| 169 | + |
| 170 | + // Check modulo 4 of the loop_len_remainder; if set, copy 4 bytes (1 and 1/3 of RGB888 pixels)
| 171 | + // src_buff a7, dest_buff a3, loop_len_remainder a12, copy register a15 |
| 172 | + macro_memcpy_mod_4 a7, a3, a12, a15, __LINE__ |
| 173 | + |
| 174 | + // Check modulo 2 of the loop_len_remainder; if set, copy 2 bytes (2/3 of an RGB888 pixel)
| 175 | + // src_buff a7, dest_buff a3, loop_len_remainder a12, copy register a15 |
| 176 | + macro_memcpy_mod_2 a7, a3, a12, a15, __LINE__ |
| 177 | + |
| 178 | + // Check modulo 1 of the loop_len_remainder; if set, copy 1 byte (1/3 of an RGB888 pixel)
| 179 | + // src_buff a7, dest_buff a3, loop_len_remainder a12, copy register a15 |
| 180 | + macro_memcpy_mod_1 a7, a3, a12, a15, __LINE__
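// Taken together, the remainder handling above copies exactly loop_len_remainder more bytes.
// A C sketch of the net effect (copy_tail is illustrative; in the assembly the 8..40-byte
// cases are served from the Q registers that were preloaded for the main loop, which is why
// the src_buff pointer a7 has to be corrected):
//
// #include <stdint.h>
// #include <string.h>
//
// static void copy_tail(uint8_t * dst, const uint8_t * src, uint32_t rem /* 0..47 */)
// {
//     for (uint32_t chunk = 32u; chunk != 0u; chunk >>= 1) {    // 32, 16, 8, 4, 2, 1 bytes
//         if (rem & chunk) { memcpy(dst, src, chunk); dst += chunk; src += chunk; }
//     }
// }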
| 181 | + |
| 182 | + slli a11, a4, 1 // Refresh dest_w_bytes (a11 was reduced by the alignment handling above)
| 183 | + add a11, a11, a4 // dest_w_bytes = 3 * dest_w
| 184 | + add a3, a3, a6 // dest_buff (a3) = dest_buff (a3) + dest_matrix_padding (a6) |
| 185 | + add a7, a7, a8 // src_buff (a7) = src_buff (a7) + src_matrix_padding (a8) |
| 186 | + addi.n a5, a5, -1 // Decrease the outer loop |
| 187 | + bnez a5, .outer_loop_all_unalign |
| 188 | + |
| 189 | + movi.n a2, 1 // Return LV_RESULT_OK = 1 |
| 190 | + retw.n // Return |
| 191 | + |
| 192 | +//********************************************************************************************************************** |
| 193 | + |
| 194 | + // Small matrix width, keep it simple for widths of less than 8 pixels
| 195 | + _matrix_width_check: // Matrix width is less than 8 pixels
| 196 | + |
| 197 | + // Convert strides to matrix paddings |
| 198 | + sub a6, a6, a11 // dest_matrix_padding (a6) = dest_stride (a6) - dest_w_bytes (a11) |
| 199 | + sub a8, a8, a11 // src_matrix_padding (a8) = src_stride (a8) - dest_w_bytes (a11) |
| 200 | + |
| 201 | + .outer_loop_short_matrix_length: |
| 202 | + |
| 203 | + // Run main loop which copies 3 bytes (one RGB888 pixel) in one loop run |
| 204 | + loopnez a4, ._main_loop_short_matrix_length |
| 205 | + l8ui a15, a7, 0 // Load 8 bits from src_buff a7 to a15, offset 0 |
| 206 | + l8ui a14, a7, 1 // Load 8 bits from src_buff a7 to a14, offset 1 |
| 207 | + l8ui a13, a7, 2 // Load 8 bits from src_buff a7 to a13, offset 2 |
| 208 | + s8i a15, a3, 0 // Save 8 bits from a15 to dest_buff a3, offset 0 |
| 209 | + s8i a14, a3, 1 // Save 8 bits from a14 to dest_buff a3, offset 1 |
| 210 | + s8i a13, a3, 2 // Save 8 bits from a13 to dest_buff a3, offset 2 |
| 211 | + addi.n a7, a7, 3 // Increment src_buff pointer a7 by 3 |
| 212 | + addi.n a3, a3, 3 // Increment dest_buff pointer a3 by 3 |
| 213 | + ._main_loop_short_matrix_length: |
| 214 | + |
| 215 | + add a3, a3, a6 // dest_buff (a3) = dest_buff (a3) + dest_matrix_padding (a6) |
| 216 | + add a7, a7, a8 // src_buff (a7) = src_buff (a7) + src_matrix_padding (a8) |
| 217 | + addi.n a5, a5, -1 // Decrease the outer loop |
| 218 | + bnez a5, .outer_loop_short_matrix_length |
| 219 | + |
| 220 | + movi.n a2, 1 // Return LV_RESULT_OK = 1 |
| 221 | + retw.n // Return |
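// A C sketch of this short-matrix path (dest_w < 8 pixels): a per-pixel, 3-bytes-at-a-time
// copy with the same stride handling as the vector path (copy_small_rgb888 is illustrative,
// not an LVGL or ESP-IDF API):
//
// #include <stdint.h>
//
// static void copy_small_rgb888(uint8_t * dst, const uint8_t * src,
//                               uint32_t w, uint32_t h,
//                               uint32_t dst_stride, uint32_t src_stride)
// {
//     const uint32_t dst_pad = dst_stride - 3u * w;
//     const uint32_t src_pad = src_stride - 3u * w;
//     for (uint32_t y = 0; y < h; y++) {
//         for (uint32_t x = 0; x < w; x++) {
//             dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2];   // one RGB888 pixel
//             dst += 3; src += 3;
//         }
//         dst += dst_pad;
//         src += src_pad;
//     }
// }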