
Commit 23d3a43

Merge pull request #553 from espressif/feat/lvgl_port_simd_rgb888_memcpy
feat(lvgl_port_simd): RGB888 image blend to RGB888
2 parents: 5129470 + 2c7e658

9 files changed: +697 −20 lines changed

components/esp_lvgl_port/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -100,6 +100,7 @@ if((lvgl_ver VERSION_GREATER_EQUAL "9.1.0") AND (lvgl_ver VERSION_LESS "9.2.0"))
         set_property(TARGET ${COMPONENT_LIB} APPEND PROPERTY INTERFACE_LINK_LIBRARIES "-u lv_color_blend_to_rgb565_esp")
         set_property(TARGET ${COMPONENT_LIB} APPEND PROPERTY INTERFACE_LINK_LIBRARIES "-u lv_color_blend_to_rgb888_esp")
         set_property(TARGET ${COMPONENT_LIB} APPEND PROPERTY INTERFACE_LINK_LIBRARIES "-u lv_rgb565_blend_normal_to_rgb565_esp")
+        set_property(TARGET ${COMPONENT_LIB} APPEND PROPERTY INTERFACE_LINK_LIBRARIES "-u lv_rgb888_blend_normal_to_rgb888_esp")
     endif()
 endif()
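The `-u <symbol>` link options are what keep the hand-written routines alive: the linker is told to treat each symbol as undefined, so the object file providing it is pulled from the component's static archive even though no C code references it until LVGL is built with the custom-blend header. Without the new entry, the `lv_rgb888_blend_normal_to_rgb888_esp` object could be dropped at link time and LVGL would silently fall back to the C implementation.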

components/esp_lvgl_port/include/esp_lvgl_port_lv_blend.h

Lines changed: 25 additions & 0 deletions
@@ -42,6 +42,11 @@ extern "C" {
     _lv_rgb565_blend_normal_to_rgb565_esp(dsc)
 #endif
 
+#ifndef LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888
+#define LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888(dsc, dest_px_size, src_px_size) \
+    _lv_rgb888_blend_normal_to_rgb888_esp(dsc, dest_px_size, src_px_size)
+#endif
+
 /**********************
  *      TYPEDEFS
  **********************/
@@ -126,6 +131,26 @@ static inline lv_result_t _lv_rgb565_blend_normal_to_rgb565_esp(_lv_draw_sw_blen
     return lv_rgb565_blend_normal_to_rgb565_esp(&asm_dsc);
 }
 
+extern int lv_rgb888_blend_normal_to_rgb888_esp(asm_dsc_t *asm_dsc);
+
+static inline lv_result_t _lv_rgb888_blend_normal_to_rgb888_esp(_lv_draw_sw_blend_image_dsc_t *dsc, uint32_t dest_px_size, uint32_t src_px_size)
+{
+    if (!(dest_px_size == 3 && src_px_size == 3)) {
+        return LV_RESULT_INVALID;
+    }
+
+    asm_dsc_t asm_dsc = {
+        .dst_buf = dsc->dest_buf,
+        .dst_w = dsc->dest_w,
+        .dst_h = dsc->dest_h,
+        .dst_stride = dsc->dest_stride,
+        .src_buf = dsc->src_buf,
+        .src_stride = dsc->src_stride
+    };
+
+    return lv_rgb888_blend_normal_to_rgb888_esp(&asm_dsc);
+}
+
 #endif // CONFIG_LV_DRAW_SW_ASM_CUSTOM
 
 #ifdef __cplusplus
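This macro is the hook LVGL's software renderer uses: when `CONFIG_LV_DRAW_SW_ASM_CUSTOM` is enabled, `lv_draw_sw_blend_to_rgb888.c` invokes it for the no-mask, full-opacity special case and falls back to its generic C loop whenever the wrapper returns `LV_RESULT_INVALID` (for example when either buffer is 4-byte XRGB8888 rather than packed 3-byte RGB888). A minimal sketch of that call-site pattern (simplified; the row-copy body is illustrative, not LVGL's exact code):

    /* Sketch of LVGL's dispatch for the RGB888 special case; names follow
     * the blend source shown further below in this commit. */
    if (mask_buf == NULL && opa >= LV_OPA_MAX) {
        if (LV_RESULT_INVALID == LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888(dsc, dest_px_size, src_px_size)) {
            /* The assembly refused the job: copy row by row in C instead. */
            for (int32_t y = 0; y < h; y++) {
                lv_memcpy(dest_buf, src_buf, (size_t)w * dest_px_size);
                dest_buf = (uint8_t *)dest_buf + dsc->dest_stride;
                src_buf  = (const uint8_t *)src_buf + dsc->src_stride;
            }
        }
    }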

components/esp_lvgl_port/src/lvgl9/simd/lv_rgb888_blend_normal_to_rgb888_esp32.S

Lines changed: 261 additions & 0 deletions
Large diffs are not rendered by default.
components/esp_lvgl_port/src/lvgl9/simd/lv_rgb888_blend_normal_to_rgb888_esp32s3.S

Lines changed: 221 additions & 0 deletions
@@ -0,0 +1,221 @@
/*
 * SPDX-FileCopyrightText: 2025 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

#include "lv_macro_memcpy.S"            // Memcpy macros

// This is LVGL RGB888 image blend to RGB888 for the ESP32-S3 processor

    .section .text
    .align  4
    .global lv_rgb888_blend_normal_to_rgb888_esp
    .type   lv_rgb888_blend_normal_to_rgb888_esp,@function
// The function implements the following C prototype:
// int lv_rgb888_blend_normal_to_rgb888_esp(asm_dsc_t *asm_dsc);

// Input params
//
// dsc - a2

// typedef struct {
//     uint32_t opa;                 l32i    0
//     void *dst_buf;                l32i    4
//     uint32_t dst_w;               l32i    8
//     uint32_t dst_h;               l32i    12
//     uint32_t dst_stride;          l32i    16
//     const void *src_buf;          l32i    20
//     uint32_t src_stride;          l32i    24
//     const lv_opa_t *mask_buf;     l32i    28
//     uint32_t mask_stride;         l32i    32
// } asm_dsc_t;

lv_rgb888_blend_normal_to_rgb888_esp:

    entry    a1,    32
    l32i.n   a3,    a2,    4                    // a3 - dest_buff
    l32i.n   a4,    a2,    8                    // a4 - dest_w in pixels
    l32i.n   a5,    a2,    12                   // a5 - dest_h in pixels
    l32i.n   a6,    a2,    16                   // a6 - dest_stride in bytes
    l32i.n   a7,    a2,    20                   // a7 - src_buff
    l32i.n   a8,    a2,    24                   // a8 - src_stride in bytes
    movi.n   a10,   0xf                         // 0xf alignment mask (16-byte alignment)
    slli     a11,   a4,    1                    // a11 = dest_w * 2
    add      a11,   a11,   a4                   // a11 - dest_w_bytes = 3 * dest_w (RGB888 is 3 bytes per pixel)

    // No need to convert any colors here, we are copying from RGB888 to RGB888

    // Check dest_w length
    bltui    a4,    8,     _matrix_width_check  // Branch if dest_w (a4) is less than 8

//**********************************************************************************************************************

    // The most general case, can handle all the possible combinations

    // dest_buff (a3) - any alignment
    // src_buff (a7) - any alignment
    // dest_stride (a6) - any length
    // src_stride (a8) - any length
    // dest_w (a4) - any length

    // Convert strides to matrix paddings
    sub      a6,    a6,    a11                  // dest_matrix_padding (a6) = dest_stride (a6) - dest_w_bytes (a11)
    sub      a8,    a8,    a11                  // src_matrix_padding (a8) = src_stride (a8) - dest_w_bytes (a11)

.outer_loop_all_unalign:

    // dest_buff alignment check
    and      a13,   a10,   a3                   // Alignment mask 0xf (a10) AND dest_buff pointer
    beqz     a13,   _dest_buff_aligned          // Branch if a13 = 0 (dest_buff is aligned)

    movi.n   a14,   16                          // a14 = 16
    sub      a13,   a14,   a13                  // a13 = 16 - unalignment

    // Check modulo 8 of the unalignment a13; if set, copy 8 bytes (2 and 2/3 RGB888 pixels)
    // src_buff a7, dest_buff a3, unalignment a13, copy registers a14, a15
    macro_memcpy_mod_8 a7, a3, a13, a15, a14, __LINE__

    // Check modulo 4 of the unalignment; if set, copy 4 bytes (1 and 1/3 RGB888 pixels)
    // src_buff a7, dest_buff a3, unalignment a13, copy register a15
    macro_memcpy_mod_4 a7, a3, a13, a15, __LINE__

    // Check modulo 2 of the unalignment; if set, copy 2 bytes (2/3 of an RGB888 pixel)
    // src_buff a7, dest_buff a3, unalignment a13, copy register a15
    macro_memcpy_mod_2 a7, a3, a13, a15, __LINE__

    // Check modulo 1 of the unalignment; if set, copy 1 byte (1/3 of an RGB888 pixel)
    // src_buff a7, dest_buff a3, unalignment a13, copy register a15
    macro_memcpy_mod_1 a7, a3, a13, a15, __LINE__

_dest_buff_aligned:

    // Calculate modulo for non-aligned data
    sub      a11,   a11,   a13                  // a11 = local_dest_w_bytes (a11) = dest_w_bytes (a11) - (16 - unalignment)
    movi     a15,   48                          // a15 = 48
    quou     a9,    a11,   a15                  // a9 = local_dest_w_bytes (a11) DIV 48 (a15)
    remu     a12,   a11,   a15                  // a12 = local_dest_w_bytes (a11) remainder after DIV 48 (a15)

    ee.ld.128.usar.ip   q2,  a7,  16            // Preload 16 bytes from src_buff a7 to q2, get the SAR_BYTE value, increase src_buff pointer a7 by 16
    ee.ld.128.usar.ip   q3,  a7,  16            // Preload 16 bytes from src_buff a7 to q3, get the SAR_BYTE value, increase src_buff pointer a7 by 16

    // Run the main loop, which copies 48 bytes (16 RGB888 pixels) per iteration
    loopnez  a9,    ._main_loop_all_unalign
        ee.src.q.ld.ip  q4,  a7,  16,  q2,  q3  // Load 16 bytes from src_buff a7 to q4, concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount, increase src_buff pointer a7 by 16
        ee.vst.128.ip   q2,  a3,  16            // Store 16 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
        ee.src.q.ld.ip  q2,  a7,  16,  q3,  q4  // Load 16 bytes from src_buff a7 to q2, concatenate q3 and q4 and shift to q3 by the SAR_BYTE amount, increase src_buff pointer a7 by 16
        ee.vst.128.ip   q3,  a3,  16            // Store 16 bytes from q3 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
        ee.src.q.ld.ip  q3,  a7,  16,  q4,  q2  // Load 16 bytes from src_buff a7 to q3, concatenate q4 and q2 and shift to q4 by the SAR_BYTE amount, increase src_buff pointer a7 by 16
        ee.vst.128.ip   q4,  a3,  16            // Store 16 bytes from q4 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
    ._main_loop_all_unalign:

    // Finish the main loop outside of the loop body, using the preloaded Q registers

    // Check modulo 32 and modulo 8 of the loop_len_remainder a12
    bbci     a12,   5,     _all_unalign_mod_32_check       // Branch if the 5th bit of loop_len_remainder a12 is clear
    bbsi     a12,   3,     _all_unalign_mod_32_mod_8_check // Branch if the 3rd bit of loop_len_remainder a12 is set

    // Copy 32 bytes (10 and 2/3 RGB888 pixels)
    ee.src.q.ld.ip  q4,  a7,  0,  q2,  q3       // Load 16 bytes from src_buff a7 to q4, concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount, don't increase src_buff pointer a7
    ee.vst.128.ip   q2,  a3,  16                // Store 16 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
    ee.src.q        q3,  q3,  q4                // Concatenate q3 and q4 and shift to q3 by the SAR_BYTE amount
    ee.vst.128.ip   q3,  a3,  16                // Store 16 bytes from q3 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
    j        _skip_mod16

_all_unalign_mod_32_mod_8_check:
    // Copy 40 bytes (13 and 1/3 RGB888 pixels)
    ee.src.q.ld.ip  q4,  a7,  16,  q2,  q3      // Load 16 bytes from src_buff a7 to q4, concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount, increase src_buff pointer a7 by 16
    ee.vst.128.ip   q2,  a3,  16                // Store 16 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
    ee.src.q.ld.ip  q2,  a7,  0,  q3,  q4       // Load 16 bytes from src_buff a7 to q2, concatenate q3 and q4 and shift to q3 by the SAR_BYTE amount, don't increase src_buff pointer a7
    ee.vst.128.ip   q3,  a3,  16                // Store 16 bytes from q3 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
    ee.src.q        q4,  q4,  q2                // Concatenate q4 and q2 and shift to q4 by the SAR_BYTE amount
    ee.vst.l.64.ip  q4,  a3,  8                 // Store the lower 8 bytes from q4 to aligned dest_buff a3, increase dest_buff pointer a3 by 8
    addi     a7,    a7,    -8                   // Correct the src_buff pointer a7, caused by the Q register preload
    j        _skip_mod16

_all_unalign_mod_32_check:

    // Check modulo 16 and modulo 8 of the loop_len_remainder a12
    bbci     a12,   4,     _all_unalign_mod_16_check       // Branch if the 4th bit of loop_len_remainder a12 is clear
    bbsi     a12,   3,     _all_unalign_mod_16_mod_8_check // Branch if the 3rd bit of loop_len_remainder a12 is set

    // Copy 16 bytes (5 and 1/3 RGB888 pixels)
    ee.src.q        q2,  q2,  q3                // Concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount
    ee.vst.128.ip   q2,  a3,  16                // Store 16 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
    addi     a7,    a7,    -16                  // Correct the src_buff pointer a7, caused by the Q register preload
    j        _skip_mod16

_all_unalign_mod_16_mod_8_check:
    // Copy 24 bytes (8 RGB888 pixels)
    ee.src.q.ld.ip  q4,  a7,  0,  q2,  q3       // Load 16 bytes from src_buff a7 to q4, concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount, don't increase src_buff pointer a7
    ee.vst.128.ip   q2,  a3,  16                // Store 16 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
    ee.src.q        q3,  q3,  q4                // Concatenate q3 and q4 and shift to q3 by the SAR_BYTE amount
    ee.vst.l.64.ip  q3,  a3,  8                 // Store the lower 8 bytes from q3 to aligned dest_buff a3, increase dest_buff pointer a3 by 8
    addi     a7,    a7,    -8                   // Correct the src_buff pointer a7, caused by the Q register preload
    j        _skip_mod16

_all_unalign_mod_16_check:

    bbci     a12,   3,     _all_unalign_mod_8_check        // Branch if the 3rd bit of loop_len_remainder a12 is clear
    // Copy 8 bytes (2 and 2/3 RGB888 pixels)
    ee.src.q        q2,  q2,  q3                // Concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount
    ee.vst.l.64.ip  q2,  a3,  8                 // Store the lower 8 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 8
    addi     a7,    a7,    -24                  // Correct the src_buff pointer a7, caused by the Q register preload
    j        _skip_mod16

_all_unalign_mod_8_check:

    addi     a7,    a7,    -32                  // Correct the src_buff pointer a7, caused by the Q register preload

_skip_mod16:

    // Check modulo 4 of the loop_len_remainder; if set, copy 4 bytes (1 and 1/3 RGB888 pixels)
    // src_buff a7, dest_buff a3, loop_len_remainder a12, copy register a15
    macro_memcpy_mod_4 a7, a3, a12, a15, __LINE__

    // Check modulo 2 of the loop_len_remainder; if set, copy 2 bytes (2/3 of an RGB888 pixel)
    // src_buff a7, dest_buff a3, loop_len_remainder a12, copy register a15
    macro_memcpy_mod_2 a7, a3, a12, a15, __LINE__

    // Check modulo 1 of the loop_len_remainder; if set, copy 1 byte (1/3 of an RGB888 pixel)
    // src_buff a7, dest_buff a3, loop_len_remainder a12, copy register a15
    macro_memcpy_mod_1 a7, a3, a12, a15, __LINE__

    slli     a11,   a4,    1                    // Refresh dest_w_bytes: a11 = dest_w * 2
    add      a11,   a11,   a4                   // a11 = dest_w * 3
    add      a3,    a3,    a6                   // dest_buff (a3) = dest_buff (a3) + dest_matrix_padding (a6)
    add      a7,    a7,    a8                   // src_buff (a7) = src_buff (a7) + src_matrix_padding (a8)
    addi.n   a5,    a5,    -1                   // Decrease the outer loop counter
    bnez     a5,    .outer_loop_all_unalign

    movi.n   a2,    1                           // Return LV_RESULT_OK = 1
    retw.n                                      // Return

//**********************************************************************************************************************

    // Small matrix width, keep it simple for widths of less than 8 pixels
_matrix_width_check:                            // Matrix width is less than 8 pixels

    // Convert strides to matrix paddings
    sub      a6,    a6,    a11                  // dest_matrix_padding (a6) = dest_stride (a6) - dest_w_bytes (a11)
    sub      a8,    a8,    a11                  // src_matrix_padding (a8) = src_stride (a8) - dest_w_bytes (a11)

.outer_loop_short_matrix_length:

    // Run the main loop, which copies 3 bytes (one RGB888 pixel) per iteration
    loopnez  a4,    ._main_loop_short_matrix_length
        l8ui     a15,   a7,    0                // Load 8 bits from src_buff a7 to a15, offset 0
        l8ui     a14,   a7,    1                // Load 8 bits from src_buff a7 to a14, offset 1
        l8ui     a13,   a7,    2                // Load 8 bits from src_buff a7 to a13, offset 2
        s8i      a15,   a3,    0                // Store 8 bits from a15 to dest_buff a3, offset 0
        s8i      a14,   a3,    1                // Store 8 bits from a14 to dest_buff a3, offset 1
        s8i      a13,   a3,    2                // Store 8 bits from a13 to dest_buff a3, offset 2
        addi.n   a7,    a7,    3                // Increment src_buff pointer a7 by 3
        addi.n   a3,    a3,    3                // Increment dest_buff pointer a3 by 3
    ._main_loop_short_matrix_length:

    add      a3,    a3,    a6                   // dest_buff (a3) = dest_buff (a3) + dest_matrix_padding (a6)
    add      a7,    a7,    a8                   // src_buff (a7) = src_buff (a7) + src_matrix_padding (a8)
    addi.n   a5,    a5,    -1                   // Decrease the outer loop counter
    bnez     a5,    .outer_loop_short_matrix_length

    movi.n   a2,    1                           // Return LV_RESULT_OK = 1
    retw.n                                      // Return
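The structure above is the standard ESP32-S3 unaligned-copy idiom: a byte-wise prologue brings dest_buff to a 16-byte boundary, ee.ld.128.usar.ip preloads two 128-bit chunks while latching the source misalignment into SAR_BYTE, and the 48-byte main loop rotates three Q registers through ee.src.q.ld.ip / ee.vst.128.ip so that every store is aligned. Functionally, the whole routine is a stride-aware RGB888 matrix copy; a plain C equivalent (a reference sketch, not part of this commit) would be:

    #include <stdint.h>
    #include <string.h>

    /* Reference behaviour of lv_rgb888_blend_normal_to_rgb888_esp: copy
     * dst_h rows of dst_w packed 3-byte pixels; strides are in bytes and
     * may exceed 3 * dst_w (the difference is the matrix padding). */
    static int rgb888_copy_ref(uint8_t *dst_buf, uint32_t dst_w, uint32_t dst_h,
                               uint32_t dst_stride,
                               const uint8_t *src_buf, uint32_t src_stride)
    {
        const uint32_t row_bytes = 3u * dst_w;
        for (uint32_t y = 0; y < dst_h; y++) {
            memcpy(dst_buf, src_buf, row_bytes);  /* the asm vectorizes this */
            dst_buf += dst_stride;                /* advance by the full stride */
            src_buf += src_stride;
        }
        return 1;                                 /* LV_RESULT_OK */
    }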

components/esp_lvgl_port/test_apps/simd/README.md

Lines changed: 2 additions & 0 deletions
@@ -23,6 +23,8 @@ Assembly source files could be found in the [`lvgl_port`](../../src/lvgl9/simd/)
 | :----------- | :---------- | :--------------- | :------------- | :------------- |
 | RGB565       | 128x128     | 16 byte          | 0.352          | 3.437          |
 |              | 127x128     | 1 byte           | 0.866          | 5.978          |
+| RGB888       | 128x128     | 16 byte          | 0.744          | 4.002          |
+|              | 127x128     | 1 byte           | 1.002          | 7.998          |
 * this data was obtained by running [benchmark tests](#benchmark-test) on a 128x128 16-byte aligned matrix (ideal case) and a 127x128 1-byte aligned matrix (worst case)
 * the values represent cycles per sample needed to perform a memory copy between two matrices on the esp32s3
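Here, cycles per sample means total CPU cycles divided by the number of pixels copied. A hedged sketch of such a measurement using ESP-IDF's esp_cpu_get_cycle_count() (the actual benchmark app may be structured differently):

    #include "esp_cpu.h"

    /* Hypothetical helper: cycles per pixel for one call of the assembly
     * routine declared in esp_lvgl_port_lv_blend.h. */
    static float cycles_per_px(asm_dsc_t *dsc)
    {
        const uint32_t start = esp_cpu_get_cycle_count();
        lv_rgb888_blend_normal_to_rgb888_esp(dsc);
        const uint32_t elapsed = esp_cpu_get_cycle_count() - start;
        return (float)elapsed / (float)(dsc->dst_w * dsc->dst_h);
    }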

components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_draw_sw_blend_to_rgb888.c

Lines changed: 3 additions & 1 deletion
@@ -712,7 +712,9 @@ static void LV_ATTRIBUTE_FAST_MEM rgb888_image_blend(_lv_draw_sw_blend_image_dsc
     if (dsc->blend_mode == LV_BLEND_MODE_NORMAL) {
         /*Special case*/
         if (mask_buf == NULL && opa >= LV_OPA_MAX) {
-            if (LV_RESULT_INVALID == LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888(dsc, dest_px_size, src_px_size)) {
+            if (dsc->use_asm && dest_px_size == 3 && src_px_size == 3) {
+                LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888(dsc, dest_px_size, src_px_size);
+            } else {
                 if (src_px_size == dest_px_size) {
                     for (y = 0; y < h; y++) {
                         lv_memcpy(dest_buf, src_buf, w);
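Note the behavioural change here: the previous code always called the macro and relied on its LV_RESULT_INVALID return value to fall through to the C path, while the new code gates the assembly on `dsc->use_asm` and the packed 3-byte pixel sizes up front. For the benchmark app this makes the two paths deterministic, so the with/without-assembly numbers in the README above compare pure implementations rather than a mix.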
