From d3a94802d8edf952ea4617eb018f46d79aafdd5c Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Thu, 3 Nov 2011 00:47:55 -0500 Subject: [PATCH] [libpng15] Added support for ARM processor (Mans Rullgard) --- ANNOUNCE | 1 + CHANGES | 1 + arm/filter_neon.S | 220 ++++++++++++++++++++++++++++++++++++++++++ pngpread.c | 2 +- pngpriv.h | 17 +++- pngread.c | 2 +- pngrutil.c | 241 ++++++++++++++++++++++++++-------------------- pngstruct.h | 3 + 8 files changed, 381 insertions(+), 106 deletions(-) create mode 100644 arm/filter_neon.S diff --git a/ANNOUNCE b/ANNOUNCE index eb771a080..11520a22b 100644 --- a/ANNOUNCE +++ b/ANNOUNCE @@ -27,6 +27,7 @@ Other information: Changes since the last public release (1.5.6): Version 1.5.7 [November 3, 2011] + Added support for ARM processor (Mans Rullgard) Send comments/corrections/commendations to png-mng-implement at lists.sf.net: (subscription required; visit diff --git a/CHANGES b/CHANGES index d3aedb480..305ffeada 100644 --- a/CHANGES +++ b/CHANGES @@ -3670,6 +3670,7 @@ Version 1.5.6 [November 3, 2011] No changes. Version 1.5.7 [November 3, 2011] + Added support for ARM processor (Mans Rullgard) Send comments/corrections/commendations to png-mng-implement at lists.sf.net (subscription required; visit diff --git a/arm/filter_neon.S b/arm/filter_neon.S new file mode 100644 index 000000000..a65ab0c13 --- /dev/null +++ b/arm/filter_neon.S @@ -0,0 +1,220 @@ + +/* filter_neon.S - NEON optimised filter functions + * + * Copyright (c) 2011 Mans Rullgard + * + * This code is released under the libpng license. + * For conditions of distribution and use, see the disclaimer + * and license in png.h + */ + +#ifdef __ELF__ +# define ELF +#else +# define ELF @ +#endif + + .arch armv7-a + .fpu neon + +.macro func name, export=0 + .macro endfunc +ELF .size \name, . - \name + .endfunc + .purgem endfunc + .endm + .text + .if \export + .global \name + .endif +ELF .type \name, STT_FUNC + .func \name +\name: +.endm + +func png_read_filter_row_sub4_neon, export=1 + ldr r3, [r0, #4] @ rowbytes + vmov.i8 d3, #0 +1: + vld4.32 {d4[],d5[],d6[],d7[]}, [r1,:128] + vadd.u8 d0, d3, d4 + vadd.u8 d1, d0, d5 + vadd.u8 d2, d1, d6 + vadd.u8 d3, d2, d7 + vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r1,:128]! + subs r3, r3, #16 + bgt 1b + + bx lr +endfunc + +func png_read_filter_row_sub3_neon, export=1 + ldr r3, [r0, #4] @ rowbytes + vmov.i8 d3, #0 + mov r0, r1 + mov r2, #3 + mov r12, #12 + vld1.8 {q11}, [r0], r12 +1: + vext.8 d5, d22, d23, #3 + vadd.u8 d0, d3, d22 + vext.8 d6, d22, d23, #6 + vadd.u8 d1, d0, d5 + vext.8 d7, d23, d23, #1 + vld1.8 {q11}, [r0], r12 + vst1.32 {d0[0]}, [r1,:32], r2 + vadd.u8 d2, d1, d6 + vst1.32 {d1[0]}, [r1], r2 + vadd.u8 d3, d2, d7 + vst1.32 {d2[0]}, [r1], r2 + vst1.32 {d3[0]}, [r1], r2 + subs r3, r3, #12 + bgt 1b + + bx lr +endfunc + +func png_read_filter_row_up_neon, export=1 + ldr r3, [r0, #4] @ rowbytes +1: + vld1.8 {q0}, [r1,:128] + vld1.8 {q1}, [r2,:128]! + vadd.u8 q0, q0, q1 + vst1.8 {q0}, [r1,:128]! + subs r3, r3, #16 + bgt 1b + + bx lr +endfunc + +func png_read_filter_row_avg4_neon, export=1 + ldr r12, [r0, #4] @ rowbytes + vmov.i8 d3, #0 +1: + vld4.32 {d4[],d5[],d6[],d7[]}, [r1,:128] + vld4.32 {d16[],d17[],d18[],d19[]},[r2,:128]! + vhadd.u8 d0, d3, d16 + vadd.u8 d0, d0, d4 + vhadd.u8 d1, d0, d17 + vadd.u8 d1, d1, d5 + vhadd.u8 d2, d1, d18 + vadd.u8 d2, d2, d6 + vhadd.u8 d3, d2, d19 + vadd.u8 d3, d3, d7 + vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r1,:128]! + subs r12, r12, #16 + bgt 1b + + bx lr +endfunc + +func png_read_filter_row_avg3_neon, export=1 + push {r4,lr} + ldr r12, [r0, #4] @ rowbytes + vmov.i8 d3, #0 + mov r0, r1 + mov r4, #3 + mov lr, #12 + vld1.8 {q11}, [r0], lr +1: + vld1.8 {q10}, [r2], lr + vext.8 d5, d22, d23, #3 + vhadd.u8 d0, d3, d20 + vext.8 d17, d20, d21, #3 + vadd.u8 d0, d0, d22 + vext.8 d6, d22, d23, #6 + vhadd.u8 d1, d0, d17 + vext.8 d18, d20, d21, #6 + vadd.u8 d1, d1, d5 + vext.8 d7, d23, d23, #1 + vld1.8 {q11}, [r0], lr + vst1.32 {d0[0]}, [r1,:32], r4 + vhadd.u8 d2, d1, d18 + vst1.32 {d1[0]}, [r1], r4 + vext.8 d19, d21, d21, #1 + vadd.u8 d2, d2, d6 + vhadd.u8 d3, d2, d19 + vst1.32 {d2[0]}, [r1], r4 + vadd.u8 d3, d3, d7 + vst1.32 {d3[0]}, [r1], r4 + subs r12, r12, #12 + bgt 1b + + pop {r4,pc} +endfunc + +.macro paeth rx, ra, rb, rc + vaddl.u8 q12, \ra, \rb @ a + b + vaddl.u8 q15, \rc, \rc @ 2*c + vabdl.u8 q13, \rb, \rc @ pa + vabdl.u8 q14, \ra, \rc @ pb + vabd.u16 q15, q12, q15 @ pc + vcle.u16 q12, q13, q14 @ pa <= pb + vcle.u16 q13, q13, q15 @ pa <= pc + vcle.u16 q14, q14, q15 @ pb <= pc + vand q12, q12, q13 @ pa <= pb && pa <= pc + vmovn.u16 d28, q14 + vmovn.u16 \rx, q12 + vbsl d28, \rb, \rc + vbsl \rx, \ra, d28 +.endm + +func png_read_filter_row_paeth4_neon, export=1 + ldr r12, [r0, #4] @ rowbytes + vmov.i8 d3, #0 + vmov.i8 d20, #0 +1: + vld4.32 {d4[],d5[],d6[],d7[]}, [r1,:128] + vld4.32 {d16[],d17[],d18[],d19[]},[r2,:128]! + paeth d0, d3, d16, d20 + vadd.u8 d0, d0, d4 + paeth d1, d0, d17, d16 + vadd.u8 d1, d1, d5 + paeth d2, d1, d18, d17 + vadd.u8 d2, d2, d6 + paeth d3, d2, d19, d18 + vmov d20, d19 + vadd.u8 d3, d3, d7 + vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r1,:128]! + subs r12, r12, #16 + bgt 1b + + bx lr +endfunc + +func png_read_filter_row_paeth3_neon, export=1 + push {r4,lr} + ldr r12, [r0, #4] @ rowbytes + vmov.i8 d3, #0 + vmov.i8 d4, #0 + mov r0, r1 + mov r4, #3 + mov lr, #12 + vld1.8 {q11}, [r0], lr +1: + vld1.8 {q10}, [r2], lr + paeth d0, d3, d20, d4 + vext.8 d5, d22, d23, #3 + vadd.u8 d0, d0, d22 + vext.8 d17, d20, d21, #3 + paeth d1, d0, d17, d20 + vst1.32 {d0[0]}, [r1,:32], r4 + vext.8 d6, d22, d23, #6 + vadd.u8 d1, d1, d5 + vext.8 d18, d20, d21, #6 + paeth d2, d1, d18, d17 + vext.8 d7, d23, d23, #1 + vld1.8 {q11}, [r0], lr + vst1.32 {d1[0]}, [r1], r4 + vadd.u8 d2, d2, d6 + vext.8 d19, d21, d21, #1 + paeth d3, d2, d19, d18 + vst1.32 {d2[0]}, [r1], r4 + vmov d4, d19 + vadd.u8 d3, d3, d7 + vst1.32 {d3[0]}, [r1], r4 + subs r12, r12, #12 + bgt 1b + + pop {r4,pc} +endfunc diff --git a/pngpread.c b/pngpread.c index 225fbdad6..56ee8643d 100644 --- a/pngpread.c +++ b/pngpread.c @@ -985,7 +985,7 @@ png_push_process_row(png_structp png_ptr) if (png_ptr->row_buf[0] > PNG_FILTER_VALUE_NONE) { if (png_ptr->row_buf[0] < PNG_FILTER_VALUE_LAST) - png_read_filter_row(&row_info, png_ptr->row_buf + 1, + png_read_filter_row(png_ptr, &row_info, png_ptr->row_buf + 1, png_ptr->prev_row + 1, png_ptr->row_buf[0]); else png_error(png_ptr, "bad adaptive filter value"); diff --git a/pngpriv.h b/pngpriv.h index ca36fe18a..d0b7180eb 100644 --- a/pngpriv.h +++ b/pngpriv.h @@ -922,9 +922,24 @@ PNG_EXTERN void png_do_write_interlace PNGARG((png_row_infop row_info, /* Unfilter a row: check the filter value before calling this, there is no point * calling it for PNG_FILTER_VALUE_NONE. */ -PNG_EXTERN void png_read_filter_row PNGARG((png_row_infop row_info, +PNG_EXTERN void png_read_filter_row PNGARG((png_structp pp, png_row_infop row_info, png_bytep row, png_const_bytep prev_row, int filter)); +PNG_EXTERN void png_read_filter_row_up_neon PNGARG((png_row_infop row_info, + png_bytep row, png_const_bytep prev_row)); +PNG_EXTERN void png_read_filter_row_sub3_neon PNGARG((png_row_infop row_info, + png_bytep row, png_const_bytep prev_row)); +PNG_EXTERN void png_read_filter_row_sub4_neon PNGARG((png_row_infop row_info, + png_bytep row, png_const_bytep prev_row)); +PNG_EXTERN void png_read_filter_row_avg3_neon PNGARG((png_row_infop row_info, + png_bytep row, png_const_bytep prev_row)); +PNG_EXTERN void png_read_filter_row_avg4_neon PNGARG((png_row_infop row_info, + png_bytep row, png_const_bytep prev_row)); +PNG_EXTERN void png_read_filter_row_paeth3_neon PNGARG((png_row_infop row_info, + png_bytep row, png_const_bytep prev_row)); +PNG_EXTERN void png_read_filter_row_paeth4_neon PNGARG((png_row_infop row_info, + png_bytep row, png_const_bytep prev_row)); + /* Choose the best filter to use and filter the row data */ PNG_EXTERN void png_write_find_filter PNGARG((png_structp png_ptr, png_row_infop row_info)); diff --git a/pngread.c b/pngread.c index 0f598fa15..1b4ae03e7 100644 --- a/pngread.c +++ b/pngread.c @@ -578,7 +578,7 @@ png_read_row(png_structp png_ptr, png_bytep row, png_bytep dsp_row) if (png_ptr->row_buf[0] > PNG_FILTER_VALUE_NONE) { if (png_ptr->row_buf[0] < PNG_FILTER_VALUE_LAST) - png_read_filter_row(&row_info, png_ptr->row_buf + 1, + png_read_filter_row(png_ptr, &row_info, png_ptr->row_buf + 1, png_ptr->prev_row + 1, png_ptr->row_buf[0]); else png_error(png_ptr, "bad adaptive filter value"); diff --git a/pngrutil.c b/pngrutil.c index 7d80d5d1c..beb947b8e 100644 --- a/pngrutil.c +++ b/pngrutil.c @@ -3498,134 +3498,169 @@ png_do_read_interlace(png_row_infop row_info, png_bytep row, int pass, } #endif /* PNG_READ_INTERLACING_SUPPORTED */ -/* 1.5.6: Changed to just take a png_row_info (not png_ptr) and to ignore bad - * adaptive filter bytes. - */ -void /* PRIVATE */ -png_read_filter_row(png_row_infop row_info, png_bytep row, - png_const_bytep prev_row, int filter) +static void +png_read_filter_row_sub(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) { - switch (filter) + png_size_t i; + png_size_t istop = row_info->rowbytes; + unsigned int bpp = (row_info->pixel_depth + 7) >> 3; + png_bytep rp = row + bpp; + png_bytep lp = row; + + PNG_UNUSED(prev_row) + + for (i = bpp; i < istop; i++) { - case PNG_FILTER_VALUE_NONE: - break; + *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff); + rp++; + } +} - case PNG_FILTER_VALUE_SUB: - { - png_size_t i; - png_size_t istop = row_info->rowbytes; - unsigned int bpp = (row_info->pixel_depth + 7) >> 3; - png_bytep rp = row + bpp; - png_bytep lp = row; +static void +png_read_filter_row_up(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + png_size_t i; + png_size_t istop = row_info->rowbytes; + png_bytep rp = row; + png_const_bytep pp = prev_row; - for (i = bpp; i < istop; i++) - { - *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff); - rp++; - } - break; - } - case PNG_FILTER_VALUE_UP: - { - png_size_t i; - png_size_t istop = row_info->rowbytes; - png_bytep rp = row; - png_const_bytep pp = prev_row; + for (i = 0; i < istop; i++) + { + *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff); + rp++; + } +} - for (i = 0; i < istop; i++) - { - *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff); - rp++; - } - break; - } - case PNG_FILTER_VALUE_AVG: - { - png_size_t i; - png_bytep rp = row; - png_const_bytep pp = prev_row; - png_bytep lp = row; - unsigned int bpp = (row_info->pixel_depth + 7) >> 3; - png_size_t istop = row_info->rowbytes - bpp; +static void +png_read_filter_row_avg(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + png_size_t i; + png_bytep rp = row; + png_const_bytep pp = prev_row; + png_bytep lp = row; + unsigned int bpp = (row_info->pixel_depth + 7) >> 3; + png_size_t istop = row_info->rowbytes - bpp; - for (i = 0; i < bpp; i++) - { - *rp = (png_byte)(((int)(*rp) + - ((int)(*pp++) / 2 )) & 0xff); + for (i = 0; i < bpp; i++) + { + *rp = (png_byte)(((int)(*rp) + + ((int)(*pp++) / 2 )) & 0xff); - rp++; - } + rp++; + } - for (i = 0; i < istop; i++) - { - *rp = (png_byte)(((int)(*rp) + - (int)(*pp++ + *lp++) / 2 ) & 0xff); + for (i = 0; i < istop; i++) + { + *rp = (png_byte)(((int)(*rp) + + (int)(*pp++ + *lp++) / 2 ) & 0xff); - rp++; - } - break; - } - case PNG_FILTER_VALUE_PAETH: - { - png_size_t i; - png_bytep rp = row; - png_const_bytep pp = prev_row; - png_bytep lp = row; - png_const_bytep cp = prev_row; - unsigned int bpp = (row_info->pixel_depth + 7) >> 3; - png_size_t istop=row_info->rowbytes - bpp; + rp++; + } +} - for (i = 0; i < bpp; i++) - { - *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff); - rp++; - } +static void +png_read_filter_row_paeth(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + png_size_t i; + png_bytep rp = row; + png_const_bytep pp = prev_row; + png_bytep lp = row; + png_const_bytep cp = prev_row; + unsigned int bpp = (row_info->pixel_depth + 7) >> 3; + png_size_t istop=row_info->rowbytes - bpp; - for (i = 0; i < istop; i++) /* Use leftover rp,pp */ - { - int a, b, c, pa, pb, pc, p; + for (i = 0; i < bpp; i++) + { + *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff); + rp++; + } - a = *lp++; - b = *pp++; - c = *cp++; + for (i = 0; i < istop; i++) /* Use leftover rp,pp */ + { + int a, b, c, pa, pb, pc, p; - p = b - c; - pc = a - c; + a = *lp++; + b = *pp++; + c = *cp++; + + p = b - c; + pc = a - c; #ifdef PNG_USE_ABS - pa = abs(p); - pb = abs(pc); - pc = abs(p + pc); + pa = abs(p); + pb = abs(pc); + pc = abs(p + pc); #else - pa = p < 0 ? -p : p; - pb = pc < 0 ? -pc : pc; - pc = (p + pc) < 0 ? -(p + pc) : p + pc; + pa = p < 0 ? -p : p; + pb = pc < 0 ? -pc : pc; + pc = (p + pc) < 0 ? -(p + pc) : p + pc; #endif - /* - if (pa <= pb && pa <= pc) - p = a; + /* + if (pa <= pb && pa <= pc) + p = a; - else if (pb <= pc) - p = b; + else if (pb <= pc) + p = b; - else - p = c; - */ + else + p = c; + */ - p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c; + p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c; - *rp = (png_byte)(((int)(*rp) + p) & 0xff); - rp++; - } - break; - } - default: - /* NOT REACHED */ - break; + *rp = (png_byte)(((int)(*rp) + p) & 0xff); + rp++; } } +#ifdef PNG_ARM_NEON +static void +png_init_filter_functions_neon(png_structp pp) +{ + unsigned int bpp = (pp->pixel_depth + 7) >> 3; + + pp->read_filter[PNG_FILTER_VALUE_UP-1] = png_read_filter_row_up_neon; + + if (bpp == 3) { + pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub3_neon; + pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg3_neon; + pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = png_read_filter_row_paeth3_neon; + } else if (bpp == 4) { + pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub4_neon; + pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg4_neon; + pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = png_read_filter_row_paeth4_neon; + } +} +#endif + +static void +png_init_filter_functions(png_structp pp) +{ + pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub; + pp->read_filter[PNG_FILTER_VALUE_UP-1] = png_read_filter_row_up; + pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg; + pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = png_read_filter_row_paeth; + +#ifdef PNG_ARM_NEON + png_init_filter_functions_neon(pp); +#endif +} + +void /* PRIVATE */ +png_read_filter_row(png_structp pp, png_row_infop row_info, png_bytep row, + png_const_bytep prev_row, int filter) +{ + if (pp->read_filter[0] == NULL) + png_init_filter_functions(pp); + if (filter > PNG_FILTER_VALUE_NONE && filter < PNG_FILTER_VALUE_LAST) + pp->read_filter[filter-1](row_info, row, prev_row); +} + #ifdef PNG_SEQUENTIAL_READ_SUPPORTED void /* PRIVATE */ png_read_finish_row(png_structp png_ptr) diff --git a/pngstruct.h b/pngstruct.h index 33f23844a..07f3a0425 100644 --- a/pngstruct.h +++ b/pngstruct.h @@ -353,5 +353,8 @@ struct png_struct_def /* New member added in libpng-1.5.6 */ png_bytep big_prev_row; + + void (*read_filter[PNG_FILTER_VALUE_LAST-1])(png_row_infop row_info, + png_bytep row, png_const_bytep prev_row); }; #endif /* PNGSTRUCT_H */