diff --git a/ANNOUNCE b/ANNOUNCE index d98869175..dcdd13593 100644 --- a/ANNOUNCE +++ b/ANNOUNCE @@ -1,10 +1,12 @@ -Libpng 1.0.4 - September 19, 1999 +Libpng 1.0.4c - October 1, 1999 -This is a public release of libpng, intended for use in production codes. +This is not intended to be a public release. It will be replaced +within a few weeks by a public version or by another test version. Changes since the last public release (1.0.3): +version 1.0.3a [August 12, 1999] Added check for PNG_READ_INTERLACE_SUPPORTED in pngread.c; issue a warning if an attempt is made to read an interlaced image when it's not supported. Added check if png_ptr->trans is defined before free'ing it in pngread.c @@ -32,25 +34,50 @@ Changes since the last public release (1.0.3): consistent with PNG-1.2, and allow variance of 500 before complaining. Added assembler code contributed by Intel in file pngvcrd.c and modified makefile.w32 to use it (Nirav Chhatrapati, INTEL Corporation, Gilles Vollant) - Define PNG_USE_PNGVCRD in makefile.w32, to get MMX assembler code. Changed "ln -s -f" to "ln -f -s" in the makefiles to make Solaris happy. + Added some aliases for png_set_expand() in pngrtran.c, namely + png_set_expand_PLTE(), png_set_expand_depth(), and png_set_expand_tRNS() + (Greg Roelofs, in "PNG: The Definitive Guide"). Added makefile.beo for BEOS on X86, contributed by Sander Stok. +version 1.0.3b [August 26, 1999] Replaced 2147483647L several places with PNG_MAX_UINT macro, defined in png.h Changed leading blanks to tabs in all makefiles. + Define PNG_USE_PNGVCRD in makefile.w32, to get MMX assembler code. Made alternate versions of png_set_expand() in pngrtran.c, namely png_set_gray_1_2_4_to_8, png_set_palette_to_rgb, and png_set_tRNS_to_alpha - (Greg Roelofs, in "PNG: The Definitive Guide"). + (Greg Roelofs, in "PNG: The Definitive Guide"). Deleted the 1.0.3a aliases. 
Relocated start of 'extern "C"' block in png.h so it doesn't include pngconf.h Revised calculation of num_blocks in pngmem.c to avoid a potentially negative shift distance, whose results are undefined in the C language. Added a check in pngset.c to prevent writing multiple tIME chunks. Added a check in pngwrite.c to detect invalid small window_bits sizes. +version 1.0.3d [September 4, 1999] + Fixed type casting of igamma in pngrutil.c + Added new png_expand functions to scripts/pngdef.pas and pngos2.def Added a demo read_user_transform_fn that examines the row filters in pngtest.c +version 1.0.4 [September 24, 1999] Define PNG_ALWAYS_EXTERN in pngconf.h if __STDC__ is defined + Delete #define PNG_INTERNAL and include "png.h" from pngasmrd.h Made several minor corrections to pngtest.c Changed "hptr += 16L" to "hptr = hptr + 16L" in pngmem.c for Turbo 3.0 Renamed the makefiles with longer but more user friendly extensions. Copied the PNG copyright and license to a separate LICENSE file. + Revised documentation, png.h, and example.c to remove reference to + "viewing_gamma" which no longer appears in the PNG specification. + Revised pngvcrd.c to use MMX code for interlacing only on the final pass. + Updated pngvcrd.c to use the faster C filter algorithms from libpng-1.0.1a + Split makefile.win32vc into two versions, makefile.vcawin32 (uses MMX + assembler code) and makefile.vcwin32 (doesn't). + Added a CPU timing report to pngtest.c (enabled by defining PNGTEST_TIMING) +version 1.0.4a [September 25, 1999] + Increase max_pixel_depth in pngrutil.c if a user transform needs it. 
+ Changed several division operations to right-shifts in pngvcrd.c +version 1.0.4b [September 30, 1999] + Added parentheses in line 3732 of pngvcrd.c + Added a comment in makefile.linux warning about buggy -O3 in pgcc 2.95.1 +version 1.0.4c [October 1, 1999] + Added a "png_check_version" function in png.c and pngtest.c that will generate + a helpful compiler error if an old png.h is found in the search path. Send comments/corrections/commendations to png-implement@ccrc.wustl.edu or to randeg@alum.rpi.edu diff --git a/CHANGES b/CHANGES index 7135912ec..ee7c17f45 100644 --- a/CHANGES +++ b/CHANGES @@ -436,9 +436,25 @@ version 1.0.3d [September 4, 1999] Fixed type casting of igamma in pngrutil.c Added new png_expand functions to scripts/pngdef.pas and pngos2.def Added a demo read_user_transform_fn that examines the row filters in pngtest.c -version 1.0.4 [September 19, 1999] +version 1.0.4 [September 24, 1999] Define PNG_ALWAYS_EXTERN in pngconf.h if __STDC__ is defined Delete #define PNG_INTERNAL and include "png.h" from pngasmrd.h Made several minor corrections to pngtest.c Renamed the makefiles with longer but more user friendly extensions. Copied the PNG copyright and license to a separate LICENSE file. + Revised documentation, png.h, and example.c to remove reference to + "viewing_gamma" which no longer appears in the PNG specification. + Revised pngvcrd.c to use MMX code for interlacing only on the final pass. + Updated pngvcrd.c to use the faster C filter algorithms from libpng-1.0.1a + Split makefile.win32vc into two versions, makefile.vcawin32 (uses MMX + assembler code) and makefile.vcwin32 (doesn't). + Added a CPU timing report to pngtest.c (enabled by defining PNGTEST_TIMING) +version 1.0.4a [September 25, 1999] + Increase max_pixel_depth in pngrutil.c if a user transform needs it. 
+ Changed several division operations to right-shifts in pngvcrd.c +version 1.0.4b [September 30, 1999] + Added parentheses in line 3732 of pngvcrd.c + Added a comment in makefile.linux warning about buggy -O3 in pgcc 2.95.1 +version 1.0.4c [October 1, 1999] + Added a "png_check_version" function in png.c and pngtest.c that will generate + a helpful compiler error if an old png.h is found in the search path. diff --git a/INSTALL b/INSTALL index 1cd8b6323..475a8d477 100644 --- a/INSTALL +++ b/INSTALL @@ -1,5 +1,5 @@ -Installing libpng version 1.0.4 - September 19, 1999 +Installing libpng version 1.0.4c - October 1, 1999 Before installing libpng, you must first install zlib. zlib can usually be found wherever you got libpng. zlib can be @@ -10,7 +10,7 @@ zlib.h and zconf.h include files that correspond to the version of zlib that's installed. You can rename the directories that you downloaded (they -might be called "libpng-1.0.4" or "lpng103" and "zlib-1.1.3" +might be called "libpng-1.0.4c" or "lpng103" and "zlib-1.1.3" or "zlib113") so that you have directories called "zlib" and "libpng". 
Your directory structure should look like this: @@ -47,8 +47,8 @@ include makefile.hpux => HPUX (10.20 and 11.00) makefile makefile.sgi => Silicon Graphics IRIX makefile makefile.sunos => Sun makefile - makefile.solaris => Solaris 2.X makefile (gcc, creates libpng.so.2.1.0.4) - makefile.linux => Linux/ELF makefile (gcc, creates libpng.so.2.1.0.4) + makefile.solaris => Solaris 2.X makefile (gcc, creates libpng.so.2.1.0.4c) + makefile.linux => Linux/ELF makefile (gcc, creates libpng.so.2.1.0.4c) makefile.sco => For SCO OSr5 ELF and Unixware 7 with Native cc makefile.mips => MIPS makefile makefile.acorn => Acorn makefile @@ -61,7 +61,10 @@ include build.bat => MS-DOS batch file for Borland compiler makefile.dj2 => DJGPP 2 makefile makefile.msc => Microsoft C makefile - makefile.win32vc => makefile for Microsoft Visual C++ 4.0 and later + makefile.vcawin32 => makefile for Microsoft Visual C++ 5.0 and later (uses + assembler code) + makefile.vcwin32 => makefile for Microsoft Visual C++ 4.0 and later (does not + use assembler code) makefile.turboc3 => Turbo C 3.0 makefile makefile.os2 => OS/2 Makefile (gcc and emx, requires pngos2.def) pngos2.def => OS/2 module definition file used by makefile.os2 diff --git a/KNOWNBUG b/KNOWNBUG index 7c640a8ab..1c5629dcd 100644 --- a/KNOWNBUG +++ b/KNOWNBUG @@ -14,12 +14,12 @@ Known bugs and suggested enhancements in libpng-1.0.4 Question whether i-- or --i is better. STATUS: Under investigation, postponed until after - libpng-1.0.4. About 160 loops will be turned around + libpng-1.0.5. About 160 loops will be turned around in libpng-1.0.Nn, for testing. 2. July 4, 1998 -- ENHANCEMENT -- Glenn R-P - libpng-1.0.4 and earlier transform colors to gamma=1.0 space for + libpng-1.0.5 and earlier transform colors to gamma=1.0 space for merging with background, and then back to the image's gamma. The bit_depth of the intermediate (gamma=1.0) representation is probably not sufficient. 
In the typical gamma=1/2.2 situation, the linear @@ -34,7 +34,7 @@ Known bugs and suggested enhancements in libpng-1.0.4 It should be possible to use libpng without floating-point aritmetic. STATUS: Under investigation, implementation postponed until after - libpng-1.0.4. The application interface will change because replacements + libpng-1.0.5. The application interface will change because replacements for the png_set_gAMA(), png_set_cHRM(), and corresponding png_get_() functions will be needed. diff --git a/LICENSE b/LICENSE index 92dd0eaef..22fc3f20b 100644 --- a/LICENSE +++ b/LICENSE @@ -5,7 +5,25 @@ Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc. Copyright (c) 1996, 1997 Andreas Dilger (libpng versions 0.90, December 1996, through 0.96, May 1997) Copyright (c) 1998, 1999 Glenn Randers-Pehrson -(libpng versions 0.97, January 1998, through 1.0.4, September 19, 1999) +(libpng versions 0.97, January 1998, through 1.0.4c, October 1, 1999) + +For the purposes of this copyright and license, "Contributing Authors" +is defined as the following set of individuals: + + John Bowler + Kevin Bracey + Sam Bushell + Andreas Dilger + Magnus Holmgren + Tom Lane + Dave Martindale + Glenn Randers-Pehrson + Greg Roelofs + Guy Eric Schalnat + Paul Schmidt + Tom Tanner + Willem van Schaik + Tim Wegner The PNG Reference Library is supplied "AS IS". The Contributing Authors and Group 42, Inc. disclaim all warranties, expressed or implied, @@ -37,5 +55,5 @@ source code in a product, acknowledgment is not required but would be appreciated. 
Glenn Randers-Pehrson -randeg at alum.rpi.edu -September 19, 1999 +randeg@alum.rpi.edu +October 1, 1999 diff --git a/README b/README index ae4f72858..e19f30608 100644 --- a/README +++ b/README @@ -1,4 +1,4 @@ -README for libpng 1.0.4 - September 19, 1999 (shared library 2.1) +README for libpng 1.0.4c - October 1, 1999 (shared library 2.1) See the note about version numbers near the top of png.h See INSTALL for instructions on how to install libpng. @@ -163,9 +163,9 @@ Files in this distribution: makefile.sgi => Silicon Graphics IRIX makefile makefile.sunos => Sun makefile makefile.solaris => Solaris 2.X makefile - (gcc, creates libpng.so.2.1.0.4) + (gcc, creates libpng.so.2.1.0.4c) makefile.linux => Linux/ELF makefile - (gcc, creates libpng.so.2.1.0.4) + (gcc, creates libpng.so.2.1.0.4c) makefile.sco => For SCO OSr5 ELF and Unixware 7 with Native cc makefile.mips => MIPS makefile makefile.acorn => Acorn makefile @@ -179,7 +179,10 @@ Files in this distribution: build.bat => MS-DOS batch file for Borland compiler makefile.dj2 => DJGPP 2 makefile makefile.msc => Microsoft C makefile - makefile.win32vc => makefile for Microsoft Visual C++ 4.0 and later + makefile.vcawin32 => makefile for Microsoft Visual C++ 5.0 and + later (uses assembler code) + makefile.vcwin32 => makefile for Microsoft Visual C++ 4.0 and + later (does not use assembler code) makefile.turboc3 => Turbo C 3.0 makefile makefile.os2 => OS/2 Makefile (gcc and emx, requires pngos2.def) pngos2.def => OS/2 module definition file used by makefile.os2 diff --git a/Y2KINFO b/Y2KINFO index bdd22447e..103579ad5 100644 --- a/Y2KINFO +++ b/Y2KINFO @@ -1,13 +1,13 @@ Y2K compliance in libpng: ========================= - September 19, 1999 + October 1, 1999 Since the PNG Development group is an ad-hoc body, we can't make an official declaration. This is your unofficial assurance that libpng from version 0.71 and - upward through 1.0.4 are Y2K compliant. 
It is my belief that earlier + upward through 1.0.4c are Y2K compliant. It is my belief that earlier versions were also Y2K compliant. Libpng only has three year fields. One is a 2-byte unsigned integer diff --git a/example.c b/example.c index a83ea48b5..52afbdfce 100644 --- a/example.c +++ b/example.c @@ -197,7 +197,8 @@ void read_png(FILE *fp, unsigned int sig_read) /* file is already open */ /* Some suggestions as to how to get a screen gamma value */ - /* Note that screen gamma is (display_gamma/viewing_gamma) */ + /* Note that screen gamma is the display_exponent, which includes + * the CRT_exponent and any correction for viewing conditions */ if (/* We have a user-defined screen gamma value */) { screen_gamma = user-defined screen_gamma; diff --git a/libpng.3 b/libpng.3 index 48f3eb537..8fbfa9980 100644 --- a/libpng.3 +++ b/libpng.3 @@ -1,6 +1,6 @@ -.TH LIBPNG 3 "September 19, 1999" +.TH LIBPNG 3 "October 1, 1999" .SH NAME -libpng \- Portable Network Graphics (PNG) Reference Library 1.0.4 - September 19, 1999 +libpng \- Portable Network Graphics (PNG) Reference Library 1.0.4c - October 1, 1999 .SH SYNOPSIS \fI\fB @@ -617,7 +617,7 @@ Following is a copy of the libpng.txt file that accompanies libpng. .SH LIBPNG.TXT libpng.txt - A description on how to use and modify libpng - libpng version 1.0.4 - September 19, 1999 + libpng version 1.0.4c - October 1, 1999 Updated and distributed by Glenn Randers-Pehrson Copyright (c) 1998, 1999 Glenn Randers-Pehrson @@ -1314,17 +1314,15 @@ or as an RGB triplet that may or may not be in the palette (need_expand = 0). To properly display PNG images on any kind of system, the application needs to know what the display gamma is. Ideally, the user will know this, and the application will allow them to set it. 
One method of allowing the user -to set the display gamma separately for each system is to check for the -DISPLAY_GAMMA and VIEWING_GAMMA environment variables or for a SCREEN_GAMMA -environment variable, which will hopefully be correctly set. +to set the display gamma separately for each system is to check for a +SCREEN_GAMMA or DISPLAY_GAMMA environment variable, which will hopefully be +correctly set. -Note that display_gamma is the gamma of your display, while screen_gamma is -the overall gamma correction required to produce pleasing results, -which depends on the lighting conditions in the surrounding environment. -Screen_gamma is display_gamma/viewing_gamma, where viewing_gamma is -the amount of additional gamma correction needed to compensate for -a (viewing_gamma=1.25) environment. In a dim or brightly lit room, no -compensation other than the display_gamma is needed (viewing_gamma=1.0). +Note that display_gamma is the overall gamma correction required to produce +pleasing results, which depends on the lighting conditions in the surrounding +environment. In a dim or brightly lit room, no compensation other than +the physical gamma exponent of the monitor is needed, while in a dark room +a slightly smaller exponent is better. double gamma, screen_gamma; @@ -2677,13 +2675,13 @@ the old method. .SH VII. Y2K Compliance in libpng -January 13, 1999 +October 1, 1999 Since the PNG Development group is an ad-hoc body, we can't make an official declaration. This is your unofficial assurance that libpng from version 0.71 and -upward through 1.0.4 are Y2K compliant. It is my belief that earlier +upward through 1.0.4c are Y2K compliant. It is my belief that earlier versions were also Y2K compliant. Libpng only has three year fields. One is a 2-byte unsigned integer that @@ -2802,12 +2800,6 @@ and this library, the specification takes precedence. 
.SH AUTHORS This man page: Glenn Randers-Pehrson - -Contributing Authors: John Bowler, Kevin Bracey, Sam Bushell, Andreas Dilger, -Magnus Holmgren, Tom Lane, Dave Martindale, Glenn Randers-Pehrson, -Greg Roelofs, Guy Eric Schalnat, Paul Schmidt, Tom Tanner, Willem van -Schaik, Tim Wegner. - The contributing authors would like to thank all those who helped with testing, bug fixes, and patience. This wouldn't have been @@ -2815,7 +2807,7 @@ possible without all of you. Thanks to Frank J. T. Wojcik for helping with the documentation. -Libpng version 1.0.4 - September 19, 1999: +Libpng version 1.0.4c - October 1, 1999: Initially created in 1995 by Guy Eric Schalnat, then of Group 42, Inc. Currently maintained by Glenn Randers-Pehrson (randeg@alum.rpi.edu). @@ -2830,7 +2822,25 @@ Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc. Copyright (c) 1996, 1997 Andreas Dilger (libpng versions 0.90, December 1996, through 0.96, May 1997) Copyright (c) 1998, 1999 Glenn Randers-Pehrson -(libpng versions 0.97, January 1998, through 1.0.4, September 19, 1999) +(libpng versions 0.97, January 1998, through 1.0.4c, October 1, 1999) + +For the purposes of this copyright and license, "Contributing Authors" +is defined as the following set of individuals: + + John Bowler + Kevin Bracey + Sam Bushell + Andreas Dilger + Magnus Holmgren + Tom Lane + Dave Martindale + Glenn Randers-Pehrson + Greg Roelofs + Guy Eric Schalnat + Paul Schmidt + Tom Tanner + Willem van Schaik + Tim Wegner The PNG Reference Library (libpng) is supplied "AS IS". The Contributing Authors and Group 42, Inc. disclaim all warranties, expressed or implied, @@ -2869,5 +2879,8 @@ boxes and the like: Also, the PNG logo (in PNG format, of course) is supplied in the file "pngnow.png". +Libpng is OSI Certified Open Source Software. OSI Certified is a +certification mark of the Open Source Initiative. 
+ .\" end of man page diff --git a/libpng.txt b/libpng.txt index 7fd60af35..8023e79e0 100644 --- a/libpng.txt +++ b/libpng.txt @@ -1,6 +1,6 @@ libpng.txt - A description on how to use and modify libpng - libpng version 1.0.4 - September 19, 1999 + libpng version 1.0.4c - October 1, 1999 Updated and distributed by Glenn Randers-Pehrson Copyright (c) 1998, 1999 Glenn Randers-Pehrson @@ -697,17 +697,15 @@ or as an RGB triplet that may or may not be in the palette (need_expand = 0). To properly display PNG images on any kind of system, the application needs to know what the display gamma is. Ideally, the user will know this, and the application will allow them to set it. One method of allowing the user -to set the display gamma separately for each system is to check for the -DISPLAY_GAMMA and VIEWING_GAMMA environment variables or for a SCREEN_GAMMA -environment variable, which will hopefully be correctly set. +to set the display gamma separately for each system is to check for a +SCREEN_GAMMA or DISPLAY_GAMMA environment variable, which will hopefully be +correctly set. -Note that display_gamma is the gamma of your display, while screen_gamma is -the overall gamma correction required to produce pleasing results, -which depends on the lighting conditions in the surrounding environment. -Screen_gamma is display_gamma/viewing_gamma, where viewing_gamma is -the amount of additional gamma correction needed to compensate for -a (viewing_gamma=1.25) environment. In a dim or brightly lit room, no -compensation other than the display_gamma is needed (viewing_gamma=1.0). +Note that display_gamma is the overall gamma correction required to produce +pleasing results, which depends on the lighting conditions in the surrounding +environment. In a dim or brightly lit room, no compensation other than +the physical gamma exponent of the monitor is needed, while in a dark room +a slightly smaller exponent is better. double gamma, screen_gamma; @@ -2060,13 +2058,13 @@ the old method. 
VII. Y2K Compliance in libpng -January 13, 1999 +October 1, 1999 Since the PNG Development group is an ad-hoc body, we can't make an official declaration. This is your unofficial assurance that libpng from version 0.71 and -upward through 1.0.4 are Y2K compliant. It is my belief that earlier +upward through 1.0.4c are Y2K compliant. It is my belief that earlier versions were also Y2K compliant. Libpng only has three year fields. One is a 2-byte unsigned integer that diff --git a/libpngpf.3 b/libpngpf.3 index c808ae8ed..fb0803f9d 100644 --- a/libpngpf.3 +++ b/libpngpf.3 @@ -1,6 +1,6 @@ -.TH LIBPNGPF 3 September 19, 1999 +.TH LIBPNGPF 3 October 1, 1999 .SH NAME -libpng \- Portable Network Graphics (PNG) Reference Library 1.0.4 - September 19, 1999 +libpng \- Portable Network Graphics (PNG) Reference Library 1.0.4c - October 1, 1999 (private functions) .SH SYNOPSIS \fB#include \fP diff --git a/png.5 b/png.5 index 0bfb13015..081adb520 100644 --- a/png.5 +++ b/png.5 @@ -1,4 +1,4 @@ -.TH PNG 5 "September 19, 1999" +.TH PNG 5 "October 1, 1999" .SH NAME png \- Portable Network Graphics (PNG) format .SH DESCRIPTION diff --git a/png.c b/png.c index dfca0501f..1b0141b36 100644 --- a/png.c +++ b/png.c @@ -1,7 +1,7 @@ /* png.c - location for general purpose libpng functions * - * libpng version 1.0.4 - September 19, 1999 + * libpng version 1.0.4c - October 1, 1999 * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc. * Copyright (c) 1996, 1997 Andreas Dilger * Copyright (c) 1998, 1999 Glenn Randers-Pehrson @@ -16,7 +16,7 @@ * string defined in png.h. */ -char png_libpng_ver[12] = "1.0.4"; +char png_libpng_ver[12] = "1.0.4c"; /* Place to hold the signature string for a PNG file. 
*/ png_byte FARDATA png_sig[8] = {137, 80, 78, 71, 13, 10, 26, 10}; @@ -73,12 +73,12 @@ int FARDATA png_pass_mask[] = {0x80, 0x08, 0x88, 0x22, 0xaa, 0x55, 0xff}; /* Mask to determine which pixels to overwrite while displaying */ int FARDATA png_pass_dsp_mask[] = {0xff, 0x0f, 0xff, 0x33, 0xff, 0x55, 0xff}; - /* Tells libpng that we have already handled the first "num_bytes" bytes * of the PNG file signature. If the PNG data is embedded into another * stream we can set num_bytes = 8 so that libpng will not attempt to read * or write any of the magic bytes before it starts on the IHDR. */ + void png_set_sig_bytes(png_structp png_ptr, int num_bytes) { @@ -352,8 +352,17 @@ png_get_copyright(png_structp png_ptr) { if(png_ptr == NULL) /* silence compiler warning about unused png_ptr */ ; - return("\n libpng version 1.0.4 - September 19, 1999\n\ + return("\n libpng version 1.0.4c - October 1, 1999\n\ Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.\n\ Copyright (c) 1996, 1997 Andreas Dilger\n\ Copyright (c) 1998, 1999 Glenn Randers-Pehrson\n"); } + +/* Generate a compiler error if there is an old png.h in the search path. */ +void +png_check_version + (version_1_0_4c png_h_is_not_version_1_0_4c) +{ + if(png_h_is_not_version_1_0_4c == NULL) + /* silence compiler warning about unused parameter */ ; +} diff --git a/png.h b/png.h index d37070383..67a29e575 100644 --- a/png.h +++ b/png.h @@ -1,7 +1,7 @@ /* png.h - header file for PNG reference library * - * libpng version 1.0.4 - September 19, 1999 + * libpng version 1.0.4c - October 1, 1999 * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc. 
* Copyright (c) 1996, 1997 Andreas Dilger * Copyright (c) 1998, 1999 Glenn Randers-Pehrson @@ -9,19 +9,19 @@ * Authors and maintainers: * libpng versions 0.71, May 1995, through 0.89c, May 1996: Guy Schalnat * libpng versions 0.90, December 1996, through 0.96, May 1997: Andreas Dilger - * libpng versions 0.97, January 1998, through 1.0.4 - September 19, 1999: Glenn R-P + * libpng versions 0.97, January 1998, through 1.0.4c - October 1, 1999: Glenn * See also "Contributing Authors", below. * * Y2K compliance in libpng: * ========================= * - * January 13, 1999 + * October 1, 1999 * * Since the PNG Development group is an ad-hoc body, we can't make * an official declaration. * * This is your unofficial assurance that libpng from version 0.71 and - * upward through 1.0.4 are Y2K compliant. It is my belief that earlier + * upward through 1.0.4c are Y2K compliant. It is my belief that earlier * versions were also Y2K compliant. * * Libpng only has three year fields. One is a 2-byte unsigned integer @@ -86,8 +86,8 @@ * 0.98 0.98 98 2.0.98 * 0.99 0.99 98 2.0.99 * 0.99a-m 0.99 99 2.0.99 - * 1.00 1.00 100 2.1.0 [int should be 10000] - * 1.0.0 1.0.0 100 2.1.0 [int should be 10000] + * 1.00 1.00 100 2.1.0 [100 should be 10000] + * 1.0.0 1.0.0 100 2.1.0 [100 should be 10000] * 1.0.1 1.0.1 10001 2.1.0 * 1.0.1a-e 1.0.1a-e 10002 2.1.0.1a-e * 1.0.2 1.0.2 10002 2.1.0.2 @@ -95,6 +95,8 @@ * 1.0.3 1.0.3 10003 2.1.0.3 * 1.0.3a-d 1.0.3a-d 10004 2.1.0.3a-d * 1.0.4 1.0.4 10004 2.1.0.4 + * 1.0.4a-c 1.0.4a-c 10005 2.1.0.4a-c + * 1.0.5 1.0.5 10005 2.1.0.5 * * Henceforth the source version will match the shared-library minor * and patch numbers; the shared-library major version number will be @@ -108,7 +110,18 @@ * is available as RFC 2083 * and as a W3C Recommendation * - * Contributing Authors: + * COPYRIGHT NOTICE: + * + * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc. 
+ * (libpng versions 0.5, May 1995, through 0.89c, May 1996) + * Copyright (c) 1996, 1997 Andreas Dilger + * (libpng versions 0.90, December 1996, through 0.96, May 1997) + * Copyright (c) 1998, 1999 Glenn Randers-Pehrson + * (libpng versions 0.97, January 1998, through 1.0.4c, October 1, 1999) + * + * For the purposes of this copyright and license, "Contributing Authors" + * is defined as the following set of individuals: + * * John Bowler * Kevin Bracey * Sam Bushell @@ -124,21 +137,6 @@ * Willem van Schaik * Tim Wegner * - * The contributing authors would like to thank all those who helped - * with testing, bug fixes, and patience. This wouldn't have been - * possible without all of you. - * - * Thanks to Frank J. T. Wojcik for helping with the documentation. - * - * COPYRIGHT NOTICE: - * - * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc. - * (libpng versions 0.5, May 1995, through 0.89c, May 1996) - * Copyright (c) 1996, 1997 Andreas Dilger - * (libpng versions 0.90, December 1996, through 0.96, May 1997) - * Copyright (c) 1998, 1999 Glenn Randers-Pehrson - * (libpng versions 0.97, January 1998, through 1.0.4, September 19, 1999) - * * The PNG Reference Library is supplied "AS IS". The Contributing Authors * and Group 42, Inc. disclaim all warranties, expressed or implied, * including, without limitation, the warranties of merchantability and of @@ -169,6 +167,20 @@ * appreciated. */ +/* + * + * Libpng is OSI Certified Open Source Software. OSI Certified is a + * certification mark of the Open Source Initiative. + */ + +/* + * The contributing authors would like to thank all those who helped + * with testing, bug fixes, and patience. This wouldn't have been + * possible without all of you. + * + * Thanks to Frank J. T. Wojcik for helping with the documentation. 
+ */ + #ifndef _PNG_H #define _PNG_H @@ -196,14 +208,16 @@ extern "C" { */ /* Version information for png.h - this should match the version in png.c */ -#define PNG_LIBPNG_VER_STRING "1.0.4" +#define PNG_LIBPNG_VER_STRING "1.0.4c" /* Careful here. At one time, Guy wanted to use 082, but that would be octal. * We must not include leading zeros. * Versions 0.7 through 1.0.0 were in the range 0 to 100 here (only * version 1.0.0 was mis-numbered 100 instead of 10000). From * version 1.0.1 it's xxyyzz, where x=major, y=minor, z=bugfix */ -#define PNG_LIBPNG_VER 10004 /* 1.0.4 */ +#define PNG_LIBPNG_VER 10005 /* 1.0.5 */ + +/* Note to maintainer: update this number in scripts/pngdef.pas as well */ /* variables declared in png.c - only it needs to define PNG_NO_EXTERN */ #if !defined(PNG_NO_EXTERN) || defined(PNG_ALWAYS_EXTERN) @@ -701,7 +715,7 @@ struct png_struct_def #if defined(PNG_READ_GAMMA_SUPPORTED) || defined(PNG_READ_BACKGROUND_SUPPORTED) int gamma_shift; /* number of "insignificant" bits 16-bit gamma */ float gamma; /* file gamma value */ - float screen_gamma; /* screen gamma value (display_gamma/viewing_gamma */ + float screen_gamma; /* screen gamma value (display_exponent) */ #endif /* PNG_READ_GAMMA_SUPPORTED */ #if defined(PNG_READ_GAMMA_SUPPORTED) || defined(PNG_READ_BACKGROUND_SUPPORTED) png_bytep gamma_table; /* gamma table for 8-bit depth files */ @@ -791,6 +805,11 @@ struct png_struct_def #endif }; +/* This prevents a compiler error in png_check_version() in png.c if png.c +and png.h are both at version 1.0.4c + */ +typedef png_structp version_1_0_4c; + typedef png_struct FAR * FAR * png_structpp; /* Here are the function definitions most commonly used. This is not @@ -993,7 +1012,7 @@ extern PNG_EXPORT(void,png_set_dither) PNGARG((png_structp png_ptr, #endif /* PNG_READ_DITHER_SUPPORTED */ #if defined(PNG_READ_GAMMA_SUPPORTED) -/* Handle gamma correction. Screen_gamma=(display_gamma/viewing_gamma) */ +/* Handle gamma correction. 
Screen_gamma=(display_exponent) */ extern PNG_EXPORT(void,png_set_gamma) PNGARG((png_structp png_ptr, double screen_gamma, double default_file_gamma)); #endif /* PNG_READ_GAMMA_SUPPORTED */ @@ -1610,7 +1629,7 @@ png_get_header_version(png_structp png_ptr) { if(png_ptr == NULL) /* silence compiler warning about unused png_ptr */ ; - return("\n libpng version 1.0.4 - September 19, 1999 (header)\n"); + return("\n libpng version 1.0.4c - October 1, 1999 (header)\n"); } #endif diff --git a/pngasmrd.h b/pngasmrd.h index ae9853cc1..e6c9c02e0 100644 --- a/pngasmrd.h +++ b/pngasmrd.h @@ -1,6 +1,6 @@ /* pngasmrd.h - assembler version of utilities to read a PNG file * - * libpng 1.0.4 - September 19, 1999 + * libpng 1.0.4c - October 1, 1999 * For conditions of distribution and use, see copyright notice in png.h * Copyright (c) 1999 Glenn Randers-Pehrson * @@ -21,7 +21,7 @@ /* Set this in the makefile for gcc on Pentium, not in pngconf.h */ #ifdef PNG_USE_PNGGCCRD /* Platform must be Pentium. Makefile must assemble and load pnggccrd.c - * (not available in libpng 1.0.4). + * (not available in libpng 1.0.4c). * MMX will be detected at run time and used if present. */ #define PNG_HAVE_ASSEMBLER_COMBINE_ROW diff --git a/pngconf.h b/pngconf.h index 3f546bc50..41316d9e9 100644 --- a/pngconf.h +++ b/pngconf.h @@ -1,7 +1,7 @@ /* pngconf.h - machine configurable file for libpng * - * libpng 1.0.4 - September 19, 1999 + * libpng 1.0.4c - October 1, 1999 * For conditions of distribution and use, see copyright notice in png.h * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc. 
* Copyright (c) 1996, 1997 Andreas Dilger diff --git a/pngerror.c b/pngerror.c index 2d2cede02..63a2b4814 100644 --- a/pngerror.c +++ b/pngerror.c @@ -1,7 +1,7 @@ /* pngerror.c - stub functions for i/o and memory allocation * - * libpng 1.0.4 - September 19, 1999 + * libpng 1.0.4c - October 1, 1999 * For conditions of distribution and use, see copyright notice in png.h * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc. * Copyright (c) 1996, 1997 Andreas Dilger diff --git a/pngget.c b/pngget.c index 248db8e36..c27d9b9c1 100644 --- a/pngget.c +++ b/pngget.c @@ -1,7 +1,7 @@ /* pngget.c - retrieval of values from info struct * - * libpng 1.0.4 - September 19, 1999 + * libpng 1.0.4c - October 1, 1999 * For conditions of distribution and use, see copyright notice in png.h * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc. * Copyright (c) 1996, 1997 Andreas Dilger diff --git a/pngmem.c b/pngmem.c index 06eb09098..cbaa27e19 100644 --- a/pngmem.c +++ b/pngmem.c @@ -1,7 +1,7 @@ /* pngmem.c - stub functions for memory allocation * - * libpng 1.0.4 - September 19, 1999 + * libpng 1.0.4c - October 1, 1999 * For conditions of distribution and use, see copyright notice in png.h * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc. * Copyright (c) 1996, 1997 Andreas Dilger diff --git a/pngnow.png b/pngnow.png new file mode 100644 index 000000000..16280e7d8 Binary files /dev/null and b/pngnow.png differ diff --git a/pngpread.c b/pngpread.c index 9d2fe4888..d703b8120 100644 --- a/pngpread.c +++ b/pngpread.c @@ -1,7 +1,7 @@ /* pngpread.c - read a png file in push mode * - * libpng 1.0.4 - September 19, 1999 + * libpng 1.0.4c - October 1, 1999 * For conditions of distribution and use, see copyright notice in png.h * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc. 
* Copyright (c) 1996, 1997 Andreas Dilger diff --git a/pngread.c b/pngread.c index fc700f487..1517535a2 100644 --- a/pngread.c +++ b/pngread.c @@ -1,7 +1,7 @@ /* pngread.c - read a PNG file * - * libpng 1.0.4 - September 19, 1999 + * libpng 1.0.4c - October 1, 1999 * For conditions of distribution and use, see copyright notice in png.h * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc. * Copyright (c) 1996, 1997 Andreas Dilger @@ -531,7 +531,7 @@ png_read_row(png_structp png_ptr, png_bytep row, png_bytep dsp_row) * not called png_set_interlace_handling(), the display_row buffer will * be ignored, so pass NULL to it. * - * [*] png_handle_alpha() does not exist yet, as of libpng version 1.0.4. + * [*] png_handle_alpha() does not exist yet, as of libpng version 1.0.4c. */ void @@ -580,7 +580,7 @@ png_read_rows(png_structp png_ptr, png_bytepp row, * only call this function once. If you desire to have an image for * each pass of a interlaced image, use png_read_rows() instead. * - * [*] png_handle_alpha() does not exist yet, as of libpng version 1.0.4. + * [*] png_handle_alpha() does not exist yet, as of libpng version 1.0.4c. */ void png_read_image(png_structp png_ptr, png_bytepp image) diff --git a/pngrio.c b/pngrio.c index 8d4390c2d..4cc33a7fd 100644 --- a/pngrio.c +++ b/pngrio.c @@ -1,7 +1,7 @@ /* pngrio.c - functions for data input * - * libpng 1.0.4 - September 19, 1999 + * libpng 1.0.4c - October 1, 1999 * For conditions of distribution and use, see copyright notice in png.h * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc. 
* Copyright (c) 1996, 1997 Andreas Dilger diff --git a/pngrtran.c b/pngrtran.c index f8b8e80de..9c2b0edfd 100644 --- a/pngrtran.c +++ b/pngrtran.c @@ -1,7 +1,7 @@ /* pngrtran.c - transforms the data in a row for PNG readers * - * libpng 1.0.4 - September 19, 1999 + * libpng 1.0.4c - October 1, 1999 * For conditions of distribution and use, see copyright notice in png.h * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc. * Copyright (c) 1996, 1997 Andreas Dilger @@ -1069,6 +1069,7 @@ png_read_transform_info(png_structp png_ptr, png_infop info_ptr) info_ptr->pixel_depth = (png_byte)(info_ptr->channels * info_ptr->bit_depth); info_ptr->rowbytes = ((info_ptr->width * info_ptr->pixel_depth + 7) >> 3); + } /* Transform the row. The order of transformations is significant, diff --git a/pngrutil.c b/pngrutil.c index c49ac6190..bbf0838da 100644 --- a/pngrutil.c +++ b/pngrutil.c @@ -1,7 +1,7 @@ /* pngrutil.c - utilities to read a PNG file * - * libpng 1.0.4 - September 19, 1999 + * libpng 1.0.4c - October 1, 1999 * For conditions of distribution and use, see copyright notice in png.h * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc. 
* Copyright (c) 1996, 1997 Andreas Dilger @@ -945,7 +945,7 @@ png_handle_hIST(png_structp png_ptr, png_infop info_ptr, png_uint_32 length) return; } - num = (int)length / 2; + num = (int)length / 2 ; png_ptr->hist = (png_uint_16p)png_malloc(png_ptr, (png_uint_32)(num * sizeof (png_uint_16))); png_ptr->flags |= PNG_FLAG_FREE_HIST; @@ -1892,6 +1892,7 @@ png_do_read_interlace png_size_t pixel_bytes = (row_info->pixel_depth >> 3); png_bytep sp = row + (png_size_t)(row_info->width - 1) * pixel_bytes; png_bytep dp = row + (png_size_t)(final_width - 1) * pixel_bytes; + int jstop = png_pass_inc[pass]; png_uint_32 i; @@ -1937,7 +1938,7 @@ png_read_filter_row { png_uint_32 i; png_uint_32 istop = row_info->rowbytes; - png_uint_32 bpp = (row_info->pixel_depth + 7) / 8; + png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3; png_bytep rp = row + bpp; png_bytep lp = row; @@ -1968,20 +1969,20 @@ png_read_filter_row png_bytep rp = row; png_bytep pp = prev_row; png_bytep lp = row; - png_uint_32 bpp = (row_info->pixel_depth + 7) / 8; + png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3; png_uint_32 istop = row_info->rowbytes - bpp; for (i = 0; i < bpp; i++) { *rp = (png_byte)(((int)(*rp) + - ((int)(*pp++) / 2)) & 0xff); + ((int)(*pp++) / 2 )) & 0xff); rp++; } for (i = 0; i < istop; i++) { *rp = (png_byte)(((int)(*rp) + - (int)(*pp++ + *lp++) / 2) & 0xff); + (int)(*pp++ + *lp++) / 2 ) & 0xff); rp++; } break; @@ -1993,7 +1994,7 @@ png_read_filter_row png_bytep pp = prev_row; png_bytep lp = row; png_bytep cp = prev_row; - png_uint_32 bpp = (row_info->pixel_depth + 7) / 8; + png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3; png_uint_32 istop=row_info->rowbytes - bpp; for (i = 0; i < bpp; i++) @@ -2267,6 +2268,16 @@ png_read_start_row(png_structp png_ptr) } #endif +#if defined(PNG_READ_USER_TRANSFORM_SUPPORTED) + if(png_ptr->transformations & PNG_USER_TRANSFORM) + { + int user_pixel_depth=png_ptr->user_transform_depth* + png_ptr->user_transform_channels; + if(user_pixel_depth > 
max_pixel_depth) + max_pixel_depth=user_pixel_depth; + } +#endif + /* align the width on the next larger 8 pixels. Mainly used for interlacing */ row_bytes = ((png_ptr->width + 7) & ~((png_uint_32)7)); diff --git a/pngset.c b/pngset.c index e0f9e0ac3..bfec9072b 100644 --- a/pngset.c +++ b/pngset.c @@ -1,7 +1,7 @@ /* pngset.c - storage of image information into info struct * - * libpng 1.0.4 - September 19, 1999 + * libpng 1.0.4c - October 1, 1999 * For conditions of distribution and use, see copyright notice in png.h * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc. * Copyright (c) 1996, 1997 Andreas Dilger diff --git a/pngtest.c b/pngtest.c index afd5c376d..51d289c15 100644 --- a/pngtest.c +++ b/pngtest.c @@ -1,7 +1,7 @@ /* pngtest.c - a simple test program to test libpng * - * libpng 1.0.4 - September 19, 1999 + * libpng 1.0.4c - October 1, 1999 * For conditions of distribution and use, see copyright notice in png.h * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc. 
* Copyright (c) 1996, 1997 Andreas Dilger @@ -35,8 +35,24 @@ #define PNG_DEBUG 0 #endif +/* Turn on CPU timing +#define PNGTEST_TIMING +*/ + +#ifdef PNGTEST_TIMING +static float t_start, t_stop, t_decode, t_encode, t_misc; +#include <time.h> +#endif + #include "png.h" +#ifdef PNGTEST_TIMING +static float t_start, t_stop, t_decode, t_encode, t_misc; +#if !defined(PNG_READ_tIME_SUPPORTED) && !defined(PNG_WRITE_tIME_SUPPORTED) +#include <time.h> +#endif +#endif + #if defined(PNG_TIME_RFC1123_SUPPORTED) static int tIME_chunk_present=0; static char tIME_string[30] = "no tIME chunk present in file"; @@ -800,16 +816,36 @@ test_one_file(PNG_CONST char *inname, PNG_CONST char *outname) } png_debug(0, "Writing row data\n"); +#if defined(PNG_READ_INTERLACING_SUPPORTED) || \ + defined(PNG_WRITE_INTERLACING_SUPPORTED) num_pass = png_set_interlace_handling(read_ptr); png_set_interlace_handling(write_ptr); +#else + num_pass=1; +#endif +#ifdef PNGTEST_TIMING + t_stop = (float)clock(); + t_misc += (t_stop - t_start); + t_start = t_stop; +#endif for (pass = 0; pass < num_pass; pass++) { png_debug1(0, "Writing row data for pass %d\n",pass); for (y = 0; y < height; y++) { png_read_rows(read_ptr, (png_bytepp)&row_buf, (png_bytepp)NULL, 1); +#ifdef PNGTEST_TIMING + t_stop = (float)clock(); + t_decode += (t_stop - t_start); + t_start = t_stop; +#endif png_write_rows(write_ptr, (png_bytepp)&row_buf, 1); +#ifdef PNGTEST_TIMING + t_stop = (float)clock(); + t_encode += (t_stop - t_start); + t_start = t_stop; +#endif } } @@ -1040,7 +1076,9 @@ main(int argc, char *argv[]) #endif } #ifdef PNG_USER_MEM_SUPPORTED - fprintf(STDERR, "Maximum memory allocation: %d bytes\n", + fprintf(STDERR, " Current memory allocation: %d bytes\n", + current_allocation); + fprintf(STDERR, " Maximum memory allocation: %d bytes\n", maximum_allocation); #endif } @@ -1103,11 +1141,27 @@ main(int argc, char *argv[]) #endif } #ifdef PNG_USER_MEM_SUPPORTED - fprintf(STDERR, "Maximum memory allocation: %d bytes\n", + fprintf(STDERR, " 
Current memory allocation: %d bytes\n", + current_allocation); + fprintf(STDERR, " Maximum memory allocation: %d bytes\n", maximum_allocation); #endif } +#ifdef PNGTEST_TIMING + t_stop = (float)clock(); + t_misc += (t_stop - t_start); + t_start = t_stop; + fprintf(STDERR," CPU time used = %.3f seconds", + (t_misc+t_decode+t_encode)/(float)CLOCKS_PER_SEC); + fprintf(STDERR," (decoding %.3f,\n", + t_decode/(float)CLOCKS_PER_SEC); + fprintf(STDERR," encoding %.3f ,", + t_encode/(float)CLOCKS_PER_SEC); + fprintf(STDERR," other %.3f seconds)\n\n", + t_misc/(float)CLOCKS_PER_SEC); +#endif + if (ierror == 0) fprintf(STDERR, "libpng passes test\n"); else @@ -1115,3 +1169,10 @@ main(int argc, char *argv[]) return (int)(ierror != 0); } +/* Generate a compiler error if there is an old png.h in the search path. */ +void +png_check_pngtest_version + (version_1_0_4c png_h_is_not_version_1_0_4c) +{ + if(png_h_is_not_version_1_0_4c == NULL) return; +} diff --git a/pngtrans.c b/pngtrans.c index bf1401858..57a1f9426 100644 --- a/pngtrans.c +++ b/pngtrans.c @@ -1,7 +1,7 @@ /* pngtrans.c - transforms the data in a row (used by both readers and writers) * - * libpng 1.0.4 - September 19, 1999 + * libpng 1.0.4c - October 1, 1999 * For conditions of distribution and use, see copyright notice in png.h * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc. 
* Copyright (c) 1996, 1997 Andreas Dilger diff --git a/pngvcrd.c b/pngvcrd.c index 8f429d92a..4ab0a913c 100644 --- a/pngvcrd.c +++ b/pngvcrd.c @@ -1,13 +1,13 @@ -/* pngvcrd.c - assembler version of utilities to read a PNG file +/* pngvcrd.c - mixed C/assembler version of utilities to read a PNG file * - * For Intel CPU and Microsoft Visual C++ compiler + * For Intel x86 CPU and Microsoft Visual C++ compiler * - * libpng 1.0.4 - September 19, 1999 + * libpng 1.0.4c - October 1, 1999 * For conditions of distribution and use, see copyright notice in png.h * Copyright (c) 1998, Intel Corporation * Copyright (c) 1998, 1999 Glenn Randers-Pehrson * - * Contributed by Nirav Chhatrapati, INTEL Corporation, 1998 + * Contributed by Nirav Chhatrapati, Intel Corporation, 1998 * Interface to libpng contributed by Gilles Vollant, 1999 * */ @@ -15,7 +15,7 @@ #define PNG_INTERNAL #include "png.h" -#ifdef PNG_ASSEMBLER_CODE_SUPPORTED +#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD) static int mmx_supported=2; @@ -68,8 +68,8 @@ NOT_SUPPORTED: } -//mmx_supported_local=0; // test code for force don't support MMX - //printf("MMX : %u (1=MMX supported)\n",mmx_supported_local); + //mmx_supported_local=0; // test code for force don't support MMX + //printf("MMX : %u (1=MMX supported)\n",mmx_supported_local); return mmx_supported_local; } @@ -85,858 +85,857 @@ NOT_SUPPORTED: to any alpha or transparency value associated with the pixel. If you want all pixels to be combined, pass 0xff (255) in mask. 
*/ -/* Use this routine for X86 platform - uses faster MMX routine if machine -supports MMX */ +/* Use this routine for x86 platform - uses faster MMX routine if machine + supports MMX */ void -png_combine_row(png_structp png_ptr, png_bytep row, - int mask) +png_combine_row(png_structp png_ptr, png_bytep row, int mask) { - //int mmx_supported=0; // another test code for remove MMX in this routine + int save_mmx_supported = mmx_supported; png_debug(1,"in png_combine_row_asm\n"); - //if (mmx_supported==2) - // mmx_supported=mmxsupport(); + + if ((png_ptr->transformations & PNG_INTERLACE) && png_ptr->pass != 6) + mmx_supported = 0; + else + if (mmx_supported == 2) + mmx_supported = mmxsupport(); if (mask == 0xff) { png_memcpy(row, png_ptr->row_buf + 1, - (png_size_t)((png_ptr->width * - png_ptr->row_info.pixel_depth + 7) >> 3)); + (png_size_t)((png_ptr->width * png_ptr->row_info.pixel_depth + 7) >> 3)); } else { - switch (png_ptr->row_info.pixel_depth) + switch (png_ptr->row_info.pixel_depth) { - case 1: - { - png_bytep sp; - png_bytep dp; - int s_inc, s_start, s_end; - int m; - int shift; - png_uint_32 i; + case 1: + { + png_bytep sp; + png_bytep dp; + int s_inc, s_start, s_end; + int m; + int shift; + png_uint_32 i; - sp = png_ptr->row_buf + 1; - dp = row; - m = 0x80; + sp = png_ptr->row_buf + 1; + dp = row; + m = 0x80; #if defined(PNG_READ_PACKSWAP_SUPPORTED) - if (png_ptr->transformations & PNG_PACKSWAP) - { - s_start = 0; - s_end = 7; - s_inc = 1; - } - else + if (png_ptr->transformations & PNG_PACKSWAP) + { + s_start = 0; + s_end = 7; + s_inc = 1; + } + else #endif - { - s_start = 7; - s_end = 0; - s_inc = -1; - } + { + s_start = 7; + s_end = 0; + s_inc = -1; + } - shift = s_start; + shift = s_start; - for (i = 0; i < png_ptr->width; i++) - { - if (m & mask) - { - int value; + for (i = 0; i < png_ptr->width; i++) + { + if (m & mask) + { + int value; - value = (*sp >> shift) & 0x1; - *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff); - *dp |= (png_byte)(value << 
shift); + value = (*sp >> shift) & 0x1; + *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff); + *dp |= (png_byte)(value << shift); + } + + if (shift == s_end) + { + shift = s_start; + sp++; + dp++; + } + else + shift += s_inc; + + if (m == 1) + m = 0x80; + else + m >>= 1; + } + break; } - if (shift == s_end) + case 2: { - shift = s_start; - sp++; - dp++; - } - else - shift += s_inc; + png_bytep sp; + png_bytep dp; + int s_start, s_end, s_inc; + int m; + int shift; + png_uint_32 i; + int value; - if (m == 1) - m = 0x80; - else - m >>= 1; - } - break; - } - case 2: - { - png_bytep sp; - png_bytep dp; - int s_start, s_end, s_inc; - int m; - int shift; - png_uint_32 i; - int value; - - sp = png_ptr->row_buf + 1; - dp = row; - m = 0x80; + sp = png_ptr->row_buf + 1; + dp = row; + m = 0x80; #if defined(PNG_READ_PACKSWAP_SUPPORTED) - if (png_ptr->transformations & PNG_PACKSWAP) - { - s_start = 0; - s_end = 6; - s_inc = 2; - } - else + if (png_ptr->transformations & PNG_PACKSWAP) + { + s_start = 0; + s_end = 6; + s_inc = 2; + } + else #endif - { - s_start = 6; - s_end = 0; - s_inc = -2; - } + { + s_start = 6; + s_end = 0; + s_inc = -2; + } - shift = s_start; + shift = s_start; - for (i = 0; i < png_ptr->width; i++) - { - if (m & mask) - { - value = (*sp >> shift) & 0x3; - *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff); - *dp |= (png_byte)(value << shift); + for (i = 0; i < png_ptr->width; i++) + { + if (m & mask) + { + value = (*sp >> shift) & 0x3; + *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff); + *dp |= (png_byte)(value << shift); + } + + if (shift == s_end) + { + shift = s_start; + sp++; + dp++; + } + else + shift += s_inc; + if (m == 1) + m = 0x80; + else + m >>= 1; + } + break; } - - if (shift == s_end) + case 4: { - shift = s_start; - sp++; - dp++; - } - else - shift += s_inc; - if (m == 1) - m = 0x80; - else - m >>= 1; - } - break; - } - case 4: - { - png_bytep sp; - png_bytep dp; - int s_start, s_end, s_inc; - int m; - int shift; - png_uint_32 i; - int value; 
+ png_bytep sp; + png_bytep dp; + int s_start, s_end, s_inc; + int m; + int shift; + png_uint_32 i; + int value; - sp = png_ptr->row_buf + 1; - dp = row; - m = 0x80; + sp = png_ptr->row_buf + 1; + dp = row; + m = 0x80; #if defined(PNG_READ_PACKSWAP_SUPPORTED) - if (png_ptr->transformations & PNG_PACKSWAP) - { - s_start = 0; - s_end = 4; - s_inc = 4; - } - else + if (png_ptr->transformations & PNG_PACKSWAP) + { + s_start = 0; + s_end = 4; + s_inc = 4; + } + else #endif - { - s_start = 4; - s_end = 0; - s_inc = -4; - } - shift = s_start; + { + s_start = 4; + s_end = 0; + s_inc = -4; + } + shift = s_start; - for (i = 0; i < png_ptr->width; i++) - { - if (m & mask) - { - value = (*sp >> shift) & 0xf; - *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff); - *dp |= (png_byte)(value << shift); + for (i = 0; i < png_ptr->width; i++) + { + if (m & mask) + { + value = (*sp >> shift) & 0xf; + *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff); + *dp |= (png_byte)(value << shift); + } + + if (shift == s_end) + { + shift = s_start; + sp++; + dp++; + } + else + shift += s_inc; + if (m == 1) + m = 0x80; + else + m >>= 1; + } + break; } - - if (shift == s_end) + case 8: { - shift = s_start; - sp++; - dp++; - } - else - shift += s_inc; - if (m == 1) - m = 0x80; - else - m >>= 1; - } - break; - } - case 8: - { - png_bytep srcptr; - png_bytep dstptr; - png_uint_32 len; - int m; - int diff, unmask; + png_bytep srcptr; + png_bytep dstptr; + png_uint_32 len; + int m; + int diff, unmask; - __int64 mask0=0x0102040810204080; + __int64 mask0=0x0102040810204080; - if (mmx_supported) - { - srcptr = png_ptr->row_buf + 1; - dstptr = row; - m = 0x80; - unmask = ~mask; - len = png_ptr->width &~7; //reduce to multiple of 8 - diff = png_ptr->width & 7; //amount lost - _asm { - movd mm7, unmask //load bit pattern - psubb mm6,mm6 //zero mm6 - punpcklbw mm7,mm7 - punpcklwd mm7,mm7 - punpckldq mm7,mm7 //fill register with 8 masks + if (mmx_supported) + { + srcptr = png_ptr->row_buf + 1; + dstptr = row; + m 
= 0x80; + unmask = ~mask; + len = png_ptr->width &~7; //reduce to multiple of 8 + diff = png_ptr->width & 7; //amount lost - movq mm0,mask0 + _asm + { + movd mm7, unmask //load bit pattern + psubb mm6,mm6 //zero mm6 + punpcklbw mm7,mm7 + punpcklwd mm7,mm7 + punpckldq mm7,mm7 //fill register with 8 masks - pand mm0,mm7 //nonzero if keep byte - pcmpeqb mm0,mm6 //zeros->1s, v versa + movq mm0,mask0 - mov ecx,len //load length of line - mov esi,srcptr //load source - mov ebx,dstptr //load dest - cmp ecx,0 //lcr - je mainloop8end + pand mm0,mm7 //nonzero if keep byte + pcmpeqb mm0,mm6 //zeros->1s, v versa + + mov ecx,len //load length of line + mov esi,srcptr //load source + mov ebx,dstptr //load dest + cmp ecx,0 //lcr + je mainloop8end mainloop8: - movq mm4,[esi] - pand mm4,mm0 - movq mm6,mm0 - pandn mm6,[ebx] - por mm4,mm6 - movq [ebx],mm4 + movq mm4,[esi] + pand mm4,mm0 + movq mm6,mm0 + pandn mm6,[ebx] + por mm4,mm6 + movq [ebx],mm4 - add esi,8 //inc by 8 bytes processed - add ebx,8 - sub ecx,8 //dec by 8 pixels processed + add esi,8 //inc by 8 bytes processed + add ebx,8 + sub ecx,8 //dec by 8 pixels processed - ja mainloop8 + ja mainloop8 mainloop8end: - mov ecx,diff - cmp ecx,0 - jz end8 + mov ecx,diff + cmp ecx,0 + jz end8 - mov edx,mask - sal edx,24 //make low byte the high byte + mov edx,mask + sal edx,24 //make low byte the high byte secondloop8: - sal edx,1 //move high bit to CF - jnc skip8 //if CF = 0 - mov al,[esi] - mov [ebx],al + sal edx,1 //move high bit to CF + jnc skip8 //if CF = 0 + mov al,[esi] + mov [ebx],al skip8: - inc esi - inc ebx + inc esi + inc ebx - dec ecx - jnz secondloop8 + dec ecx + jnz secondloop8 end8: - emms - } - } - else /* mmx _not supported - Use modified C routine*/ - { - register unsigned int incr1, initial_val, final_val; - png_size_t pixel_bytes; - png_uint_32 i; - //if ((mask != 0x0f) && (mask != 0x33)) - register int disp = png_pass_inc[png_ptr->pass]; - int offset_table[7] = {0, 4, 0, 2, 0, 1, 0}; - pixel_bytes = 
(png_ptr->row_info.pixel_depth >> 3); - srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]* - pixel_bytes; - dstptr = row + offset_table[png_ptr->pass]*pixel_bytes; - initial_val = offset_table[png_ptr->pass]*pixel_bytes; - final_val = png_ptr->width*pixel_bytes; - incr1 = (disp)*pixel_bytes; - for (i = initial_val; i < final_val; i += incr1) - { - png_memcpy(dstptr, srcptr, pixel_bytes); - srcptr += incr1; - dstptr += incr1; - } - } /* end of else */ + emms + } + } + else /* mmx not supported - use modified C routine */ + { + register unsigned int incr1, initial_val, final_val; + png_size_t pixel_bytes; + png_uint_32 i; + //if ((mask != 0x0f) && (mask != 0x33)) + register int disp = png_pass_inc[png_ptr->pass]; + int offset_table[7] = {0, 4, 0, 2, 0, 1, 0}; + pixel_bytes = (png_ptr->row_info.pixel_depth >> 3); + srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]* + pixel_bytes; + dstptr = row + offset_table[png_ptr->pass]*pixel_bytes; + initial_val = offset_table[png_ptr->pass]*pixel_bytes; + final_val = png_ptr->width*pixel_bytes; + incr1 = (disp)*pixel_bytes; + for (i = initial_val; i < final_val; i += incr1) + { + png_memcpy(dstptr, srcptr, pixel_bytes); + srcptr += incr1; + dstptr += incr1; + } + } /* end of else */ - break; - } //end 8bpp + break; + } //end 8bpp - case 16: - { - png_bytep srcptr; - png_bytep dstptr; - png_uint_32 len; - int unmask, diff; + case 16: + { + png_bytep srcptr; + png_bytep dstptr; + png_uint_32 len; + int unmask, diff; + __int64 mask1=0x0101020204040808, + mask0=0x1010202040408080; - __int64 mask1=0x0101020204040808, - mask0=0x1010202040408080; + if (mmx_supported) + { + srcptr = png_ptr->row_buf + 1; + dstptr = row; - if (mmx_supported) - { - srcptr = png_ptr->row_buf + 1; - dstptr = row; + unmask = ~mask; + len = (png_ptr->width)&~7; + diff = (png_ptr->width)&7; + _asm + { + movd mm7, unmask //load bit pattern + psubb mm6,mm6 //zero mm6 + punpcklbw mm7,mm7 + punpcklwd mm7,mm7 + punpckldq mm7,mm7 //fill register 
with 8 masks - unmask = ~mask; - len = (png_ptr->width)&~7; - diff = (png_ptr->width)&7; - _asm { - movd mm7, unmask //load bit pattern - psubb mm6,mm6 //zero mm6 - punpcklbw mm7,mm7 - punpcklwd mm7,mm7 - punpckldq mm7,mm7 //fill register with 8 masks + movq mm0,mask0 + movq mm1,mask1 - movq mm0,mask0 - movq mm1,mask1 + pand mm0,mm7 + pand mm1,mm7 - pand mm0,mm7 - pand mm1,mm7 + pcmpeqb mm0,mm6 + pcmpeqb mm1,mm6 - pcmpeqb mm0,mm6 - pcmpeqb mm1,mm6 - - mov ecx,len //load length of line - mov esi,srcptr //load source - mov ebx,dstptr //load dest - cmp ecx,0 //lcr - jz mainloop16end + mov ecx,len //load length of line + mov esi,srcptr //load source + mov ebx,dstptr //load dest + cmp ecx,0 //lcr + jz mainloop16end mainloop16: - movq mm4,[esi] - pand mm4,mm0 - movq mm6,mm0 - movq mm7,[ebx] - pandn mm6,mm7 - por mm4,mm6 - movq [ebx],mm4 + movq mm4,[esi] + pand mm4,mm0 + movq mm6,mm0 + movq mm7,[ebx] + pandn mm6,mm7 + por mm4,mm6 + movq [ebx],mm4 - movq mm5,[esi+8] - pand mm5,mm1 - movq mm7,mm1 - movq mm6,[ebx+8] - pandn mm7,mm6 - por mm5,mm7 - movq [ebx+8],mm5 + movq mm5,[esi+8] + pand mm5,mm1 + movq mm7,mm1 + movq mm6,[ebx+8] + pandn mm7,mm6 + por mm5,mm7 + movq [ebx+8],mm5 - add esi,16 //inc by 16 bytes processed - add ebx,16 - sub ecx,8 //dec by 8 pixels processed + add esi,16 //inc by 16 bytes processed + add ebx,16 + sub ecx,8 //dec by 8 pixels processed + + ja mainloop16 - ja mainloop16 mainloop16end: + mov ecx,diff + cmp ecx,0 + jz end16 - mov ecx,diff - cmp ecx,0 - jz end16 - - mov edx,mask - sal edx,24 //make low byte the high byte - + mov edx,mask + sal edx,24 //make low byte the high byte secondloop16: - sal edx,1 //move high bit to CF - jnc skip16 //if CF = 0 - mov ax,[esi] - mov [ebx],ax + sal edx,1 //move high bit to CF + jnc skip16 //if CF = 0 + mov ax,[esi] + mov [ebx],ax skip16: - add esi,2 - add ebx,2 - - dec ecx - jnz secondloop16 + add esi,2 + add ebx,2 + dec ecx + jnz secondloop16 end16: - emms - } - } - else /* mmx _not supported - Use modified C 
routine */ - { - register unsigned int incr1, initial_val, final_val; - png_size_t pixel_bytes; - png_uint_32 i; - register int disp = png_pass_inc[png_ptr->pass]; - int offset_table[7] = {0, 4, 0, 2, 0, 1, 0}; - pixel_bytes = (png_ptr->row_info.pixel_depth >> 3); - srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]* - pixel_bytes; - dstptr = row + offset_table[png_ptr->pass]*pixel_bytes; - initial_val = offset_table[png_ptr->pass]*pixel_bytes; - final_val = png_ptr->width*pixel_bytes; - incr1 = (disp)*pixel_bytes; - for (i = initial_val; i < final_val; i += incr1) - { - png_memcpy(dstptr, srcptr, pixel_bytes); - srcptr += incr1; - dstptr += incr1; - } - } /* end of else */ + emms + } + } + else /* mmx not supported - use modified C routine */ + { + register unsigned int incr1, initial_val, final_val; + png_size_t pixel_bytes; + png_uint_32 i; + register int disp = png_pass_inc[png_ptr->pass]; + int offset_table[7] = {0, 4, 0, 2, 0, 1, 0}; - break; - } - case 24: - { - png_bytep srcptr; - png_bytep dstptr; - png_uint_32 len; - int unmask, diff; + pixel_bytes = (png_ptr->row_info.pixel_depth >> 3); + srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]* + pixel_bytes; + dstptr = row + offset_table[png_ptr->pass]*pixel_bytes; + initial_val = offset_table[png_ptr->pass]*pixel_bytes; + final_val = png_ptr->width*pixel_bytes; + incr1 = (disp)*pixel_bytes; + for (i = initial_val; i < final_val; i += incr1) + { + png_memcpy(dstptr, srcptr, pixel_bytes); + srcptr += incr1; + dstptr += incr1; + } + } /* end of else */ - __int64 mask2=0x0101010202020404, //24bpp - mask1=0x0408080810101020, - mask0=0x2020404040808080; + break; + } - srcptr = png_ptr->row_buf + 1; - dstptr = row; + case 24: + { + png_bytep srcptr; + png_bytep dstptr; + png_uint_32 len; + int unmask, diff; - unmask = ~mask; - len = (png_ptr->width)&~7; - diff = (png_ptr->width)&7; + __int64 mask2=0x0101010202020404, //24bpp + mask1=0x0408080810101020, + mask0=0x2020404040808080; - if 
(mmx_supported) - { - _asm { - movd mm7, unmask //load bit pattern - psubb mm6,mm6 //zero mm6 - punpcklbw mm7,mm7 - punpcklwd mm7,mm7 - punpckldq mm7,mm7 //fill register with 8 masks + srcptr = png_ptr->row_buf + 1; + dstptr = row; - movq mm0,mask0 - movq mm1,mask1 - movq mm2,mask2 + unmask = ~mask; + len = (png_ptr->width)&~7; + diff = (png_ptr->width)&7; + if (mmx_supported) + { + _asm + { + movd mm7, unmask //load bit pattern + psubb mm6,mm6 //zero mm6 + punpcklbw mm7,mm7 + punpcklwd mm7,mm7 + punpckldq mm7,mm7 //fill register with 8 masks - pand mm0,mm7 - pand mm1,mm7 - pand mm2,mm7 + movq mm0,mask0 + movq mm1,mask1 + movq mm2,mask2 - pcmpeqb mm0,mm6 - pcmpeqb mm1,mm6 - pcmpeqb mm2,mm6 + pand mm0,mm7 + pand mm1,mm7 + pand mm2,mm7 - mov ecx,len //load length of line - mov esi,srcptr //load source - mov ebx,dstptr //load dest - cmp ecx,0 - jz mainloop24end + pcmpeqb mm0,mm6 + pcmpeqb mm1,mm6 + pcmpeqb mm2,mm6 + + mov ecx,len //load length of line + mov esi,srcptr //load source + mov ebx,dstptr //load dest + cmp ecx,0 + jz mainloop24end mainloop24: - movq mm4,[esi] - pand mm4,mm0 - movq mm6,mm0 - movq mm7,[ebx] - pandn mm6,mm7 - por mm4,mm6 - movq [ebx],mm4 + movq mm4,[esi] + pand mm4,mm0 + movq mm6,mm0 + movq mm7,[ebx] + pandn mm6,mm7 + por mm4,mm6 + movq [ebx],mm4 - movq mm5,[esi+8] - pand mm5,mm1 - movq mm7,mm1 - movq mm6,[ebx+8] - pandn mm7,mm6 - por mm5,mm7 - movq [ebx+8],mm5 + movq mm5,[esi+8] + pand mm5,mm1 + movq mm7,mm1 + movq mm6,[ebx+8] + pandn mm7,mm6 + por mm5,mm7 + movq [ebx+8],mm5 - movq mm6,[esi+16] - pand mm6,mm2 - movq mm4,mm2 - movq mm7,[ebx+16] - pandn mm4,mm7 - por mm6,mm4 - movq [ebx+16],mm6 + movq mm6,[esi+16] + pand mm6,mm2 + movq mm4,mm2 + movq mm7,[ebx+16] + pandn mm4,mm7 + por mm6,mm4 + movq [ebx+16],mm6 - add esi,24 //inc by 24 bytes processed - add ebx,24 - sub ecx,8 //dec by 8 pixels processed + add esi,24 //inc by 24 bytes processed + add ebx,24 + sub ecx,8 //dec by 8 pixels processed + + ja mainloop24 - ja mainloop24 mainloop24end: 
+ mov ecx,diff + cmp ecx,0 + jz end24 - mov ecx,diff - cmp ecx,0 - jz end24 - - mov edx,mask - sal edx,24 //make low byte the high byte - + mov edx,mask + sal edx,24 //make low byte the high byte secondloop24: - sal edx,1 //move high bit to CF - jnc skip24 //if CF = 0 - mov ax,[esi] - mov [ebx],ax - xor eax,eax - mov al,[esi+2] - mov [ebx+2],al + sal edx,1 //move high bit to CF + jnc skip24 //if CF = 0 + mov ax,[esi] + mov [ebx],ax + xor eax,eax + mov al,[esi+2] + mov [ebx+2],al skip24: - add esi,3 - add ebx,3 + add esi,3 + add ebx,3 - dec ecx - jnz secondloop24 + dec ecx + jnz secondloop24 end24: - emms + emms + } + } + else /* mmx not supported - use modified C routine */ + { + register unsigned int incr1, initial_val, final_val; + png_size_t pixel_bytes; + png_uint_32 i; + register int disp = png_pass_inc[png_ptr->pass]; + int offset_table[7] = {0, 4, 0, 2, 0, 1, 0}; + pixel_bytes = (png_ptr->row_info.pixel_depth >> 3); + srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]* + pixel_bytes; + dstptr = row + offset_table[png_ptr->pass]*pixel_bytes; + initial_val = offset_table[png_ptr->pass]*pixel_bytes; + final_val = png_ptr->width*pixel_bytes; + incr1 = (disp)*pixel_bytes; + for (i = initial_val; i < final_val; i += incr1) + { + png_memcpy(dstptr, srcptr, pixel_bytes); + srcptr += incr1; + dstptr += incr1; + } + } /* end of else */ - } - } - else /* mmx _not supported - Use modified C routine */ - { - register unsigned int incr1, initial_val, final_val; - png_size_t pixel_bytes; - png_uint_32 i; - register int disp = png_pass_inc[png_ptr->pass]; - int offset_table[7] = {0, 4, 0, 2, 0, 1, 0}; - pixel_bytes = (png_ptr->row_info.pixel_depth >> 3); - srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass] - *pixel_bytes; - dstptr = row + offset_table[png_ptr->pass]*pixel_bytes; - initial_val = offset_table[png_ptr->pass]*pixel_bytes; - final_val = png_ptr->width*pixel_bytes; - incr1 = (disp)*pixel_bytes; - for (i = initial_val; i < final_val; i += incr1) 
- { - png_memcpy(dstptr, srcptr, pixel_bytes); - srcptr += incr1; - dstptr += incr1; - } - } /* end of else */ + break; + } //end 24bpp - break; - } //end 24bpp - case 32: - { - png_bytep srcptr; - png_bytep dstptr; - png_uint_32 len; - int unmask, diff; + case 32: + { + png_bytep srcptr; + png_bytep dstptr; + png_uint_32 len; + int unmask, diff; + __int64 mask3=0x0101010102020202, //32bpp + mask2=0x0404040408080808, + mask1=0x1010101020202020, + mask0=0x4040404080808080; + srcptr = png_ptr->row_buf + 1; + dstptr = row; - __int64 mask3=0x0101010102020202, //32bpp - mask2=0x0404040408080808, - mask1=0x1010101020202020, - mask0=0x4040404080808080; + unmask = ~mask; + len = (png_ptr->width)&~7; + diff = (png_ptr->width)&7; - srcptr = png_ptr->row_buf + 1; - dstptr = row; + if (mmx_supported) + { + _asm + { + movd mm7, unmask //load bit pattern + psubb mm6,mm6 //zero mm6 + punpcklbw mm7,mm7 + punpcklwd mm7,mm7 + punpckldq mm7,mm7 //fill register with 8 masks - unmask = ~mask; - len = (png_ptr->width)&~7; - diff = (png_ptr->width)&7; + movq mm0,mask0 + movq mm1,mask1 + movq mm2,mask2 + movq mm3,mask3 - if (mmx_supported) - { - _asm { - movd mm7, unmask //load bit pattern - psubb mm6,mm6 //zero mm6 - punpcklbw mm7,mm7 - punpcklwd mm7,mm7 - punpckldq mm7,mm7 //fill register with 8 masks + pand mm0,mm7 + pand mm1,mm7 + pand mm2,mm7 + pand mm3,mm7 - movq mm0,mask0 - movq mm1,mask1 - movq mm2,mask2 - movq mm3,mask3 + pcmpeqb mm0,mm6 + pcmpeqb mm1,mm6 + pcmpeqb mm2,mm6 + pcmpeqb mm3,mm6 + mov ecx,len //load length of line + mov esi,srcptr //load source + mov ebx,dstptr //load dest - pand mm0,mm7 - pand mm1,mm7 - pand mm2,mm7 - pand mm3,mm7 - - pcmpeqb mm0,mm6 - pcmpeqb mm1,mm6 - pcmpeqb mm2,mm6 - pcmpeqb mm3,mm6 - - mov ecx,len //load length of line - mov esi,srcptr //load source - mov ebx,dstptr //load dest - - cmp ecx,0 //lcr - jz mainloop32end + cmp ecx,0 //lcr + jz mainloop32end mainloop32: - movq mm4,[esi] - pand mm4,mm0 - movq mm6,mm0 - movq mm7,[ebx] - pandn mm6,mm7 - 
por mm4,mm6 - movq [ebx],mm4 + movq mm4,[esi] + pand mm4,mm0 + movq mm6,mm0 + movq mm7,[ebx] + pandn mm6,mm7 + por mm4,mm6 + movq [ebx],mm4 + movq mm5,[esi+8] + pand mm5,mm1 + movq mm7,mm1 + movq mm6,[ebx+8] + pandn mm7,mm6 + por mm5,mm7 + movq [ebx+8],mm5 - movq mm5,[esi+8] - pand mm5,mm1 - movq mm7,mm1 - movq mm6,[ebx+8] - pandn mm7,mm6 - por mm5,mm7 - movq [ebx+8],mm5 + movq mm6,[esi+16] + pand mm6,mm2 + movq mm4,mm2 + movq mm7,[ebx+16] + pandn mm4,mm7 + por mm6,mm4 + movq [ebx+16],mm6 - movq mm6,[esi+16] - pand mm6,mm2 - movq mm4,mm2 - movq mm7,[ebx+16] - pandn mm4,mm7 - por mm6,mm4 - movq [ebx+16],mm6 + movq mm7,[esi+24] + pand mm7,mm3 + movq mm5,mm3 + movq mm4,[ebx+24] + pandn mm5,mm4 + por mm7,mm5 + movq [ebx+24],mm7 - movq mm7,[esi+24] - pand mm7,mm3 - movq mm5,mm3 - movq mm4,[ebx+24] - pandn mm5,mm4 - por mm7,mm5 - movq [ebx+24],mm7 + add esi,32 //inc by 32 bytes processed + add ebx,32 + sub ecx,8 //dec by 8 pixels processed + ja mainloop32 - add esi,32 //inc by 32 bytes processed - add ebx,32 - sub ecx,8 //dec by 8 pixels processed - - ja mainloop32 mainloop32end: + mov ecx,diff + cmp ecx,0 + jz end32 - mov ecx,diff - cmp ecx,0 - jz end32 - - mov edx,mask - sal edx,24 //make low byte the high byte - + mov edx,mask + sal edx,24 //make low byte the high byte secondloop32: - sal edx,1 //move high bit to CF - jnc skip32 //if CF = 0 - mov eax,[esi] - mov [ebx],eax + sal edx,1 //move high bit to CF + jnc skip32 //if CF = 0 + mov eax,[esi] + mov [ebx],eax skip32: - add esi,4 - add ebx,4 + add esi,4 + add ebx,4 - dec ecx - jnz secondloop32 + dec ecx + jnz secondloop32 end32: - emms + emms + } + } + else /* mmx _not supported - Use modified C routine */ + { + register unsigned int incr1, initial_val, final_val; + png_size_t pixel_bytes; + png_uint_32 i; + register int disp = png_pass_inc[png_ptr->pass]; + int offset_table[7] = {0, 4, 0, 2, 0, 1, 0}; + pixel_bytes = (png_ptr->row_info.pixel_depth >> 3); + srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]* 
+ pixel_bytes; + dstptr = row + offset_table[png_ptr->pass]*pixel_bytes; + initial_val = offset_table[png_ptr->pass]*pixel_bytes; + final_val = png_ptr->width*pixel_bytes; + incr1 = (disp)*pixel_bytes; + for (i = initial_val; i < final_val; i += incr1) + { + png_memcpy(dstptr, srcptr, pixel_bytes); + srcptr += incr1; + dstptr += incr1; + } + } /* end of else */ - } - } - else /* mmx _not supported - Use modified C routine */ - { - register unsigned int incr1, initial_val, final_val; - png_size_t pixel_bytes; - png_uint_32 i; - register int disp = png_pass_inc[png_ptr->pass]; - int offset_table[7] = {0, 4, 0, 2, 0, 1, 0}; - pixel_bytes = (png_ptr->row_info.pixel_depth >> 3); - srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]* - pixel_bytes; - dstptr = row + offset_table[png_ptr->pass]*pixel_bytes; - initial_val = offset_table[png_ptr->pass]*pixel_bytes; - final_val = png_ptr->width*pixel_bytes; - incr1 = (disp)*pixel_bytes; - for (i = initial_val; i < final_val; i += incr1) - { - png_memcpy(dstptr, srcptr, pixel_bytes); - srcptr += incr1; - dstptr += incr1; - } - } /* end of else */ + break; + } //end 32bpp - break; - } //end 32bpp + case 48: + { + png_bytep srcptr; + png_bytep dstptr; + png_uint_32 len; + int unmask, diff; + __int64 mask5=0x0101010101010202, + mask4=0x0202020204040404, + mask3=0x0404080808080808, + mask2=0x1010101010102020, + mask1=0x2020202040404040, + mask0=0x4040808080808080; - case 48: - { - png_bytep srcptr; - png_bytep dstptr; - png_uint_32 len; - int unmask, diff; + if (mmx_supported) + { + srcptr = png_ptr->row_buf + 1; + dstptr = row; - __int64 mask5=0x0101010101010202, - mask4=0x0202020204040404, - mask3=0x0404080808080808, - mask2=0x1010101010102020, - mask1=0x2020202040404040, - mask0=0x4040808080808080; + unmask = ~mask; + len = (png_ptr->width)&~7; + diff = (png_ptr->width)&7; + _asm + { + movd mm7, unmask //load bit pattern + psubb mm6,mm6 //zero mm6 + punpcklbw mm7,mm7 + punpcklwd mm7,mm7 + punpckldq mm7,mm7 //fill 
register with 8 masks - if (mmx_supported) - { + movq mm0,mask0 + movq mm1,mask1 + movq mm2,mask2 + movq mm3,mask3 + movq mm4,mask4 + movq mm5,mask5 - srcptr = png_ptr->row_buf + 1; - dstptr = row; + pand mm0,mm7 + pand mm1,mm7 + pand mm2,mm7 + pand mm3,mm7 + pand mm4,mm7 + pand mm5,mm7 - unmask = ~mask; - len = (png_ptr->width)&~7; - diff = (png_ptr->width)&7; - _asm { - movd mm7, unmask //load bit pattern - psubb mm6,mm6 //zero mm6 - punpcklbw mm7,mm7 - punpcklwd mm7,mm7 - punpckldq mm7,mm7 //fill register with 8 masks + pcmpeqb mm0,mm6 + pcmpeqb mm1,mm6 + pcmpeqb mm2,mm6 + pcmpeqb mm3,mm6 + pcmpeqb mm4,mm6 + pcmpeqb mm5,mm6 - movq mm0,mask0 - movq mm1,mask1 - movq mm2,mask2 - movq mm3,mask3 - movq mm4,mask4 - movq mm5,mask5 + mov ecx,len //load length of line + mov esi,srcptr //load source + mov ebx,dstptr //load dest - pand mm0,mm7 - pand mm1,mm7 - pand mm2,mm7 - pand mm3,mm7 - pand mm4,mm7 - pand mm5,mm7 - - pcmpeqb mm0,mm6 - pcmpeqb mm1,mm6 - pcmpeqb mm2,mm6 - pcmpeqb mm3,mm6 - pcmpeqb mm4,mm6 - pcmpeqb mm5,mm6 - - mov ecx,len //load length of line - mov esi,srcptr //load source - mov ebx,dstptr //load dest - - cmp ecx,0 - jz mainloop48end + cmp ecx,0 + jz mainloop48end mainloop48: - movq mm7,[esi] - pand mm7,mm0 - movq mm6,mm0 - pandn mm6,[ebx] - por mm7,mm6 - movq [ebx],mm7 + movq mm7,[esi] + pand mm7,mm0 + movq mm6,mm0 + pandn mm6,[ebx] + por mm7,mm6 + movq [ebx],mm7 + movq mm6,[esi+8] + pand mm6,mm1 + movq mm7,mm1 + pandn mm7,[ebx+8] + por mm6,mm7 + movq [ebx+8],mm6 - movq mm6,[esi+8] - pand mm6,mm1 - movq mm7,mm1 - pandn mm7,[ebx+8] - por mm6,mm7 - movq [ebx+8],mm6 + movq mm6,[esi+16] + pand mm6,mm2 + movq mm7,mm2 + pandn mm7,[ebx+16] + por mm6,mm7 + movq [ebx+16],mm6 - movq mm6,[esi+16] - pand mm6,mm2 - movq mm7,mm2 - pandn mm7,[ebx+16] - por mm6,mm7 - movq [ebx+16],mm6 + movq mm7,[esi+24] + pand mm7,mm3 + movq mm6,mm3 + pandn mm6,[ebx+24] + por mm7,mm6 + movq [ebx+24],mm7 - movq mm7,[esi+24] - pand mm7,mm3 - movq mm6,mm3 - pandn mm6,[ebx+24] - por 
mm7,mm6 - movq [ebx+24],mm7 + movq mm6,[esi+32] + pand mm6,mm4 + movq mm7,mm4 + pandn mm7,[ebx+32] + por mm6,mm7 + movq [ebx+32],mm6 - movq mm6,[esi+32] - pand mm6,mm4 - movq mm7,mm4 - pandn mm7,[ebx+32] - por mm6,mm7 - movq [ebx+32],mm6 + movq mm7,[esi+40] + pand mm7,mm5 + movq mm6,mm5 + pandn mm6,[ebx+40] + por mm7,mm6 + movq [ebx+40],mm7 - movq mm7,[esi+40] - pand mm7,mm5 - movq mm6,mm5 - pandn mm6,[ebx+40] - por mm7,mm6 - movq [ebx+40],mm7 + add esi,48 //inc by 32 bytes processed + add ebx,48 + sub ecx,8 //dec by 8 pixels processed - add esi,48 //inc by 32 bytes processed - add ebx,48 - sub ecx,8 //dec by 8 pixels processed - - ja mainloop48 + ja mainloop48 mainloop48end: - mov ecx,diff - cmp ecx,0 - jz end48 + mov ecx,diff + cmp ecx,0 + jz end48 - mov edx,mask - sal edx,24 //make low byte the high byte + mov edx,mask + sal edx,24 //make low byte the high byte secondloop48: - sal edx,1 //move high bit to CF - jnc skip48 //if CF = 0 - mov eax,[esi] - mov [ebx],eax + sal edx,1 //move high bit to CF + jnc skip48 //if CF = 0 + mov eax,[esi] + mov [ebx],eax skip48: - add esi,4 - add ebx,4 + add esi,4 + add ebx,4 - dec ecx - jnz secondloop48 + dec ecx + jnz secondloop48 end48: - emms - } - } - else /* mmx _not supported - Use modified C routine */ - { - register unsigned int incr1, initial_val, final_val; - png_size_t pixel_bytes; - png_uint_32 i; - register int disp = png_pass_inc[png_ptr->pass]; - int offset_table[7] = {0, 4, 0, 2, 0, 1, 0}; - pixel_bytes = (png_ptr->row_info.pixel_depth >> 3); - srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]* - pixel_bytes; - dstptr = row + offset_table[png_ptr->pass]*pixel_bytes; - initial_val = offset_table[png_ptr->pass]*pixel_bytes; - final_val = png_ptr->width*pixel_bytes; - incr1 = (disp)*pixel_bytes; - for (i = initial_val; i < final_val; i += incr1) - { - png_memcpy(dstptr, srcptr, pixel_bytes); - srcptr += incr1; - dstptr += incr1; - } - } /* end of else */ - break; // end 48 bpp - } - default: - { - 
png_bytep sptr; - png_bytep dp; - png_size_t pixel_bytes; - int offset_table[7] = {0, 4, 0, 2, 0, 1, 0}; - unsigned int i; - register int disp = png_pass_inc[png_ptr->pass]; // get the offset - register unsigned int incr1, initial_val, final_val; - pixel_bytes = (png_ptr->row_info.pixel_depth >> 3); - sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*pixel_bytes; - dp = row + offset_table[png_ptr->pass]*pixel_bytes; - initial_val = offset_table[png_ptr->pass]*pixel_bytes; - final_val = png_ptr->width*pixel_bytes; - incr1 = (disp)*pixel_bytes; - for (i = initial_val; i < final_val; i += incr1) - { - png_memcpy(dp, sptr, pixel_bytes); - sptr += incr1; - dp += incr1; - } + emms + } + } + else /* mmx _not supported - Use modified C routine */ + { + register unsigned int incr1, initial_val, final_val; + png_size_t pixel_bytes; + png_uint_32 i; + register int disp = png_pass_inc[png_ptr->pass]; + int offset_table[7] = {0, 4, 0, 2, 0, 1, 0}; + pixel_bytes = (png_ptr->row_info.pixel_depth >> 3); + srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]* + pixel_bytes; + dstptr = row + offset_table[png_ptr->pass]*pixel_bytes; + initial_val = offset_table[png_ptr->pass]*pixel_bytes; + final_val = png_ptr->width*pixel_bytes; + incr1 = (disp)*pixel_bytes; + for (i = initial_val; i < final_val; i += incr1) + { + png_memcpy(dstptr, srcptr, pixel_bytes); + srcptr += incr1; + dstptr += incr1; + } + } /* end of else */ + break; // end 48 bpp + } - break; - } - } - } -} + default: + { + png_bytep sptr; + png_bytep dp; + png_size_t pixel_bytes; + int offset_table[7] = {0, 4, 0, 2, 0, 1, 0}; + unsigned int i; + register int disp = png_pass_inc[png_ptr->pass]; // get the offset + register unsigned int incr1, initial_val, final_val; + pixel_bytes = (png_ptr->row_info.pixel_depth >> 3); + sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]* + pixel_bytes; + dp = row + offset_table[png_ptr->pass]*pixel_bytes; + initial_val = offset_table[png_ptr->pass]*pixel_bytes; + 
final_val = png_ptr->width*pixel_bytes; + incr1 = (disp)*pixel_bytes; + for (i = initial_val; i < final_val; i += incr1) + { + png_memcpy(dp, sptr, pixel_bytes); + sptr += incr1; + dp += incr1; + } + break; + } + } /* end switch (png_ptr->row_info.pixel_depth) */ + } + mmx_supported = save_mmx_supported; + +} /* end png_combine_row() */ #if defined(PNG_READ_INTERLACING_SUPPORTED) @@ -946,9 +945,11 @@ png_do_read_interlace(png_row_infop row_info, png_bytep row, int pass, png_uint_32 transformations) { + int save_mmx_supported = mmx_supported; png_debug(1,"in png_do_read_interlace\n"); - if (mmx_supported==2) - mmx_supported=mmxsupport(); + + // mmx_supported = mmxsupport(); // doesn't work + mmx_supported = 0; if (row != NULL && row_info != NULL) { @@ -958,1068 +959,931 @@ png_do_read_interlace(png_row_infop row_info, png_bytep row, int pass, switch (row_info->pixel_depth) { - case 1: - { - png_bytep sp, dp; - int sshift, dshift; - int s_start, s_end, s_inc; - png_byte v; - png_uint_32 i; - int j; + case 1: + { + png_bytep sp, dp; + int sshift, dshift; + int s_start, s_end, s_inc; + png_byte v; + png_uint_32 i; + int j; - sp = row + (png_size_t)((row_info->width - 1) >> 3); - dp = row + (png_size_t)((final_width - 1) >> 3); + sp = row + (png_size_t)((row_info->width - 1) >> 3); + dp = row + (png_size_t)((final_width - 1) >> 3); #if defined(PNG_READ_PACKSWAP_SUPPORTED) - if (transformations & PNG_PACKSWAP) - { - sshift = (int)((row_info->width + 7) & 7); - dshift = (int)((final_width + 7) & 7); - s_start = 7; - s_end = 0; - s_inc = -1; - } - else + if (transformations & PNG_PACKSWAP) + { + sshift = (int)((row_info->width + 7) & 7); + dshift = (int)((final_width + 7) & 7); + s_start = 7; + s_end = 0; + s_inc = -1; + } + else #endif - { - sshift = 7 - (int)((row_info->width + 7) & 7); - dshift = 7 - (int)((final_width + 7) & 7); - s_start = 0; - s_end = 7; - s_inc = 1; - } - - for (i = row_info->width; i; i--) - { - v = (png_byte)((*sp >> sshift) & 0x1); - for (j = 0; 
j < png_pass_inc[pass]; j++) - { - *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff); - *dp |= (png_byte)(v << dshift); - if (dshift == s_end) - { - dshift = s_start; - dp--; - } - else - dshift += s_inc; - } - if (sshift == s_end) - { - sshift = s_start; - sp--; - } - else - sshift += s_inc; - } - break; - } - case 2: - { - png_bytep sp, dp; - int sshift, dshift; - int s_start, s_end, s_inc; - png_uint_32 i; - - sp = row + (png_size_t)((row_info->width - 1) >> 2); - dp = row + (png_size_t)((final_width - 1) >> 2); -#if defined(PNG_READ_PACKSWAP_SUPPORTED) - if (transformations & PNG_PACKSWAP) - { - sshift = (png_size_t)(((row_info->width + 3) & 3) << 1); - dshift = (png_size_t)(((final_width + 3) & 3) << 1); - s_start = 6; - s_end = 0; - s_inc = -2; - } - else -#endif - { - sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1); - dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1); - s_start = 0; - s_end = 6; - s_inc = 2; - } - - for (i = row_info->width; i; i--) - { - png_byte v; - int j; - - v = (png_byte)((*sp >> sshift) & 0x3); - for (j = 0; j < png_pass_inc[pass]; j++) - { - *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff); - *dp |= (png_byte)(v << dshift); - if (dshift == s_end) - { - dshift = s_start; - dp--; - } - else - dshift += s_inc; - } - if (sshift == s_end) - { - sshift = s_start; - sp--; - } - else - sshift += s_inc; - } - break; - } - case 4: - { - png_bytep sp, dp; - int sshift, dshift; - int s_start, s_end, s_inc; - png_uint_32 i; - - sp = row + (png_size_t)((row_info->width - 1) >> 1); - dp = row + (png_size_t)((final_width - 1) >> 1); -#if defined(PNG_READ_PACKSWAP_SUPPORTED) - if (transformations & PNG_PACKSWAP) - { - sshift = (png_size_t)(((row_info->width + 1) & 1) << 2); - dshift = (png_size_t)(((final_width + 1) & 1) << 2); - s_start = 4; - s_end = 0; - s_inc = -4; - } - else -#endif - { - sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2); - dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2); - s_start 
= 0; - s_end = 4; - s_inc = 4; - } - - for (i = row_info->width; i; i--) - { - png_byte v; - int j; - - v = (png_byte)((*sp >> sshift) & 0xf); - for (j = 0; j < png_pass_inc[pass]; j++) - { - *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff); - *dp |= (png_byte)(v << dshift); - if (dshift == s_end) - { - dshift = s_start; - dp--; - } - else - dshift += s_inc; - } - if (sshift == s_end) - { - sshift = s_start; - sp--; - } - else - sshift += s_inc; - } - break; - } - default: // This is the place where the routine is modified - { - __int64 const4 = 0x0000000000FFFFFF; - __int64 const5 = 0x000000FFFFFF0000; - __int64 const6 = 0x00000000000000FF; - //int mmx_supported = 1; - - png_bytep sptr, dp; - png_uint_32 i; - png_size_t pixel_bytes; - - int width = row_info->width; - - pixel_bytes = (row_info->pixel_depth >> 3); - - sptr = row + (row_info->width - 1) * pixel_bytes; - dp = row + (final_width - 1) * pixel_bytes; - // New code by Nirav Chhatrapati - Intel Corporation - - if (mmx_supported) // If machine supports MMX technology use MMX routine - { - if (pixel_bytes == 3) - { - if ((pass == 0) || (pass == 1)) - { - _asm - { - mov esi, sptr - - mov edi, dp - - mov ecx, width - - sub edi, 21 // (png_pass_inc[pass] - 1)*pixel_bytes - -loop_pass0: - - movd mm0, [esi] ; X X X X X val2 val1 val0 - - pand mm0, const4 ; 0 0 0 0 0 val2 val1 val0 - - movq mm1, mm0 ; 0 0 0 0 0 val2 val1 val0 - - psllq mm0, 16 ; 0 0 0 val2 val1 val0 0 0 - - movq mm2, mm0 ; 0 0 0 val2 val1 val0 0 0 - - psllq mm0, 24 ; val2 val1 val0 0 0 0 0 0 - - psrlq mm1, 8 ; 0 0 0 0 0 0 val2 val1 - - por mm0, mm2 ; val2 val1 val0 val2 val1 val0 0 0 - - por mm0, mm1 ; val2 val1 val0 val2 val1 val0 val2 val1 - - movq mm3, mm0 ; val2 val1 val0 val2 val1 val0 val2 val1 - - psllq mm0, 16 ; val0 val2 val1 val0 val2 val1 0 0 - - movq mm4, mm3 ; val2 val1 val0 val2 val1 val0 val2 val1 - - punpckhdq mm3, mm0 ; val0 val2 val1 val0 val2 val1 val0 val2 - - movq [edi+16] , mm4 - - psrlq mm0, 32 ; 0 0 0 0 val0 val2 val1 val0 
- - movq [edi+8] , mm3 - - punpckldq mm0, mm4 ; val1 val0 val2 val1 val0 val2 val1 val0 - - sub esi, 3 - - movq [edi], mm0 - - sub edi, 24 - - //sub esi, 3 - - dec ecx - - jnz loop_pass0 - - EMMS - } - - } - - else if ((pass == 2) || (pass == 3)) - { - _asm - { - mov esi, sptr - - mov edi, dp - - mov ecx, width - - sub edi, 9 // (png_pass_inc[pass] - 1)*pixel_bytes - -loop_pass2: - - movd mm0, [esi] ; X X X X X val2 val1 val0 - - pand mm0, const4 ; 0 0 0 0 0 val2 val1 val0 - - movq mm1, mm0 ; 0 0 0 0 0 val2 val1 val0 - - psllq mm0, 16 ; 0 0 0 val2 val1 val0 0 0 - - movq mm2, mm0 ; 0 0 0 val2 val1 val0 0 0 - - psllq mm0, 24 ; val2 val1 val0 0 0 0 0 0 - - psrlq mm1, 8 ; 0 0 0 0 0 0 val2 val1 - - por mm0, mm2 ; val2 val1 val0 val2 val1 val0 0 0 - - por mm0, mm1 ; val2 val1 val0 val2 val1 val0 val2 val1 - - movq [edi+4], mm0 ; move to memory - - psrlq mm0, 16 ; 0 0 val2 val1 val0 val2 val1 val0 - - movd [edi], mm0 ; move to memory - - sub esi, 3 - - sub edi, 12 - - dec ecx - - jnz loop_pass2 - - EMMS - } - } - - else /*if ((pass == 4) || (pass == 5)) */ - { - - int width_mmx = ((width >> 1) << 1) - 8; - width -= width_mmx; - if(width_mmx) - _asm - { - mov esi, sptr - - mov edi, dp - - mov ecx, width_mmx - - sub esi, 3 - - sub edi, 9 - -loop_pass4: - - movq mm0, [esi] ; X X v2 v1 v0 v5 v4 v3 - - movq mm7, mm0 ; X X v2 v1 v0 v5 v4 v3 - - movq mm6, mm0 ; X X v2 v1 v0 v5 v4 v3 - - psllq mm0, 24 ; v1 v0 v5 v4 v3 0 0 0 - - pand mm7, const4 ; 0 0 0 0 0 v5 v4 v3 - - psrlq mm6, 24 ; 0 0 0 X X v2 v1 v0 - - por mm0, mm7 ; v1 v0 v5 v4 v3 v5 v4 v3 - - movq mm5, mm6 ; 0 0 0 X X v2 v1 v0 - - psllq mm6, 8 ; 0 0 X X v2 v1 v0 0 - - movq [edi], mm0 ; move quad to memory - - psrlq mm5, 16 ; 0 0 0 0 0 X X v2 - - pand mm5, const6 ; 0 0 0 0 0 0 0 v2 - - por mm6, mm5 ; 0 0 X X v2 v1 v0 v2 - - movd [edi+8], mm6 ; move double to memory - - sub esi, 6 - - sub edi, 12 - - sub ecx, 2 - - jnz loop_pass4 - - EMMS - } - - sptr -= width_mmx*3; - dp -= width_mmx*6; - for (i = width; i; i--) - { - 
png_byte v[8]; - int j; - - png_memcpy(v, sptr, pixel_bytes); - for (j = 0; j < png_pass_inc[pass]; j++) { - png_memcpy(dp, v, pixel_bytes); - dp -= pixel_bytes; + sshift = 7 - (int)((row_info->width + 7) & 7); + dshift = 7 - (int)((final_width + 7) & 7); + s_start = 0; + s_end = 7; + s_inc = 1; } - sptr -= pixel_bytes; - } - } - - } /* end of pixel_bytes == 3 */ - - else if (pixel_bytes == 1) - { - - if ((pass == 0) || (pass == 1)) - { - int width_mmx = ((width >> 2) << 2); - width -= width_mmx; - if(width_mmx) - _asm - { - - mov esi, sptr - - mov edi, dp - - mov ecx, width_mmx - - sub edi, 31 - - sub esi, 3 - -loop1_pass0: - - movd mm0, [esi] ; X X X X v0 v1 v2 v3 - - movq mm1, mm0 ; X X X X v0 v1 v2 v3 - - punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3 - - movq mm2, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3 - - punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3 - - movq mm3, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3 - - punpckldq mm0, mm0 ; v3 v3 v3 v3 v3 v3 v3 v3 - - punpckhdq mm3, mm3 ; v2 v2 v2 v2 v2 v2 v2 v2 - - movq [edi], mm0 ; move to memory v3 - - punpckhwd mm2, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1 - - movq [edi+8], mm3 ; move to memory v2 - - movq mm4, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1 - - punpckldq mm2, mm2 ; v1 v1 v1 v1 v1 v1 v1 v1 - - punpckhdq mm4, mm4 ; v0 v0 v0 v0 v0 v0 v0 v0 - - movq [edi+16], mm2 ; move to memory v1 - - movq [edi+24], mm4 ; move to memory v0 - - sub esi, 4 - - sub edi, 32 - - sub ecx, 4 - - jnz loop1_pass0 - - EMMS - } - - sptr -= width_mmx; - dp -= width_mmx*8; - for (i = width; i; i--) - { - png_byte v[8]; - int j; - - png_memcpy(v, sptr, pixel_bytes); - for (j = 0; j < png_pass_inc[pass]; j++) + for (i = row_info->width; i; i--) { - png_memcpy(dp, v, pixel_bytes); - dp -= pixel_bytes; - } - sptr -= pixel_bytes; - } - - } - - - else if ((pass == 2) || (pass == 3)) - { - int width_mmx = ((width >> 2) << 2); - width -= width_mmx; - if(width_mmx) - _asm - { - - mov esi, sptr - - mov edi, dp - - mov ecx, width_mmx - - sub edi, 15 - - sub esi, 3 - -loop1_pass2: - - movd 
mm0, [esi] ; X X X X v0 v1 v2 v3 - - punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3 - - movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3 - - punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3 - - punpckhwd mm1, mm1 ; v0 v0 v0 v0 v1 v1 v1 v1 - - movq [edi], mm0 ; move to memory v2 and v3 - - sub esi, 4 - - movq [edi+8], mm1 ; move to memory v1 and v0 - - sub edi, 16 - - sub ecx, 4 - - jnz loop1_pass2 - - EMMS - } - - sptr -= width_mmx; - dp -= width_mmx*4; - for (i = width; i; i--) - { - png_byte v[8]; - int j; - - png_memcpy(v, sptr, pixel_bytes); - for (j = 0; j < png_pass_inc[pass]; j++) - { - png_memcpy(dp, v, pixel_bytes); - dp -= pixel_bytes; - } - sptr -= pixel_bytes; - } - - } - - else //if ((pass == 4) || (pass == 5)) - { - int width_mmx = ((width >> 3) << 3); - width -= width_mmx; - if(width_mmx) - _asm - { - - mov esi, sptr - mov edi, dp - mov ecx, width_mmx - sub edi, 15 - sub esi, 7 - -loop1_pass4: - - movq mm0, [esi] ; v0 v1 v2 v3 v4 v5 v6 v7 - movq mm1, mm0 ; v0 v1 v2 v3 v4 v5 v6 v7 - punpcklbw mm0, mm0 ; v4 v4 v5 v5 v6 v6 v7 v7 - //movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3 - punpckhbw mm1, mm1 ;v0 v0 v1 v1 v2 v2 v3 v3 - movq [edi+8], mm1 ; move to memory v0 v1 v2 and v3 - sub esi, 8 - movq [edi], mm0 ; move to memory v4 v5 v6 and v7 - //sub esi, 4 - sub edi, 16 - sub ecx, 8 - jnz loop1_pass4 - - EMMS - } - - sptr -= width_mmx; - dp -= width_mmx*2; - for (i = width; i; i--) - { - png_byte v[8]; - int j; - - png_memcpy(v, sptr, pixel_bytes); - for (j = 0; j < png_pass_inc[pass]; j++) - { - png_memcpy(dp, v, pixel_bytes); - dp -= pixel_bytes; - } - sptr -= pixel_bytes; - } - - } - - } /* end of pixel_bytes == 1 */ - - else if (pixel_bytes == 2) - { - - if ((pass == 0) || (pass == 1)) - { - int width_mmx = ((width >> 1) << 1); - width -= width_mmx; - if(width_mmx) - _asm - { - mov esi, sptr - mov edi, dp - mov ecx, width_mmx - sub esi, 2 - sub edi, 30 - -loop2_pass0: - movd mm0, [esi] ; X X X X v1 v0 v3 v2 - punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2 - movq mm1, mm0 ; v1 
v0 v1 v0 v3 v2 v3 v2 - punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2 - punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0 - movq [edi], mm0 - movq [edi + 8], mm0 - movq [edi + 16], mm1 - movq [edi + 24], mm1 - sub esi, 4 - sub edi, 32 - sub ecx, 2 - jnz loop2_pass0 - - EMMS - } - - sptr -= (width_mmx*2 + 2); - dp -= (width_mmx*16 + 2); - - for (i = width; i; i--) - { - - png_byte v[8]; - int j; - sptr -= pixel_bytes; - png_memcpy(v, sptr, pixel_bytes); - for (j = 0; j < png_pass_inc[pass]; j++) - { - dp -= pixel_bytes; - png_memcpy(dp, v, pixel_bytes); - //dp -= pixel_bytes; - } - //sptr -= pixel_bytes; - } - } - - else if ((pass == 2) || (pass == 3)) - { - int width_mmx = ((width >> 1) << 1) ; - width -= width_mmx; - if(width_mmx) - _asm - { - mov esi, sptr - mov edi, dp - mov ecx, width_mmx - sub esi, 2 - sub edi, 14 - -loop2_pass2: - movd mm0, [esi] ; X X X X v1 v0 v3 v2 - punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2 - movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2 - punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2 - punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0 - movq [edi], mm0 - sub esi, 4 - movq [edi + 8], mm1 - //sub esi, 4 - sub edi, 16 - sub ecx, 2 - jnz loop2_pass2 - - EMMS - } - - sptr -= (width_mmx*2 + 2); - dp -= (width_mmx*8 + 2); - - for (i = width; i; i--) - { - - png_byte v[8]; - int j; - sptr -= pixel_bytes; - png_memcpy(v, sptr, pixel_bytes); - for (j = 0; j < png_pass_inc[pass]; j++) - { - dp -= pixel_bytes; - png_memcpy(dp, v, pixel_bytes); - //dp -= pixel_bytes; - } - //sptr -= pixel_bytes; - } - } - - else // pass == 4 or 5 - { - int width_mmx = ((width >> 1) << 1) ; - width -= width_mmx; - if(width_mmx) - _asm - { - mov esi, sptr - mov edi, dp - mov ecx, width_mmx - sub esi, 2 - sub edi, 6 - -loop2_pass4: - movd mm0, [esi] ; X X X X v1 v0 v3 v2 - punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2 - sub esi, 4 - movq [edi], mm0 - sub edi, 8 - sub ecx, 2 - jnz loop2_pass4 - - EMMS - } - - sptr -= (width_mmx*2 + 2); - dp -= (width_mmx*4 + 2); - - for (i = width; i; 
i--) - { - - png_byte v[8]; - int j; - sptr -= pixel_bytes; - png_memcpy(v, sptr, pixel_bytes); - for (j = 0; j < png_pass_inc[pass]; j++) - { - dp -= pixel_bytes; - png_memcpy(dp, v, pixel_bytes); - //dp -= pixel_bytes; - } - //sptr -= pixel_bytes; - } - } - - } /* end of pixel_bytes == 2 */ - - else if (pixel_bytes == 4) - { - if ((pass == 0) || (pass == 1)) - { - int width_mmx = ((width >> 1) << 1) ; - width -= width_mmx; - if(width_mmx) - _asm - { - mov esi, sptr - mov edi, dp - mov ecx, width_mmx - sub esi, 4 - sub edi, 60 - -loop4_pass0: - movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4 - movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4 - punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4 - punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0 - movq [edi], mm0 - movq [edi + 8], mm0 - movq [edi + 16], mm0 - movq [edi + 24], mm0 - movq [edi+32], mm1 - movq [edi + 40], mm1 - movq [edi+ 48], mm1 - sub esi, 8 - movq [edi + 56], mm1 - sub edi, 64 - sub ecx, 2 - jnz loop4_pass0 - - EMMS - } - - sptr -= (width_mmx*4 + 4); - dp -= (width_mmx*32 + 4); - - for (i = width; i; i--) - { - - png_byte v[8]; - int j; - sptr -= pixel_bytes; - png_memcpy(v, sptr, pixel_bytes); - for (j = 0; j < png_pass_inc[pass]; j++) - { - dp -= pixel_bytes; - png_memcpy(dp, v, pixel_bytes); - //dp -= pixel_bytes; - } - //sptr -= pixel_bytes; - } - } - - else if ((pass == 2) || (pass == 3)) - { - int width_mmx = ((width >> 1) << 1) ; - width -= width_mmx; - if(width_mmx) - _asm - { - mov esi, sptr - mov edi, dp - mov ecx, width_mmx - sub esi, 4 - sub edi, 28 - -loop4_pass2: - movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4 - movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4 - punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4 - punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0 - movq [edi], mm0 - movq [edi + 8], mm0 - movq [edi+16], mm1 - movq [edi + 24], mm1 - sub esi, 8 - sub edi, 32 - sub ecx, 2 - jnz loop4_pass2 - - EMMS - } - - sptr -= (width_mmx*4 + 4); - dp -= (width_mmx*16 + 4); - - for (i = width; i; i--) - { - - png_byte v[8]; - int j; - 
sptr -= pixel_bytes; - png_memcpy(v, sptr, pixel_bytes); - for (j = 0; j < png_pass_inc[pass]; j++) - { - dp -= pixel_bytes; - png_memcpy(dp, v, pixel_bytes); - //dp -= pixel_bytes; - } - //sptr -= pixel_bytes; - } - } - - else // pass == 4 or 5 - { - int width_mmx = ((width >> 1) << 1) ; - width -= width_mmx; - if(width_mmx) - _asm - { - mov esi, sptr - mov edi, dp - mov ecx, width_mmx - sub esi, 4 - sub edi, 12 - -loop4_pass4: - movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4 - movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4 - punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4 - punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0 - movq [edi], mm0 - sub esi, 8 - movq [edi + 8], mm1 - sub edi, 16 - sub ecx, 2 - jnz loop4_pass4 - - EMMS - } - - sptr -= (width_mmx*4 + 4); - dp -= (width_mmx*8 + 4); - - for (i = width; i; i--) - { - - png_byte v[8]; - int j; - sptr -= pixel_bytes; - png_memcpy(v, sptr, pixel_bytes); - for (j = 0; j < png_pass_inc[pass]; j++) - { - dp -= pixel_bytes; - png_memcpy(dp, v, pixel_bytes); - //dp -= pixel_bytes; - } - //sptr -= pixel_bytes; - } - } - - } /* end of pixel_bytes == 4 */ - - else if (pixel_bytes == 6) - { - for (i = row_info->width; i; i--) - { - - png_byte v[8]; - int j; - png_memcpy(v, sptr, pixel_bytes); - for (j = 0; j < png_pass_inc[pass]; j++) - { - png_memcpy(dp, v, pixel_bytes); - dp -= pixel_bytes; - } - sptr -= pixel_bytes; - } - } /* end of pixel_bytes == 6 */ - - else - { - for (i = row_info->width; i; i--) - { - - png_byte v[8]; - int j; - png_memcpy(v, sptr, pixel_bytes); - for (j = 0; j < png_pass_inc[pass]; j++) - { - png_memcpy(dp, v, pixel_bytes); - dp -= pixel_bytes; - } - sptr-= pixel_bytes; - } - } - } /* end of mmx_supported */ - - else /* MMX not supported */ - /* use modified C code - takes advantage of inlining of memcpy for - a constant */ - { - if (pixel_bytes == 1) - { - for (i = row_info->width; i; i--) - { - png_byte v[8]; - int j; - - png_memcpy(v, sptr, pixel_bytes); - for (j = 0; j < png_pass_inc[pass]; j++) - { - 
png_memcpy(dp, v, pixel_bytes); - dp -= pixel_bytes; - } - sptr -= pixel_bytes; - } - } - else if (pixel_bytes == 3) - { - for (i = row_info->width; i; i--) - { - png_byte v[8]; - int j; - png_memcpy(v, sptr, pixel_bytes); - for (j = 0; j < png_pass_inc[pass]; j++) - { - png_memcpy(dp, v, pixel_bytes); - dp -= pixel_bytes; - } - sptr -= pixel_bytes; - } - } - else if (pixel_bytes == 2) - { - for (i = row_info->width; i; i--) - { - png_byte v[8]; - int j; - png_memcpy(v, sptr, pixel_bytes); - for (j = 0; j < png_pass_inc[pass]; j++) - { - png_memcpy(dp, v, pixel_bytes); - dp -= pixel_bytes; - } - sptr -= pixel_bytes; - } - } - else if (pixel_bytes == 4) - { - for (i = row_info->width; i; i--) - { - png_byte v[8]; - int j; - png_memcpy(v, sptr, pixel_bytes); - for (j = 0; j < png_pass_inc[pass]; j++) - { - png_memcpy(dp, v, pixel_bytes); - dp -= pixel_bytes; - } - sptr -= pixel_bytes; - } - } - else if (pixel_bytes == 6) - { - for (i = row_info->width; i; i--) - { - png_byte v[8]; - int j; - png_memcpy(v, sptr, pixel_bytes); - for (j = 0; j < png_pass_inc[pass]; j++) - { - png_memcpy(dp, v, pixel_bytes); - dp -= pixel_bytes; - } - sptr -= pixel_bytes; - } - } - else - { - for (i = row_info->width; i; i--) - { - png_byte v[8]; - int j; - png_memcpy(v, sptr, pixel_bytes); - for (j = 0; j < png_pass_inc[pass]; j++) + v = (png_byte)((*sp >> sshift) & 0x1); + for (j = 0; j < png_pass_inc[pass]; j++) { - png_memcpy(dp, v, pixel_bytes); - dp -= pixel_bytes; + *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff); + *dp |= (png_byte)(v << dshift); + if (dshift == s_end) + { + dshift = s_start; + dp--; + } + else + dshift += s_inc; + } + if (sshift == s_end) + { + sshift = s_start; + sp--; + } + else + sshift += s_inc; } - sptr -= pixel_bytes; - } - } + break; + } - } /* end of MMX not supported */ - break; - } - } - row_info->width = final_width; + case 2: + { + png_bytep sp, dp; + int sshift, dshift; + int s_start, s_end, s_inc; + png_uint_32 i; + + sp = row + 
(png_size_t)((row_info->width - 1) >> 2); + dp = row + (png_size_t)((final_width - 1) >> 2); +#if defined(PNG_READ_PACKSWAP_SUPPORTED) + if (transformations & PNG_PACKSWAP) + { + sshift = (png_size_t)(((row_info->width + 3) & 3) << 1); + dshift = (png_size_t)(((final_width + 3) & 3) << 1); + s_start = 6; + s_end = 0; + s_inc = -2; + } + else +#endif + { + sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1); + dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1); + s_start = 0; + s_end = 6; + s_inc = 2; + } + + for (i = row_info->width; i; i--) + { + png_byte v; + int j; + + v = (png_byte)((*sp >> sshift) & 0x3); + for (j = 0; j < png_pass_inc[pass]; j++) + { + *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff); + *dp |= (png_byte)(v << dshift); + if (dshift == s_end) + { + dshift = s_start; + dp--; + } + else + dshift += s_inc; + } + if (sshift == s_end) + { + sshift = s_start; + sp--; + } + else + sshift += s_inc; + } + break; + } + + case 4: + { + png_bytep sp, dp; + int sshift, dshift; + int s_start, s_end, s_inc; + png_uint_32 i; + + sp = row + (png_size_t)((row_info->width - 1) >> 1); + dp = row + (png_size_t)((final_width - 1) >> 1); +#if defined(PNG_READ_PACKSWAP_SUPPORTED) + if (transformations & PNG_PACKSWAP) + { + sshift = (png_size_t)(((row_info->width + 1) & 1) << 2); + dshift = (png_size_t)(((final_width + 1) & 1) << 2); + s_start = 4; + s_end = 0; + s_inc = -4; + } + else +#endif + { + sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2); + dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2); + s_start = 0; + s_end = 4; + s_inc = 4; + } + + for (i = row_info->width; i; i--) + { + png_byte v; + int j; + + v = (png_byte)((*sp >> sshift) & 0xf); + for (j = 0; j < png_pass_inc[pass]; j++) + { + *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff); + *dp |= (png_byte)(v << dshift); + if (dshift == s_end) + { + dshift = s_start; + dp--; + } + else + dshift += s_inc; + } + if (sshift == s_end) + { + sshift = s_start; + sp--; + } + 
else + sshift += s_inc; + } + break; + } + + default: // This is the place where the routine is modified + { + __int64 const4 = 0x0000000000FFFFFF; + __int64 const5 = 0x000000FFFFFF0000; + __int64 const6 = 0x00000000000000FF; + //int mmx_supported = 1; + + png_bytep sptr, dp; + png_uint_32 i; + png_size_t pixel_bytes; + + int width = row_info->width; + + pixel_bytes = (row_info->pixel_depth >> 3); + + sptr = row + (row_info->width - 1) * pixel_bytes; + dp = row + (final_width - 1) * pixel_bytes; + // New code by Nirav Chhatrapati - Intel Corporation + + if (mmx_supported) // use MMX routine if machine supports it + { + if (pixel_bytes == 3) + { + if ((pass == 0) || (pass == 1)) + { + _asm + { + mov esi, sptr + mov edi, dp + mov ecx, width + sub edi, 21 // (png_pass_inc[pass] - 1)*pixel_bytes +loop_pass0: + movd mm0, [esi] ; X X X X X v2 v1 v0 + pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0 + movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0 + psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0 + movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0 + psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0 + psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1 + por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0 + por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1 + movq mm3, mm0 ; v2 v1 v0 v2 v1 v0 v2 v1 + psllq mm0, 16 ; v0 v2 v1 v0 v2 v1 0 0 + movq mm4, mm3 ; v2 v1 v0 v2 v1 v0 v2 v1 + punpckhdq mm3, mm0 ; v0 v2 v1 v0 v2 v1 v0 v2 + movq [edi+16] , mm4 + psrlq mm0, 32 ; 0 0 0 0 v0 v2 v1 v0 + movq [edi+8] , mm3 + punpckldq mm0, mm4 ; v1 v0 v2 v1 v0 v2 v1 v0 + sub esi, 3 + movq [edi], mm0 + sub edi, 24 + //sub esi, 3 + dec ecx + jnz loop_pass0 + EMMS + } + } + else if ((pass == 2) || (pass == 3)) + { + _asm + { + mov esi, sptr + mov edi, dp + mov ecx, width + sub edi, 9 // (png_pass_inc[pass] - 1)*pixel_bytes +loop_pass2: + movd mm0, [esi] ; X X X X X v2 v1 v0 + pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0 + movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0 + psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0 + movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0 + psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0 + psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1 + por mm0, 
mm2 ; v2 v1 v0 v2 v1 v0 0 0 + por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1 + movq [edi+4], mm0 ; move to memory + psrlq mm0, 16 ; 0 0 v2 v1 v0 v2 v1 v0 + movd [edi], mm0 ; move to memory + sub esi, 3 + sub edi, 12 + dec ecx + jnz loop_pass2 + EMMS + } + } + else /* if ((pass == 4) || (pass == 5)) */ + { + int width_mmx = ((width >> 1) << 1) - 8; + width -= width_mmx; + if (width_mmx) + { + _asm + { + mov esi, sptr + mov edi, dp + mov ecx, width_mmx + sub esi, 3 + sub edi, 9 +loop_pass4: + movq mm0, [esi] ; X X v2 v1 v0 v5 v4 v3 + movq mm7, mm0 ; X X v2 v1 v0 v5 v4 v3 + movq mm6, mm0 ; X X v2 v1 v0 v5 v4 v3 + psllq mm0, 24 ; v1 v0 v5 v4 v3 0 0 0 + pand mm7, const4 ; 0 0 0 0 0 v5 v4 v3 + psrlq mm6, 24 ; 0 0 0 X X v2 v1 v0 + por mm0, mm7 ; v1 v0 v5 v4 v3 v5 v4 v3 + movq mm5, mm6 ; 0 0 0 X X v2 v1 v0 + psllq mm6, 8 ; 0 0 X X v2 v1 v0 0 + movq [edi], mm0 ; move quad to memory + psrlq mm5, 16 ; 0 0 0 0 0 X X v2 + pand mm5, const6 ; 0 0 0 0 0 0 0 v2 + por mm6, mm5 ; 0 0 X X v2 v1 v0 v2 + movd [edi+8], mm6 ; move double to memory + sub esi, 6 + sub edi, 12 + sub ecx, 2 + jnz loop_pass4 + EMMS + } + } + + sptr -= width_mmx*3; + dp -= width_mmx*6; + for (i = width; i; i--) + { + png_byte v[8]; + int j; + + png_memcpy(v, sptr, pixel_bytes); + for (j = 0; j < png_pass_inc[pass]; j++) + { + png_memcpy(dp, v, pixel_bytes); + dp -= pixel_bytes; + } + sptr -= pixel_bytes; + } + } + } /* end of pixel_bytes == 3 */ + + else if (pixel_bytes == 1) + { + if ((pass == 0) || (pass == 1)) + { + int width_mmx = ((width >> 2) << 2); + width -= width_mmx; + if (width_mmx) + { + _asm + { + mov esi, sptr + mov edi, dp + mov ecx, width_mmx + sub edi, 31 + sub esi, 3 +loop1_pass0: + movd mm0, [esi] ; X X X X v0 v1 v2 v3 + movq mm1, mm0 ; X X X X v0 v1 v2 v3 + punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3 + movq mm2, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3 + punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3 + movq mm3, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3 + punpckldq mm0, mm0 ; v3 v3 v3 v3 v3 v3 v3 v3 + punpckhdq mm3, mm3 ; 
v2 v2 v2 v2 v2 v2 v2 v2 + movq [edi], mm0 ; move to memory v3 + punpckhwd mm2, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1 + movq [edi+8], mm3 ; move to memory v2 + movq mm4, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1 + punpckldq mm2, mm2 ; v1 v1 v1 v1 v1 v1 v1 v1 + punpckhdq mm4, mm4 ; v0 v0 v0 v0 v0 v0 v0 v0 + movq [edi+16], mm2 ; move to memory v1 + movq [edi+24], mm4 ; move to memory v0 + sub esi, 4 + sub edi, 32 + sub ecx, 4 + jnz loop1_pass0 + EMMS + } + } + + sptr -= width_mmx; + dp -= width_mmx*8; + for (i = width; i; i--) + { + png_byte v[8]; + int j; + + png_memcpy(v, sptr, pixel_bytes); + for (j = 0; j < png_pass_inc[pass]; j++) + { + png_memcpy(dp, v, pixel_bytes); + dp -= pixel_bytes; + } + sptr -= pixel_bytes; + } + } + else if ((pass == 2) || (pass == 3)) + { + int width_mmx = ((width >> 2) << 2); + width -= width_mmx; + if (width_mmx) + { + _asm + { + mov esi, sptr + mov edi, dp + mov ecx, width_mmx + sub edi, 15 + sub esi, 3 +loop1_pass2: + movd mm0, [esi] ; X X X X v0 v1 v2 v3 + punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3 + movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3 + punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3 + punpckhwd mm1, mm1 ; v0 v0 v0 v0 v1 v1 v1 v1 + movq [edi], mm0 ; move to memory v2 and v3 + sub esi, 4 + movq [edi+8], mm1 ; move to memory v1 and v0 + sub edi, 16 + sub ecx, 4 + jnz loop1_pass2 + EMMS + } + } + + sptr -= width_mmx; + dp -= width_mmx*4; + for (i = width; i; i--) + { + png_byte v[8]; + int j; + + png_memcpy(v, sptr, pixel_bytes); + for (j = 0; j < png_pass_inc[pass]; j++) + { + png_memcpy(dp, v, pixel_bytes); + dp -= pixel_bytes; + } + sptr -= pixel_bytes; + } + } + else //if ((pass == 4) || (pass == 5)) + { + int width_mmx = ((width >> 3) << 3); + width -= width_mmx; + if (width_mmx) + { + _asm + { + mov esi, sptr + mov edi, dp + mov ecx, width_mmx + sub edi, 15 + sub esi, 7 +loop1_pass4: + movq mm0, [esi] ; v0 v1 v2 v3 v4 v5 v6 v7 + movq mm1, mm0 ; v0 v1 v2 v3 v4 v5 v6 v7 + punpcklbw mm0, mm0 ; v4 v4 v5 v5 v6 v6 v7 v7 + //movq mm1, mm0 ; v0 v0 v1 
v1 v2 v2 v3 v3 + punpckhbw mm1, mm1 ;v0 v0 v1 v1 v2 v2 v3 v3 + movq [edi+8], mm1 ; move to memory v0 v1 v2 and v3 + sub esi, 8 + movq [edi], mm0 ; move to memory v4 v5 v6 and v7 + //sub esi, 4 + sub edi, 16 + sub ecx, 8 + jnz loop1_pass4 + EMMS + } + } + + sptr -= width_mmx; + dp -= width_mmx*2; + for (i = width; i; i--) + { + png_byte v[8]; + int j; + + png_memcpy(v, sptr, pixel_bytes); + for (j = 0; j < png_pass_inc[pass]; j++) + { + png_memcpy(dp, v, pixel_bytes); + dp -= pixel_bytes; + } + sptr -= pixel_bytes; + } + } + } /* end of pixel_bytes == 1 */ + + else if (pixel_bytes == 2) + { + if ((pass == 0) || (pass == 1)) + { + int width_mmx = ((width >> 1) << 1); + width -= width_mmx; + if (width_mmx) + { + _asm + { + mov esi, sptr + mov edi, dp + mov ecx, width_mmx + sub esi, 2 + sub edi, 30 +loop2_pass0: + movd mm0, [esi] ; X X X X v1 v0 v3 v2 + punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2 + movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2 + punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2 + punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0 + movq [edi], mm0 + movq [edi + 8], mm0 + movq [edi + 16], mm1 + movq [edi + 24], mm1 + sub esi, 4 + sub edi, 32 + sub ecx, 2 + jnz loop2_pass0 + EMMS + } + } + + sptr -= (width_mmx*2 + 2); + dp -= (width_mmx*16 + 2); + for (i = width; i; i--) + { + png_byte v[8]; + int j; + sptr -= pixel_bytes; + png_memcpy(v, sptr, pixel_bytes); + for (j = 0; j < png_pass_inc[pass]; j++) + { + dp -= pixel_bytes; + png_memcpy(dp, v, pixel_bytes); + //dp -= pixel_bytes; + } + //sptr -= pixel_bytes; + } + } + + else if ((pass == 2) || (pass == 3)) + { + int width_mmx = ((width >> 1) << 1) ; + width -= width_mmx; + if (width_mmx) + { + _asm + { + mov esi, sptr + mov edi, dp + mov ecx, width_mmx + sub esi, 2 + sub edi, 14 +loop2_pass2: + movd mm0, [esi] ; X X X X v1 v0 v3 v2 + punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2 + movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2 + punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2 + punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0 + movq 
[edi], mm0 + sub esi, 4 + movq [edi + 8], mm1 + //sub esi, 4 + sub edi, 16 + sub ecx, 2 + jnz loop2_pass2 + EMMS + } + } + + sptr -= (width_mmx*2 + 2); + dp -= (width_mmx*8 + 2); + for (i = width; i; i--) + { + png_byte v[8]; + int j; + sptr -= pixel_bytes; + png_memcpy(v, sptr, pixel_bytes); + for (j = 0; j < png_pass_inc[pass]; j++) + { + dp -= pixel_bytes; + png_memcpy(dp, v, pixel_bytes); + //dp -= pixel_bytes; + } + //sptr -= pixel_bytes; + } + } + + else // pass == 4 or 5 + { + int width_mmx = ((width >> 1) << 1) ; + width -= width_mmx; + if (width_mmx) + { + _asm + { + mov esi, sptr + mov edi, dp + mov ecx, width_mmx + sub esi, 2 + sub edi, 6 +loop2_pass4: + movd mm0, [esi] ; X X X X v1 v0 v3 v2 + punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2 + sub esi, 4 + movq [edi], mm0 + sub edi, 8 + sub ecx, 2 + jnz loop2_pass4 + EMMS + } + } + + sptr -= (width_mmx*2 + 2); + dp -= (width_mmx*4 + 2); + for (i = width; i; i--) + { + png_byte v[8]; + int j; + sptr -= pixel_bytes; + png_memcpy(v, sptr, pixel_bytes); + for (j = 0; j < png_pass_inc[pass]; j++) + { + dp -= pixel_bytes; + png_memcpy(dp, v, pixel_bytes); + //dp -= pixel_bytes; + } + //sptr -= pixel_bytes; + } + } + } /* end of pixel_bytes == 2 */ + + else if (pixel_bytes == 4) + { + if ((pass == 0) || (pass == 1)) + { + int width_mmx = ((width >> 1) << 1) ; + width -= width_mmx; + if (width_mmx) + { + _asm + { + mov esi, sptr + mov edi, dp + mov ecx, width_mmx + sub esi, 4 + sub edi, 60 +loop4_pass0: + movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4 + movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4 + punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4 + punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0 + movq [edi], mm0 + movq [edi + 8], mm0 + movq [edi + 16], mm0 + movq [edi + 24], mm0 + movq [edi+32], mm1 + movq [edi + 40], mm1 + movq [edi+ 48], mm1 + sub esi, 8 + movq [edi + 56], mm1 + sub edi, 64 + sub ecx, 2 + jnz loop4_pass0 + EMMS + } + } + + sptr -= (width_mmx*4 + 4); + dp -= (width_mmx*32 + 4); + for (i = width; i; i--) + { + 
png_byte v[8]; + int j; + sptr -= pixel_bytes; + png_memcpy(v, sptr, pixel_bytes); + for (j = 0; j < png_pass_inc[pass]; j++) + { + dp -= pixel_bytes; + png_memcpy(dp, v, pixel_bytes); + //dp -= pixel_bytes; + } + //sptr -= pixel_bytes; + } + } + + else if ((pass == 2) || (pass == 3)) + { + int width_mmx = ((width >> 1) << 1) ; + width -= width_mmx; + if (width_mmx) + { + _asm + { + mov esi, sptr + mov edi, dp + mov ecx, width_mmx + sub esi, 4 + sub edi, 28 +loop4_pass2: + movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4 + movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4 + punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4 + punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0 + movq [edi], mm0 + movq [edi + 8], mm0 + movq [edi+16], mm1 + movq [edi + 24], mm1 + sub esi, 8 + sub edi, 32 + sub ecx, 2 + jnz loop4_pass2 + EMMS + } + } + + sptr -= (width_mmx*4 + 4); + dp -= (width_mmx*16 + 4); + for (i = width; i; i--) + { + png_byte v[8]; + int j; + sptr -= pixel_bytes; + png_memcpy(v, sptr, pixel_bytes); + for (j = 0; j < png_pass_inc[pass]; j++) + { + dp -= pixel_bytes; + png_memcpy(dp, v, pixel_bytes); + //dp -= pixel_bytes; + } + //sptr -= pixel_bytes; + } + } + + else // pass == 4 or 5 + { + int width_mmx = ((width >> 1) << 1) ; + width -= width_mmx; + if (width_mmx) + { + _asm + { + mov esi, sptr + mov edi, dp + mov ecx, width_mmx + sub esi, 4 + sub edi, 12 +loop4_pass4: + movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4 + movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4 + punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4 + punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0 + movq [edi], mm0 + sub esi, 8 + movq [edi + 8], mm1 + sub edi, 16 + sub ecx, 2 + jnz loop4_pass4 + EMMS + } + } + + sptr -= (width_mmx*4 + 4); + dp -= (width_mmx*8 + 4); + for (i = width; i; i--) + { + png_byte v[8]; + int j; + sptr -= pixel_bytes; + png_memcpy(v, sptr, pixel_bytes); + for (j = 0; j < png_pass_inc[pass]; j++) + { + dp -= pixel_bytes; + png_memcpy(dp, v, pixel_bytes); + //dp -= pixel_bytes; + } + //sptr -= pixel_bytes; + } + } + + } 
/* end of pixel_bytes == 4 */ + + else if (pixel_bytes == 6) + { + for (i = row_info->width; i; i--) + { + png_byte v[8]; + int j; + png_memcpy(v, sptr, pixel_bytes); + for (j = 0; j < png_pass_inc[pass]; j++) + { + png_memcpy(dp, v, pixel_bytes); + dp -= pixel_bytes; + } + sptr -= pixel_bytes; + } + } /* end of pixel_bytes == 6 */ + + else + { + for (i = row_info->width; i; i--) + { + png_byte v[8]; + int j; + png_memcpy(v, sptr, pixel_bytes); + for (j = 0; j < png_pass_inc[pass]; j++) + { + png_memcpy(dp, v, pixel_bytes); + dp -= pixel_bytes; + } + sptr-= pixel_bytes; + } + } + } /* end of mmx_supported */ + + else /* MMX not supported: use modified C code - takes advantage + * of inlining of memcpy for a constant */ + { + if (pixel_bytes == 1) + { + for (i = row_info->width; i; i--) + { + png_byte v[8]; + int j; + + png_memcpy(v, sptr, pixel_bytes); + for (j = 0; j < png_pass_inc[pass]; j++) + { + png_memcpy(dp, v, pixel_bytes); + dp -= pixel_bytes; + } + sptr -= pixel_bytes; + } + } + else if (pixel_bytes == 3) + { + for (i = row_info->width; i; i--) + { + png_byte v[8]; + int j; + png_memcpy(v, sptr, pixel_bytes); + for (j = 0; j < png_pass_inc[pass]; j++) + { + png_memcpy(dp, v, pixel_bytes); + dp -= pixel_bytes; + } + sptr -= pixel_bytes; + } + } + else if (pixel_bytes == 2) + { + for (i = row_info->width; i; i--) + { + png_byte v[8]; + int j; + png_memcpy(v, sptr, pixel_bytes); + for (j = 0; j < png_pass_inc[pass]; j++) + { + png_memcpy(dp, v, pixel_bytes); + dp -= pixel_bytes; + } + sptr -= pixel_bytes; + } + } + else if (pixel_bytes == 4) + { + for (i = row_info->width; i; i--) + { + png_byte v[8]; + int j; + png_memcpy(v, sptr, pixel_bytes); + for (j = 0; j < png_pass_inc[pass]; j++) + { + png_memcpy(dp, v, pixel_bytes); + dp -= pixel_bytes; + } + sptr -= pixel_bytes; + } + } + else if (pixel_bytes == 6) + { + for (i = row_info->width; i; i--) + { + png_byte v[8]; + int j; + png_memcpy(v, sptr, pixel_bytes); + for (j = 0; j < png_pass_inc[pass]; j++) + { 
+ png_memcpy(dp, v, pixel_bytes); + dp -= pixel_bytes; + } + sptr -= pixel_bytes; + } + } + else + { + for (i = row_info->width; i; i--) + { + png_byte v[8]; + int j; + png_memcpy(v, sptr, pixel_bytes); + for (j = 0; j < png_pass_inc[pass]; j++) + { + png_memcpy(dp, v, pixel_bytes); + dp -= pixel_bytes; + } + sptr -= pixel_bytes; + } + } + + } /* end of MMX not supported */ + break; + } + } /* end switch (row_info->pixel_depth) */ + + row_info->width = final_width; row_info->rowbytes = ((final_width * - (png_uint_32)row_info->pixel_depth + 7) >> 3); + (png_uint_32)row_info->pixel_depth + 7) >> 3); } + mmx_supported = save_mmx_supported; } -#endif - +#endif /* PNG_READ_INTERLACING_SUPPORTED */ // These variables are utilized in the functions below. They are declared // globally here to ensure alignment on 8-byte boundaries. + union uAll { __int64 use; double align; -} LBCarryMask = {0x0101010101010101}, HBClearMask = {0x7f7f7f7f7f7f7f7f}, - ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem; +} LBCarryMask = {0x0101010101010101}, + HBClearMask = {0x7f7f7f7f7f7f7f7f}, + ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem; + // Optimized code for PNG Average filter decoder void png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row , png_bytep prev_row) { - int bpp; - png_uint_32 FullLength; - png_uint_32 MMXLength; - //png_uint_32 len; - int diff; - bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel - FullLength = row_info->rowbytes; // # of bytes to filter - _asm { + int bpp; + png_uint_32 FullLength; + png_uint_32 MMXLength; + //png_uint_32 len; + int diff; + + bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel + FullLength = row_info->rowbytes; // # of bytes to filter + _asm { // Init address pointers and offset mov edi, row // edi ==> Avg(x) xor ebx, ebx // ebx ==> x mov edx, edi - mov esi, prev_row // esi ==> Prior(x) + mov esi, prev_row // esi ==> Prior(x) sub edx, bpp // edx ==> Raw(x-bpp) xor eax, eax @@ 
-2027,12 +1891,12 @@ png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row // Raw(x) = Avg(x) + (Prior(x)/2) davgrlp: mov al, [esi + ebx] // Load al with Prior(x) - inc ebx + inc ebx shr al, 1 // divide by 2 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx cmp ebx, bpp - mov [edi+ebx-1], al // Write back Raw(x); - // mov does not affect flags; -1 to offset inc ebx + mov [edi+ebx-1], al // Write back Raw(x); + // mov does not affect flags; -1 to offset inc ebx jb davgrlp // get # of bytes to alignment mov diff, edi // take start of row @@ -2047,27 +1911,27 @@ davgrlp: xor ecx, ecx davglp1: xor eax, eax - mov cl, [esi + ebx] // load cl with Prior(x) + mov cl, [esi + ebx] // load cl with Prior(x) mov al, [edx + ebx] // load al with Raw(x-bpp) add ax, cx - inc ebx + inc ebx shr ax, 1 // divide by 2 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx - cmp ebx, diff // Check if at alignment boundary - mov [edi+ebx-1], al // Write back Raw(x); + cmp ebx, diff // Check if at alignment boundary + mov [edi+ebx-1], al // Write back Raw(x); // mov does not affect flags; -1 to offset inc ebx - jb davglp1 // Repeat until at alignment boundary + jb davglp1 // Repeat until at alignment boundary davggo: - mov eax, FullLength + mov eax, FullLength mov ecx, eax sub eax, ebx // subtract alignment fix and eax, 0x00000007 // calc bytes over mult of 8 sub ecx, eax // drop over bytes from original length mov MMXLength, ecx - } // end _asm block - // Now do the math for the rest of the row - switch ( bpp ) - { + } // end _asm block + // Now do the math for the rest of the row + switch ( bpp ) + { case 3: { ActiveMask.use = 0x0000000000ffffff; @@ -2080,21 +1944,21 @@ davggo: movq mm5, LBCarryMask mov edi, row // edi ==> Avg(x) movq mm4, HBClearMask - mov esi, prev_row // esi ==> Prior(x) + mov esi, prev_row // esi ==> Prior(x) // PRIME the pump (load the first Raw(x-bpp) data set - movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes + movq mm2, [edi + ebx - 8] 
// Load previous aligned 8 bytes // (we correct position in loop below) davg3lp: - movq mm0, [edi + ebx] // Load mm0 with Avg(x) + movq mm0, [edi + ebx] // Load mm0 with Avg(x) // Add (Prev_row/2) to Average movq mm3, mm5 psrlq mm2, ShiftRem // Correct position Raw(x-bpp) data - movq mm1, [esi + ebx] // Load mm1 with Prior(x) + movq mm1, [esi + ebx] // Load mm1 with Prior(x) movq mm6, mm7 pand mm3, mm1 // get lsb for each prev_row byte psrlq mm1, 1 // divide prev_row bytes by 2 pand mm1, mm4 // clear invalid bit 7 of each byte - paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte + paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry movq mm1, mm3 // now use mm1 for getting LBCarrys pand mm1, mm2 // get LBCarrys for each byte where both @@ -2103,173 +1967,180 @@ davg3lp: pand mm2, mm4 // clear invalid bit 7 of each byte paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg - paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte + paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active + // byte // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 3-5 movq mm2, mm0 // mov updated Raws to mm2 psllq mm2, ShiftBpp // shift data to position correctly movq mm1, mm3 // now use mm1 for getting LBCarrys - pand mm1, mm2 // get LBCarrys for each byte where both - // lsb's were == 1 (Only valid for active group) - psrlq mm2, 1 // divide raw bytes by 2 - pand mm2, mm4 // clear invalid bit 7 of each byte - paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte - pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg - paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte + pand mm1, mm2 // get LBCarrys for each byte where both + // lsb's were == 1 (Only valid for active group) + psrlq mm2, 1 // divide raw bytes by 2 + 
pand mm2, mm4 // clear invalid bit 7 of each byte + paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte + pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg + paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active + // byte // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry - psllq mm6, ShiftBpp // shift the mm6 mask to cover the last two bytes + psllq mm6, ShiftBpp // shift the mm6 mask to cover the last two + // bytes movq mm2, mm0 // mov updated Raws to mm2 psllq mm2, ShiftBpp // shift data to position correctly - // Data only needs to be shifted once here to - // get the correct x-bpp offset. - movq mm1, mm3 // now use mm1 for getting LBCarrys - pand mm1, mm2 // get LBCarrys for each byte where both - // lsb's were == 1 (Only valid for active group) - psrlq mm2, 1 // divide raw bytes by 2 - pand mm2, mm4 // clear invalid bit 7 of each byte - paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte - pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg - add ebx, 8 - paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte + // Data only needs to be shifted once here to + // get the correct x-bpp offset. 
+ movq mm1, mm3 // now use mm1 for getting LBCarrys + pand mm1, mm2 // get LBCarrys for each byte where both + // lsb's were == 1 (Only valid for active group) + psrlq mm2, 1 // divide raw bytes by 2 + pand mm2, mm4 // clear invalid bit 7 of each byte + paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte + pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg + add ebx, 8 + paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active + // byte // Now ready to write back to memory - movq [edi + ebx - 8], mm0 + movq [edi + ebx - 8], mm0 // Move updated Raw(x) to use as Raw(x-bpp) for next loop - cmp ebx, MMXLength + cmp ebx, MMXLength movq mm2, mm0 // mov updated Raw(x) to mm2 - jb davg3lp - } // end _asm block + jb davg3lp + } // end _asm block } break; + case 6: case 4: case 7: case 5: { ActiveMask.use = 0xffffffffffffffff; // use shift below to clear - // appropriate inactive bytes + // appropriate inactive bytes ShiftBpp.use = bpp << 3; ShiftRem.use = 64 - ShiftBpp.use; - _asm { + _asm { movq mm4, HBClearMask // Re-init address pointers and offset mov ebx, diff // ebx ==> x = offset to alignment boundary // Load ActiveMask and clear all bytes except for 1st active group movq mm7, ActiveMask - mov edi, row // edi ==> Avg(x) + mov edi, row // edi ==> Avg(x) psrlq mm7, ShiftRem - mov esi, prev_row // esi ==> Prior(x) + mov esi, prev_row // esi ==> Prior(x) movq mm6, mm7 movq mm5, LBCarryMask - psllq mm6, ShiftBpp // Create mask for 2nd active group + psllq mm6, ShiftBpp // Create mask for 2nd active group // PRIME the pump (load the first Raw(x-bpp) data set - movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes + movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes // (we correct position in loop below) davg4lp: - movq mm0, [edi + ebx] + movq mm0, [edi + ebx] psrlq mm2, ShiftRem // shift data to position correctly - movq mm1, [esi + ebx] + movq mm1, [esi + ebx] // Add (Prev_row/2) to Average movq mm3, mm5 - pand mm3, mm1 // get 
lsb for each prev_row byte - psrlq mm1, 1 // divide prev_row bytes by 2 - pand mm1, mm4 // clear invalid bit 7 of each byte - paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte + pand mm3, mm1 // get lsb for each prev_row byte + psrlq mm1, 1 // divide prev_row bytes by 2 + pand mm1, mm4 // clear invalid bit 7 of each byte + paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry - movq mm1, mm3 // now use mm1 for getting LBCarrys - pand mm1, mm2 // get LBCarrys for each byte where both - // lsb's were == 1 (Only valid for active group) - psrlq mm2, 1 // divide raw bytes by 2 - pand mm2, mm4 // clear invalid bit 7 of each byte - paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte - pand mm2, mm7 // Leave only Active Group 1 bytes to add to Avg - paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte + movq mm1, mm3 // now use mm1 for getting LBCarrys + pand mm1, mm2 // get LBCarrys for each byte where both + // lsb's were == 1 (Only valid for active group) + psrlq mm2, 1 // divide raw bytes by 2 + pand mm2, mm4 // clear invalid bit 7 of each byte + paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte + pand mm2, mm7 // Leave only Active Group 1 bytes to add to Avg + paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active + // byte // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry - movq mm2, mm0 // mov updated Raws to mm2 - psllq mm2, ShiftBpp // shift data to position correctly - add ebx, 8 - movq mm1, mm3 // now use mm1 for getting LBCarrys - pand mm1, mm2 // get LBCarrys for each byte where both - // lsb's were == 1 (Only valid for active group) - psrlq mm2, 1 // divide raw bytes by 2 - pand mm2, mm4 // clear invalid bit 7 of each byte - paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte - pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg - paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte - cmp 
ebx, MMXLength + movq mm2, mm0 // mov updated Raws to mm2 + psllq mm2, ShiftBpp // shift data to position correctly + add ebx, 8 + movq mm1, mm3 // now use mm1 for getting LBCarrys + pand mm1, mm2 // get LBCarrys for each byte where both + // lsb's were == 1 (Only valid for active group) + psrlq mm2, 1 // divide raw bytes by 2 + pand mm2, mm4 // clear invalid bit 7 of each byte + paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte + pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg + paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active + // byte + cmp ebx, MMXLength // Now ready to write back to memory - movq [edi + ebx - 8], mm0 + movq [edi + ebx - 8], mm0 // Prep Raw(x-bpp) for next loop - movq mm2, mm0 // mov updated Raws to mm2 - jb davg4lp - } // end _asm block + movq mm2, mm0 // mov updated Raws to mm2 + jb davg4lp + } // end _asm block } break; case 2: { ActiveMask.use = 0x000000000000ffff; - ShiftBpp.use = 24; // == 3 * 8 - ShiftRem.use = 40; // == 64 - 24 - _asm { + ShiftBpp.use = 24; // == 3 * 8 + ShiftRem.use = 40; // == 64 - 24 + _asm { // Load ActiveMask movq mm7, ActiveMask // Re-init address pointers and offset - mov ebx, diff // ebx ==> x = offset to alignment boundary + mov ebx, diff // ebx ==> x = offset to alignment boundary movq mm5, LBCarryMask - mov edi, row // edi ==> Avg(x) + mov edi, row // edi ==> Avg(x) movq mm4, HBClearMask - mov esi, prev_row // esi ==> Prior(x) + mov esi, prev_row // esi ==> Prior(x) // PRIME the pump (load the first Raw(x-bpp) data set - movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes - // (we correct position in loop below) + movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes + // (we correct position in loop below) davg2lp: - movq mm0, [edi + ebx] + movq mm0, [edi + ebx] psllq mm2, ShiftRem // shift data to position correctly - movq mm1, [esi + ebx] + movq mm1, [esi + ebx] // Add (Prev_row/2) to Average movq mm3, mm5 - pand mm3, mm1 // get lsb for each prev_row 
byte - psrlq mm1, 1 // divide prev_row bytes by 2 - pand mm1, mm4 // clear invalid bit 7 of each byte + pand mm3, mm1 // get lsb for each prev_row byte + psrlq mm1, 1 // divide prev_row bytes by 2 + pand mm1, mm4 // clear invalid bit 7 of each byte movq mm6, mm7 - paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte + paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry - movq mm1, mm3 // now use mm1 for getting LBCarrys - pand mm1, mm2 // get LBCarrys for each byte where both - // lsb's were == 1 (Only valid for active group) - psrlq mm2, 1 // divide raw bytes by 2 - pand mm2, mm4 // clear invalid bit 7 of each byte - paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte - pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg - paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte + movq mm1, mm3 // now use mm1 for getting LBCarrys + pand mm1, mm2 // get LBCarrys for each byte where both + // lsb's were == 1 (Only valid for active group) + psrlq mm2, 1 // divide raw bytes by 2 + pand mm2, mm4 // clear invalid bit 7 of each byte + paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte + pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg + paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry - psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 2 & 3 - movq mm2, mm0 // mov updated Raws to mm2 - psllq mm2, ShiftBpp // shift data to position correctly - movq mm1, mm3 // now use mm1 for getting LBCarrys - pand mm1, mm2 // get LBCarrys for each byte where both - // lsb's were == 1 (Only valid for active group) - psrlq mm2, 1 // divide raw bytes by 2 - pand mm2, mm4 // clear invalid bit 7 of each byte - paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte - pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg - paddb mm0, mm2 // add (Raw/2) + 
LBCarrys to Avg for each Active byte + psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 2 & 3 + movq mm2, mm0 // mov updated Raws to mm2 + psllq mm2, ShiftBpp // shift data to position correctly + movq mm1, mm3 // now use mm1 for getting LBCarrys + pand mm1, mm2 // get LBCarrys for each byte where both + // lsb's were == 1 (Only valid for active group) + psrlq mm2, 1 // divide raw bytes by 2 + pand mm2, mm4 // clear invalid bit 7 of each byte + paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte + pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg + paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte // Add rdd active group (Raw(x-bpp)/2) to Average with LBCarry - psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 4 & 5 - movq mm2, mm0 // mov updated Raws to mm2 - psllq mm2, ShiftBpp // shift data to position correctly - // Data only needs to be shifted once here to - // get the correct x-bpp offset. - movq mm1, mm3 // now use mm1 for getting LBCarrys - pand mm1, mm2 // get LBCarrys for each byte where both - // lsb's were == 1 (Only valid for active group) - psrlq mm2, 1 // divide raw bytes by 2 - pand mm2, mm4 // clear invalid bit 7 of each byte - paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte - pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg - paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte + psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 4 & 5 + movq mm2, mm0 // mov updated Raws to mm2 + psllq mm2, ShiftBpp // shift data to position correctly + // Data only needs to be shifted once here to + // get the correct x-bpp offset. 
+ movq mm1, mm3 // now use mm1 for getting LBCarrys + pand mm1, mm2 // get LBCarrys for each byte where both + // lsb's were == 1 (Only valid for active group) + psrlq mm2, 1 // divide raw bytes by 2 + pand mm2, mm4 // clear invalid bit 7 of each byte + paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte + pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg + paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte // Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 6 & 7 @@ -2278,72 +2149,73 @@ davg2lp: // Data only needs to be shifted once here to // get the correct x-bpp offset. add ebx, 8 - movq mm1, mm3 // now use mm1 for getting LBCarrys - pand mm1, mm2 // get LBCarrys for each byte where both - // lsb's were == 1 (Only valid for active group) - psrlq mm2, 1 // divide raw bytes by 2 - pand mm2, mm4 // clear invalid bit 7 of each byte - paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte - pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg - paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte + movq mm1, mm3 // now use mm1 for getting LBCarrys + pand mm1, mm2 // get LBCarrys for each byte where both + // lsb's were == 1 (Only valid for active group) + psrlq mm2, 1 // divide raw bytes by 2 + pand mm2, mm4 // clear invalid bit 7 of each byte + paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte + pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg + paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte - cmp ebx, MMXLength + cmp ebx, MMXLength // Now ready to write back to memory - movq [edi + ebx - 8], mm0 + movq [edi + ebx - 8], mm0 // Prep Raw(x-bpp) for next loop - movq mm2, mm0 // mov updated Raws to mm2 - jb davg2lp - } // end _asm block + movq mm2, mm0 // mov updated Raws to mm2 + jb davg2lp + } // end _asm block } break; - case 1: // bpp == 1 + + case 1: // bpp == 1 { _asm { // 
Re-init address pointers and offset - mov ebx, diff // ebx ==> x = offset to alignment boundary - mov edi, row // edi ==> Avg(x) + mov ebx, diff // ebx ==> x = offset to alignment boundary + mov edi, row // edi ==> Avg(x) cmp ebx, FullLength // Test if offset at end of array - jnb davg1end + jnb davg1end // Do Paeth decode for remaining bytes - mov esi, prev_row // esi ==> Prior(x) + mov esi, prev_row // esi ==> Prior(x) mov edx, edi xor ecx, ecx // zero ecx before using cl & cx in loop below sub edx, bpp // edx ==> Raw(x-bpp) davg1lp: // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) xor eax, eax - mov cl, [esi + ebx] // load cl with Prior(x) + mov cl, [esi + ebx] // load cl with Prior(x) mov al, [edx + ebx] // load al with Raw(x-bpp) add ax, cx - inc ebx + inc ebx shr ax, 1 // divide by 2 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx - cmp ebx, FullLength // Check if at end of array - mov [edi+ebx-1], al // Write back Raw(x); + cmp ebx, FullLength // Check if at end of array + mov [edi+ebx-1], al // Write back Raw(x); // mov does not affect flags; -1 to offset inc ebx - jb davg1lp + jb davg1lp davg1end: - } // end _asm block + } // end _asm block } return; case 8: // bpp == 8 { - _asm { + _asm { // Re-init address pointers and offset mov ebx, diff // ebx ==> x = offset to alignment boundary movq mm5, LBCarryMask mov edi, row // edi ==> Avg(x) movq mm4, HBClearMask - mov esi, prev_row // esi ==> Prior(x) + mov esi, prev_row // esi ==> Prior(x) // PRIME the pump (load the first Raw(x-bpp) data set - movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes + movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes // (NO NEED to correct position in loop below) davg8lp: - movq mm0, [edi + ebx] + movq mm0, [edi + ebx] movq mm3, mm5 - movq mm1, [esi + ebx] - add ebx, 8 + movq mm1, [esi + ebx] + add ebx, 8 pand mm3, mm1 // get lsb for each prev_row byte psrlq mm1, 1 // divide prev_row bytes by 2 pand mm3, mm2 // get LBCarrys for each byte where both @@ 
-2353,31 +2225,31 @@ davg8lp: paddb mm0, mm3 // add LBCarrys to Avg for each byte pand mm2, mm4 // clear invalid bit 7 of each byte paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte - paddb mm0, mm2 // add (Raw/2) to Avg for each byte - cmp ebx, MMXLength - movq [edi + ebx - 8], mm0 + paddb mm0, mm2 // add (Raw/2) to Avg for each byte + cmp ebx, MMXLength + movq [edi + ebx - 8], mm0 movq mm2, mm0 // reuse as Raw(x-bpp) - jb davg8lp - } // end _asm block + jb davg8lp + } // end _asm block } break; default: // bpp greater than 8 { - _asm { + _asm { movq mm5, LBCarryMask // Re-init address pointers and offset mov ebx, diff // ebx ==> x = offset to alignment boundary mov edi, row // edi ==> Avg(x) movq mm4, HBClearMask mov edx, edi - mov esi, prev_row // esi ==> Prior(x) + mov esi, prev_row // esi ==> Prior(x) sub edx, bpp // edx ==> Raw(x-bpp) davgAlp: - movq mm0, [edi + ebx] + movq mm0, [edi + ebx] movq mm3, mm5 - movq mm1, [esi + ebx] + movq mm1, [esi + ebx] pand mm3, mm1 // get lsb for each prev_row byte - movq mm2, [edx + ebx] + movq mm2, [edx + ebx] psrlq mm1, 1 // divide prev_row bytes by 2 pand mm3, mm2 // get LBCarrys for each byte where both // lsb's were == 1 @@ -2386,70 +2258,72 @@ davgAlp: paddb mm0, mm3 // add LBCarrys to Avg for each byte pand mm2, mm4 // clear invalid bit 7 of each byte paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte - add ebx, 8 - paddb mm0, mm2 // add (Raw/2) to Avg for each byte - cmp ebx, MMXLength - movq [edi + ebx - 8], mm0 - jb davgAlp - } // end _asm block + add ebx, 8 + paddb mm0, mm2 // add (Raw/2) to Avg for each byte + cmp ebx, MMXLength + movq [edi + ebx - 8], mm0 + jb davgAlp + } // end _asm block } break; - } // end switch ( bpp ) + } // end switch ( bpp ) - _asm { + _asm { // MMX acceleration complete now do clean-up // Check if any remaining bytes left to decode - mov ebx, MMXLength // ebx ==> x = offset bytes remaining after MMX - mov edi, row // edi ==> Avg(x) - cmp ebx, FullLength // Test if offset at 
end of array - jnb davgend + mov ebx, MMXLength // ebx ==> x = offset bytes remaining after MMX + mov edi, row // edi ==> Avg(x) + cmp ebx, FullLength // Test if offset at end of array + jnb davgend // Do Paeth decode for remaining bytes - mov esi, prev_row // esi ==> Prior(x) + mov esi, prev_row // esi ==> Prior(x) mov edx, edi - xor ecx, ecx // zero ecx before using cl & cx in loop below - sub edx, bpp // edx ==> Raw(x-bpp) + xor ecx, ecx // zero ecx before using cl & cx in loop below + sub edx, bpp // edx ==> Raw(x-bpp) davglp2: // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) xor eax, eax - mov cl, [esi + ebx] // load cl with Prior(x) - mov al, [edx + ebx] // load al with Raw(x-bpp) + mov cl, [esi + ebx] // load cl with Prior(x) + mov al, [edx + ebx] // load al with Raw(x-bpp) add ax, cx - inc ebx + inc ebx shr ax, 1 // divide by 2 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx - cmp ebx, FullLength // Check if at end of array - mov [edi+ebx-1], al // Write back Raw(x); + cmp ebx, FullLength // Check if at end of array + mov [edi+ebx-1], al // Write back Raw(x); // mov does not affect flags; -1 to offset inc ebx - jb davglp2 + jb davglp2 davgend: - emms // End MMX instructions; prep for possible FP instrs. + emms // End MMX instructions; prep for possible FP instrs. 
} // end _asm block } // Optimized code for PNG Paeth filter decoder void -png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row - , png_bytep prev_row) +png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row, + png_bytep prev_row) { - png_uint_32 FullLength; - png_uint_32 MMXLength; - //png_uint_32 len; - int bpp; - int diff; - //int ptemp; - int patemp, pbtemp, pctemp; - bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel - FullLength = row_info->rowbytes; // # of bytes to filter - _asm { - xor ebx, ebx // ebx ==> x offset - mov edi, row - xor edx, edx // edx ==> x-bpp offset - mov esi, prev_row + png_uint_32 FullLength; + png_uint_32 MMXLength; + //png_uint_32 len; + int bpp; + int diff; + //int ptemp; + int patemp, pbtemp, pctemp; + + bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel + FullLength = row_info->rowbytes; // # of bytes to filter + _asm + { + xor ebx, ebx // ebx ==> x offset + mov edi, row + xor edx, edx // edx ==> x-bpp offset + mov esi, prev_row xor eax, eax - // Compute the Raw value for the first bpp bytes - // Note: the formula works out to always be Paeth(x) = Raw(x) + Prior(x) - // where x < bpp + // Compute the Raw value for the first bpp bytes + // Note: the formula works out to be always + // Paeth(x) = Raw(x) + Prior(x) where x < bpp dpthrlp: mov al, [edi + ebx] add al, [esi + ebx] @@ -2460,7 +2334,7 @@ dpthrlp: // get # of bytes to alignment mov diff, edi // take start of row add diff, ebx // add bpp - xor ecx, ecx + xor ecx, ecx add diff, 0xf // add 7 + 8 to incr past alignment boundary and diff, 0xfffffff8 // mask to alignment boundary sub diff, edi // subtract from start ==> value ebx at alignment @@ -2523,33 +2397,34 @@ dpthabc: // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) mov cl, [edi + edx] // load Raw(x-bpp) into cl dpthpaeth: - inc ebx - inc edx + inc ebx + inc edx // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 add [edi + ebx - 1], cl - cmp ebx, diff - jb dpthlp1 
+ cmp ebx, diff + jb dpthlp1 dpthgo: - mov ecx, FullLength + mov ecx, FullLength mov eax, ecx sub eax, ebx // subtract alignment fix and eax, 0x00000007 // calc bytes over mult of 8 sub ecx, eax // drop over bytes from original length mov MMXLength, ecx - } // end _asm block - // Now do the math for the rest of the row - switch ( bpp ) - { + } // end _asm block + // Now do the math for the rest of the row + switch ( bpp ) + { case 3: { ActiveMask.use = 0x0000000000ffffff; ActiveMaskEnd.use = 0xffff000000000000; ShiftBpp.use = 24; // == bpp(3) * 8 ShiftRem.use = 40; // == 64 - 24 - _asm { + _asm + { mov ebx, diff - mov edi, row - mov esi, prev_row + mov edi, row + mov esi, prev_row pxor mm0, mm0 // PRIME the pump (load the first Raw(x-bpp) data set movq mm1, [edi+ebx-8] @@ -2574,23 +2449,23 @@ dpth3lp: // pa = abs(p-a) = abs(pav) // pb = abs(p-b) = abs(pbv) // pc = abs(p-c) = abs(pcv) - pcmpgtw mm0, mm4 // Create mask pav bytes < 0 + pcmpgtw mm0, mm4 // Create mask pav bytes < 0 paddw mm6, mm5 - pand mm0, mm4 // Only pav bytes < 0 in mm7 - pcmpgtw mm7, mm5 // Create mask pbv bytes < 0 + pand mm0, mm4 // Only pav bytes < 0 in mm7 + pcmpgtw mm7, mm5 // Create mask pbv bytes < 0 psubw mm4, mm0 - pand mm7, mm5 // Only pbv bytes < 0 in mm0 + pand mm7, mm5 // Only pbv bytes < 0 in mm0 psubw mm4, mm0 psubw mm5, mm7 pxor mm0, mm0 - pcmpgtw mm0, mm6 // Create mask pcv bytes < 0 - pand mm0, mm6 // Only pav bytes < 0 in mm7 + pcmpgtw mm0, mm6 // Create mask pcv bytes < 0 + pand mm0, mm6 // Only pav bytes < 0 in mm7 psubw mm5, mm7 psubw mm6, mm0 // test pa <= pb movq mm7, mm4 psubw mm6, mm0 - pcmpgtw mm7, mm5 // pa > pb? + pcmpgtw mm7, mm5 // pa > pb? movq mm0, mm7 // use mm7 mask to merge pa & pb pand mm5, mm7 @@ -2601,7 +2476,7 @@ dpth3lp: paddw mm7, mm5 paddw mm0, mm2 // test ((pa <= pb)? pa:pb) <= pc - pcmpgtw mm7, mm6 // pab > pc? + pcmpgtw mm7, mm6 // pab > pc? 
pxor mm1, mm1 pand mm3, mm7 pandn mm7, mm0 @@ -2634,22 +2509,22 @@ dpth3lp: // pa = abs(p-a) = abs(pav) // pb = abs(p-b) = abs(pbv) // pc = abs(p-c) = abs(pcv) - pcmpgtw mm0, mm5 // Create mask pbv bytes < 0 - pcmpgtw mm7, mm4 // Create mask pav bytes < 0 - pand mm0, mm5 // Only pbv bytes < 0 in mm0 - pand mm7, mm4 // Only pav bytes < 0 in mm7 + pcmpgtw mm0, mm5 // Create mask pbv bytes < 0 + pcmpgtw mm7, mm4 // Create mask pav bytes < 0 + pand mm0, mm5 // Only pbv bytes < 0 in mm0 + pand mm7, mm4 // Only pav bytes < 0 in mm7 psubw mm5, mm0 psubw mm4, mm7 psubw mm5, mm0 psubw mm4, mm7 pxor mm0, mm0 - pcmpgtw mm0, mm6 // Create mask pcv bytes < 0 - pand mm0, mm6 // Only pav bytes < 0 in mm7 + pcmpgtw mm0, mm6 // Create mask pcv bytes < 0 + pand mm0, mm6 // Only pav bytes < 0 in mm7 psubw mm6, mm0 // test pa <= pb movq mm7, mm4 psubw mm6, mm0 - pcmpgtw mm7, mm5 // pa > pb? + pcmpgtw mm7, mm5 // pa > pb? movq mm0, mm7 // use mm7 mask to merge pa & pb pand mm5, mm7 @@ -2660,8 +2535,8 @@ dpth3lp: paddw mm7, mm5 paddw mm0, mm2 // test ((pa <= pb)? pa:pb) <= pc - pcmpgtw mm7, mm6 // pab > pc? - movq mm2, [esi + ebx] // load b=Prior(x) + pcmpgtw mm7, mm6 // pab > pc? 
+ movq mm2, [esi + ebx] // load b=Prior(x) pand mm3, mm7 pandn mm7, mm0 pxor mm1, mm1 @@ -2696,22 +2571,22 @@ dpth3lp: // pa = abs(p-a) = abs(pav) // pb = abs(p-b) = abs(pbv) // pc = abs(p-c) = abs(pcv) - pcmpgtw mm0, mm4 // Create mask pav bytes < 0 - pcmpgtw mm7, mm5 // Create mask pbv bytes < 0 - pand mm0, mm4 // Only pav bytes < 0 in mm7 - pand mm7, mm5 // Only pbv bytes < 0 in mm0 + pcmpgtw mm0, mm4 // Create mask pav bytes < 0 + pcmpgtw mm7, mm5 // Create mask pbv bytes < 0 + pand mm0, mm4 // Only pav bytes < 0 in mm7 + pand mm7, mm5 // Only pbv bytes < 0 in mm0 psubw mm4, mm0 psubw mm5, mm7 psubw mm4, mm0 psubw mm5, mm7 pxor mm0, mm0 - pcmpgtw mm0, mm6 // Create mask pcv bytes < 0 - pand mm0, mm6 // Only pav bytes < 0 in mm7 + pcmpgtw mm0, mm6 // Create mask pcv bytes < 0 + pand mm0, mm6 // Only pav bytes < 0 in mm7 psubw mm6, mm0 // test pa <= pb movq mm7, mm4 psubw mm6, mm0 - pcmpgtw mm7, mm5 // pa > pb? + pcmpgtw mm7, mm5 // pa > pb? movq mm0, mm7 // use mm0 mask copy to merge a & b pand mm2, mm0 @@ -2722,26 +2597,27 @@ dpth3lp: paddw mm0, mm2 paddw mm7, mm5 // test ((pa <= pb)? pa:pb) <= pc - pcmpgtw mm7, mm6 // pab > pc? + pcmpgtw mm7, mm6 // pab > pc? 
pand mm3, mm7 pandn mm7, mm0 paddw mm7, mm3 pxor mm1, mm1 packuswb mm1, mm7 // Step ebx to next set of 8 bytes and repeat loop til done - add ebx, 8 + add ebx, 8 pand mm1, ActiveMaskEnd paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x) - cmp ebx, MMXLength + cmp ebx, MMXLength pxor mm0, mm0 // pxor does not affect flags movq [edi + ebx - 8], mm1 // write back updated value // mm1 will be used as Raw(x-bpp) next loop // mm3 ready to be used as Prior(x-bpp) next loop - jb dpth3lp - } // end _asm block + jb dpth3lp + } // end _asm block } break; + case 6: case 7: case 5: @@ -2750,18 +2626,19 @@ dpth3lp: ActiveMask2.use = 0xffffffff00000000; ShiftBpp.use = bpp << 3; // == bpp * 8 ShiftRem.use = 64 - ShiftBpp.use; - _asm { + _asm + { mov ebx, diff - mov edi, row // - mov esi, prev_row + mov edi, row + mov esi, prev_row // PRIME the pump (load the first Raw(x-bpp) data set - movq mm1, [edi+ebx-8] + movq mm1, [edi+ebx-8] pxor mm0, mm0 dpth6lp: // Must shift to position Raw(x-bpp) data psrlq mm1, ShiftRem // Do first set of 4 bytes - movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes + movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes punpcklbw mm1, mm0 // Unpack Low bytes of a movq mm2, [esi + ebx] // load b=Prior(x) punpcklbw mm2, mm0 // Unpack Low bytes of b @@ -2780,23 +2657,23 @@ dpth6lp: // pa = abs(p-a) = abs(pav) // pb = abs(p-b) = abs(pbv) // pc = abs(p-c) = abs(pcv) - pcmpgtw mm0, mm4 // Create mask pav bytes < 0 + pcmpgtw mm0, mm4 // Create mask pav bytes < 0 paddw mm6, mm5 - pand mm0, mm4 // Only pav bytes < 0 in mm7 - pcmpgtw mm7, mm5 // Create mask pbv bytes < 0 + pand mm0, mm4 // Only pav bytes < 0 in mm7 + pcmpgtw mm7, mm5 // Create mask pbv bytes < 0 psubw mm4, mm0 - pand mm7, mm5 // Only pbv bytes < 0 in mm0 + pand mm7, mm5 // Only pbv bytes < 0 in mm0 psubw mm4, mm0 psubw mm5, mm7 pxor mm0, mm0 - pcmpgtw mm0, mm6 // Create mask pcv bytes < 0 - pand mm0, mm6 // Only pav bytes < 0 in mm7 + pcmpgtw mm0, mm6 // Create mask pcv bytes < 0 + pand 
mm0, mm6 // Only pav bytes < 0 in mm7 psubw mm5, mm7 psubw mm6, mm0 // test pa <= pb movq mm7, mm4 psubw mm6, mm0 - pcmpgtw mm7, mm5 // pa > pb? + pcmpgtw mm7, mm5 // pa > pb? movq mm0, mm7 // use mm7 mask to merge pa & pb pand mm5, mm7 @@ -2807,7 +2684,7 @@ dpth6lp: paddw mm7, mm5 paddw mm0, mm2 // test ((pa <= pb)? pa:pb) <= pc - pcmpgtw mm7, mm6 // pab > pc? + pcmpgtw mm7, mm6 // pab > pc? pxor mm1, mm1 pand mm3, mm7 pandn mm7, mm0 @@ -2821,7 +2698,7 @@ dpth6lp: paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x) movq mm6, mm2 movq [edi + ebx], mm7 // write back updated value - movq mm1, [edi+ebx-8] + movq mm1, [edi+ebx-8] psllq mm6, ShiftBpp movq mm5, mm7 psrlq mm1, ShiftRem @@ -2844,23 +2721,23 @@ dpth6lp: // pa = abs(p-a) = abs(pav) // pb = abs(p-b) = abs(pbv) // pc = abs(p-c) = abs(pcv) - pcmpgtw mm0, mm4 // Create mask pav bytes < 0 + pcmpgtw mm0, mm4 // Create mask pav bytes < 0 paddw mm6, mm5 - pand mm0, mm4 // Only pav bytes < 0 in mm7 - pcmpgtw mm7, mm5 // Create mask pbv bytes < 0 + pand mm0, mm4 // Only pav bytes < 0 in mm7 + pcmpgtw mm7, mm5 // Create mask pbv bytes < 0 psubw mm4, mm0 - pand mm7, mm5 // Only pbv bytes < 0 in mm0 + pand mm7, mm5 // Only pbv bytes < 0 in mm0 psubw mm4, mm0 psubw mm5, mm7 pxor mm0, mm0 - pcmpgtw mm0, mm6 // Create mask pcv bytes < 0 - pand mm0, mm6 // Only pav bytes < 0 in mm7 + pcmpgtw mm0, mm6 // Create mask pcv bytes < 0 + pand mm0, mm6 // Only pav bytes < 0 in mm7 psubw mm5, mm7 psubw mm6, mm0 // test pa <= pb movq mm7, mm4 psubw mm6, mm0 - pcmpgtw mm7, mm5 // pa > pb? + pcmpgtw mm7, mm5 // pa > pb? 
movq mm0, mm7 // use mm7 mask to merge pa & pb pand mm5, mm7 @@ -2879,29 +2756,31 @@ dpth6lp: paddw mm7, mm3 pxor mm0, mm0 // Step ex to next set of 8 bytes and repeat loop til done - add ebx, 8 + add ebx, 8 packuswb mm1, mm7 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x) - cmp ebx, MMXLength + cmp ebx, MMXLength movq [edi + ebx - 8], mm1 // write back updated value // mm1 will be used as Raw(x-bpp) next loop - jb dpth6lp - } // end _asm block + jb dpth6lp + } // end _asm block } break; + case 4: { ActiveMask.use = 0x00000000ffffffff; - _asm { + _asm { mov ebx, diff - mov edi, row // - mov esi, prev_row + mov edi, row + mov esi, prev_row pxor mm0, mm0 // PRIME the pump (load the first Raw(x-bpp) data set - movq mm1, [edi+ebx-8] // Only time should need to read a=Raw(x-bpp) bytes + movq mm1, [edi+ebx-8] // Only time should need to read + // a=Raw(x-bpp) bytes dpth4lp: // Do first set of 4 bytes - movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes + movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes punpckhbw mm1, mm0 // Unpack Low bytes of a movq mm2, [esi + ebx] // load b=Prior(x) punpcklbw mm2, mm0 // Unpack High bytes of b @@ -2918,23 +2797,23 @@ dpth4lp: // pa = abs(p-a) = abs(pav) // pb = abs(p-b) = abs(pbv) // pc = abs(p-c) = abs(pcv) - pcmpgtw mm0, mm4 // Create mask pav bytes < 0 + pcmpgtw mm0, mm4 // Create mask pav bytes < 0 paddw mm6, mm5 - pand mm0, mm4 // Only pav bytes < 0 in mm7 - pcmpgtw mm7, mm5 // Create mask pbv bytes < 0 + pand mm0, mm4 // Only pav bytes < 0 in mm7 + pcmpgtw mm7, mm5 // Create mask pbv bytes < 0 psubw mm4, mm0 - pand mm7, mm5 // Only pbv bytes < 0 in mm0 + pand mm7, mm5 // Only pbv bytes < 0 in mm0 psubw mm4, mm0 psubw mm5, mm7 pxor mm0, mm0 - pcmpgtw mm0, mm6 // Create mask pcv bytes < 0 - pand mm0, mm6 // Only pav bytes < 0 in mm7 + pcmpgtw mm0, mm6 // Create mask pcv bytes < 0 + pand mm0, mm6 // Only pav bytes < 0 in mm7 psubw mm5, mm7 psubw mm6, mm0 // test pa <= pb movq mm7, mm4 psubw mm6, mm0 - pcmpgtw mm7, 
mm5 // pa > pb? + pcmpgtw mm7, mm5 // pa > pb? movq mm0, mm7 // use mm7 mask to merge pa & pb pand mm5, mm7 @@ -2945,7 +2824,7 @@ dpth4lp: paddw mm7, mm5 paddw mm0, mm2 // test ((pa <= pb)? pa:pb) <= pc - pcmpgtw mm7, mm6 // pab > pc? + pcmpgtw mm7, mm6 // pab > pc? pxor mm1, mm1 pand mm3, mm7 pandn mm7, mm0 @@ -2974,23 +2853,23 @@ dpth4lp: // pa = abs(p-a) = abs(pav) // pb = abs(p-b) = abs(pbv) // pc = abs(p-c) = abs(pcv) - pcmpgtw mm0, mm4 // Create mask pav bytes < 0 + pcmpgtw mm0, mm4 // Create mask pav bytes < 0 paddw mm6, mm5 - pand mm0, mm4 // Only pav bytes < 0 in mm7 - pcmpgtw mm7, mm5 // Create mask pbv bytes < 0 + pand mm0, mm4 // Only pav bytes < 0 in mm7 + pcmpgtw mm7, mm5 // Create mask pbv bytes < 0 psubw mm4, mm0 - pand mm7, mm5 // Only pbv bytes < 0 in mm0 + pand mm7, mm5 // Only pbv bytes < 0 in mm0 psubw mm4, mm0 psubw mm5, mm7 pxor mm0, mm0 - pcmpgtw mm0, mm6 // Create mask pcv bytes < 0 - pand mm0, mm6 // Only pav bytes < 0 in mm7 + pcmpgtw mm0, mm6 // Create mask pcv bytes < 0 + pand mm0, mm6 // Only pav bytes < 0 in mm7 psubw mm5, mm7 psubw mm6, mm0 // test pa <= pb movq mm7, mm4 psubw mm6, mm0 - pcmpgtw mm7, mm5 // pa > pb? + pcmpgtw mm7, mm5 // pa > pb? movq mm0, mm7 // use mm7 mask to merge pa & pb pand mm5, mm7 @@ -3001,7 +2880,7 @@ dpth4lp: paddw mm7, mm5 paddw mm0, mm2 // test ((pa <= pb)? pa:pb) <= pc - pcmpgtw mm7, mm6 // pab > pc? + pcmpgtw mm7, mm6 // pab > pc? 
pxor mm1, mm1 pand mm3, mm7 pandn mm7, mm0 @@ -3009,29 +2888,30 @@ dpth4lp: paddw mm7, mm3 pxor mm0, mm0 // Step ex to next set of 8 bytes and repeat loop til done - add ebx, 8 + add ebx, 8 packuswb mm1, mm7 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x) - cmp ebx, MMXLength + cmp ebx, MMXLength movq [edi + ebx - 8], mm1 // write back updated value // mm1 will be used as Raw(x-bpp) next loop - jb dpth4lp - } // end _asm block + jb dpth4lp + } // end _asm block } break; case 8: // bpp == 8 { ActiveMask.use = 0x00000000ffffffff; - _asm { + _asm { mov ebx, diff - mov edi, row // - mov esi, prev_row + mov edi, row + mov esi, prev_row pxor mm0, mm0 // PRIME the pump (load the first Raw(x-bpp) data set - movq mm1, [edi+ebx-8] // Only time should need to read a=Raw(x-bpp) bytes + movq mm1, [edi+ebx-8] // Only time should need to read + // a=Raw(x-bpp) bytes dpth8lp: // Do first set of 4 bytes - movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes + movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes punpcklbw mm1, mm0 // Unpack Low bytes of a movq mm2, [esi + ebx] // load b=Prior(x) punpcklbw mm2, mm0 // Unpack Low bytes of b @@ -3048,23 +2928,23 @@ dpth8lp: // pa = abs(p-a) = abs(pav) // pb = abs(p-b) = abs(pbv) // pc = abs(p-c) = abs(pcv) - pcmpgtw mm0, mm4 // Create mask pav bytes < 0 + pcmpgtw mm0, mm4 // Create mask pav bytes < 0 paddw mm6, mm5 - pand mm0, mm4 // Only pav bytes < 0 in mm7 - pcmpgtw mm7, mm5 // Create mask pbv bytes < 0 + pand mm0, mm4 // Only pav bytes < 0 in mm7 + pcmpgtw mm7, mm5 // Create mask pbv bytes < 0 psubw mm4, mm0 - pand mm7, mm5 // Only pbv bytes < 0 in mm0 + pand mm7, mm5 // Only pbv bytes < 0 in mm0 psubw mm4, mm0 psubw mm5, mm7 pxor mm0, mm0 - pcmpgtw mm0, mm6 // Create mask pcv bytes < 0 - pand mm0, mm6 // Only pav bytes < 0 in mm7 + pcmpgtw mm0, mm6 // Create mask pcv bytes < 0 + pand mm0, mm6 // Only pav bytes < 0 in mm7 psubw mm5, mm7 psubw mm6, mm0 // test pa <= pb movq mm7, mm4 psubw mm6, mm0 - pcmpgtw mm7, mm5 // pa 
> pb? + pcmpgtw mm7, mm5 // pa > pb? movq mm0, mm7 // use mm7 mask to merge pa & pb pand mm5, mm7 @@ -3075,24 +2955,24 @@ dpth8lp: paddw mm7, mm5 paddw mm0, mm2 // test ((pa <= pb)? pa:pb) <= pc - pcmpgtw mm7, mm6 // pab > pc? + pcmpgtw mm7, mm6 // pab > pc? pxor mm1, mm1 pand mm3, mm7 pandn mm7, mm0 paddw mm7, mm3 pxor mm0, mm0 packuswb mm7, mm1 - movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes + movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes pand mm7, ActiveMask - movq mm2, [esi + ebx] // load b=Prior(x) - paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x) - punpckhbw mm3, mm0 // Unpack High bytes of c - movq [edi + ebx], mm7 // write back updated value - movq mm1, [edi+ebx-8] // read a=Raw(x-bpp) bytes + movq mm2, [esi + ebx] // load b=Prior(x) + paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x) + punpckhbw mm3, mm0 // Unpack High bytes of c + movq [edi + ebx], mm7 // write back updated value + movq mm1, [edi+ebx-8] // read a=Raw(x-bpp) bytes // Do second set of 4 bytes - punpckhbw mm2, mm0 // Unpack High bytes of b - punpckhbw mm1, mm0 // Unpack High bytes of a + punpckhbw mm2, mm0 // Unpack High bytes of b + punpckhbw mm1, mm0 // Unpack High bytes of a // pav = p - a = (a + b - c) - a = b - c movq mm4, mm2 // pbv = p - b = (a + b - c) - b = a - c @@ -3105,23 +2985,23 @@ dpth8lp: // pa = abs(p-a) = abs(pav) // pb = abs(p-b) = abs(pbv) // pc = abs(p-c) = abs(pcv) - pcmpgtw mm0, mm4 // Create mask pav bytes < 0 + pcmpgtw mm0, mm4 // Create mask pav bytes < 0 paddw mm6, mm5 - pand mm0, mm4 // Only pav bytes < 0 in mm7 - pcmpgtw mm7, mm5 // Create mask pbv bytes < 0 + pand mm0, mm4 // Only pav bytes < 0 in mm7 + pcmpgtw mm7, mm5 // Create mask pbv bytes < 0 psubw mm4, mm0 - pand mm7, mm5 // Only pbv bytes < 0 in mm0 + pand mm7, mm5 // Only pbv bytes < 0 in mm0 psubw mm4, mm0 psubw mm5, mm7 pxor mm0, mm0 - pcmpgtw mm0, mm6 // Create mask pcv bytes < 0 - pand mm0, mm6 // Only pav bytes < 0 in mm7 + pcmpgtw mm0, mm6 // Create mask pcv bytes < 0 + 
pand mm0, mm6 // Only pav bytes < 0 in mm7 psubw mm5, mm7 psubw mm6, mm0 // test pa <= pb movq mm7, mm4 psubw mm6, mm0 - pcmpgtw mm7, mm5 // pa > pb? + pcmpgtw mm7, mm5 // pa > pb? movq mm0, mm7 // use mm7 mask to merge pa & pb pand mm5, mm7 @@ -3132,7 +3012,7 @@ dpth8lp: paddw mm7, mm5 paddw mm0, mm2 // test ((pa <= pb)? pa:pb) <= pc - pcmpgtw mm7, mm6 // pab > pc? + pcmpgtw mm7, mm6 // pab > pc? pxor mm1, mm1 pand mm3, mm7 pandn mm7, mm0 @@ -3140,26 +3020,27 @@ dpth8lp: paddw mm7, mm3 pxor mm0, mm0 // Step ex to next set of 8 bytes and repeat loop til done - add ebx, 8 + add ebx, 8 packuswb mm1, mm7 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x) - cmp ebx, MMXLength + cmp ebx, MMXLength movq [edi + ebx - 8], mm1 // write back updated value // mm1 will be used as Raw(x-bpp) next loop - jb dpth8lp - } // end _asm block + jb dpth8lp + } // end _asm block } break; - case 1: // bpp = 1 - case 2: // bpp = 2 - default: // bpp > 8 + + case 1: // bpp = 1 + case 2: // bpp = 2 + default: // bpp > 8 { - _asm { - mov ebx, diff - cmp ebx, FullLength - jnb dpthdend - mov edi, row // - mov esi, prev_row + _asm { + mov ebx, diff + cmp ebx, FullLength + jnb dpthdend + mov edi, row + mov esi, prev_row // Do Paeth decode for remaining bytes mov edx, ebx xor ecx, ecx // zero ecx before using cl & cx in loop below @@ -3221,25 +3102,26 @@ dpthdabc: // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) mov cl, [edi + edx] // load Raw(x-bpp) into cl dpthdpaeth: - inc ebx - inc edx + inc ebx + inc edx // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 add [edi + ebx - 1], cl - cmp ebx, FullLength - jb dpthdlp + cmp ebx, FullLength + jb dpthdlp dpthdend: - } // end _asm block + } // end _asm block } return; // No need to go further with this one - } // end switch ( bpp ) - _asm { + } // end switch ( bpp ) + _asm + { // MMX acceleration complete now do clean-up // Check if any remaining bytes left to decode - mov ebx, MMXLength - cmp ebx, FullLength - jnb dpthend - mov edi, 
row - mov esi, prev_row + mov ebx, MMXLength + cmp ebx, FullLength + jnb dpthend + mov edi, row + mov esi, prev_row // Do Paeth decode for remaining bytes mov edx, ebx xor ecx, ecx // zero ecx before using cl & cx in loop below @@ -3301,69 +3183,71 @@ dpthabc2: // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) mov cl, [edi + edx] // load Raw(x-bpp) into cl dpthpaeth2: - inc ebx - inc edx + inc ebx + inc edx // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 add [edi + ebx - 1], cl - cmp ebx, FullLength - jb dpthlp2 + cmp ebx, FullLength + jb dpthlp2 dpthend: - emms // End MMX instructions; prep for possible FP instrs. - } // end _asm block + emms // End MMX instructions; prep for possible FP instrs. + } // end _asm block } // Optimized code for PNG Sub filter decoder void png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row) { - //int test; - int bpp; - png_uint_32 FullLength; - png_uint_32 MMXLength; - int diff; - bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel - FullLength = row_info->rowbytes - bpp; // # of bytes to filter - _asm { + //int test; + int bpp; + png_uint_32 FullLength; + png_uint_32 MMXLength; + int diff; + + bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel + FullLength = row_info->rowbytes - bpp; // # of bytes to filter + _asm { mov edi, row mov esi, edi // lp = row - add edi, bpp // rp = row + bpp - xor eax, eax - // get # of bytes to alignment - mov diff, edi // take start of row - add diff, 0xf // add 7 + 8 to incr past + add edi, bpp // rp = row + bpp + xor eax, eax + // get # of bytes to alignment + mov diff, edi // take start of row + add diff, 0xf // add 7 + 8 to incr past // alignment boundary - xor ebx, ebx - and diff, 0xfffffff8 // mask to alignment boundary - sub diff, edi // subtract from start ==> value + xor ebx, ebx + and diff, 0xfffffff8 // mask to alignment boundary + sub diff, edi // subtract from start ==> value // ebx at alignment - jz dsubgo - // fix alignment + jz dsubgo + // 
fix alignment dsublp1: - mov al, [esi+ebx] - add [edi+ebx], al - inc ebx - cmp ebx, diff - jb dsublp1 + mov al, [esi+ebx] + add [edi+ebx], al + inc ebx + cmp ebx, diff + jb dsublp1 dsubgo: - mov ecx, FullLength - mov edx, ecx - sub edx, ebx // subtract alignment fix - and edx, 0x00000007 // calc bytes over mult of 8 - sub ecx, edx // drop over bytes from length - mov MMXLength, ecx - } // end _asm block - // Now do the math for the rest of the row - switch ( bpp ) - { - case 3: - { + mov ecx, FullLength + mov edx, ecx + sub edx, ebx // subtract alignment fix + and edx, 0x00000007 // calc bytes over mult of 8 + sub ecx, edx // drop over bytes from length + mov MMXLength, ecx + } // end _asm block + + // Now do the math for the rest of the row + switch ( bpp ) + { + case 3: + { ActiveMask.use = 0x0000ffffff000000; ShiftBpp.use = 24; // == 3 * 8 ShiftRem.use = 40; // == 64 - 24 - _asm { + _asm { mov edi, row movq mm7, ActiveMask // Load ActiveMask for 2nd active byte group - mov esi, edi // lp = row + mov esi, edi // lp = row add edi, bpp // rp = row + bpp movq mm6, mm7 mov ebx, diff @@ -3376,234 +3260,242 @@ dsub3lp: // no need for mask; shift clears inactive bytes // Add 1st active group movq mm0, [edi+ebx] - paddb mm0, mm1 + paddb mm0, mm1 // Add 2nd active group movq mm1, mm0 // mov updated Raws to mm1 psllq mm1, ShiftBpp // shift data to position correctly pand mm1, mm7 // mask to use only 2nd active group - paddb mm0, mm1 + paddb mm0, mm1 // Add 3rd active group movq mm1, mm0 // mov updated Raws to mm1 psllq mm1, ShiftBpp // shift data to position correctly pand mm1, mm6 // mask to use only 3rd active group - add ebx, 8 - paddb mm0, mm1 - cmp ebx, MMXLength - movq [edi+ebx-8], mm0 // Write updated Raws back to array + add ebx, 8 + paddb mm0, mm1 + cmp ebx, MMXLength + movq [edi+ebx-8], mm0 // Write updated Raws back to array // Prep for doing 1st add at top of loop movq mm1, mm0 - jb dsub3lp - } // end _asm block + jb dsub3lp + } // end _asm block } break; + 
case 1: - { - /* Placed here just in case this is a duplicate of the - non-MMX code for the SUB filter in png_read_filter_row - above - */ -// png_bytep rp; -// png_bytep lp; -// png_uint_32 i; -// bpp = (row_info->pixel_depth + 7) >> 3; -// for (i = (png_uint_32)bpp, rp = row + bpp, lp = row; -// i < row_info->rowbytes; i++, rp++, lp++) -// { -// *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff); -// } - _asm { + { + // Placed here just in case this is a duplicate of the + // non-MMX code for the SUB filter in png_read_filter_row above + // + // png_bytep rp; + // png_bytep lp; + // png_uint_32 i; + // bpp = (row_info->pixel_depth + 7) >> 3; + // for (i = (png_uint_32)bpp, rp = row + bpp, lp = row; + // i < row_info->rowbytes; i++, rp++, lp++) + // { + // *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff); + // } + _asm { mov ebx, diff mov edi, row - cmp ebx, FullLength - jnb dsub1end - mov esi, edi // lp = row - xor eax, eax + cmp ebx, FullLength + jnb dsub1end + mov esi, edi // lp = row + xor eax, eax add edi, bpp // rp = row + bpp dsub1lp: - mov al, [esi+ebx] - add [edi+ebx], al - inc ebx - cmp ebx, FullLength - jb dsub1lp + mov al, [esi+ebx] + add [edi+ebx], al + inc ebx + cmp ebx, FullLength + jb dsub1lp dsub1end: - } // end _asm block - } + } // end _asm block + } return; + case 6: case 7: case 4: case 5: - { + { ShiftBpp.use = bpp << 3; ShiftRem.use = 64 - ShiftBpp.use; - _asm { + _asm { mov edi, row mov ebx, diff - mov esi, edi // lp = row + mov esi, edi // lp = row add edi, bpp // rp = row + bpp // PRIME the pump (load the first Raw(x-bpp) data set movq mm1, [edi+ebx-8] dsub4lp: psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes // no need for mask; shift clears inactive bytes - movq mm0, [edi+ebx] - paddb mm0, mm1 + movq mm0, [edi+ebx] + paddb mm0, mm1 // Add 2nd active group movq mm1, mm0 // mov updated Raws to mm1 psllq mm1, ShiftBpp // shift data to position correctly // there is no need for any mask // since shift clears inactive bits/bytes 
- add ebx, 8 - paddb mm0, mm1 - cmp ebx, MMXLength - movq [edi+ebx-8], mm0 + add ebx, 8 + paddb mm0, mm1 + cmp ebx, MMXLength + movq [edi+ebx-8], mm0 movq mm1, mm0 // Prep for doing 1st add at top of loop - jb dsub4lp - } // end _asm block + jb dsub4lp + } // end _asm block } break; + case 2: - { + { ActiveMask.use = 0x00000000ffff0000; ShiftBpp.use = 16; // == 2 * 8 ShiftRem.use = 48; // == 64 - 16 - _asm { + _asm { movq mm7, ActiveMask // Load ActiveMask for 2nd active byte group mov ebx, diff movq mm6, mm7 - mov edi, row - psllq mm6, ShiftBpp // Move mask in mm6 to cover 3rd active byte group - mov esi, edi // lp = row + mov edi, row + psllq mm6, ShiftBpp // Move mask in mm6 to cover 3rd active + // byte group + mov esi, edi // lp = row movq mm5, mm6 - add edi, bpp // rp = row + bpp - psllq mm5, ShiftBpp // Move mask in mm5 to cover 4th active byte group + add edi, bpp // rp = row + bpp + psllq mm5, ShiftBpp // Move mask in mm5 to cover 4th active + // byte group // PRIME the pump (load the first Raw(x-bpp) data set movq mm1, [edi+ebx-8] dsub2lp: // Add 1st active group - psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes - // no need for mask; shift clears inactive bytes + psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes + // no need for mask; shift clears inactive + // bytes movq mm0, [edi+ebx] - paddb mm0, mm1 + paddb mm0, mm1 // Add 2nd active group - movq mm1, mm0 // mov updated Raws to mm1 - psllq mm1, ShiftBpp // shift data to position correctly - pand mm1, mm7 // mask to use only 2nd active group - paddb mm0, mm1 + movq mm1, mm0 // mov updated Raws to mm1 + psllq mm1, ShiftBpp // shift data to position correctly + pand mm1, mm7 // mask to use only 2nd active group + paddb mm0, mm1 // Add 3rd active group - movq mm1, mm0 // mov updated Raws to mm1 - psllq mm1, ShiftBpp // shift data to position correctly - pand mm1, mm6 // mask to use only 3rd active group - paddb mm0, mm1 + movq mm1, mm0 // mov updated Raws to mm1 + psllq mm1, ShiftBpp 
// shift data to position correctly + pand mm1, mm6 // mask to use only 3rd active group + paddb mm0, mm1 // Add 4th active group - movq mm1, mm0 // mov updated Raws to mm1 - psllq mm1, ShiftBpp // shift data to position correctly - pand mm1, mm5 // mask to use only 4th active group - add ebx, 8 - paddb mm0, mm1 - cmp ebx, MMXLength - movq [edi+ebx-8], mm0 // Write updated Raws back to array - movq mm1, mm0 // Prep for doing 1st add at top of loop - jb dsub2lp - } // end _asm block + movq mm1, mm0 // mov updated Raws to mm1 + psllq mm1, ShiftBpp // shift data to position correctly + pand mm1, mm5 // mask to use only 4th active group + add ebx, 8 + paddb mm0, mm1 + cmp ebx, MMXLength + movq [edi+ebx-8], mm0 // Write updated Raws back to array + movq mm1, mm0 // Prep for doing 1st add at top of loop + jb dsub2lp + } // end _asm block } break; case 8: - { - _asm { - mov edi, row + { + _asm { + mov edi, row mov ebx, diff - mov esi, edi // lp = row - add edi, bpp // rp = row + bpp - mov ecx, MMXLength + mov esi, edi // lp = row + add edi, bpp // rp = row + bpp + mov ecx, MMXLength movq mm7, [edi+ebx-8] // PRIME the pump (load the first // Raw(x-bpp) data set and ecx, 0x0000003f // calc bytes over mult of 64 dsub8lp: - movq mm0, [edi+ebx] // Load Sub(x) for 1st 8 bytes - paddb mm0, mm7 - movq mm1, [edi+ebx+8] // Load Sub(x) for 2nd 8 bytes - movq [edi+ebx], mm0 // Write Raw(x) for 1st 8 bytes + movq mm0, [edi+ebx] // Load Sub(x) for 1st 8 bytes + paddb mm0, mm7 + movq mm1, [edi+ebx+8] // Load Sub(x) for 2nd 8 bytes + movq [edi+ebx], mm0 // Write Raw(x) for 1st 8 bytes // Now mm0 will be used as Raw(x-bpp) for // the 2nd group of 8 bytes. This will be // repeated for each group of 8 bytes with // the 8th group being used as the Raw(x-bpp) // for the 1st group of the next loop. 
- paddb mm1, mm0 - movq mm2, [edi+ebx+16] // Load Sub(x) for 3rd 8 bytes - movq [edi+ebx+8], mm1 // Write Raw(x) for 2nd 8 bytes - paddb mm2, mm1 - movq mm3, [edi+ebx+24] // Load Sub(x) for 4th 8 bytes - movq [edi+ebx+16], mm2 // Write Raw(x) for 3rd 8 bytes - paddb mm3, mm2 - movq mm4, [edi+ebx+32] // Load Sub(x) for 5th 8 bytes - movq [edi+ebx+24], mm3 // Write Raw(x) for 4th 8 bytes - paddb mm4, mm3 - movq mm5, [edi+ebx+40] // Load Sub(x) for 6th 8 bytes - movq [edi+ebx+32], mm4 // Write Raw(x) for 5th 8 bytes - paddb mm5, mm4 - movq mm6, [edi+ebx+48] // Load Sub(x) for 7th 8 bytes - movq [edi+ebx+40], mm5 // Write Raw(x) for 6th 8 bytes - paddb mm6, mm5 - movq mm7, [edi+ebx+56] // Load Sub(x) for 8th 8 bytes - movq [edi+ebx+48], mm6 // Write Raw(x) for 7th 8 bytes - add ebx, 64 - paddb mm7, mm6 - cmp ebx, ecx - movq [edi+ebx-8], mm7 // Write Raw(x) for 8th 8 bytes - jb dsub8lp - cmp ebx, MMXLength - jnb dsub8lt8 + paddb mm1, mm0 + movq mm2, [edi+ebx+16] // Load Sub(x) for 3rd 8 bytes + movq [edi+ebx+8], mm1 // Write Raw(x) for 2nd 8 bytes + paddb mm2, mm1 + movq mm3, [edi+ebx+24] // Load Sub(x) for 4th 8 bytes + movq [edi+ebx+16], mm2 // Write Raw(x) for 3rd 8 bytes + paddb mm3, mm2 + movq mm4, [edi+ebx+32] // Load Sub(x) for 5th 8 bytes + movq [edi+ebx+24], mm3 // Write Raw(x) for 4th 8 bytes + paddb mm4, mm3 + movq mm5, [edi+ebx+40] // Load Sub(x) for 6th 8 bytes + movq [edi+ebx+32], mm4 // Write Raw(x) for 5th 8 bytes + paddb mm5, mm4 + movq mm6, [edi+ebx+48] // Load Sub(x) for 7th 8 bytes + movq [edi+ebx+40], mm5 // Write Raw(x) for 6th 8 bytes + paddb mm6, mm5 + movq mm7, [edi+ebx+56] // Load Sub(x) for 8th 8 bytes + movq [edi+ebx+48], mm6 // Write Raw(x) for 7th 8 bytes + add ebx, 64 + paddb mm7, mm6 + cmp ebx, ecx + movq [edi+ebx-8], mm7 // Write Raw(x) for 8th 8 bytes + jb dsub8lp + cmp ebx, MMXLength + jnb dsub8lt8 dsub8lpA: movq mm0, [edi+ebx] - add ebx, 8 - paddb mm0, mm7 - cmp ebx, MMXLength - movq [edi+ebx-8], mm0 // use -8 to offset early add to 
ebx - movq mm7, mm0 // Move calculated Raw(x) data to mm1 to - // be the new Raw(x-bpp) for the next loop - jb dsub8lpA + add ebx, 8 + paddb mm0, mm7 + cmp ebx, MMXLength + movq [edi+ebx-8], mm0 // use -8 to offset early add to ebx + movq mm7, mm0 // Move calculated Raw(x) data to mm7 to + // be the new Raw(x-bpp) for the next loop + jb dsub8lpA dsub8lt8: - } // end _asm block + } // end _asm block } break; + default: // bpp greater than 8 bytes - { - _asm { + { + _asm { mov ebx, diff - mov edi, row - mov esi, edi // lp = row + mov edi, row + mov esi, edi // lp = row add edi, bpp // rp = row + bpp dsubAlp: - movq mm0, [edi+ebx] - movq mm1, [esi+ebx] - add ebx, 8 - paddb mm0, mm1 - cmp ebx, MMXLength - movq [edi+ebx-8], mm0 // mov does not affect flags; -8 to offset add ebx - jb dsubAlp - } // end _asm block + movq mm0, [edi+ebx] + movq mm1, [esi+ebx] + add ebx, 8 + paddb mm0, mm1 + cmp ebx, MMXLength + movq [edi+ebx-8], mm0 // mov does not affect flags; -8 to offset + // add ebx + jb dsubAlp + } // end _asm block } break; - } // end switch ( bpp ) - _asm { - mov ebx, MMXLength - mov edi, row + } // end switch ( bpp ) + + _asm { + mov ebx, MMXLength + mov edi, row cmp ebx, FullLength jnb dsubend mov esi, edi // lp = row xor eax, eax - add edi, bpp // rp = row + bpp + add edi, bpp // rp = row + bpp dsublp2: mov al, [esi+ebx] add [edi+ebx], al - inc ebx - cmp ebx, FullLength + inc ebx + cmp ebx, FullLength jb dsublp2 dsubend: - emms // End MMX instructions; prep for possible FP instrs.
+ } // end _asm block } // Optimized code for PNG Up filter decoder @@ -3611,20 +3503,20 @@ void png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row, png_bytep prev_row) { - png_uint_32 len; - len = row_info->rowbytes; // # of bytes to filter - _asm { + png_uint_32 len; + len = row_info->rowbytes; // # of bytes to filter + _asm { mov edi, row - // get # of bytes to alignment - mov ecx, edi - xor ebx, ebx - add ecx, 0x7 - xor eax, eax - and ecx, 0xfffffff8 + // get # of bytes to alignment + mov ecx, edi + xor ebx, ebx + add ecx, 0x7 + xor eax, eax + and ecx, 0xfffffff8 mov esi, prev_row - sub ecx, edi - jz dupgo - // fix alignment + sub ecx, edi + jz dupgo + // fix alignment duplp1: mov al, [edi+ebx] add al, [esi+ebx] @@ -3634,47 +3526,47 @@ duplp1: jb duplp1 dupgo: mov ecx, len - mov edx, ecx - sub edx, ebx // subtract alignment fix - and edx, 0x0000003f // calc bytes over mult of 64 - sub ecx, edx // drop over bytes from length - // Unrolled loop - use all MMX registers and interleave to reduce - // number of branch instructions (loops) and reduce partial stalls + mov edx, ecx + sub edx, ebx // subtract alignment fix + and edx, 0x0000003f // calc bytes over mult of 64 + sub ecx, edx // drop over bytes from length + // Unrolled loop - use all MMX registers and interleave to reduce + // number of branch instructions (loops) and reduce partial stalls duploop: movq mm1, [esi+ebx] movq mm0, [edi+ebx] - movq mm3, [esi+ebx+8] + movq mm3, [esi+ebx+8] paddb mm0, mm1 - movq mm2, [edi+ebx+8] + movq mm2, [edi+ebx+8] movq [edi+ebx], mm0 - paddb mm2, mm3 - movq mm5, [esi+ebx+16] - movq [edi+ebx+8], mm2 - movq mm4, [edi+ebx+16] - movq mm7, [esi+ebx+24] - paddb mm4, mm5 - movq mm6, [edi+ebx+24] - movq [edi+ebx+16], mm4 - paddb mm6, mm7 + paddb mm2, mm3 + movq mm5, [esi+ebx+16] + movq [edi+ebx+8], mm2 + movq mm4, [edi+ebx+16] + movq mm7, [esi+ebx+24] + paddb mm4, mm5 + movq mm6, [edi+ebx+24] + movq [edi+ebx+16], mm4 + paddb mm6, mm7 movq mm1, [esi+ebx+32] - movq 
[edi+ebx+24], mm6 + movq [edi+ebx+24], mm6 movq mm0, [edi+ebx+32] - movq mm3, [esi+ebx+40] + movq mm3, [esi+ebx+40] paddb mm0, mm1 - movq mm2, [edi+ebx+40] + movq mm2, [edi+ebx+40] movq [edi+ebx+32], mm0 - paddb mm2, mm3 - movq mm5, [esi+ebx+48] - movq [edi+ebx+40], mm2 - movq mm4, [edi+ebx+48] - movq mm7, [esi+ebx+56] - paddb mm4, mm5 - movq mm6, [edi+ebx+56] - movq [edi+ebx+48], mm4 - add ebx, 64 - paddb mm6, mm7 + paddb mm2, mm3 + movq mm5, [esi+ebx+48] + movq [edi+ebx+40], mm2 + movq mm4, [edi+ebx+48] + movq mm7, [esi+ebx+56] + paddb mm4, mm5 + movq mm6, [edi+ebx+56] + movq [edi+ebx+48], mm4 + add ebx, 64 + paddb mm6, mm7 cmp ebx, ecx - movq [edi+ebx-8], mm6 // (+56)movq does not affect flags; + movq [edi+ebx-8], mm6 // (+56)movq does not affect flags; // -8 to offset add ebx jb duploop @@ -3682,17 +3574,17 @@ duploop: jz dupend - // 2 lines added by lcreeve@netins.net - // (mail 11 Jul 98 in png-implement list) - cmp edx, 8 //test for less than 8 bytes - jb duplt8 + // 2 lines added by lcreeve@netins.net + // (mail 11 Jul 98 in png-implement list) + cmp edx, 8 //test for less than 8 bytes + jb duplt8 - add ecx, edx - and edx, 0x00000007 // calc bytes over mult of 8 - sub ecx, edx // drop over bytes from length + add ecx, edx + and edx, 0x00000007 // calc bytes over mult of 8 + sub ecx, edx // drop over bytes from length jz duplt8 - // Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously + // Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously duplpA: movq mm1, [esi+ebx] movq mm0, [edi+ebx] @@ -3704,9 +3596,9 @@ duplpA: cmp edx, 0 // Test for bytes over mult of 8 jz dupend duplt8: - xor eax, eax + xor eax, eax add ecx, edx // move over byte count into counter - // Loop using x86 registers to update remaining bytes + // Loop using x86 registers to update remaining bytes duplp2: mov al, [edi + ebx] add al, [esi + ebx] @@ -3715,52 +3607,54 @@ duplp2: mov [edi + ebx-1], al // mov does not affect flags; -1 to offset inc ebx jb duplp2 
dupend: - // Conversion of filtered row completed + // Conversion of filtered row completed emms // End MMX instructions; prep for possible FP instrs. - } // end _asm block + } // end _asm block } - // Optimized png_read_filter_row routines void png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep row, png_bytep prev_row, int filter) { +#ifdef PNG_DEBUG char filnm[6]; +#endif #define UseMMX (1) + if (mmx_supported == 2) + mmx_supported = mmxsupport(); - if (mmx_supported==2) - mmx_supported=mmxsupport(); - //if (!mmx_supported) + if (!mmx_supported) { png_read_filter_row_c(png_ptr, row_info, row, prev_row, filter); return ; } - +#ifdef PNG_DEBUG png_debug(1, "in png_read_filter_row\n"); png_debug1(0,"%s, ", (UseMMX?"MMX":"x86")); switch (filter) { - case 0: sprintf(filnm, "None "); - break; - case 1: sprintf(filnm, "Sub "); - break; - case 2: sprintf(filnm, "Up "); - break; - case 3: sprintf(filnm, "Avg "); - break; - case 4: sprintf(filnm, "Paeth"); - break; - default: sprintf(filnm, "Unknw"); - break; + case 0: sprintf(filnm, "None "); + break; + case 1: sprintf(filnm, "Sub "); + break; + case 2: sprintf(filnm, "Up "); + break; + case 3: sprintf(filnm, "Avg "); + break; + case 4: sprintf(filnm, "Paeth"); + break; + default: sprintf(filnm, "Unknw"); + break; } png_debug2(0,"row=%5d, %s, ", png_ptr->row_number, filnm); png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth, (int)((row_info->pixel_depth + 7) >> 3)); png_debug1(0,"len=%8d, ", row_info->rowbytes); +#endif switch (filter) { @@ -3775,16 +3669,17 @@ png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep } //end if UseMMX else { - int bpp; - png_bytep rp; - png_bytep lp; png_uint_32 i; - bpp = (row_info->pixel_depth + 7) >> 3; - for (i = (png_uint_32)bpp, rp = row + bpp, lp = row; - i < row_info->rowbytes; i++, rp++, lp++) + png_uint_32 istop = row_info->rowbytes; + png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3; + png_bytep rp = row + bpp; + png_bytep lp 
= row; + + for (i = bpp; i < istop; i++) { - *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff); - } + *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff); + rp++; + } } //end !UseMMX break; } @@ -3817,23 +3712,26 @@ png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep } //end if UseMMX else { - png_uint_32 i; - int bpp; - png_bytep rp; - png_bytep pp; - png_bytep lp; - bpp = (row_info->pixel_depth + 7) >> 3; - for (i = 0, rp = row, pp = prev_row; - i < (png_uint_32)bpp; i++, rp++, pp++) - { + png_uint_32 i; + png_bytep rp = row; + png_bytep pp = prev_row; + png_bytep lp = row; + png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3; + png_uint_32 istop = row_info->rowbytes - bpp; + + for (i = 0; i < bpp; i++) + { *rp = (png_byte)(((int)(*rp) + - ((int)(*pp) / 2)) & 0xff); - } - for (lp = row; i < row_info->rowbytes; i++, rp++, lp++, pp++) - { + ((int)(*pp++) >> 1)) & 0xff); + rp++; + } + + for (i = 0; i < istop; i++) + { *rp = (png_byte)(((int)(*rp) + - (int)(*pp + *lp) / 2) & 0xff); - } + ((int)(*pp++ + *lp++) >> 1)) & 0xff); + rp++; + } } //end !UseMMX break; } @@ -3846,36 +3744,54 @@ png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep } //end if UseMMX else { - int bpp; png_uint_32 i; - png_bytep rp; - png_bytep pp; - png_bytep lp; - png_bytep cp; - bpp = (row_info->pixel_depth + 7) >> 3; - for (i = 0, rp = row, pp = prev_row; - i < (png_uint_32)bpp; i++, rp++, pp++) + png_bytep rp = row; + png_bytep pp = prev_row; + png_bytep lp = row; + png_bytep cp = prev_row; + png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3; + png_uint_32 istop=row_info->rowbytes - bpp; + + for (i = 0; i < bpp; i++) { - *rp = (png_byte)(((int)(*rp) + (int)(*pp)) & 0xff); + *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff); + rp++; } - for (lp = rp - bpp, cp = pp - bpp; - i < row_info->rowbytes; i++, rp++, pp++, lp++, cp++) + + for (i = 0; i < istop; i++) // use leftover rp,pp { int a, b, c, pa, pb, pc, p; - b = *pp; - c = *cp; - a = *lp; - p 
= a + b - c; - pa = abs(p - a); - pb = abs(p - b); - pc = abs(p - c); - if (pa <= pb && pa <= pc) - p = a; - else if (pb <= pc) - p = b; - else - p = c; + + a = *lp++; + b = *pp++; + c = *cp++; + + p = b - c; + pc = a - c; + +#ifdef PNG_USE_ABS + pa = abs(p); + pb = abs(pc); + pc = abs(p + pc); +#else + pa = p < 0 ? -p : p; + pb = pc < 0 ? -pc : pc; + pc = (p + pc) < 0 ? -(p + pc) : p + pc; +#endif + + /* + if (pa <= pb && pa <= pc) + p = a; + else if (pb <= pc) + p = b; + else + p = c; + */ + + p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c; + *rp = (png_byte)(((int)(*rp) + p) & 0xff); + rp++; } } //end !UseMMX break; diff --git a/pngwio.c b/pngwio.c index 3831acfb3..d5444a0ab 100644 --- a/pngwio.c +++ b/pngwio.c @@ -1,7 +1,7 @@ /* pngwio.c - functions for data output * - * libpng 1.0.4 - September 19, 1999 + * libpng 1.0.4c - October 1, 1999 * For conditions of distribution and use, see copyright notice in png.h * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc. * Copyright (c) 1996, 1997 Andreas Dilger diff --git a/pngwrite.c b/pngwrite.c index 9a3c928ed..9830ca121 100644 --- a/pngwrite.c +++ b/pngwrite.c @@ -1,7 +1,7 @@ /* pngwrite.c - general routines to write a PNG file * - * libpng 1.0.4 - September 19, 1999 + * libpng 1.0.4c - October 1, 1999 * For conditions of distribution and use, see copyright notice in png.h * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc. * Copyright (c) 1996, 1997 Andreas Dilger diff --git a/pngwtran.c b/pngwtran.c index 10f50c0fa..cd32a62d1 100644 --- a/pngwtran.c +++ b/pngwtran.c @@ -1,7 +1,7 @@ /* pngwtran.c - transforms the data in a row for PNG writers * - * libpng 1.0.4 - September 19, 1999 + * libpng 1.0.4c - October 1, 1999 * For conditions of distribution and use, see copyright notice in png.h * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc. 
* Copyright (c) 1996, 1997 Andreas Dilger diff --git a/pngwutil.c b/pngwutil.c index 446c4daf6..b7a104be9 100644 --- a/pngwutil.c +++ b/pngwutil.c @@ -1,7 +1,7 @@ /* pngwutil.c - utilities to write a PNG file * - * libpng 1.0.4 - September 19, 1999 + * libpng 1.0.4c - October 1, 1999 * For conditions of distribution and use, see copyright notice in png.h * Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc. * Copyright (c) 1996, 1997 Andreas Dilger diff --git a/scripts/makefile.beos b/scripts/makefile.beos index bc7be4af3..0a8915133 100644 --- a/scripts/makefile.beos +++ b/scripts/makefile.beos @@ -1,5 +1,5 @@ # makefile for libpng on BeOS x86 ELF with gcc -# modified from makefile.lnx by Sander Stoks +# modified from makefile.linux by Sander Stoks # Copyright (C) 1996, 1997 Andreas Dilger # Copyright (C) 1999 Greg Roelofs # For conditions of distribution and use, see copyright notice in png.h @@ -31,7 +31,7 @@ RANLIB=ranlib # read libpng.txt or png.h to see why PNGMAJ is 2. You should not # have to change it. PNGMAJ = 2 -PNGMIN = 1.0.4 +PNGMIN = 1.0.4c PNGVER = $(PNGMAJ).$(PNGMIN) # where make install puts libpng.a, libpng.so*, and png.h diff --git a/scripts/makefile.borland b/scripts/makefile.borland index 57f374f60..2d3fe5b26 100644 --- a/scripts/makefile.borland +++ b/scripts/makefile.borland @@ -2,8 +2,8 @@ # Borland C++ 4.5 (Note: All modules are compiled in C mode) # Will work with C++ 4.02 also # To build the library, do: -# "make -fmakefile.bor -DMODEL=m" -# or: "make -fmakefile.bor -DMODEL=l" +# "make -fmakefile.borland -DMODEL=m" +# or: "make -fmakefile.borland -DMODEL=l" # # ------------- Borland C++ 4.5 ------------- diff --git a/scripts/makefile.dec b/scripts/makefile.dec index 6f252e45e..51403caa4 100644 --- a/scripts/makefile.dec +++ b/scripts/makefile.dec @@ -14,7 +14,7 @@ ZLIBINC=../zlib # read libpng.txt or png.h to see why PNGMAJ is 2. You should not # have to change it. 
PNGMAJ = 2 -PNGMIN = 1.0.4 +PNGMIN = 1.0.4c PNGVER = $(PNGMAJ).$(PNGMIN) CC=cc diff --git a/scripts/makefile.linux b/scripts/makefile.linux index 42012015c..c84b6cea9 100644 --- a/scripts/makefile.linux +++ b/scripts/makefile.linux @@ -22,6 +22,8 @@ WARNMORE=-Wwrite-strings -Wpointer-arith -Wshadow \ -Wmissing-declarations -Wtraditional -Wcast-align \ -Wstrict-prototypes -Wmissing-prototypes #-Wconversion +# for pgcc version 2.95.1, -O3 is buggy; don't use it. + CFLAGS=-I$(ZLIBINC) -Wall -O3 -funroll-loops \ $(ALIGN) # $(WARNMORE) -g -DPNG_DEBUG=5 LDFLAGS=-L. -Wl,-rpath,. -L$(ZLIBLIB) -Wl,-rpath,$(ZLIBLIB) -lpng -lz -lm @@ -32,7 +34,7 @@ RANLIB=ranlib # read libpng.txt or png.h to see why PNGMAJ is 2. You should not # have to change it. PNGMAJ = 2 -PNGMIN = 1.0.4 +PNGMIN = 1.0.4c PNGVER = $(PNGMAJ).$(PNGMIN) INCPATH=$(prefix)/include diff --git a/scripts/makefile.msc b/scripts/makefile.msc index 6356218ae..96b2cfc62 100644 --- a/scripts/makefile.msc +++ b/scripts/makefile.msc @@ -3,7 +3,7 @@ # For conditions of distribution and use, see copyright notice in png.h # Assumes that zlib.lib, zconf.h, and zlib.h have been copied to ..\zlib -# ------------- Microsoft C 5.1 and later ------------- +# -------- Microsoft C 5.1 and later, does not use assembler code ----- MODEL=-AL CFLAGS=-Oait -Gs -nologo -W3 $(MODEL) -I..\zlib #-Ox generates bad code with MSC 5.1 diff --git a/scripts/makefile.sco b/scripts/makefile.sco index 9eee4a2d5..1e5100449 100644 --- a/scripts/makefile.sco +++ b/scripts/makefile.sco @@ -25,7 +25,7 @@ RANLIB=echo # read libpng.txt or png.h to see why PNGMAJ is 2. You should not # have to change it. PNGMAJ = 2 -PNGMIN = 1.0.4 +PNGMIN = 1.0.4c PNGVER = $(PNGMAJ).$(PNGMIN) INCPATH=$(prefix)/include diff --git a/scripts/makefile.solaris b/scripts/makefile.solaris index 5f3a412c9..fcc307800 100644 --- a/scripts/makefile.solaris +++ b/scripts/makefile.solaris @@ -1,5 +1,5 @@ # makefile for libpng on Solaris 2.x with gcc -# Contributed by William L.
Sebok, based on makefile.lnx +# Contributed by William L. Sebok, based on makefile.linux # Copyright (C) 1996, 1997 Andreas Dilger # Copyright (C) 1998 Greg Roelofs # For conditions of distribution and use, see copyright notice in png.h @@ -36,7 +36,7 @@ RANLIB=echo # read libpng.txt or png.h to see why PNGMAJ is 2. You should not # have to change it. PNGMAJ = 2 -PNGMIN = 1.0.4 +PNGMIN = 1.0.4c PNGVER = $(PNGMAJ).$(PNGMIN) INCPATH=$(prefix)/include diff --git a/scripts/makefile.turboc3 b/scripts/makefile.turboc3 index c925831d4..f9a2269d2 100644 --- a/scripts/makefile.turboc3 +++ b/scripts/makefile.turboc3 @@ -1,7 +1,7 @@ # Makefile for libpng # TurboC++ 3.0 (Note: All modules are compiled in C mode) -# To use, do "make -fmakefile.tc3" +# To use, do "make -fmakefile.turboc3" # ------------- Turbo C++ 3.0 ------------- MODEL=-ml diff --git a/scripts/makefile.win32vc b/scripts/makefile.vcawin32 similarity index 86% rename from scripts/makefile.win32vc rename to scripts/makefile.vcawin32 index 52934c34a..be7fcc8a2 100644 --- a/scripts/makefile.win32vc +++ b/scripts/makefile.vcawin32 @@ -2,9 +2,15 @@ # Copyright (C) 1998 Tim Wegner # For conditions of distribution and use, see copyright notice in png.h # Assumes that zlib.lib, zconf.h, and zlib.h have been copied to ..\zlib -# To use, do "nmake /f scripts\makefile.w32" +# To use, do "nmake /f scripts\makefile.vcawin32" + +# ---------- Microsoft Visual C++ 5.0 and later, uses assembler code------ + +# Caution: the assembler code was introduced at libpng version 1.0.4 and has +# not yet been thoroughly tested. + +# If you don't want to use assembler code, use makefile.vcwin32 instead. 
-# ------------- Microsoft Visual C++ 4.0 and later ------------- MODEL=- CFLAGS=-DPNG_USE_PNGVCRD -Ox -GA3s -nologo -W3 -I..\zlib diff --git a/scripts/makefile.vcwin32 b/scripts/makefile.vcwin32 new file mode 100644 index 000000000..5b62fc316 --- /dev/null +++ b/scripts/makefile.vcwin32 @@ -0,0 +1,87 @@ +# makefile for libpng +# Copyright (C) 1998 Tim Wegner +# For conditions of distribution and use, see copyright notice in png.h +# Assumes that zlib.lib, zconf.h, and zlib.h have been copied to ..\zlib +# To use, do "nmake /f scripts\makefile.vcwin32" + +# ---------- Microsoft Visual C++ 4.0 and later, no assembler code------ +# If you want to use assembler code, use makefile.vcawin32 instead. + +MODEL=- +CFLAGS= -Ox -GA3s -nologo -W3 -I..\zlib + +CC=cl +LD=link +LDFLAGS= +O=.obj + +#uncomment next to put error messages in a file +#ERRFILE= >> pngerrs + +# variables +OBJS1 = png$(O) pngset$(O) pngget$(O) pngrutil$(O) pngtrans$(O) pngwutil$(O) +OBJS2 = pngmem$(O) pngpread$(O) pngread$(O) pngerror$(O) pngwrite$(O) +OBJS3 = pngrtran$(O) pngwtran$(O) pngrio$(O) pngwio$(O) + +all: libpng.lib + +png$(O): png.h pngconf.h + $(CC) -c $(CFLAGS) $*.c $(ERRFILE) + +pngset$(O): png.h pngconf.h + $(CC) -c $(CFLAGS) $*.c $(ERRFILE) + +pngget$(O): png.h pngconf.h + $(CC) -c $(CFLAGS) $*.c $(ERRFILE) + +pngread$(O): png.h pngconf.h + $(CC) -c $(CFLAGS) $*.c $(ERRFILE) + +pngpread$(O): png.h pngconf.h + $(CC) -c $(CFLAGS) $*.c $(ERRFILE) + +pngrtran$(O): png.h pngconf.h + $(CC) -c $(CFLAGS) $*.c $(ERRFILE) + +pngrutil$(O): png.h pngconf.h pngasmrd.h + $(CC) -c $(CFLAGS) $*.c $(ERRFILE) + +pngerror$(O): png.h pngconf.h + $(CC) -c $(CFLAGS) $*.c $(ERRFILE) + +pngmem$(O): png.h pngconf.h + $(CC) -c $(CFLAGS) $*.c $(ERRFILE) + +pngrio$(O): png.h pngconf.h + $(CC) -c $(CFLAGS) $*.c $(ERRFILE) + +pngwio$(O): png.h pngconf.h + $(CC) -c $(CFLAGS) $*.c $(ERRFILE) + +pngtest$(O): png.h pngconf.h + $(CC) -c $(CFLAGS) $*.c $(ERRFILE) + +pngtrans$(O): png.h pngconf.h + $(CC) -c $(CFLAGS) $*.c 
$(ERRFILE) + +pngwrite$(O): png.h pngconf.h + $(CC) -c $(CFLAGS) $*.c $(ERRFILE) + +pngwtran$(O): png.h pngconf.h + $(CC) -c $(CFLAGS) $*.c $(ERRFILE) + +pngwutil$(O): png.h pngconf.h + $(CC) -c $(CFLAGS) $*.c $(ERRFILE) + +libpng.lib: $(OBJS1) $(OBJS2) $(OBJS3) + del libpng.lib + lib /OUT:libpng.lib $(OBJS1) $(OBJS2) $(OBJS3) + +pngtest.exe: pngtest.obj libpng.lib + $(LD) $(LDFLAGS) pngtest.obj libpng.lib ..\zlib\zlib.lib /OUT:pngtest.exe /SUBSYSTEM:CONSOLE + +test: pngtest.exe + pngtest + +# End of makefile for libpng + diff --git a/scripts/makefile.watcom b/scripts/makefile.watcom index a7d99c224..e14f162ac 100644 --- a/scripts/makefile.watcom +++ b/scripts/makefile.watcom @@ -5,7 +5,7 @@ # For conditions of distribution and use, see copyright notice in png.h # Assumes that zlib.lib, zconf.h, and zlib.h have been copied to ..\zlib -# To use, do "wmake /f scripts\makefile.wat" +# To use, do "wmake /f scripts\makefile.watcom" # ------------- Watcom 10.0 and later ------------- MODEL=-mf diff --git a/scripts/pngdef.pas b/scripts/pngdef.pas index 94e859acf..1441808a1 100644 --- a/scripts/pngdef.pas +++ b/scripts/pngdef.pas @@ -3,8 +3,8 @@ unit pngdef; interface const - PNG_LIBPNG_VER_STRING = '1.0.4'; - PNG_LIBPNG_VER = 10004; + PNG_LIBPNG_VER_STRING = '1.0.4c'; + PNG_LIBPNG_VER = 10005; type png_uint_32 = Cardinal;