diff --git a/cderror.h b/cderror.h index e19c475c5..6ccb37c3e 100644 --- a/cderror.h +++ b/cderror.h @@ -33,7 +33,7 @@ typedef enum { -#define JMESSAGE(code,string) code , +#define JMESSAGE(code,string) code , #endif /* JMAKE_ENUM_LIST */ @@ -62,7 +62,7 @@ JMESSAGE(JERR_GIF_IMAGENOTFOUND, "Too few images in GIF file") JMESSAGE(JERR_GIF_NOT, "Not a GIF file") JMESSAGE(JTRC_GIF, "%ux%ux%d GIF image") JMESSAGE(JTRC_GIF_BADVERSION, - "Warning: unexpected GIF version number '%c%c%c'") + "Warning: unexpected GIF version number '%c%c%c'") JMESSAGE(JTRC_GIF_EXTENSION, "Ignoring GIF extension block of type 0x%02x") JMESSAGE(JTRC_GIF_NONSQUARE, "Caution: nonsquare pixels in input") JMESSAGE(JWRN_GIF_BADDATA, "Corrupt data in GIF file") @@ -110,13 +110,13 @@ JMESSAGE(JERR_TGA_NOTCOMP, "Targa support was not compiled") #endif /* TARGA_SUPPORTED */ JMESSAGE(JERR_BAD_CMAP_FILE, - "Color map file is invalid or of unsupported format") + "Color map file is invalid or of unsupported format") JMESSAGE(JERR_TOO_MANY_COLORS, - "Output file format cannot handle %d colormap entries") + "Output file format cannot handle %d colormap entries") JMESSAGE(JERR_UNGETC_FAILED, "ungetc failed") #ifdef TARGA_SUPPORTED JMESSAGE(JERR_UNKNOWN_FORMAT, - "Unrecognized input file format --- perhaps you need -targa") + "Unrecognized input file format --- perhaps you need -targa") #else JMESSAGE(JERR_UNKNOWN_FORMAT, "Unrecognized input file format") #endif diff --git a/cdjpeg.c b/cdjpeg.c index b6250ff97..17f1de7d7 100644 --- a/cdjpeg.c +++ b/cdjpeg.c @@ -9,15 +9,15 @@ * programs (cjpeg, djpeg, jpegtran). 
*/ -#include "cdjpeg.h" /* Common decls for cjpeg/djpeg applications */ -#include <ctype.h> /* to declare isupper(), tolower() */ +#include "cdjpeg.h" /* Common decls for cjpeg/djpeg applications */ +#include <ctype.h> /* to declare isupper(), tolower() */ #ifdef NEED_SIGNAL_CATCHER -#include <signal.h> /* to declare signal() */ +#include <signal.h> /* to declare signal() */ #endif #ifdef USE_SETMODE -#include <fcntl.h> /* to declare setmode()'s parameter macros */ +#include <fcntl.h> /* to declare setmode()'s parameter macros */ /* If you have setmode() but not <io.h>, just delete this line: */ -#include <io.h> /* to declare setmode() */ +#include <io.h> /* to declare setmode() */ #endif @@ -31,13 +31,13 @@ static j_common_ptr sig_cinfo; -void /* must be global for Manx C */ +void /* must be global for Manx C */ signal_catcher (int signum) { if (sig_cinfo != NULL) { if (sig_cinfo->err != NULL) /* turn off trace output */ sig_cinfo->err->trace_level = 0; - jpeg_destroy(sig_cinfo); /* clean up memory allocation & temp files */ + jpeg_destroy(sig_cinfo); /* clean up memory allocation & temp files */ } exit(EXIT_FAILURE); } @@ -47,10 +47,10 @@ GLOBAL(void) enable_signal_catcher (j_common_ptr cinfo) { sig_cinfo = cinfo; -#ifdef SIGINT /* not all systems have SIGINT */ +#ifdef SIGINT /* not all systems have SIGINT */ signal(SIGINT, signal_catcher); #endif -#ifdef SIGTERM /* not all systems have SIGTERM */ +#ifdef SIGTERM /* not all systems have SIGTERM */ signal(SIGTERM, signal_catcher); #endif } @@ -75,8 +75,8 @@ progress_monitor (j_common_ptr cinfo) prog->percent_done = percent_done; if (total_passes > 1) { fprintf(stderr, "\rPass %d/%d: %3d%% ", - prog->pub.completed_passes + prog->completed_extra_passes + 1, - total_passes, percent_done); + prog->pub.completed_passes + prog->completed_extra_passes + 1, + total_passes, percent_done); } else { fprintf(stderr, "\r %3d%% ", percent_done); } @@ -126,17 +126,17 @@ keymatch (char * arg, const char * keyword, int minchars) while ((ca = *arg++) != '\0') { if ((ck = *keyword++) == '\0') - return FALSE; /* 
arg longer than keyword, no good */ - if (isupper(ca)) /* force arg to lcase (assume ck is already) */ + return FALSE; /* arg longer than keyword, no good */ + if (isupper(ca)) /* force arg to lcase (assume ck is already) */ ca = tolower(ca); if (ca != ck) - return FALSE; /* no good */ - nmatched++; /* count matched characters */ + return FALSE; /* no good */ + nmatched++; /* count matched characters */ } /* reached end of argument; fail if it's too short for unique abbrev */ if (nmatched < minchars) return FALSE; - return TRUE; /* A-OK */ + return TRUE; /* A-OK */ } @@ -150,10 +150,10 @@ read_stdin (void) { FILE * input_file = stdin; -#ifdef USE_SETMODE /* need to hack file mode? */ +#ifdef USE_SETMODE /* need to hack file mode? */ setmode(fileno(stdin), O_BINARY); #endif -#ifdef USE_FDOPEN /* need to re-open in binary mode? */ +#ifdef USE_FDOPEN /* need to re-open in binary mode? */ if ((input_file = fdopen(fileno(stdin), READ_BINARY)) == NULL) { fprintf(stderr, "Cannot reopen stdin\n"); exit(EXIT_FAILURE); @@ -168,10 +168,10 @@ write_stdout (void) { FILE * output_file = stdout; -#ifdef USE_SETMODE /* need to hack file mode? */ +#ifdef USE_SETMODE /* need to hack file mode? */ setmode(fileno(stdout), O_BINARY); #endif -#ifdef USE_FDOPEN /* need to re-open in binary mode? */ +#ifdef USE_FDOPEN /* need to re-open in binary mode? */ if ((output_file = fdopen(fileno(stdout), WRITE_BINARY)) == NULL) { fprintf(stderr, "Cannot reopen stdout\n"); exit(EXIT_FAILURE); diff --git a/cdjpeg.h b/cdjpeg.h index ed024ac3a..0a8f197dd 100644 --- a/cdjpeg.h +++ b/cdjpeg.h @@ -9,12 +9,12 @@ * cjpeg and djpeg. It is NOT used by the core JPEG library. 
*/ -#define JPEG_CJPEG_DJPEG /* define proper options in jconfig.h */ -#define JPEG_INTERNAL_OPTIONS /* cjpeg.c,djpeg.c need to see xxx_SUPPORTED */ +#define JPEG_CJPEG_DJPEG /* define proper options in jconfig.h */ +#define JPEG_INTERNAL_OPTIONS /* cjpeg.c,djpeg.c need to see xxx_SUPPORTED */ #include "jinclude.h" #include "jpeglib.h" -#include "jerror.h" /* get library error codes too */ -#include "cderror.h" /* get application-specific error codes */ +#include "jerror.h" /* get library error codes too */ +#include "cderror.h" /* get application-specific error codes */ /* @@ -25,11 +25,11 @@ typedef struct cjpeg_source_struct * cjpeg_source_ptr; struct cjpeg_source_struct { JMETHOD(void, start_input, (j_compress_ptr cinfo, - cjpeg_source_ptr sinfo)); + cjpeg_source_ptr sinfo)); JMETHOD(JDIMENSION, get_pixel_rows, (j_compress_ptr cinfo, - cjpeg_source_ptr sinfo)); + cjpeg_source_ptr sinfo)); JMETHOD(void, finish_input, (j_compress_ptr cinfo, - cjpeg_source_ptr sinfo)); + cjpeg_source_ptr sinfo)); FILE *input_file; @@ -49,14 +49,14 @@ struct djpeg_dest_struct { * The color map will be ready at this time, if one is needed. */ JMETHOD(void, start_output, (j_decompress_ptr cinfo, - djpeg_dest_ptr dinfo)); + djpeg_dest_ptr dinfo)); /* Emit the specified number of pixel rows from the buffer. */ JMETHOD(void, put_pixel_rows, (j_decompress_ptr cinfo, - djpeg_dest_ptr dinfo, - JDIMENSION rows_supplied)); + djpeg_dest_ptr dinfo, + JDIMENSION rows_supplied)); /* Finish up at the end of the image. */ JMETHOD(void, finish_output, (j_decompress_ptr cinfo, - djpeg_dest_ptr dinfo)); + djpeg_dest_ptr dinfo)); /* Target file spec; filled in by djpeg.c after object is created. 
*/ FILE * output_file; @@ -79,9 +79,9 @@ struct djpeg_dest_struct { */ struct cdjpeg_progress_mgr { - struct jpeg_progress_mgr pub; /* fields known to JPEG library */ - int completed_extra_passes; /* extra passes completed */ - int total_extra_passes; /* total extra */ + struct jpeg_progress_mgr pub; /* fields known to JPEG library */ + int completed_extra_passes; /* extra passes completed */ + int total_extra_passes; /* total extra */ /* last printed percentage stored here to avoid multiple printouts */ int percent_done; }; @@ -92,34 +92,34 @@ typedef struct cdjpeg_progress_mgr * cd_progress_ptr; /* Short forms of external names for systems with brain-damaged linkers. */ #ifdef NEED_SHORT_EXTERNAL_NAMES -#define jinit_read_bmp jIRdBMP -#define jinit_write_bmp jIWrBMP -#define jinit_read_gif jIRdGIF -#define jinit_write_gif jIWrGIF -#define jinit_read_ppm jIRdPPM -#define jinit_write_ppm jIWrPPM -#define jinit_read_rle jIRdRLE -#define jinit_write_rle jIWrRLE -#define jinit_read_targa jIRdTarga -#define jinit_write_targa jIWrTarga -#define read_quant_tables RdQTables -#define read_scan_script RdScnScript +#define jinit_read_bmp jIRdBMP +#define jinit_write_bmp jIWrBMP +#define jinit_read_gif jIRdGIF +#define jinit_write_gif jIWrGIF +#define jinit_read_ppm jIRdPPM +#define jinit_write_ppm jIWrPPM +#define jinit_read_rle jIRdRLE +#define jinit_write_rle jIWrRLE +#define jinit_read_targa jIRdTarga +#define jinit_write_targa jIWrTarga +#define read_quant_tables RdQTables +#define read_scan_script RdScnScript #define set_quality_ratings SetQRates -#define set_quant_slots SetQSlots -#define set_sample_factors SetSFacts -#define read_color_map RdCMap -#define enable_signal_catcher EnSigCatcher -#define start_progress_monitor StProgMon -#define end_progress_monitor EnProgMon -#define read_stdin RdStdin -#define write_stdout WrStdout +#define set_quant_slots SetQSlots +#define set_sample_factors SetSFacts +#define read_color_map RdCMap +#define enable_signal_catcher 
EnSigCatcher +#define start_progress_monitor StProgMon +#define end_progress_monitor EnProgMon +#define read_stdin RdStdin +#define write_stdout WrStdout #endif /* NEED_SHORT_EXTERNAL_NAMES */ /* Module selection routines for I/O modules. */ EXTERN(cjpeg_source_ptr) jinit_read_bmp JPP((j_compress_ptr cinfo)); EXTERN(djpeg_dest_ptr) jinit_write_bmp JPP((j_decompress_ptr cinfo, - boolean is_os2)); + boolean is_os2)); EXTERN(cjpeg_source_ptr) jinit_read_gif JPP((j_compress_ptr cinfo)); EXTERN(djpeg_dest_ptr) jinit_write_gif JPP((j_decompress_ptr cinfo)); EXTERN(cjpeg_source_ptr) jinit_read_ppm JPP((j_compress_ptr cinfo)); @@ -132,10 +132,10 @@ EXTERN(djpeg_dest_ptr) jinit_write_targa JPP((j_decompress_ptr cinfo)); /* cjpeg support routines (in rdswitch.c) */ EXTERN(boolean) read_quant_tables JPP((j_compress_ptr cinfo, char * filename, - boolean force_baseline)); + boolean force_baseline)); EXTERN(boolean) read_scan_script JPP((j_compress_ptr cinfo, char * filename)); EXTERN(boolean) set_quality_ratings JPP((j_compress_ptr cinfo, char *arg, - boolean force_baseline)); + boolean force_baseline)); EXTERN(boolean) set_quant_slots JPP((j_compress_ptr cinfo, char *arg)); EXTERN(boolean) set_sample_factors JPP((j_compress_ptr cinfo, char *arg)); @@ -147,7 +147,7 @@ EXTERN(void) read_color_map JPP((j_decompress_ptr cinfo, FILE * infile)); EXTERN(void) enable_signal_catcher JPP((j_common_ptr cinfo)); EXTERN(void) start_progress_monitor JPP((j_common_ptr cinfo, - cd_progress_ptr progress)); + cd_progress_ptr progress)); EXTERN(void) end_progress_monitor JPP((j_common_ptr cinfo)); EXTERN(boolean) keymatch JPP((char * arg, const char * keyword, int minchars)); EXTERN(FILE *) read_stdin JPP((void)); @@ -155,32 +155,32 @@ EXTERN(FILE *) write_stdout JPP((void)); /* miscellaneous useful macros */ -#ifdef DONT_USE_B_MODE /* define mode parameters for fopen() */ -#define READ_BINARY "r" -#define WRITE_BINARY "w" +#ifdef DONT_USE_B_MODE /* define mode parameters for fopen() */ +#define 
READ_BINARY "r" +#define WRITE_BINARY "w" #else -#ifdef VMS /* VMS is very nonstandard */ -#define READ_BINARY "rb", "ctx=stm" -#define WRITE_BINARY "wb", "ctx=stm" -#else /* standard ANSI-compliant case */ -#define READ_BINARY "rb" -#define WRITE_BINARY "wb" +#ifdef VMS /* VMS is very nonstandard */ +#define READ_BINARY "rb", "ctx=stm" +#define WRITE_BINARY "wb", "ctx=stm" +#else /* standard ANSI-compliant case */ +#define READ_BINARY "rb" +#define WRITE_BINARY "wb" #endif #endif -#ifndef EXIT_FAILURE /* define exit() codes if not provided */ +#ifndef EXIT_FAILURE /* define exit() codes if not provided */ #define EXIT_FAILURE 1 #endif #ifndef EXIT_SUCCESS #ifdef VMS -#define EXIT_SUCCESS 1 /* VMS is very nonstandard */ +#define EXIT_SUCCESS 1 /* VMS is very nonstandard */ #else #define EXIT_SUCCESS 0 #endif #endif #ifndef EXIT_WARNING #ifdef VMS -#define EXIT_WARNING 1 /* VMS is very nonstandard */ +#define EXIT_WARNING 1 /* VMS is very nonstandard */ #else #define EXIT_WARNING 2 #endif diff --git a/cjpeg.c b/cjpeg.c index 7f0381988..4429c49d3 100644 --- a/cjpeg.c +++ b/cjpeg.c @@ -13,8 +13,8 @@ * * Two different command line styles are permitted, depending on the * compile-time switch TWO_FILE_COMMANDLINE: - * cjpeg [options] inputfile outputfile - * cjpeg [options] [inputfile] + * cjpeg [options] inputfile outputfile + * cjpeg [options] [inputfile] * In the second style, output is always to standard output, which you'd * normally redirect to a file or pipe to some other program. Input is * either from a named file or from standard input (typically redirected). @@ -22,28 +22,28 @@ * don't support pipes. Also, you MUST use the first style if your system * doesn't do binary I/O to stdin/stdout. * To simplify script writing, the "-outfile" switch is provided. The syntax - * cjpeg [options] -outfile outputfile inputfile + * cjpeg [options] -outfile outputfile inputfile * works regardless of which command line style is used. 
*/ -#include "cdjpeg.h" /* Common decls for cjpeg/djpeg applications */ -#include "jversion.h" /* for version message */ +#include "cdjpeg.h" /* Common decls for cjpeg/djpeg applications */ +#include "jversion.h" /* for version message */ #include "jconfigint.h" -#ifdef USE_CCOMMAND /* command-line reader for Macintosh */ +#ifdef USE_CCOMMAND /* command-line reader for Macintosh */ #ifdef __MWERKS__ #include <SIOUX.h> /* Metrowerks needs this */ -#include <console.h> /* ... and this */ +#include <console.h> /* ... and this */ #endif #ifdef THINK_C -#include <console.h> /* Think declares it here */ +#include <console.h> /* Think declares it here */ #endif #endif /* Create the add-on message string table. */ -#define JMESSAGE(code,string) string , +#define JMESSAGE(code,string) string , static const char * const cdjpeg_message_table[] = { #include "cderror.h" @@ -77,7 +77,7 @@ static const char * const cdjpeg_message_table[] = { * seldom-used ID field), so we provide a switch to force Targa input mode. */ -static boolean is_targa; /* records user -targa switch */ +static boolean is_targa; /* records user -targa switch */ LOCAL(cjpeg_source_ptr) @@ -124,7 +124,7 @@ select_file_type (j_compress_ptr cinfo, FILE * infile) break; } - return NULL; /* suppress compiler warnings */ + return NULL; /* suppress compiler warnings */ } @@ -137,8 +137,8 @@ select_file_type (j_compress_ptr cinfo, FILE * infile) */ -static const char * progname; /* program name for error messages */ -static char * outfilename; /* for -outfile switch */ +static const char * progname; /* program name for error messages */ +static char * outfilename; /* for -outfile switch */ boolean memdst; /* for -memdst switch */ @@ -172,15 +172,15 @@ usage (void) #endif #ifdef DCT_ISLOW_SUPPORTED fprintf(stderr, " -dct int Use integer DCT method%s\n", - (JDCT_DEFAULT == JDCT_ISLOW ? " (default)" : "")); + (JDCT_DEFAULT == JDCT_ISLOW ? 
" (default)" : "")); #endif #ifdef DCT_IFAST_SUPPORTED fprintf(stderr, " -dct fast Use fast integer DCT (less accurate)%s\n", - (JDCT_DEFAULT == JDCT_IFAST ? " (default)" : "")); + (JDCT_DEFAULT == JDCT_IFAST ? " (default)" : "")); #endif #ifdef DCT_FLOAT_SUPPORTED fprintf(stderr, " -dct float Use floating-point DCT method%s\n", - (JDCT_DEFAULT == JDCT_FLOAT ? " (default)" : "")); + (JDCT_DEFAULT == JDCT_FLOAT ? " (default)" : "")); #endif fprintf(stderr, " -restart N Set restart interval in rows, or in blocks with B\n"); #ifdef INPUT_SMOOTHING_SUPPORTED @@ -206,7 +206,7 @@ usage (void) LOCAL(int) parse_switches (j_compress_ptr cinfo, int argc, char **argv, - int last_file_arg_seen, boolean for_real) + int last_file_arg_seen, boolean for_real) /* Parse optional switches. * Returns argv[] index of first file-name argument (== argc if none). * Any file names with indexes <= last_file_arg_seen are ignored; @@ -220,15 +220,15 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv, char * arg; boolean force_baseline; boolean simple_progressive; - char * qualityarg = NULL; /* saves -quality parm if any */ - char * qtablefile = NULL; /* saves -qtables filename if any */ - char * qslotsarg = NULL; /* saves -qslots parm if any */ - char * samplearg = NULL; /* saves -sample parm if any */ - char * scansarg = NULL; /* saves -scans parm if any */ + char * qualityarg = NULL; /* saves -quality parm if any */ + char * qtablefile = NULL; /* saves -qtables filename if any */ + char * qslotsarg = NULL; /* saves -qslots parm if any */ + char * samplearg = NULL; /* saves -sample parm if any */ + char * scansarg = NULL; /* saves -scans parm if any */ /* Set up default JPEG parameters. 
*/ - force_baseline = FALSE; /* by default, allow 16-bit quantizers */ + force_baseline = FALSE; /* by default, allow 16-bit quantizers */ simple_progressive = FALSE; is_targa = FALSE; outfilename = NULL; @@ -242,12 +242,12 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv, if (*arg != '-') { /* Not a switch, must be a file name argument */ if (argn <= last_file_arg_seen) { - outfilename = NULL; /* -outfile applies to just one input file */ - continue; /* ignore this name if previously processed */ + outfilename = NULL; /* -outfile applies to just one input file */ + continue; /* ignore this name if previously processed */ } - break; /* else done parsing switches */ + break; /* else done parsing switches */ } - arg++; /* advance past switch marker character */ + arg++; /* advance past switch marker character */ if (keymatch(arg, "arithmetic", 1)) { /* Use arithmetic coding. */ @@ -255,7 +255,7 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv, cinfo->arith_code = TRUE; #else fprintf(stderr, "%s: sorry, arithmetic coding not supported\n", - progname); + progname); exit(EXIT_FAILURE); #endif @@ -265,16 +265,16 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv, } else if (keymatch(arg, "dct", 2)) { /* Select DCT algorithm. */ - if (++argn >= argc) /* advance to next argument */ - usage(); + if (++argn >= argc) /* advance to next argument */ + usage(); if (keymatch(argv[argn], "int", 1)) { - cinfo->dct_method = JDCT_ISLOW; + cinfo->dct_method = JDCT_ISLOW; } else if (keymatch(argv[argn], "fast", 2)) { - cinfo->dct_method = JDCT_IFAST; + cinfo->dct_method = JDCT_IFAST; } else if (keymatch(argv[argn], "float", 2)) { - cinfo->dct_method = JDCT_FLOAT; + cinfo->dct_method = JDCT_FLOAT; } else - usage(); + usage(); } else if (keymatch(arg, "debug", 1) || keymatch(arg, "verbose", 1)) { /* Enable debug printouts. 
*/ @@ -282,12 +282,12 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv, static boolean printed_version = FALSE; if (! printed_version) { - fprintf(stderr, "%s version %s (build %s)\n", - PACKAGE_NAME, VERSION, BUILD); - fprintf(stderr, "%s\n\n", JCOPYRIGHT); - fprintf(stderr, "Emulating The Independent JPEG Group's software, version %s\n\n", - JVERSION); - printed_version = TRUE; + fprintf(stderr, "%s version %s (build %s)\n", + PACKAGE_NAME, VERSION, BUILD); + fprintf(stderr, "%s\n\n", JCOPYRIGHT); + fprintf(stderr, "Emulating The Independent JPEG Group's software, version %s\n\n", + JVERSION); + printed_version = TRUE; } cinfo->err->trace_level++; @@ -304,12 +304,12 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv, long lval; char ch = 'x'; - if (++argn >= argc) /* advance to next argument */ - usage(); + if (++argn >= argc) /* advance to next argument */ + usage(); if (sscanf(argv[argn], "%ld%c", &lval, &ch) < 1) - usage(); + usage(); if (ch == 'm' || ch == 'M') - lval *= 1000L; + lval *= 1000L; cinfo->mem->max_memory_to_use = lval * 1000L; } else if (keymatch(arg, "optimize", 1) || keymatch(arg, "optimise", 1)) { @@ -318,15 +318,15 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv, cinfo->optimize_coding = TRUE; #else fprintf(stderr, "%s: sorry, entropy optimization was not compiled in\n", - progname); + progname); exit(EXIT_FAILURE); #endif } else if (keymatch(arg, "outfile", 4)) { /* Set output file name. */ - if (++argn >= argc) /* advance to next argument */ - usage(); - outfilename = argv[argn]; /* save it away for later use */ + if (++argn >= argc) /* advance to next argument */ + usage(); + outfilename = argv[argn]; /* save it away for later use */ } else if (keymatch(arg, "progressive", 1)) { /* Select simple progressive mode. */ @@ -335,7 +335,7 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv, /* We must postpone execution until num_components is known. 
*/ #else fprintf(stderr, "%s: sorry, progressive output was not compiled in\n", - progname); + progname); exit(EXIT_FAILURE); #endif @@ -351,14 +351,14 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv, } else if (keymatch(arg, "quality", 1)) { /* Quality ratings (quantization table scaling factors). */ - if (++argn >= argc) /* advance to next argument */ - usage(); + if (++argn >= argc) /* advance to next argument */ + usage(); qualityarg = argv[argn]; } else if (keymatch(arg, "qslots", 2)) { /* Quantization table slot numbers. */ - if (++argn >= argc) /* advance to next argument */ - usage(); + if (++argn >= argc) /* advance to next argument */ + usage(); qslotsarg = argv[argn]; /* Must delay setting qslots until after we have processed any * colorspace-determining switches, since jpeg_set_colorspace sets @@ -367,8 +367,8 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv, } else if (keymatch(arg, "qtables", 2)) { /* Quantization tables fetched from file. */ - if (++argn >= argc) /* advance to next argument */ - usage(); + if (++argn >= argc) /* advance to next argument */ + usage(); qtablefile = argv[argn]; /* We postpone actually reading the file in case -quality comes later. 
*/ @@ -377,24 +377,24 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv, long lval; char ch = 'x'; - if (++argn >= argc) /* advance to next argument */ - usage(); + if (++argn >= argc) /* advance to next argument */ + usage(); if (sscanf(argv[argn], "%ld%c", &lval, &ch) < 1) - usage(); + usage(); if (lval < 0 || lval > 65535L) - usage(); + usage(); if (ch == 'b' || ch == 'B') { - cinfo->restart_interval = (unsigned int) lval; - cinfo->restart_in_rows = 0; /* else prior '-restart n' overrides me */ + cinfo->restart_interval = (unsigned int) lval; + cinfo->restart_in_rows = 0; /* else prior '-restart n' overrides me */ } else { - cinfo->restart_in_rows = (int) lval; - /* restart_interval will be computed during startup */ + cinfo->restart_in_rows = (int) lval; + /* restart_interval will be computed during startup */ } } else if (keymatch(arg, "sample", 2)) { /* Set sampling factors. */ - if (++argn >= argc) /* advance to next argument */ - usage(); + if (++argn >= argc) /* advance to next argument */ + usage(); samplearg = argv[argn]; /* Must delay setting sample factors until after we have processed any * colorspace-determining switches, since jpeg_set_colorspace sets @@ -404,13 +404,13 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv, } else if (keymatch(arg, "scans", 4)) { /* Set scan script. */ #ifdef C_MULTISCAN_FILES_SUPPORTED - if (++argn >= argc) /* advance to next argument */ - usage(); + if (++argn >= argc) /* advance to next argument */ + usage(); scansarg = argv[argn]; /* We must postpone reading the file in case -progressive appears. */ #else fprintf(stderr, "%s: sorry, multi-scan output was not compiled in\n", - progname); + progname); exit(EXIT_FAILURE); #endif @@ -418,12 +418,12 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv, /* Set input smoothing factor. 
*/ int val; - if (++argn >= argc) /* advance to next argument */ - usage(); + if (++argn >= argc) /* advance to next argument */ + usage(); if (sscanf(argv[argn], "%d", &val) != 1) - usage(); + usage(); if (val < 0 || val > 100) - usage(); + usage(); cinfo->smoothing_factor = val; } else if (keymatch(arg, "targa", 1)) { @@ -431,7 +431,7 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv, is_targa = TRUE; } else { - usage(); /* bogus switch */ + usage(); /* bogus switch */ } } @@ -441,35 +441,35 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv, /* Set quantization tables for selected quality. */ /* Some or all may be overridden if -qtables is present. */ - if (qualityarg != NULL) /* process -quality if it was present */ + if (qualityarg != NULL) /* process -quality if it was present */ if (! set_quality_ratings(cinfo, qualityarg, force_baseline)) - usage(); + usage(); - if (qtablefile != NULL) /* process -qtables if it was present */ + if (qtablefile != NULL) /* process -qtables if it was present */ if (! read_quant_tables(cinfo, qtablefile, force_baseline)) - usage(); + usage(); - if (qslotsarg != NULL) /* process -qslots if it was present */ + if (qslotsarg != NULL) /* process -qslots if it was present */ if (! set_quant_slots(cinfo, qslotsarg)) - usage(); + usage(); - if (samplearg != NULL) /* process -sample if it was present */ + if (samplearg != NULL) /* process -sample if it was present */ if (! set_sample_factors(cinfo, samplearg)) - usage(); + usage(); #ifdef C_PROGRESSIVE_SUPPORTED - if (simple_progressive) /* process -progressive; -scans can override */ + if (simple_progressive) /* process -progressive; -scans can override */ jpeg_simple_progression(cinfo); #endif #ifdef C_MULTISCAN_FILES_SUPPORTED - if (scansarg != NULL) /* process -scans if it was present */ + if (scansarg != NULL) /* process -scans if it was present */ if (! 
read_scan_script(cinfo, scansarg)) - usage(); + usage(); #endif } - return argn; /* return index of next arg (file name) */ + return argn; /* return index of next arg (file name) */ } @@ -500,7 +500,7 @@ main (int argc, char **argv) progname = argv[0]; if (progname == NULL || progname[0] == 0) - progname = "cjpeg"; /* in case C library doesn't provide it */ + progname = "cjpeg"; /* in case C library doesn't provide it */ /* Initialize the JPEG compression object with default error handling. */ cinfo.err = jpeg_std_error(&jerr); @@ -637,5 +637,5 @@ main (int argc, char **argv) /* All done. */ exit(jerr.num_warnings ? EXIT_WARNING : EXIT_SUCCESS); - return 0; /* suppress no-return-value warnings */ + return 0; /* suppress no-return-value warnings */ } diff --git a/coderules.txt b/coderules.txt index 357929fb4..ea8fcc088 100644 --- a/coderules.txt +++ b/coderules.txt @@ -21,11 +21,11 @@ Block comments should be laid out thusly: */ We indent statements in K&R style, e.g., - if (test) { - then-part; - } else { - else-part; - } + if (test) { + then-part; + } else { + else-part; + } with two spaces per indentation level. (This indentation convention is handled automatically by GNU Emacs and many other text editors.) @@ -57,16 +57,16 @@ keywords can be inserted for use in Windows DLLs.) ansi2knr does not transform method declarations (function pointers in structs). 
We handle these with a macro JMETHOD, defined as - #ifdef HAVE_PROTOTYPES - #define JMETHOD(type,methodname,arglist) type (*methodname) arglist - #else - #define JMETHOD(type,methodname,arglist) type (*methodname) () - #endif + #ifdef HAVE_PROTOTYPES + #define JMETHOD(type,methodname,arglist) type (*methodname) arglist + #else + #define JMETHOD(type,methodname,arglist) type (*methodname) () + #endif which is used like this: - struct function_pointers { - JMETHOD(void, init_entropy_encoder, (int somearg, jparms *jp)); - JMETHOD(void, term_entropy_encoder, (void)); - }; + struct function_pointers { + JMETHOD(void, init_entropy_encoder, (int somearg, jparms *jp)); + JMETHOD(void, term_entropy_encoder, (void)); + }; Note the set of parentheses surrounding the parameter list. A similar solution is used for forward and external function declarations diff --git a/djpeg.c b/djpeg.c index c1abb139f..7a2eaa07e 100644 --- a/djpeg.c +++ b/djpeg.c @@ -12,8 +12,8 @@ * * Two different command line styles are permitted, depending on the * compile-time switch TWO_FILE_COMMANDLINE: - * djpeg [options] inputfile outputfile - * djpeg [options] [inputfile] + * djpeg [options] inputfile outputfile + * djpeg [options] [inputfile] * In the second style, output is always to standard output, which you'd * normally redirect to a file or pipe to some other program. Input is * either from a named file or from standard input (typically redirected). @@ -21,30 +21,30 @@ * don't support pipes. Also, you MUST use the first style if your system * doesn't do binary I/O to stdin/stdout. * To simplify script writing, the "-outfile" switch is provided. The syntax - * djpeg [options] -outfile outputfile inputfile + * djpeg [options] -outfile outputfile inputfile * works regardless of which command line style is used. 
*/ -#include "cdjpeg.h" /* Common decls for cjpeg/djpeg applications */ -#include "jversion.h" /* for version message */ +#include "cdjpeg.h" /* Common decls for cjpeg/djpeg applications */ +#include "jversion.h" /* for version message */ #include "jconfigint.h" -#include <ctype.h> /* to declare isprint() */ +#include <ctype.h> /* to declare isprint() */ -#ifdef USE_CCOMMAND /* command-line reader for Macintosh */ +#ifdef USE_CCOMMAND /* command-line reader for Macintosh */ #ifdef __MWERKS__ #include <SIOUX.h> /* Metrowerks needs this */ -#include <console.h> /* ... and this */ +#include <console.h> /* ... and this */ #endif #ifdef THINK_C -#include <console.h> /* Think declares it here */ +#include <console.h> /* Think declares it here */ #endif #endif /* Create the add-on message string table. */ -#define JMESSAGE(code,string) string , +#define JMESSAGE(code,string) string , static const char * const cdjpeg_message_table[] = { #include "cderror.h" @@ -60,17 +60,17 @@ static const char * const cdjpeg_message_table[] = { */ typedef enum { - FMT_BMP, /* BMP format (Windows flavor) */ - FMT_GIF, /* GIF format */ - FMT_OS2, /* BMP format (OS/2 flavor) */ - FMT_PPM, /* PPM/PGM (PBMPLUS formats) */ - FMT_RLE, /* RLE format */ - FMT_TARGA, /* Targa format */ - FMT_TIFF /* TIFF format */ + FMT_BMP, /* BMP format (Windows flavor) */ + FMT_GIF, /* GIF format */ + FMT_OS2, /* BMP format (OS/2 flavor) */ + FMT_PPM, /* PPM/PGM (PBMPLUS formats) */ + FMT_RLE, /* RLE format */ + FMT_TARGA, /* Targa format */ + FMT_TIFF /* TIFF format */ } IMAGE_FORMATS; -#ifndef DEFAULT_FMT /* so can override from CFLAGS in Makefile */ -#define DEFAULT_FMT FMT_PPM +#ifndef DEFAULT_FMT /* so can override from CFLAGS in Makefile */ +#define DEFAULT_FMT FMT_PPM #endif static IMAGE_FORMATS requested_fmt; @@ -85,8 +85,8 @@ static IMAGE_FORMATS requested_fmt; */ -static const char * progname; /* program name for error messages */ -static char * outfilename; 
/* for -outfile switch */ boolean memsrc; /* for -memsrc switch */ #define INPUT_BUF_SIZE 4096 @@ -112,40 +112,40 @@ usage (void) #endif #ifdef BMP_SUPPORTED fprintf(stderr, " -bmp Select BMP output format (Windows style)%s\n", - (DEFAULT_FMT == FMT_BMP ? " (default)" : "")); + (DEFAULT_FMT == FMT_BMP ? " (default)" : "")); #endif #ifdef GIF_SUPPORTED fprintf(stderr, " -gif Select GIF output format%s\n", - (DEFAULT_FMT == FMT_GIF ? " (default)" : "")); + (DEFAULT_FMT == FMT_GIF ? " (default)" : "")); #endif #ifdef BMP_SUPPORTED fprintf(stderr, " -os2 Select BMP output format (OS/2 style)%s\n", - (DEFAULT_FMT == FMT_OS2 ? " (default)" : "")); + (DEFAULT_FMT == FMT_OS2 ? " (default)" : "")); #endif #ifdef PPM_SUPPORTED fprintf(stderr, " -pnm Select PBMPLUS (PPM/PGM) output format%s\n", - (DEFAULT_FMT == FMT_PPM ? " (default)" : "")); + (DEFAULT_FMT == FMT_PPM ? " (default)" : "")); #endif #ifdef RLE_SUPPORTED fprintf(stderr, " -rle Select Utah RLE output format%s\n", - (DEFAULT_FMT == FMT_RLE ? " (default)" : "")); + (DEFAULT_FMT == FMT_RLE ? " (default)" : "")); #endif #ifdef TARGA_SUPPORTED fprintf(stderr, " -targa Select Targa output format%s\n", - (DEFAULT_FMT == FMT_TARGA ? " (default)" : "")); + (DEFAULT_FMT == FMT_TARGA ? " (default)" : "")); #endif fprintf(stderr, "Switches for advanced users:\n"); #ifdef DCT_ISLOW_SUPPORTED fprintf(stderr, " -dct int Use integer DCT method%s\n", - (JDCT_DEFAULT == JDCT_ISLOW ? " (default)" : "")); + (JDCT_DEFAULT == JDCT_ISLOW ? " (default)" : "")); #endif #ifdef DCT_IFAST_SUPPORTED fprintf(stderr, " -dct fast Use fast integer DCT (less accurate)%s\n", - (JDCT_DEFAULT == JDCT_IFAST ? " (default)" : "")); + (JDCT_DEFAULT == JDCT_IFAST ? " (default)" : "")); #endif #ifdef DCT_FLOAT_SUPPORTED fprintf(stderr, " -dct float Use floating-point DCT method%s\n", - (JDCT_DEFAULT == JDCT_FLOAT ? " (default)" : "")); + (JDCT_DEFAULT == JDCT_FLOAT ? 
" (default)" : "")); #endif fprintf(stderr, " -dither fs Use F-S dithering (default)\n"); fprintf(stderr, " -dither none Don't use dithering in quantization\n"); @@ -170,7 +170,7 @@ usage (void) LOCAL(int) parse_switches (j_decompress_ptr cinfo, int argc, char **argv, - int last_file_arg_seen, boolean for_real) + int last_file_arg_seen, boolean for_real) /* Parse optional switches. * Returns argv[] index of first file-name argument (== argc if none). * Any file names with indexes <= last_file_arg_seen are ignored; @@ -184,7 +184,7 @@ parse_switches (j_decompress_ptr cinfo, int argc, char **argv, char * arg; /* Set up default JPEG parameters. */ - requested_fmt = DEFAULT_FMT; /* set default output file format */ + requested_fmt = DEFAULT_FMT; /* set default output file format */ outfilename = NULL; memsrc = FALSE; cinfo->err->trace_level = 0; @@ -196,54 +196,54 @@ parse_switches (j_decompress_ptr cinfo, int argc, char **argv, if (*arg != '-') { /* Not a switch, must be a file name argument */ if (argn <= last_file_arg_seen) { - outfilename = NULL; /* -outfile applies to just one input file */ - continue; /* ignore this name if previously processed */ + outfilename = NULL; /* -outfile applies to just one input file */ + continue; /* ignore this name if previously processed */ } - break; /* else done parsing switches */ + break; /* else done parsing switches */ } - arg++; /* advance past switch marker character */ + arg++; /* advance past switch marker character */ if (keymatch(arg, "bmp", 1)) { /* BMP output format. */ requested_fmt = FMT_BMP; } else if (keymatch(arg, "colors", 1) || keymatch(arg, "colours", 1) || - keymatch(arg, "quantize", 1) || keymatch(arg, "quantise", 1)) { + keymatch(arg, "quantize", 1) || keymatch(arg, "quantise", 1)) { /* Do color quantization. 
*/ int val; - if (++argn >= argc) /* advance to next argument */ - usage(); + if (++argn >= argc) /* advance to next argument */ + usage(); if (sscanf(argv[argn], "%d", &val) != 1) - usage(); + usage(); cinfo->desired_number_of_colors = val; cinfo->quantize_colors = TRUE; } else if (keymatch(arg, "dct", 2)) { /* Select IDCT algorithm. */ - if (++argn >= argc) /* advance to next argument */ - usage(); + if (++argn >= argc) /* advance to next argument */ + usage(); if (keymatch(argv[argn], "int", 1)) { - cinfo->dct_method = JDCT_ISLOW; + cinfo->dct_method = JDCT_ISLOW; } else if (keymatch(argv[argn], "fast", 2)) { - cinfo->dct_method = JDCT_IFAST; + cinfo->dct_method = JDCT_IFAST; } else if (keymatch(argv[argn], "float", 2)) { - cinfo->dct_method = JDCT_FLOAT; + cinfo->dct_method = JDCT_FLOAT; } else - usage(); + usage(); } else if (keymatch(arg, "dither", 2)) { /* Select dithering algorithm. */ - if (++argn >= argc) /* advance to next argument */ - usage(); + if (++argn >= argc) /* advance to next argument */ + usage(); if (keymatch(argv[argn], "fs", 2)) { - cinfo->dither_mode = JDITHER_FS; + cinfo->dither_mode = JDITHER_FS; } else if (keymatch(argv[argn], "none", 2)) { - cinfo->dither_mode = JDITHER_NONE; + cinfo->dither_mode = JDITHER_NONE; } else if (keymatch(argv[argn], "ordered", 2)) { - cinfo->dither_mode = JDITHER_ORDERED; + cinfo->dither_mode = JDITHER_ORDERED; } else - usage(); + usage(); } else if (keymatch(arg, "debug", 1) || keymatch(arg, "verbose", 1)) { /* Enable debug printouts. */ @@ -251,12 +251,12 @@ parse_switches (j_decompress_ptr cinfo, int argc, char **argv, static boolean printed_version = FALSE; if (! 
printed_version) { - fprintf(stderr, "%s version %s (build %s)\n", - PACKAGE_NAME, VERSION, BUILD); - fprintf(stderr, "%s\n\n", JCOPYRIGHT); - fprintf(stderr, "Emulating The Independent JPEG Group's software, version %s\n\n", - JVERSION); - printed_version = TRUE; + fprintf(stderr, "%s version %s (build %s)\n", + PACKAGE_NAME, VERSION, BUILD); + fprintf(stderr, "%s\n\n", JCOPYRIGHT); + fprintf(stderr, "Emulating The Independent JPEG Group's software, version %s\n\n", + JVERSION); + printed_version = TRUE; } cinfo->err->trace_level++; @@ -265,7 +265,7 @@ parse_switches (j_decompress_ptr cinfo, int argc, char **argv, cinfo->two_pass_quantize = FALSE; cinfo->dither_mode = JDITHER_ORDERED; if (! cinfo->quantize_colors) /* don't override an earlier -colors */ - cinfo->desired_number_of_colors = 216; + cinfo->desired_number_of_colors = 216; cinfo->dct_method = JDCT_FASTEST; cinfo->do_fancy_upsampling = FALSE; @@ -283,21 +283,21 @@ parse_switches (j_decompress_ptr cinfo, int argc, char **argv, } else if (keymatch(arg, "map", 3)) { /* Quantize to a color map taken from an input file. */ - if (++argn >= argc) /* advance to next argument */ - usage(); - if (for_real) { /* too expensive to do twice! */ -#ifdef QUANT_2PASS_SUPPORTED /* otherwise can't quantize to supplied map */ - FILE * mapfile; - - if ((mapfile = fopen(argv[argn], READ_BINARY)) == NULL) { - fprintf(stderr, "%s: can't open %s\n", progname, argv[argn]); - exit(EXIT_FAILURE); - } - read_color_map(cinfo, mapfile); - fclose(mapfile); - cinfo->quantize_colors = TRUE; + if (++argn >= argc) /* advance to next argument */ + usage(); + if (for_real) { /* too expensive to do twice! 
*/ +#ifdef QUANT_2PASS_SUPPORTED /* otherwise can't quantize to supplied map */ + FILE * mapfile; + + if ((mapfile = fopen(argv[argn], READ_BINARY)) == NULL) { + fprintf(stderr, "%s: can't open %s\n", progname, argv[argn]); + exit(EXIT_FAILURE); + } + read_color_map(cinfo, mapfile); + fclose(mapfile); + cinfo->quantize_colors = TRUE; #else - ERREXIT(cinfo, JERR_NOT_COMPILED); + ERREXIT(cinfo, JERR_NOT_COMPILED); #endif } @@ -306,12 +306,12 @@ parse_switches (j_decompress_ptr cinfo, int argc, char **argv, long lval; char ch = 'x'; - if (++argn >= argc) /* advance to next argument */ - usage(); + if (++argn >= argc) /* advance to next argument */ + usage(); if (sscanf(argv[argn], "%ld%c", &lval, &ch) < 1) - usage(); + usage(); if (ch == 'm' || ch == 'M') - lval *= 1000L; + lval *= 1000L; cinfo->mem->max_memory_to_use = lval * 1000L; } else if (keymatch(arg, "nosmooth", 3)) { @@ -328,9 +328,9 @@ parse_switches (j_decompress_ptr cinfo, int argc, char **argv, } else if (keymatch(arg, "outfile", 4)) { /* Set output file name. */ - if (++argn >= argc) /* advance to next argument */ - usage(); - outfilename = argv[argn]; /* save it away for later use */ + if (++argn >= argc) /* advance to next argument */ + usage(); + outfilename = argv[argn]; /* save it away for later use */ } else if (keymatch(arg, "memsrc", 2)) { /* Use in-memory source manager */ @@ -352,22 +352,22 @@ parse_switches (j_decompress_ptr cinfo, int argc, char **argv, } else if (keymatch(arg, "scale", 1)) { /* Scale the output image by a fraction M/N. */ - if (++argn >= argc) /* advance to next argument */ - usage(); + if (++argn >= argc) /* advance to next argument */ + usage(); if (sscanf(argv[argn], "%d/%d", - &cinfo->scale_num, &cinfo->scale_denom) != 2) - usage(); + &cinfo->scale_num, &cinfo->scale_denom) != 2) + usage(); } else if (keymatch(arg, "targa", 1)) { /* Targa output format. 
*/ requested_fmt = FMT_TARGA; } else { - usage(); /* bogus switch */ + usage(); /* bogus switch */ } } - return argn; /* return index of next arg (file name) */ + return argn; /* return index of next arg (file name) */ } @@ -403,14 +403,14 @@ print_text_marker (j_decompress_ptr cinfo) length = jpeg_getc(cinfo) << 8; length += jpeg_getc(cinfo); - length -= 2; /* discount the length word itself */ + length -= 2; /* discount the length word itself */ if (traceit) { if (cinfo->unread_marker == JPEG_COM) fprintf(stderr, "Comment, length %ld:\n", (long) length); - else /* assume it is an APPn otherwise */ + else /* assume it is an APPn otherwise */ fprintf(stderr, "APP%d, length %ld:\n", - cinfo->unread_marker - JPEG_APP0, (long) length); + cinfo->unread_marker - JPEG_APP0, (long) length); } while (--length >= 0) { @@ -422,16 +422,16 @@ print_text_marker (j_decompress_ptr cinfo) * Newlines in CR, CR/LF, or LF form will be printed as one newline. */ if (ch == '\r') { - fprintf(stderr, "\n"); + fprintf(stderr, "\n"); } else if (ch == '\n') { - if (lastch != '\r') - fprintf(stderr, "\n"); + if (lastch != '\r') + fprintf(stderr, "\n"); } else if (ch == '\\') { - fprintf(stderr, "\\\\"); + fprintf(stderr, "\\\\"); } else if (isprint(ch)) { - putc(ch, stderr); + putc(ch, stderr); } else { - fprintf(stderr, "\\%03o", ch); + fprintf(stderr, "\\%03o", ch); } lastch = ch; } @@ -471,7 +471,7 @@ main (int argc, char **argv) progname = argv[0]; if (progname == NULL || progname[0] == 0) - progname = "djpeg"; /* in case C library doesn't provide it */ + progname = "djpeg"; /* in case C library doesn't provide it */ /* Initialize the JPEG decompression object with default error handling. 
*/ cinfo.err = jpeg_std_error(&jerr); @@ -510,14 +510,14 @@ main (int argc, char **argv) if (outfilename == NULL) { if (file_index != argc-2) { fprintf(stderr, "%s: must name one input and one output file\n", - progname); + progname); usage(); } outfilename = argv[file_index+1]; } else { if (file_index != argc-1) { fprintf(stderr, "%s: must name one input and one output file\n", - progname); + progname); usage(); } } @@ -634,7 +634,7 @@ main (int argc, char **argv) /* Process data */ while (cinfo.output_scanline < cinfo.output_height) { num_scanlines = jpeg_read_scanlines(&cinfo, dest_mgr->buffer, - dest_mgr->buffer_height); + dest_mgr->buffer_height); (*dest_mgr->put_pixel_rows) (&cinfo, dest_mgr, num_scanlines); } @@ -668,5 +668,5 @@ main (int argc, char **argv) /* All done. */ exit(jerr.num_warnings ? EXIT_WARNING : EXIT_SUCCESS); - return 0; /* suppress no-return-value warnings */ + return 0; /* suppress no-return-value warnings */ } diff --git a/example.c b/example.c index 1d6f6cc30..0a65a6ccd 100644 --- a/example.c +++ b/example.c @@ -6,7 +6,7 @@ * conjunction with the documentation file libjpeg.txt. * * This code will not do anything useful as-is, but it may be helpful as a - * skeleton for constructing routines that call the JPEG library. + * skeleton for constructing routines that call the JPEG library. 
* * We present these routines in the same coding style used in the JPEG code * (ANSI function definitions, etc); but you are of course free to code your @@ -58,9 +58,9 @@ * RGB color and is described by: */ -extern JSAMPLE * image_buffer; /* Points to large array of R,G,B-order data */ -extern int image_height; /* Number of rows in image */ -extern int image_width; /* Number of columns in image */ +extern JSAMPLE * image_buffer; /* Points to large array of R,G,B-order data */ +extern int image_height; /* Number of rows in image */ +extern int image_width; /* Number of columns in image */ /* @@ -88,9 +88,9 @@ write_JPEG_file (char * filename, int quality) */ struct jpeg_error_mgr jerr; /* More stuff */ - FILE * outfile; /* target file */ - JSAMPROW row_pointer[1]; /* pointer to JSAMPLE row[s] */ - int row_stride; /* physical row width in image buffer */ + FILE * outfile; /* target file */ + JSAMPROW row_pointer[1]; /* pointer to JSAMPLE row[s] */ + int row_stride; /* physical row width in image buffer */ /* Step 1: allocate and initialize JPEG compression object */ @@ -122,10 +122,10 @@ write_JPEG_file (char * filename, int quality) /* First we supply a description of the input image. * Four fields of the cinfo struct must be filled in: */ - cinfo.image_width = image_width; /* image width and height, in pixels */ + cinfo.image_width = image_width; /* image width and height, in pixels */ cinfo.image_height = image_height; - cinfo.input_components = 3; /* # of color components per pixel */ - cinfo.in_color_space = JCS_RGB; /* colorspace of input image */ + cinfo.input_components = 3; /* # of color components per pixel */ + cinfo.in_color_space = JCS_RGB; /* colorspace of input image */ /* Now use the library's routine to set default compression parameters. * (You must set at least cinfo.in_color_space before calling this, * since the defaults depend on the source color space.) 
@@ -151,7 +151,7 @@ write_JPEG_file (char * filename, int quality) * To keep things simple, we pass one scanline per call; you can pass * more if you wish, though. */ - row_stride = image_width * 3; /* JSAMPLEs per row in image_buffer */ + row_stride = image_width * 3; /* JSAMPLEs per row in image_buffer */ while (cinfo.next_scanline < cinfo.image_height) { /* jpeg_write_scanlines expects an array of pointers to scanlines. @@ -248,9 +248,9 @@ write_JPEG_file (char * filename, int quality) */ struct my_error_mgr { - struct jpeg_error_mgr pub; /* "public" fields */ + struct jpeg_error_mgr pub; /* "public" fields */ - jmp_buf setjmp_buffer; /* for return to caller */ + jmp_buf setjmp_buffer; /* for return to caller */ }; typedef struct my_error_mgr * my_error_ptr; @@ -293,9 +293,9 @@ read_JPEG_file (char * filename) */ struct my_error_mgr jerr; /* More stuff */ - FILE * infile; /* source file */ - JSAMPARRAY buffer; /* Output row buffer */ - int row_stride; /* physical row width in output buffer */ + FILE * infile; /* source file */ + JSAMPARRAY buffer; /* Output row buffer */ + int row_stride; /* physical row width in output buffer */ /* In this example we want to open the input file before doing anything else, * so that the setjmp() error recovery below can assume the file is open. @@ -356,12 +356,12 @@ read_JPEG_file (char * filename) * output image dimensions available, as well as the output colormap * if we asked for color quantization. * In this example, we need to make an output work buffer of the right size. 
- */ + */ /* JSAMPLEs per row in output buffer */ row_stride = cinfo.output_width * cinfo.output_components; /* Make a one-row-high sample array that will go away when done with image */ buffer = (*cinfo.mem->alloc_sarray) - ((j_common_ptr) &cinfo, JPOOL_IMAGE, row_stride, 1); + ((j_common_ptr) &cinfo, JPOOL_IMAGE, row_stride, 1); /* Step 6: while (scan lines remain to be read) */ /* jpeg_read_scanlines(...); */ diff --git a/jcapimin.c b/jcapimin.c index 20ba9e99b..601bb71da 100644 --- a/jcapimin.c +++ b/jcapimin.c @@ -33,12 +33,12 @@ jpeg_CreateCompress (j_compress_ptr cinfo, int version, size_t structsize) int i; /* Guard against version mismatches between library and caller. */ - cinfo->mem = NULL; /* so jpeg_destroy knows mem mgr not called */ + cinfo->mem = NULL; /* so jpeg_destroy knows mem mgr not called */ if (version != JPEG_LIB_VERSION) ERREXIT2(cinfo, JERR_BAD_LIB_VERSION, JPEG_LIB_VERSION, version); if (structsize != SIZEOF(struct jpeg_compress_struct)) - ERREXIT2(cinfo, JERR_BAD_STRUCT_SIZE, - (int) SIZEOF(struct jpeg_compress_struct), (int) structsize); + ERREXIT2(cinfo, JERR_BAD_STRUCT_SIZE, + (int) SIZEOF(struct jpeg_compress_struct), (int) structsize); /* For debugging purposes, we zero the whole master structure. 
* But the application has already set the err pointer, and may have set @@ -85,7 +85,7 @@ jpeg_CreateCompress (j_compress_ptr cinfo, int version, size_t structsize) cinfo->script_space = NULL; - cinfo->input_gamma = 1.0; /* in case application forgets */ + cinfo->input_gamma = 1.0; /* in case application forgets */ /* OK, I'm ready */ cinfo->global_state = CSTATE_START; @@ -173,15 +173,15 @@ jpeg_finish_compress (j_compress_ptr cinfo) (*cinfo->master->prepare_for_pass) (cinfo); for (iMCU_row = 0; iMCU_row < cinfo->total_iMCU_rows; iMCU_row++) { if (cinfo->progress != NULL) { - cinfo->progress->pass_counter = (long) iMCU_row; - cinfo->progress->pass_limit = (long) cinfo->total_iMCU_rows; - (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo); + cinfo->progress->pass_counter = (long) iMCU_row; + cinfo->progress->pass_limit = (long) cinfo->total_iMCU_rows; + (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo); } /* We bypass the main controller and invoke coef controller directly; * all work is being done from the coefficient buffer. */ if (! 
(*cinfo->coef->compress_data) (cinfo, (JSAMPIMAGE) NULL)) - ERREXIT(cinfo, JERR_CANT_SUSPEND); + ERREXIT(cinfo, JERR_CANT_SUSPEND); } (*cinfo->master->finish_pass) (cinfo); } @@ -202,7 +202,7 @@ jpeg_finish_compress (j_compress_ptr cinfo) GLOBAL(void) jpeg_write_marker (j_compress_ptr cinfo, int marker, - const JOCTET *dataptr, unsigned int datalen) + const JOCTET *dataptr, unsigned int datalen) { JMETHOD(void, write_marker_byte, (j_compress_ptr info, int val)); @@ -213,7 +213,7 @@ jpeg_write_marker (j_compress_ptr cinfo, int marker, ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state); (*cinfo->marker->write_marker_header) (cinfo, marker, datalen); - write_marker_byte = cinfo->marker->write_marker_byte; /* copy for speed */ + write_marker_byte = cinfo->marker->write_marker_byte; /* copy for speed */ while (datalen--) { (*write_marker_byte) (cinfo, *dataptr); dataptr++; @@ -248,14 +248,14 @@ jpeg_write_m_byte (j_compress_ptr cinfo, int val) * To produce a pair of files containing abbreviated tables and abbreviated * image data, one would proceed as follows: * - * initialize JPEG object - * set JPEG parameters - * set destination to table file - * jpeg_write_tables(cinfo); - * set destination to image file - * jpeg_start_compress(cinfo, FALSE); - * write data... - * jpeg_finish_compress(cinfo); + * initialize JPEG object + * set JPEG parameters + * set destination to table file + * jpeg_write_tables(cinfo); + * set destination to image file + * jpeg_start_compress(cinfo, FALSE); + * write data... + * jpeg_finish_compress(cinfo); * * jpeg_write_tables has the side effect of marking all tables written * (same as jpeg_suppress_tables(..., TRUE)). 
Thus a subsequent start_compress diff --git a/jcapistd.c b/jcapistd.c index c0320b1b1..167f0205a 100644 --- a/jcapistd.c +++ b/jcapistd.c @@ -41,7 +41,7 @@ jpeg_start_compress (j_compress_ptr cinfo, boolean write_all_tables) ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state); if (write_all_tables) - jpeg_suppress_tables(cinfo, FALSE); /* mark all tables to be written */ + jpeg_suppress_tables(cinfo, FALSE); /* mark all tables to be written */ /* (Re)initialize error mgr and destination modules */ (*cinfo->err->reset_error_mgr) ((j_common_ptr) cinfo); @@ -75,7 +75,7 @@ jpeg_start_compress (j_compress_ptr cinfo, boolean write_all_tables) GLOBAL(JDIMENSION) jpeg_write_scanlines (j_compress_ptr cinfo, JSAMPARRAY scanlines, - JDIMENSION num_lines) + JDIMENSION num_lines) { JDIMENSION row_ctr, rows_left; @@ -118,7 +118,7 @@ jpeg_write_scanlines (j_compress_ptr cinfo, JSAMPARRAY scanlines, GLOBAL(JDIMENSION) jpeg_write_raw_data (j_compress_ptr cinfo, JSAMPIMAGE data, - JDIMENSION num_lines) + JDIMENSION num_lines) { JDIMENSION lines_per_iMCU_row; diff --git a/jcarith.c b/jcarith.c index a9ca1c338..e8bb281c2 100644 --- a/jcarith.c +++ b/jcarith.c @@ -34,8 +34,8 @@ typedef struct { int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */ int dc_context[MAX_COMPS_IN_SCAN]; /* context index for DC conditioning */ - unsigned int restarts_to_go; /* MCUs left in this restart interval */ - int next_restart_num; /* next restart number to write (0-7) */ + unsigned int restarts_to_go; /* MCUs left in this restart interval */ + int next_restart_num; /* next restart number to write (0-7) */ /* Pointers to statistics areas (these workspaces have image lifespan) */ unsigned char * dc_stats[NUM_ARITH_TBLS]; @@ -101,14 +101,14 @@ typedef arith_entropy_encoder * arith_entropy_ptr; */ #ifdef RIGHT_SHIFT_IS_UNSIGNED -#define ISHIFT_TEMPS int ishift_temp; +#define ISHIFT_TEMPS int ishift_temp; #define IRIGHT_SHIFT(x,shft) \ - ((ishift_temp = (x)) < 0 ? 
\ - (ishift_temp >> (shft)) | ((~0) << (16-(shft))) : \ - (ishift_temp >> (shft))) + ((ishift_temp = (x)) < 0 ? \ + (ishift_temp >> (shft)) | ((~0) << (16-(shft))) : \ + (ishift_temp >> (shft))) #else #define ISHIFT_TEMPS -#define IRIGHT_SHIFT(x,shft) ((x) >> (shft)) +#define IRIGHT_SHIFT(x,shft) ((x) >> (shft)) #endif @@ -149,11 +149,11 @@ finish_pass (j_compress_ptr cinfo) /* One final overflow has to be handled */ if (e->buffer >= 0) { if (e->zc) - do emit_byte(0x00, cinfo); - while (--e->zc); + do emit_byte(0x00, cinfo); + while (--e->zc); emit_byte(e->buffer + 1, cinfo); if (e->buffer + 1 == 0xFF) - emit_byte(0x00, cinfo); + emit_byte(0x00, cinfo); } e->zc += e->sc; /* carry-over converts stacked 0xFF bytes to 0x00 */ e->sc = 0; @@ -162,17 +162,17 @@ finish_pass (j_compress_ptr cinfo) ++e->zc; else if (e->buffer >= 0) { if (e->zc) - do emit_byte(0x00, cinfo); - while (--e->zc); + do emit_byte(0x00, cinfo); + while (--e->zc); emit_byte(e->buffer, cinfo); } if (e->sc) { if (e->zc) - do emit_byte(0x00, cinfo); - while (--e->zc); + do emit_byte(0x00, cinfo); + while (--e->zc); do { - emit_byte(0xFF, cinfo); - emit_byte(0x00, cinfo); + emit_byte(0xFF, cinfo); + emit_byte(0x00, cinfo); } while (--e->sc); } } @@ -187,7 +187,7 @@ finish_pass (j_compress_ptr cinfo) if (e->c & 0x7F800L) { emit_byte((e->c >> 11) & 0xFF, cinfo); if (((e->c >> 11) & 0xFF) == 0xFF) - emit_byte(0x00, cinfo); + emit_byte(0x00, cinfo); } } } @@ -216,7 +216,7 @@ finish_pass (j_compress_ptr cinfo) */ LOCAL(void) -arith_encode (j_compress_ptr cinfo, unsigned char *st, int val) +arith_encode (j_compress_ptr cinfo, unsigned char *st, int val) { register arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy; register unsigned char nl, nm; @@ -227,9 +227,9 @@ arith_encode (j_compress_ptr cinfo, unsigned char *st, int val) * Qe values and probability estimation state machine */ sv = *st; - qe = jpeg_aritab[sv & 0x7F]; /* => Qe_Value */ - nl = qe & 0xFF; qe >>= 8; /* Next_Index_LPS + Switch_MPS */ - 
nm = qe & 0xFF; qe >>= 8; /* Next_Index_MPS */ + qe = jpeg_aritab[sv & 0x7F]; /* => Qe_Value */ + nl = qe & 0xFF; qe >>= 8; /* Next_Index_LPS + Switch_MPS */ + nm = qe & 0xFF; qe >>= 8; /* Next_Index_MPS */ /* Encode & estimation procedures per sections D.1.4 & D.1.5 */ e->a -= qe; @@ -243,7 +243,7 @@ arith_encode (j_compress_ptr cinfo, unsigned char *st, int val) e->c += e->a; e->a = qe; } - *st = (sv & 0x80) ^ nl; /* Estimate_after_LPS */ + *st = (sv & 0x80) ^ nl; /* Estimate_after_LPS */ } else { /* Encode the more probable symbol */ if (e->a >= 0x8000L) @@ -255,7 +255,7 @@ arith_encode (j_compress_ptr cinfo, unsigned char *st, int val) e->c += e->a; e->a = qe; } - *st = (sv & 0x80) ^ nm; /* Estimate_after_MPS */ + *st = (sv & 0x80) ^ nm; /* Estimate_after_MPS */ } /* Renormalization & data output per section D.1.6 */ @@ -266,43 +266,43 @@ arith_encode (j_compress_ptr cinfo, unsigned char *st, int val) /* Another byte is ready for output */ temp = e->c >> 19; if (temp > 0xFF) { - /* Handle overflow over all stacked 0xFF bytes */ - if (e->buffer >= 0) { - if (e->zc) - do emit_byte(0x00, cinfo); - while (--e->zc); - emit_byte(e->buffer + 1, cinfo); - if (e->buffer + 1 == 0xFF) - emit_byte(0x00, cinfo); - } - e->zc += e->sc; /* carry-over converts stacked 0xFF bytes to 0x00 */ - e->sc = 0; - /* Note: The 3 spacer bits in the C register guarantee - * that the new buffer byte can't be 0xFF here - * (see page 160 in the P&M JPEG book). */ - e->buffer = temp & 0xFF; /* new output byte, might overflow later */ + /* Handle overflow over all stacked 0xFF bytes */ + if (e->buffer >= 0) { + if (e->zc) + do emit_byte(0x00, cinfo); + while (--e->zc); + emit_byte(e->buffer + 1, cinfo); + if (e->buffer + 1 == 0xFF) + emit_byte(0x00, cinfo); + } + e->zc += e->sc; /* carry-over converts stacked 0xFF bytes to 0x00 */ + e->sc = 0; + /* Note: The 3 spacer bits in the C register guarantee + * that the new buffer byte can't be 0xFF here + * (see page 160 in the P&M JPEG book). 
*/ + e->buffer = temp & 0xFF; /* new output byte, might overflow later */ } else if (temp == 0xFF) { - ++e->sc; /* stack 0xFF byte (which might overflow later) */ + ++e->sc; /* stack 0xFF byte (which might overflow later) */ } else { - /* Output all stacked 0xFF bytes, they will not overflow any more */ - if (e->buffer == 0) - ++e->zc; - else if (e->buffer >= 0) { - if (e->zc) - do emit_byte(0x00, cinfo); - while (--e->zc); - emit_byte(e->buffer, cinfo); - } - if (e->sc) { - if (e->zc) - do emit_byte(0x00, cinfo); - while (--e->zc); - do { - emit_byte(0xFF, cinfo); - emit_byte(0x00, cinfo); - } while (--e->sc); - } - e->buffer = temp & 0xFF; /* new output byte (can still overflow) */ + /* Output all stacked 0xFF bytes, they will not overflow any more */ + if (e->buffer == 0) + ++e->zc; + else if (e->buffer >= 0) { + if (e->zc) + do emit_byte(0x00, cinfo); + while (--e->zc); + emit_byte(e->buffer, cinfo); + } + if (e->sc) { + if (e->zc) + do emit_byte(0x00, cinfo); + while (--e->zc); + do { + emit_byte(0xFF, cinfo); + emit_byte(0x00, cinfo); + } while (--e->sc); + } + e->buffer = temp & 0xFF; /* new output byte (can still overflow) */ } e->c &= 0x7FFFFL; e->ct += 8; @@ -398,45 +398,45 @@ encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data) /* Figure F.4: Encode_DC_DIFF */ if ((v = m - entropy->last_dc_val[ci]) == 0) { arith_encode(cinfo, st, 0); - entropy->dc_context[ci] = 0; /* zero diff category */ + entropy->dc_context[ci] = 0; /* zero diff category */ } else { entropy->last_dc_val[ci] = m; arith_encode(cinfo, st, 1); /* Figure F.6: Encoding nonzero value v */ /* Figure F.7: Encoding the sign of v */ if (v > 0) { - arith_encode(cinfo, st + 1, 0); /* Table F.4: SS = S0 + 1 */ - st += 2; /* Table F.4: SP = S0 + 2 */ - entropy->dc_context[ci] = 4; /* small positive diff category */ + arith_encode(cinfo, st + 1, 0); /* Table F.4: SS = S0 + 1 */ + st += 2; /* Table F.4: SP = S0 + 2 */ + entropy->dc_context[ci] = 4; /* small positive diff category */ } else 
{ - v = -v; - arith_encode(cinfo, st + 1, 1); /* Table F.4: SS = S0 + 1 */ - st += 3; /* Table F.4: SN = S0 + 3 */ - entropy->dc_context[ci] = 8; /* small negative diff category */ + v = -v; + arith_encode(cinfo, st + 1, 1); /* Table F.4: SS = S0 + 1 */ + st += 3; /* Table F.4: SN = S0 + 3 */ + entropy->dc_context[ci] = 8; /* small negative diff category */ } /* Figure F.8: Encoding the magnitude category of v */ m = 0; if (v -= 1) { - arith_encode(cinfo, st, 1); - m = 1; - v2 = v; - st = entropy->dc_stats[tbl] + 20; /* Table F.4: X1 = 20 */ - while (v2 >>= 1) { - arith_encode(cinfo, st, 1); - m <<= 1; - st += 1; - } + arith_encode(cinfo, st, 1); + m = 1; + v2 = v; + st = entropy->dc_stats[tbl] + 20; /* Table F.4: X1 = 20 */ + while (v2 >>= 1) { + arith_encode(cinfo, st, 1); + m <<= 1; + st += 1; + } } arith_encode(cinfo, st, 0); /* Section F.1.4.4.1.2: Establish dc_context conditioning category */ if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1)) - entropy->dc_context[ci] = 0; /* zero diff category */ + entropy->dc_context[ci] = 0; /* zero diff category */ else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1)) - entropy->dc_context[ci] += 8; /* large diff category */ + entropy->dc_context[ci] += 8; /* large diff category */ /* Figure F.9: Encoding the magnitude bit pattern of v */ st += 14; while (m >>= 1) - arith_encode(cinfo, st, (m & v) ? 1 : 0); + arith_encode(cinfo, st, (m & v) ? 
1 : 0); } } @@ -491,21 +491,21 @@ encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data) /* Figure F.5: Encode_AC_Coefficients */ for (k = cinfo->Ss; k <= ke; k++) { st = entropy->ac_stats[tbl] + 3 * (k - 1); - arith_encode(cinfo, st, 0); /* EOB decision */ + arith_encode(cinfo, st, 0); /* EOB decision */ for (;;) { if ((v = (*block)[jpeg_natural_order[k]]) >= 0) { - if (v >>= cinfo->Al) { - arith_encode(cinfo, st + 1, 1); - arith_encode(cinfo, entropy->fixed_bin, 0); - break; - } + if (v >>= cinfo->Al) { + arith_encode(cinfo, st + 1, 1); + arith_encode(cinfo, entropy->fixed_bin, 0); + break; + } } else { - v = -v; - if (v >>= cinfo->Al) { - arith_encode(cinfo, st + 1, 1); - arith_encode(cinfo, entropy->fixed_bin, 1); - break; - } + v = -v; + if (v >>= cinfo->Al) { + arith_encode(cinfo, st + 1, 1); + arith_encode(cinfo, entropy->fixed_bin, 1); + break; + } } arith_encode(cinfo, st + 1, 0); st += 3; k++; } @@ -517,15 +517,15 @@ encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data) m = 1; v2 = v; if (v2 >>= 1) { - arith_encode(cinfo, st, 1); - m <<= 1; - st = entropy->ac_stats[tbl] + - (k <= cinfo->arith_ac_K[tbl] ? 189 : 217); - while (v2 >>= 1) { - arith_encode(cinfo, st, 1); - m <<= 1; - st += 1; - } + arith_encode(cinfo, st, 1); + m <<= 1; + st = entropy->ac_stats[tbl] + + (k <= cinfo->arith_ac_K[tbl] ? 
189 : 217); + while (v2 >>= 1) { + arith_encode(cinfo, st, 1); + m <<= 1; + st += 1; + } } } arith_encode(cinfo, st, 0); @@ -566,7 +566,7 @@ encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data) entropy->restarts_to_go--; } - st = entropy->fixed_bin; /* use fixed probability estimation */ + st = entropy->fixed_bin; /* use fixed probability estimation */ Al = cinfo->Al; /* Encode the MCU data blocks */ @@ -635,29 +635,29 @@ encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data) for (k = cinfo->Ss; k <= ke; k++) { st = entropy->ac_stats[tbl] + 3 * (k - 1); if (k > kex) - arith_encode(cinfo, st, 0); /* EOB decision */ + arith_encode(cinfo, st, 0); /* EOB decision */ for (;;) { if ((v = (*block)[jpeg_natural_order[k]]) >= 0) { - if (v >>= cinfo->Al) { - if (v >> 1) /* previously nonzero coef */ - arith_encode(cinfo, st + 2, (v & 1)); - else { /* newly nonzero coef */ - arith_encode(cinfo, st + 1, 1); - arith_encode(cinfo, entropy->fixed_bin, 0); - } - break; - } + if (v >>= cinfo->Al) { + if (v >> 1) /* previously nonzero coef */ + arith_encode(cinfo, st + 2, (v & 1)); + else { /* newly nonzero coef */ + arith_encode(cinfo, st + 1, 1); + arith_encode(cinfo, entropy->fixed_bin, 0); + } + break; + } } else { - v = -v; - if (v >>= cinfo->Al) { - if (v >> 1) /* previously nonzero coef */ - arith_encode(cinfo, st + 2, (v & 1)); - else { /* newly nonzero coef */ - arith_encode(cinfo, st + 1, 1); - arith_encode(cinfo, entropy->fixed_bin, 1); - } - break; - } + v = -v; + if (v >>= cinfo->Al) { + if (v >> 1) /* previously nonzero coef */ + arith_encode(cinfo, st + 2, (v & 1)); + else { /* newly nonzero coef */ + arith_encode(cinfo, st + 1, 1); + arith_encode(cinfo, entropy->fixed_bin, 1); + } + break; + } } arith_encode(cinfo, st + 1, 0); st += 3; k++; } @@ -713,45 +713,45 @@ encode_mcu (j_compress_ptr cinfo, JBLOCKROW *MCU_data) /* Figure F.4: Encode_DC_DIFF */ if ((v = (*block)[0] - entropy->last_dc_val[ci]) == 0) { arith_encode(cinfo, st, 0); - 
entropy->dc_context[ci] = 0; /* zero diff category */ + entropy->dc_context[ci] = 0; /* zero diff category */ } else { entropy->last_dc_val[ci] = (*block)[0]; arith_encode(cinfo, st, 1); /* Figure F.6: Encoding nonzero value v */ /* Figure F.7: Encoding the sign of v */ if (v > 0) { - arith_encode(cinfo, st + 1, 0); /* Table F.4: SS = S0 + 1 */ - st += 2; /* Table F.4: SP = S0 + 2 */ - entropy->dc_context[ci] = 4; /* small positive diff category */ + arith_encode(cinfo, st + 1, 0); /* Table F.4: SS = S0 + 1 */ + st += 2; /* Table F.4: SP = S0 + 2 */ + entropy->dc_context[ci] = 4; /* small positive diff category */ } else { - v = -v; - arith_encode(cinfo, st + 1, 1); /* Table F.4: SS = S0 + 1 */ - st += 3; /* Table F.4: SN = S0 + 3 */ - entropy->dc_context[ci] = 8; /* small negative diff category */ + v = -v; + arith_encode(cinfo, st + 1, 1); /* Table F.4: SS = S0 + 1 */ + st += 3; /* Table F.4: SN = S0 + 3 */ + entropy->dc_context[ci] = 8; /* small negative diff category */ } /* Figure F.8: Encoding the magnitude category of v */ m = 0; if (v -= 1) { - arith_encode(cinfo, st, 1); - m = 1; - v2 = v; - st = entropy->dc_stats[tbl] + 20; /* Table F.4: X1 = 20 */ - while (v2 >>= 1) { - arith_encode(cinfo, st, 1); - m <<= 1; - st += 1; - } + arith_encode(cinfo, st, 1); + m = 1; + v2 = v; + st = entropy->dc_stats[tbl] + 20; /* Table F.4: X1 = 20 */ + while (v2 >>= 1) { + arith_encode(cinfo, st, 1); + m <<= 1; + st += 1; + } } arith_encode(cinfo, st, 0); /* Section F.1.4.4.1.2: Establish dc_context conditioning category */ if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1)) - entropy->dc_context[ci] = 0; /* zero diff category */ + entropy->dc_context[ci] = 0; /* zero diff category */ else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1)) - entropy->dc_context[ci] += 8; /* large diff category */ + entropy->dc_context[ci] += 8; /* large diff category */ /* Figure F.9: Encoding the magnitude bit pattern of v */ st += 14; while (m >>= 1) - arith_encode(cinfo, st, (m & v) ? 
1 : 0); + arith_encode(cinfo, st, (m & v) ? 1 : 0); } /* Sections F.1.4.2 & F.1.4.4.2: Encoding of AC coefficients */ @@ -765,43 +765,43 @@ encode_mcu (j_compress_ptr cinfo, JBLOCKROW *MCU_data) /* Figure F.5: Encode_AC_Coefficients */ for (k = 1; k <= ke; k++) { st = entropy->ac_stats[tbl] + 3 * (k - 1); - arith_encode(cinfo, st, 0); /* EOB decision */ + arith_encode(cinfo, st, 0); /* EOB decision */ while ((v = (*block)[jpeg_natural_order[k]]) == 0) { - arith_encode(cinfo, st + 1, 0); st += 3; k++; + arith_encode(cinfo, st + 1, 0); st += 3; k++; } arith_encode(cinfo, st + 1, 1); /* Figure F.6: Encoding nonzero value v */ /* Figure F.7: Encoding the sign of v */ if (v > 0) { - arith_encode(cinfo, entropy->fixed_bin, 0); + arith_encode(cinfo, entropy->fixed_bin, 0); } else { - v = -v; - arith_encode(cinfo, entropy->fixed_bin, 1); + v = -v; + arith_encode(cinfo, entropy->fixed_bin, 1); } st += 2; /* Figure F.8: Encoding the magnitude category of v */ m = 0; if (v -= 1) { - arith_encode(cinfo, st, 1); - m = 1; - v2 = v; - if (v2 >>= 1) { - arith_encode(cinfo, st, 1); - m <<= 1; - st = entropy->ac_stats[tbl] + - (k <= cinfo->arith_ac_K[tbl] ? 189 : 217); - while (v2 >>= 1) { - arith_encode(cinfo, st, 1); - m <<= 1; - st += 1; - } - } + arith_encode(cinfo, st, 1); + m = 1; + v2 = v; + if (v2 >>= 1) { + arith_encode(cinfo, st, 1); + m <<= 1; + st = entropy->ac_stats[tbl] + + (k <= cinfo->arith_ac_K[tbl] ? 189 : 217); + while (v2 >>= 1) { + arith_encode(cinfo, st, 1); + m <<= 1; + st += 1; + } + } } arith_encode(cinfo, st, 0); /* Figure F.9: Encoding the magnitude bit pattern of v */ st += 14; while (m >>= 1) - arith_encode(cinfo, st, (m & v) ? 1 : 0); + arith_encode(cinfo, st, (m & v) ? 
1 : 0); } /* Encode EOB decision only if k <= DCTSIZE2 - 1 */ if (k <= DCTSIZE2 - 1) { @@ -838,14 +838,14 @@ start_pass (j_compress_ptr cinfo, boolean gather_statistics) if (cinfo->progressive_mode) { if (cinfo->Ah == 0) { if (cinfo->Ss == 0) - entropy->pub.encode_mcu = encode_mcu_DC_first; + entropy->pub.encode_mcu = encode_mcu_DC_first; else - entropy->pub.encode_mcu = encode_mcu_AC_first; + entropy->pub.encode_mcu = encode_mcu_AC_first; } else { if (cinfo->Ss == 0) - entropy->pub.encode_mcu = encode_mcu_DC_refine; + entropy->pub.encode_mcu = encode_mcu_DC_refine; else - entropy->pub.encode_mcu = encode_mcu_AC_refine; + entropy->pub.encode_mcu = encode_mcu_AC_refine; } } else entropy->pub.encode_mcu = encode_mcu; @@ -857,10 +857,10 @@ start_pass (j_compress_ptr cinfo, boolean gather_statistics) if (cinfo->progressive_mode == 0 || (cinfo->Ss == 0 && cinfo->Ah == 0)) { tbl = compptr->dc_tbl_no; if (tbl < 0 || tbl >= NUM_ARITH_TBLS) - ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl); + ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl); if (entropy->dc_stats[tbl] == NULL) - entropy->dc_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small) - ((j_common_ptr) cinfo, JPOOL_IMAGE, DC_STAT_BINS); + entropy->dc_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small) + ((j_common_ptr) cinfo, JPOOL_IMAGE, DC_STAT_BINS); MEMZERO(entropy->dc_stats[tbl], DC_STAT_BINS); /* Initialize DC predictions to 0 */ entropy->last_dc_val[ci] = 0; @@ -870,15 +870,15 @@ start_pass (j_compress_ptr cinfo, boolean gather_statistics) if (cinfo->progressive_mode == 0 || cinfo->Se) { tbl = compptr->ac_tbl_no; if (tbl < 0 || tbl >= NUM_ARITH_TBLS) - ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl); + ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl); if (entropy->ac_stats[tbl] == NULL) - entropy->ac_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small) - ((j_common_ptr) cinfo, JPOOL_IMAGE, AC_STAT_BINS); + entropy->ac_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small) + ((j_common_ptr) cinfo, JPOOL_IMAGE, 
AC_STAT_BINS); MEMZERO(entropy->ac_stats[tbl], AC_STAT_BINS); #ifdef CALCULATE_SPECTRAL_CONDITIONING if (cinfo->progressive_mode) - /* Section G.1.3.2: Set appropriate arithmetic conditioning value Kx */ - cinfo->arith_ac_K[tbl] = cinfo->Ss + ((8 + cinfo->Se - cinfo->Ss) >> 4); + /* Section G.1.3.2: Set appropriate arithmetic conditioning value Kx */ + cinfo->arith_ac_K[tbl] = cinfo->Ss + ((8 + cinfo->Se - cinfo->Ss) >> 4); #endif } } @@ -909,7 +909,7 @@ jinit_arith_encoder (j_compress_ptr cinfo) entropy = (arith_entropy_ptr) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(arith_entropy_encoder)); + SIZEOF(arith_entropy_encoder)); cinfo->entropy = (struct jpeg_entropy_encoder *) entropy; entropy->pub.start_pass = start_pass; entropy->pub.finish_pass = finish_pass; diff --git a/jccoefct.c b/jccoefct.c index 1963ddb61..ffc9b7ec3 100644 --- a/jccoefct.c +++ b/jccoefct.c @@ -34,10 +34,10 @@ typedef struct { struct jpeg_c_coef_controller pub; /* public fields */ - JDIMENSION iMCU_row_num; /* iMCU row # within image */ - JDIMENSION mcu_ctr; /* counts MCUs processed in current row */ - int MCU_vert_offset; /* counts MCU rows within iMCU row */ - int MCU_rows_per_iMCU_row; /* number of such rows needed */ + JDIMENSION iMCU_row_num; /* iMCU row # within image */ + JDIMENSION mcu_ctr; /* counts MCUs processed in current row */ + int MCU_vert_offset; /* counts MCU rows within iMCU row */ + int MCU_rows_per_iMCU_row; /* number of such rows needed */ /* For single-pass compression, it's sufficient to buffer just one MCU * (although this may prove a bit slow in practice). 
We allocate a @@ -143,7 +143,7 @@ METHODDEF(boolean) compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf) { my_coef_ptr coef = (my_coef_ptr) cinfo->coef; - JDIMENSION MCU_col_num; /* index of current MCU within row */ + JDIMENSION MCU_col_num; /* index of current MCU within row */ JDIMENSION last_MCU_col = cinfo->MCUs_per_row - 1; JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1; int blkn, bi, ci, yindex, yoffset, blockcnt; @@ -154,7 +154,7 @@ compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf) for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row; yoffset++) { for (MCU_col_num = coef->mcu_ctr; MCU_col_num <= last_MCU_col; - MCU_col_num++) { + MCU_col_num++) { /* Determine where data comes from in input_buf and do the DCT thing. * Each call on forward_DCT processes a horizontal row of DCT blocks * as wide as an MCU; we rely on having allocated the MCU_buffer[] blocks @@ -166,46 +166,46 @@ compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf) */ blkn = 0; for (ci = 0; ci < cinfo->comps_in_scan; ci++) { - compptr = cinfo->cur_comp_info[ci]; - blockcnt = (MCU_col_num < last_MCU_col) ? compptr->MCU_width - : compptr->last_col_width; - xpos = MCU_col_num * compptr->MCU_sample_width; - ypos = yoffset * DCTSIZE; /* ypos == (yoffset+yindex) * DCTSIZE */ - for (yindex = 0; yindex < compptr->MCU_height; yindex++) { - if (coef->iMCU_row_num < last_iMCU_row || - yoffset+yindex < compptr->last_row_height) { - (*cinfo->fdct->forward_DCT) (cinfo, compptr, - input_buf[compptr->component_index], - coef->MCU_buffer[blkn], - ypos, xpos, (JDIMENSION) blockcnt); - if (blockcnt < compptr->MCU_width) { - /* Create some dummy blocks at the right edge of the image. 
*/ - jzero_far((void FAR *) coef->MCU_buffer[blkn + blockcnt], - (compptr->MCU_width - blockcnt) * SIZEOF(JBLOCK)); - for (bi = blockcnt; bi < compptr->MCU_width; bi++) { - coef->MCU_buffer[blkn+bi][0][0] = coef->MCU_buffer[blkn+bi-1][0][0]; - } - } - } else { - /* Create a row of dummy blocks at the bottom of the image. */ - jzero_far((void FAR *) coef->MCU_buffer[blkn], - compptr->MCU_width * SIZEOF(JBLOCK)); - for (bi = 0; bi < compptr->MCU_width; bi++) { - coef->MCU_buffer[blkn+bi][0][0] = coef->MCU_buffer[blkn-1][0][0]; - } - } - blkn += compptr->MCU_width; - ypos += DCTSIZE; - } + compptr = cinfo->cur_comp_info[ci]; + blockcnt = (MCU_col_num < last_MCU_col) ? compptr->MCU_width + : compptr->last_col_width; + xpos = MCU_col_num * compptr->MCU_sample_width; + ypos = yoffset * DCTSIZE; /* ypos == (yoffset+yindex) * DCTSIZE */ + for (yindex = 0; yindex < compptr->MCU_height; yindex++) { + if (coef->iMCU_row_num < last_iMCU_row || + yoffset+yindex < compptr->last_row_height) { + (*cinfo->fdct->forward_DCT) (cinfo, compptr, + input_buf[compptr->component_index], + coef->MCU_buffer[blkn], + ypos, xpos, (JDIMENSION) blockcnt); + if (blockcnt < compptr->MCU_width) { + /* Create some dummy blocks at the right edge of the image. */ + jzero_far((void FAR *) coef->MCU_buffer[blkn + blockcnt], + (compptr->MCU_width - blockcnt) * SIZEOF(JBLOCK)); + for (bi = blockcnt; bi < compptr->MCU_width; bi++) { + coef->MCU_buffer[blkn+bi][0][0] = coef->MCU_buffer[blkn+bi-1][0][0]; + } + } + } else { + /* Create a row of dummy blocks at the bottom of the image. */ + jzero_far((void FAR *) coef->MCU_buffer[blkn], + compptr->MCU_width * SIZEOF(JBLOCK)); + for (bi = 0; bi < compptr->MCU_width; bi++) { + coef->MCU_buffer[blkn+bi][0][0] = coef->MCU_buffer[blkn-1][0][0]; + } + } + blkn += compptr->MCU_width; + ypos += DCTSIZE; + } } /* Try to write the MCU. In event of a suspension failure, we will * re-DCT the MCU on restart (a bit inefficient, could be fixed...) */ if (! 
(*cinfo->entropy->encode_mcu) (cinfo, coef->MCU_buffer)) { - /* Suspension forced; update state counters and exit */ - coef->MCU_vert_offset = yoffset; - coef->mcu_ctr = MCU_col_num; - return FALSE; + /* Suspension forced; update state counters and exit */ + coef->MCU_vert_offset = yoffset; + coef->mcu_ctr = MCU_col_num; + return FALSE; } } /* Completed an MCU row, but perhaps not an iMCU row */ @@ -280,17 +280,17 @@ compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf) for (block_row = 0; block_row < block_rows; block_row++) { thisblockrow = buffer[block_row]; (*cinfo->fdct->forward_DCT) (cinfo, compptr, - input_buf[ci], thisblockrow, - (JDIMENSION) (block_row * DCTSIZE), - (JDIMENSION) 0, blocks_across); + input_buf[ci], thisblockrow, + (JDIMENSION) (block_row * DCTSIZE), + (JDIMENSION) 0, blocks_across); if (ndummy > 0) { - /* Create dummy blocks at the right edge of the image. */ - thisblockrow += blocks_across; /* => first dummy block */ - jzero_far((void FAR *) thisblockrow, ndummy * SIZEOF(JBLOCK)); - lastDC = thisblockrow[-1][0]; - for (bi = 0; bi < ndummy; bi++) { - thisblockrow[bi][0] = lastDC; - } + /* Create dummy blocks at the right edge of the image. */ + thisblockrow += blocks_across; /* => first dummy block */ + jzero_far((void FAR *) thisblockrow, ndummy * SIZEOF(JBLOCK)); + lastDC = thisblockrow[-1][0]; + for (bi = 0; bi < ndummy; bi++) { + thisblockrow[bi][0] = lastDC; + } } } /* If at end of image, create dummy block rows as needed. @@ -299,22 +299,22 @@ compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf) * This squeezes a few more bytes out of the resulting file... 
*/ if (coef->iMCU_row_num == last_iMCU_row) { - blocks_across += ndummy; /* include lower right corner */ + blocks_across += ndummy; /* include lower right corner */ MCUs_across = blocks_across / h_samp_factor; for (block_row = block_rows; block_row < compptr->v_samp_factor; - block_row++) { - thisblockrow = buffer[block_row]; - lastblockrow = buffer[block_row-1]; - jzero_far((void FAR *) thisblockrow, - (size_t) (blocks_across * SIZEOF(JBLOCK))); - for (MCUindex = 0; MCUindex < MCUs_across; MCUindex++) { - lastDC = lastblockrow[h_samp_factor-1][0]; - for (bi = 0; bi < h_samp_factor; bi++) { - thisblockrow[bi][0] = lastDC; - } - thisblockrow += h_samp_factor; /* advance to next MCU in row */ - lastblockrow += h_samp_factor; - } + block_row++) { + thisblockrow = buffer[block_row]; + lastblockrow = buffer[block_row-1]; + jzero_far((void FAR *) thisblockrow, + (size_t) (blocks_across * SIZEOF(JBLOCK))); + for (MCUindex = 0; MCUindex < MCUs_across; MCUindex++) { + lastDC = lastblockrow[h_samp_factor-1][0]; + for (bi = 0; bi < h_samp_factor; bi++) { + thisblockrow[bi][0] = lastDC; + } + thisblockrow += h_samp_factor; /* advance to next MCU in row */ + lastblockrow += h_samp_factor; + } } } } @@ -341,7 +341,7 @@ METHODDEF(boolean) compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf) { my_coef_ptr coef = (my_coef_ptr) cinfo->coef; - JDIMENSION MCU_col_num; /* index of current MCU within row */ + JDIMENSION MCU_col_num; /* index of current MCU within row */ int blkn, ci, xindex, yindex, yoffset; JDIMENSION start_col; JBLOCKARRAY buffer[MAX_COMPS_IN_SCAN]; @@ -364,25 +364,25 @@ compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf) for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row; yoffset++) { for (MCU_col_num = coef->mcu_ctr; MCU_col_num < cinfo->MCUs_per_row; - MCU_col_num++) { + MCU_col_num++) { /* Construct list of pointers to DCT blocks belonging to this MCU */ - blkn = 0; /* index of current DCT block within MCU */ + blkn = 0; 
/* index of current DCT block within MCU */ for (ci = 0; ci < cinfo->comps_in_scan; ci++) { - compptr = cinfo->cur_comp_info[ci]; - start_col = MCU_col_num * compptr->MCU_width; - for (yindex = 0; yindex < compptr->MCU_height; yindex++) { - buffer_ptr = buffer[ci][yindex+yoffset] + start_col; - for (xindex = 0; xindex < compptr->MCU_width; xindex++) { - coef->MCU_buffer[blkn++] = buffer_ptr++; - } - } + compptr = cinfo->cur_comp_info[ci]; + start_col = MCU_col_num * compptr->MCU_width; + for (yindex = 0; yindex < compptr->MCU_height; yindex++) { + buffer_ptr = buffer[ci][yindex+yoffset] + start_col; + for (xindex = 0; xindex < compptr->MCU_width; xindex++) { + coef->MCU_buffer[blkn++] = buffer_ptr++; + } + } } /* Try to write the MCU. */ if (! (*cinfo->entropy->encode_mcu) (cinfo, coef->MCU_buffer)) { - /* Suspension forced; update state counters and exit */ - coef->MCU_vert_offset = yoffset; - coef->mcu_ctr = MCU_col_num; - return FALSE; + /* Suspension forced; update state counters and exit */ + coef->MCU_vert_offset = yoffset; + coef->mcu_ctr = MCU_col_num; + return FALSE; } } /* Completed an MCU row, but perhaps not an iMCU row */ @@ -408,7 +408,7 @@ jinit_c_coef_controller (j_compress_ptr cinfo, boolean need_full_buffer) coef = (my_coef_ptr) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(my_coef_controller)); + SIZEOF(my_coef_controller)); cinfo->coef = (struct jpeg_c_coef_controller *) coef; coef->pub.start_pass = start_pass_coef; @@ -421,14 +421,14 @@ jinit_c_coef_controller (j_compress_ptr cinfo, boolean need_full_buffer) jpeg_component_info *compptr; for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; - ci++, compptr++) { + ci++, compptr++) { coef->whole_image[ci] = (*cinfo->mem->request_virt_barray) - ((j_common_ptr) cinfo, JPOOL_IMAGE, FALSE, - (JDIMENSION) jround_up((long) compptr->width_in_blocks, - (long) compptr->h_samp_factor), - (JDIMENSION) jround_up((long) compptr->height_in_blocks, - (long) 
compptr->v_samp_factor), - (JDIMENSION) compptr->v_samp_factor); + ((j_common_ptr) cinfo, JPOOL_IMAGE, FALSE, + (JDIMENSION) jround_up((long) compptr->width_in_blocks, + (long) compptr->h_samp_factor), + (JDIMENSION) jround_up((long) compptr->height_in_blocks, + (long) compptr->v_samp_factor), + (JDIMENSION) compptr->v_samp_factor); } #else ERREXIT(cinfo, JERR_BAD_BUFFER_MODE); @@ -440,7 +440,7 @@ jinit_c_coef_controller (j_compress_ptr cinfo, boolean need_full_buffer) buffer = (JBLOCKROW) (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE, - C_MAX_BLOCKS_IN_MCU * SIZEOF(JBLOCK)); + C_MAX_BLOCKS_IN_MCU * SIZEOF(JBLOCK)); for (i = 0; i < C_MAX_BLOCKS_IN_MCU; i++) { coef->MCU_buffer[i] = buffer + i; } diff --git a/jccolext.c b/jccolext.c index dda3beb84..2c6b7acf9 100644 --- a/jccolext.c +++ b/jccolext.c @@ -58,16 +58,16 @@ rgb_ycc_convert_internal (j_compress_ptr cinfo, */ /* Y */ outptr0[col] = (JSAMPLE) - ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF]) - >> SCALEBITS); + ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF]) + >> SCALEBITS); /* Cb */ outptr1[col] = (JSAMPLE) - ((ctab[r+R_CB_OFF] + ctab[g+G_CB_OFF] + ctab[b+B_CB_OFF]) - >> SCALEBITS); + ((ctab[r+R_CB_OFF] + ctab[g+G_CB_OFF] + ctab[b+B_CB_OFF]) + >> SCALEBITS); /* Cr */ outptr2[col] = (JSAMPLE) - ((ctab[r+R_CR_OFF] + ctab[g+G_CR_OFF] + ctab[b+B_CR_OFF]) - >> SCALEBITS); + ((ctab[r+R_CR_OFF] + ctab[g+G_CR_OFF] + ctab[b+B_CR_OFF]) + >> SCALEBITS); } } } @@ -108,8 +108,8 @@ rgb_gray_convert_internal (j_compress_ptr cinfo, inptr += RGB_PIXELSIZE; /* Y */ outptr[col] = (JSAMPLE) - ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF]) - >> SCALEBITS); + ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF]) + >> SCALEBITS); } } } diff --git a/jccolor.c b/jccolor.c index 83184855d..fe9422a95 100644 --- a/jccolor.c +++ b/jccolor.c @@ -24,7 +24,7 @@ typedef struct { struct jpeg_color_converter pub; /* public fields */ /* Private state for RGB->YCC conversion */ - INT32 * rgb_ycc_tab; /* => 
table for RGB to YCbCr conversion */ + INT32 * rgb_ycc_tab; /* => table for RGB to YCbCr conversion */ } my_color_converter; typedef my_color_converter * my_cconvert_ptr; @@ -36,9 +36,9 @@ typedef my_color_converter * my_cconvert_ptr; * YCbCr is defined per CCIR 601-1, except that Cb and Cr are * normalized to the range 0..MAXJSAMPLE rather than -0.5 .. 0.5. * The conversion equations to be implemented are therefore - * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B - * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE - * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE * (These numbers are derived from TIFF 6.0 section 21, dated 3-June-92.) * Note: older versions of the IJG code used a zero offset of MAXJSAMPLE/2, * rather than CENTERJSAMPLE, for Cb and Cr. This gave equal positive and @@ -60,10 +60,10 @@ typedef my_color_converter * my_cconvert_ptr; * in the tables to save adding them separately in the inner loop. */ -#define SCALEBITS 16 /* speediest right-shift on some machines */ -#define CBCR_OFFSET ((INT32) CENTERJSAMPLE << SCALEBITS) -#define ONE_HALF ((INT32) 1 << (SCALEBITS-1)) -#define FIX(x) ((INT32) ((x) * (1L< Y section */ -#define G_Y_OFF (1*(MAXJSAMPLE+1)) /* offset to G => Y section */ -#define B_Y_OFF (2*(MAXJSAMPLE+1)) /* etc. */ -#define R_CB_OFF (3*(MAXJSAMPLE+1)) -#define G_CB_OFF (4*(MAXJSAMPLE+1)) -#define B_CB_OFF (5*(MAXJSAMPLE+1)) -#define R_CR_OFF B_CB_OFF /* B=>Cb, R=>Cr are the same */ -#define G_CR_OFF (6*(MAXJSAMPLE+1)) -#define B_CR_OFF (7*(MAXJSAMPLE+1)) -#define TABLE_SIZE (8*(MAXJSAMPLE+1)) +#define R_Y_OFF 0 /* offset to R => Y section */ +#define G_Y_OFF (1*(MAXJSAMPLE+1)) /* offset to G => Y section */ +#define B_Y_OFF (2*(MAXJSAMPLE+1)) /* etc. 
*/ +#define R_CB_OFF (3*(MAXJSAMPLE+1)) +#define G_CB_OFF (4*(MAXJSAMPLE+1)) +#define B_CB_OFF (5*(MAXJSAMPLE+1)) +#define R_CR_OFF B_CB_OFF /* B=>Cb, R=>Cr are the same */ +#define G_CR_OFF (6*(MAXJSAMPLE+1)) +#define B_CR_OFF (7*(MAXJSAMPLE+1)) +#define TABLE_SIZE (8*(MAXJSAMPLE+1)) /* Include inline routines for colorspace extensions */ @@ -202,7 +202,7 @@ rgb_ycc_start (j_compress_ptr cinfo) /* Allocate and fill in the conversion tables. */ cconvert->rgb_ycc_tab = rgb_ycc_tab = (INT32 *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - (TABLE_SIZE * SIZEOF(INT32))); + (TABLE_SIZE * SIZEOF(INT32))); for (i = 0; i <= MAXJSAMPLE; i++) { rgb_ycc_tab[i+R_Y_OFF] = FIX(0.29900) * i; @@ -230,8 +230,8 @@ rgb_ycc_start (j_compress_ptr cinfo) METHODDEF(void) rgb_ycc_convert (j_compress_ptr cinfo, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows) + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) { switch (cinfo->in_color_space) { case JCS_EXT_RGB: @@ -279,8 +279,8 @@ rgb_ycc_convert (j_compress_ptr cinfo, METHODDEF(void) rgb_gray_convert (j_compress_ptr cinfo, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows) + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) { switch (cinfo->in_color_space) { case JCS_EXT_RGB: @@ -325,8 +325,8 @@ rgb_gray_convert (j_compress_ptr cinfo, METHODDEF(void) rgb_rgb_convert (j_compress_ptr cinfo, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows) + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) { switch (cinfo->in_color_space) { case JCS_EXT_RGB: @@ -375,8 +375,8 @@ rgb_rgb_convert (j_compress_ptr cinfo, METHODDEF(void) cmyk_ycck_convert (j_compress_ptr cinfo, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows) + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int 
num_rows) { my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert; register int r, g, b; @@ -398,7 +398,7 @@ cmyk_ycck_convert (j_compress_ptr cinfo, g = MAXJSAMPLE - GETJSAMPLE(inptr[1]); b = MAXJSAMPLE - GETJSAMPLE(inptr[2]); /* K passes through as-is */ - outptr3[col] = inptr[3]; /* don't need GETJSAMPLE here */ + outptr3[col] = inptr[3]; /* don't need GETJSAMPLE here */ inptr += 4; /* If the inputs are 0..MAXJSAMPLE, the outputs of these equations * must be too; we do not need an explicit range-limiting operation. @@ -407,16 +407,16 @@ cmyk_ycck_convert (j_compress_ptr cinfo, */ /* Y */ outptr0[col] = (JSAMPLE) - ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF]) - >> SCALEBITS); + ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF]) + >> SCALEBITS); /* Cb */ outptr1[col] = (JSAMPLE) - ((ctab[r+R_CB_OFF] + ctab[g+G_CB_OFF] + ctab[b+B_CB_OFF]) - >> SCALEBITS); + ((ctab[r+R_CB_OFF] + ctab[g+G_CB_OFF] + ctab[b+B_CB_OFF]) + >> SCALEBITS); /* Cr */ outptr2[col] = (JSAMPLE) - ((ctab[r+R_CR_OFF] + ctab[g+G_CR_OFF] + ctab[b+B_CR_OFF]) - >> SCALEBITS); + ((ctab[r+R_CR_OFF] + ctab[g+G_CR_OFF] + ctab[b+B_CR_OFF]) + >> SCALEBITS); } } } @@ -430,8 +430,8 @@ cmyk_ycck_convert (j_compress_ptr cinfo, METHODDEF(void) grayscale_convert (j_compress_ptr cinfo, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows) + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) { register JSAMPROW inptr; register JSAMPROW outptr; @@ -444,7 +444,7 @@ grayscale_convert (j_compress_ptr cinfo, outptr = output_buf[0][output_row]; output_row++; for (col = 0; col < num_cols; col++) { - outptr[col] = inptr[0]; /* don't need GETJSAMPLE() here */ + outptr[col] = inptr[0]; /* don't need GETJSAMPLE() here */ inptr += instride; } } @@ -459,8 +459,8 @@ grayscale_convert (j_compress_ptr cinfo, METHODDEF(void) null_convert (j_compress_ptr cinfo, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows) + 
JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) { register JSAMPROW inptr; register JSAMPROW outptr; @@ -475,8 +475,8 @@ null_convert (j_compress_ptr cinfo, inptr = *input_buf; outptr = output_buf[ci][output_row]; for (col = 0; col < num_cols; col++) { - outptr[col] = inptr[ci]; /* don't need GETJSAMPLE() here */ - inptr += nc; + outptr[col] = inptr[ci]; /* don't need GETJSAMPLE() here */ + inptr += nc; } } input_buf++; @@ -507,7 +507,7 @@ jinit_color_converter (j_compress_ptr cinfo) cconvert = (my_cconvert_ptr) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(my_color_converter)); + SIZEOF(my_color_converter)); cinfo->cconvert = (struct jpeg_color_converter *) cconvert; /* set start_pass to null method until we find out differently */ cconvert->pub.start_pass = null_method; @@ -545,7 +545,7 @@ jinit_color_converter (j_compress_ptr cinfo) ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE); break; - default: /* JCS_UNKNOWN can be anything */ + default: /* JCS_UNKNOWN can be anything */ if (cinfo->input_components < 1) ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE); break; @@ -652,9 +652,9 @@ jinit_color_converter (j_compress_ptr cinfo) ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL); break; - default: /* allow null conversion of JCS_UNKNOWN */ + default: /* allow null conversion of JCS_UNKNOWN */ if (cinfo->jpeg_color_space != cinfo->in_color_space || - cinfo->num_components != cinfo->input_components) + cinfo->num_components != cinfo->input_components) ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL); cconvert->pub.color_convert = null_convert; break; diff --git a/jcdctmgr.c b/jcdctmgr.c index 3234a01aa..bf40ff76b 100644 --- a/jcdctmgr.c +++ b/jcdctmgr.c @@ -18,7 +18,7 @@ #define JPEG_INTERNALS #include "jinclude.h" #include "jpeglib.h" -#include "jdct.h" /* Private declarations for DCT subsystem */ +#include "jdct.h" /* Private declarations for DCT subsystem */ #include "jsimddct.h" @@ -44,7 +44,7 @@ typedef JMETHOD(void, 
float_quantize_method_ptr, METHODDEF(void) quantize (JCOEFPTR, DCTELEM *, DCTELEM *); typedef struct { - struct jpeg_forward_dct pub; /* public fields */ + struct jpeg_forward_dct pub; /* public fields */ /* Pointer to the DCT routine actually in use */ forward_DCT_method_ptr dct; @@ -147,7 +147,7 @@ flss (UINT16 val) * * In order to allow SIMD implementations we also tweak the values to * allow the same calculation to be made at all times: - * + * * dctbl[0] = f rounded to nearest integer * dctbl[1] = divisor / 2 (+ 1 if fractional part of f < 0.5) * dctbl[2] = 1 << ((word size) * 2 - r) @@ -221,7 +221,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo) qtblno = compptr->quant_tbl_no; /* Make sure specified quantization table is present */ if (qtblno < 0 || qtblno >= NUM_QUANT_TBLS || - cinfo->quant_tbl_ptrs[qtblno] == NULL) + cinfo->quant_tbl_ptrs[qtblno] == NULL) ERREXIT1(cinfo, JERR_NO_QUANT_TABLE, qtblno); qtbl = cinfo->quant_tbl_ptrs[qtblno]; /* Compute divisors for this quant table */ @@ -233,91 +233,91 @@ start_pass_fdctmgr (j_compress_ptr cinfo) * coefficients multiplied by 8 (to counteract scaling). 
*/ if (fdct->divisors[qtblno] == NULL) { - fdct->divisors[qtblno] = (DCTELEM *) - (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - (DCTSIZE2 * 4) * SIZEOF(DCTELEM)); + fdct->divisors[qtblno] = (DCTELEM *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + (DCTSIZE2 * 4) * SIZEOF(DCTELEM)); } dtbl = fdct->divisors[qtblno]; for (i = 0; i < DCTSIZE2; i++) { - if(!compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i]) - && fdct->quantize == jsimd_quantize) - fdct->quantize = quantize; + if(!compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i]) + && fdct->quantize == jsimd_quantize) + fdct->quantize = quantize; } break; #endif #ifdef DCT_IFAST_SUPPORTED case JDCT_IFAST: { - /* For AA&N IDCT method, divisors are equal to quantization - * coefficients scaled by scalefactor[row]*scalefactor[col], where - * scalefactor[0] = 1 - * scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7 - * We apply a further scale factor of 8. - */ + /* For AA&N IDCT method, divisors are equal to quantization + * coefficients scaled by scalefactor[row]*scalefactor[col], where + * scalefactor[0] = 1 + * scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7 + * We apply a further scale factor of 8. 
+ */ #define CONST_BITS 14 - static const INT16 aanscales[DCTSIZE2] = { - /* precomputed values scaled up by 14 bits */ - 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520, - 22725, 31521, 29692, 26722, 22725, 17855, 12299, 6270, - 21407, 29692, 27969, 25172, 21407, 16819, 11585, 5906, - 19266, 26722, 25172, 22654, 19266, 15137, 10426, 5315, - 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520, - 12873, 17855, 16819, 15137, 12873, 10114, 6967, 3552, - 8867, 12299, 11585, 10426, 8867, 6967, 4799, 2446, - 4520, 6270, 5906, 5315, 4520, 3552, 2446, 1247 - }; - SHIFT_TEMPS - - if (fdct->divisors[qtblno] == NULL) { - fdct->divisors[qtblno] = (DCTELEM *) - (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - (DCTSIZE2 * 4) * SIZEOF(DCTELEM)); - } - dtbl = fdct->divisors[qtblno]; - for (i = 0; i < DCTSIZE2; i++) { - if(!compute_reciprocal( - DESCALE(MULTIPLY16V16((INT32) qtbl->quantval[i], - (INT32) aanscales[i]), - CONST_BITS-3), &dtbl[i]) - && fdct->quantize == jsimd_quantize) - fdct->quantize = quantize; - } + static const INT16 aanscales[DCTSIZE2] = { + /* precomputed values scaled up by 14 bits */ + 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520, + 22725, 31521, 29692, 26722, 22725, 17855, 12299, 6270, + 21407, 29692, 27969, 25172, 21407, 16819, 11585, 5906, + 19266, 26722, 25172, 22654, 19266, 15137, 10426, 5315, + 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520, + 12873, 17855, 16819, 15137, 12873, 10114, 6967, 3552, + 8867, 12299, 11585, 10426, 8867, 6967, 4799, 2446, + 4520, 6270, 5906, 5315, 4520, 3552, 2446, 1247 + }; + SHIFT_TEMPS + + if (fdct->divisors[qtblno] == NULL) { + fdct->divisors[qtblno] = (DCTELEM *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + (DCTSIZE2 * 4) * SIZEOF(DCTELEM)); + } + dtbl = fdct->divisors[qtblno]; + for (i = 0; i < DCTSIZE2; i++) { + if(!compute_reciprocal( + DESCALE(MULTIPLY16V16((INT32) qtbl->quantval[i], + (INT32) aanscales[i]), + CONST_BITS-3), &dtbl[i]) + && fdct->quantize == 
jsimd_quantize) + fdct->quantize = quantize; + } } break; #endif #ifdef DCT_FLOAT_SUPPORTED case JDCT_FLOAT: { - /* For float AA&N IDCT method, divisors are equal to quantization - * coefficients scaled by scalefactor[row]*scalefactor[col], where - * scalefactor[0] = 1 - * scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7 - * We apply a further scale factor of 8. - * What's actually stored is 1/divisor so that the inner loop can - * use a multiplication rather than a division. - */ - FAST_FLOAT * fdtbl; - int row, col; - static const double aanscalefactor[DCTSIZE] = { - 1.0, 1.387039845, 1.306562965, 1.175875602, - 1.0, 0.785694958, 0.541196100, 0.275899379 - }; - - if (fdct->float_divisors[qtblno] == NULL) { - fdct->float_divisors[qtblno] = (FAST_FLOAT *) - (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - DCTSIZE2 * SIZEOF(FAST_FLOAT)); - } - fdtbl = fdct->float_divisors[qtblno]; - i = 0; - for (row = 0; row < DCTSIZE; row++) { - for (col = 0; col < DCTSIZE; col++) { - fdtbl[i] = (FAST_FLOAT) - (1.0 / (((double) qtbl->quantval[i] * - aanscalefactor[row] * aanscalefactor[col] * 8.0))); - i++; - } - } + /* For float AA&N IDCT method, divisors are equal to quantization + * coefficients scaled by scalefactor[row]*scalefactor[col], where + * scalefactor[0] = 1 + * scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7 + * We apply a further scale factor of 8. + * What's actually stored is 1/divisor so that the inner loop can + * use a multiplication rather than a division. 
+ */ + FAST_FLOAT * fdtbl; + int row, col; + static const double aanscalefactor[DCTSIZE] = { + 1.0, 1.387039845, 1.306562965, 1.175875602, + 1.0, 0.785694958, 0.541196100, 0.275899379 + }; + + if (fdct->float_divisors[qtblno] == NULL) { + fdct->float_divisors[qtblno] = (FAST_FLOAT *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + DCTSIZE2 * SIZEOF(FAST_FLOAT)); + } + fdtbl = fdct->float_divisors[qtblno]; + i = 0; + for (row = 0; row < DCTSIZE; row++) { + for (col = 0; col < DCTSIZE; col++) { + fdtbl[i] = (FAST_FLOAT) + (1.0 / (((double) qtbl->quantval[i] * + aanscalefactor[row] * aanscalefactor[col] * 8.0))); + i++; + } + } } break; #endif @@ -344,7 +344,7 @@ convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace) for (elemr = 0; elemr < DCTSIZE; elemr++) { elemptr = sample_data[elemr] + start_col; -#if DCTSIZE == 8 /* unroll the inner loop */ +#if DCTSIZE == 8 /* unroll the inner loop */ *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; @@ -410,9 +410,9 @@ quantize (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace) METHODDEF(void) forward_DCT (j_compress_ptr cinfo, jpeg_component_info * compptr, - JSAMPARRAY sample_data, JBLOCKROW coef_blocks, - JDIMENSION start_row, JDIMENSION start_col, - JDIMENSION num_blocks) + JSAMPARRAY sample_data, JBLOCKROW coef_blocks, + JDIMENSION start_row, JDIMENSION start_col, + JDIMENSION num_blocks) /* This version is used for integer DCT implementations. */ { /* This routine is heavily used, so it's worth coding it tightly. 
*/ @@ -427,7 +427,7 @@ forward_DCT (j_compress_ptr cinfo, jpeg_component_info * compptr, quantize_method_ptr do_quantize = fdct->quantize; workspace = fdct->workspace; - sample_data += start_row; /* fold in the vertical offset once */ + sample_data += start_row; /* fold in the vertical offset once */ for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) { /* Load data into workspace, applying unsigned->signed conversion */ @@ -455,7 +455,7 @@ convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT * works workspaceptr = workspace; for (elemr = 0; elemr < DCTSIZE; elemr++) { elemptr = sample_data[elemr] + start_col; -#if DCTSIZE == 8 /* unroll the inner loop */ +#if DCTSIZE == 8 /* unroll the inner loop */ *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); @@ -500,9 +500,9 @@ quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors, FAST_FLOAT * workspa METHODDEF(void) forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr, - JSAMPARRAY sample_data, JBLOCKROW coef_blocks, - JDIMENSION start_row, JDIMENSION start_col, - JDIMENSION num_blocks) + JSAMPARRAY sample_data, JBLOCKROW coef_blocks, + JDIMENSION start_row, JDIMENSION start_col, + JDIMENSION num_blocks) /* This version is used for floating-point DCT implementations. */ { /* This routine is heavily used, so it's worth coding it tightly. 
*/ @@ -518,7 +518,7 @@ forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr, float_quantize_method_ptr do_quantize = fdct->float_quantize; workspace = fdct->float_workspace; - sample_data += start_row; /* fold in the vertical offset once */ + sample_data += start_row; /* fold in the vertical offset once */ for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) { /* Load data into workspace, applying unsigned->signed conversion */ @@ -547,7 +547,7 @@ jinit_forward_dct (j_compress_ptr cinfo) fdct = (my_fdct_ptr) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(my_fdct_controller)); + SIZEOF(my_fdct_controller)); cinfo->fdct = (struct jpeg_forward_dct *) fdct; fdct->pub.start_pass = start_pass_fdctmgr; @@ -626,12 +626,12 @@ jinit_forward_dct (j_compress_ptr cinfo) if (cinfo->dct_method == JDCT_FLOAT) fdct->float_workspace = (FAST_FLOAT *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(FAST_FLOAT) * DCTSIZE2); + SIZEOF(FAST_FLOAT) * DCTSIZE2); else #endif fdct->workspace = (DCTELEM *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(DCTELEM) * DCTSIZE2); + SIZEOF(DCTELEM) * DCTSIZE2); /* Mark divisor tables unallocated */ for (i = 0; i < NUM_QUANT_TBLS; i++) { diff --git a/jchuff.c b/jchuff.c index 398d2a11f..b205cf98d 100644 --- a/jchuff.c +++ b/jchuff.c @@ -19,7 +19,7 @@ #define JPEG_INTERNALS #include "jinclude.h" #include "jpeglib.h" -#include "jchuff.h" /* Declarations shared with jcphuff.c */ +#include "jchuff.h" /* Declarations shared with jcphuff.c */ #include /* @@ -64,8 +64,8 @@ */ typedef struct { - size_t put_buffer; /* current bit-accumulation buffer */ - int put_bits; /* # of bits now in it */ + size_t put_buffer; /* current bit-accumulation buffer */ + int put_bits; /* # of bits now in it */ int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */ } savable_state; @@ -79,12 +79,12 @@ typedef struct { #else #if MAX_COMPS_IN_SCAN == 4 #define 
ASSIGN_STATE(dest,src) \ - ((dest).put_buffer = (src).put_buffer, \ - (dest).put_bits = (src).put_bits, \ - (dest).last_dc_val[0] = (src).last_dc_val[0], \ - (dest).last_dc_val[1] = (src).last_dc_val[1], \ - (dest).last_dc_val[2] = (src).last_dc_val[2], \ - (dest).last_dc_val[3] = (src).last_dc_val[3]) + ((dest).put_buffer = (src).put_buffer, \ + (dest).put_bits = (src).put_bits, \ + (dest).last_dc_val[0] = (src).last_dc_val[0], \ + (dest).last_dc_val[1] = (src).last_dc_val[1], \ + (dest).last_dc_val[2] = (src).last_dc_val[2], \ + (dest).last_dc_val[3] = (src).last_dc_val[3]) #endif #endif @@ -92,17 +92,17 @@ typedef struct { typedef struct { struct jpeg_entropy_encoder pub; /* public fields */ - savable_state saved; /* Bit buffer & DC state at start of MCU */ + savable_state saved; /* Bit buffer & DC state at start of MCU */ /* These fields are NOT loaded into local working state. */ - unsigned int restarts_to_go; /* MCUs left in this restart interval */ - int next_restart_num; /* next restart number to write (0-7) */ + unsigned int restarts_to_go; /* MCUs left in this restart interval */ + int next_restart_num; /* next restart number to write (0-7) */ /* Pointers to derived tables (these workspaces have image lifespan) */ c_derived_tbl * dc_derived_tbls[NUM_HUFF_TBLS]; c_derived_tbl * ac_derived_tbls[NUM_HUFF_TBLS]; -#ifdef ENTROPY_OPT_SUPPORTED /* Statistics tables for optimization */ +#ifdef ENTROPY_OPT_SUPPORTED /* Statistics tables for optimization */ long * dc_count_ptrs[NUM_HUFF_TBLS]; long * ac_count_ptrs[NUM_HUFF_TBLS]; #endif @@ -115,20 +115,20 @@ typedef huff_entropy_encoder * huff_entropy_ptr; */ typedef struct { - JOCTET * next_output_byte; /* => next byte to write in buffer */ - size_t free_in_buffer; /* # of byte spaces remaining in buffer */ - savable_state cur; /* Current bit buffer & DC state */ - j_compress_ptr cinfo; /* dump_buffer needs access to this */ + JOCTET * next_output_byte; /* => next byte to write in buffer */ + size_t 
free_in_buffer; /* # of byte spaces remaining in buffer */ + savable_state cur; /* Current bit buffer & DC state */ + j_compress_ptr cinfo; /* dump_buffer needs access to this */ } working_state; /* Forward declarations */ METHODDEF(boolean) encode_mcu_huff JPP((j_compress_ptr cinfo, - JBLOCKROW *MCU_data)); + JBLOCKROW *MCU_data)); METHODDEF(void) finish_pass_huff JPP((j_compress_ptr cinfo)); #ifdef ENTROPY_OPT_SUPPORTED METHODDEF(boolean) encode_mcu_gather JPP((j_compress_ptr cinfo, - JBLOCKROW *MCU_data)); + JBLOCKROW *MCU_data)); METHODDEF(void) finish_pass_gather JPP((j_compress_ptr cinfo)); #endif @@ -167,29 +167,29 @@ start_pass_huff (j_compress_ptr cinfo, boolean gather_statistics) /* Check for invalid table indexes */ /* (make_c_derived_tbl does this in the other path) */ if (dctbl < 0 || dctbl >= NUM_HUFF_TBLS) - ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, dctbl); + ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, dctbl); if (actbl < 0 || actbl >= NUM_HUFF_TBLS) - ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, actbl); + ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, actbl); /* Allocate and zero the statistics tables */ /* Note that jpeg_gen_optimal_table expects 257 entries in each table! 
*/ if (entropy->dc_count_ptrs[dctbl] == NULL) - entropy->dc_count_ptrs[dctbl] = (long *) - (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - 257 * SIZEOF(long)); + entropy->dc_count_ptrs[dctbl] = (long *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + 257 * SIZEOF(long)); MEMZERO(entropy->dc_count_ptrs[dctbl], 257 * SIZEOF(long)); if (entropy->ac_count_ptrs[actbl] == NULL) - entropy->ac_count_ptrs[actbl] = (long *) - (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - 257 * SIZEOF(long)); + entropy->ac_count_ptrs[actbl] = (long *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + 257 * SIZEOF(long)); MEMZERO(entropy->ac_count_ptrs[actbl], 257 * SIZEOF(long)); #endif } else { /* Compute derived values for Huffman tables */ /* We may do this more than once for a table, but it's not expensive */ jpeg_make_c_derived_tbl(cinfo, TRUE, dctbl, - & entropy->dc_derived_tbls[dctbl]); + & entropy->dc_derived_tbls[dctbl]); jpeg_make_c_derived_tbl(cinfo, FALSE, actbl, - & entropy->ac_derived_tbls[actbl]); + & entropy->ac_derived_tbls[actbl]); } /* Initialize DC predictions to 0 */ entropy->saved.last_dc_val[ci] = 0; @@ -214,7 +214,7 @@ start_pass_huff (j_compress_ptr cinfo, boolean gather_statistics) GLOBAL(void) jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno, - c_derived_tbl ** pdtbl) + c_derived_tbl ** pdtbl) { JHUFF_TBL *htbl; c_derived_tbl *dtbl; @@ -239,22 +239,22 @@ jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno, if (*pdtbl == NULL) *pdtbl = (c_derived_tbl *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(c_derived_tbl)); + SIZEOF(c_derived_tbl)); dtbl = *pdtbl; - + /* Figure C.1: make table of Huffman code length for each symbol */ p = 0; for (l = 1; l <= 16; l++) { i = (int) htbl->bits[l]; - if (i < 0 || p + i > 256) /* protect against table overrun */ + if (i < 0 || p + i > 256) /* protect against table overrun */ ERREXIT(cinfo, 
JERR_BAD_HUFF_TABLE); while (i--) huffsize[p++] = (char) l; } huffsize[p] = 0; lastp = p; - + /* Figure C.2: generate the codes themselves */ /* We also validate that the counts represent a legal Huffman code tree. */ @@ -274,7 +274,7 @@ jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno, code <<= 1; si++; } - + /* Figure C.3: generate encoding tables */ /* These are code and size indexed by symbol value */ @@ -305,10 +305,10 @@ jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno, /* Emit a byte, taking 'action' if must suspend. */ #define emit_byte(state,val,action) \ - { *(state)->next_output_byte++ = (JOCTET) (val); \ - if (--(state)->free_in_buffer == 0) \ - if (! dump_buffer(state)) \ - { action; } } + { *(state)->next_output_byte++ = (JOCTET) (val); \ + if (--(state)->free_in_buffer == 0) \ + if (! dump_buffer(state)) \ + { action; } } LOCAL(boolean) @@ -456,7 +456,7 @@ flush_bits (working_state * state) PUT_BITS(0x7F, 7) while (put_bits >= 8) EMIT_BYTE() - state->cur.put_buffer = 0; /* and reset bit-buffer to empty */ + state->cur.put_buffer = 0; /* and reset bit-buffer to empty */ state->cur.put_bits = 0; STORE_BUFFER() @@ -468,7 +468,7 @@ flush_bits (working_state * state) LOCAL(boolean) encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val, - c_derived_tbl *dctbl, c_derived_tbl *actbl) + c_derived_tbl *dctbl, c_derived_tbl *actbl) { int temp, temp2, temp3; int nbits; @@ -483,7 +483,7 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val, LOAD_BUFFER() /* Encode the DC coefficient difference per section F.1.2.1 */ - + temp = temp2 = block[0] - last_dc_val; /* This is a well-known technique for obtaining the absolute value without a @@ -517,8 +517,8 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val, CHECKBUF15() /* Encode the AC coefficients per section F.1.2.2 */ - - r = 0; /* r = run length of zeros */ + + r = 0; /* r = run length of zeros */ /* 
Manually unroll the k loop to eliminate the counter variable. This * improves performance greatly on systems with a limited number of @@ -624,7 +624,7 @@ encode_mcu_huff (j_compress_ptr cinfo, JBLOCKROW *MCU_data) if (cinfo->restart_interval) { if (entropy->restarts_to_go == 0) if (! emit_restart(&state, entropy->next_restart_num)) - return FALSE; + return FALSE; } /* Encode the MCU data blocks */ @@ -632,9 +632,9 @@ encode_mcu_huff (j_compress_ptr cinfo, JBLOCKROW *MCU_data) ci = cinfo->MCU_membership[blkn]; compptr = cinfo->cur_comp_info[ci]; if (! encode_one_block(&state, - MCU_data[blkn][0], state.cur.last_dc_val[ci], - entropy->dc_derived_tbls[compptr->dc_tbl_no], - entropy->ac_derived_tbls[compptr->ac_tbl_no])) + MCU_data[blkn][0], state.cur.last_dc_val[ci], + entropy->dc_derived_tbls[compptr->dc_tbl_no], + entropy->ac_derived_tbls[compptr->ac_tbl_no])) return FALSE; /* Update last_dc_val */ state.cur.last_dc_val[ci] = MCU_data[blkn][0][0]; @@ -704,18 +704,18 @@ finish_pass_huff (j_compress_ptr cinfo) LOCAL(void) htest_one_block (j_compress_ptr cinfo, JCOEFPTR block, int last_dc_val, - long dc_counts[], long ac_counts[]) + long dc_counts[], long ac_counts[]) { register int temp; register int nbits; register int k, r; - + /* Encode the DC coefficient difference per section F.1.2.1 */ - + temp = block[0] - last_dc_val; if (temp < 0) temp = -temp; - + /* Find the number of bits needed for the magnitude of the coefficient */ nbits = 0; while (temp) { @@ -730,36 +730,36 @@ htest_one_block (j_compress_ptr cinfo, JCOEFPTR block, int last_dc_val, /* Count the Huffman symbol for the number of bits */ dc_counts[nbits]++; - + /* Encode the AC coefficients per section F.1.2.2 */ - - r = 0; /* r = run length of zeros */ - + + r = 0; /* r = run length of zeros */ + for (k = 1; k < DCTSIZE2; k++) { if ((temp = block[jpeg_natural_order[k]]) == 0) { r++; } else { /* if run length > 15, must emit special run-length-16 codes (0xF0) */ while (r > 15) { - ac_counts[0xF0]++; - r 
-= 16; + ac_counts[0xF0]++; + r -= 16; } - + /* Find the number of bits needed for the magnitude of the coefficient */ if (temp < 0) - temp = -temp; - + temp = -temp; + /* Find the number of bits needed for the magnitude of the coefficient */ - nbits = 1; /* there must be at least one 1 bit */ + nbits = 1; /* there must be at least one 1 bit */ while ((temp >>= 1)) - nbits++; + nbits++; /* Check for out-of-range coefficient values */ if (nbits > MAX_COEF_BITS) - ERREXIT(cinfo, JERR_BAD_DCT_COEF); - + ERREXIT(cinfo, JERR_BAD_DCT_COEF); + /* Count Huffman symbol for run length / number of bits */ ac_counts[(r << 4) + nbits]++; - + r = 0; } } @@ -787,7 +787,7 @@ encode_mcu_gather (j_compress_ptr cinfo, JBLOCKROW *MCU_data) if (entropy->restarts_to_go == 0) { /* Re-initialize DC predictions to 0 */ for (ci = 0; ci < cinfo->comps_in_scan; ci++) - entropy->saved.last_dc_val[ci] = 0; + entropy->saved.last_dc_val[ci] = 0; /* Update restart state */ entropy->restarts_to_go = cinfo->restart_interval; } @@ -798,8 +798,8 @@ encode_mcu_gather (j_compress_ptr cinfo, JBLOCKROW *MCU_data) ci = cinfo->MCU_membership[blkn]; compptr = cinfo->cur_comp_info[ci]; htest_one_block(cinfo, MCU_data[blkn][0], entropy->saved.last_dc_val[ci], - entropy->dc_count_ptrs[compptr->dc_tbl_no], - entropy->ac_count_ptrs[compptr->ac_tbl_no]); + entropy->dc_count_ptrs[compptr->dc_tbl_no], + entropy->ac_count_ptrs[compptr->ac_tbl_no]); entropy->saved.last_dc_val[ci] = MCU_data[blkn][0][0]; } @@ -838,10 +838,10 @@ encode_mcu_gather (j_compress_ptr cinfo, JBLOCKROW *MCU_data) GLOBAL(void) jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL * htbl, long freq[]) { -#define MAX_CLEN 32 /* assumed maximum initial code length */ - UINT8 bits[MAX_CLEN+1]; /* bits[k] = # of symbols with code length k */ - int codesize[257]; /* codesize[k] = code length of symbol k */ - int others[257]; /* next symbol in current branch of tree */ +#define MAX_CLEN 32 /* assumed maximum initial code length */ + UINT8 
bits[MAX_CLEN+1]; /* bits[k] = # of symbols with code length k */ + int codesize[257]; /* codesize[k] = code length of symbol k */ + int others[257]; /* next symbol in current branch of tree */ int c1, c2; int p, i, j; long v; @@ -851,9 +851,9 @@ jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL * htbl, long freq[]) MEMZERO(bits, SIZEOF(bits)); MEMZERO(codesize, SIZEOF(codesize)); for (i = 0; i < 257; i++) - others[i] = -1; /* init links to empty */ - - freq[256] = 1; /* make sure 256 has a nonzero count */ + others[i] = -1; /* init links to empty */ + + freq[256] = 1; /* make sure 256 has a nonzero count */ /* Including the pseudo-symbol 256 in the Huffman procedure guarantees * that no real symbol is given code-value of all ones, because 256 * will be placed last in the largest codeword category. @@ -868,8 +868,8 @@ jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL * htbl, long freq[]) v = 1000000000L; for (i = 0; i <= 256; i++) { if (freq[i] && freq[i] <= v) { - v = freq[i]; - c1 = i; + v = freq[i]; + c1 = i; } } @@ -879,15 +879,15 @@ jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL * htbl, long freq[]) v = 1000000000L; for (i = 0; i <= 256; i++) { if (freq[i] && freq[i] <= v && i != c1) { - v = freq[i]; - c2 = i; + v = freq[i]; + c2 = i; } } /* Done if we've merged everything into one frequency */ if (c2 < 0) break; - + /* Else merge the two counts/trees */ freq[c1] += freq[c2]; freq[c2] = 0; @@ -898,9 +898,9 @@ jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL * htbl, long freq[]) c1 = others[c1]; codesize[c1]++; } - - others[c1] = c2; /* chain c2 onto c1's tree branch */ - + + others[c1] = c2; /* chain c2 onto c1's tree branch */ + /* Increment the codesize of everything in c2's tree branch */ codesize[c2]++; while (others[c2] >= 0) { @@ -915,7 +915,7 @@ jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL * htbl, long freq[]) /* The JPEG standard seems to think that this can't happen, */ /* but I'm paranoid... 
*/ if (codesize[i] > MAX_CLEN) - ERREXIT(cinfo, JERR_HUFF_CLEN_OVERFLOW); + ERREXIT(cinfo, JERR_HUFF_CLEN_OVERFLOW); bits[codesize[i]]++; } @@ -931,28 +931,28 @@ jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL * htbl, long freq[]) * shortest nonzero BITS entry is converted into a prefix for two code words * one bit longer. */ - + for (i = MAX_CLEN; i > 16; i--) { while (bits[i] > 0) { - j = i - 2; /* find length of new prefix to be used */ + j = i - 2; /* find length of new prefix to be used */ while (bits[j] == 0) - j--; - - bits[i] -= 2; /* remove two symbols */ - bits[i-1]++; /* one goes in this length */ - bits[j+1] += 2; /* two new symbols in this length */ - bits[j]--; /* symbol of this length is now a prefix */ + j--; + + bits[i] -= 2; /* remove two symbols */ + bits[i-1]++; /* one goes in this length */ + bits[j+1] += 2; /* two new symbols in this length */ + bits[j]--; /* symbol of this length is now a prefix */ } } /* Remove the count for the pseudo-symbol 256 from the largest codelength */ - while (bits[i] == 0) /* find largest codelength still in use */ + while (bits[i] == 0) /* find largest codelength still in use */ i--; bits[i]--; - + /* Return final symbol counts (only for lengths 0..16) */ MEMCOPY(htbl->bits, bits, SIZEOF(htbl->bits)); - + /* Return a list of the symbols sorted by code length */ /* It's not real clear to me why we don't need to consider the codelength * changes made above, but the JPEG spec seems to think this works. @@ -961,8 +961,8 @@ jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL * htbl, long freq[]) for (i = 1; i <= MAX_CLEN; i++) { for (j = 0; j <= 255; j++) { if (codesize[j] == i) { - htbl->huffval[p] = (UINT8) j; - p++; + htbl->huffval[p] = (UINT8) j; + p++; } } } @@ -999,14 +999,14 @@ finish_pass_gather (j_compress_ptr cinfo) if (! 
did_dc[dctbl]) { htblptr = & cinfo->dc_huff_tbl_ptrs[dctbl]; if (*htblptr == NULL) - *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo); + *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo); jpeg_gen_optimal_table(cinfo, *htblptr, entropy->dc_count_ptrs[dctbl]); did_dc[dctbl] = TRUE; } if (! did_ac[actbl]) { htblptr = & cinfo->ac_huff_tbl_ptrs[actbl]; if (*htblptr == NULL) - *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo); + *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo); jpeg_gen_optimal_table(cinfo, *htblptr, entropy->ac_count_ptrs[actbl]); did_ac[actbl] = TRUE; } @@ -1029,7 +1029,7 @@ jinit_huff_encoder (j_compress_ptr cinfo) entropy = (huff_entropy_ptr) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(huff_entropy_encoder)); + SIZEOF(huff_entropy_encoder)); cinfo->entropy = (struct jpeg_entropy_encoder *) entropy; entropy->pub.start_pass = start_pass_huff; diff --git a/jchuff.h b/jchuff.h index a9599fc1e..a1a5280d9 100644 --- a/jchuff.h +++ b/jchuff.h @@ -25,23 +25,23 @@ /* Derived data constructed for each Huffman table */ typedef struct { - unsigned int ehufco[256]; /* code for each symbol */ - char ehufsi[256]; /* length of code for each symbol */ + unsigned int ehufco[256]; /* code for each symbol */ + char ehufsi[256]; /* length of code for each symbol */ /* If no code has been allocated for a symbol S, ehufsi[S] contains 0 */ } c_derived_tbl; /* Short forms of external names for systems with brain-damaged linkers. 
*/ #ifdef NEED_SHORT_EXTERNAL_NAMES -#define jpeg_make_c_derived_tbl jMkCDerived -#define jpeg_gen_optimal_table jGenOptTbl +#define jpeg_make_c_derived_tbl jMkCDerived +#define jpeg_gen_optimal_table jGenOptTbl #endif /* NEED_SHORT_EXTERNAL_NAMES */ /* Expand a Huffman table definition into the derived format */ EXTERN(void) jpeg_make_c_derived_tbl - JPP((j_compress_ptr cinfo, boolean isDC, int tblno, - c_derived_tbl ** pdtbl)); + JPP((j_compress_ptr cinfo, boolean isDC, int tblno, + c_derived_tbl ** pdtbl)); /* Generate an optimal table definition given the specified counts */ EXTERN(void) jpeg_gen_optimal_table - JPP((j_compress_ptr cinfo, JHUFF_TBL * htbl, long freq[])); + JPP((j_compress_ptr cinfo, JHUFF_TBL * htbl, long freq[])); diff --git a/jcinit.c b/jcinit.c index de0ade2a7..347cf6dab 100644 --- a/jcinit.c +++ b/jcinit.c @@ -60,7 +60,7 @@ jinit_compress_master (j_compress_ptr cinfo) /* Need a full-image coefficient buffer in any multi-pass mode. */ jinit_c_coef_controller(cinfo, - (boolean) (cinfo->num_scans > 1 || cinfo->optimize_coding)); + (boolean) (cinfo->num_scans > 1 || cinfo->optimize_coding)); jinit_c_main_controller(cinfo, FALSE /* never need full buffer here */); jinit_marker_writer(cinfo); diff --git a/jcmainct.c b/jcmainct.c index 5b7ff21dc..5a0549a1b 100644 --- a/jcmainct.c +++ b/jcmainct.c @@ -28,10 +28,10 @@ typedef struct { struct jpeg_c_main_controller pub; /* public fields */ - JDIMENSION cur_iMCU_row; /* number of current iMCU row */ - JDIMENSION rowgroup_ctr; /* counts row groups received in iMCU row */ - boolean suspended; /* remember if we suspended output */ - J_BUF_MODE pass_mode; /* current operating mode */ + JDIMENSION cur_iMCU_row; /* number of current iMCU row */ + JDIMENSION rowgroup_ctr; /* counts row groups received in iMCU row */ + boolean suspended; /* remember if we suspended output */ + J_BUF_MODE pass_mode; /* current operating mode */ /* If using just a strip buffer, this points to the entire set of buffers * (we 
allocate one for each component). In the full-image case, this @@ -52,12 +52,12 @@ typedef my_main_controller * my_main_ptr; /* Forward declarations */ METHODDEF(void) process_data_simple_main - JPP((j_compress_ptr cinfo, JSAMPARRAY input_buf, - JDIMENSION *in_row_ctr, JDIMENSION in_rows_avail)); + JPP((j_compress_ptr cinfo, JSAMPARRAY input_buf, + JDIMENSION *in_row_ctr, JDIMENSION in_rows_avail)); #ifdef FULL_MAIN_BUFFER_SUPPORTED METHODDEF(void) process_data_buffer_main - JPP((j_compress_ptr cinfo, JSAMPARRAY input_buf, - JDIMENSION *in_row_ctr, JDIMENSION in_rows_avail)); + JPP((j_compress_ptr cinfo, JSAMPARRAY input_buf, + JDIMENSION *in_row_ctr, JDIMENSION in_rows_avail)); #endif @@ -74,10 +74,10 @@ start_pass_main (j_compress_ptr cinfo, J_BUF_MODE pass_mode) if (cinfo->raw_data_in) return; - main_ptr->cur_iMCU_row = 0; /* initialize counters */ + main_ptr->cur_iMCU_row = 0; /* initialize counters */ main_ptr->rowgroup_ctr = 0; main_ptr->suspended = FALSE; - main_ptr->pass_mode = pass_mode; /* save mode for use by process_data */ + main_ptr->pass_mode = pass_mode; /* save mode for use by process_data */ switch (pass_mode) { case JBUF_PASS_THRU: @@ -111,8 +111,8 @@ start_pass_main (j_compress_ptr cinfo, J_BUF_MODE pass_mode) METHODDEF(void) process_data_simple_main (j_compress_ptr cinfo, - JSAMPARRAY input_buf, JDIMENSION *in_row_ctr, - JDIMENSION in_rows_avail) + JSAMPARRAY input_buf, JDIMENSION *in_row_ctr, + JDIMENSION in_rows_avail) { my_main_ptr main_ptr = (my_main_ptr) cinfo->main; @@ -120,9 +120,9 @@ process_data_simple_main (j_compress_ptr cinfo, /* Read input data if we haven't filled the main buffer yet */ if (main_ptr->rowgroup_ctr < DCTSIZE) (*cinfo->prep->pre_process_data) (cinfo, - input_buf, in_row_ctr, in_rows_avail, - main_ptr->buffer, &main_ptr->rowgroup_ctr, - (JDIMENSION) DCTSIZE); + input_buf, in_row_ctr, in_rows_avail, + main_ptr->buffer, &main_ptr->rowgroup_ctr, + (JDIMENSION) DCTSIZE); /* If we don't have a full iMCU row buffered, 
return to application for * more data. Note that preprocessor will always pad to fill the iMCU row @@ -140,8 +140,8 @@ process_data_simple_main (j_compress_ptr cinfo, * think we were done. */ if (! main_ptr->suspended) { - (*in_row_ctr)--; - main_ptr->suspended = TRUE; + (*in_row_ctr)--; + main_ptr->suspended = TRUE; } return; } @@ -167,8 +167,8 @@ process_data_simple_main (j_compress_ptr cinfo, METHODDEF(void) process_data_buffer_main (j_compress_ptr cinfo, - JSAMPARRAY input_buf, JDIMENSION *in_row_ctr, - JDIMENSION in_rows_avail) + JSAMPARRAY input_buf, JDIMENSION *in_row_ctr, + JDIMENSION in_rows_avail) { my_main_ptr main_ptr = (my_main_ptr) cinfo->main; int ci; @@ -179,16 +179,16 @@ process_data_buffer_main (j_compress_ptr cinfo, /* Realign the virtual buffers if at the start of an iMCU row. */ if (main_ptr->rowgroup_ctr == 0) { for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; - ci++, compptr++) { - main_ptr->buffer[ci] = (*cinfo->mem->access_virt_sarray) - ((j_common_ptr) cinfo, main_ptr->whole_image[ci], - main_ptr->cur_iMCU_row * (compptr->v_samp_factor * DCTSIZE), - (JDIMENSION) (compptr->v_samp_factor * DCTSIZE), writing); + ci++, compptr++) { + main_ptr->buffer[ci] = (*cinfo->mem->access_virt_sarray) + ((j_common_ptr) cinfo, main_ptr->whole_image[ci], + main_ptr->cur_iMCU_row * (compptr->v_samp_factor * DCTSIZE), + (JDIMENSION) (compptr->v_samp_factor * DCTSIZE), writing); } /* In a read pass, pretend we just read some source data. */ if (! writing) { - *in_row_ctr += cinfo->max_v_samp_factor * DCTSIZE; - main_ptr->rowgroup_ctr = DCTSIZE; + *in_row_ctr += cinfo->max_v_samp_factor * DCTSIZE; + main_ptr->rowgroup_ctr = DCTSIZE; } } @@ -196,35 +196,35 @@ process_data_buffer_main (j_compress_ptr cinfo, /* Note: preprocessor will pad if necessary to fill the last iMCU row. 
*/ if (writing) { (*cinfo->prep->pre_process_data) (cinfo, - input_buf, in_row_ctr, in_rows_avail, - main_ptr->buffer, &main_ptr->rowgroup_ctr, - (JDIMENSION) DCTSIZE); + input_buf, in_row_ctr, in_rows_avail, + main_ptr->buffer, &main_ptr->rowgroup_ctr, + (JDIMENSION) DCTSIZE); /* Return to application if we need more data to fill the iMCU row. */ if (main_ptr->rowgroup_ctr < DCTSIZE) - return; + return; } /* Emit data, unless this is a sink-only pass. */ if (main_ptr->pass_mode != JBUF_SAVE_SOURCE) { if (! (*cinfo->coef->compress_data) (cinfo, main_ptr->buffer)) { - /* If compressor did not consume the whole row, then we must need to - * suspend processing and return to the application. In this situation - * we pretend we didn't yet consume the last input row; otherwise, if - * it happened to be the last row of the image, the application would - * think we were done. - */ - if (! main_ptr->suspended) { - (*in_row_ctr)--; - main_ptr->suspended = TRUE; - } - return; + /* If compressor did not consume the whole row, then we must need to + * suspend processing and return to the application. In this situation + * we pretend we didn't yet consume the last input row; otherwise, if + * it happened to be the last row of the image, the application would + * think we were done. + */ + if (! main_ptr->suspended) { + (*in_row_ctr)--; + main_ptr->suspended = TRUE; + } + return; } /* We did finish the row. Undo our little suspension hack if a previous * call suspended; then mark the main buffer empty. 
*/ if (main_ptr->suspended) { - (*in_row_ctr)++; - main_ptr->suspended = FALSE; + (*in_row_ctr)++; + main_ptr->suspended = FALSE; } } @@ -250,7 +250,7 @@ jinit_c_main_controller (j_compress_ptr cinfo, boolean need_full_buffer) main_ptr = (my_main_ptr) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(my_main_controller)); + SIZEOF(my_main_controller)); cinfo->main = (struct jpeg_c_main_controller *) main_ptr; main_ptr->pub.start_pass = start_pass_main; @@ -266,13 +266,13 @@ jinit_c_main_controller (j_compress_ptr cinfo, boolean need_full_buffer) /* Allocate a full-image virtual array for each component */ /* Note we pad the bottom to a multiple of the iMCU height */ for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; - ci++, compptr++) { + ci++, compptr++) { main_ptr->whole_image[ci] = (*cinfo->mem->request_virt_sarray) - ((j_common_ptr) cinfo, JPOOL_IMAGE, FALSE, - compptr->width_in_blocks * DCTSIZE, - (JDIMENSION) jround_up((long) compptr->height_in_blocks, - (long) compptr->v_samp_factor) * DCTSIZE, - (JDIMENSION) (compptr->v_samp_factor * DCTSIZE)); + ((j_common_ptr) cinfo, JPOOL_IMAGE, FALSE, + compptr->width_in_blocks * DCTSIZE, + (JDIMENSION) jround_up((long) compptr->height_in_blocks, + (long) compptr->v_samp_factor) * DCTSIZE, + (JDIMENSION) (compptr->v_samp_factor * DCTSIZE)); } #else ERREXIT(cinfo, JERR_BAD_BUFFER_MODE); @@ -283,11 +283,11 @@ jinit_c_main_controller (j_compress_ptr cinfo, boolean need_full_buffer) #endif /* Allocate a strip buffer for each component */ for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; - ci++, compptr++) { + ci++, compptr++) { main_ptr->buffer[ci] = (*cinfo->mem->alloc_sarray) - ((j_common_ptr) cinfo, JPOOL_IMAGE, - compptr->width_in_blocks * DCTSIZE, - (JDIMENSION) (compptr->v_samp_factor * DCTSIZE)); + ((j_common_ptr) cinfo, JPOOL_IMAGE, + compptr->width_in_blocks * DCTSIZE, + (JDIMENSION) (compptr->v_samp_factor * DCTSIZE)); } } } diff --git a/jcmarker.c 
b/jcmarker.c index 4fbece4bc..62aeb59e7 100644 --- a/jcmarker.c +++ b/jcmarker.c @@ -17,7 +17,7 @@ #include "jpegcomp.h" -typedef enum { /* JPEG marker codes */ +typedef enum { /* JPEG marker codes */ M_SOF0 = 0xc0, M_SOF1 = 0xc1, M_SOF2 = 0xc2, @@ -173,7 +173,7 @@ emit_dqt (j_compress_ptr cinfo, int index) /* The table entries must be emitted in zigzag order. */ unsigned int qval = qtbl->quantval[jpeg_natural_order[i]]; if (prec) - emit_byte(cinfo, (int) (qval >> 8)); + emit_byte(cinfo, (int) (qval >> 8)); emit_byte(cinfo, (int) (qval & 0xFF)); } @@ -190,33 +190,33 @@ emit_dht (j_compress_ptr cinfo, int index, boolean is_ac) { JHUFF_TBL * htbl; int length, i; - + if (is_ac) { htbl = cinfo->ac_huff_tbl_ptrs[index]; - index += 0x10; /* output index has AC bit set */ + index += 0x10; /* output index has AC bit set */ } else { htbl = cinfo->dc_huff_tbl_ptrs[index]; } if (htbl == NULL) ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, index); - + if (! htbl->sent_table) { emit_marker(cinfo, M_DHT); - + length = 0; for (i = 1; i <= 16; i++) length += htbl->bits[i]; - + emit_2bytes(cinfo, length + 2 + 1 + 16); emit_byte(cinfo, index); - + for (i = 1; i <= 16; i++) emit_byte(cinfo, htbl->bits[i]); - + for (i = 0; i < length; i++) emit_byte(cinfo, htbl->huffval[i]); - + htbl->sent_table = TRUE; } } @@ -258,12 +258,12 @@ emit_dac (j_compress_ptr cinfo) for (i = 0; i < NUM_ARITH_TBLS; i++) { if (dc_in_use[i]) { - emit_byte(cinfo, i); - emit_byte(cinfo, cinfo->arith_dc_L[i] + (cinfo->arith_dc_U[i]<<4)); + emit_byte(cinfo, i); + emit_byte(cinfo, cinfo->arith_dc_L[i] + (cinfo->arith_dc_U[i]<<4)); } if (ac_in_use[i]) { - emit_byte(cinfo, i + 0x10); - emit_byte(cinfo, cinfo->arith_ac_K[i]); + emit_byte(cinfo, i + 0x10); + emit_byte(cinfo, cinfo->arith_ac_K[i]); } } } @@ -276,8 +276,8 @@ emit_dri (j_compress_ptr cinfo) /* Emit a DRI marker */ { emit_marker(cinfo, M_DRI); - - emit_2bytes(cinfo, 4); /* fixed length */ + + emit_2bytes(cinfo, 4); /* fixed length */ emit_2bytes(cinfo, (int) 
cinfo->restart_interval); } @@ -289,9 +289,9 @@ emit_sof (j_compress_ptr cinfo, JPEG_MARKER code) { int ci; jpeg_component_info *compptr; - + emit_marker(cinfo, code); - + emit_2bytes(cinfo, 3 * cinfo->num_components + 2 + 5 + 1); /* length */ /* Make sure image isn't bigger than SOF field can handle */ @@ -320,13 +320,13 @@ emit_sos (j_compress_ptr cinfo) { int i, td, ta; jpeg_component_info *compptr; - + emit_marker(cinfo, M_SOS); - + emit_2bytes(cinfo, 2 * cinfo->comps_in_scan + 2 + 1 + 3); /* length */ - + emit_byte(cinfo, cinfo->comps_in_scan); - + for (i = 0; i < cinfo->comps_in_scan; i++) { compptr = cinfo->cur_comp_info[i]; emit_byte(cinfo, compptr->component_id); @@ -354,22 +354,22 @@ emit_jfif_app0 (j_compress_ptr cinfo) /* Emit a JFIF-compliant APP0 marker */ { /* - * Length of APP0 block (2 bytes) - * Block ID (4 bytes - ASCII "JFIF") - * Zero byte (1 byte to terminate the ID string) - * Version Major, Minor (2 bytes - major first) - * Units (1 byte - 0x00 = none, 0x01 = inch, 0x02 = cm) - * Xdpu (2 bytes - dots per unit horizontal) - * Ydpu (2 bytes - dots per unit vertical) - * Thumbnail X size (1 byte) - * Thumbnail Y size (1 byte) + * Length of APP0 block (2 bytes) + * Block ID (4 bytes - ASCII "JFIF") + * Zero byte (1 byte to terminate the ID string) + * Version Major, Minor (2 bytes - major first) + * Units (1 byte - 0x00 = none, 0x01 = inch, 0x02 = cm) + * Xdpu (2 bytes - dots per unit horizontal) + * Ydpu (2 bytes - dots per unit vertical) + * Thumbnail X size (1 byte) + * Thumbnail Y size (1 byte) */ - + emit_marker(cinfo, M_APP0); - + emit_2bytes(cinfo, 2 + 4 + 1 + 2 + 1 + 2 + 2 + 1 + 1); /* length */ - emit_byte(cinfo, 0x4A); /* Identifier: ASCII "JFIF" */ + emit_byte(cinfo, 0x4A); /* Identifier: ASCII "JFIF" */ emit_byte(cinfo, 0x46); emit_byte(cinfo, 0x49); emit_byte(cinfo, 0x46); @@ -379,7 +379,7 @@ emit_jfif_app0 (j_compress_ptr cinfo) emit_byte(cinfo, cinfo->density_unit); /* Pixel size information */ emit_2bytes(cinfo, (int) 
cinfo->X_density); emit_2bytes(cinfo, (int) cinfo->Y_density); - emit_byte(cinfo, 0); /* No thumbnail image */ + emit_byte(cinfo, 0); /* No thumbnail image */ emit_byte(cinfo, 0); } @@ -389,12 +389,12 @@ emit_adobe_app14 (j_compress_ptr cinfo) /* Emit an Adobe APP14 marker */ { /* - * Length of APP14 block (2 bytes) - * Block ID (5 bytes - ASCII "Adobe") - * Version Number (2 bytes - currently 100) - * Flags0 (2 bytes - currently 0) - * Flags1 (2 bytes - currently 0) - * Color transform (1 byte) + * Length of APP14 block (2 bytes) + * Block ID (5 bytes - ASCII "Adobe") + * Version Number (2 bytes - currently 100) + * Flags0 (2 bytes - currently 0) + * Flags1 (2 bytes - currently 0) + * Color transform (1 byte) * * Although Adobe TN 5116 mentions Version = 101, all the Adobe files * now in circulation seem to use Version = 100, so that's what we write. @@ -403,28 +403,28 @@ emit_adobe_app14 (j_compress_ptr cinfo) * YCbCr, 2 if it's YCCK, 0 otherwise. Adobe's definition has to do with * whether the encoder performed a transformation, which is pretty useless. 
*/ - + emit_marker(cinfo, M_APP14); - + emit_2bytes(cinfo, 2 + 5 + 2 + 2 + 2 + 1); /* length */ - emit_byte(cinfo, 0x41); /* Identifier: ASCII "Adobe" */ + emit_byte(cinfo, 0x41); /* Identifier: ASCII "Adobe" */ emit_byte(cinfo, 0x64); emit_byte(cinfo, 0x6F); emit_byte(cinfo, 0x62); emit_byte(cinfo, 0x65); - emit_2bytes(cinfo, 100); /* Version */ - emit_2bytes(cinfo, 0); /* Flags0 */ - emit_2bytes(cinfo, 0); /* Flags1 */ + emit_2bytes(cinfo, 100); /* Version */ + emit_2bytes(cinfo, 0); /* Flags0 */ + emit_2bytes(cinfo, 0); /* Flags1 */ switch (cinfo->jpeg_color_space) { case JCS_YCbCr: - emit_byte(cinfo, 1); /* Color transform = 1 */ + emit_byte(cinfo, 1); /* Color transform = 1 */ break; case JCS_YCCK: - emit_byte(cinfo, 2); /* Color transform = 2 */ + emit_byte(cinfo, 2); /* Color transform = 2 */ break; default: - emit_byte(cinfo, 0); /* Color transform = 0 */ + emit_byte(cinfo, 0); /* Color transform = 0 */ break; } } @@ -442,12 +442,12 @@ METHODDEF(void) write_marker_header (j_compress_ptr cinfo, int marker, unsigned int datalen) /* Emit an arbitrary marker header */ { - if (datalen > (unsigned int) 65533) /* safety check */ + if (datalen > (unsigned int) 65533) /* safety check */ ERREXIT(cinfo, JERR_BAD_LENGTH); emit_marker(cinfo, (JPEG_MARKER) marker); - emit_2bytes(cinfo, (int) (datalen + 2)); /* total length */ + emit_2bytes(cinfo, (int) (datalen + 2)); /* total length */ } METHODDEF(void) @@ -474,12 +474,12 @@ write_file_header (j_compress_ptr cinfo) { my_marker_ptr marker = (my_marker_ptr) cinfo->marker; - emit_marker(cinfo, M_SOI); /* first the SOI */ + emit_marker(cinfo, M_SOI); /* first the SOI */ /* SOI is defined to reset restart interval to 0 */ marker->last_restart_interval = 0; - if (cinfo->write_JFIF_header) /* next an optional JFIF APP0 */ + if (cinfo->write_JFIF_header) /* next an optional JFIF APP0 */ emit_jfif_app0(cinfo); if (cinfo->write_Adobe_marker) /* next an optional Adobe APP14 */ emit_adobe_app14(cinfo); @@ -500,7 +500,7 @@ 
write_frame_header (j_compress_ptr cinfo) int ci, prec; boolean is_baseline; jpeg_component_info *compptr; - + /* Emit DQT for each quantization table. * Note that emit_dqt() suppresses any duplicate tables. */ @@ -520,9 +520,9 @@ write_frame_header (j_compress_ptr cinfo) } else { is_baseline = TRUE; for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; - ci++, compptr++) { + ci++, compptr++) { if (compptr->dc_tbl_no > 1 || compptr->ac_tbl_no > 1) - is_baseline = FALSE; + is_baseline = FALSE; } if (prec && is_baseline) { is_baseline = FALSE; @@ -539,11 +539,11 @@ write_frame_header (j_compress_ptr cinfo) emit_sof(cinfo, M_SOF9); /* SOF code for sequential arithmetic */ } else { if (cinfo->progressive_mode) - emit_sof(cinfo, M_SOF2); /* SOF code for progressive Huffman */ + emit_sof(cinfo, M_SOF2); /* SOF code for progressive Huffman */ else if (is_baseline) - emit_sof(cinfo, M_SOF0); /* SOF code for baseline implementation */ + emit_sof(cinfo, M_SOF0); /* SOF code for baseline implementation */ else - emit_sof(cinfo, M_SOF1); /* SOF code for non-baseline Huffman file */ + emit_sof(cinfo, M_SOF1); /* SOF code for non-baseline Huffman file */ } } @@ -575,10 +575,10 @@ write_scan_header (j_compress_ptr cinfo) compptr = cinfo->cur_comp_info[i]; /* DC needs no table for refinement scan */ if (cinfo->Ss == 0 && cinfo->Ah == 0) - emit_dht(cinfo, compptr->dc_tbl_no, FALSE); + emit_dht(cinfo, compptr->dc_tbl_no, FALSE); /* AC needs no table when not present */ if (cinfo->Se) - emit_dht(cinfo, compptr->ac_tbl_no, TRUE); + emit_dht(cinfo, compptr->ac_tbl_no, TRUE); } } @@ -627,9 +627,9 @@ write_tables_only (j_compress_ptr cinfo) if (! 
cinfo->arith_code) { for (i = 0; i < NUM_HUFF_TBLS; i++) { if (cinfo->dc_huff_tbl_ptrs[i] != NULL) - emit_dht(cinfo, i, FALSE); + emit_dht(cinfo, i, FALSE); if (cinfo->ac_huff_tbl_ptrs[i] != NULL) - emit_dht(cinfo, i, TRUE); + emit_dht(cinfo, i, TRUE); } } @@ -649,7 +649,7 @@ jinit_marker_writer (j_compress_ptr cinfo) /* Create the subobject */ marker = (my_marker_ptr) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(my_marker_writer)); + SIZEOF(my_marker_writer)); cinfo->marker = (struct jpeg_marker_writer *) marker; /* Initialize method pointers */ marker->pub.write_file_header = write_file_header; diff --git a/jcmaster.c b/jcmaster.c index dca031569..cf51c1ea3 100644 --- a/jcmaster.c +++ b/jcmaster.c @@ -10,7 +10,7 @@ * * This file contains master control logic for the JPEG compressor. * These routines are concerned with parameter validation, initial setup, - * and inter-pass control (determining the number of passes and the work + * and inter-pass control (determining the number of passes and the work * to be done in each pass). 
*/ @@ -23,20 +23,20 @@ /* Private state */ typedef enum { - main_pass, /* input data, also do first output step */ - huff_opt_pass, /* Huffman code optimization pass */ - output_pass /* data output pass */ + main_pass, /* input data, also do first output step */ + huff_opt_pass, /* Huffman code optimization pass */ + output_pass /* data output pass */ } c_pass_type; typedef struct { - struct jpeg_comp_master pub; /* public fields */ + struct jpeg_comp_master pub; /* public fields */ - c_pass_type pass_type; /* the type of the current pass */ + c_pass_type pass_type; /* the type of the current pass */ - int pass_number; /* # of passes completed */ - int total_passes; /* total # of passes needed */ + int pass_number; /* # of passes completed */ + int total_passes; /* total # of passes needed */ - int scan_number; /* current index in scan_info[] */ + int scan_number; /* current index in scan_info[] */ } my_comp_master; typedef my_comp_master * my_master_ptr; @@ -105,7 +105,7 @@ initial_setup (j_compress_ptr cinfo, boolean transcode_only) /* Check that number of components won't exceed internal array sizes */ if (cinfo->num_components > MAX_COMPONENTS) ERREXIT2(cinfo, JERR_COMPONENT_COUNT, cinfo->num_components, - MAX_COMPONENTS); + MAX_COMPONENTS); /* Compute maximum sampling factors; check factor validity */ cinfo->max_h_samp_factor = 1; @@ -113,12 +113,12 @@ initial_setup (j_compress_ptr cinfo, boolean transcode_only) for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; ci++, compptr++) { if (compptr->h_samp_factor<=0 || compptr->h_samp_factor>MAX_SAMP_FACTOR || - compptr->v_samp_factor<=0 || compptr->v_samp_factor>MAX_SAMP_FACTOR) + compptr->v_samp_factor<=0 || compptr->v_samp_factor>MAX_SAMP_FACTOR) ERREXIT(cinfo, JERR_BAD_SAMPLING); cinfo->max_h_samp_factor = MAX(cinfo->max_h_samp_factor, - compptr->h_samp_factor); + compptr->h_samp_factor); cinfo->max_v_samp_factor = MAX(cinfo->max_v_samp_factor, - compptr->v_samp_factor); + 
compptr->v_samp_factor); } /* Compute dimensions of components */ @@ -135,17 +135,17 @@ initial_setup (j_compress_ptr cinfo, boolean transcode_only) /* Size in DCT blocks */ compptr->width_in_blocks = (JDIMENSION) jdiv_round_up((long) cinfo->_jpeg_width * (long) compptr->h_samp_factor, - (long) (cinfo->max_h_samp_factor * DCTSIZE)); + (long) (cinfo->max_h_samp_factor * DCTSIZE)); compptr->height_in_blocks = (JDIMENSION) jdiv_round_up((long) cinfo->_jpeg_height * (long) compptr->v_samp_factor, - (long) (cinfo->max_v_samp_factor * DCTSIZE)); + (long) (cinfo->max_v_samp_factor * DCTSIZE)); /* Size in samples */ compptr->downsampled_width = (JDIMENSION) jdiv_round_up((long) cinfo->_jpeg_width * (long) compptr->h_samp_factor, - (long) cinfo->max_h_samp_factor); + (long) cinfo->max_h_samp_factor); compptr->downsampled_height = (JDIMENSION) jdiv_round_up((long) cinfo->_jpeg_height * (long) compptr->v_samp_factor, - (long) cinfo->max_v_samp_factor); + (long) cinfo->max_v_samp_factor); /* Mark component needed (this flag isn't actually used for compression) */ compptr->component_needed = TRUE; } @@ -155,7 +155,7 @@ initial_setup (j_compress_ptr cinfo, boolean transcode_only) */ cinfo->total_iMCU_rows = (JDIMENSION) jdiv_round_up((long) cinfo->_jpeg_height, - (long) (cinfo->max_v_samp_factor*DCTSIZE)); + (long) (cinfo->max_v_samp_factor*DCTSIZE)); } @@ -188,15 +188,15 @@ validate_script (j_compress_ptr cinfo) #ifdef C_PROGRESSIVE_SUPPORTED cinfo->progressive_mode = TRUE; last_bitpos_ptr = & last_bitpos[0][0]; - for (ci = 0; ci < cinfo->num_components; ci++) + for (ci = 0; ci < cinfo->num_components; ci++) for (coefi = 0; coefi < DCTSIZE2; coefi++) - *last_bitpos_ptr++ = -1; + *last_bitpos_ptr++ = -1; #else ERREXIT(cinfo, JERR_NOT_COMPILED); #endif } else { cinfo->progressive_mode = FALSE; - for (ci = 0; ci < cinfo->num_components; ci++) + for (ci = 0; ci < cinfo->num_components; ci++) component_sent[ci] = FALSE; } @@ -208,10 +208,10 @@ validate_script (j_compress_ptr cinfo) 
for (ci = 0; ci < ncomps; ci++) { thisi = scanptr->component_index[ci]; if (thisi < 0 || thisi >= cinfo->num_components) - ERREXIT1(cinfo, JERR_BAD_SCAN_SCRIPT, scanno); + ERREXIT1(cinfo, JERR_BAD_SCAN_SCRIPT, scanno); /* Components must appear in SOF order within each scan */ if (ci > 0 && thisi <= scanptr->component_index[ci-1]) - ERREXIT1(cinfo, JERR_BAD_SCAN_SCRIPT, scanno); + ERREXIT1(cinfo, JERR_BAD_SCAN_SCRIPT, scanno); } /* Validate progression parameters */ Ss = scanptr->Ss; @@ -233,43 +233,43 @@ validate_script (j_compress_ptr cinfo) #define MAX_AH_AL 13 #endif if (Ss < 0 || Ss >= DCTSIZE2 || Se < Ss || Se >= DCTSIZE2 || - Ah < 0 || Ah > MAX_AH_AL || Al < 0 || Al > MAX_AH_AL) - ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno); + Ah < 0 || Ah > MAX_AH_AL || Al < 0 || Al > MAX_AH_AL) + ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno); if (Ss == 0) { - if (Se != 0) /* DC and AC together not OK */ - ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno); + if (Se != 0) /* DC and AC together not OK */ + ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno); } else { - if (ncomps != 1) /* AC scans must be for only one component */ - ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno); + if (ncomps != 1) /* AC scans must be for only one component */ + ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno); } for (ci = 0; ci < ncomps; ci++) { - last_bitpos_ptr = & last_bitpos[scanptr->component_index[ci]][0]; - if (Ss != 0 && last_bitpos_ptr[0] < 0) /* AC without prior DC scan */ - ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno); - for (coefi = Ss; coefi <= Se; coefi++) { - if (last_bitpos_ptr[coefi] < 0) { - /* first scan of this coefficient */ - if (Ah != 0) - ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno); - } else { - /* not first scan */ - if (Ah != last_bitpos_ptr[coefi] || Al != Ah-1) - ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno); - } - last_bitpos_ptr[coefi] = Al; - } + last_bitpos_ptr = & last_bitpos[scanptr->component_index[ci]][0]; + if (Ss != 0 && last_bitpos_ptr[0] < 0) /* AC without 
prior DC scan */ + ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno); + for (coefi = Ss; coefi <= Se; coefi++) { + if (last_bitpos_ptr[coefi] < 0) { + /* first scan of this coefficient */ + if (Ah != 0) + ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno); + } else { + /* not first scan */ + if (Ah != last_bitpos_ptr[coefi] || Al != Ah-1) + ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno); + } + last_bitpos_ptr[coefi] = Al; + } } #endif } else { /* For sequential JPEG, all progression parameters must be these: */ if (Ss != 0 || Se != DCTSIZE2-1 || Ah != 0 || Al != 0) - ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno); + ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno); /* Make sure components are not sent twice */ for (ci = 0; ci < ncomps; ci++) { - thisi = scanptr->component_index[ci]; - if (component_sent[thisi]) - ERREXIT1(cinfo, JERR_BAD_SCAN_SCRIPT, scanno); - component_sent[thisi] = TRUE; + thisi = scanptr->component_index[ci]; + if (component_sent[thisi]) + ERREXIT1(cinfo, JERR_BAD_SCAN_SCRIPT, scanno); + component_sent[thisi] = TRUE; } } } @@ -284,13 +284,13 @@ validate_script (j_compress_ptr cinfo) */ for (ci = 0; ci < cinfo->num_components; ci++) { if (last_bitpos[ci][0] < 0) - ERREXIT(cinfo, JERR_MISSING_DATA); + ERREXIT(cinfo, JERR_MISSING_DATA); } #endif } else { for (ci = 0; ci < cinfo->num_components; ci++) { if (! 
component_sent[ci]) - ERREXIT(cinfo, JERR_MISSING_DATA); + ERREXIT(cinfo, JERR_MISSING_DATA); } } } @@ -313,7 +313,7 @@ select_scan_parameters (j_compress_ptr cinfo) cinfo->comps_in_scan = scanptr->comps_in_scan; for (ci = 0; ci < scanptr->comps_in_scan; ci++) { cinfo->cur_comp_info[ci] = - &cinfo->comp_info[scanptr->component_index[ci]]; + &cinfo->comp_info[scanptr->component_index[ci]]; } cinfo->Ss = scanptr->Ss; cinfo->Se = scanptr->Se; @@ -326,7 +326,7 @@ select_scan_parameters (j_compress_ptr cinfo) /* Prepare for single sequential-JPEG scan containing all components */ if (cinfo->num_components > MAX_COMPS_IN_SCAN) ERREXIT2(cinfo, JERR_COMPONENT_COUNT, cinfo->num_components, - MAX_COMPS_IN_SCAN); + MAX_COMPS_IN_SCAN); cinfo->comps_in_scan = cinfo->num_components; for (ci = 0; ci < cinfo->num_components; ci++) { cinfo->cur_comp_info[ci] = &cinfo->comp_info[ci]; @@ -346,16 +346,16 @@ per_scan_setup (j_compress_ptr cinfo) { int ci, mcublks, tmp; jpeg_component_info *compptr; - + if (cinfo->comps_in_scan == 1) { - + /* Noninterleaved (single-component) scan */ compptr = cinfo->cur_comp_info[0]; - + /* Overall image size in MCUs */ cinfo->MCUs_per_row = compptr->width_in_blocks; cinfo->MCU_rows_in_scan = compptr->height_in_blocks; - + /* For noninterleaved scan, always one block per MCU */ compptr->MCU_width = 1; compptr->MCU_height = 1; @@ -368,28 +368,28 @@ per_scan_setup (j_compress_ptr cinfo) tmp = (int) (compptr->height_in_blocks % compptr->v_samp_factor); if (tmp == 0) tmp = compptr->v_samp_factor; compptr->last_row_height = tmp; - + /* Prepare array describing MCU composition */ cinfo->blocks_in_MCU = 1; cinfo->MCU_membership[0] = 0; - + } else { - + /* Interleaved (multi-component) scan */ if (cinfo->comps_in_scan <= 0 || cinfo->comps_in_scan > MAX_COMPS_IN_SCAN) ERREXIT2(cinfo, JERR_COMPONENT_COUNT, cinfo->comps_in_scan, - MAX_COMPS_IN_SCAN); - + MAX_COMPS_IN_SCAN); + /* Overall image size in MCUs */ cinfo->MCUs_per_row = (JDIMENSION) jdiv_round_up((long) 
cinfo->_jpeg_width, - (long) (cinfo->max_h_samp_factor*DCTSIZE)); + (long) (cinfo->max_h_samp_factor*DCTSIZE)); cinfo->MCU_rows_in_scan = (JDIMENSION) jdiv_round_up((long) cinfo->_jpeg_height, - (long) (cinfo->max_v_samp_factor*DCTSIZE)); - + (long) (cinfo->max_v_samp_factor*DCTSIZE)); + cinfo->blocks_in_MCU = 0; - + for (ci = 0; ci < cinfo->comps_in_scan; ci++) { compptr = cinfo->cur_comp_info[ci]; /* Sampling factors give # of blocks of component in each MCU */ @@ -407,12 +407,12 @@ per_scan_setup (j_compress_ptr cinfo) /* Prepare array describing MCU composition */ mcublks = compptr->MCU_blocks; if (cinfo->blocks_in_MCU + mcublks > C_MAX_BLOCKS_IN_MCU) - ERREXIT(cinfo, JERR_BAD_MCU_SIZE); + ERREXIT(cinfo, JERR_BAD_MCU_SIZE); while (mcublks-- > 0) { - cinfo->MCU_membership[cinfo->blocks_in_MCU++] = ci; + cinfo->MCU_membership[cinfo->blocks_in_MCU++] = ci; } } - + } /* Convert restart specified in rows to actual MCU count. */ @@ -452,8 +452,8 @@ prepare_for_pass (j_compress_ptr cinfo) (*cinfo->fdct->start_pass) (cinfo); (*cinfo->entropy->start_pass) (cinfo, cinfo->optimize_coding); (*cinfo->coef->start_pass) (cinfo, - (master->total_passes > 1 ? - JBUF_SAVE_AND_PASS : JBUF_PASS_THRU)); + (master->total_passes > 1 ? + JBUF_SAVE_AND_PASS : JBUF_PASS_THRU)); (*cinfo->main->start_pass) (cinfo, JBUF_PASS_THRU); if (cinfo->optimize_coding) { /* No immediate data output; postpone writing frame/scan headers */ @@ -581,7 +581,7 @@ jinit_c_master_control (j_compress_ptr cinfo, boolean transcode_only) master = (my_master_ptr) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(my_comp_master)); + SIZEOF(my_comp_master)); cinfo->master = (struct jpeg_comp_master *) master; master->pub.prepare_for_pass = prepare_for_pass; master->pub.pass_startup = pass_startup; @@ -602,7 +602,7 @@ jinit_c_master_control (j_compress_ptr cinfo, boolean transcode_only) cinfo->num_scans = 1; } - if (cinfo->progressive_mode && !cinfo->arith_code) /* TEMPORARY HACK ??? 
*/ + if (cinfo->progressive_mode && !cinfo->arith_code) /* TEMPORARY HACK ??? */ cinfo->optimize_coding = TRUE; /* assume default tables no good for progressive mode */ /* Initialize my private state */ diff --git a/jcomapi.c b/jcomapi.c index 9b1fa7568..4ca20428e 100644 --- a/jcomapi.c +++ b/jcomapi.c @@ -72,8 +72,8 @@ jpeg_destroy (j_common_ptr cinfo) /* NB: mem pointer is NULL if memory mgr failed to initialize. */ if (cinfo->mem != NULL) (*cinfo->mem->self_destruct) (cinfo); - cinfo->mem = NULL; /* be safe if jpeg_destroy is called twice */ - cinfo->global_state = 0; /* mark it destroyed */ + cinfo->mem = NULL; /* be safe if jpeg_destroy is called twice */ + cinfo->global_state = 0; /* mark it destroyed */ } @@ -89,7 +89,7 @@ jpeg_alloc_quant_table (j_common_ptr cinfo) tbl = (JQUANT_TBL *) (*cinfo->mem->alloc_small) (cinfo, JPOOL_PERMANENT, SIZEOF(JQUANT_TBL)); - tbl->sent_table = FALSE; /* make sure this is false in any new table */ + tbl->sent_table = FALSE; /* make sure this is false in any new table */ return tbl; } @@ -101,6 +101,6 @@ jpeg_alloc_huff_table (j_common_ptr cinfo) tbl = (JHUFF_TBL *) (*cinfo->mem->alloc_small) (cinfo, JPOOL_PERMANENT, SIZEOF(JHUFF_TBL)); - tbl->sent_table = FALSE; /* make sure this is false in any new table */ + tbl->sent_table = FALSE; /* make sure this is false in any new table */ return tbl; } diff --git a/jconfig.txt b/jconfig.txt index dda62367d..ba829ad21 100644 --- a/jconfig.txt +++ b/jconfig.txt @@ -94,10 +94,10 @@ /* Define "boolean" as unsigned char, not int, on Windows systems. 
*/ #ifdef _WIN32 -#ifndef __RPCNDR_H__ /* don't conflict if rpcndr.h already read */ +#ifndef __RPCNDR_H__ /* don't conflict if rpcndr.h already read */ typedef unsigned char boolean; #endif -#define HAVE_BOOLEAN /* prevent jmorecfg.h from redefining it */ +#define HAVE_BOOLEAN /* prevent jmorecfg.h from redefining it */ #endif @@ -130,11 +130,11 @@ typedef unsigned char boolean; /* These defines indicate which image (non-JPEG) file formats are allowed. */ -#define BMP_SUPPORTED /* BMP image file format */ -#define GIF_SUPPORTED /* GIF image file format */ -#define PPM_SUPPORTED /* PBMPLUS PPM/PGM image file format */ -#undef RLE_SUPPORTED /* Utah RLE image file format */ -#define TARGA_SUPPORTED /* Targa image file format */ +#define BMP_SUPPORTED /* BMP image file format */ +#define GIF_SUPPORTED /* GIF image file format */ +#define PPM_SUPPORTED /* PBMPLUS PPM/PGM image file format */ +#undef RLE_SUPPORTED /* Utah RLE image file format */ +#define TARGA_SUPPORTED /* Targa image file format */ /* Define this if you want to name both input and output files on the command * line, rather than using stdout and optionally stdin. You MUST do this if diff --git a/jcparam.c b/jcparam.c index f4e5eec8e..ba81108c2 100644 --- a/jcparam.c +++ b/jcparam.c @@ -25,8 +25,8 @@ GLOBAL(void) jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl, - const unsigned int *basic_table, - int scale_factor, boolean force_baseline) + const unsigned int *basic_table, + int scale_factor, boolean force_baseline) /* Define a quantization table equal to the basic_table times * a scale factor (given as a percentage). 
* If force_baseline is TRUE, the computed quantization table entries @@ -55,7 +55,7 @@ jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl, if (temp <= 0L) temp = 1L; if (temp > 32767L) temp = 32767L; /* max quantizer needed for 12 bits */ if (force_baseline && temp > 255L) - temp = 255L; /* limit to baseline range if requested */ + temp = 255L; /* limit to baseline range if requested */ (*qtblptr)->quantval[i] = (UINT16) temp; } @@ -100,16 +100,16 @@ jpeg_default_qtables (j_compress_ptr cinfo, boolean force_baseline) { /* Set up two quantization tables using the specified scaling */ jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl, - cinfo->q_scale_factor[0], force_baseline); + cinfo->q_scale_factor[0], force_baseline); jpeg_add_quant_table(cinfo, 1, std_chrominance_quant_tbl, - cinfo->q_scale_factor[1], force_baseline); + cinfo->q_scale_factor[1], force_baseline); } #endif GLOBAL(void) jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor, - boolean force_baseline) + boolean force_baseline) /* Set or change the 'quality' (quantization) setting, using default tables * and a straight percentage-scaling quality scale. 
In most cases it's better * to use jpeg_set_quality (below); this entry point is provided for @@ -118,9 +118,9 @@ jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor, { /* Set up two quantization tables using the specified scaling */ jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl, - scale_factor, force_baseline); + scale_factor, force_baseline); jpeg_add_quant_table(cinfo, 1, std_chrominance_quant_tbl, - scale_factor, force_baseline); + scale_factor, force_baseline); } @@ -192,12 +192,12 @@ jpeg_set_defaults (j_compress_ptr cinfo) if (cinfo->comp_info == NULL) cinfo->comp_info = (jpeg_component_info *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT, - MAX_COMPONENTS * SIZEOF(jpeg_component_info)); + MAX_COMPONENTS * SIZEOF(jpeg_component_info)); /* Initialize everything not dependent on the color space */ #if JPEG_LIB_VERSION >= 70 - cinfo->scale_num = 1; /* 1:1 scaling */ + cinfo->scale_num = 1; /* 1:1 scaling */ cinfo->scale_denom = 1; #endif cinfo->data_precision = BITS_IN_JSAMPLE; @@ -262,8 +262,8 @@ jpeg_set_defaults (j_compress_ptr cinfo) */ cinfo->JFIF_major_version = 1; /* Default JFIF version = 1.01 */ cinfo->JFIF_minor_version = 1; - cinfo->density_unit = 0; /* Pixel size is unknown by default */ - cinfo->X_density = 1; /* Pixel aspect ratio is square by default */ + cinfo->density_unit = 0; /* Pixel size is unknown by default */ + cinfo->X_density = 1; /* Pixel aspect ratio is square by default */ cinfo->Y_density = 1; /* Choose JPEG colorspace based on input space, set defaults accordingly */ @@ -389,7 +389,7 @@ jpeg_set_colorspace (j_compress_ptr cinfo, J_COLOR_SPACE colorspace) cinfo->num_components = cinfo->input_components; if (cinfo->num_components < 1 || cinfo->num_components > MAX_COMPONENTS) ERREXIT2(cinfo, JERR_COMPONENT_COUNT, cinfo->num_components, - MAX_COMPONENTS); + MAX_COMPONENTS); for (ci = 0; ci < cinfo->num_components; ci++) { SET_COMP(ci, ci, 1,1, 0, 0,0); } @@ -404,7 +404,7 @@ 
jpeg_set_colorspace (j_compress_ptr cinfo, J_COLOR_SPACE colorspace) LOCAL(jpeg_scan_info *) fill_a_scan (jpeg_scan_info * scanptr, int ci, - int Ss, int Se, int Ah, int Al) + int Ss, int Se, int Ah, int Al) /* Support routine: generate one scan for specified component */ { scanptr->comps_in_scan = 1; @@ -419,7 +419,7 @@ fill_a_scan (jpeg_scan_info * scanptr, int ci, LOCAL(jpeg_scan_info *) fill_scans (jpeg_scan_info * scanptr, int ncomps, - int Ss, int Se, int Ah, int Al) + int Ss, int Se, int Ah, int Al) /* Support routine: generate one scan for each component */ { int ci; @@ -482,9 +482,9 @@ jpeg_simple_progression (j_compress_ptr cinfo) } else { /* All-purpose script for other color spaces. */ if (ncomps > MAX_COMPS_IN_SCAN) - nscans = 6 * ncomps; /* 2 DC + 4 AC scans per component */ + nscans = 6 * ncomps; /* 2 DC + 4 AC scans per component */ else - nscans = 2 + 4 * ncomps; /* 2 DC scans; 4 AC scans per component */ + nscans = 2 + 4 * ncomps; /* 2 DC scans; 4 AC scans per component */ } /* Allocate space for script. @@ -498,7 +498,7 @@ jpeg_simple_progression (j_compress_ptr cinfo) cinfo->script_space_size = MAX(nscans, 10); cinfo->script_space = (jpeg_scan_info *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT, - cinfo->script_space_size * SIZEOF(jpeg_scan_info)); + cinfo->script_space_size * SIZEOF(jpeg_scan_info)); } scanptr = cinfo->script_space; cinfo->scan_info = scanptr; diff --git a/jcphuff.c b/jcphuff.c index 310287175..b764b650c 100644 --- a/jcphuff.c +++ b/jcphuff.c @@ -15,7 +15,7 @@ #define JPEG_INTERNALS #include "jinclude.h" #include "jpeglib.h" -#include "jchuff.h" /* Declarations shared with jchuff.c */ +#include "jchuff.h" /* Declarations shared with jchuff.c */ #ifdef C_PROGRESSIVE_SUPPORTED @@ -30,24 +30,24 @@ typedef struct { /* Bit-level coding status. * next_output_byte/free_in_buffer are local copies of cinfo->dest fields. 
*/ - JOCTET * next_output_byte; /* => next byte to write in buffer */ - size_t free_in_buffer; /* # of byte spaces remaining in buffer */ - INT32 put_buffer; /* current bit-accumulation buffer */ - int put_bits; /* # of bits now in it */ - j_compress_ptr cinfo; /* link to cinfo (needed for dump_buffer) */ + JOCTET * next_output_byte; /* => next byte to write in buffer */ + size_t free_in_buffer; /* # of byte spaces remaining in buffer */ + INT32 put_buffer; /* current bit-accumulation buffer */ + int put_bits; /* # of bits now in it */ + j_compress_ptr cinfo; /* link to cinfo (needed for dump_buffer) */ /* Coding status for DC components */ int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */ /* Coding status for AC components */ - int ac_tbl_no; /* the table number of the single component */ - unsigned int EOBRUN; /* run length of EOBs */ - unsigned int BE; /* # of buffered correction bits before MCU */ - char * bit_buffer; /* buffer for correction bits (1 per char) */ + int ac_tbl_no; /* the table number of the single component */ + unsigned int EOBRUN; /* run length of EOBs */ + unsigned int BE; /* # of buffered correction bits before MCU */ + char * bit_buffer; /* buffer for correction bits (1 per char) */ /* packing correction bits tightly would save some space but cost time... */ - unsigned int restarts_to_go; /* MCUs left in this restart interval */ - int next_restart_num; /* next restart number to write (0-7) */ + unsigned int restarts_to_go; /* MCUs left in this restart interval */ + int next_restart_num; /* next restart number to write (0-7) */ /* Pointers to derived tables (these workspaces have image lifespan). * Since any one scan codes only DC or only AC, we only need one set @@ -67,7 +67,7 @@ typedef phuff_entropy_encoder * phuff_entropy_ptr; * The minimum safe size is 64 bits. 
*/ -#define MAX_CORR_BITS 1000 /* Max # of correction bits I can buffer */ +#define MAX_CORR_BITS 1000 /* Max # of correction bits I can buffer */ /* IRIGHT_SHIFT is like RIGHT_SHIFT, but works on int rather than INT32. * We assume that int right shift is unsigned if INT32 right shift is, @@ -75,25 +75,25 @@ typedef phuff_entropy_encoder * phuff_entropy_ptr; */ #ifdef RIGHT_SHIFT_IS_UNSIGNED -#define ISHIFT_TEMPS int ishift_temp; +#define ISHIFT_TEMPS int ishift_temp; #define IRIGHT_SHIFT(x,shft) \ - ((ishift_temp = (x)) < 0 ? \ - (ishift_temp >> (shft)) | ((~0) << (16-(shft))) : \ - (ishift_temp >> (shft))) + ((ishift_temp = (x)) < 0 ? \ + (ishift_temp >> (shft)) | ((~0) << (16-(shft))) : \ + (ishift_temp >> (shft))) #else #define ISHIFT_TEMPS -#define IRIGHT_SHIFT(x,shft) ((x) >> (shft)) +#define IRIGHT_SHIFT(x,shft) ((x) >> (shft)) #endif /* Forward declarations */ METHODDEF(boolean) encode_mcu_DC_first JPP((j_compress_ptr cinfo, - JBLOCKROW *MCU_data)); + JBLOCKROW *MCU_data)); METHODDEF(boolean) encode_mcu_AC_first JPP((j_compress_ptr cinfo, - JBLOCKROW *MCU_data)); + JBLOCKROW *MCU_data)); METHODDEF(boolean) encode_mcu_DC_refine JPP((j_compress_ptr cinfo, - JBLOCKROW *MCU_data)); + JBLOCKROW *MCU_data)); METHODDEF(boolean) encode_mcu_AC_refine JPP((j_compress_ptr cinfo, - JBLOCKROW *MCU_data)); + JBLOCKROW *MCU_data)); METHODDEF(void) finish_pass_phuff JPP((j_compress_ptr cinfo)); METHODDEF(void) finish_pass_gather_phuff JPP((j_compress_ptr cinfo)); @@ -104,7 +104,7 @@ METHODDEF(void) finish_pass_gather_phuff JPP((j_compress_ptr cinfo)); METHODDEF(void) start_pass_phuff (j_compress_ptr cinfo, boolean gather_statistics) -{ +{ phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy; boolean is_DC_band; int ci, tbl; @@ -130,9 +130,9 @@ start_pass_phuff (j_compress_ptr cinfo, boolean gather_statistics) entropy->pub.encode_mcu = encode_mcu_AC_refine; /* AC refinement needs a correction bit buffer */ if (entropy->bit_buffer == NULL) - entropy->bit_buffer = 
(char *) - (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - MAX_CORR_BITS * SIZEOF(char)); + entropy->bit_buffer = (char *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + MAX_CORR_BITS * SIZEOF(char)); } } if (gather_statistics) @@ -149,8 +149,8 @@ start_pass_phuff (j_compress_ptr cinfo, boolean gather_statistics) entropy->last_dc_val[ci] = 0; /* Get table index */ if (is_DC_band) { - if (cinfo->Ah != 0) /* DC refinement needs no table */ - continue; + if (cinfo->Ah != 0) /* DC refinement needs no table */ + continue; tbl = compptr->dc_tbl_no; } else { entropy->ac_tbl_no = tbl = compptr->ac_tbl_no; @@ -163,15 +163,15 @@ start_pass_phuff (j_compress_ptr cinfo, boolean gather_statistics) /* Allocate and zero the statistics tables */ /* Note that jpeg_gen_optimal_table expects 257 entries in each table! */ if (entropy->count_ptrs[tbl] == NULL) - entropy->count_ptrs[tbl] = (long *) - (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - 257 * SIZEOF(long)); + entropy->count_ptrs[tbl] = (long *) + (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, + 257 * SIZEOF(long)); MEMZERO(entropy->count_ptrs[tbl], 257 * SIZEOF(long)); } else { /* Compute derived values for Huffman table */ /* We may do this more than once for a table, but it's not expensive */ jpeg_make_c_derived_tbl(cinfo, is_DC_band, tbl, - & entropy->derived_tbls[tbl]); + & entropy->derived_tbls[tbl]); } } @@ -196,9 +196,9 @@ start_pass_phuff (j_compress_ptr cinfo, boolean gather_statistics) /* Emit a byte */ #define emit_byte(entropy,val) \ - { *(entropy)->next_output_byte++ = (JOCTET) (val); \ - if (--(entropy)->free_in_buffer == 0) \ - dump_buffer(entropy); } + { *(entropy)->next_output_byte++ = (JOCTET) (val); \ + if (--(entropy)->free_in_buffer == 0) \ + dump_buffer(entropy); } LOCAL(void) @@ -236,21 +236,21 @@ emit_bits (phuff_entropy_ptr entropy, unsigned int code, int size) ERREXIT(entropy->cinfo, JERR_HUFF_MISSING_CODE); if 
(entropy->gather_statistics) - return; /* do nothing if we're only getting stats */ + return; /* do nothing if we're only getting stats */ put_buffer &= (((INT32) 1)<put_buffer; /* and merge with old buffer contents */ while (put_bits >= 8) { int c = (int) ((put_buffer >> 16) & 0xFF); - + emit_byte(entropy, c); - if (c == 0xFF) { /* need to stuff a zero byte? */ + if (c == 0xFF) { /* need to stuff a zero byte? */ emit_byte(entropy, 0); } put_buffer <<= 8; @@ -293,10 +293,10 @@ emit_symbol (phuff_entropy_ptr entropy, int tbl_no, int symbol) LOCAL(void) emit_buffered_bits (phuff_entropy_ptr entropy, char * bufstart, - unsigned int nbits) + unsigned int nbits) { if (entropy->gather_statistics) - return; /* no real work */ + return; /* no real work */ while (nbits > 0) { emit_bits(entropy, (unsigned int) (*bufstart), 1); @@ -315,7 +315,7 @@ emit_eobrun (phuff_entropy_ptr entropy) { register int temp, nbits; - if (entropy->EOBRUN > 0) { /* if there is any pending EOBRUN */ + if (entropy->EOBRUN > 0) { /* if there is any pending EOBRUN */ temp = entropy->EOBRUN; nbits = 0; while ((temp >>= 1)) @@ -409,12 +409,12 @@ encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data) /* Encode the DC coefficient difference per section G.1.2.1 */ temp2 = temp; if (temp < 0) { - temp = -temp; /* temp is abs value of input */ + temp = -temp; /* temp is abs value of input */ /* For a negative input, want temp2 = bitwise complement of abs(input) */ /* This code assumes we are on a two's complement machine */ temp2--; } - + /* Find the number of bits needed for the magnitude of the coefficient */ nbits = 0; while (temp) { @@ -426,13 +426,13 @@ encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data) */ if (nbits > MAX_COEF_BITS+1) ERREXIT(cinfo, JERR_BAD_DCT_COEF); - + /* Count/emit the Huffman-coded symbol for the number of bits */ emit_symbol(entropy, compptr->dc_tbl_no, nbits); - + /* Emit that number of bits of the value, if positive, */ /* or the complement of its 
magnitude, if negative. */ - if (nbits) /* emit_bits rejects calls with size 0 */ + if (nbits) /* emit_bits rejects calls with size 0 */ emit_bits(entropy, (unsigned int) temp2, nbits); } @@ -481,9 +481,9 @@ encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data) block = MCU_data[0]; /* Encode the AC coefficients per section G.1.2.2, fig. G.3 */ - - r = 0; /* r = run length of zeros */ - + + r = 0; /* r = run length of zeros */ + for (k = cinfo->Ss; k <= Se; k++) { if ((temp = (*block)[jpeg_natural_order[k]]) == 0) { r++; @@ -495,12 +495,12 @@ encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data) * interwoven with finding the abs value (temp) and output bits (temp2). */ if (temp < 0) { - temp = -temp; /* temp is abs value of input */ - temp >>= Al; /* apply the point transform */ + temp = -temp; /* temp is abs value of input */ + temp >>= Al; /* apply the point transform */ /* For a negative coef, want temp2 = bitwise complement of abs(coef) */ temp2 = ~temp; } else { - temp >>= Al; /* apply the point transform */ + temp >>= Al; /* apply the point transform */ temp2 = temp; } /* Watch out for case that nonzero coef is zero after point transform */ @@ -519,7 +519,7 @@ encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data) } /* Find the number of bits needed for the magnitude of the coefficient */ - nbits = 1; /* there must be at least one 1 bit */ + nbits = 1; /* there must be at least one 1 bit */ while ((temp >>= 1)) nbits++; /* Check for out-of-range coefficient values */ @@ -533,13 +533,13 @@ encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data) /* or the complement of its magnitude, if negative. 
*/ emit_bits(entropy, (unsigned int) temp2, nbits); - r = 0; /* reset zero run length */ + r = 0; /* reset zero run length */ } - if (r > 0) { /* If there are trailing zeroes, */ - entropy->EOBRUN++; /* count an EOB */ + if (r > 0) { /* If there are trailing zeroes, */ + entropy->EOBRUN++; /* count an EOB */ if (entropy->EOBRUN == 0x7FFF) - emit_eobrun(entropy); /* force it out to avoid overflow */ + emit_eobrun(entropy); /* force it out to avoid overflow */ } cinfo->dest->next_output_byte = entropy->next_output_byte; @@ -648,17 +648,17 @@ encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data) * in C, we shift after obtaining the absolute value. */ if (temp < 0) - temp = -temp; /* temp is abs value of input */ - temp >>= Al; /* apply the point transform */ - absvalues[k] = temp; /* save abs value for main pass */ + temp = -temp; /* temp is abs value of input */ + temp >>= Al; /* apply the point transform */ + absvalues[k] = temp; /* save abs value for main pass */ if (temp == 1) - EOB = k; /* EOB = index of last newly-nonzero coef */ + EOB = k; /* EOB = index of last newly-nonzero coef */ } /* Encode the AC coefficients per section G.1.2.3, fig. 
G.7 */ - - r = 0; /* r = run length of zeros */ - BR = 0; /* BR = count of buffered bits added now */ + + r = 0; /* r = run length of zeros */ + BR = 0; /* BR = count of buffered bits added now */ BR_buffer = entropy->bit_buffer + entropy->BE; /* Append bits to buffer */ for (k = cinfo->Ss; k <= Se; k++) { @@ -705,12 +705,12 @@ encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data) emit_buffered_bits(entropy, BR_buffer, BR); BR_buffer = entropy->bit_buffer; /* BE bits are gone now */ BR = 0; - r = 0; /* reset zero run length */ + r = 0; /* reset zero run length */ } - if (r > 0 || BR > 0) { /* If there are trailing zeroes, */ - entropy->EOBRUN++; /* count an EOB */ - entropy->BE += BR; /* concat my correction bits to older ones */ + if (r > 0 || BR > 0) { /* If there are trailing zeroes, */ + entropy->EOBRUN++; /* count an EOB */ + entropy->BE += BR; /* concat my correction bits to older ones */ /* We force out the EOB if we risk either: * 1. overflow of the EOB counter; * 2. overflow of the correction bit buffer during the next MCU. 
@@ -742,7 +742,7 @@ encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data) METHODDEF(void) finish_pass_phuff (j_compress_ptr cinfo) -{ +{ phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy; entropy->next_output_byte = cinfo->dest->next_output_byte; @@ -784,8 +784,8 @@ finish_pass_gather_phuff (j_compress_ptr cinfo) for (ci = 0; ci < cinfo->comps_in_scan; ci++) { compptr = cinfo->cur_comp_info[ci]; if (is_DC_band) { - if (cinfo->Ah != 0) /* DC refinement needs no table */ - continue; + if (cinfo->Ah != 0) /* DC refinement needs no table */ + continue; tbl = compptr->dc_tbl_no; } else { tbl = compptr->ac_tbl_no; @@ -816,7 +816,7 @@ jinit_phuff_encoder (j_compress_ptr cinfo) entropy = (phuff_entropy_ptr) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(phuff_entropy_encoder)); + SIZEOF(phuff_entropy_encoder)); cinfo->entropy = (struct jpeg_entropy_encoder *) entropy; entropy->pub.start_pass = start_pass_phuff; @@ -825,7 +825,7 @@ jinit_phuff_encoder (j_compress_ptr cinfo) entropy->derived_tbls[i] = NULL; entropy->count_ptrs[i] = NULL; } - entropy->bit_buffer = NULL; /* needed only in AC refinement scan */ + entropy->bit_buffer = NULL; /* needed only in AC refinement scan */ } #endif /* C_PROGRESSIVE_SUPPORTED */ diff --git a/jcprepct.c b/jcprepct.c index fa93333db..785ff887e 100644 --- a/jcprepct.c +++ b/jcprepct.c @@ -58,12 +58,12 @@ typedef struct { */ JSAMPARRAY color_buf[MAX_COMPONENTS]; - JDIMENSION rows_to_go; /* counts rows remaining in source image */ - int next_buf_row; /* index of next row to store in color_buf */ + JDIMENSION rows_to_go; /* counts rows remaining in source image */ + int next_buf_row; /* index of next row to store in color_buf */ -#ifdef CONTEXT_ROWS_SUPPORTED /* only needed for context case */ - int this_row_group; /* starting row index of group to process */ - int next_buf_stop; /* downsample when we reach this index */ +#ifdef CONTEXT_ROWS_SUPPORTED /* only needed for context case */ + int 
this_row_group; /* starting row index of group to process */ + int next_buf_stop; /* downsample when we reach this index */ #endif } my_prep_controller; @@ -104,13 +104,13 @@ start_pass_prep (j_compress_ptr cinfo, J_BUF_MODE pass_mode) LOCAL(void) expand_bottom_edge (JSAMPARRAY image_data, JDIMENSION num_cols, - int input_rows, int output_rows) + int input_rows, int output_rows) { register int row; for (row = input_rows; row < output_rows; row++) { jcopy_sample_rows(image_data, input_rows-1, image_data, row, - 1, num_cols); + 1, num_cols); } } @@ -126,10 +126,10 @@ expand_bottom_edge (JSAMPARRAY image_data, JDIMENSION num_cols, METHODDEF(void) pre_process_data (j_compress_ptr cinfo, - JSAMPARRAY input_buf, JDIMENSION *in_row_ctr, - JDIMENSION in_rows_avail, - JSAMPIMAGE output_buf, JDIMENSION *out_row_group_ctr, - JDIMENSION out_row_groups_avail) + JSAMPARRAY input_buf, JDIMENSION *in_row_ctr, + JDIMENSION in_rows_avail, + JSAMPIMAGE output_buf, JDIMENSION *out_row_group_ctr, + JDIMENSION out_row_groups_avail) { my_prep_ptr prep = (my_prep_ptr) cinfo->prep; int numrows, ci; @@ -137,32 +137,32 @@ pre_process_data (j_compress_ptr cinfo, jpeg_component_info * compptr; while (*in_row_ctr < in_rows_avail && - *out_row_group_ctr < out_row_groups_avail) { + *out_row_group_ctr < out_row_groups_avail) { /* Do color conversion to fill the conversion buffer. */ inrows = in_rows_avail - *in_row_ctr; numrows = cinfo->max_v_samp_factor - prep->next_buf_row; numrows = (int) MIN((JDIMENSION) numrows, inrows); (*cinfo->cconvert->color_convert) (cinfo, input_buf + *in_row_ctr, - prep->color_buf, - (JDIMENSION) prep->next_buf_row, - numrows); + prep->color_buf, + (JDIMENSION) prep->next_buf_row, + numrows); *in_row_ctr += numrows; prep->next_buf_row += numrows; prep->rows_to_go -= numrows; /* If at bottom of image, pad to fill the conversion buffer. 
*/ if (prep->rows_to_go == 0 && - prep->next_buf_row < cinfo->max_v_samp_factor) { + prep->next_buf_row < cinfo->max_v_samp_factor) { for (ci = 0; ci < cinfo->num_components; ci++) { - expand_bottom_edge(prep->color_buf[ci], cinfo->image_width, - prep->next_buf_row, cinfo->max_v_samp_factor); + expand_bottom_edge(prep->color_buf[ci], cinfo->image_width, + prep->next_buf_row, cinfo->max_v_samp_factor); } prep->next_buf_row = cinfo->max_v_samp_factor; } /* If we've filled the conversion buffer, empty it. */ if (prep->next_buf_row == cinfo->max_v_samp_factor) { (*cinfo->downsample->downsample) (cinfo, - prep->color_buf, (JDIMENSION) 0, - output_buf, *out_row_group_ctr); + prep->color_buf, (JDIMENSION) 0, + output_buf, *out_row_group_ctr); prep->next_buf_row = 0; (*out_row_group_ctr)++; } @@ -170,16 +170,16 @@ pre_process_data (j_compress_ptr cinfo, * Note we assume the caller is providing a one-iMCU-height output buffer! */ if (prep->rows_to_go == 0 && - *out_row_group_ctr < out_row_groups_avail) { + *out_row_group_ctr < out_row_groups_avail) { for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; - ci++, compptr++) { - expand_bottom_edge(output_buf[ci], - compptr->width_in_blocks * DCTSIZE, - (int) (*out_row_group_ctr * compptr->v_samp_factor), - (int) (out_row_groups_avail * compptr->v_samp_factor)); + ci++, compptr++) { + expand_bottom_edge(output_buf[ci], + compptr->width_in_blocks * DCTSIZE, + (int) (*out_row_group_ctr * compptr->v_samp_factor), + (int) (out_row_groups_avail * compptr->v_samp_factor)); } *out_row_group_ctr = out_row_groups_avail; - break; /* can exit outer loop without test */ + break; /* can exit outer loop without test */ } } } @@ -193,10 +193,10 @@ pre_process_data (j_compress_ptr cinfo, METHODDEF(void) pre_process_context (j_compress_ptr cinfo, - JSAMPARRAY input_buf, JDIMENSION *in_row_ctr, - JDIMENSION in_rows_avail, - JSAMPIMAGE output_buf, JDIMENSION *out_row_group_ctr, - JDIMENSION out_row_groups_avail) + JSAMPARRAY 
input_buf, JDIMENSION *in_row_ctr, + JDIMENSION in_rows_avail, + JSAMPIMAGE output_buf, JDIMENSION *out_row_group_ctr, + JDIMENSION out_row_groups_avail) { my_prep_ptr prep = (my_prep_ptr) cinfo->prep; int numrows, ci; @@ -210,19 +210,19 @@ pre_process_context (j_compress_ptr cinfo, numrows = prep->next_buf_stop - prep->next_buf_row; numrows = (int) MIN((JDIMENSION) numrows, inrows); (*cinfo->cconvert->color_convert) (cinfo, input_buf + *in_row_ctr, - prep->color_buf, - (JDIMENSION) prep->next_buf_row, - numrows); + prep->color_buf, + (JDIMENSION) prep->next_buf_row, + numrows); /* Pad at top of image, if first time through */ if (prep->rows_to_go == cinfo->image_height) { - for (ci = 0; ci < cinfo->num_components; ci++) { - int row; - for (row = 1; row <= cinfo->max_v_samp_factor; row++) { - jcopy_sample_rows(prep->color_buf[ci], 0, - prep->color_buf[ci], -row, - 1, cinfo->image_width); - } - } + for (ci = 0; ci < cinfo->num_components; ci++) { + int row; + for (row = 1; row <= cinfo->max_v_samp_factor; row++) { + jcopy_sample_rows(prep->color_buf[ci], 0, + prep->color_buf[ci], -row, + 1, cinfo->image_width); + } + } } *in_row_ctr += numrows; prep->next_buf_row += numrows; @@ -230,29 +230,29 @@ pre_process_context (j_compress_ptr cinfo, } else { /* Return for more data, unless we are at the bottom of the image. */ if (prep->rows_to_go != 0) - break; + break; /* When at bottom of image, pad to fill the conversion buffer. */ if (prep->next_buf_row < prep->next_buf_stop) { - for (ci = 0; ci < cinfo->num_components; ci++) { - expand_bottom_edge(prep->color_buf[ci], cinfo->image_width, - prep->next_buf_row, prep->next_buf_stop); - } - prep->next_buf_row = prep->next_buf_stop; + for (ci = 0; ci < cinfo->num_components; ci++) { + expand_bottom_edge(prep->color_buf[ci], cinfo->image_width, + prep->next_buf_row, prep->next_buf_stop); + } + prep->next_buf_row = prep->next_buf_stop; } } /* If we've gotten enough data, downsample a row group. 
*/ if (prep->next_buf_row == prep->next_buf_stop) { (*cinfo->downsample->downsample) (cinfo, - prep->color_buf, - (JDIMENSION) prep->this_row_group, - output_buf, *out_row_group_ctr); + prep->color_buf, + (JDIMENSION) prep->this_row_group, + output_buf, *out_row_group_ctr); (*out_row_group_ctr)++; /* Advance pointers with wraparound as necessary. */ prep->this_row_group += cinfo->max_v_samp_factor; if (prep->this_row_group >= buf_height) - prep->this_row_group = 0; + prep->this_row_group = 0; if (prep->next_buf_row >= buf_height) - prep->next_buf_row = 0; + prep->next_buf_row = 0; prep->next_buf_stop = prep->next_buf_row + cinfo->max_v_samp_factor; } } @@ -277,8 +277,8 @@ create_context_buffer (j_compress_ptr cinfo) */ fake_buffer = (JSAMPARRAY) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - (cinfo->num_components * 5 * rgroup_height) * - SIZEOF(JSAMPROW)); + (cinfo->num_components * 5 * rgroup_height) * + SIZEOF(JSAMPROW)); for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; ci++, compptr++) { @@ -289,11 +289,11 @@ create_context_buffer (j_compress_ptr cinfo) true_buffer = (*cinfo->mem->alloc_sarray) ((j_common_ptr) cinfo, JPOOL_IMAGE, (JDIMENSION) (((long) compptr->width_in_blocks * DCTSIZE * - cinfo->max_h_samp_factor) / compptr->h_samp_factor), + cinfo->max_h_samp_factor) / compptr->h_samp_factor), (JDIMENSION) (3 * rgroup_height)); /* Copy true buffer row pointers into the middle of the fake row array */ MEMCOPY(fake_buffer + rgroup_height, true_buffer, - 3 * rgroup_height * SIZEOF(JSAMPROW)); + 3 * rgroup_height * SIZEOF(JSAMPROW)); /* Fill in the above and below wraparound pointers */ for (i = 0; i < rgroup_height; i++) { fake_buffer[i] = true_buffer[2 * rgroup_height + i]; @@ -318,12 +318,12 @@ jinit_c_prep_controller (j_compress_ptr cinfo, boolean need_full_buffer) int ci; jpeg_component_info * compptr; - if (need_full_buffer) /* safety check */ + if (need_full_buffer) /* safety check */ ERREXIT(cinfo, 
JERR_BAD_BUFFER_MODE); prep = (my_prep_ptr) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(my_prep_controller)); + SIZEOF(my_prep_controller)); cinfo->prep = (struct jpeg_c_prep_controller *) prep; prep->pub.start_pass = start_pass_prep; @@ -343,12 +343,12 @@ jinit_c_prep_controller (j_compress_ptr cinfo, boolean need_full_buffer) /* No context, just make it tall enough for one row group */ prep->pub.pre_process_data = pre_process_data; for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; - ci++, compptr++) { + ci++, compptr++) { prep->color_buf[ci] = (*cinfo->mem->alloc_sarray) - ((j_common_ptr) cinfo, JPOOL_IMAGE, - (JDIMENSION) (((long) compptr->width_in_blocks * DCTSIZE * - cinfo->max_h_samp_factor) / compptr->h_samp_factor), - (JDIMENSION) cinfo->max_v_samp_factor); + ((j_common_ptr) cinfo, JPOOL_IMAGE, + (JDIMENSION) (((long) compptr->width_in_blocks * DCTSIZE * + cinfo->max_h_samp_factor) / compptr->h_samp_factor), + (JDIMENSION) cinfo->max_v_samp_factor); } } } diff --git a/jcsample.c b/jcsample.c index eea376f90..4662bee36 100644 --- a/jcsample.c +++ b/jcsample.c @@ -54,13 +54,13 @@ /* Pointer to routine to downsample a single component */ typedef JMETHOD(void, downsample1_ptr, - (j_compress_ptr cinfo, jpeg_component_info * compptr, - JSAMPARRAY input_data, JSAMPARRAY output_data)); + (j_compress_ptr cinfo, jpeg_component_info * compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data)); /* Private subobject */ typedef struct { - struct jpeg_downsampler pub; /* public fields */ + struct jpeg_downsampler pub; /* public fields */ /* Downsampling method pointers, one per component */ downsample1_ptr methods[MAX_COMPONENTS]; @@ -87,7 +87,7 @@ start_pass_downsample (j_compress_ptr cinfo) LOCAL(void) expand_right_edge (JSAMPARRAY image_data, int num_rows, - JDIMENSION input_cols, JDIMENSION output_cols) + JDIMENSION input_cols, JDIMENSION output_cols) { register JSAMPROW ptr; register JSAMPLE pixval; @@ -98,9 +98,9 @@ 
expand_right_edge (JSAMPARRAY image_data, int num_rows, if (numcols > 0) { for (row = 0; row < num_rows; row++) { ptr = image_data[row] + input_cols; - pixval = ptr[-1]; /* don't need GETJSAMPLE() here */ + pixval = ptr[-1]; /* don't need GETJSAMPLE() here */ for (count = numcols; count > 0; count--) - *ptr++ = pixval; + *ptr++ = pixval; } } } @@ -114,8 +114,8 @@ expand_right_edge (JSAMPARRAY image_data, int num_rows, METHODDEF(void) sep_downsample (j_compress_ptr cinfo, - JSAMPIMAGE input_buf, JDIMENSION in_row_index, - JSAMPIMAGE output_buf, JDIMENSION out_row_group_index) + JSAMPIMAGE input_buf, JDIMENSION in_row_index, + JSAMPIMAGE output_buf, JDIMENSION out_row_group_index) { my_downsample_ptr downsample = (my_downsample_ptr) cinfo->downsample; int ci; @@ -140,10 +140,10 @@ sep_downsample (j_compress_ptr cinfo, METHODDEF(void) int_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, - JSAMPARRAY input_data, JSAMPARRAY output_data) + JSAMPARRAY input_data, JSAMPARRAY output_data) { int inrow, outrow, h_expand, v_expand, numpix, numpix2, h, v; - JDIMENSION outcol, outcol_h; /* outcol_h == outcol*h_expand */ + JDIMENSION outcol, outcol_h; /* outcol_h == outcol*h_expand */ JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE; JSAMPROW inptr, outptr; INT32 outvalue; @@ -158,19 +158,19 @@ int_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, * efficient. 
*/ expand_right_edge(input_data, cinfo->max_v_samp_factor, - cinfo->image_width, output_cols * h_expand); + cinfo->image_width, output_cols * h_expand); inrow = 0; for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) { outptr = output_data[outrow]; for (outcol = 0, outcol_h = 0; outcol < output_cols; - outcol++, outcol_h += h_expand) { + outcol++, outcol_h += h_expand) { outvalue = 0; for (v = 0; v < v_expand; v++) { - inptr = input_data[inrow+v] + outcol_h; - for (h = 0; h < h_expand; h++) { - outvalue += (INT32) GETJSAMPLE(*inptr++); - } + inptr = input_data[inrow+v] + outcol_h; + for (h = 0; h < h_expand; h++) { + outvalue += (INT32) GETJSAMPLE(*inptr++); + } } *outptr++ = (JSAMPLE) ((outvalue + numpix2) / numpix); } @@ -187,14 +187,14 @@ int_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, METHODDEF(void) fullsize_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, - JSAMPARRAY input_data, JSAMPARRAY output_data) + JSAMPARRAY input_data, JSAMPARRAY output_data) { /* Copy the data */ jcopy_sample_rows(input_data, 0, output_data, 0, - cinfo->max_v_samp_factor, cinfo->image_width); + cinfo->max_v_samp_factor, cinfo->image_width); /* Edge-expand */ expand_right_edge(output_data, cinfo->max_v_samp_factor, - cinfo->image_width, compptr->width_in_blocks * DCTSIZE); + cinfo->image_width, compptr->width_in_blocks * DCTSIZE); } @@ -212,7 +212,7 @@ fullsize_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, METHODDEF(void) h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, - JSAMPARRAY input_data, JSAMPARRAY output_data) + JSAMPARRAY input_data, JSAMPARRAY output_data) { int outrow; JDIMENSION outcol; @@ -225,16 +225,16 @@ h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, * efficient. 
*/ expand_right_edge(input_data, cinfo->max_v_samp_factor, - cinfo->image_width, output_cols * 2); + cinfo->image_width, output_cols * 2); for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) { outptr = output_data[outrow]; inptr = input_data[outrow]; - bias = 0; /* bias = 0,1,0,1,... for successive samples */ + bias = 0; /* bias = 0,1,0,1,... for successive samples */ for (outcol = 0; outcol < output_cols; outcol++) { *outptr++ = (JSAMPLE) ((GETJSAMPLE(*inptr) + GETJSAMPLE(inptr[1]) - + bias) >> 1); - bias ^= 1; /* 0=>1, 1=>0 */ + + bias) >> 1); + bias ^= 1; /* 0=>1, 1=>0 */ inptr += 2; } } @@ -249,7 +249,7 @@ h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, METHODDEF(void) h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, - JSAMPARRAY input_data, JSAMPARRAY output_data) + JSAMPARRAY input_data, JSAMPARRAY output_data) { int inrow, outrow; JDIMENSION outcol; @@ -262,19 +262,19 @@ h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, * efficient. */ expand_right_edge(input_data, cinfo->max_v_samp_factor, - cinfo->image_width, output_cols * 2); + cinfo->image_width, output_cols * 2); inrow = 0; for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) { outptr = output_data[outrow]; inptr0 = input_data[inrow]; inptr1 = input_data[inrow+1]; - bias = 1; /* bias = 1,2,1,2,... for successive samples */ + bias = 1; /* bias = 1,2,1,2,... 
for successive samples */ for (outcol = 0; outcol < output_cols; outcol++) { *outptr++ = (JSAMPLE) ((GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) + - GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]) - + bias) >> 2); - bias ^= 3; /* 1=>2, 2=>1 */ + GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]) + + bias) >> 2); + bias ^= 3; /* 1=>2, 2=>1 */ inptr0 += 2; inptr1 += 2; } inrow += 2; @@ -292,7 +292,7 @@ h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, METHODDEF(void) h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, - JSAMPARRAY input_data, JSAMPARRAY output_data) + JSAMPARRAY input_data, JSAMPARRAY output_data) { int inrow, outrow; JDIMENSION colctr; @@ -305,7 +305,7 @@ h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, * efficient. */ expand_right_edge(input_data - 1, cinfo->max_v_samp_factor + 2, - cinfo->image_width, output_cols * 2); + cinfo->image_width, output_cols * 2); /* We don't bother to form the individual "smoothed" input pixel values; * we can directly compute the output which is the average of the four @@ -333,14 +333,14 @@ h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, /* Special case for first column: pretend column -1 is same as column 0 */ membersum = GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) + - GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]); + GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]); neighsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[1]) + - GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[1]) + - GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[2]) + - GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[2]); + GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[1]) + + GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[2]) + + GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[2]); neighsum += neighsum; neighsum += GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[2]) + - GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[2]); + GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[2]); membersum = membersum * 
memberscale + neighsum * neighscale; *outptr++ = (JSAMPLE) ((membersum + 32768) >> 16); inptr0 += 2; inptr1 += 2; above_ptr += 2; below_ptr += 2; @@ -348,17 +348,17 @@ h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, for (colctr = output_cols - 2; colctr > 0; colctr--) { /* sum of pixels directly mapped to this output element */ membersum = GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) + - GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]); + GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]); /* sum of edge-neighbor pixels */ neighsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[1]) + - GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[1]) + - GETJSAMPLE(inptr0[-1]) + GETJSAMPLE(inptr0[2]) + - GETJSAMPLE(inptr1[-1]) + GETJSAMPLE(inptr1[2]); + GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[1]) + + GETJSAMPLE(inptr0[-1]) + GETJSAMPLE(inptr0[2]) + + GETJSAMPLE(inptr1[-1]) + GETJSAMPLE(inptr1[2]); /* The edge-neighbors count twice as much as corner-neighbors */ neighsum += neighsum; /* Add in the corner-neighbors */ neighsum += GETJSAMPLE(above_ptr[-1]) + GETJSAMPLE(above_ptr[2]) + - GETJSAMPLE(below_ptr[-1]) + GETJSAMPLE(below_ptr[2]); + GETJSAMPLE(below_ptr[-1]) + GETJSAMPLE(below_ptr[2]); /* form final output scaled up by 2^16 */ membersum = membersum * memberscale + neighsum * neighscale; /* round, descale and output it */ @@ -368,14 +368,14 @@ h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, /* Special case for last column */ membersum = GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) + - GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]); + GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]); neighsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[1]) + - GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[1]) + - GETJSAMPLE(inptr0[-1]) + GETJSAMPLE(inptr0[1]) + - GETJSAMPLE(inptr1[-1]) + GETJSAMPLE(inptr1[1]); + GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[1]) + + GETJSAMPLE(inptr0[-1]) + GETJSAMPLE(inptr0[1]) + + GETJSAMPLE(inptr1[-1]) + 
GETJSAMPLE(inptr1[1]); neighsum += neighsum; neighsum += GETJSAMPLE(above_ptr[-1]) + GETJSAMPLE(above_ptr[1]) + - GETJSAMPLE(below_ptr[-1]) + GETJSAMPLE(below_ptr[1]); + GETJSAMPLE(below_ptr[-1]) + GETJSAMPLE(below_ptr[1]); membersum = membersum * memberscale + neighsum * neighscale; *outptr = (JSAMPLE) ((membersum + 32768) >> 16); @@ -392,7 +392,7 @@ h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, METHODDEF(void) fullsize_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, - JSAMPARRAY input_data, JSAMPARRAY output_data) + JSAMPARRAY input_data, JSAMPARRAY output_data) { int outrow; JDIMENSION colctr; @@ -406,7 +406,7 @@ fullsize_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, * efficient. */ expand_right_edge(input_data - 1, cinfo->max_v_samp_factor + 2, - cinfo->image_width, output_cols); + cinfo->image_width, output_cols); /* Each of the eight neighbor pixels contributes a fraction SF to the * smoothed pixel, while the main pixel contributes (1-8*SF). 
In order @@ -425,10 +425,10 @@ fullsize_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, /* Special case for first column */ colsum = GETJSAMPLE(*above_ptr++) + GETJSAMPLE(*below_ptr++) + - GETJSAMPLE(*inptr); + GETJSAMPLE(*inptr); membersum = GETJSAMPLE(*inptr++); nextcolsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(*below_ptr) + - GETJSAMPLE(*inptr); + GETJSAMPLE(*inptr); neighsum = colsum + (colsum - membersum) + nextcolsum; membersum = membersum * memberscale + neighsum * neighscale; *outptr++ = (JSAMPLE) ((membersum + 32768) >> 16); @@ -438,7 +438,7 @@ fullsize_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, membersum = GETJSAMPLE(*inptr++); above_ptr++; below_ptr++; nextcolsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(*below_ptr) + - GETJSAMPLE(*inptr); + GETJSAMPLE(*inptr); neighsum = lastcolsum + (colsum - membersum) + nextcolsum; membersum = membersum * memberscale + neighsum * neighscale; *outptr++ = (JSAMPLE) ((membersum + 32768) >> 16); @@ -472,7 +472,7 @@ jinit_downsampler (j_compress_ptr cinfo) downsample = (my_downsample_ptr) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(my_downsampler)); + SIZEOF(my_downsampler)); cinfo->downsample = (struct jpeg_downsampler *) downsample; downsample->pub.start_pass = start_pass_downsample; downsample->pub.downsample = sep_downsample; @@ -485,35 +485,35 @@ jinit_downsampler (j_compress_ptr cinfo) for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; ci++, compptr++) { if (compptr->h_samp_factor == cinfo->max_h_samp_factor && - compptr->v_samp_factor == cinfo->max_v_samp_factor) { + compptr->v_samp_factor == cinfo->max_v_samp_factor) { #ifdef INPUT_SMOOTHING_SUPPORTED if (cinfo->smoothing_factor) { - downsample->methods[ci] = fullsize_smooth_downsample; - downsample->pub.need_context_rows = TRUE; + downsample->methods[ci] = fullsize_smooth_downsample; + downsample->pub.need_context_rows = TRUE; } else #endif - downsample->methods[ci] = 
fullsize_downsample; + downsample->methods[ci] = fullsize_downsample; } else if (compptr->h_samp_factor * 2 == cinfo->max_h_samp_factor && - compptr->v_samp_factor == cinfo->max_v_samp_factor) { + compptr->v_samp_factor == cinfo->max_v_samp_factor) { smoothok = FALSE; if (jsimd_can_h2v1_downsample()) downsample->methods[ci] = jsimd_h2v1_downsample; else downsample->methods[ci] = h2v1_downsample; } else if (compptr->h_samp_factor * 2 == cinfo->max_h_samp_factor && - compptr->v_samp_factor * 2 == cinfo->max_v_samp_factor) { + compptr->v_samp_factor * 2 == cinfo->max_v_samp_factor) { #ifdef INPUT_SMOOTHING_SUPPORTED if (cinfo->smoothing_factor) { - downsample->methods[ci] = h2v2_smooth_downsample; - downsample->pub.need_context_rows = TRUE; + downsample->methods[ci] = h2v2_smooth_downsample; + downsample->pub.need_context_rows = TRUE; } else #endif - if (jsimd_can_h2v2_downsample()) - downsample->methods[ci] = jsimd_h2v2_downsample; - else - downsample->methods[ci] = h2v2_downsample; + if (jsimd_can_h2v2_downsample()) + downsample->methods[ci] = jsimd_h2v2_downsample; + else + downsample->methods[ci] = h2v2_downsample; } else if ((cinfo->max_h_samp_factor % compptr->h_samp_factor) == 0 && - (cinfo->max_v_samp_factor % compptr->v_samp_factor) == 0) { + (cinfo->max_v_samp_factor % compptr->v_samp_factor) == 0) { smoothok = FALSE; downsample->methods[ci] = int_downsample; } else diff --git a/jctrans.c b/jctrans.c index 916e872fa..c35329392 100644 --- a/jctrans.c +++ b/jctrans.c @@ -18,9 +18,9 @@ /* Forward declarations */ LOCAL(void) transencode_master_selection - JPP((j_compress_ptr cinfo, jvirt_barray_ptr * coef_arrays)); + JPP((j_compress_ptr cinfo, jvirt_barray_ptr * coef_arrays)); LOCAL(void) transencode_coef_controller - JPP((j_compress_ptr cinfo, jvirt_barray_ptr * coef_arrays)); + JPP((j_compress_ptr cinfo, jvirt_barray_ptr * coef_arrays)); /* @@ -48,7 +48,7 @@ jpeg_write_coefficients (j_compress_ptr cinfo, jvirt_barray_ptr * coef_arrays) /* Perform master 
selection of active modules */ transencode_master_selection(cinfo, coef_arrays); /* Wait for jpeg_finish_compress() call */ - cinfo->next_scanline = 0; /* so jpeg_write_marker works */ + cinfo->next_scanline = 0; /* so jpeg_write_marker works */ cinfo->global_state = CSTATE_WRCOEFS; } @@ -62,7 +62,7 @@ jpeg_write_coefficients (j_compress_ptr cinfo, jvirt_barray_ptr * coef_arrays) GLOBAL(void) jpeg_copy_critical_parameters (j_decompress_ptr srcinfo, - j_compress_ptr dstinfo) + j_compress_ptr dstinfo) { JQUANT_TBL ** qtblptr; jpeg_component_info *incomp, *outcomp; @@ -96,10 +96,10 @@ jpeg_copy_critical_parameters (j_decompress_ptr srcinfo, if (srcinfo->quant_tbl_ptrs[tblno] != NULL) { qtblptr = & dstinfo->quant_tbl_ptrs[tblno]; if (*qtblptr == NULL) - *qtblptr = jpeg_alloc_quant_table((j_common_ptr) dstinfo); + *qtblptr = jpeg_alloc_quant_table((j_common_ptr) dstinfo); MEMCOPY((*qtblptr)->quantval, - srcinfo->quant_tbl_ptrs[tblno]->quantval, - SIZEOF((*qtblptr)->quantval)); + srcinfo->quant_tbl_ptrs[tblno]->quantval, + SIZEOF((*qtblptr)->quantval)); (*qtblptr)->sent_table = FALSE; } } @@ -109,7 +109,7 @@ jpeg_copy_critical_parameters (j_decompress_ptr srcinfo, dstinfo->num_components = srcinfo->num_components; if (dstinfo->num_components < 1 || dstinfo->num_components > MAX_COMPONENTS) ERREXIT2(dstinfo, JERR_COMPONENT_COUNT, dstinfo->num_components, - MAX_COMPONENTS); + MAX_COMPONENTS); for (ci = 0, incomp = srcinfo->comp_info, outcomp = dstinfo->comp_info; ci < dstinfo->num_components; ci++, incomp++, outcomp++) { outcomp->component_id = incomp->component_id; @@ -122,14 +122,14 @@ jpeg_copy_critical_parameters (j_decompress_ptr srcinfo, */ tblno = outcomp->quant_tbl_no; if (tblno < 0 || tblno >= NUM_QUANT_TBLS || - srcinfo->quant_tbl_ptrs[tblno] == NULL) + srcinfo->quant_tbl_ptrs[tblno] == NULL) ERREXIT1(dstinfo, JERR_NO_QUANT_TABLE, tblno); slot_quant = srcinfo->quant_tbl_ptrs[tblno]; c_quant = incomp->quant_table; if (c_quant != NULL) { for (coefi = 0; coefi < 
DCTSIZE2; coefi++) { - if (c_quant->quantval[coefi] != slot_quant->quantval[coefi]) - ERREXIT1(dstinfo, JERR_MISMATCHED_QUANT_TABLE, tblno); + if (c_quant->quantval[coefi] != slot_quant->quantval[coefi]) + ERREXIT1(dstinfo, JERR_MISMATCHED_QUANT_TABLE, tblno); } } /* Note: we do not copy the source's Huffman table assignments; @@ -163,7 +163,7 @@ jpeg_copy_critical_parameters (j_decompress_ptr srcinfo, LOCAL(void) transencode_master_selection (j_compress_ptr cinfo, - jvirt_barray_ptr * coef_arrays) + jvirt_barray_ptr * coef_arrays) { /* Although we don't actually use input_components for transcoding, * jcmaster.c's initial_setup will complain if input_components is 0. @@ -219,10 +219,10 @@ transencode_master_selection (j_compress_ptr cinfo, typedef struct { struct jpeg_c_coef_controller pub; /* public fields */ - JDIMENSION iMCU_row_num; /* iMCU row # within image */ - JDIMENSION mcu_ctr; /* counts MCUs processed in current row */ - int MCU_vert_offset; /* counts MCU rows within iMCU row */ - int MCU_rows_per_iMCU_row; /* number of such rows needed */ + JDIMENSION iMCU_row_num; /* iMCU row # within image */ + JDIMENSION mcu_ctr; /* counts MCUs processed in current row */ + int MCU_vert_offset; /* counts MCU rows within iMCU row */ + int MCU_rows_per_iMCU_row; /* number of such rows needed */ /* Virtual block array for each component. 
*/ jvirt_barray_ptr * whole_image; @@ -289,7 +289,7 @@ METHODDEF(boolean) compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf) { my_coef_ptr coef = (my_coef_ptr) cinfo->coef; - JDIMENSION MCU_col_num; /* index of current MCU within row */ + JDIMENSION MCU_col_num; /* index of current MCU within row */ JDIMENSION last_MCU_col = cinfo->MCUs_per_row - 1; JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1; int blkn, ci, xindex, yindex, yoffset, blockcnt; @@ -312,44 +312,44 @@ compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf) for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row; yoffset++) { for (MCU_col_num = coef->mcu_ctr; MCU_col_num < cinfo->MCUs_per_row; - MCU_col_num++) { + MCU_col_num++) { /* Construct list of pointers to DCT blocks belonging to this MCU */ - blkn = 0; /* index of current DCT block within MCU */ + blkn = 0; /* index of current DCT block within MCU */ for (ci = 0; ci < cinfo->comps_in_scan; ci++) { - compptr = cinfo->cur_comp_info[ci]; - start_col = MCU_col_num * compptr->MCU_width; - blockcnt = (MCU_col_num < last_MCU_col) ? compptr->MCU_width - : compptr->last_col_width; - for (yindex = 0; yindex < compptr->MCU_height; yindex++) { - if (coef->iMCU_row_num < last_iMCU_row || - yindex+yoffset < compptr->last_row_height) { - /* Fill in pointers to real blocks in this row */ - buffer_ptr = buffer[ci][yindex+yoffset] + start_col; - for (xindex = 0; xindex < blockcnt; xindex++) - MCU_buffer[blkn++] = buffer_ptr++; - } else { - /* At bottom of image, need a whole row of dummy blocks */ - xindex = 0; - } - /* Fill in any dummy blocks needed in this row. - * Dummy blocks are filled in the same way as in jccoefct.c: - * all zeroes in the AC entries, DC entries equal to previous - * block's DC value. The init routine has already zeroed the - * AC entries, so we need only set the DC entries correctly. 
- */ - for (; xindex < compptr->MCU_width; xindex++) { - MCU_buffer[blkn] = coef->dummy_buffer[blkn]; - MCU_buffer[blkn][0][0] = MCU_buffer[blkn-1][0][0]; - blkn++; - } - } + compptr = cinfo->cur_comp_info[ci]; + start_col = MCU_col_num * compptr->MCU_width; + blockcnt = (MCU_col_num < last_MCU_col) ? compptr->MCU_width + : compptr->last_col_width; + for (yindex = 0; yindex < compptr->MCU_height; yindex++) { + if (coef->iMCU_row_num < last_iMCU_row || + yindex+yoffset < compptr->last_row_height) { + /* Fill in pointers to real blocks in this row */ + buffer_ptr = buffer[ci][yindex+yoffset] + start_col; + for (xindex = 0; xindex < blockcnt; xindex++) + MCU_buffer[blkn++] = buffer_ptr++; + } else { + /* At bottom of image, need a whole row of dummy blocks */ + xindex = 0; + } + /* Fill in any dummy blocks needed in this row. + * Dummy blocks are filled in the same way as in jccoefct.c: + * all zeroes in the AC entries, DC entries equal to previous + * block's DC value. The init routine has already zeroed the + * AC entries, so we need only set the DC entries correctly. + */ + for (; xindex < compptr->MCU_width; xindex++) { + MCU_buffer[blkn] = coef->dummy_buffer[blkn]; + MCU_buffer[blkn][0][0] = MCU_buffer[blkn-1][0][0]; + blkn++; + } + } } /* Try to write the MCU. */ if (! 
(*cinfo->entropy->encode_mcu) (cinfo, MCU_buffer)) { - /* Suspension forced; update state counters and exit */ - coef->MCU_vert_offset = yoffset; - coef->mcu_ctr = MCU_col_num; - return FALSE; + /* Suspension forced; update state counters and exit */ + coef->MCU_vert_offset = yoffset; + coef->mcu_ctr = MCU_col_num; + return FALSE; } } /* Completed an MCU row, but perhaps not an iMCU row */ @@ -372,7 +372,7 @@ compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf) LOCAL(void) transencode_coef_controller (j_compress_ptr cinfo, - jvirt_barray_ptr * coef_arrays) + jvirt_barray_ptr * coef_arrays) { my_coef_ptr coef; JBLOCKROW buffer; @@ -380,7 +380,7 @@ transencode_coef_controller (j_compress_ptr cinfo, coef = (my_coef_ptr) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(my_coef_controller)); + SIZEOF(my_coef_controller)); cinfo->coef = (struct jpeg_c_coef_controller *) coef; coef->pub.start_pass = start_pass_coef; coef->pub.compress_data = compress_output; @@ -391,7 +391,7 @@ transencode_coef_controller (j_compress_ptr cinfo, /* Allocate and pre-zero space for dummy DCT blocks. */ buffer = (JBLOCKROW) (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE, - C_MAX_BLOCKS_IN_MCU * SIZEOF(JBLOCK)); + C_MAX_BLOCKS_IN_MCU * SIZEOF(JBLOCK)); jzero_far((void FAR *) buffer, C_MAX_BLOCKS_IN_MCU * SIZEOF(JBLOCK)); for (i = 0; i < C_MAX_BLOCKS_IN_MCU; i++) { coef->dummy_buffer[i] = buffer + i; diff --git a/jdapimin.c b/jdapimin.c index cadb59fce..db53fdcea 100644 --- a/jdapimin.c +++ b/jdapimin.c @@ -32,12 +32,12 @@ jpeg_CreateDecompress (j_decompress_ptr cinfo, int version, size_t structsize) int i; /* Guard against version mismatches between library and caller. 
*/ - cinfo->mem = NULL; /* so jpeg_destroy knows mem mgr not called */ + cinfo->mem = NULL; /* so jpeg_destroy knows mem mgr not called */ if (version != JPEG_LIB_VERSION) ERREXIT2(cinfo, JERR_BAD_LIB_VERSION, JPEG_LIB_VERSION, version); if (structsize != SIZEOF(struct jpeg_decompress_struct)) - ERREXIT2(cinfo, JERR_BAD_STRUCT_SIZE, - (int) SIZEOF(struct jpeg_decompress_struct), (int) structsize); + ERREXIT2(cinfo, JERR_BAD_STRUCT_SIZE, + (int) SIZEOF(struct jpeg_decompress_struct), (int) structsize); /* For debugging purposes, we zero the whole master structure. * But the application has already set the err pointer, and may have set @@ -121,22 +121,22 @@ default_decompress_parms (j_decompress_ptr cinfo) cinfo->jpeg_color_space = JCS_GRAYSCALE; cinfo->out_color_space = JCS_GRAYSCALE; break; - + case 3: if (cinfo->saw_JFIF_marker) { cinfo->jpeg_color_space = JCS_YCbCr; /* JFIF implies YCbCr */ } else if (cinfo->saw_Adobe_marker) { switch (cinfo->Adobe_transform) { case 0: - cinfo->jpeg_color_space = JCS_RGB; - break; + cinfo->jpeg_color_space = JCS_RGB; + break; case 1: - cinfo->jpeg_color_space = JCS_YCbCr; - break; + cinfo->jpeg_color_space = JCS_YCbCr; + break; default: - WARNMS1(cinfo, JWRN_ADOBE_XFORM, cinfo->Adobe_transform); - cinfo->jpeg_color_space = JCS_YCbCr; /* assume it's YCbCr */ - break; + WARNMS1(cinfo, JWRN_ADOBE_XFORM, cinfo->Adobe_transform); + cinfo->jpeg_color_space = JCS_YCbCr; /* assume it's YCbCr */ + break; } } else { /* Saw no special markers, try to guess from the component IDs */ @@ -145,31 +145,31 @@ default_decompress_parms (j_decompress_ptr cinfo) int cid2 = cinfo->comp_info[2].component_id; if (cid0 == 1 && cid1 == 2 && cid2 == 3) - cinfo->jpeg_color_space = JCS_YCbCr; /* assume JFIF w/out marker */ + cinfo->jpeg_color_space = JCS_YCbCr; /* assume JFIF w/out marker */ else if (cid0 == 82 && cid1 == 71 && cid2 == 66) - cinfo->jpeg_color_space = JCS_RGB; /* ASCII 'R', 'G', 'B' */ + cinfo->jpeg_color_space = JCS_RGB; /* ASCII 'R', 'G', 
'B' */ else { - TRACEMS3(cinfo, 1, JTRC_UNKNOWN_IDS, cid0, cid1, cid2); - cinfo->jpeg_color_space = JCS_YCbCr; /* assume it's YCbCr */ + TRACEMS3(cinfo, 1, JTRC_UNKNOWN_IDS, cid0, cid1, cid2); + cinfo->jpeg_color_space = JCS_YCbCr; /* assume it's YCbCr */ } } /* Always guess RGB is proper output colorspace. */ cinfo->out_color_space = JCS_RGB; break; - + case 4: if (cinfo->saw_Adobe_marker) { switch (cinfo->Adobe_transform) { case 0: - cinfo->jpeg_color_space = JCS_CMYK; - break; + cinfo->jpeg_color_space = JCS_CMYK; + break; case 2: - cinfo->jpeg_color_space = JCS_YCCK; - break; + cinfo->jpeg_color_space = JCS_YCCK; + break; default: - WARNMS1(cinfo, JWRN_ADOBE_XFORM, cinfo->Adobe_transform); - cinfo->jpeg_color_space = JCS_YCCK; /* assume it's YCCK */ - break; + WARNMS1(cinfo, JWRN_ADOBE_XFORM, cinfo->Adobe_transform); + cinfo->jpeg_color_space = JCS_YCCK; /* assume it's YCCK */ + break; } } else { /* No special markers, assume straight CMYK. */ @@ -177,7 +177,7 @@ default_decompress_parms (j_decompress_ptr cinfo) } cinfo->out_color_space = JCS_CMYK; break; - + default: cinfo->jpeg_color_space = JCS_UNKNOWN; cinfo->out_color_space = JCS_UNKNOWN; @@ -185,7 +185,7 @@ default_decompress_parms (j_decompress_ptr cinfo) } /* Set defaults for other decompression parameters. */ - cinfo->scale_num = 1; /* 1:1 scaling */ + cinfo->scale_num = 1; /* 1:1 scaling */ cinfo->scale_denom = 1; cinfo->output_gamma = 1.0; cinfo->buffered_image = FALSE; @@ -253,7 +253,7 @@ jpeg_read_header (j_decompress_ptr cinfo, boolean require_image) retcode = JPEG_HEADER_OK; break; case JPEG_REACHED_EOI: - if (require_image) /* Complain if application wanted an image */ + if (require_image) /* Complain if application wanted an image */ ERREXIT(cinfo, JERR_NO_IMAGE); /* Reset to start state; it would be safer to require the application to * call jpeg_abort, but we can't change it now for compatibility reasons. 
@@ -385,7 +385,7 @@ jpeg_finish_decompress (j_decompress_ptr cinfo) /* Read until EOI */ while (! cinfo->inputctl->eoi_reached) { if ((*cinfo->inputctl->consume_input) (cinfo) == JPEG_SUSPENDED) - return FALSE; /* Suspend, come back later */ + return FALSE; /* Suspend, come back later */ } /* Do final cleanup */ (*cinfo->src->term_source) (cinfo); diff --git a/jdapistd.c b/jdapistd.c index a50fda528..f4ac50d1b 100644 --- a/jdapistd.c +++ b/jdapistd.c @@ -55,24 +55,24 @@ jpeg_start_decompress (j_decompress_ptr cinfo) if (cinfo->inputctl->has_multiple_scans) { #ifdef D_MULTISCAN_FILES_SUPPORTED for (;;) { - int retcode; - /* Call progress monitor hook if present */ - if (cinfo->progress != NULL) - (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo); - /* Absorb some more input */ - retcode = (*cinfo->inputctl->consume_input) (cinfo); - if (retcode == JPEG_SUSPENDED) - return FALSE; - if (retcode == JPEG_REACHED_EOI) - break; - /* Advance progress counter if appropriate */ - if (cinfo->progress != NULL && - (retcode == JPEG_ROW_COMPLETED || retcode == JPEG_REACHED_SOS)) { - if (++cinfo->progress->pass_counter >= cinfo->progress->pass_limit) { - /* jdmaster underestimated number of scans; ratchet up one scan */ - cinfo->progress->pass_limit += (long) cinfo->total_iMCU_rows; - } - } + int retcode; + /* Call progress monitor hook if present */ + if (cinfo->progress != NULL) + (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo); + /* Absorb some more input */ + retcode = (*cinfo->inputctl->consume_input) (cinfo); + if (retcode == JPEG_SUSPENDED) + return FALSE; + if (retcode == JPEG_REACHED_EOI) + break; + /* Advance progress counter if appropriate */ + if (cinfo->progress != NULL && + (retcode == JPEG_ROW_COMPLETED || retcode == JPEG_REACHED_SOS)) { + if (++cinfo->progress->pass_counter >= cinfo->progress->pass_limit) { + /* jdmaster underestimated number of scans; ratchet up one scan */ + cinfo->progress->pass_limit += (long) cinfo->total_iMCU_rows; + } 
+ } } #else ERREXIT(cinfo, JERR_NOT_COMPILED); @@ -111,16 +111,16 @@ output_pass_setup (j_decompress_ptr cinfo) JDIMENSION last_scanline; /* Call progress monitor hook if present */ if (cinfo->progress != NULL) { - cinfo->progress->pass_counter = (long) cinfo->output_scanline; - cinfo->progress->pass_limit = (long) cinfo->output_height; - (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo); + cinfo->progress->pass_counter = (long) cinfo->output_scanline; + cinfo->progress->pass_limit = (long) cinfo->output_height; + (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo); } /* Process some data */ last_scanline = cinfo->output_scanline; (*cinfo->main->process_data) (cinfo, (JSAMPARRAY) NULL, - &cinfo->output_scanline, (JDIMENSION) 0); + &cinfo->output_scanline, (JDIMENSION) 0); if (cinfo->output_scanline == last_scanline) - return FALSE; /* No progress made, must suspend */ + return FALSE; /* No progress made, must suspend */ } /* Finish up dummy pass, and set up for another one */ (*cinfo->master->finish_output_pass) (cinfo); @@ -153,7 +153,7 @@ output_pass_setup (j_decompress_ptr cinfo) GLOBAL(JDIMENSION) jpeg_read_scanlines (j_decompress_ptr cinfo, JSAMPARRAY scanlines, - JDIMENSION max_lines) + JDIMENSION max_lines) { JDIMENSION row_ctr; @@ -186,7 +186,7 @@ jpeg_read_scanlines (j_decompress_ptr cinfo, JSAMPARRAY scanlines, GLOBAL(JDIMENSION) jpeg_read_raw_data (j_decompress_ptr cinfo, JSAMPIMAGE data, - JDIMENSION max_lines) + JDIMENSION max_lines) { JDIMENSION lines_per_iMCU_row; @@ -211,7 +211,7 @@ jpeg_read_raw_data (j_decompress_ptr cinfo, JSAMPIMAGE data, /* Decompress directly into user's buffer. */ if (! (*cinfo->coef->decompress_data) (cinfo, data)) - return 0; /* suspension forced, can do nothing more */ + return 0; /* suspension forced, can do nothing more */ /* OK, we processed one iMCU row. 
*/ cinfo->output_scanline += lines_per_iMCU_row; @@ -267,9 +267,9 @@ jpeg_finish_output (j_decompress_ptr cinfo) } /* Read markers looking for SOS or EOI */ while (cinfo->input_scan_number <= cinfo->output_scan_number && - ! cinfo->inputctl->eoi_reached) { + ! cinfo->inputctl->eoi_reached) { if ((*cinfo->inputctl->consume_input) (cinfo) == JPEG_SUSPENDED) - return FALSE; /* Suspend, come back later */ + return FALSE; /* Suspend, come back later */ } cinfo->global_state = DSTATE_BUFIMAGE; return TRUE; diff --git a/jdarith.c b/jdarith.c index d5567339c..b945d64d2 100644 --- a/jdarith.c +++ b/jdarith.c @@ -32,7 +32,7 @@ typedef struct { int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */ int dc_context[MAX_COMPS_IN_SCAN]; /* context index for DC conditioning */ - unsigned int restarts_to_go; /* MCUs left in this restart interval */ + unsigned int restarts_to_go; /* MCUs left in this restart interval */ /* Pointers to statistics areas (these workspaces have image lifespan) */ unsigned char * dc_stats[NUM_ARITH_TBLS]; @@ -115,32 +115,32 @@ arith_decode (j_decompress_ptr cinfo, unsigned char *st) if (--e->ct < 0) { /* Need to fetch next data byte */ if (cinfo->unread_marker) - data = 0; /* stuff zero data */ + data = 0; /* stuff zero data */ else { - data = get_byte(cinfo); /* read next input byte */ - if (data == 0xFF) { /* zero stuff or marker code */ - do data = get_byte(cinfo); - while (data == 0xFF); /* swallow extra 0xFF bytes */ - if (data == 0) - data = 0xFF; /* discard stuffed zero byte */ - else { - /* Note: Different from the Huffman decoder, hitting - * a marker while processing the compressed data - * segment is legal in arithmetic coding. - * The convention is to supply zero data - * then until decoding is complete. 
- */ - cinfo->unread_marker = data; - data = 0; - } - } + data = get_byte(cinfo); /* read next input byte */ + if (data == 0xFF) { /* zero stuff or marker code */ + do data = get_byte(cinfo); + while (data == 0xFF); /* swallow extra 0xFF bytes */ + if (data == 0) + data = 0xFF; /* discard stuffed zero byte */ + else { + /* Note: Different from the Huffman decoder, hitting + * a marker while processing the compressed data + * segment is legal in arithmetic coding. + * The convention is to supply zero data + * then until decoding is complete. + */ + cinfo->unread_marker = data; + data = 0; + } + } } e->c = (e->c << 8) | data; /* insert data into C register */ - if ((e->ct += 8) < 0) /* update bit shift counter */ - /* Need more initial bytes */ - if (++e->ct == 0) - /* Got 2 initial bytes -> re-init A and exit loop */ - e->a = 0x8000L; /* => e->a = 0x10000L after loop exit */ + if ((e->ct += 8) < 0) /* update bit shift counter */ + /* Need more initial bytes */ + if (++e->ct == 0) + /* Got 2 initial bytes -> re-init A and exit loop */ + e->a = 0x8000L; /* => e->a = 0x10000L after loop exit */ } e->a <<= 1; } @@ -149,9 +149,9 @@ arith_decode (j_decompress_ptr cinfo, unsigned char *st) * Qe values and probability estimation state machine */ sv = *st; - qe = jpeg_aritab[sv & 0x7F]; /* => Qe_Value */ - nl = qe & 0xFF; qe >>= 8; /* Next_Index_LPS + Switch_MPS */ - nm = qe & 0xFF; qe >>= 8; /* Next_Index_MPS */ + qe = jpeg_aritab[sv & 0x7F]; /* => Qe_Value */ + nl = qe & 0xFF; qe >>= 8; /* Next_Index_LPS + Switch_MPS */ + nm = qe & 0xFF; qe >>= 8; /* Next_Index_MPS */ /* Decode & estimation procedures per sections D.2.4 & D.2.5 */ temp = e->a - qe; @@ -162,19 +162,19 @@ arith_decode (j_decompress_ptr cinfo, unsigned char *st) /* Conditional LPS (less probable symbol) exchange */ if (e->a < qe) { e->a = qe; - *st = (sv & 0x80) ^ nm; /* Estimate_after_MPS */ + *st = (sv & 0x80) ^ nm; /* Estimate_after_MPS */ } else { e->a = qe; - *st = (sv & 0x80) ^ nl; /* Estimate_after_LPS 
*/ - sv ^= 0x80; /* Exchange LPS/MPS */ + *st = (sv & 0x80) ^ nl; /* Estimate_after_LPS */ + sv ^= 0x80; /* Exchange LPS/MPS */ } } else if (e->a < 0x8000L) { /* Conditional MPS (more probable symbol) exchange */ if (e->a < qe) { - *st = (sv & 0x80) ^ nl; /* Estimate_after_LPS */ - sv ^= 0x80; /* Exchange LPS/MPS */ + *st = (sv & 0x80) ^ nl; /* Estimate_after_LPS */ + sv ^= 0x80; /* Exchange LPS/MPS */ } else { - *st = (sv & 0x80) ^ nm; /* Estimate_after_MPS */ + *st = (sv & 0x80) ^ nm; /* Estimate_after_MPS */ } } @@ -214,7 +214,7 @@ process_restart (j_decompress_ptr cinfo) /* Reset arithmetic decoding variables */ entropy->c = 0; entropy->a = 0; - entropy->ct = -16; /* force reading 2 initial bytes to fill C */ + entropy->ct = -16; /* force reading 2 initial bytes to fill C */ /* Reset restart counter */ entropy->restarts_to_go = cinfo->restart_interval; @@ -253,7 +253,7 @@ decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) entropy->restarts_to_go--; } - if (entropy->ct == -1) return TRUE; /* if error do nothing */ + if (entropy->ct == -1) return TRUE; /* if error do nothing */ /* Outer loop handles each block in the MCU */ @@ -277,28 +277,28 @@ decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) st += 2; st += sign; /* Figure F.23: Decoding the magnitude category of v */ if ((m = arith_decode(cinfo, st)) != 0) { - st = entropy->dc_stats[tbl] + 20; /* Table F.4: X1 = 20 */ - while (arith_decode(cinfo, st)) { - if ((m <<= 1) == 0x8000) { - WARNMS(cinfo, JWRN_ARITH_BAD_CODE); - entropy->ct = -1; /* magnitude overflow */ - return TRUE; - } - st += 1; - } + st = entropy->dc_stats[tbl] + 20; /* Table F.4: X1 = 20 */ + while (arith_decode(cinfo, st)) { + if ((m <<= 1) == 0x8000) { + WARNMS(cinfo, JWRN_ARITH_BAD_CODE); + entropy->ct = -1; /* magnitude overflow */ + return TRUE; + } + st += 1; + } } /* Section F.1.4.4.1.2: Establish dc_context conditioning category */ if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1)) - 
entropy->dc_context[ci] = 0; /* zero diff category */ + entropy->dc_context[ci] = 0; /* zero diff category */ else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1)) - entropy->dc_context[ci] = 12 + (sign * 4); /* large diff category */ + entropy->dc_context[ci] = 12 + (sign * 4); /* large diff category */ else - entropy->dc_context[ci] = 4 + (sign * 4); /* small diff category */ + entropy->dc_context[ci] = 4 + (sign * 4); /* small diff category */ v = m; /* Figure F.24: Decoding the magnitude bit pattern of v */ st += 14; while (m >>= 1) - if (arith_decode(cinfo, st)) v |= m; + if (arith_decode(cinfo, st)) v |= m; v += 1; if (sign) v = -v; entropy->last_dc_val[ci] += v; } @@ -332,7 +332,7 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) entropy->restarts_to_go--; } - if (entropy->ct == -1) return TRUE; /* if error do nothing */ + if (entropy->ct == -1) return TRUE; /* if error do nothing */ /* There is always only one block per MCU */ block = MCU_data[0]; @@ -343,13 +343,13 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) /* Figure F.20: Decode_AC_coefficients */ for (k = cinfo->Ss; k <= cinfo->Se; k++) { st = entropy->ac_stats[tbl] + 3 * (k - 1); - if (arith_decode(cinfo, st)) break; /* EOB flag */ + if (arith_decode(cinfo, st)) break; /* EOB flag */ while (arith_decode(cinfo, st + 1) == 0) { st += 3; k++; if (k > cinfo->Se) { - WARNMS(cinfo, JWRN_ARITH_BAD_CODE); - entropy->ct = -1; /* spectral overflow */ - return TRUE; + WARNMS(cinfo, JWRN_ARITH_BAD_CODE); + entropy->ct = -1; /* spectral overflow */ + return TRUE; } } /* Figure F.21: Decoding nonzero value v */ @@ -359,17 +359,17 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) /* Figure F.23: Decoding the magnitude category of v */ if ((m = arith_decode(cinfo, st)) != 0) { if (arith_decode(cinfo, st)) { - m <<= 1; - st = entropy->ac_stats[tbl] + - (k <= cinfo->arith_ac_K[tbl] ? 
189 : 217); - while (arith_decode(cinfo, st)) { - if ((m <<= 1) == 0x8000) { - WARNMS(cinfo, JWRN_ARITH_BAD_CODE); - entropy->ct = -1; /* magnitude overflow */ - return TRUE; - } - st += 1; - } + m <<= 1; + st = entropy->ac_stats[tbl] + + (k <= cinfo->arith_ac_K[tbl] ? 189 : 217); + while (arith_decode(cinfo, st)) { + if ((m <<= 1) == 0x8000) { + WARNMS(cinfo, JWRN_ARITH_BAD_CODE); + entropy->ct = -1; /* magnitude overflow */ + return TRUE; + } + st += 1; + } } } v = m; @@ -404,8 +404,8 @@ decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) entropy->restarts_to_go--; } - st = entropy->fixed_bin; /* use fixed probability estimation */ - p1 = 1 << cinfo->Al; /* 1 in the bit position being coded */ + st = entropy->fixed_bin; /* use fixed probability estimation */ + p1 = 1 << cinfo->Al; /* 1 in the bit position being coded */ /* Outer loop handles each block in the MCU */ @@ -440,14 +440,14 @@ decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) entropy->restarts_to_go--; } - if (entropy->ct == -1) return TRUE; /* if error do nothing */ + if (entropy->ct == -1) return TRUE; /* if error do nothing */ /* There is always only one block per MCU */ block = MCU_data[0]; tbl = cinfo->cur_comp_info[0]->ac_tbl_no; - p1 = 1 << cinfo->Al; /* 1 in the bit position being coded */ - m1 = (-1) << cinfo->Al; /* -1 in the bit position being coded */ + p1 = 1 << cinfo->Al; /* 1 in the bit position being coded */ + m1 = (-1) << cinfo->Al; /* -1 in the bit position being coded */ /* Establish EOBx (previous stage end-of-block) index */ for (kex = cinfo->Se; kex > 0; kex--) @@ -456,30 +456,30 @@ decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) for (k = cinfo->Ss; k <= cinfo->Se; k++) { st = entropy->ac_stats[tbl] + 3 * (k - 1); if (k > kex) - if (arith_decode(cinfo, st)) break; /* EOB flag */ + if (arith_decode(cinfo, st)) break; /* EOB flag */ for (;;) { thiscoef = *block + jpeg_natural_order[k]; - if (*thiscoef) { /* previously nonzero coef 
*/ - if (arith_decode(cinfo, st + 2)) { - if (*thiscoef < 0) - *thiscoef += m1; - else - *thiscoef += p1; - } - break; + if (*thiscoef) { /* previously nonzero coef */ + if (arith_decode(cinfo, st + 2)) { + if (*thiscoef < 0) + *thiscoef += m1; + else + *thiscoef += p1; + } + break; } - if (arith_decode(cinfo, st + 1)) { /* newly nonzero coef */ - if (arith_decode(cinfo, entropy->fixed_bin)) - *thiscoef = m1; - else - *thiscoef = p1; - break; + if (arith_decode(cinfo, st + 1)) { /* newly nonzero coef */ + if (arith_decode(cinfo, entropy->fixed_bin)) + *thiscoef = m1; + else + *thiscoef = p1; + break; } st += 3; k++; if (k > cinfo->Se) { - WARNMS(cinfo, JWRN_ARITH_BAD_CODE); - entropy->ct = -1; /* spectral overflow */ - return TRUE; + WARNMS(cinfo, JWRN_ARITH_BAD_CODE); + entropy->ct = -1; /* spectral overflow */ + return TRUE; } } } @@ -509,7 +509,7 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) entropy->restarts_to_go--; } - if (entropy->ct == -1) return TRUE; /* if error do nothing */ + if (entropy->ct == -1) return TRUE; /* if error do nothing */ /* Outer loop handles each block in the MCU */ @@ -535,28 +535,28 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) st += 2; st += sign; /* Figure F.23: Decoding the magnitude category of v */ if ((m = arith_decode(cinfo, st)) != 0) { - st = entropy->dc_stats[tbl] + 20; /* Table F.4: X1 = 20 */ - while (arith_decode(cinfo, st)) { - if ((m <<= 1) == 0x8000) { - WARNMS(cinfo, JWRN_ARITH_BAD_CODE); - entropy->ct = -1; /* magnitude overflow */ - return TRUE; - } - st += 1; - } + st = entropy->dc_stats[tbl] + 20; /* Table F.4: X1 = 20 */ + while (arith_decode(cinfo, st)) { + if ((m <<= 1) == 0x8000) { + WARNMS(cinfo, JWRN_ARITH_BAD_CODE); + entropy->ct = -1; /* magnitude overflow */ + return TRUE; + } + st += 1; + } } /* Section F.1.4.4.1.2: Establish dc_context conditioning category */ if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1)) - entropy->dc_context[ci] = 0; /* zero diff category */ + 
entropy->dc_context[ci] = 0; /* zero diff category */ else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1)) - entropy->dc_context[ci] = 12 + (sign * 4); /* large diff category */ + entropy->dc_context[ci] = 12 + (sign * 4); /* large diff category */ else - entropy->dc_context[ci] = 4 + (sign * 4); /* small diff category */ + entropy->dc_context[ci] = 4 + (sign * 4); /* small diff category */ v = m; /* Figure F.24: Decoding the magnitude bit pattern of v */ st += 14; while (m >>= 1) - if (arith_decode(cinfo, st)) v |= m; + if (arith_decode(cinfo, st)) v |= m; v += 1; if (sign) v = -v; entropy->last_dc_val[ci] += v; } @@ -570,14 +570,14 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) /* Figure F.20: Decode_AC_coefficients */ for (k = 1; k <= DCTSIZE2 - 1; k++) { st = entropy->ac_stats[tbl] + 3 * (k - 1); - if (arith_decode(cinfo, st)) break; /* EOB flag */ + if (arith_decode(cinfo, st)) break; /* EOB flag */ while (arith_decode(cinfo, st + 1) == 0) { - st += 3; k++; - if (k > DCTSIZE2 - 1) { - WARNMS(cinfo, JWRN_ARITH_BAD_CODE); - entropy->ct = -1; /* spectral overflow */ - return TRUE; - } + st += 3; k++; + if (k > DCTSIZE2 - 1) { + WARNMS(cinfo, JWRN_ARITH_BAD_CODE); + entropy->ct = -1; /* spectral overflow */ + return TRUE; + } } /* Figure F.21: Decoding nonzero value v */ /* Figure F.22: Decoding the sign of v */ @@ -585,25 +585,25 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) st += 2; /* Figure F.23: Decoding the magnitude category of v */ if ((m = arith_decode(cinfo, st)) != 0) { - if (arith_decode(cinfo, st)) { - m <<= 1; - st = entropy->ac_stats[tbl] + - (k <= cinfo->arith_ac_K[tbl] ? 189 : 217); - while (arith_decode(cinfo, st)) { - if ((m <<= 1) == 0x8000) { - WARNMS(cinfo, JWRN_ARITH_BAD_CODE); - entropy->ct = -1; /* magnitude overflow */ - return TRUE; - } - st += 1; - } - } + if (arith_decode(cinfo, st)) { + m <<= 1; + st = entropy->ac_stats[tbl] + + (k <= cinfo->arith_ac_K[tbl] ? 
189 : 217); + while (arith_decode(cinfo, st)) { + if ((m <<= 1) == 0x8000) { + WARNMS(cinfo, JWRN_ARITH_BAD_CODE); + entropy->ct = -1; /* magnitude overflow */ + return TRUE; + } + st += 1; + } + } } v = m; /* Figure F.24: Decoding the magnitude bit pattern of v */ st += 14; while (m >>= 1) - if (arith_decode(cinfo, st)) v |= m; + if (arith_decode(cinfo, st)) v |= m; v += 1; if (sign) v = -v; (*block)[jpeg_natural_order[k]] = (JCOEF) v; } @@ -628,24 +628,24 @@ start_pass (j_decompress_ptr cinfo) /* Validate progressive scan parameters */ if (cinfo->Ss == 0) { if (cinfo->Se != 0) - goto bad; + goto bad; } else { /* need not check Ss/Se < 0 since they came from unsigned bytes */ if (cinfo->Se < cinfo->Ss || cinfo->Se > DCTSIZE2 - 1) - goto bad; + goto bad; /* AC scans may have only one component */ if (cinfo->comps_in_scan != 1) - goto bad; + goto bad; } if (cinfo->Ah != 0) { /* Successive approximation refinement scan: must have Al = Ah-1. */ if (cinfo->Ah-1 != cinfo->Al) - goto bad; + goto bad; } - if (cinfo->Al > 13) { /* need not check for < 0 */ + if (cinfo->Al > 13) { /* need not check for < 0 */ bad: ERREXIT4(cinfo, JERR_BAD_PROGRESSION, - cinfo->Ss, cinfo->Se, cinfo->Ah, cinfo->Al); + cinfo->Ss, cinfo->Se, cinfo->Ah, cinfo->Al); } /* Update progression status, and verify that scan order is legal. * Note that inter-scan inconsistencies are treated as warnings @@ -655,32 +655,32 @@ start_pass (j_decompress_ptr cinfo) int coefi, cindex = cinfo->cur_comp_info[ci]->component_index; int *coef_bit_ptr = & cinfo->coef_bits[cindex][0]; if (cinfo->Ss && coef_bit_ptr[0] < 0) /* AC without prior DC scan */ - WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, 0); + WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, 0); for (coefi = cinfo->Ss; coefi <= cinfo->Se; coefi++) { - int expected = (coef_bit_ptr[coefi] < 0) ? 
0 : coef_bit_ptr[coefi]; - if (cinfo->Ah != expected) - WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, coefi); - coef_bit_ptr[coefi] = cinfo->Al; + int expected = (coef_bit_ptr[coefi] < 0) ? 0 : coef_bit_ptr[coefi]; + if (cinfo->Ah != expected) + WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, coefi); + coef_bit_ptr[coefi] = cinfo->Al; } } /* Select MCU decoding routine */ if (cinfo->Ah == 0) { if (cinfo->Ss == 0) - entropy->pub.decode_mcu = decode_mcu_DC_first; + entropy->pub.decode_mcu = decode_mcu_DC_first; else - entropy->pub.decode_mcu = decode_mcu_AC_first; + entropy->pub.decode_mcu = decode_mcu_AC_first; } else { if (cinfo->Ss == 0) - entropy->pub.decode_mcu = decode_mcu_DC_refine; + entropy->pub.decode_mcu = decode_mcu_DC_refine; else - entropy->pub.decode_mcu = decode_mcu_AC_refine; + entropy->pub.decode_mcu = decode_mcu_AC_refine; } } else { /* Check that the scan parameters Ss, Se, Ah/Al are OK for sequential JPEG. * This ought to be an error condition, but we make it a warning. */ if (cinfo->Ss != 0 || cinfo->Ah != 0 || cinfo->Al != 0 || - (cinfo->Se < DCTSIZE2 && cinfo->Se != DCTSIZE2 - 1)) + (cinfo->Se < DCTSIZE2 && cinfo->Se != DCTSIZE2 - 1)) WARNMS(cinfo, JWRN_NOT_SEQUENTIAL); /* Select MCU decoding routine */ entropy->pub.decode_mcu = decode_mcu; @@ -692,10 +692,10 @@ start_pass (j_decompress_ptr cinfo) if (! 
cinfo->progressive_mode || (cinfo->Ss == 0 && cinfo->Ah == 0)) { tbl = compptr->dc_tbl_no; if (tbl < 0 || tbl >= NUM_ARITH_TBLS) - ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl); + ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl); if (entropy->dc_stats[tbl] == NULL) - entropy->dc_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small) - ((j_common_ptr) cinfo, JPOOL_IMAGE, DC_STAT_BINS); + entropy->dc_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small) + ((j_common_ptr) cinfo, JPOOL_IMAGE, DC_STAT_BINS); MEMZERO(entropy->dc_stats[tbl], DC_STAT_BINS); /* Initialize DC predictions to 0 */ entropy->last_dc_val[ci] = 0; @@ -704,10 +704,10 @@ start_pass (j_decompress_ptr cinfo) if (! cinfo->progressive_mode || cinfo->Ss) { tbl = compptr->ac_tbl_no; if (tbl < 0 || tbl >= NUM_ARITH_TBLS) - ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl); + ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl); if (entropy->ac_stats[tbl] == NULL) - entropy->ac_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small) - ((j_common_ptr) cinfo, JPOOL_IMAGE, AC_STAT_BINS); + entropy->ac_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small) + ((j_common_ptr) cinfo, JPOOL_IMAGE, AC_STAT_BINS); MEMZERO(entropy->ac_stats[tbl], AC_STAT_BINS); } } @@ -715,7 +715,7 @@ start_pass (j_decompress_ptr cinfo) /* Initialize arithmetic decoding variables */ entropy->c = 0; entropy->a = 0; - entropy->ct = -16; /* force reading 2 initial bytes to fill C */ + entropy->ct = -16; /* force reading 2 initial bytes to fill C */ /* Initialize restart counter */ entropy->restarts_to_go = cinfo->restart_interval; @@ -734,7 +734,7 @@ jinit_arith_decoder (j_decompress_ptr cinfo) entropy = (arith_entropy_ptr) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(arith_entropy_decoder)); + SIZEOF(arith_entropy_decoder)); cinfo->entropy = (struct jpeg_entropy_decoder *) entropy; entropy->pub.start_pass = start_pass; @@ -752,10 +752,10 @@ jinit_arith_decoder (j_decompress_ptr cinfo) int *coef_bit_ptr, ci; cinfo->coef_bits = 
(int (*)[DCTSIZE2]) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - cinfo->num_components*DCTSIZE2*SIZEOF(int)); + cinfo->num_components*DCTSIZE2*SIZEOF(int)); coef_bit_ptr = & cinfo->coef_bits[0][0]; - for (ci = 0; ci < cinfo->num_components; ci++) + for (ci = 0; ci < cinfo->num_components; ci++) for (i = 0; i < DCTSIZE2; i++) - *coef_bit_ptr++ = -1; + *coef_bit_ptr++ = -1; } } diff --git a/jdatadst-tj.c b/jdatadst-tj.c index a8bf2401a..95d982352 100644 --- a/jdatadst-tj.c +++ b/jdatadst-tj.c @@ -22,13 +22,13 @@ #include "jpeglib.h" #include "jerror.h" -#ifndef HAVE_STDLIB_H /* should declare malloc(),free() */ +#ifndef HAVE_STDLIB_H /* should declare malloc(),free() */ extern void * malloc JPP((size_t size)); extern void free JPP((void *ptr)); #endif -#define OUTPUT_BUF_SIZE 4096 /* choose an efficiently fwrite'able size */ +#define OUTPUT_BUF_SIZE 4096 /* choose an efficiently fwrite'able size */ /* Expanded data destination object for memory output */ @@ -36,10 +36,10 @@ extern void free JPP((void *ptr)); typedef struct { struct jpeg_destination_mgr pub; /* public fields */ - unsigned char ** outbuffer; /* target buffer */ + unsigned char ** outbuffer; /* target buffer */ unsigned long * outsize; - unsigned char * newbuffer; /* newly allocated buffer */ - JOCTET * buffer; /* start of buffer */ + unsigned char * newbuffer; /* newly allocated buffer */ + JOCTET * buffer; /* start of buffer */ size_t bufsize; boolean alloc; } my_mem_destination_mgr; @@ -147,21 +147,21 @@ term_mem_destination (j_compress_ptr cinfo) GLOBAL(void) jpeg_mem_dest_tj (j_compress_ptr cinfo, - unsigned char ** outbuffer, unsigned long * outsize, - boolean alloc) + unsigned char ** outbuffer, unsigned long * outsize, + boolean alloc) { my_mem_dest_ptr dest; - if (outbuffer == NULL || outsize == NULL) /* sanity check */ + if (outbuffer == NULL || outsize == NULL) /* sanity check */ ERREXIT(cinfo, JERR_BUFFER_SIZE); /* The destination object is made permanent so that multiple 
JPEG images * can be written to the same buffer without re-executing jpeg_mem_dest. */ - if (cinfo->dest == NULL) { /* first time for this JPEG object? */ + if (cinfo->dest == NULL) { /* first time for this JPEG object? */ cinfo->dest = (struct jpeg_destination_mgr *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT, - SIZEOF(my_mem_destination_mgr)); + SIZEOF(my_mem_destination_mgr)); dest = (my_mem_dest_ptr) cinfo->dest; dest->newbuffer = NULL; } diff --git a/jdatadst.c b/jdatadst.c index 1b89fabea..3fbc449bb 100644 --- a/jdatadst.c +++ b/jdatadst.c @@ -22,7 +22,7 @@ #include "jpeglib.h" #include "jerror.h" -#ifndef HAVE_STDLIB_H /* should declare malloc(),free() */ +#ifndef HAVE_STDLIB_H /* should declare malloc(),free() */ extern void * malloc JPP((size_t size)); extern void free JPP((void *ptr)); #endif @@ -33,13 +33,13 @@ extern void free JPP((void *ptr)); typedef struct { struct jpeg_destination_mgr pub; /* public fields */ - FILE * outfile; /* target stream */ - JOCTET * buffer; /* start of buffer */ + FILE * outfile; /* target stream */ + JOCTET * buffer; /* start of buffer */ } my_destination_mgr; typedef my_destination_mgr * my_dest_ptr; -#define OUTPUT_BUF_SIZE 4096 /* choose an efficiently fwrite'able size */ +#define OUTPUT_BUF_SIZE 4096 /* choose an efficiently fwrite'able size */ #if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED) @@ -48,10 +48,10 @@ typedef my_destination_mgr * my_dest_ptr; typedef struct { struct jpeg_destination_mgr pub; /* public fields */ - unsigned char ** outbuffer; /* target buffer */ + unsigned char ** outbuffer; /* target buffer */ unsigned long * outsize; - unsigned char * newbuffer; /* newly allocated buffer */ - JOCTET * buffer; /* start of buffer */ + unsigned char * newbuffer; /* newly allocated buffer */ + JOCTET * buffer; /* start of buffer */ size_t bufsize; } my_mem_destination_mgr; @@ -72,7 +72,7 @@ init_destination (j_compress_ptr cinfo) /* Allocate the output buffer --- it will be 
released when done with image */ dest->buffer = (JOCTET *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - OUTPUT_BUF_SIZE * SIZEOF(JOCTET)); + OUTPUT_BUF_SIZE * SIZEOF(JOCTET)); dest->pub.next_output_byte = dest->buffer; dest->pub.free_in_buffer = OUTPUT_BUF_SIZE; @@ -213,10 +213,10 @@ jpeg_stdio_dest (j_compress_ptr cinfo, FILE * outfile) * manager serially with the same JPEG object, because their private object * sizes may be different. Caveat programmer. */ - if (cinfo->dest == NULL) { /* first time for this JPEG object? */ + if (cinfo->dest == NULL) { /* first time for this JPEG object? */ cinfo->dest = (struct jpeg_destination_mgr *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT, - SIZEOF(my_destination_mgr)); + SIZEOF(my_destination_mgr)); } dest = (my_dest_ptr) cinfo->dest; @@ -241,20 +241,20 @@ jpeg_stdio_dest (j_compress_ptr cinfo, FILE * outfile) GLOBAL(void) jpeg_mem_dest (j_compress_ptr cinfo, - unsigned char ** outbuffer, unsigned long * outsize) + unsigned char ** outbuffer, unsigned long * outsize) { my_mem_dest_ptr dest; - if (outbuffer == NULL || outsize == NULL) /* sanity check */ + if (outbuffer == NULL || outsize == NULL) /* sanity check */ ERREXIT(cinfo, JERR_BUFFER_SIZE); /* The destination object is made permanent so that multiple JPEG images * can be written to the same buffer without re-executing jpeg_mem_dest. */ - if (cinfo->dest == NULL) { /* first time for this JPEG object? */ + if (cinfo->dest == NULL) { /* first time for this JPEG object? 
*/ cinfo->dest = (struct jpeg_destination_mgr *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT, - SIZEOF(my_mem_destination_mgr)); + SIZEOF(my_mem_destination_mgr)); } dest = (my_mem_dest_ptr) cinfo->dest; diff --git a/jdatasrc-tj.c b/jdatasrc-tj.c index 259c6de92..f023a8bf9 100644 --- a/jdatasrc-tj.c +++ b/jdatasrc-tj.c @@ -157,21 +157,21 @@ term_source (j_decompress_ptr cinfo) GLOBAL(void) jpeg_mem_src_tj (j_decompress_ptr cinfo, - unsigned char * inbuffer, unsigned long insize) + unsigned char * inbuffer, unsigned long insize) { struct jpeg_source_mgr * src; - if (inbuffer == NULL || insize == 0) /* Treat empty input as fatal error */ + if (inbuffer == NULL || insize == 0) /* Treat empty input as fatal error */ ERREXIT(cinfo, JERR_INPUT_EMPTY); /* The source object is made permanent so that a series of JPEG images * can be read from the same buffer by calling jpeg_mem_src only before * the first one. */ - if (cinfo->src == NULL) { /* first time for this JPEG object? */ + if (cinfo->src == NULL) { /* first time for this JPEG object? */ cinfo->src = (struct jpeg_source_mgr *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT, - SIZEOF(struct jpeg_source_mgr)); + SIZEOF(struct jpeg_source_mgr)); } src = cinfo->src; diff --git a/jdatasrc.c b/jdatasrc.c index 1e9c8ad2b..5b4c17df2 100644 --- a/jdatasrc.c +++ b/jdatasrc.c @@ -26,16 +26,16 @@ /* Expanded data source object for stdio input */ typedef struct { - struct jpeg_source_mgr pub; /* public fields */ + struct jpeg_source_mgr pub; /* public fields */ - FILE * infile; /* source stream */ - JOCTET * buffer; /* start of buffer */ - boolean start_of_file; /* have we gotten any data yet? */ + FILE * infile; /* source stream */ + JOCTET * buffer; /* start of buffer */ + boolean start_of_file; /* have we gotten any data yet? 
*/ } my_source_mgr; typedef my_source_mgr * my_src_ptr; -#define INPUT_BUF_SIZE 4096 /* choose an efficiently fread'able size */ +#define INPUT_BUF_SIZE 4096 /* choose an efficiently fread'able size */ /* @@ -106,7 +106,7 @@ fill_input_buffer (j_decompress_ptr cinfo) nbytes = JFREAD(src->infile, src->buffer, INPUT_BUF_SIZE); if (nbytes <= 0) { - if (src->start_of_file) /* Treat empty input file as fatal error */ + if (src->start_of_file) /* Treat empty input file as fatal error */ ERREXIT(cinfo, JERR_INPUT_EMPTY); WARNMS(cinfo, JWRN_JPEG_EOF); /* Insert a fake EOI marker */ @@ -224,14 +224,14 @@ jpeg_stdio_src (j_decompress_ptr cinfo, FILE * infile) * This makes it unsafe to use this manager and a different source * manager serially with the same JPEG object. Caveat programmer. */ - if (cinfo->src == NULL) { /* first time for this JPEG object? */ + if (cinfo->src == NULL) { /* first time for this JPEG object? */ cinfo->src = (struct jpeg_source_mgr *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT, - SIZEOF(my_source_mgr)); + SIZEOF(my_source_mgr)); src = (my_src_ptr) cinfo->src; src->buffer = (JOCTET *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT, - INPUT_BUF_SIZE * SIZEOF(JOCTET)); + INPUT_BUF_SIZE * SIZEOF(JOCTET)); } src = (my_src_ptr) cinfo->src; @@ -254,21 +254,21 @@ jpeg_stdio_src (j_decompress_ptr cinfo, FILE * infile) GLOBAL(void) jpeg_mem_src (j_decompress_ptr cinfo, - unsigned char * inbuffer, unsigned long insize) + unsigned char * inbuffer, unsigned long insize) { struct jpeg_source_mgr * src; - if (inbuffer == NULL || insize == 0) /* Treat empty input as fatal error */ + if (inbuffer == NULL || insize == 0) /* Treat empty input as fatal error */ ERREXIT(cinfo, JERR_INPUT_EMPTY); /* The source object is made permanent so that a series of JPEG images * can be read from the same buffer by calling jpeg_mem_src only before * the first one. */ - if (cinfo->src == NULL) { /* first time for this JPEG object? 
*/ + if (cinfo->src == NULL) { /* first time for this JPEG object? */ cinfo->src = (struct jpeg_source_mgr *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT, - SIZEOF(struct jpeg_source_mgr)); + SIZEOF(struct jpeg_source_mgr)); } src = cinfo->src; diff --git a/jdcoefct.c b/jdcoefct.c index d38db6c33..1d18a7449 100644 --- a/jdcoefct.c +++ b/jdcoefct.c @@ -33,9 +33,9 @@ typedef struct { /* These variables keep track of the current location of the input side. */ /* cinfo->input_iMCU_row is also used for this. */ - JDIMENSION MCU_ctr; /* counts MCUs processed in current row */ - int MCU_vert_offset; /* counts MCU rows within iMCU row */ - int MCU_rows_per_iMCU_row; /* number of such rows needed */ + JDIMENSION MCU_ctr; /* counts MCUs processed in current row */ + int MCU_vert_offset; /* counts MCU rows within iMCU row */ + int MCU_rows_per_iMCU_row; /* number of such rows needed */ /* The output side's location is represented by cinfo->output_iMCU_row. */ @@ -61,7 +61,7 @@ typedef struct { #ifdef BLOCK_SMOOTHING_SUPPORTED /* When doing block smoothing, we latch coefficient Al values here */ int * coef_bits_latch; -#define SAVED_COEFS 6 /* we save coef_bits[0..5] */ +#define SAVED_COEFS 6 /* we save coef_bits[0..5] */ #endif } my_coef_controller; @@ -69,15 +69,15 @@ typedef my_coef_controller * my_coef_ptr; /* Forward declarations */ METHODDEF(int) decompress_onepass - JPP((j_decompress_ptr cinfo, JSAMPIMAGE output_buf)); + JPP((j_decompress_ptr cinfo, JSAMPIMAGE output_buf)); #ifdef D_MULTISCAN_FILES_SUPPORTED METHODDEF(int) decompress_data - JPP((j_decompress_ptr cinfo, JSAMPIMAGE output_buf)); + JPP((j_decompress_ptr cinfo, JSAMPIMAGE output_buf)); #endif #ifdef BLOCK_SMOOTHING_SUPPORTED LOCAL(boolean) smoothing_ok JPP((j_decompress_ptr cinfo)); METHODDEF(int) decompress_smooth_data - JPP((j_decompress_ptr cinfo, JSAMPIMAGE output_buf)); + JPP((j_decompress_ptr cinfo, JSAMPIMAGE output_buf)); #endif @@ -153,7 +153,7 @@ METHODDEF(int) 
decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf) { my_coef_ptr coef = (my_coef_ptr) cinfo->coef; - JDIMENSION MCU_col_num; /* index of current MCU within row */ + JDIMENSION MCU_col_num; /* index of current MCU within row */ JDIMENSION last_MCU_col = cinfo->MCUs_per_row - 1; JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1; int blkn, ci, xindex, yindex, yoffset, useful_width; @@ -166,49 +166,49 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf) for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row; yoffset++) { for (MCU_col_num = coef->MCU_ctr; MCU_col_num <= last_MCU_col; - MCU_col_num++) { + MCU_col_num++) { /* Try to fetch an MCU. Entropy decoder expects buffer to be zeroed. */ jzero_far((void FAR *) coef->MCU_buffer[0], - (size_t) (cinfo->blocks_in_MCU * SIZEOF(JBLOCK))); + (size_t) (cinfo->blocks_in_MCU * SIZEOF(JBLOCK))); if (! (*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) { - /* Suspension forced; update state counters and exit */ - coef->MCU_vert_offset = yoffset; - coef->MCU_ctr = MCU_col_num; - return JPEG_SUSPENDED; + /* Suspension forced; update state counters and exit */ + coef->MCU_vert_offset = yoffset; + coef->MCU_ctr = MCU_col_num; + return JPEG_SUSPENDED; } /* Determine where data should go in output_buf and do the IDCT thing. * We skip dummy blocks at the right and bottom edges (but blkn gets * incremented past them!). Note the inner loop relies on having * allocated the MCU_buffer[] blocks sequentially. */ - blkn = 0; /* index of current DCT block within MCU */ + blkn = 0; /* index of current DCT block within MCU */ for (ci = 0; ci < cinfo->comps_in_scan; ci++) { - compptr = cinfo->cur_comp_info[ci]; - /* Don't bother to IDCT an uninteresting component. */ - if (! compptr->component_needed) { - blkn += compptr->MCU_blocks; - continue; - } - inverse_DCT = cinfo->idct->inverse_DCT[compptr->component_index]; - useful_width = (MCU_col_num < last_MCU_col) ? 
compptr->MCU_width - : compptr->last_col_width; - output_ptr = output_buf[compptr->component_index] + - yoffset * compptr->_DCT_scaled_size; - start_col = MCU_col_num * compptr->MCU_sample_width; - for (yindex = 0; yindex < compptr->MCU_height; yindex++) { - if (cinfo->input_iMCU_row < last_iMCU_row || - yoffset+yindex < compptr->last_row_height) { - output_col = start_col; - for (xindex = 0; xindex < useful_width; xindex++) { - (*inverse_DCT) (cinfo, compptr, - (JCOEFPTR) coef->MCU_buffer[blkn+xindex], - output_ptr, output_col); - output_col += compptr->_DCT_scaled_size; - } - } - blkn += compptr->MCU_width; - output_ptr += compptr->_DCT_scaled_size; - } + compptr = cinfo->cur_comp_info[ci]; + /* Don't bother to IDCT an uninteresting component. */ + if (! compptr->component_needed) { + blkn += compptr->MCU_blocks; + continue; + } + inverse_DCT = cinfo->idct->inverse_DCT[compptr->component_index]; + useful_width = (MCU_col_num < last_MCU_col) ? compptr->MCU_width + : compptr->last_col_width; + output_ptr = output_buf[compptr->component_index] + + yoffset * compptr->_DCT_scaled_size; + start_col = MCU_col_num * compptr->MCU_sample_width; + for (yindex = 0; yindex < compptr->MCU_height; yindex++) { + if (cinfo->input_iMCU_row < last_iMCU_row || + yoffset+yindex < compptr->last_row_height) { + output_col = start_col; + for (xindex = 0; xindex < useful_width; xindex++) { + (*inverse_DCT) (cinfo, compptr, + (JCOEFPTR) coef->MCU_buffer[blkn+xindex], + output_ptr, output_col); + output_col += compptr->_DCT_scaled_size; + } + } + blkn += compptr->MCU_width; + output_ptr += compptr->_DCT_scaled_size; + } } } /* Completed an MCU row, but perhaps not an iMCU row */ @@ -233,7 +233,7 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf) METHODDEF(int) dummy_consume_data (j_decompress_ptr cinfo) { - return JPEG_SUSPENDED; /* Always indicate nothing was done */ + return JPEG_SUSPENDED; /* Always indicate nothing was done */ } @@ -250,7 +250,7 @@ METHODDEF(int) 
consume_data (j_decompress_ptr cinfo) { my_coef_ptr coef = (my_coef_ptr) cinfo->coef; - JDIMENSION MCU_col_num; /* index of current MCU within row */ + JDIMENSION MCU_col_num; /* index of current MCU within row */ int blkn, ci, xindex, yindex, yoffset; JDIMENSION start_col; JBLOCKARRAY buffer[MAX_COMPS_IN_SCAN]; @@ -274,25 +274,25 @@ consume_data (j_decompress_ptr cinfo) for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row; yoffset++) { for (MCU_col_num = coef->MCU_ctr; MCU_col_num < cinfo->MCUs_per_row; - MCU_col_num++) { + MCU_col_num++) { /* Construct list of pointers to DCT blocks belonging to this MCU */ - blkn = 0; /* index of current DCT block within MCU */ + blkn = 0; /* index of current DCT block within MCU */ for (ci = 0; ci < cinfo->comps_in_scan; ci++) { - compptr = cinfo->cur_comp_info[ci]; - start_col = MCU_col_num * compptr->MCU_width; - for (yindex = 0; yindex < compptr->MCU_height; yindex++) { - buffer_ptr = buffer[ci][yindex+yoffset] + start_col; - for (xindex = 0; xindex < compptr->MCU_width; xindex++) { - coef->MCU_buffer[blkn++] = buffer_ptr++; - } - } + compptr = cinfo->cur_comp_info[ci]; + start_col = MCU_col_num * compptr->MCU_width; + for (yindex = 0; yindex < compptr->MCU_height; yindex++) { + buffer_ptr = buffer[ci][yindex+yoffset] + start_col; + for (xindex = 0; xindex < compptr->MCU_width; xindex++) { + coef->MCU_buffer[blkn++] = buffer_ptr++; + } + } } /* Try to fetch the MCU. */ if (! 
(*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) { - /* Suspension forced; update state counters and exit */ - coef->MCU_vert_offset = yoffset; - coef->MCU_ctr = MCU_col_num; - return JPEG_SUSPENDED; + /* Suspension forced; update state counters and exit */ + coef->MCU_vert_offset = yoffset; + coef->MCU_ctr = MCU_col_num; + return JPEG_SUSPENDED; } } /* Completed an MCU row, but perhaps not an iMCU row */ @@ -333,8 +333,8 @@ decompress_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf) /* Force some input to be done if we are getting ahead of the input. */ while (cinfo->input_scan_number < cinfo->output_scan_number || - (cinfo->input_scan_number == cinfo->output_scan_number && - cinfo->input_iMCU_row <= cinfo->output_iMCU_row)) { + (cinfo->input_scan_number == cinfo->output_scan_number && + cinfo->input_iMCU_row <= cinfo->output_iMCU_row)) { if ((*cinfo->inputctl->consume_input)(cinfo) == JPEG_SUSPENDED) return JPEG_SUSPENDED; } @@ -365,10 +365,10 @@ decompress_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf) buffer_ptr = buffer[block_row]; output_col = 0; for (block_num = 0; block_num < compptr->width_in_blocks; block_num++) { - (*inverse_DCT) (cinfo, compptr, (JCOEFPTR) buffer_ptr, - output_ptr, output_col); - buffer_ptr++; - output_col += compptr->_DCT_scaled_size; + (*inverse_DCT) (cinfo, compptr, (JCOEFPTR) buffer_ptr, + output_ptr, output_col); + buffer_ptr++; + output_col += compptr->_DCT_scaled_size; } output_ptr += compptr->_DCT_scaled_size; } @@ -425,8 +425,8 @@ smoothing_ok (j_decompress_ptr cinfo) if (coef->coef_bits_latch == NULL) coef->coef_bits_latch = (int *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - cinfo->num_components * - (SAVED_COEFS * SIZEOF(int))); + cinfo->num_components * + (SAVED_COEFS * SIZEOF(int))); coef_bits_latch = coef->coef_bits_latch; for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; @@ -436,11 +436,11 @@ smoothing_ok (j_decompress_ptr cinfo) return FALSE; /* Verify DC & first 
5 AC quantizers are nonzero to avoid zero-divide. */ if (qtable->quantval[0] == 0 || - qtable->quantval[Q01_POS] == 0 || - qtable->quantval[Q10_POS] == 0 || - qtable->quantval[Q20_POS] == 0 || - qtable->quantval[Q11_POS] == 0 || - qtable->quantval[Q02_POS] == 0) + qtable->quantval[Q01_POS] == 0 || + qtable->quantval[Q10_POS] == 0 || + qtable->quantval[Q20_POS] == 0 || + qtable->quantval[Q11_POS] == 0 || + qtable->quantval[Q02_POS] == 0) return FALSE; /* DC values must be at least partly known for all components. */ coef_bits = cinfo->coef_bits[ci]; @@ -450,7 +450,7 @@ smoothing_ok (j_decompress_ptr cinfo) for (coefi = 1; coefi <= 5; coefi++) { coef_bits_latch[coefi] = coef_bits[coefi]; if (coef_bits[coefi] != 0) - smoothing_useful = TRUE; + smoothing_useful = TRUE; } coef_bits_latch += SAVED_COEFS; } @@ -489,7 +489,7 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf) /* Force some input to be done if we are getting ahead of the input. */ while (cinfo->input_scan_number <= cinfo->output_scan_number && - ! cinfo->inputctl->eoi_reached) { + ! cinfo->inputctl->eoi_reached) { if (cinfo->input_scan_number == cinfo->output_scan_number) { /* If input is working on current scan, we ordinarily want it to * have completed the current row. But if input scan is DC, @@ -498,7 +498,7 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf) */ JDIMENSION delta = (cinfo->Ss == 0) ? 
1 : 0; if (cinfo->input_iMCU_row > cinfo->output_iMCU_row+delta) - break; + break; } if ((*cinfo->inputctl->consume_input)(cinfo) == JPEG_SUSPENDED) return JPEG_SUSPENDED; @@ -526,15 +526,15 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf) if (cinfo->output_iMCU_row > 0) { access_rows += compptr->v_samp_factor; /* prior iMCU row too */ buffer = (*cinfo->mem->access_virt_barray) - ((j_common_ptr) cinfo, coef->whole_image[ci], - (cinfo->output_iMCU_row - 1) * compptr->v_samp_factor, - (JDIMENSION) access_rows, FALSE); - buffer += compptr->v_samp_factor; /* point to current iMCU row */ + ((j_common_ptr) cinfo, coef->whole_image[ci], + (cinfo->output_iMCU_row - 1) * compptr->v_samp_factor, + (JDIMENSION) access_rows, FALSE); + buffer += compptr->v_samp_factor; /* point to current iMCU row */ first_row = FALSE; } else { buffer = (*cinfo->mem->access_virt_barray) - ((j_common_ptr) cinfo, coef->whole_image[ci], - (JDIMENSION) 0, (JDIMENSION) access_rows, FALSE); + ((j_common_ptr) cinfo, coef->whole_image[ci], + (JDIMENSION) 0, (JDIMENSION) access_rows, FALSE); first_row = TRUE; } /* Fetch component-dependent info */ @@ -552,13 +552,13 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf) for (block_row = 0; block_row < block_rows; block_row++) { buffer_ptr = buffer[block_row]; if (first_row && block_row == 0) - prev_block_row = buffer_ptr; + prev_block_row = buffer_ptr; else - prev_block_row = buffer[block_row-1]; + prev_block_row = buffer[block_row-1]; if (last_row && block_row == block_rows-1) - next_block_row = buffer_ptr; + next_block_row = buffer_ptr; else - next_block_row = buffer[block_row+1]; + next_block_row = buffer[block_row+1]; /* We fetch the surrounding DC values using a sliding-register approach. * Initialize all nine here so as to do the right thing on narrow pics. 
*/ @@ -568,102 +568,102 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf) output_col = 0; last_block_column = compptr->width_in_blocks - 1; for (block_num = 0; block_num <= last_block_column; block_num++) { - /* Fetch current DCT block into workspace so we can modify it. */ - jcopy_block_row(buffer_ptr, (JBLOCKROW) workspace, (JDIMENSION) 1); - /* Update DC values */ - if (block_num < last_block_column) { - DC3 = (int) prev_block_row[1][0]; - DC6 = (int) buffer_ptr[1][0]; - DC9 = (int) next_block_row[1][0]; - } - /* Compute coefficient estimates per K.8. - * An estimate is applied only if coefficient is still zero, - * and is not known to be fully accurate. - */ - /* AC01 */ - if ((Al=coef_bits[1]) != 0 && workspace[1] == 0) { - num = 36 * Q00 * (DC4 - DC6); - if (num >= 0) { - pred = (int) (((Q01<<7) + num) / (Q01<<8)); - if (Al > 0 && pred >= (1< 0 && pred >= (1<= 0) { - pred = (int) (((Q10<<7) + num) / (Q10<<8)); - if (Al > 0 && pred >= (1< 0 && pred >= (1<= 0) { - pred = (int) (((Q20<<7) + num) / (Q20<<8)); - if (Al > 0 && pred >= (1< 0 && pred >= (1<= 0) { - pred = (int) (((Q11<<7) + num) / (Q11<<8)); - if (Al > 0 && pred >= (1< 0 && pred >= (1<= 0) { - pred = (int) (((Q02<<7) + num) / (Q02<<8)); - if (Al > 0 && pred >= (1< 0 && pred >= (1<_DCT_scaled_size; + /* Fetch current DCT block into workspace so we can modify it. */ + jcopy_block_row(buffer_ptr, (JBLOCKROW) workspace, (JDIMENSION) 1); + /* Update DC values */ + if (block_num < last_block_column) { + DC3 = (int) prev_block_row[1][0]; + DC6 = (int) buffer_ptr[1][0]; + DC9 = (int) next_block_row[1][0]; + } + /* Compute coefficient estimates per K.8. + * An estimate is applied only if coefficient is still zero, + * and is not known to be fully accurate. 
+ */ + /* AC01 */ + if ((Al=coef_bits[1]) != 0 && workspace[1] == 0) { + num = 36 * Q00 * (DC4 - DC6); + if (num >= 0) { + pred = (int) (((Q01<<7) + num) / (Q01<<8)); + if (Al > 0 && pred >= (1< 0 && pred >= (1<= 0) { + pred = (int) (((Q10<<7) + num) / (Q10<<8)); + if (Al > 0 && pred >= (1< 0 && pred >= (1<= 0) { + pred = (int) (((Q20<<7) + num) / (Q20<<8)); + if (Al > 0 && pred >= (1< 0 && pred >= (1<= 0) { + pred = (int) (((Q11<<7) + num) / (Q11<<8)); + if (Al > 0 && pred >= (1< 0 && pred >= (1<= 0) { + pred = (int) (((Q02<<7) + num) / (Q02<<8)); + if (Al > 0 && pred >= (1< 0 && pred >= (1<_DCT_scaled_size; } output_ptr += compptr->_DCT_scaled_size; } @@ -688,7 +688,7 @@ jinit_d_coef_controller (j_decompress_ptr cinfo, boolean need_full_buffer) coef = (my_coef_ptr) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(my_coef_controller)); + SIZEOF(my_coef_controller)); cinfo->coef = (struct jpeg_d_coef_controller *) coef; coef->pub.start_input_pass = start_input_pass; coef->pub.start_output_pass = start_output_pass; @@ -706,20 +706,20 @@ jinit_d_coef_controller (j_decompress_ptr cinfo, boolean need_full_buffer) jpeg_component_info *compptr; for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; - ci++, compptr++) { + ci++, compptr++) { access_rows = compptr->v_samp_factor; #ifdef BLOCK_SMOOTHING_SUPPORTED /* If block smoothing could be used, need a bigger window */ if (cinfo->progressive_mode) - access_rows *= 3; + access_rows *= 3; #endif coef->whole_image[ci] = (*cinfo->mem->request_virt_barray) - ((j_common_ptr) cinfo, JPOOL_IMAGE, TRUE, - (JDIMENSION) jround_up((long) compptr->width_in_blocks, - (long) compptr->h_samp_factor), - (JDIMENSION) jround_up((long) compptr->height_in_blocks, - (long) compptr->v_samp_factor), - (JDIMENSION) access_rows); + ((j_common_ptr) cinfo, JPOOL_IMAGE, TRUE, + (JDIMENSION) jround_up((long) compptr->width_in_blocks, + (long) compptr->h_samp_factor), + (JDIMENSION) jround_up((long) 
compptr->height_in_blocks, + (long) compptr->v_samp_factor), + (JDIMENSION) access_rows); } coef->pub.consume_data = consume_data; coef->pub.decompress_data = decompress_data; @@ -734,7 +734,7 @@ jinit_d_coef_controller (j_decompress_ptr cinfo, boolean need_full_buffer) buffer = (JBLOCKROW) (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE, - D_MAX_BLOCKS_IN_MCU * SIZEOF(JBLOCK)); + D_MAX_BLOCKS_IN_MCU * SIZEOF(JBLOCK)); for (i = 0; i < D_MAX_BLOCKS_IN_MCU; i++) { coef->MCU_buffer[i] = buffer + i; } diff --git a/jdcolext.c b/jdcolext.c index 6e9e31a93..f72cab0cc 100644 --- a/jdcolext.c +++ b/jdcolext.c @@ -58,8 +58,8 @@ ycc_rgb_convert_internal (j_decompress_ptr cinfo, /* Range-limiting is essential due to noise introduced by DCT losses. */ outptr[RGB_RED] = range_limit[y + Crrtab[cr]]; outptr[RGB_GREEN] = range_limit[y + - ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], - SCALEBITS))]; + ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], + SCALEBITS))]; outptr[RGB_BLUE] = range_limit[y + Cbbtab[cb]]; /* Set unused byte to 0xFF so it can be interpreted as an opaque */ /* alpha channel value */ diff --git a/jdcolor.c b/jdcolor.c index 458addac9..8dae08dfb 100644 --- a/jdcolor.c +++ b/jdcolor.c @@ -25,13 +25,13 @@ typedef struct { struct jpeg_color_deconverter pub; /* public fields */ /* Private state for YCC->RGB conversion */ - int * Cr_r_tab; /* => table for Cr to R conversion */ - int * Cb_b_tab; /* => table for Cb to B conversion */ - INT32 * Cr_g_tab; /* => table for Cr to G conversion */ - INT32 * Cb_g_tab; /* => table for Cb to G conversion */ + int * Cr_r_tab; /* => table for Cr to R conversion */ + int * Cb_b_tab; /* => table for Cb to B conversion */ + INT32 * Cr_g_tab; /* => table for Cr to G conversion */ + INT32 * Cb_g_tab; /* => table for Cb to G conversion */ /* Private state for RGB->Y conversion */ - INT32 * rgb_y_tab; /* => table for RGB to Y conversion */ + INT32 * rgb_y_tab; /* => table for RGB to Y conversion */ } my_color_deconverter; typedef 
my_color_deconverter * my_cconvert_ptr; @@ -45,11 +45,11 @@ typedef my_color_deconverter * my_cconvert_ptr; * normalized to the range 0..MAXJSAMPLE rather than -0.5 .. 0.5. * The conversion equations to be implemented are therefore * - * R = Y + 1.40200 * Cr - * G = Y - 0.34414 * Cb - 0.71414 * Cr - * B = Y + 1.77200 * Cb + * R = Y + 1.40200 * Cr + * G = Y - 0.34414 * Cb - 0.71414 * Cr + * B = Y + 1.77200 * Cb * - * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B * * where Cb and Cr represent the incoming values less CENTERJSAMPLE. * (These numbers are derived from TIFF 6.0 section 21, dated 3-June-92.) @@ -71,9 +71,9 @@ typedef my_color_deconverter * my_cconvert_ptr; * together before rounding. */ -#define SCALEBITS 16 /* speediest right-shift on some machines */ -#define ONE_HALF ((INT32) 1 << (SCALEBITS-1)) -#define FIX(x) ((INT32) ((x) * (1L<Y conversion and divide it up into * three parts, instead of doing three alloc_small requests. This lets us @@ -82,10 +82,10 @@ typedef my_color_deconverter * my_cconvert_ptr; * anyway). */ -#define R_Y_OFF 0 /* offset to R => Y section */ -#define G_Y_OFF (1*(MAXJSAMPLE+1)) /* offset to G => Y section */ -#define B_Y_OFF (2*(MAXJSAMPLE+1)) /* etc. */ -#define TABLE_SIZE (3*(MAXJSAMPLE+1)) +#define R_Y_OFF 0 /* offset to R => Y section */ +#define G_Y_OFF (1*(MAXJSAMPLE+1)) /* offset to G => Y section */ +#define B_Y_OFF (2*(MAXJSAMPLE+1)) /* etc. 
*/ +#define TABLE_SIZE (3*(MAXJSAMPLE+1)) /* Include inline routines for colorspace extensions */ @@ -215,26 +215,26 @@ build_ycc_rgb_table (j_decompress_ptr cinfo) cconvert->Cr_r_tab = (int *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - (MAXJSAMPLE+1) * SIZEOF(int)); + (MAXJSAMPLE+1) * SIZEOF(int)); cconvert->Cb_b_tab = (int *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - (MAXJSAMPLE+1) * SIZEOF(int)); + (MAXJSAMPLE+1) * SIZEOF(int)); cconvert->Cr_g_tab = (INT32 *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - (MAXJSAMPLE+1) * SIZEOF(INT32)); + (MAXJSAMPLE+1) * SIZEOF(INT32)); cconvert->Cb_g_tab = (INT32 *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - (MAXJSAMPLE+1) * SIZEOF(INT32)); + (MAXJSAMPLE+1) * SIZEOF(INT32)); for (i = 0, x = -CENTERJSAMPLE; i <= MAXJSAMPLE; i++, x++) { /* i is the actual input pixel value, in the range 0..MAXJSAMPLE */ /* The Cb or Cr value we are thinking of is x = i - CENTERJSAMPLE */ /* Cr=>R value is nearest int to 1.40200 * x */ cconvert->Cr_r_tab[i] = (int) - RIGHT_SHIFT(FIX(1.40200) * x + ONE_HALF, SCALEBITS); + RIGHT_SHIFT(FIX(1.40200) * x + ONE_HALF, SCALEBITS); /* Cb=>B value is nearest int to 1.77200 * x */ cconvert->Cb_b_tab[i] = (int) - RIGHT_SHIFT(FIX(1.77200) * x + ONE_HALF, SCALEBITS); + RIGHT_SHIFT(FIX(1.77200) * x + ONE_HALF, SCALEBITS); /* Cr=>G value is scaled-up -0.71414 * x */ cconvert->Cr_g_tab[i] = (- FIX(0.71414)) * x; /* Cb=>G value is scaled-up -0.34414 * x */ @@ -250,8 +250,8 @@ build_ycc_rgb_table (j_decompress_ptr cinfo) METHODDEF(void) ycc_rgb_convert (j_decompress_ptr cinfo, - JSAMPIMAGE input_buf, JDIMENSION input_row, - JSAMPARRAY output_buf, int num_rows) + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) { switch (cinfo->out_color_space) { case JCS_EXT_RGB: @@ -307,7 +307,7 @@ build_rgb_y_table (j_decompress_ptr cinfo) /* Allocate and fill in the conversion tables. 
*/ cconvert->rgb_y_tab = rgb_y_tab = (INT32 *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - (TABLE_SIZE * SIZEOF(INT32))); + (TABLE_SIZE * SIZEOF(INT32))); for (i = 0; i <= MAXJSAMPLE; i++) { rgb_y_tab[i+R_Y_OFF] = FIX(0.29900) * i; @@ -323,8 +323,8 @@ build_rgb_y_table (j_decompress_ptr cinfo) METHODDEF(void) rgb_gray_convert (j_decompress_ptr cinfo, - JSAMPIMAGE input_buf, JDIMENSION input_row, - JSAMPARRAY output_buf, int num_rows) + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) { my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert; register int r, g, b; @@ -346,8 +346,8 @@ rgb_gray_convert (j_decompress_ptr cinfo, b = GETJSAMPLE(inptr2[col]); /* Y */ outptr[col] = (JSAMPLE) - ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF]) - >> SCALEBITS); + ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF]) + >> SCALEBITS); } } } @@ -360,8 +360,8 @@ rgb_gray_convert (j_decompress_ptr cinfo, METHODDEF(void) null_convert (j_decompress_ptr cinfo, - JSAMPIMAGE input_buf, JDIMENSION input_row, - JSAMPARRAY output_buf, int num_rows) + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) { register JSAMPROW inptr, outptr; register JDIMENSION count; @@ -374,8 +374,8 @@ null_convert (j_decompress_ptr cinfo, inptr = input_buf[ci][input_row]; outptr = output_buf[0] + ci; for (count = num_cols; count > 0; count--) { - *outptr = *inptr++; /* needn't bother with GETJSAMPLE() here */ - outptr += num_components; + *outptr = *inptr++; /* needn't bother with GETJSAMPLE() here */ + outptr += num_components; } } input_row++; @@ -392,11 +392,11 @@ null_convert (j_decompress_ptr cinfo, METHODDEF(void) grayscale_convert (j_decompress_ptr cinfo, - JSAMPIMAGE input_buf, JDIMENSION input_row, - JSAMPARRAY output_buf, int num_rows) + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) { jcopy_sample_rows(input_buf[0], (int) input_row, output_buf, 0, - num_rows, 
cinfo->output_width); + num_rows, cinfo->output_width); } @@ -406,8 +406,8 @@ grayscale_convert (j_decompress_ptr cinfo, METHODDEF(void) gray_rgb_convert (j_decompress_ptr cinfo, - JSAMPIMAGE input_buf, JDIMENSION input_row, - JSAMPARRAY output_buf, int num_rows) + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) { switch (cinfo->out_color_space) { case JCS_EXT_RGB: @@ -452,8 +452,8 @@ gray_rgb_convert (j_decompress_ptr cinfo, METHODDEF(void) rgb_rgb_convert (j_decompress_ptr cinfo, - JSAMPIMAGE input_buf, JDIMENSION input_row, - JSAMPARRAY output_buf, int num_rows) + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) { switch (cinfo->out_color_space) { case JCS_EXT_RGB: @@ -501,8 +501,8 @@ rgb_rgb_convert (j_decompress_ptr cinfo, METHODDEF(void) ycck_cmyk_convert (j_decompress_ptr cinfo, - JSAMPIMAGE input_buf, JDIMENSION input_row, - JSAMPARRAY output_buf, int num_rows) + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) { my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert; register int y, cb, cr; @@ -530,13 +530,13 @@ ycck_cmyk_convert (j_decompress_ptr cinfo, cb = GETJSAMPLE(inptr1[col]); cr = GETJSAMPLE(inptr2[col]); /* Range-limiting is essential due to noise introduced by DCT losses. 
*/ - outptr[0] = range_limit[MAXJSAMPLE - (y + Crrtab[cr])]; /* red */ - outptr[1] = range_limit[MAXJSAMPLE - (y + /* green */ - ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], - SCALEBITS)))]; - outptr[2] = range_limit[MAXJSAMPLE - (y + Cbbtab[cb])]; /* blue */ + outptr[0] = range_limit[MAXJSAMPLE - (y + Crrtab[cr])]; /* red */ + outptr[1] = range_limit[MAXJSAMPLE - (y + /* green */ + ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], + SCALEBITS)))]; + outptr[2] = range_limit[MAXJSAMPLE - (y + Cbbtab[cb])]; /* blue */ /* K passes through unchanged */ - outptr[3] = inptr3[col]; /* don't need GETJSAMPLE here */ + outptr[3] = inptr3[col]; /* don't need GETJSAMPLE here */ outptr += 4; } } @@ -566,7 +566,7 @@ jinit_color_deconverter (j_decompress_ptr cinfo) cconvert = (my_cconvert_ptr) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(my_color_deconverter)); + SIZEOF(my_color_deconverter)); cinfo->cconvert = (struct jpeg_color_deconverter *) cconvert; cconvert->pub.start_pass = start_pass_dcolor; @@ -589,7 +589,7 @@ jinit_color_deconverter (j_decompress_ptr cinfo) ERREXIT(cinfo, JERR_BAD_J_COLORSPACE); break; - default: /* JCS_UNKNOWN can be anything */ + default: /* JCS_UNKNOWN can be anything */ if (cinfo->num_components < 1) ERREXIT(cinfo, JERR_BAD_J_COLORSPACE); break; @@ -604,11 +604,11 @@ jinit_color_deconverter (j_decompress_ptr cinfo) case JCS_GRAYSCALE: cinfo->out_color_components = 1; if (cinfo->jpeg_color_space == JCS_GRAYSCALE || - cinfo->jpeg_color_space == JCS_YCbCr) { + cinfo->jpeg_color_space == JCS_YCbCr) { cconvert->pub.color_convert = grayscale_convert; /* For color->grayscale conversion, only the Y (0) component is needed */ for (ci = 1; ci < cinfo->num_components; ci++) - cinfo->comp_info[ci].component_needed = FALSE; + cinfo->comp_info[ci].component_needed = FALSE; } else if (cinfo->jpeg_color_space == JCS_RGB) { cconvert->pub.color_convert = rgb_gray_convert; build_rgb_y_table(cinfo); @@ -665,7 +665,7 @@ jinit_color_deconverter 
(j_decompress_ptr cinfo) if (cinfo->out_color_space == cinfo->jpeg_color_space) { cinfo->out_color_components = cinfo->num_components; cconvert->pub.color_convert = null_convert; - } else /* unsupported non-null conversion */ + } else /* unsupported non-null conversion */ ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL); break; } diff --git a/jdct.h b/jdct.h index 36374484f..310f2ca5d 100644 --- a/jdct.h +++ b/jdct.h @@ -8,7 +8,7 @@ * This include file contains common declarations for the forward and * inverse DCT modules. These declarations are private to the DCT managers * (jcdctmgr.c, jddctmgr.c) and the individual DCT algorithms. - * The individual DCT algorithms are kept in separate files to ease + * The individual DCT algorithms are kept in separate files to ease * machine-dependent tuning (e.g., assembly coding). */ @@ -29,7 +29,7 @@ #if BITS_IN_JSAMPLE == 8 #ifndef WITH_SIMD -typedef int DCTELEM; /* 16 or 32 bits is fine */ +typedef int DCTELEM; /* 16 or 32 bits is fine */ typedef unsigned int UDCTELEM; typedef unsigned long long UDCTELEM2; #else @@ -38,7 +38,7 @@ typedef unsigned short UDCTELEM; typedef unsigned int UDCTELEM2; #endif #else -typedef INT32 DCTELEM; /* must have 32 bits */ +typedef INT32 DCTELEM; /* must have 32 bits */ typedef UINT32 UDCTELEM; typedef unsigned long long UDCTELEM2; #endif @@ -64,10 +64,10 @@ typedef unsigned long long UDCTELEM2; typedef MULTIPLIER ISLOW_MULT_TYPE; /* short or int, whichever is faster */ #if BITS_IN_JSAMPLE == 8 typedef MULTIPLIER IFAST_MULT_TYPE; /* 16 bits is OK, use short if faster */ -#define IFAST_SCALE_BITS 2 /* fractional bits in scale factors */ +#define IFAST_SCALE_BITS 2 /* fractional bits in scale factors */ #else -typedef INT32 IFAST_MULT_TYPE; /* need 32 bits for scaled quantizers */ -#define IFAST_SCALE_BITS 13 /* fractional bits in scale factors */ +typedef INT32 IFAST_MULT_TYPE; /* need 32 bits for scaled quantizers */ +#define IFAST_SCALE_BITS 13 /* fractional bits in scale factors */ #endif typedef 
FAST_FLOAT FLOAT_MULT_TYPE; /* preferred floating type */ @@ -89,27 +89,27 @@ typedef FAST_FLOAT FLOAT_MULT_TYPE; /* preferred floating type */ /* Short forms of external names for systems with brain-damaged linkers. */ #ifdef NEED_SHORT_EXTERNAL_NAMES -#define jpeg_fdct_islow jFDislow -#define jpeg_fdct_ifast jFDifast -#define jpeg_fdct_float jFDfloat -#define jpeg_idct_islow jRDislow -#define jpeg_idct_ifast jRDifast -#define jpeg_idct_float jRDfloat -#define jpeg_idct_7x7 jRD7x7 -#define jpeg_idct_6x6 jRD6x6 -#define jpeg_idct_5x5 jRD5x5 -#define jpeg_idct_4x4 jRD4x4 -#define jpeg_idct_3x3 jRD3x3 -#define jpeg_idct_2x2 jRD2x2 -#define jpeg_idct_1x1 jRD1x1 -#define jpeg_idct_9x9 jRD9x9 -#define jpeg_idct_10x10 jRD10x10 -#define jpeg_idct_11x11 jRD11x11 -#define jpeg_idct_12x12 jRD12x12 -#define jpeg_idct_13x13 jRD13x13 -#define jpeg_idct_14x14 jRD14x14 -#define jpeg_idct_15x15 jRD15x15 -#define jpeg_idct_16x16 jRD16x16 +#define jpeg_fdct_islow jFDislow +#define jpeg_fdct_ifast jFDifast +#define jpeg_fdct_float jFDfloat +#define jpeg_idct_islow jRDislow +#define jpeg_idct_ifast jRDifast +#define jpeg_idct_float jRDfloat +#define jpeg_idct_7x7 jRD7x7 +#define jpeg_idct_6x6 jRD6x6 +#define jpeg_idct_5x5 jRD5x5 +#define jpeg_idct_4x4 jRD4x4 +#define jpeg_idct_3x3 jRD3x3 +#define jpeg_idct_2x2 jRD2x2 +#define jpeg_idct_1x1 jRD1x1 +#define jpeg_idct_9x9 jRD9x9 +#define jpeg_idct_10x10 jRD10x10 +#define jpeg_idct_11x11 jRD11x11 +#define jpeg_idct_12x12 jRD12x12 +#define jpeg_idct_13x13 jRD13x13 +#define jpeg_idct_14x14 jRD14x14 +#define jpeg_idct_15x15 jRD15x15 +#define jpeg_idct_16x16 jRD16x16 #endif /* NEED_SHORT_EXTERNAL_NAMES */ /* Extern declarations for the forward and inverse DCT routines. 
*/ @@ -120,58 +120,58 @@ EXTERN(void) jpeg_fdct_float JPP((FAST_FLOAT * data)); EXTERN(void) jpeg_idct_islow JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); EXTERN(void) jpeg_idct_ifast JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); EXTERN(void) jpeg_idct_float JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); EXTERN(void) jpeg_idct_7x7 JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); EXTERN(void) jpeg_idct_6x6 JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); EXTERN(void) jpeg_idct_5x5 JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); EXTERN(void) jpeg_idct_4x4 JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); EXTERN(void) jpeg_idct_3x3 JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); EXTERN(void) jpeg_idct_2x2 JPP((j_decompress_ptr cinfo, jpeg_component_info * 
compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); EXTERN(void) jpeg_idct_1x1 JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); EXTERN(void) jpeg_idct_9x9 JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); EXTERN(void) jpeg_idct_10x10 JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); EXTERN(void) jpeg_idct_11x11 JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); EXTERN(void) jpeg_idct_12x12 JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); EXTERN(void) jpeg_idct_13x13 JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); EXTERN(void) jpeg_idct_14x14 JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); EXTERN(void) jpeg_idct_15x15 JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); 
EXTERN(void) jpeg_idct_16x16 JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); + JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); /* @@ -184,7 +184,7 @@ EXTERN(void) jpeg_idct_16x16 * and may differ from one module to the next. */ -#define ONE ((INT32) 1) +#define ONE ((INT32) 1) #define CONST_SCALE (ONE << CONST_BITS) /* Convert a positive real constant to an integer scaled by CONST_SCALE. @@ -192,7 +192,7 @@ EXTERN(void) jpeg_idct_16x16 * thus causing a lot of useless floating-point operations at run time. */ -#define FIX(x) ((INT32) ((x) * CONST_SCALE + 0.5)) +#define FIX(x) ((INT32) ((x) * CONST_SCALE + 0.5)) /* Descale and correctly round an INT32 value that's scaled by N bits. * We assume RIGHT_SHIFT rounds towards minus infinity, so adding @@ -210,23 +210,23 @@ EXTERN(void) jpeg_idct_16x16 * correct combination of casts. */ -#ifdef SHORTxSHORT_32 /* may work if 'int' is 32 bits */ +#ifdef SHORTxSHORT_32 /* may work if 'int' is 32 bits */ #define MULTIPLY16C16(var,const) (((INT16) (var)) * ((INT16) (const))) #endif -#ifdef SHORTxLCONST_32 /* known to work with Microsoft C 6.0 */ +#ifdef SHORTxLCONST_32 /* known to work with Microsoft C 6.0 */ #define MULTIPLY16C16(var,const) (((INT16) (var)) * ((INT32) (const))) #endif -#ifndef MULTIPLY16C16 /* default definition */ +#ifndef MULTIPLY16C16 /* default definition */ #define MULTIPLY16C16(var,const) ((var) * (const)) #endif /* Same except both inputs are variables. 
*/ -#ifdef SHORTxSHORT_32 /* may work if 'int' is 32 bits */ +#ifdef SHORTxSHORT_32 /* may work if 'int' is 32 bits */ #define MULTIPLY16V16(var1,var2) (((INT16) (var1)) * ((INT16) (var2))) #endif -#ifndef MULTIPLY16V16 /* default definition */ +#ifndef MULTIPLY16V16 /* default definition */ #define MULTIPLY16V16(var1,var2) ((var1) * (var2)) #endif diff --git a/jddctmgr.c b/jddctmgr.c index 88b470733..bcc4f556c 100644 --- a/jddctmgr.c +++ b/jddctmgr.c @@ -22,7 +22,7 @@ #define JPEG_INTERNALS #include "jinclude.h" #include "jpeglib.h" -#include "jdct.h" /* Private declarations for DCT subsystem */ +#include "jdct.h" /* Private declarations for DCT subsystem */ #include "jsimddct.h" #include "jpegcomp.h" @@ -47,7 +47,7 @@ /* Private subobject for this module */ typedef struct { - struct jpeg_inverse_dct pub; /* public fields */ + struct jpeg_inverse_dct pub; /* public fields */ /* This array contains the IDCT method code that each multiplier table * is currently set up for, or -1 if it's not yet set up. 
@@ -108,29 +108,29 @@ start_pass (j_decompress_ptr cinfo) #ifdef IDCT_SCALING_SUPPORTED case 1: method_ptr = jpeg_idct_1x1; - method = JDCT_ISLOW; /* jidctred uses islow-style table */ + method = JDCT_ISLOW; /* jidctred uses islow-style table */ break; case 2: if (jsimd_can_idct_2x2()) method_ptr = jsimd_idct_2x2; else method_ptr = jpeg_idct_2x2; - method = JDCT_ISLOW; /* jidctred uses islow-style table */ + method = JDCT_ISLOW; /* jidctred uses islow-style table */ break; case 3: method_ptr = jpeg_idct_3x3; - method = JDCT_ISLOW; /* jidctint uses islow-style table */ + method = JDCT_ISLOW; /* jidctint uses islow-style table */ break; case 4: if (jsimd_can_idct_4x4()) method_ptr = jsimd_idct_4x4; else method_ptr = jpeg_idct_4x4; - method = JDCT_ISLOW; /* jidctred uses islow-style table */ + method = JDCT_ISLOW; /* jidctred uses islow-style table */ break; case 5: method_ptr = jpeg_idct_5x5; - method = JDCT_ISLOW; /* jidctint uses islow-style table */ + method = JDCT_ISLOW; /* jidctint uses islow-style table */ break; case 6: #if defined(__mips__) @@ -139,58 +139,58 @@ start_pass (j_decompress_ptr cinfo) else #endif method_ptr = jpeg_idct_6x6; - method = JDCT_ISLOW; /* jidctint uses islow-style table */ + method = JDCT_ISLOW; /* jidctint uses islow-style table */ break; case 7: method_ptr = jpeg_idct_7x7; - method = JDCT_ISLOW; /* jidctint uses islow-style table */ + method = JDCT_ISLOW; /* jidctint uses islow-style table */ break; #endif case DCTSIZE: switch (cinfo->dct_method) { #ifdef DCT_ISLOW_SUPPORTED case JDCT_ISLOW: - if (jsimd_can_idct_islow()) - method_ptr = jsimd_idct_islow; - else - method_ptr = jpeg_idct_islow; - method = JDCT_ISLOW; - break; + if (jsimd_can_idct_islow()) + method_ptr = jsimd_idct_islow; + else + method_ptr = jpeg_idct_islow; + method = JDCT_ISLOW; + break; #endif #ifdef DCT_IFAST_SUPPORTED case JDCT_IFAST: - if (jsimd_can_idct_ifast()) - method_ptr = jsimd_idct_ifast; - else - method_ptr = jpeg_idct_ifast; - method = JDCT_IFAST; - 
break; + if (jsimd_can_idct_ifast()) + method_ptr = jsimd_idct_ifast; + else + method_ptr = jpeg_idct_ifast; + method = JDCT_IFAST; + break; #endif #ifdef DCT_FLOAT_SUPPORTED case JDCT_FLOAT: - if (jsimd_can_idct_float()) - method_ptr = jsimd_idct_float; - else - method_ptr = jpeg_idct_float; - method = JDCT_FLOAT; - break; + if (jsimd_can_idct_float()) + method_ptr = jsimd_idct_float; + else + method_ptr = jpeg_idct_float; + method = JDCT_FLOAT; + break; #endif default: - ERREXIT(cinfo, JERR_NOT_COMPILED); - break; + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; } break; case 9: method_ptr = jpeg_idct_9x9; - method = JDCT_ISLOW; /* jidctint uses islow-style table */ + method = JDCT_ISLOW; /* jidctint uses islow-style table */ break; case 10: method_ptr = jpeg_idct_10x10; - method = JDCT_ISLOW; /* jidctint uses islow-style table */ + method = JDCT_ISLOW; /* jidctint uses islow-style table */ break; case 11: method_ptr = jpeg_idct_11x11; - method = JDCT_ISLOW; /* jidctint uses islow-style table */ + method = JDCT_ISLOW; /* jidctint uses islow-style table */ break; case 12: #if defined(__mips__) @@ -199,23 +199,23 @@ start_pass (j_decompress_ptr cinfo) else #endif method_ptr = jpeg_idct_12x12; - method = JDCT_ISLOW; /* jidctint uses islow-style table */ + method = JDCT_ISLOW; /* jidctint uses islow-style table */ break; case 13: method_ptr = jpeg_idct_13x13; - method = JDCT_ISLOW; /* jidctint uses islow-style table */ + method = JDCT_ISLOW; /* jidctint uses islow-style table */ break; case 14: method_ptr = jpeg_idct_14x14; - method = JDCT_ISLOW; /* jidctint uses islow-style table */ + method = JDCT_ISLOW; /* jidctint uses islow-style table */ break; case 15: method_ptr = jpeg_idct_15x15; - method = JDCT_ISLOW; /* jidctint uses islow-style table */ + method = JDCT_ISLOW; /* jidctint uses islow-style table */ break; case 16: method_ptr = jpeg_idct_16x16; - method = JDCT_ISLOW; /* jidctint uses islow-style table */ + method = JDCT_ISLOW; /* jidctint uses islow-style table 
*/ break; default: ERREXIT1(cinfo, JERR_BAD_DCTSIZE, compptr->_DCT_scaled_size); @@ -232,81 +232,81 @@ start_pass (j_decompress_ptr cinfo) if (! compptr->component_needed || idct->cur_method[ci] == method) continue; qtbl = compptr->quant_table; - if (qtbl == NULL) /* happens if no data yet for component */ + if (qtbl == NULL) /* happens if no data yet for component */ continue; idct->cur_method[ci] = method; switch (method) { #ifdef PROVIDE_ISLOW_TABLES case JDCT_ISLOW: { - /* For LL&M IDCT method, multipliers are equal to raw quantization - * coefficients, but are stored as ints to ensure access efficiency. - */ - ISLOW_MULT_TYPE * ismtbl = (ISLOW_MULT_TYPE *) compptr->dct_table; - for (i = 0; i < DCTSIZE2; i++) { - ismtbl[i] = (ISLOW_MULT_TYPE) qtbl->quantval[i]; - } + /* For LL&M IDCT method, multipliers are equal to raw quantization + * coefficients, but are stored as ints to ensure access efficiency. + */ + ISLOW_MULT_TYPE * ismtbl = (ISLOW_MULT_TYPE *) compptr->dct_table; + for (i = 0; i < DCTSIZE2; i++) { + ismtbl[i] = (ISLOW_MULT_TYPE) qtbl->quantval[i]; + } } break; #endif #ifdef DCT_IFAST_SUPPORTED case JDCT_IFAST: { - /* For AA&N IDCT method, multipliers are equal to quantization - * coefficients scaled by scalefactor[row]*scalefactor[col], where - * scalefactor[0] = 1 - * scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7 - * For integer operation, the multiplier table is to be scaled by - * IFAST_SCALE_BITS. - */ - IFAST_MULT_TYPE * ifmtbl = (IFAST_MULT_TYPE *) compptr->dct_table; + /* For AA&N IDCT method, multipliers are equal to quantization + * coefficients scaled by scalefactor[row]*scalefactor[col], where + * scalefactor[0] = 1 + * scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7 + * For integer operation, the multiplier table is to be scaled by + * IFAST_SCALE_BITS. 
+ */ + IFAST_MULT_TYPE * ifmtbl = (IFAST_MULT_TYPE *) compptr->dct_table; #define CONST_BITS 14 - static const INT16 aanscales[DCTSIZE2] = { - /* precomputed values scaled up by 14 bits */ - 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520, - 22725, 31521, 29692, 26722, 22725, 17855, 12299, 6270, - 21407, 29692, 27969, 25172, 21407, 16819, 11585, 5906, - 19266, 26722, 25172, 22654, 19266, 15137, 10426, 5315, - 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520, - 12873, 17855, 16819, 15137, 12873, 10114, 6967, 3552, - 8867, 12299, 11585, 10426, 8867, 6967, 4799, 2446, - 4520, 6270, 5906, 5315, 4520, 3552, 2446, 1247 - }; - SHIFT_TEMPS + static const INT16 aanscales[DCTSIZE2] = { + /* precomputed values scaled up by 14 bits */ + 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520, + 22725, 31521, 29692, 26722, 22725, 17855, 12299, 6270, + 21407, 29692, 27969, 25172, 21407, 16819, 11585, 5906, + 19266, 26722, 25172, 22654, 19266, 15137, 10426, 5315, + 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520, + 12873, 17855, 16819, 15137, 12873, 10114, 6967, 3552, + 8867, 12299, 11585, 10426, 8867, 6967, 4799, 2446, + 4520, 6270, 5906, 5315, 4520, 3552, 2446, 1247 + }; + SHIFT_TEMPS - for (i = 0; i < DCTSIZE2; i++) { - ifmtbl[i] = (IFAST_MULT_TYPE) - DESCALE(MULTIPLY16V16((INT32) qtbl->quantval[i], - (INT32) aanscales[i]), - CONST_BITS-IFAST_SCALE_BITS); - } + for (i = 0; i < DCTSIZE2; i++) { + ifmtbl[i] = (IFAST_MULT_TYPE) + DESCALE(MULTIPLY16V16((INT32) qtbl->quantval[i], + (INT32) aanscales[i]), + CONST_BITS-IFAST_SCALE_BITS); + } } break; #endif #ifdef DCT_FLOAT_SUPPORTED case JDCT_FLOAT: { - /* For float AA&N IDCT method, multipliers are equal to quantization - * coefficients scaled by scalefactor[row]*scalefactor[col], where - * scalefactor[0] = 1 - * scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7 - */ - FLOAT_MULT_TYPE * fmtbl = (FLOAT_MULT_TYPE *) compptr->dct_table; - int row, col; - static const double aanscalefactor[DCTSIZE] = { - 1.0, 1.387039845, 
1.306562965, 1.175875602, - 1.0, 0.785694958, 0.541196100, 0.275899379 - }; + /* For float AA&N IDCT method, multipliers are equal to quantization + * coefficients scaled by scalefactor[row]*scalefactor[col], where + * scalefactor[0] = 1 + * scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7 + */ + FLOAT_MULT_TYPE * fmtbl = (FLOAT_MULT_TYPE *) compptr->dct_table; + int row, col; + static const double aanscalefactor[DCTSIZE] = { + 1.0, 1.387039845, 1.306562965, 1.175875602, + 1.0, 0.785694958, 0.541196100, 0.275899379 + }; - i = 0; - for (row = 0; row < DCTSIZE; row++) { - for (col = 0; col < DCTSIZE; col++) { - fmtbl[i] = (FLOAT_MULT_TYPE) - ((double) qtbl->quantval[i] * - aanscalefactor[row] * aanscalefactor[col]); - i++; - } - } + i = 0; + for (row = 0; row < DCTSIZE; row++) { + for (col = 0; col < DCTSIZE; col++) { + fmtbl[i] = (FLOAT_MULT_TYPE) + ((double) qtbl->quantval[i] * + aanscalefactor[row] * aanscalefactor[col]); + i++; + } + } } break; #endif @@ -331,7 +331,7 @@ jinit_inverse_dct (j_decompress_ptr cinfo) idct = (my_idct_ptr) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(my_idct_controller)); + SIZEOF(my_idct_controller)); cinfo->idct = (struct jpeg_inverse_dct *) idct; idct->pub.start_pass = start_pass; @@ -340,7 +340,7 @@ jinit_inverse_dct (j_decompress_ptr cinfo) /* Allocate and pre-zero a multiplier table for each component */ compptr->dct_table = (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(multiplier_table)); + SIZEOF(multiplier_table)); MEMZERO(compptr->dct_table, SIZEOF(multiplier_table)); /* Mark multiplier table not yet set up for any method */ idct->cur_method[ci] = -1; diff --git a/jdhuff.c b/jdhuff.c index d21d39925..db2b5eaca 100644 --- a/jdhuff.c +++ b/jdhuff.c @@ -19,7 +19,7 @@ #define JPEG_INTERNALS #include "jinclude.h" #include "jpeglib.h" -#include "jdhuff.h" /* Declarations shared with jdphuff.c */ +#include "jdhuff.h" /* Declarations shared with jdphuff.c */ #include "jpegcomp.h" 
#include "jstdhuff.c" @@ -45,10 +45,10 @@ typedef struct { #else #if MAX_COMPS_IN_SCAN == 4 #define ASSIGN_STATE(dest,src) \ - ((dest).last_dc_val[0] = (src).last_dc_val[0], \ - (dest).last_dc_val[1] = (src).last_dc_val[1], \ - (dest).last_dc_val[2] = (src).last_dc_val[2], \ - (dest).last_dc_val[3] = (src).last_dc_val[3]) + ((dest).last_dc_val[0] = (src).last_dc_val[0], \ + (dest).last_dc_val[1] = (src).last_dc_val[1], \ + (dest).last_dc_val[2] = (src).last_dc_val[2], \ + (dest).last_dc_val[3] = (src).last_dc_val[3]) #endif #endif @@ -59,11 +59,11 @@ typedef struct { /* These fields are loaded into local variables at start of each MCU. * In case of suspension, we exit WITHOUT updating them. */ - bitread_perm_state bitstate; /* Bit buffer at start of MCU */ - savable_state saved; /* Other state at start of MCU */ + bitread_perm_state bitstate; /* Bit buffer at start of MCU */ + savable_state saved; /* Other state at start of MCU */ /* These fields are NOT loaded into local working state. 
*/ - unsigned int restarts_to_go; /* MCUs left in this restart interval */ + unsigned int restarts_to_go; /* MCUs left in this restart interval */ /* Pointers to derived tables (these workspaces have image lifespan) */ d_derived_tbl * dc_derived_tbls[NUM_HUFF_TBLS]; @@ -108,9 +108,9 @@ start_pass_huff_decoder (j_decompress_ptr cinfo) /* Compute derived values for Huffman tables */ /* We may do this more than once for a table, but it's not expensive */ jpeg_make_d_derived_tbl(cinfo, TRUE, dctbl, - & entropy->dc_derived_tbls[dctbl]); + & entropy->dc_derived_tbls[dctbl]); jpeg_make_d_derived_tbl(cinfo, FALSE, actbl, - & entropy->ac_derived_tbls[actbl]); + & entropy->ac_derived_tbls[actbl]); /* Initialize DC predictions to 0 */ entropy->saved.last_dc_val[ci] = 0; } @@ -151,7 +151,7 @@ start_pass_huff_decoder (j_decompress_ptr cinfo) GLOBAL(void) jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno, - d_derived_tbl ** pdtbl) + d_derived_tbl ** pdtbl) { JHUFF_TBL *htbl; d_derived_tbl *dtbl; @@ -177,26 +177,26 @@ jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno, if (*pdtbl == NULL) *pdtbl = (d_derived_tbl *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(d_derived_tbl)); + SIZEOF(d_derived_tbl)); dtbl = *pdtbl; - dtbl->pub = htbl; /* fill in back link */ - + dtbl->pub = htbl; /* fill in back link */ + /* Figure C.1: make table of Huffman code length for each symbol */ p = 0; for (l = 1; l <= 16; l++) { i = (int) htbl->bits[l]; - if (i < 0 || p + i > 256) /* protect against table overrun */ + if (i < 0 || p + i > 256) /* protect against table overrun */ ERREXIT(cinfo, JERR_BAD_HUFF_TABLE); while (i--) huffsize[p++] = (char) l; } huffsize[p] = 0; numsymbols = p; - + /* Figure C.2: generate the codes themselves */ /* We also validate that the counts represent a legal Huffman code tree. 
*/ - + code = 0; si = huffsize[0]; p = 0; @@ -226,7 +226,7 @@ jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno, p += htbl->bits[l]; dtbl->maxcode[l] = huffcode[p-1]; /* maximum code of length l */ } else { - dtbl->maxcode[l] = -1; /* -1 if no codes of this length */ + dtbl->maxcode[l] = -1; /* -1 if no codes of this length */ } } dtbl->valoffset[17] = 0; @@ -249,8 +249,8 @@ jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno, /* Generate left-justified code followed by all possible bit sequences */ lookbits = huffcode[p] << (HUFF_LOOKAHEAD-l); for (ctr = 1 << (HUFF_LOOKAHEAD-l); ctr > 0; ctr--) { - dtbl->lookup[lookbits] = (l << HUFF_LOOKAHEAD) | htbl->huffval[p]; - lookbits++; + dtbl->lookup[lookbits] = (l << HUFF_LOOKAHEAD) | htbl->huffval[p]; + lookbits++; } } } @@ -265,7 +265,7 @@ jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno, for (i = 0; i < numsymbols; i++) { int sym = htbl->huffval[i]; if (sym < 0 || sym > 15) - ERREXIT(cinfo, JERR_BAD_HUFF_TABLE); + ERREXIT(cinfo, JERR_BAD_HUFF_TABLE); } } } @@ -287,7 +287,7 @@ jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno, */ #ifdef SLOW_SHIFT_32 -#define MIN_GET_BITS 15 /* minimum allowable value */ +#define MIN_GET_BITS 15 /* minimum allowable value */ #else #define MIN_GET_BITS (BIT_BUF_SIZE-7) #endif @@ -295,8 +295,8 @@ jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno, GLOBAL(boolean) jpeg_fill_bit_buffer (bitread_working_state * state, - register bit_buf_type get_buffer, register int bits_left, - int nbits) + register bit_buf_type get_buffer, register int bits_left, + int nbits) /* Load up the bit buffer to a depth of at least nbits */ { /* Copy heavily used state fields into locals (hopefully registers) */ @@ -308,54 +308,54 @@ jpeg_fill_bit_buffer (bitread_working_state * state, /* (It is assumed that no request will be for more than that many bits.) 
*/ /* We fail to do so only if we hit a marker or are forced to suspend. */ - if (cinfo->unread_marker == 0) { /* cannot advance past a marker */ + if (cinfo->unread_marker == 0) { /* cannot advance past a marker */ while (bits_left < MIN_GET_BITS) { register int c; /* Attempt to read a byte */ if (bytes_in_buffer == 0) { - if (! (*cinfo->src->fill_input_buffer) (cinfo)) - return FALSE; - next_input_byte = cinfo->src->next_input_byte; - bytes_in_buffer = cinfo->src->bytes_in_buffer; + if (! (*cinfo->src->fill_input_buffer) (cinfo)) + return FALSE; + next_input_byte = cinfo->src->next_input_byte; + bytes_in_buffer = cinfo->src->bytes_in_buffer; } bytes_in_buffer--; c = GETJOCTET(*next_input_byte++); /* If it's 0xFF, check and discard stuffed zero byte */ if (c == 0xFF) { - /* Loop here to discard any padding FF's on terminating marker, - * so that we can save a valid unread_marker value. NOTE: we will - * accept multiple FF's followed by a 0 as meaning a single FF data - * byte. This data pattern is not valid according to the standard. - */ - do { - if (bytes_in_buffer == 0) { - if (! (*cinfo->src->fill_input_buffer) (cinfo)) - return FALSE; - next_input_byte = cinfo->src->next_input_byte; - bytes_in_buffer = cinfo->src->bytes_in_buffer; - } - bytes_in_buffer--; - c = GETJOCTET(*next_input_byte++); - } while (c == 0xFF); - - if (c == 0) { - /* Found FF/00, which represents an FF data byte */ - c = 0xFF; - } else { - /* Oops, it's actually a marker indicating end of compressed data. - * Save the marker code for later use. - * Fine point: it might appear that we should save the marker into - * bitread working state, not straight into permanent state. But - * once we have hit a marker, we cannot need to suspend within the - * current MCU, because we will read no more bytes from the data - * source. So it is OK to update permanent state right away. - */ - cinfo->unread_marker = c; - /* See if we need to insert some fake zero bits. 
*/ - goto no_more_bytes; - } + /* Loop here to discard any padding FF's on terminating marker, + * so that we can save a valid unread_marker value. NOTE: we will + * accept multiple FF's followed by a 0 as meaning a single FF data + * byte. This data pattern is not valid according to the standard. + */ + do { + if (bytes_in_buffer == 0) { + if (! (*cinfo->src->fill_input_buffer) (cinfo)) + return FALSE; + next_input_byte = cinfo->src->next_input_byte; + bytes_in_buffer = cinfo->src->bytes_in_buffer; + } + bytes_in_buffer--; + c = GETJOCTET(*next_input_byte++); + } while (c == 0xFF); + + if (c == 0) { + /* Found FF/00, which represents an FF data byte */ + c = 0xFF; + } else { + /* Oops, it's actually a marker indicating end of compressed data. + * Save the marker code for later use. + * Fine point: it might appear that we should save the marker into + * bitread working state, not straight into permanent state. But + * once we have hit a marker, we cannot need to suspend within the + * current MCU, because we will read no more bytes from the data + * source. So it is OK to update permanent state right away. + */ + cinfo->unread_marker = c; + /* See if we need to insert some fake zero bits. */ + goto no_more_bytes; + } } /* OK, load c into get_buffer */ @@ -375,8 +375,8 @@ jpeg_fill_bit_buffer (bitread_working_state * state, * appears per data segment. */ if (! 
cinfo->entropy->insufficient_data) { - WARNMS(cinfo, JWRN_HIT_MARKER); - cinfo->entropy->insufficient_data = TRUE; + WARNMS(cinfo, JWRN_HIT_MARKER); + cinfo->entropy->insufficient_data = TRUE; } /* Fill the buffer with zero bits */ get_buffer <<= MIN_GET_BITS - bits_left; @@ -445,8 +445,8 @@ jpeg_fill_bit_buffer (bitread_working_state * state, GLOBAL(int) jpeg_huff_decode (bitread_working_state * state, - register bit_buf_type get_buffer, register int bits_left, - d_derived_tbl * htbl, int min_bits) + register bit_buf_type get_buffer, register int bits_left, + d_derived_tbl * htbl, int min_bits) { register int l = min_bits; register INT32 code; @@ -475,7 +475,7 @@ jpeg_huff_decode (bitread_working_state * state, if (l > 16) { WARNMS(state->cinfo, JWRN_HUFF_BAD_CODE); - return 0; /* fake a zero as the safest result */ + return 0; /* fake a zero as the safest result */ } return htbl->pub->huffval[ (int) (code + htbl->valoffset[l]) ]; @@ -595,7 +595,7 @@ decode_mcu_slow (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) r = s >> 4; s &= 15; - + if (s) { k += r; CHECK_BIT_BUFFER(br_state, s, return FALSE); @@ -684,7 +684,7 @@ decode_mcu_fast (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) HUFF_DECODE_FAST(s, l, actbl); r = s >> 4; s &= 15; - + if (s) { k += r; FILL_BIT_BUFFER_FAST @@ -756,7 +756,7 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) if (cinfo->restart_interval) { if (entropy->restarts_to_go == 0) if (! 
process_restart(cinfo)) - return FALSE; + return FALSE; usefast = 0; } @@ -804,7 +804,7 @@ jinit_huff_decoder (j_decompress_ptr cinfo) entropy = (huff_entropy_ptr) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(huff_entropy_decoder)); + SIZEOF(huff_entropy_decoder)); cinfo->entropy = (struct jpeg_entropy_decoder *) entropy; entropy->pub.start_pass = start_pass_huff_decoder; entropy->pub.decode_mcu = decode_mcu; diff --git a/jdhuff.h b/jdhuff.h index 220143609..8d2c65454 100644 --- a/jdhuff.h +++ b/jdhuff.h @@ -15,21 +15,21 @@ /* Short forms of external names for systems with brain-damaged linkers. */ #ifdef NEED_SHORT_EXTERNAL_NAMES -#define jpeg_make_d_derived_tbl jMkDDerived -#define jpeg_fill_bit_buffer jFilBitBuf -#define jpeg_huff_decode jHufDecode +#define jpeg_make_d_derived_tbl jMkDDerived +#define jpeg_fill_bit_buffer jFilBitBuf +#define jpeg_huff_decode jHufDecode #endif /* NEED_SHORT_EXTERNAL_NAMES */ /* Derived data constructed for each Huffman table */ -#define HUFF_LOOKAHEAD 8 /* # of bits of lookahead */ +#define HUFF_LOOKAHEAD 8 /* # of bits of lookahead */ typedef struct { /* Basic tables: (element [0] of each array is unused) */ - INT32 maxcode[18]; /* largest code of length k (-1 if none) */ + INT32 maxcode[18]; /* largest code of length k (-1 if none) */ /* (maxcode[17] is a sentinel to ensure jpeg_huff_decode terminates) */ - INT32 valoffset[18]; /* huffval[] offset for codes of length k */ + INT32 valoffset[18]; /* huffval[] offset for codes of length k */ /* valoffset[k] = huffval[] index of 1st symbol of code length k, less * the smallest code of length k; so given a code of length k, the * corresponding symbol is huffval[code + valoffset[k]] @@ -53,8 +53,8 @@ typedef struct { /* Expand a Huffman table definition into the derived format */ EXTERN(void) jpeg_make_d_derived_tbl - JPP((j_decompress_ptr cinfo, boolean isDC, int tblno, - d_derived_tbl ** pdtbl)); + JPP((j_decompress_ptr cinfo, boolean isDC, int tblno, + 
d_derived_tbl ** pdtbl)); /* @@ -77,13 +77,13 @@ EXTERN(void) jpeg_make_d_derived_tbl #if __WORDSIZE == 64 || defined(_WIN64) -typedef size_t bit_buf_type; /* type of bit-extraction buffer */ -#define BIT_BUF_SIZE 64 /* size of buffer in bits */ +typedef size_t bit_buf_type; /* type of bit-extraction buffer */ +#define BIT_BUF_SIZE 64 /* size of buffer in bits */ #else -typedef INT32 bit_buf_type; /* type of bit-extraction buffer */ -#define BIT_BUF_SIZE 32 /* size of buffer in bits */ +typedef INT32 bit_buf_type; /* type of bit-extraction buffer */ +#define BIT_BUF_SIZE 32 /* size of buffer in bits */ #endif @@ -94,43 +94,43 @@ typedef INT32 bit_buf_type; /* type of bit-extraction buffer */ * because not all machines measure sizeof in 8-bit bytes. */ -typedef struct { /* Bitreading state saved across MCUs */ - bit_buf_type get_buffer; /* current bit-extraction buffer */ - int bits_left; /* # of unused bits in it */ +typedef struct { /* Bitreading state saved across MCUs */ + bit_buf_type get_buffer; /* current bit-extraction buffer */ + int bits_left; /* # of unused bits in it */ } bitread_perm_state; -typedef struct { /* Bitreading working state within an MCU */ +typedef struct { /* Bitreading working state within an MCU */ /* Current data source location */ /* We need a copy, rather than munging the original, in case of suspension */ const JOCTET * next_input_byte; /* => next byte to read from source */ - size_t bytes_in_buffer; /* # of bytes remaining in source buffer */ + size_t bytes_in_buffer; /* # of bytes remaining in source buffer */ /* Bit input buffer --- note these values are kept in register variables, * not in this struct, inside the inner loops. */ - bit_buf_type get_buffer; /* current bit-extraction buffer */ - int bits_left; /* # of unused bits in it */ + bit_buf_type get_buffer; /* current bit-extraction buffer */ + int bits_left; /* # of unused bits in it */ /* Pointer needed by jpeg_fill_bit_buffer. 
*/ - j_decompress_ptr cinfo; /* back link to decompress master record */ + j_decompress_ptr cinfo; /* back link to decompress master record */ } bitread_working_state; /* Macros to declare and load/save bitread local variables. */ #define BITREAD_STATE_VARS \ - register bit_buf_type get_buffer; \ - register int bits_left; \ - bitread_working_state br_state + register bit_buf_type get_buffer; \ + register int bits_left; \ + bitread_working_state br_state #define BITREAD_LOAD_STATE(cinfop,permstate) \ - br_state.cinfo = cinfop; \ - br_state.next_input_byte = cinfop->src->next_input_byte; \ - br_state.bytes_in_buffer = cinfop->src->bytes_in_buffer; \ - get_buffer = permstate.get_buffer; \ - bits_left = permstate.bits_left; + br_state.cinfo = cinfop; \ + br_state.next_input_byte = cinfop->src->next_input_byte; \ + br_state.bytes_in_buffer = cinfop->src->bytes_in_buffer; \ + get_buffer = permstate.get_buffer; \ + bits_left = permstate.bits_left; #define BITREAD_SAVE_STATE(cinfop,permstate) \ - cinfop->src->next_input_byte = br_state.next_input_byte; \ - cinfop->src->bytes_in_buffer = br_state.bytes_in_buffer; \ - permstate.get_buffer = get_buffer; \ - permstate.bits_left = bits_left + cinfop->src->next_input_byte = br_state.next_input_byte; \ + cinfop->src->bytes_in_buffer = br_state.bytes_in_buffer; \ + permstate.get_buffer = get_buffer; \ + permstate.bits_left = bits_left /* * These macros provide the in-line portion of bit fetching. @@ -138,37 +138,37 @@ typedef struct { /* Bitreading working state within an MCU */ * before using GET_BITS, PEEK_BITS, or DROP_BITS. * The variables get_buffer and bits_left are assumed to be locals, * but the state struct might not be (jpeg_huff_decode needs this). - * CHECK_BIT_BUFFER(state,n,action); - * Ensure there are N bits in get_buffer; if suspend, take action. + * CHECK_BIT_BUFFER(state,n,action); + * Ensure there are N bits in get_buffer; if suspend, take action. * val = GET_BITS(n); - * Fetch next N bits. 
+ * Fetch next N bits. * val = PEEK_BITS(n); - * Fetch next N bits without removing them from the buffer. - * DROP_BITS(n); - * Discard next N bits. + * Fetch next N bits without removing them from the buffer. + * DROP_BITS(n); + * Discard next N bits. * The value N should be a simple variable, not an expression, because it * is evaluated multiple times. */ #define CHECK_BIT_BUFFER(state,nbits,action) \ - { if (bits_left < (nbits)) { \ - if (! jpeg_fill_bit_buffer(&(state),get_buffer,bits_left,nbits)) \ - { action; } \ - get_buffer = (state).get_buffer; bits_left = (state).bits_left; } } + { if (bits_left < (nbits)) { \ + if (! jpeg_fill_bit_buffer(&(state),get_buffer,bits_left,nbits)) \ + { action; } \ + get_buffer = (state).get_buffer; bits_left = (state).bits_left; } } #define GET_BITS(nbits) \ - (((int) (get_buffer >> (bits_left -= (nbits)))) & ((1<<(nbits))-1)) + (((int) (get_buffer >> (bits_left -= (nbits)))) & ((1<<(nbits))-1)) #define PEEK_BITS(nbits) \ - (((int) (get_buffer >> (bits_left - (nbits)))) & ((1<<(nbits))-1)) + (((int) (get_buffer >> (bits_left - (nbits)))) & ((1<<(nbits))-1)) #define DROP_BITS(nbits) \ - (bits_left -= (nbits)) + (bits_left -= (nbits)) /* Load up the bit buffer to a depth of at least nbits */ EXTERN(boolean) jpeg_fill_bit_buffer - JPP((bitread_working_state * state, register bit_buf_type get_buffer, - register int bits_left, int nbits)); + JPP((bitread_working_state * state, register bit_buf_type get_buffer, + register int bits_left, int nbits)); /* @@ -204,7 +204,7 @@ EXTERN(boolean) jpeg_fill_bit_buffer } else { \ slowlabel: \ if ((result=jpeg_huff_decode(&state,get_buffer,bits_left,htbl,nb)) < 0) \ - { failaction; } \ + { failaction; } \ get_buffer = state.get_buffer; bits_left = state.bits_left; \ } \ } @@ -231,5 +231,5 @@ slowlabel: \ /* Out-of-line case for Huffman code fetching */ EXTERN(int) jpeg_huff_decode - JPP((bitread_working_state * state, register bit_buf_type get_buffer, - register int bits_left, d_derived_tbl * 
htbl, int min_bits)); + JPP((bitread_working_state * state, register bit_buf_type get_buffer, + register int bits_left, d_derived_tbl * htbl, int min_bits)); diff --git a/jdinput.c b/jdinput.c index e7ba33f7b..4afb07428 100644 --- a/jdinput.c +++ b/jdinput.c @@ -24,7 +24,7 @@ typedef struct { struct jpeg_input_controller pub; /* public fields */ - boolean inheaders; /* TRUE until first SOS is reached */ + boolean inheaders; /* TRUE until first SOS is reached */ } my_input_controller; typedef my_input_controller * my_inputctl_ptr; @@ -57,7 +57,7 @@ initial_setup (j_decompress_ptr cinfo) /* Check that number of components won't exceed internal array sizes */ if (cinfo->num_components > MAX_COMPONENTS) ERREXIT2(cinfo, JERR_COMPONENT_COUNT, cinfo->num_components, - MAX_COMPONENTS); + MAX_COMPONENTS); /* Compute maximum sampling factors; check factor validity */ cinfo->max_h_samp_factor = 1; @@ -65,12 +65,12 @@ initial_setup (j_decompress_ptr cinfo) for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; ci++, compptr++) { if (compptr->h_samp_factor<=0 || compptr->h_samp_factor>MAX_SAMP_FACTOR || - compptr->v_samp_factor<=0 || compptr->v_samp_factor>MAX_SAMP_FACTOR) + compptr->v_samp_factor<=0 || compptr->v_samp_factor>MAX_SAMP_FACTOR) ERREXIT(cinfo, JERR_BAD_SAMPLING); cinfo->max_h_samp_factor = MAX(cinfo->max_h_samp_factor, - compptr->h_samp_factor); + compptr->h_samp_factor); cinfo->max_v_samp_factor = MAX(cinfo->max_v_samp_factor, - compptr->v_samp_factor); + compptr->v_samp_factor); } #if JPEG_LIB_VERSION >=80 @@ -100,10 +100,10 @@ initial_setup (j_decompress_ptr cinfo) /* Size in DCT blocks */ compptr->width_in_blocks = (JDIMENSION) jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor, - (long) (cinfo->max_h_samp_factor * DCTSIZE)); + (long) (cinfo->max_h_samp_factor * DCTSIZE)); compptr->height_in_blocks = (JDIMENSION) jdiv_round_up((long) cinfo->image_height * (long) compptr->v_samp_factor, - (long) (cinfo->max_v_samp_factor * 
DCTSIZE)); + (long) (cinfo->max_v_samp_factor * DCTSIZE)); /* downsampled_width and downsampled_height will also be overridden by * jdmaster.c if we are doing full decompression. The transcoder library * doesn't use these values, but the calling application might. @@ -111,10 +111,10 @@ initial_setup (j_decompress_ptr cinfo) /* Size in samples */ compptr->downsampled_width = (JDIMENSION) jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor, - (long) cinfo->max_h_samp_factor); + (long) cinfo->max_h_samp_factor); compptr->downsampled_height = (JDIMENSION) jdiv_round_up((long) cinfo->image_height * (long) compptr->v_samp_factor, - (long) cinfo->max_v_samp_factor); + (long) cinfo->max_v_samp_factor); /* Mark component needed, until color conversion says otherwise */ compptr->component_needed = TRUE; /* Mark no quantization table yet saved for component */ @@ -124,7 +124,7 @@ initial_setup (j_decompress_ptr cinfo) /* Compute number of fully interleaved MCU rows. */ cinfo->total_iMCU_rows = (JDIMENSION) jdiv_round_up((long) cinfo->image_height, - (long) (cinfo->max_v_samp_factor*DCTSIZE)); + (long) (cinfo->max_v_samp_factor*DCTSIZE)); /* Decide whether file contains multiple scans */ if (cinfo->comps_in_scan < cinfo->num_components || cinfo->progressive_mode) @@ -141,16 +141,16 @@ per_scan_setup (j_decompress_ptr cinfo) { int ci, mcublks, tmp; jpeg_component_info *compptr; - + if (cinfo->comps_in_scan == 1) { - + /* Noninterleaved (single-component) scan */ compptr = cinfo->cur_comp_info[0]; - + /* Overall image size in MCUs */ cinfo->MCUs_per_row = compptr->width_in_blocks; cinfo->MCU_rows_in_scan = compptr->height_in_blocks; - + /* For noninterleaved scan, always one block per MCU */ compptr->MCU_width = 1; compptr->MCU_height = 1; @@ -163,28 +163,28 @@ per_scan_setup (j_decompress_ptr cinfo) tmp = (int) (compptr->height_in_blocks % compptr->v_samp_factor); if (tmp == 0) tmp = compptr->v_samp_factor; compptr->last_row_height = tmp; - + /* Prepare 
array describing MCU composition */ cinfo->blocks_in_MCU = 1; cinfo->MCU_membership[0] = 0; - + } else { - + /* Interleaved (multi-component) scan */ if (cinfo->comps_in_scan <= 0 || cinfo->comps_in_scan > MAX_COMPS_IN_SCAN) ERREXIT2(cinfo, JERR_COMPONENT_COUNT, cinfo->comps_in_scan, - MAX_COMPS_IN_SCAN); - + MAX_COMPS_IN_SCAN); + /* Overall image size in MCUs */ cinfo->MCUs_per_row = (JDIMENSION) jdiv_round_up((long) cinfo->image_width, - (long) (cinfo->max_h_samp_factor*DCTSIZE)); + (long) (cinfo->max_h_samp_factor*DCTSIZE)); cinfo->MCU_rows_in_scan = (JDIMENSION) jdiv_round_up((long) cinfo->image_height, - (long) (cinfo->max_v_samp_factor*DCTSIZE)); - + (long) (cinfo->max_v_samp_factor*DCTSIZE)); + cinfo->blocks_in_MCU = 0; - + for (ci = 0; ci < cinfo->comps_in_scan; ci++) { compptr = cinfo->cur_comp_info[ci]; /* Sampling factors give # of blocks of component in each MCU */ @@ -202,12 +202,12 @@ per_scan_setup (j_decompress_ptr cinfo) /* Prepare array describing MCU composition */ mcublks = compptr->MCU_blocks; if (cinfo->blocks_in_MCU + mcublks > D_MAX_BLOCKS_IN_MCU) - ERREXIT(cinfo, JERR_BAD_MCU_SIZE); + ERREXIT(cinfo, JERR_BAD_MCU_SIZE); while (mcublks-- > 0) { - cinfo->MCU_membership[cinfo->blocks_in_MCU++] = ci; + cinfo->MCU_membership[cinfo->blocks_in_MCU++] = ci; } } - + } } @@ -248,12 +248,12 @@ latch_quant_tables (j_decompress_ptr cinfo) /* Make sure specified quantization table is present */ qtblno = compptr->quant_tbl_no; if (qtblno < 0 || qtblno >= NUM_QUANT_TBLS || - cinfo->quant_tbl_ptrs[qtblno] == NULL) + cinfo->quant_tbl_ptrs[qtblno] == NULL) ERREXIT1(cinfo, JERR_NO_QUANT_TABLE, qtblno); /* OK, save away the quantization table */ qtbl = (JQUANT_TBL *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(JQUANT_TBL)); + SIZEOF(JQUANT_TBL)); MEMCOPY(qtbl, cinfo->quant_tbl_ptrs[qtblno], SIZEOF(JQUANT_TBL)); compptr->quant_table = qtbl; } @@ -313,31 +313,31 @@ consume_markers (j_decompress_ptr cinfo) val = 
(*cinfo->marker->read_markers) (cinfo); switch (val) { - case JPEG_REACHED_SOS: /* Found SOS */ - if (inputctl->inheaders) { /* 1st SOS */ + case JPEG_REACHED_SOS: /* Found SOS */ + if (inputctl->inheaders) { /* 1st SOS */ initial_setup(cinfo); inputctl->inheaders = FALSE; /* Note: start_input_pass must be called by jdmaster.c * before any more input can be consumed. jdapimin.c is * responsible for enforcing this sequencing. */ - } else { /* 2nd or later SOS marker */ + } else { /* 2nd or later SOS marker */ if (! inputctl->pub.has_multiple_scans) - ERREXIT(cinfo, JERR_EOI_EXPECTED); /* Oops, I wasn't expecting this! */ + ERREXIT(cinfo, JERR_EOI_EXPECTED); /* Oops, I wasn't expecting this! */ start_input_pass(cinfo); } break; - case JPEG_REACHED_EOI: /* Found EOI */ + case JPEG_REACHED_EOI: /* Found EOI */ inputctl->pub.eoi_reached = TRUE; - if (inputctl->inheaders) { /* Tables-only datastream, apparently */ + if (inputctl->inheaders) { /* Tables-only datastream, apparently */ if (cinfo->marker->saw_SOF) - ERREXIT(cinfo, JERR_SOF_NO_SOS); + ERREXIT(cinfo, JERR_SOF_NO_SOS); } else { /* Prevent infinite loop in coef ctlr's decompress_data routine * if user set output_scan_number larger than number of scans. 
*/ if (cinfo->output_scan_number > cinfo->input_scan_number) - cinfo->output_scan_number = cinfo->input_scan_number; + cinfo->output_scan_number = cinfo->input_scan_number; } break; case JPEG_SUSPENDED: @@ -382,7 +382,7 @@ jinit_input_controller (j_decompress_ptr cinfo) /* Create subobject in permanent pool */ inputctl = (my_inputctl_ptr) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT, - SIZEOF(my_input_controller)); + SIZEOF(my_input_controller)); cinfo->inputctl = (struct jpeg_input_controller *) inputctl; /* Initialize method pointers */ inputctl->pub.consume_input = consume_markers; diff --git a/jdmainct.c b/jdmainct.c index 26b816c52..922f649a7 100644 --- a/jdmainct.c +++ b/jdmainct.c @@ -120,39 +120,39 @@ typedef struct { /* Pointer to allocated workspace (M or M+2 row groups). */ JSAMPARRAY buffer[MAX_COMPONENTS]; - boolean buffer_full; /* Have we gotten an iMCU row from decoder? */ - JDIMENSION rowgroup_ctr; /* counts row groups output to postprocessor */ + boolean buffer_full; /* Have we gotten an iMCU row from decoder? */ + JDIMENSION rowgroup_ctr; /* counts row groups output to postprocessor */ /* Remaining fields are only used in the context case. */ /* These are the master pointers to the funny-order pointer lists. 
*/ - JSAMPIMAGE xbuffer[2]; /* pointers to weird pointer lists */ + JSAMPIMAGE xbuffer[2]; /* pointers to weird pointer lists */ - int whichptr; /* indicates which pointer set is now in use */ - int context_state; /* process_data state machine status */ - JDIMENSION rowgroups_avail; /* row groups available to postprocessor */ - JDIMENSION iMCU_row_ctr; /* counts iMCU rows to detect image top/bot */ + int whichptr; /* indicates which pointer set is now in use */ + int context_state; /* process_data state machine status */ + JDIMENSION rowgroups_avail; /* row groups available to postprocessor */ + JDIMENSION iMCU_row_ctr; /* counts iMCU rows to detect image top/bot */ } my_main_controller; typedef my_main_controller * my_main_ptr; /* context_state values: */ -#define CTX_PREPARE_FOR_IMCU 0 /* need to prepare for MCU row */ -#define CTX_PROCESS_IMCU 1 /* feeding iMCU to postprocessor */ -#define CTX_POSTPONED_ROW 2 /* feeding postponed row group */ +#define CTX_PREPARE_FOR_IMCU 0 /* need to prepare for MCU row */ +#define CTX_PROCESS_IMCU 1 /* feeding iMCU to postprocessor */ +#define CTX_POSTPONED_ROW 2 /* feeding postponed row group */ /* Forward declarations */ METHODDEF(void) process_data_simple_main - JPP((j_decompress_ptr cinfo, JSAMPARRAY output_buf, - JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)); + JPP((j_decompress_ptr cinfo, JSAMPARRAY output_buf, + JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)); METHODDEF(void) process_data_context_main - JPP((j_decompress_ptr cinfo, JSAMPARRAY output_buf, - JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)); + JPP((j_decompress_ptr cinfo, JSAMPARRAY output_buf, + JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)); #ifdef QUANT_2PASS_SUPPORTED METHODDEF(void) process_data_crank_post - JPP((j_decompress_ptr cinfo, JSAMPARRAY output_buf, - JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)); + JPP((j_decompress_ptr cinfo, JSAMPARRAY output_buf, + JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)); #endif 
@@ -173,7 +173,7 @@ alloc_funny_pointers (j_decompress_ptr cinfo) */ main_ptr->xbuffer[0] = (JSAMPIMAGE) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - cinfo->num_components * 2 * SIZEOF(JSAMPARRAY)); + cinfo->num_components * 2 * SIZEOF(JSAMPARRAY)); main_ptr->xbuffer[1] = main_ptr->xbuffer[0] + cinfo->num_components; for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; @@ -185,8 +185,8 @@ alloc_funny_pointers (j_decompress_ptr cinfo) */ xbuf = (JSAMPARRAY) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - 2 * (rgroup * (M + 4)) * SIZEOF(JSAMPROW)); - xbuf += rgroup; /* want one row group at negative offsets */ + 2 * (rgroup * (M + 4)) * SIZEOF(JSAMPROW)); + xbuf += rgroup; /* want one row group at negative offsets */ main_ptr->xbuffer[0][ci] = xbuf; xbuf += rgroup * (M + 4); main_ptr->xbuffer[1][ci] = xbuf; @@ -316,14 +316,14 @@ start_pass_main (j_decompress_ptr cinfo, J_BUF_MODE pass_mode) if (cinfo->upsample->need_context_rows) { main_ptr->pub.process_data = process_data_context_main; make_funny_pointers(cinfo); /* Create the xbuffer[] lists */ - main_ptr->whichptr = 0; /* Read first iMCU row into xbuffer[0] */ + main_ptr->whichptr = 0; /* Read first iMCU row into xbuffer[0] */ main_ptr->context_state = CTX_PREPARE_FOR_IMCU; main_ptr->iMCU_row_ctr = 0; } else { /* Simple case with no context needed */ main_ptr->pub.process_data = process_data_simple_main; } - main_ptr->buffer_full = FALSE; /* Mark buffer empty */ + main_ptr->buffer_full = FALSE; /* Mark buffer empty */ main_ptr->rowgroup_ctr = 0; break; #ifdef QUANT_2PASS_SUPPORTED @@ -346,8 +346,8 @@ start_pass_main (j_decompress_ptr cinfo, J_BUF_MODE pass_mode) METHODDEF(void) process_data_simple_main (j_decompress_ptr cinfo, - JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, - JDIMENSION out_rows_avail) + JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, + JDIMENSION out_rows_avail) { my_main_ptr main_ptr = (my_main_ptr) cinfo->main; JDIMENSION rowgroups_avail; 
@@ -355,8 +355,8 @@ process_data_simple_main (j_decompress_ptr cinfo, /* Read input data if we haven't filled the main buffer yet */ if (! main_ptr->buffer_full) { if (! (*cinfo->coef->decompress_data) (cinfo, main_ptr->buffer)) - return; /* suspension forced, can do nothing more */ - main_ptr->buffer_full = TRUE; /* OK, we have an iMCU row to work with */ + return; /* suspension forced, can do nothing more */ + main_ptr->buffer_full = TRUE; /* OK, we have an iMCU row to work with */ } /* There are always min_DCT_scaled_size row groups in an iMCU row. */ @@ -368,8 +368,8 @@ process_data_simple_main (j_decompress_ptr cinfo, /* Feed the postprocessor */ (*cinfo->post->post_process_data) (cinfo, main_ptr->buffer, - &main_ptr->rowgroup_ctr, rowgroups_avail, - output_buf, out_row_ctr, out_rows_avail); + &main_ptr->rowgroup_ctr, rowgroups_avail, + output_buf, out_row_ctr, out_rows_avail); /* Has postprocessor consumed all the data yet? If so, mark buffer empty */ if (main_ptr->rowgroup_ctr >= rowgroups_avail) { @@ -386,18 +386,18 @@ process_data_simple_main (j_decompress_ptr cinfo, METHODDEF(void) process_data_context_main (j_decompress_ptr cinfo, - JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, - JDIMENSION out_rows_avail) + JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, + JDIMENSION out_rows_avail) { my_main_ptr main_ptr = (my_main_ptr) cinfo->main; /* Read input data if we haven't filled the main buffer yet */ if (! main_ptr->buffer_full) { if (! 
(*cinfo->coef->decompress_data) (cinfo, - main_ptr->xbuffer[main_ptr->whichptr])) - return; /* suspension forced, can do nothing more */ - main_ptr->buffer_full = TRUE; /* OK, we have an iMCU row to work with */ - main_ptr->iMCU_row_ctr++; /* count rows received */ + main_ptr->xbuffer[main_ptr->whichptr])) + return; /* suspension forced, can do nothing more */ + main_ptr->buffer_full = TRUE; /* OK, we have an iMCU row to work with */ + main_ptr->iMCU_row_ctr++; /* count rows received */ } /* Postprocessor typically will not swallow all the input data it is handed @@ -409,13 +409,13 @@ process_data_context_main (j_decompress_ptr cinfo, case CTX_POSTPONED_ROW: /* Call postprocessor using previously set pointers for postponed row */ (*cinfo->post->post_process_data) (cinfo, main_ptr->xbuffer[main_ptr->whichptr], - &main_ptr->rowgroup_ctr, main_ptr->rowgroups_avail, - output_buf, out_row_ctr, out_rows_avail); + &main_ptr->rowgroup_ctr, main_ptr->rowgroups_avail, + output_buf, out_row_ctr, out_rows_avail); if (main_ptr->rowgroup_ctr < main_ptr->rowgroups_avail) - return; /* Need to suspend */ + return; /* Need to suspend */ main_ptr->context_state = CTX_PREPARE_FOR_IMCU; if (*out_row_ctr >= out_rows_avail) - return; /* Postprocessor exactly filled output buf */ + return; /* Postprocessor exactly filled output buf */ /*FALLTHROUGH*/ case CTX_PREPARE_FOR_IMCU: /* Prepare to process first M-1 row groups of this iMCU row */ @@ -431,15 +431,15 @@ process_data_context_main (j_decompress_ptr cinfo, case CTX_PROCESS_IMCU: /* Call postprocessor using previously set pointers */ (*cinfo->post->post_process_data) (cinfo, main_ptr->xbuffer[main_ptr->whichptr], - &main_ptr->rowgroup_ctr, main_ptr->rowgroups_avail, - output_buf, out_row_ctr, out_rows_avail); + &main_ptr->rowgroup_ctr, main_ptr->rowgroups_avail, + output_buf, out_row_ctr, out_rows_avail); if (main_ptr->rowgroup_ctr < main_ptr->rowgroups_avail) - return; /* Need to suspend */ + return; /* Need to suspend */ /* After the 
first iMCU, change wraparound pointers to normal state */ if (main_ptr->iMCU_row_ctr == 1) set_wraparound_pointers(cinfo); /* Prepare to load new iMCU row using other xbuffer list */ - main_ptr->whichptr ^= 1; /* 0=>1 or 1=>0 */ + main_ptr->whichptr ^= 1; /* 0=>1 or 1=>0 */ main_ptr->buffer_full = FALSE; /* Still need to process last row group of this iMCU row, */ /* which is saved at index M+1 of the other xbuffer */ @@ -460,12 +460,12 @@ process_data_context_main (j_decompress_ptr cinfo, METHODDEF(void) process_data_crank_post (j_decompress_ptr cinfo, - JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, - JDIMENSION out_rows_avail) + JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, + JDIMENSION out_rows_avail) { (*cinfo->post->post_process_data) (cinfo, (JSAMPIMAGE) NULL, - (JDIMENSION *) NULL, (JDIMENSION) 0, - output_buf, out_row_ctr, out_rows_avail); + (JDIMENSION *) NULL, (JDIMENSION) 0, + output_buf, out_row_ctr, out_rows_avail); } #endif /* QUANT_2PASS_SUPPORTED */ @@ -484,11 +484,11 @@ jinit_d_main_controller (j_decompress_ptr cinfo, boolean need_full_buffer) main_ptr = (my_main_ptr) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(my_main_controller)); + SIZEOF(my_main_controller)); cinfo->main = (struct jpeg_d_main_controller *) main_ptr; main_ptr->pub.start_pass = start_pass_main; - if (need_full_buffer) /* shouldn't happen */ + if (need_full_buffer) /* shouldn't happen */ ERREXIT(cinfo, JERR_BAD_BUFFER_MODE); /* Allocate the workspace. 
@@ -508,8 +508,8 @@ jinit_d_main_controller (j_decompress_ptr cinfo, boolean need_full_buffer) rgroup = (compptr->v_samp_factor * compptr->_DCT_scaled_size) / cinfo->_min_DCT_scaled_size; /* height of a row group of component */ main_ptr->buffer[ci] = (*cinfo->mem->alloc_sarray) - ((j_common_ptr) cinfo, JPOOL_IMAGE, - compptr->width_in_blocks * compptr->_DCT_scaled_size, - (JDIMENSION) (rgroup * ngroups)); + ((j_common_ptr) cinfo, JPOOL_IMAGE, + compptr->width_in_blocks * compptr->_DCT_scaled_size, + (JDIMENSION) (rgroup * ngroups)); } } diff --git a/jdmarker.c b/jdmarker.c index c8771bc0a..d996f6b3d 100644 --- a/jdmarker.c +++ b/jdmarker.c @@ -19,29 +19,29 @@ #include "jpeglib.h" -typedef enum { /* JPEG marker codes */ +typedef enum { /* JPEG marker codes */ M_SOF0 = 0xc0, M_SOF1 = 0xc1, M_SOF2 = 0xc2, M_SOF3 = 0xc3, - + M_SOF5 = 0xc5, M_SOF6 = 0xc6, M_SOF7 = 0xc7, - + M_JPG = 0xc8, M_SOF9 = 0xc9, M_SOF10 = 0xca, M_SOF11 = 0xcb, - + M_SOF13 = 0xcd, M_SOF14 = 0xce, M_SOF15 = 0xcf, - + M_DHT = 0xc4, - + M_DAC = 0xcc, - + M_RST0 = 0xd0, M_RST1 = 0xd1, M_RST2 = 0xd2, @@ -50,7 +50,7 @@ typedef enum { /* JPEG marker codes */ M_RST5 = 0xd5, M_RST6 = 0xd6, M_RST7 = 0xd7, - + M_SOI = 0xd8, M_EOI = 0xd9, M_SOS = 0xda, @@ -59,7 +59,7 @@ typedef enum { /* JPEG marker codes */ M_DRI = 0xdd, M_DHP = 0xde, M_EXP = 0xdf, - + M_APP0 = 0xe0, M_APP1 = 0xe1, M_APP2 = 0xe2, @@ -76,13 +76,13 @@ typedef enum { /* JPEG marker codes */ M_APP13 = 0xed, M_APP14 = 0xee, M_APP15 = 0xef, - + M_JPG0 = 0xf0, M_JPG13 = 0xfd, M_COM = 0xfe, - + M_TEM = 0x01, - + M_ERROR = 0x100 } JPEG_MARKER; @@ -101,8 +101,8 @@ typedef struct { unsigned int length_limit_APPn[16]; /* Status of COM/APPn marker saving */ - jpeg_saved_marker_ptr cur_marker; /* NULL if not processing a marker */ - unsigned int bytes_read; /* data bytes read so far in marker */ + jpeg_saved_marker_ptr cur_marker; /* NULL if not processing a marker */ + unsigned int bytes_read; /* data bytes read so far in marker */ /* Note: cur_marker 
is not linked into marker_list until it's all read. */ } my_marker_reader; @@ -119,49 +119,49 @@ typedef my_marker_reader * my_marker_ptr; /* Declare and initialize local copies of input pointer/count */ #define INPUT_VARS(cinfo) \ - struct jpeg_source_mgr * datasrc = (cinfo)->src; \ - const JOCTET * next_input_byte = datasrc->next_input_byte; \ - size_t bytes_in_buffer = datasrc->bytes_in_buffer + struct jpeg_source_mgr * datasrc = (cinfo)->src; \ + const JOCTET * next_input_byte = datasrc->next_input_byte; \ + size_t bytes_in_buffer = datasrc->bytes_in_buffer /* Unload the local copies --- do this only at a restart boundary */ #define INPUT_SYNC(cinfo) \ - ( datasrc->next_input_byte = next_input_byte, \ - datasrc->bytes_in_buffer = bytes_in_buffer ) + ( datasrc->next_input_byte = next_input_byte, \ + datasrc->bytes_in_buffer = bytes_in_buffer ) /* Reload the local copies --- used only in MAKE_BYTE_AVAIL */ #define INPUT_RELOAD(cinfo) \ - ( next_input_byte = datasrc->next_input_byte, \ - bytes_in_buffer = datasrc->bytes_in_buffer ) + ( next_input_byte = datasrc->next_input_byte, \ + bytes_in_buffer = datasrc->bytes_in_buffer ) /* Internal macro for INPUT_BYTE and INPUT_2BYTES: make a byte available. * Note we do *not* do INPUT_SYNC before calling fill_input_buffer, * but we must reload the local copies after a successful fill. */ #define MAKE_BYTE_AVAIL(cinfo,action) \ - if (bytes_in_buffer == 0) { \ - if (! (*datasrc->fill_input_buffer) (cinfo)) \ - { action; } \ - INPUT_RELOAD(cinfo); \ - } + if (bytes_in_buffer == 0) { \ + if (! (*datasrc->fill_input_buffer) (cinfo)) \ + { action; } \ + INPUT_RELOAD(cinfo); \ + } /* Read a byte into variable V. * If must suspend, take the specified action (typically "return FALSE"). 
*/ #define INPUT_BYTE(cinfo,V,action) \ - MAKESTMT( MAKE_BYTE_AVAIL(cinfo,action); \ - bytes_in_buffer--; \ - V = GETJOCTET(*next_input_byte++); ) + MAKESTMT( MAKE_BYTE_AVAIL(cinfo,action); \ + bytes_in_buffer--; \ + V = GETJOCTET(*next_input_byte++); ) /* As above, but read two bytes interpreted as an unsigned 16-bit integer. * V should be declared unsigned int or perhaps INT32. */ #define INPUT_2BYTES(cinfo,V,action) \ - MAKESTMT( MAKE_BYTE_AVAIL(cinfo,action); \ - bytes_in_buffer--; \ - V = ((unsigned int) GETJOCTET(*next_input_byte++)) << 8; \ - MAKE_BYTE_AVAIL(cinfo,action); \ - bytes_in_buffer--; \ - V += GETJOCTET(*next_input_byte++); ) + MAKESTMT( MAKE_BYTE_AVAIL(cinfo,action); \ + bytes_in_buffer--; \ + V = ((unsigned int) GETJOCTET(*next_input_byte++)) << 8; \ + MAKE_BYTE_AVAIL(cinfo,action); \ + bytes_in_buffer--; \ + V += GETJOCTET(*next_input_byte++); ) /* @@ -200,7 +200,7 @@ get_soi (j_decompress_ptr cinfo) /* Process an SOI marker */ { int i; - + TRACEMS(cinfo, 1, JTRC_SOI); if (cinfo->marker->saw_SOI) @@ -257,8 +257,8 @@ get_sof (j_decompress_ptr cinfo, boolean is_prog, boolean is_arith) length -= 8; TRACEMS4(cinfo, 1, JTRC_SOF, cinfo->unread_marker, - (int) cinfo->image_width, (int) cinfo->image_height, - cinfo->num_components); + (int) cinfo->image_width, (int) cinfo->image_height, + cinfo->num_components); if (cinfo->marker->saw_SOF) ERREXIT(cinfo, JERR_SOF_DUPLICATE); @@ -273,11 +273,11 @@ get_sof (j_decompress_ptr cinfo, boolean is_prog, boolean is_arith) if (length != (cinfo->num_components * 3)) ERREXIT(cinfo, JERR_BAD_LENGTH); - if (cinfo->comp_info == NULL) /* do only once, even if suspend */ + if (cinfo->comp_info == NULL) /* do only once, even if suspend */ cinfo->comp_info = (jpeg_component_info *) (*cinfo->mem->alloc_small) - ((j_common_ptr) cinfo, JPOOL_IMAGE, - cinfo->num_components * SIZEOF(jpeg_component_info)); - + ((j_common_ptr) cinfo, JPOOL_IMAGE, + cinfo->num_components * SIZEOF(jpeg_component_info)); + for (ci = 0, compptr = 
cinfo->comp_info; ci < cinfo->num_components; ci++, compptr++) { compptr->component_index = ci; @@ -288,8 +288,8 @@ get_sof (j_decompress_ptr cinfo, boolean is_prog, boolean is_arith) INPUT_BYTE(cinfo, compptr->quant_tbl_no, return FALSE); TRACEMS4(cinfo, 1, JTRC_SOF_COMPONENT, - compptr->component_id, compptr->h_samp_factor, - compptr->v_samp_factor, compptr->quant_tbl_no); + compptr->component_id, compptr->h_samp_factor, + compptr->v_samp_factor, compptr->quant_tbl_no); } cinfo->marker->saw_SOF = TRUE; @@ -330,12 +330,12 @@ get_sos (j_decompress_ptr cinfo) for (i = 0; i < n; i++) { INPUT_BYTE(cinfo, cc, return FALSE); INPUT_BYTE(cinfo, c, return FALSE); - + for (ci = 0, compptr = cinfo->comp_info; - ci < cinfo->num_components && ci < MAX_COMPS_IN_SCAN; - ci++, compptr++) { + ci < cinfo->num_components && ci < MAX_COMPS_IN_SCAN; + ci++, compptr++) { if (cc == compptr->component_id && !cinfo->cur_comp_info[ci]) - goto id_found; + goto id_found; } ERREXIT1(cinfo, JERR_BAD_COMPONENT_ID, cc); @@ -345,9 +345,9 @@ get_sos (j_decompress_ptr cinfo) cinfo->cur_comp_info[i] = compptr; compptr->dc_tbl_no = (c >> 4) & 15; compptr->ac_tbl_no = (c ) & 15; - + TRACEMS3(cinfo, 1, JTRC_SOS_COMPONENT, cc, - compptr->dc_tbl_no, compptr->ac_tbl_no); + compptr->dc_tbl_no, compptr->ac_tbl_no); /* This CSi (cc) should differ from the previous CSi */ for (pi = 0; pi < i; pi++) { @@ -367,7 +367,7 @@ get_sos (j_decompress_ptr cinfo) cinfo->Al = (c ) & 15; TRACEMS4(cinfo, 1, JTRC_SOS_PARAMS, cinfo->Ss, cinfo->Se, - cinfo->Ah, cinfo->Al); + cinfo->Ah, cinfo->Al); /* Prepare to scan data & restart markers */ cinfo->marker->next_restart_num = 0; @@ -392,7 +392,7 @@ get_dac (j_decompress_ptr cinfo) INPUT_2BYTES(cinfo, length, return FALSE); length -= 2; - + while (length > 0) { INPUT_BYTE(cinfo, index, return FALSE); INPUT_BYTE(cinfo, val, return FALSE); @@ -406,11 +406,11 @@ get_dac (j_decompress_ptr cinfo) if (index >= NUM_ARITH_TBLS) { /* define AC table */ 
cinfo->arith_ac_K[index-NUM_ARITH_TBLS] = (UINT8) val; - } else { /* define DC table */ + } else { /* define DC table */ cinfo->arith_dc_L[index] = (UINT8) (val & 0x0F); cinfo->arith_dc_U[index] = (UINT8) (val >> 4); if (cinfo->arith_dc_L[index] > cinfo->arith_dc_U[index]) - ERREXIT1(cinfo, JERR_DAC_VALUE, val); + ERREXIT1(cinfo, JERR_DAC_VALUE, val); } } @@ -441,12 +441,12 @@ get_dht (j_decompress_ptr cinfo) INPUT_2BYTES(cinfo, length, return FALSE); length -= 2; - + while (length > 16) { INPUT_BYTE(cinfo, index, return FALSE); TRACEMS1(cinfo, 1, JTRC_DHT, index); - + bits[0] = 0; count = 0; for (i = 1; i <= 16; i++) { @@ -457,11 +457,11 @@ get_dht (j_decompress_ptr cinfo) length -= 1 + 16; TRACEMS8(cinfo, 2, JTRC_HUFFBITS, - bits[1], bits[2], bits[3], bits[4], - bits[5], bits[6], bits[7], bits[8]); + bits[1], bits[2], bits[3], bits[4], + bits[5], bits[6], bits[7], bits[8]); TRACEMS8(cinfo, 2, JTRC_HUFFBITS, - bits[9], bits[10], bits[11], bits[12], - bits[13], bits[14], bits[15], bits[16]); + bits[9], bits[10], bits[11], bits[12], + bits[13], bits[14], bits[15], bits[16]); /* Here we just do minimal validation of the counts to avoid walking * off the end of our table space. jdhuff.c will check more carefully. 
@@ -476,12 +476,12 @@ get_dht (j_decompress_ptr cinfo) length -= count; - if (index & 0x10) { /* AC table definition */ + if (index & 0x10) { /* AC table definition */ index -= 0x10; if (index < 0 || index >= NUM_HUFF_TBLS) ERREXIT1(cinfo, JERR_DHT_INDEX, index); htblptr = &cinfo->ac_huff_tbl_ptrs[index]; - } else { /* DC table definition */ + } else { /* DC table definition */ if (index < 0 || index >= NUM_HUFF_TBLS) ERREXIT1(cinfo, JERR_DHT_INDEX, index); htblptr = &cinfo->dc_huff_tbl_ptrs[index]; @@ -489,7 +489,7 @@ get_dht (j_decompress_ptr cinfo) if (*htblptr == NULL) *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo); - + MEMCOPY((*htblptr)->bits, bits, SIZEOF((*htblptr)->bits)); MEMCOPY((*htblptr)->huffval, huffval, SIZEOF((*htblptr)->huffval)); } @@ -524,27 +524,27 @@ get_dqt (j_decompress_ptr cinfo) if (n >= NUM_QUANT_TBLS) ERREXIT1(cinfo, JERR_DQT_INDEX, n); - + if (cinfo->quant_tbl_ptrs[n] == NULL) cinfo->quant_tbl_ptrs[n] = jpeg_alloc_quant_table((j_common_ptr) cinfo); quant_ptr = cinfo->quant_tbl_ptrs[n]; for (i = 0; i < DCTSIZE2; i++) { if (prec) - INPUT_2BYTES(cinfo, tmp, return FALSE); + INPUT_2BYTES(cinfo, tmp, return FALSE); else - INPUT_BYTE(cinfo, tmp, return FALSE); + INPUT_BYTE(cinfo, tmp, return FALSE); /* We convert the zigzag-order table to natural array order. 
*/ quant_ptr->quantval[jpeg_natural_order[i]] = (UINT16) tmp; } if (cinfo->err->trace_level >= 2) { for (i = 0; i < DCTSIZE2; i += 8) { - TRACEMS8(cinfo, 2, JTRC_QUANTVALS, - quant_ptr->quantval[i], quant_ptr->quantval[i+1], - quant_ptr->quantval[i+2], quant_ptr->quantval[i+3], - quant_ptr->quantval[i+4], quant_ptr->quantval[i+5], - quant_ptr->quantval[i+6], quant_ptr->quantval[i+7]); + TRACEMS8(cinfo, 2, JTRC_QUANTVALS, + quant_ptr->quantval[i], quant_ptr->quantval[i+1], + quant_ptr->quantval[i+2], quant_ptr->quantval[i+3], + quant_ptr->quantval[i+4], quant_ptr->quantval[i+5], + quant_ptr->quantval[i+6], quant_ptr->quantval[i+7]); } } @@ -569,7 +569,7 @@ get_dri (j_decompress_ptr cinfo) INPUT_VARS(cinfo); INPUT_2BYTES(cinfo, length, return FALSE); - + if (length != 4) ERREXIT(cinfo, JERR_BAD_LENGTH); @@ -591,14 +591,14 @@ get_dri (j_decompress_ptr cinfo) * JFIF and Adobe markers, respectively. */ -#define APP0_DATA_LEN 14 /* Length of interesting data in APP0 */ -#define APP14_DATA_LEN 12 /* Length of interesting data in APP14 */ -#define APPN_DATA_LEN 14 /* Must be the largest of the above!! */ +#define APP0_DATA_LEN 14 /* Length of interesting data in APP0 */ +#define APP14_DATA_LEN 12 /* Length of interesting data in APP14 */ +#define APPN_DATA_LEN 14 /* Must be the largest of the above!! */ LOCAL(void) examine_app0 (j_decompress_ptr cinfo, JOCTET FAR * data, - unsigned int datalen, INT32 remaining) + unsigned int datalen, INT32 remaining) /* Examine first few bytes from an APP0. * Take appropriate action if it is a JFIF marker. * datalen is # of bytes at data[], remaining is length of rest of marker data. 
@@ -627,18 +627,18 @@ examine_app0 (j_decompress_ptr cinfo, JOCTET FAR * data, */ if (cinfo->JFIF_major_version != 1) WARNMS2(cinfo, JWRN_JFIF_MAJOR, - cinfo->JFIF_major_version, cinfo->JFIF_minor_version); + cinfo->JFIF_major_version, cinfo->JFIF_minor_version); /* Generate trace messages */ TRACEMS5(cinfo, 1, JTRC_JFIF, - cinfo->JFIF_major_version, cinfo->JFIF_minor_version, - cinfo->X_density, cinfo->Y_density, cinfo->density_unit); + cinfo->JFIF_major_version, cinfo->JFIF_minor_version, + cinfo->X_density, cinfo->Y_density, cinfo->density_unit); /* Validate thumbnail dimensions and issue appropriate messages */ if (GETJOCTET(data[12]) | GETJOCTET(data[13])) TRACEMS2(cinfo, 1, JTRC_JFIF_THUMBNAIL, - GETJOCTET(data[12]), GETJOCTET(data[13])); + GETJOCTET(data[12]), GETJOCTET(data[13])); totallen -= APP0_DATA_LEN; if (totallen != - ((INT32)GETJOCTET(data[12]) * (INT32)GETJOCTET(data[13]) * (INT32) 3)) + ((INT32)GETJOCTET(data[12]) * (INT32)GETJOCTET(data[13]) * (INT32) 3)) TRACEMS1(cinfo, 1, JTRC_JFIF_BADTHUMBNAILSIZE, (int) totallen); } else if (datalen >= 6 && GETJOCTET(data[0]) == 0x4A && @@ -662,7 +662,7 @@ examine_app0 (j_decompress_ptr cinfo, JOCTET FAR * data, break; default: TRACEMS2(cinfo, 1, JTRC_JFIF_EXTENSION, - GETJOCTET(data[5]), (int) totallen); + GETJOCTET(data[5]), (int) totallen); break; } } else { @@ -674,7 +674,7 @@ examine_app0 (j_decompress_ptr cinfo, JOCTET FAR * data, LOCAL(void) examine_app14 (j_decompress_ptr cinfo, JOCTET FAR * data, - unsigned int datalen, INT32 remaining) + unsigned int datalen, INT32 remaining) /* Examine first few bytes from an APP14. * Take appropriate action if it is an Adobe marker. * datalen is # of bytes at data[], remaining is length of rest of marker data. 
@@ -766,19 +766,19 @@ save_marker (j_decompress_ptr cinfo) /* begin reading a marker */ INPUT_2BYTES(cinfo, length, return FALSE); length -= 2; - if (length >= 0) { /* watch out for bogus length word */ + if (length >= 0) { /* watch out for bogus length word */ /* figure out how much we want to save */ unsigned int limit; if (cinfo->unread_marker == (int) M_COM) - limit = marker->length_limit_COM; + limit = marker->length_limit_COM; else - limit = marker->length_limit_APPn[cinfo->unread_marker - (int) M_APP0]; + limit = marker->length_limit_APPn[cinfo->unread_marker - (int) M_APP0]; if ((unsigned int) length < limit) - limit = (unsigned int) length; + limit = (unsigned int) length; /* allocate and initialize the marker item */ cur_marker = (jpeg_saved_marker_ptr) - (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(struct jpeg_marker_struct) + limit); + (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE, + SIZEOF(struct jpeg_marker_struct) + limit); cur_marker->next = NULL; cur_marker->marker = (UINT8) cinfo->unread_marker; cur_marker->original_length = (unsigned int) length; @@ -802,7 +802,7 @@ save_marker (j_decompress_ptr cinfo) } while (bytes_read < data_length) { - INPUT_SYNC(cinfo); /* move the restart point to here */ + INPUT_SYNC(cinfo); /* move the restart point to here */ marker->bytes_read = bytes_read; /* If there's not at least one byte in buffer, suspend */ MAKE_BYTE_AVAIL(cinfo, return FALSE); @@ -815,14 +815,14 @@ save_marker (j_decompress_ptr cinfo) } /* Done reading what we want to read */ - if (cur_marker != NULL) { /* will be NULL if bogus length word */ + if (cur_marker != NULL) { /* will be NULL if bogus length word */ /* Add new marker to end of list */ if (cinfo->marker_list == NULL) { cinfo->marker_list = cur_marker; } else { jpeg_saved_marker_ptr prev = cinfo->marker_list; while (prev->next != NULL) - prev = prev->next; + prev = prev->next; prev->next = cur_marker; } /* Reset pointer & calc remaining data 
length */ @@ -842,12 +842,12 @@ save_marker (j_decompress_ptr cinfo) break; default: TRACEMS2(cinfo, 1, JTRC_MISC_MARKER, cinfo->unread_marker, - (int) (data_length + length)); + (int) (data_length + length)); break; } /* skip any remaining data -- could be lots */ - INPUT_SYNC(cinfo); /* do before skip_input_data */ + INPUT_SYNC(cinfo); /* do before skip_input_data */ if (length > 0) (*cinfo->src->skip_input_data) (cinfo, (long) length); @@ -866,10 +866,10 @@ skip_variable (j_decompress_ptr cinfo) INPUT_2BYTES(cinfo, length, return FALSE); length -= 2; - + TRACEMS2(cinfo, 1, JTRC_MISC_MARKER, cinfo->unread_marker, (int) length); - INPUT_SYNC(cinfo); /* do before skip_input_data */ + INPUT_SYNC(cinfo); /* do before skip_input_data */ if (length > 0) (*cinfo->src->skip_input_data) (cinfo, (long) length); @@ -913,7 +913,7 @@ next_marker (j_decompress_ptr cinfo) INPUT_BYTE(cinfo, c, return FALSE); } while (c == 0xFF); if (c != 0) - break; /* found a valid marker, exit loop */ + break; /* found a valid marker, exit loop */ /* Reach here if we found a stuffed-zero data sequence (FF/00). * Discard it and loop back to try again. */ @@ -973,11 +973,11 @@ read_markers (j_decompress_ptr cinfo) /* NB: first_marker() enforces the requirement that SOI appear first. */ if (cinfo->unread_marker == 0) { if (! cinfo->marker->saw_SOI) { - if (! first_marker(cinfo)) - return JPEG_SUSPENDED; + if (! first_marker(cinfo)) + return JPEG_SUSPENDED; } else { - if (! next_marker(cinfo)) - return JPEG_SUSPENDED; + if (! next_marker(cinfo)) + return JPEG_SUSPENDED; } } /* At this point cinfo->unread_marker contains the marker code and the @@ -987,74 +987,74 @@ read_markers (j_decompress_ptr cinfo) switch (cinfo->unread_marker) { case M_SOI: if (! get_soi(cinfo)) - return JPEG_SUSPENDED; + return JPEG_SUSPENDED; break; - case M_SOF0: /* Baseline */ - case M_SOF1: /* Extended sequential, Huffman */ + case M_SOF0: /* Baseline */ + case M_SOF1: /* Extended sequential, Huffman */ if (! 
get_sof(cinfo, FALSE, FALSE)) - return JPEG_SUSPENDED; + return JPEG_SUSPENDED; break; - case M_SOF2: /* Progressive, Huffman */ + case M_SOF2: /* Progressive, Huffman */ if (! get_sof(cinfo, TRUE, FALSE)) - return JPEG_SUSPENDED; + return JPEG_SUSPENDED; break; - case M_SOF9: /* Extended sequential, arithmetic */ + case M_SOF9: /* Extended sequential, arithmetic */ if (! get_sof(cinfo, FALSE, TRUE)) - return JPEG_SUSPENDED; + return JPEG_SUSPENDED; break; - case M_SOF10: /* Progressive, arithmetic */ + case M_SOF10: /* Progressive, arithmetic */ if (! get_sof(cinfo, TRUE, TRUE)) - return JPEG_SUSPENDED; + return JPEG_SUSPENDED; break; /* Currently unsupported SOFn types */ - case M_SOF3: /* Lossless, Huffman */ - case M_SOF5: /* Differential sequential, Huffman */ - case M_SOF6: /* Differential progressive, Huffman */ - case M_SOF7: /* Differential lossless, Huffman */ - case M_JPG: /* Reserved for JPEG extensions */ - case M_SOF11: /* Lossless, arithmetic */ - case M_SOF13: /* Differential sequential, arithmetic */ - case M_SOF14: /* Differential progressive, arithmetic */ - case M_SOF15: /* Differential lossless, arithmetic */ + case M_SOF3: /* Lossless, Huffman */ + case M_SOF5: /* Differential sequential, Huffman */ + case M_SOF6: /* Differential progressive, Huffman */ + case M_SOF7: /* Differential lossless, Huffman */ + case M_JPG: /* Reserved for JPEG extensions */ + case M_SOF11: /* Lossless, arithmetic */ + case M_SOF13: /* Differential sequential, arithmetic */ + case M_SOF14: /* Differential progressive, arithmetic */ + case M_SOF15: /* Differential lossless, arithmetic */ ERREXIT1(cinfo, JERR_SOF_UNSUPPORTED, cinfo->unread_marker); break; case M_SOS: if (! 
get_sos(cinfo)) - return JPEG_SUSPENDED; - cinfo->unread_marker = 0; /* processed the marker */ + return JPEG_SUSPENDED; + cinfo->unread_marker = 0; /* processed the marker */ return JPEG_REACHED_SOS; - + case M_EOI: TRACEMS(cinfo, 1, JTRC_EOI); - cinfo->unread_marker = 0; /* processed the marker */ + cinfo->unread_marker = 0; /* processed the marker */ return JPEG_REACHED_EOI; - + case M_DAC: if (! get_dac(cinfo)) - return JPEG_SUSPENDED; + return JPEG_SUSPENDED; break; - + case M_DHT: if (! get_dht(cinfo)) - return JPEG_SUSPENDED; + return JPEG_SUSPENDED; break; - + case M_DQT: if (! get_dqt(cinfo)) - return JPEG_SUSPENDED; + return JPEG_SUSPENDED; break; - + case M_DRI: if (! get_dri(cinfo)) - return JPEG_SUSPENDED; + return JPEG_SUSPENDED; break; - + case M_APP0: case M_APP1: case M_APP2: @@ -1072,16 +1072,16 @@ read_markers (j_decompress_ptr cinfo) case M_APP14: case M_APP15: if (! (*((my_marker_ptr) cinfo->marker)->process_APPn[ - cinfo->unread_marker - (int) M_APP0]) (cinfo)) - return JPEG_SUSPENDED; + cinfo->unread_marker - (int) M_APP0]) (cinfo)) + return JPEG_SUSPENDED; break; - + case M_COM: if (! (*((my_marker_ptr) cinfo->marker)->process_COM) (cinfo)) - return JPEG_SUSPENDED; + return JPEG_SUSPENDED; break; - case M_RST0: /* these are all parameterless */ + case M_RST0: /* these are all parameterless */ case M_RST1: case M_RST2: case M_RST3: @@ -1093,12 +1093,12 @@ read_markers (j_decompress_ptr cinfo) TRACEMS1(cinfo, 1, JTRC_PARMLESS_MARKER, cinfo->unread_marker); break; - case M_DNL: /* Ignore DNL ... perhaps the wrong thing */ + case M_DNL: /* Ignore DNL ... perhaps the wrong thing */ if (! skip_variable(cinfo)) - return JPEG_SUSPENDED; + return JPEG_SUSPENDED; break; - default: /* must be DHP, EXP, JPGn, or RESn */ + default: /* must be DHP, EXP, JPGn, or RESn */ /* For now, we treat the reserved markers as fatal errors since they are * likely to be used to signal incompatible JPEG Part 3 extensions. 
* Once the JPEG 3 version-number marker is well defined, this code @@ -1144,7 +1144,7 @@ read_restart_marker (j_decompress_ptr cinfo) /* Uh-oh, the restart markers have been messed up. */ /* Let the data source manager determine how to resync. */ if (! (*cinfo->src->resync_to_restart) (cinfo, - cinfo->marker->next_restart_num)) + cinfo->marker->next_restart_num)) return FALSE; } @@ -1209,25 +1209,25 @@ jpeg_resync_to_restart (j_decompress_ptr cinfo, int desired) { int marker = cinfo->unread_marker; int action = 1; - + /* Always put up a warning. */ WARNMS2(cinfo, JWRN_MUST_RESYNC, marker, desired); - + /* Outer loop handles repeated decision after scanning forward. */ for (;;) { if (marker < (int) M_SOF0) - action = 2; /* invalid marker */ + action = 2; /* invalid marker */ else if (marker < (int) M_RST0 || marker > (int) M_RST7) - action = 3; /* valid non-restart marker */ + action = 3; /* valid non-restart marker */ else { if (marker == ((int) M_RST0 + ((desired+1) & 7)) || - marker == ((int) M_RST0 + ((desired+2) & 7))) - action = 3; /* one of the next two expected restarts */ + marker == ((int) M_RST0 + ((desired+2) & 7))) + action = 3; /* one of the next two expected restarts */ else if (marker == ((int) M_RST0 + ((desired-1) & 7)) || - marker == ((int) M_RST0 + ((desired-2) & 7))) - action = 2; /* a prior restart, so advance */ + marker == ((int) M_RST0 + ((desired-2) & 7))) + action = 2; /* a prior restart, so advance */ else - action = 1; /* desired restart or too far away */ + action = 1; /* desired restart or too far away */ } TRACEMS2(cinfo, 4, JTRC_RECOVERY_ACTION, marker, action); switch (action) { @@ -1238,7 +1238,7 @@ jpeg_resync_to_restart (j_decompress_ptr cinfo, int desired) case 2: /* Scan to the next marker, and repeat the decision loop. */ if (! 
next_marker(cinfo)) - return FALSE; + return FALSE; marker = cinfo->unread_marker; break; case 3: @@ -1259,10 +1259,10 @@ reset_marker_reader (j_decompress_ptr cinfo) { my_marker_ptr marker = (my_marker_ptr) cinfo->marker; - cinfo->comp_info = NULL; /* until allocated by get_sof */ - cinfo->input_scan_number = 0; /* no SOS seen yet */ - cinfo->unread_marker = 0; /* no pending marker */ - marker->pub.saw_SOI = FALSE; /* set internal state too */ + cinfo->comp_info = NULL; /* until allocated by get_sof */ + cinfo->input_scan_number = 0; /* no SOS seen yet */ + cinfo->unread_marker = 0; /* no pending marker */ + marker->pub.saw_SOI = FALSE; /* set internal state too */ marker->pub.saw_SOF = FALSE; marker->pub.discarded_bytes = 0; marker->cur_marker = NULL; @@ -1283,7 +1283,7 @@ jinit_marker_reader (j_decompress_ptr cinfo) /* Create subobject in permanent pool */ marker = (my_marker_ptr) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT, - SIZEOF(my_marker_reader)); + SIZEOF(my_marker_reader)); cinfo->marker = (struct jpeg_marker_reader *) marker; /* Initialize public method pointers */ marker->pub.reset_marker_reader = reset_marker_reader; @@ -1314,7 +1314,7 @@ jinit_marker_reader (j_decompress_ptr cinfo) GLOBAL(void) jpeg_save_markers (j_decompress_ptr cinfo, int marker_code, - unsigned int length_limit) + unsigned int length_limit) { my_marker_ptr marker = (my_marker_ptr) cinfo->marker; long maxlength; @@ -1363,7 +1363,7 @@ jpeg_save_markers (j_decompress_ptr cinfo, int marker_code, GLOBAL(void) jpeg_set_marker_processor (j_decompress_ptr cinfo, int marker_code, - jpeg_marker_parser_method routine) + jpeg_marker_parser_method routine) { my_marker_ptr marker = (my_marker_ptr) cinfo->marker; diff --git a/jdmaster.c b/jdmaster.c index e1f9f9e7f..b9f78fd09 100644 --- a/jdmaster.c +++ b/jdmaster.c @@ -25,7 +25,7 @@ typedef struct { struct jpeg_decomp_master pub; /* public fields */ - int pass_number; /* # of passes completed */ + int pass_number; /* # of 
passes completed */ boolean using_merged_upsample; /* TRUE if using merged upsample/cconvert */ @@ -80,7 +80,7 @@ use_merged_upsample (j_decompress_ptr cinfo) cinfo->comp_info[2]._DCT_scaled_size != cinfo->_min_DCT_scaled_size) return FALSE; /* ??? also need to test for upsample-time rescaling, when & if supported */ - return TRUE; /* by golly, it'll work... */ + return TRUE; /* by golly, it'll work... */ #else return FALSE; #endif @@ -292,10 +292,10 @@ jpeg_calc_output_dimensions (j_decompress_ptr cinfo) ci++, compptr++) { int ssize = cinfo->_min_DCT_scaled_size; while (ssize < DCTSIZE && - ((cinfo->max_h_samp_factor * cinfo->_min_DCT_scaled_size) % - (compptr->h_samp_factor * ssize * 2) == 0) && - ((cinfo->max_v_samp_factor * cinfo->_min_DCT_scaled_size) % - (compptr->v_samp_factor * ssize * 2) == 0)) { + ((cinfo->max_h_samp_factor * cinfo->_min_DCT_scaled_size) % + (compptr->h_samp_factor * ssize * 2) == 0) && + ((cinfo->max_v_samp_factor * cinfo->_min_DCT_scaled_size) % + (compptr->v_samp_factor * ssize * 2) == 0)) { ssize = ssize * 2; } #if JPEG_LIB_VERSION >= 70 @@ -313,12 +313,12 @@ jpeg_calc_output_dimensions (j_decompress_ptr cinfo) /* Size in samples, after IDCT scaling */ compptr->downsampled_width = (JDIMENSION) jdiv_round_up((long) cinfo->image_width * - (long) (compptr->h_samp_factor * compptr->_DCT_scaled_size), - (long) (cinfo->max_h_samp_factor * DCTSIZE)); + (long) (compptr->h_samp_factor * compptr->_DCT_scaled_size), + (long) (cinfo->max_h_samp_factor * DCTSIZE)); compptr->downsampled_height = (JDIMENSION) jdiv_round_up((long) cinfo->image_height * - (long) (compptr->v_samp_factor * compptr->_DCT_scaled_size), - (long) (cinfo->max_v_samp_factor * DCTSIZE)); + (long) (compptr->v_samp_factor * compptr->_DCT_scaled_size), + (long) (cinfo->max_v_samp_factor * DCTSIZE)); } #else /* !IDCT_SCALING_SUPPORTED */ @@ -358,12 +358,12 @@ jpeg_calc_output_dimensions (j_decompress_ptr cinfo) case JCS_YCCK: cinfo->out_color_components = 4; break; - default: /* 
else must be same colorspace as in file */ + default: /* else must be same colorspace as in file */ cinfo->out_color_components = cinfo->num_components; break; } cinfo->output_components = (cinfo->quantize_colors ? 1 : - cinfo->out_color_components); + cinfo->out_color_components); /* See if upsampler will want to emit more than one row at a time */ if (use_merged_upsample(cinfo)) @@ -380,20 +380,20 @@ jpeg_calc_output_dimensions (j_decompress_ptr cinfo) * processes are inner loops and need to be as fast as possible. On most * machines, particularly CPUs with pipelines or instruction prefetch, * a (subscript-check-less) C table lookup - * x = sample_range_limit[x]; + * x = sample_range_limit[x]; * is faster than explicit tests - * if (x < 0) x = 0; - * else if (x > MAXJSAMPLE) x = MAXJSAMPLE; + * if (x < 0) x = 0; + * else if (x > MAXJSAMPLE) x = MAXJSAMPLE; * These processes all use a common table prepared by the routine below. * * For most steps we can mathematically guarantee that the initial value * of x is within MAXJSAMPLE+1 of the legal range, so a table running from * -(MAXJSAMPLE+1) to 2*MAXJSAMPLE+1 is sufficient. But for the initial - * limiting step (just after the IDCT), a wildly out-of-range value is + * limiting step (just after the IDCT), a wildly out-of-range value is * possible if the input data is corrupt. To avoid any chance of indexing * off the end of memory and getting a bad-pointer trap, we perform the * post-IDCT limiting thus: - * x = range_limit[x & MASK]; + * x = range_limit[x & MASK]; * where MASK is 2 bits wider than legal sample data, ie 10 bits for 8-bit * samples. 
Under normal circumstances this is more than enough range and * a correct output will be generated; with bogus input data the mask will @@ -425,23 +425,23 @@ prepare_range_limit_table (j_decompress_ptr cinfo) table = (JSAMPLE *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - (5 * (MAXJSAMPLE+1) + CENTERJSAMPLE) * SIZEOF(JSAMPLE)); - table += (MAXJSAMPLE+1); /* allow negative subscripts of simple table */ + (5 * (MAXJSAMPLE+1) + CENTERJSAMPLE) * SIZEOF(JSAMPLE)); + table += (MAXJSAMPLE+1); /* allow negative subscripts of simple table */ cinfo->sample_range_limit = table; /* First segment of "simple" table: limit[x] = 0 for x < 0 */ MEMZERO(table - (MAXJSAMPLE+1), (MAXJSAMPLE+1) * SIZEOF(JSAMPLE)); /* Main part of "simple" table: limit[x] = x */ for (i = 0; i <= MAXJSAMPLE; i++) table[i] = (JSAMPLE) i; - table += CENTERJSAMPLE; /* Point to where post-IDCT table starts */ + table += CENTERJSAMPLE; /* Point to where post-IDCT table starts */ /* End of simple table, rest of first half of post-IDCT table */ for (i = CENTERJSAMPLE; i < 2*(MAXJSAMPLE+1); i++) table[i] = MAXJSAMPLE; /* Second half of post-IDCT table */ MEMZERO(table + (2 * (MAXJSAMPLE+1)), - (2 * (MAXJSAMPLE+1) - CENTERJSAMPLE) * SIZEOF(JSAMPLE)); + (2 * (MAXJSAMPLE+1) - CENTERJSAMPLE) * SIZEOF(JSAMPLE)); MEMCOPY(table + (4 * (MAXJSAMPLE+1) - CENTERJSAMPLE), - cinfo->sample_range_limit, CENTERJSAMPLE * SIZEOF(JSAMPLE)); + cinfo->sample_range_limit, CENTERJSAMPLE * SIZEOF(JSAMPLE)); } @@ -629,24 +629,24 @@ prepare_for_output_pass (j_decompress_ptr cinfo) if (cinfo->quantize_colors && cinfo->colormap == NULL) { /* Select new quantization method */ if (cinfo->two_pass_quantize && cinfo->enable_2pass_quant) { - cinfo->cquantize = master->quantizer_2pass; - master->pub.is_dummy_pass = TRUE; + cinfo->cquantize = master->quantizer_2pass; + master->pub.is_dummy_pass = TRUE; } else if (cinfo->enable_1pass_quant) { - cinfo->cquantize = master->quantizer_1pass; + cinfo->cquantize = 
master->quantizer_1pass; } else { - ERREXIT(cinfo, JERR_MODE_CHANGE); + ERREXIT(cinfo, JERR_MODE_CHANGE); } } (*cinfo->idct->start_pass) (cinfo); (*cinfo->coef->start_output_pass) (cinfo); if (! cinfo->raw_data_out) { if (! master->using_merged_upsample) - (*cinfo->cconvert->start_pass) (cinfo); + (*cinfo->cconvert->start_pass) (cinfo); (*cinfo->upsample->start_pass) (cinfo); if (cinfo->quantize_colors) - (*cinfo->cquantize->start_pass) (cinfo, master->pub.is_dummy_pass); + (*cinfo->cquantize->start_pass) (cinfo, master->pub.is_dummy_pass); (*cinfo->post->start_pass) (cinfo, - (master->pub.is_dummy_pass ? JBUF_SAVE_AND_PASS : JBUF_PASS_THRU)); + (master->pub.is_dummy_pass ? JBUF_SAVE_AND_PASS : JBUF_PASS_THRU)); (*cinfo->main->start_pass) (cinfo, JBUF_PASS_THRU); } } @@ -655,7 +655,7 @@ prepare_for_output_pass (j_decompress_ptr cinfo) if (cinfo->progress != NULL) { cinfo->progress->completed_passes = master->pass_number; cinfo->progress->total_passes = master->pass_number + - (master->pub.is_dummy_pass ? 2 : 1); + (master->pub.is_dummy_pass ? 2 : 1); /* In buffered-image mode, we assume one more output pass if EOI not * yet reached, but no more passes if EOI has been reached. */ @@ -722,7 +722,7 @@ jinit_master_decompress (j_decompress_ptr cinfo) master = (my_master_ptr) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(my_decomp_master)); + SIZEOF(my_decomp_master)); cinfo->master = (struct jpeg_decomp_master *) master; master->pub.prepare_for_output_pass = prepare_for_output_pass; master->pub.finish_output_pass = finish_output_pass; diff --git a/jdmerge.c b/jdmerge.c index cb1aa0a38..c669b179e 100644 --- a/jdmerge.c +++ b/jdmerge.c @@ -17,19 +17,19 @@ * (ie, box filtering), we can save some work in color conversion by * calculating all the output pixels corresponding to a pair of chroma * samples at one time. 
In the conversion equations - * R = Y + K1 * Cr - * G = Y + K2 * Cb + K3 * Cr - * B = Y + K4 * Cb + * R = Y + K1 * Cr + * G = Y + K2 * Cb + K3 * Cr + * B = Y + K4 * Cb * only the Y term varies among the group of pixels corresponding to a pair * of chroma samples, so the rest of the terms can be calculated just once. * At typical sampling ratios, this eliminates half or three-quarters of the * multiplications needed for color conversion. * * This file currently provides implementations for the following cases: - * YCbCr => RGB color conversion only. - * Sampling ratios of 2h1v or 2h2v. - * No scaling needed at upsample time. - * Corner-aligned (non-CCIR601) sampling alignment. + * YCbCr => RGB color conversion only. + * Sampling ratios of 2h1v or 2h2v. + * No scaling needed at upsample time. + * Corner-aligned (non-CCIR601) sampling alignment. * Other special cases could be added, but in most applications these are * the only common cases. (For uncommon cases we fall back on the more * general code in jdsample.c and jdcolor.c.) 
@@ -47,18 +47,18 @@ /* Private subobject */ typedef struct { - struct jpeg_upsampler pub; /* public fields */ + struct jpeg_upsampler pub; /* public fields */ /* Pointer to routine to do actual upsampling/conversion of one row group */ JMETHOD(void, upmethod, (j_decompress_ptr cinfo, - JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, - JSAMPARRAY output_buf)); + JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf)); /* Private state for YCC->RGB conversion */ - int * Cr_r_tab; /* => table for Cr to R conversion */ - int * Cb_b_tab; /* => table for Cb to B conversion */ - INT32 * Cr_g_tab; /* => table for Cr to G conversion */ - INT32 * Cb_g_tab; /* => table for Cb to G conversion */ + int * Cr_r_tab; /* => table for Cr to R conversion */ + int * Cb_b_tab; /* => table for Cb to B conversion */ + INT32 * Cr_g_tab; /* => table for Cr to G conversion */ + INT32 * Cb_g_tab; /* => table for Cb to G conversion */ /* For 2:1 vertical sampling, we produce two output rows at a time. * We need a "spare" row buffer to hold the second output row if the @@ -66,17 +66,17 @@ typedef struct { * to discard the dummy last row if the image height is odd. 
*/ JSAMPROW spare_row; - boolean spare_full; /* T if spare buffer is occupied */ + boolean spare_full; /* T if spare buffer is occupied */ - JDIMENSION out_row_width; /* samples per output row */ - JDIMENSION rows_to_go; /* counts rows remaining in image */ + JDIMENSION out_row_width; /* samples per output row */ + JDIMENSION rows_to_go; /* counts rows remaining in image */ } my_upsampler; typedef my_upsampler * my_upsample_ptr; -#define SCALEBITS 16 /* speediest right-shift on some machines */ -#define ONE_HALF ((INT32) 1 << (SCALEBITS-1)) -#define FIX(x) ((INT32) ((x) * (1L<Cr_r_tab = (int *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - (MAXJSAMPLE+1) * SIZEOF(int)); + (MAXJSAMPLE+1) * SIZEOF(int)); upsample->Cb_b_tab = (int *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - (MAXJSAMPLE+1) * SIZEOF(int)); + (MAXJSAMPLE+1) * SIZEOF(int)); upsample->Cr_g_tab = (INT32 *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - (MAXJSAMPLE+1) * SIZEOF(INT32)); + (MAXJSAMPLE+1) * SIZEOF(INT32)); upsample->Cb_g_tab = (INT32 *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - (MAXJSAMPLE+1) * SIZEOF(INT32)); + (MAXJSAMPLE+1) * SIZEOF(INT32)); for (i = 0, x = -CENTERJSAMPLE; i <= MAXJSAMPLE; i++, x++) { /* i is the actual input pixel value, in the range 0..MAXJSAMPLE */ /* The Cb or Cr value we are thinking of is x = i - CENTERJSAMPLE */ /* Cr=>R value is nearest int to 1.40200 * x */ upsample->Cr_r_tab[i] = (int) - RIGHT_SHIFT(FIX(1.40200) * x + ONE_HALF, SCALEBITS); + RIGHT_SHIFT(FIX(1.40200) * x + ONE_HALF, SCALEBITS); /* Cb=>B value is nearest int to 1.77200 * x */ upsample->Cb_b_tab[i] = (int) - RIGHT_SHIFT(FIX(1.77200) * x + ONE_HALF, SCALEBITS); + RIGHT_SHIFT(FIX(1.77200) * x + ONE_HALF, SCALEBITS); /* Cr=>G value is scaled-up -0.71414 * x */ upsample->Cr_g_tab[i] = (- FIX(0.71414)) * x; /* Cb=>G value is scaled-up -0.34414 * x */ @@ -248,20 +248,20 @@ start_pass_merged_upsample (j_decompress_ptr cinfo) 
METHODDEF(void) merged_2v_upsample (j_decompress_ptr cinfo, - JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr, - JDIMENSION in_row_groups_avail, - JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, - JDIMENSION out_rows_avail) + JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr, + JDIMENSION in_row_groups_avail, + JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, + JDIMENSION out_rows_avail) /* 2:1 vertical sampling case: may need a spare row. */ { my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample; JSAMPROW work_ptrs[2]; - JDIMENSION num_rows; /* number of rows returned to caller */ + JDIMENSION num_rows; /* number of rows returned to caller */ if (upsample->spare_full) { /* If we have a spare row saved from a previous cycle, just return it. */ jcopy_sample_rows(& upsample->spare_row, 0, output_buf + *out_row_ctr, 0, - 1, upsample->out_row_width); + 1, upsample->out_row_width); num_rows = 1; upsample->spare_full = FALSE; } else { @@ -297,17 +297,17 @@ merged_2v_upsample (j_decompress_ptr cinfo, METHODDEF(void) merged_1v_upsample (j_decompress_ptr cinfo, - JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr, - JDIMENSION in_row_groups_avail, - JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, - JDIMENSION out_rows_avail) + JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr, + JDIMENSION in_row_groups_avail, + JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, + JDIMENSION out_rows_avail) /* 1:1 vertical sampling case: much easier, never need a spare row. */ { my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample; /* Just do the upsampling. 
*/ (*upsample->upmethod) (cinfo, input_buf, *in_row_group_ctr, - output_buf + *out_row_ctr); + output_buf + *out_row_ctr); /* Adjust counts */ (*out_row_ctr)++; (*in_row_group_ctr)++; @@ -330,8 +330,8 @@ merged_1v_upsample (j_decompress_ptr cinfo, METHODDEF(void) h2v1_merged_upsample (j_decompress_ptr cinfo, - JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, - JSAMPARRAY output_buf) + JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) { switch (cinfo->out_color_space) { case JCS_EXT_RGB: @@ -376,8 +376,8 @@ h2v1_merged_upsample (j_decompress_ptr cinfo, METHODDEF(void) h2v2_merged_upsample (j_decompress_ptr cinfo, - JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, - JSAMPARRAY output_buf) + JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) { switch (cinfo->out_color_space) { case JCS_EXT_RGB: @@ -431,7 +431,7 @@ jinit_merged_upsampler (j_decompress_ptr cinfo) upsample = (my_upsample_ptr) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(my_upsampler)); + SIZEOF(my_upsampler)); cinfo->upsample = (struct jpeg_upsampler *) upsample; upsample->pub.start_pass = start_pass_merged_upsample; upsample->pub.need_context_rows = FALSE; @@ -447,7 +447,7 @@ jinit_merged_upsampler (j_decompress_ptr cinfo) /* Allocate a spare row buffer */ upsample->spare_row = (JSAMPROW) (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE, - (size_t) (upsample->out_row_width * SIZEOF(JSAMPLE))); + (size_t) (upsample->out_row_width * SIZEOF(JSAMPLE))); } else { upsample->pub.upsample = merged_1v_upsample; if (jsimd_can_h2v1_merged_upsample()) diff --git a/jdphuff.c b/jdphuff.c index fa97aab6a..783d8a8f4 100644 --- a/jdphuff.c +++ b/jdphuff.c @@ -17,7 +17,7 @@ #define JPEG_INTERNALS #include "jinclude.h" #include "jpeglib.h" -#include "jdhuff.h" /* Declarations shared with jdhuff.c */ +#include "jdhuff.h" /* Declarations shared with jdhuff.c */ #ifdef D_PROGRESSIVE_SUPPORTED @@ -30,8 +30,8 @@ */ typedef struct { - 
unsigned int EOBRUN; /* remaining EOBs in EOBRUN */ - int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */ + unsigned int EOBRUN; /* remaining EOBs in EOBRUN */ + int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */ } savable_state; /* This macro is to work around compilers with missing or broken @@ -44,11 +44,11 @@ typedef struct { #else #if MAX_COMPS_IN_SCAN == 4 #define ASSIGN_STATE(dest,src) \ - ((dest).EOBRUN = (src).EOBRUN, \ - (dest).last_dc_val[0] = (src).last_dc_val[0], \ - (dest).last_dc_val[1] = (src).last_dc_val[1], \ - (dest).last_dc_val[2] = (src).last_dc_val[2], \ - (dest).last_dc_val[3] = (src).last_dc_val[3]) + ((dest).EOBRUN = (src).EOBRUN, \ + (dest).last_dc_val[0] = (src).last_dc_val[0], \ + (dest).last_dc_val[1] = (src).last_dc_val[1], \ + (dest).last_dc_val[2] = (src).last_dc_val[2], \ + (dest).last_dc_val[3] = (src).last_dc_val[3]) #endif #endif @@ -59,11 +59,11 @@ typedef struct { /* These fields are loaded into local variables at start of each MCU. * In case of suspension, we exit WITHOUT updating them. */ - bitread_perm_state bitstate; /* Bit buffer at start of MCU */ - savable_state saved; /* Other state at start of MCU */ + bitread_perm_state bitstate; /* Bit buffer at start of MCU */ + savable_state saved; /* Other state at start of MCU */ /* These fields are NOT loaded into local working state. 
*/ - unsigned int restarts_to_go; /* MCUs left in this restart interval */ + unsigned int restarts_to_go; /* MCUs left in this restart interval */ /* Pointers to derived tables (these workspaces have image lifespan) */ d_derived_tbl * derived_tbls[NUM_HUFF_TBLS]; @@ -75,13 +75,13 @@ typedef phuff_entropy_decoder * phuff_entropy_ptr; /* Forward declarations */ METHODDEF(boolean) decode_mcu_DC_first JPP((j_decompress_ptr cinfo, - JBLOCKROW *MCU_data)); + JBLOCKROW *MCU_data)); METHODDEF(boolean) decode_mcu_AC_first JPP((j_decompress_ptr cinfo, - JBLOCKROW *MCU_data)); + JBLOCKROW *MCU_data)); METHODDEF(boolean) decode_mcu_DC_refine JPP((j_decompress_ptr cinfo, - JBLOCKROW *MCU_data)); + JBLOCKROW *MCU_data)); METHODDEF(boolean) decode_mcu_AC_refine JPP((j_decompress_ptr cinfo, - JBLOCKROW *MCU_data)); + JBLOCKROW *MCU_data)); /* @@ -117,7 +117,7 @@ start_pass_phuff_decoder (j_decompress_ptr cinfo) if (cinfo->Al != cinfo->Ah-1) bad = TRUE; } - if (cinfo->Al > 13) /* need not check for < 0 */ + if (cinfo->Al > 13) /* need not check for < 0 */ bad = TRUE; /* Arguably the maximum Al value should be less than 13 for 8-bit precision, * but the spec doesn't say so, and we try to be liberal about what we @@ -127,7 +127,7 @@ start_pass_phuff_decoder (j_decompress_ptr cinfo) */ if (bad) ERREXIT4(cinfo, JERR_BAD_PROGRESSION, - cinfo->Ss, cinfo->Se, cinfo->Ah, cinfo->Al); + cinfo->Ss, cinfo->Se, cinfo->Ah, cinfo->Al); /* Update progression status, and verify that scan order is legal. * Note that inter-scan inconsistencies are treated as warnings * not fatal errors ... not clear if this is right way to behave. @@ -140,7 +140,7 @@ start_pass_phuff_decoder (j_decompress_ptr cinfo) for (coefi = cinfo->Ss; coefi <= cinfo->Se; coefi++) { int expected = (coef_bit_ptr[coefi] < 0) ? 
0 : coef_bit_ptr[coefi]; if (cinfo->Ah != expected) - WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, coefi); + WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, coefi); coef_bit_ptr[coefi] = cinfo->Al; } } @@ -164,15 +164,15 @@ start_pass_phuff_decoder (j_decompress_ptr cinfo) * We may build same derived table more than once, but it's not expensive. */ if (is_DC_band) { - if (cinfo->Ah == 0) { /* DC refinement needs no table */ - tbl = compptr->dc_tbl_no; - jpeg_make_d_derived_tbl(cinfo, TRUE, tbl, - & entropy->derived_tbls[tbl]); + if (cinfo->Ah == 0) { /* DC refinement needs no table */ + tbl = compptr->dc_tbl_no; + jpeg_make_d_derived_tbl(cinfo, TRUE, tbl, + & entropy->derived_tbls[tbl]); } } else { tbl = compptr->ac_tbl_no; jpeg_make_d_derived_tbl(cinfo, FALSE, tbl, - & entropy->derived_tbls[tbl]); + & entropy->derived_tbls[tbl]); /* remember the single active table */ entropy->ac_derived_tbl = entropy->derived_tbls[tbl]; } @@ -264,7 +264,7 @@ process_restart (j_decompress_ptr cinfo) /* * Huffman MCU decoding. * Each of these routines decodes and returns one MCU's worth of - * Huffman-compressed coefficients. + * Huffman-compressed coefficients. * The coefficients are reordered from zigzag order into natural array order, * but are not dequantized. * @@ -285,7 +285,7 @@ process_restart (j_decompress_ptr cinfo) METHODDEF(boolean) decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) -{ +{ phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy; int Al = cinfo->Al; register int s, r; @@ -300,7 +300,7 @@ decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) if (cinfo->restart_interval) { if (entropy->restarts_to_go == 0) if (! process_restart(cinfo)) - return FALSE; + return FALSE; } /* If we've run out of data, just leave the MCU set to zeroes. 
@@ -325,9 +325,9 @@ decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) /* Section F.2.2.1: decode the DC coefficient difference */ HUFF_DECODE(s, br_state, tbl, return FALSE, label1); if (s) { - CHECK_BIT_BUFFER(br_state, s, return FALSE); - r = GET_BITS(s); - s = HUFF_EXTEND(r, s); + CHECK_BIT_BUFFER(br_state, s, return FALSE); + r = GET_BITS(s); + s = HUFF_EXTEND(r, s); } /* Convert DC difference to actual value, update last_dc_val */ @@ -356,7 +356,7 @@ decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) METHODDEF(boolean) decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) -{ +{ phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy; int Se = cinfo->Se; int Al = cinfo->Al; @@ -370,7 +370,7 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) if (cinfo->restart_interval) { if (entropy->restarts_to_go == 0) if (! process_restart(cinfo)) - return FALSE; + return FALSE; } /* If we've run out of data, just leave the MCU set to zeroes. @@ -381,49 +381,49 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) /* Load up working state. * We can avoid loading/saving bitread state if in an EOB run. */ - EOBRUN = entropy->saved.EOBRUN; /* only part of saved state we need */ + EOBRUN = entropy->saved.EOBRUN; /* only part of saved state we need */ /* There is always only one block per MCU */ - if (EOBRUN > 0) /* if it's a band of zeroes... */ - EOBRUN--; /* ...process it now (we do nothing) */ + if (EOBRUN > 0) /* if it's a band of zeroes... 
*/ + EOBRUN--; /* ...process it now (we do nothing) */ else { BITREAD_LOAD_STATE(cinfo,entropy->bitstate); block = MCU_data[0]; tbl = entropy->ac_derived_tbl; for (k = cinfo->Ss; k <= Se; k++) { - HUFF_DECODE(s, br_state, tbl, return FALSE, label2); - r = s >> 4; - s &= 15; - if (s) { - k += r; - CHECK_BIT_BUFFER(br_state, s, return FALSE); - r = GET_BITS(s); - s = HUFF_EXTEND(r, s); - /* Scale and output coefficient in natural (dezigzagged) order */ - (*block)[jpeg_natural_order[k]] = (JCOEF) (s << Al); - } else { - if (r == 15) { /* ZRL */ - k += 15; /* skip 15 zeroes in band */ - } else { /* EOBr, run length is 2^r + appended bits */ - EOBRUN = 1 << r; - if (r) { /* EOBr, r > 0 */ - CHECK_BIT_BUFFER(br_state, r, return FALSE); - r = GET_BITS(r); - EOBRUN += r; - } - EOBRUN--; /* this band is processed at this moment */ - break; /* force end-of-band */ - } - } + HUFF_DECODE(s, br_state, tbl, return FALSE, label2); + r = s >> 4; + s &= 15; + if (s) { + k += r; + CHECK_BIT_BUFFER(br_state, s, return FALSE); + r = GET_BITS(s); + s = HUFF_EXTEND(r, s); + /* Scale and output coefficient in natural (dezigzagged) order */ + (*block)[jpeg_natural_order[k]] = (JCOEF) (s << Al); + } else { + if (r == 15) { /* ZRL */ + k += 15; /* skip 15 zeroes in band */ + } else { /* EOBr, run length is 2^r + appended bits */ + EOBRUN = 1 << r; + if (r) { /* EOBr, r > 0 */ + CHECK_BIT_BUFFER(br_state, r, return FALSE); + r = GET_BITS(r); + EOBRUN += r; + } + EOBRUN--; /* this band is processed at this moment */ + break; /* force end-of-band */ + } + } } BITREAD_SAVE_STATE(cinfo,entropy->bitstate); } /* Completed MCU, so update state */ - entropy->saved.EOBRUN = EOBRUN; /* only part of saved state we need */ + entropy->saved.EOBRUN = EOBRUN; /* only part of saved state we need */ } /* Account for restart interval (no-op if not using restarts) */ @@ -441,9 +441,9 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) METHODDEF(boolean) decode_mcu_DC_refine (j_decompress_ptr 
cinfo, JBLOCKROW *MCU_data) -{ +{ phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy; - int p1 = 1 << cinfo->Al; /* 1 in the bit position being coded */ + int p1 = 1 << cinfo->Al; /* 1 in the bit position being coded */ int blkn; JBLOCKROW block; BITREAD_STATE_VARS; @@ -452,7 +452,7 @@ decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) if (cinfo->restart_interval) { if (entropy->restarts_to_go == 0) if (! process_restart(cinfo)) - return FALSE; + return FALSE; } /* Not worth the cycles to check insufficient_data here, @@ -490,11 +490,11 @@ decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) METHODDEF(boolean) decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) -{ +{ phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy; int Se = cinfo->Se; - int p1 = 1 << cinfo->Al; /* 1 in the bit position being coded */ - int m1 = (-1) << cinfo->Al; /* -1 in the bit position being coded */ + int p1 = 1 << cinfo->Al; /* 1 in the bit position being coded */ + int m1 = (-1) << cinfo->Al; /* -1 in the bit position being coded */ register int s, k, r; unsigned int EOBRUN; JBLOCKROW block; @@ -508,7 +508,7 @@ decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) if (cinfo->restart_interval) { if (entropy->restarts_to_go == 0) if (! process_restart(cinfo)) - return FALSE; + return FALSE; } /* If we've run out of data, don't modify the MCU. 
@@ -536,58 +536,58 @@ decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) if (EOBRUN == 0) { for (; k <= Se; k++) { - HUFF_DECODE(s, br_state, tbl, goto undoit, label3); - r = s >> 4; - s &= 15; - if (s) { - if (s != 1) /* size of new coef should always be 1 */ - WARNMS(cinfo, JWRN_HUFF_BAD_CODE); - CHECK_BIT_BUFFER(br_state, 1, goto undoit); - if (GET_BITS(1)) - s = p1; /* newly nonzero coef is positive */ - else - s = m1; /* newly nonzero coef is negative */ - } else { - if (r != 15) { - EOBRUN = 1 << r; /* EOBr, run length is 2^r + appended bits */ - if (r) { - CHECK_BIT_BUFFER(br_state, r, goto undoit); - r = GET_BITS(r); - EOBRUN += r; - } - break; /* rest of block is handled by EOB logic */ - } - /* note s = 0 for processing ZRL */ - } - /* Advance over already-nonzero coefs and r still-zero coefs, - * appending correction bits to the nonzeroes. A correction bit is 1 - * if the absolute value of the coefficient must be increased. - */ - do { - thiscoef = *block + jpeg_natural_order[k]; - if (*thiscoef != 0) { - CHECK_BIT_BUFFER(br_state, 1, goto undoit); - if (GET_BITS(1)) { - if ((*thiscoef & p1) == 0) { /* do nothing if already set it */ - if (*thiscoef >= 0) - *thiscoef += p1; - else - *thiscoef += m1; - } - } - } else { - if (--r < 0) - break; /* reached target zero coefficient */ - } - k++; - } while (k <= Se); - if (s) { - int pos = jpeg_natural_order[k]; - /* Output newly nonzero coefficient */ - (*block)[pos] = (JCOEF) s; - /* Remember its position in case we have to suspend */ - newnz_pos[num_newnz++] = pos; - } + HUFF_DECODE(s, br_state, tbl, goto undoit, label3); + r = s >> 4; + s &= 15; + if (s) { + if (s != 1) /* size of new coef should always be 1 */ + WARNMS(cinfo, JWRN_HUFF_BAD_CODE); + CHECK_BIT_BUFFER(br_state, 1, goto undoit); + if (GET_BITS(1)) + s = p1; /* newly nonzero coef is positive */ + else + s = m1; /* newly nonzero coef is negative */ + } else { + if (r != 15) { + EOBRUN = 1 << r; /* EOBr, run length is 2^r + 
appended bits */ + if (r) { + CHECK_BIT_BUFFER(br_state, r, goto undoit); + r = GET_BITS(r); + EOBRUN += r; + } + break; /* rest of block is handled by EOB logic */ + } + /* note s = 0 for processing ZRL */ + } + /* Advance over already-nonzero coefs and r still-zero coefs, + * appending correction bits to the nonzeroes. A correction bit is 1 + * if the absolute value of the coefficient must be increased. + */ + do { + thiscoef = *block + jpeg_natural_order[k]; + if (*thiscoef != 0) { + CHECK_BIT_BUFFER(br_state, 1, goto undoit); + if (GET_BITS(1)) { + if ((*thiscoef & p1) == 0) { /* do nothing if already set it */ + if (*thiscoef >= 0) + *thiscoef += p1; + else + *thiscoef += m1; + } + } + } else { + if (--r < 0) + break; /* reached target zero coefficient */ + } + k++; + } while (k <= Se); + if (s) { + int pos = jpeg_natural_order[k]; + /* Output newly nonzero coefficient */ + (*block)[pos] = (JCOEF) s; + /* Remember its position in case we have to suspend */ + newnz_pos[num_newnz++] = pos; + } } } @@ -598,18 +598,18 @@ decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) * if the absolute value of the coefficient must be increased. 
*/ for (; k <= Se; k++) { - thiscoef = *block + jpeg_natural_order[k]; - if (*thiscoef != 0) { - CHECK_BIT_BUFFER(br_state, 1, goto undoit); - if (GET_BITS(1)) { - if ((*thiscoef & p1) == 0) { /* do nothing if already changed it */ - if (*thiscoef >= 0) - *thiscoef += p1; - else - *thiscoef += m1; - } - } - } + thiscoef = *block + jpeg_natural_order[k]; + if (*thiscoef != 0) { + CHECK_BIT_BUFFER(br_state, 1, goto undoit); + if (GET_BITS(1)) { + if ((*thiscoef & p1) == 0) { /* do nothing if already changed it */ + if (*thiscoef >= 0) + *thiscoef += p1; + else + *thiscoef += m1; + } + } + } } /* Count one block completed in EOB run */ EOBRUN--; @@ -647,7 +647,7 @@ jinit_phuff_decoder (j_decompress_ptr cinfo) entropy = (phuff_entropy_ptr) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(phuff_entropy_decoder)); + SIZEOF(phuff_entropy_decoder)); cinfo->entropy = (struct jpeg_entropy_decoder *) entropy; entropy->pub.start_pass = start_pass_phuff_decoder; @@ -659,9 +659,9 @@ jinit_phuff_decoder (j_decompress_ptr cinfo) /* Create progression status table */ cinfo->coef_bits = (int (*)[DCTSIZE2]) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - cinfo->num_components*DCTSIZE2*SIZEOF(int)); + cinfo->num_components*DCTSIZE2*SIZEOF(int)); coef_bit_ptr = & cinfo->coef_bits[0][0]; - for (ci = 0; ci < cinfo->num_components; ci++) + for (ci = 0; ci < cinfo->num_components; ci++) for (i = 0; i < DCTSIZE2; i++) *coef_bit_ptr++ = -1; } diff --git a/jdpostct.c b/jdpostct.c index 571563d72..aa2af07d8 100644 --- a/jdpostct.c +++ b/jdpostct.c @@ -31,12 +31,12 @@ typedef struct { * For two-pass color quantization, we need a full-image buffer; * for one-pass operation, a strip buffer is sufficient. 
*/ - jvirt_sarray_ptr whole_image; /* virtual array, or NULL if one-pass */ - JSAMPARRAY buffer; /* strip buffer, or current strip of virtual */ - JDIMENSION strip_height; /* buffer size in rows */ + jvirt_sarray_ptr whole_image; /* virtual array, or NULL if one-pass */ + JSAMPARRAY buffer; /* strip buffer, or current strip of virtual */ + JDIMENSION strip_height; /* buffer size in rows */ /* for two-pass mode only: */ - JDIMENSION starting_row; /* row # of first row in current strip */ - JDIMENSION next_row; /* index of next row to fill/empty in strip */ + JDIMENSION starting_row; /* row # of first row in current strip */ + JDIMENSION next_row; /* index of next row to fill/empty in strip */ } my_post_controller; typedef my_post_controller * my_post_ptr; @@ -44,24 +44,24 @@ typedef my_post_controller * my_post_ptr; /* Forward declarations */ METHODDEF(void) post_process_1pass - JPP((j_decompress_ptr cinfo, - JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr, - JDIMENSION in_row_groups_avail, - JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, - JDIMENSION out_rows_avail)); + JPP((j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr, + JDIMENSION in_row_groups_avail, + JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, + JDIMENSION out_rows_avail)); #ifdef QUANT_2PASS_SUPPORTED METHODDEF(void) post_process_prepass - JPP((j_decompress_ptr cinfo, - JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr, - JDIMENSION in_row_groups_avail, - JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, - JDIMENSION out_rows_avail)); + JPP((j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr, + JDIMENSION in_row_groups_avail, + JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, + JDIMENSION out_rows_avail)); METHODDEF(void) post_process_2pass - JPP((j_decompress_ptr cinfo, - JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr, - JDIMENSION in_row_groups_avail, - JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, - JDIMENSION out_rows_avail)); + 
JPP((j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr, + JDIMENSION in_row_groups_avail, + JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, + JDIMENSION out_rows_avail)); #endif @@ -84,9 +84,9 @@ start_pass_dpost (j_decompress_ptr cinfo, J_BUF_MODE pass_mode) * allocate a strip buffer. Use the virtual-array buffer as workspace. */ if (post->buffer == NULL) { - post->buffer = (*cinfo->mem->access_virt_sarray) - ((j_common_ptr) cinfo, post->whole_image, - (JDIMENSION) 0, post->strip_height, TRUE); + post->buffer = (*cinfo->mem->access_virt_sarray) + ((j_common_ptr) cinfo, post->whole_image, + (JDIMENSION) 0, post->strip_height, TRUE); } } else { /* For single-pass processing without color quantization, @@ -124,10 +124,10 @@ start_pass_dpost (j_decompress_ptr cinfo, J_BUF_MODE pass_mode) METHODDEF(void) post_process_1pass (j_decompress_ptr cinfo, - JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr, - JDIMENSION in_row_groups_avail, - JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, - JDIMENSION out_rows_avail) + JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr, + JDIMENSION in_row_groups_avail, + JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, + JDIMENSION out_rows_avail) { my_post_ptr post = (my_post_ptr) cinfo->post; JDIMENSION num_rows, max_rows; @@ -139,11 +139,11 @@ post_process_1pass (j_decompress_ptr cinfo, max_rows = post->strip_height; num_rows = 0; (*cinfo->upsample->upsample) (cinfo, - input_buf, in_row_group_ctr, in_row_groups_avail, - post->buffer, &num_rows, max_rows); + input_buf, in_row_group_ctr, in_row_groups_avail, + post->buffer, &num_rows, max_rows); /* Quantize and emit data. 
*/ (*cinfo->cquantize->color_quantize) (cinfo, - post->buffer, output_buf + *out_row_ctr, (int) num_rows); + post->buffer, output_buf + *out_row_ctr, (int) num_rows); *out_row_ctr += num_rows; } @@ -156,10 +156,10 @@ post_process_1pass (j_decompress_ptr cinfo, METHODDEF(void) post_process_prepass (j_decompress_ptr cinfo, - JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr, - JDIMENSION in_row_groups_avail, - JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, - JDIMENSION out_rows_avail) + JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr, + JDIMENSION in_row_groups_avail, + JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, + JDIMENSION out_rows_avail) { my_post_ptr post = (my_post_ptr) cinfo->post; JDIMENSION old_next_row, num_rows; @@ -167,22 +167,22 @@ post_process_prepass (j_decompress_ptr cinfo, /* Reposition virtual buffer if at start of strip. */ if (post->next_row == 0) { post->buffer = (*cinfo->mem->access_virt_sarray) - ((j_common_ptr) cinfo, post->whole_image, - post->starting_row, post->strip_height, TRUE); + ((j_common_ptr) cinfo, post->whole_image, + post->starting_row, post->strip_height, TRUE); } /* Upsample some data (up to a strip height's worth). */ old_next_row = post->next_row; (*cinfo->upsample->upsample) (cinfo, - input_buf, in_row_group_ctr, in_row_groups_avail, - post->buffer, &post->next_row, post->strip_height); + input_buf, in_row_group_ctr, in_row_groups_avail, + post->buffer, &post->next_row, post->strip_height); /* Allow quantizer to scan new data. No data is emitted, */ /* but we advance out_row_ctr so outer loop can tell when we're done. 
*/ if (post->next_row > old_next_row) { num_rows = post->next_row - old_next_row; (*cinfo->cquantize->color_quantize) (cinfo, post->buffer + old_next_row, - (JSAMPARRAY) NULL, (int) num_rows); + (JSAMPARRAY) NULL, (int) num_rows); *out_row_ctr += num_rows; } @@ -200,10 +200,10 @@ post_process_prepass (j_decompress_ptr cinfo, METHODDEF(void) post_process_2pass (j_decompress_ptr cinfo, - JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr, - JDIMENSION in_row_groups_avail, - JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, - JDIMENSION out_rows_avail) + JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr, + JDIMENSION in_row_groups_avail, + JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, + JDIMENSION out_rows_avail) { my_post_ptr post = (my_post_ptr) cinfo->post; JDIMENSION num_rows, max_rows; @@ -211,8 +211,8 @@ post_process_2pass (j_decompress_ptr cinfo, /* Reposition virtual buffer if at start of strip. */ if (post->next_row == 0) { post->buffer = (*cinfo->mem->access_virt_sarray) - ((j_common_ptr) cinfo, post->whole_image, - post->starting_row, post->strip_height, FALSE); + ((j_common_ptr) cinfo, post->whole_image, + post->starting_row, post->strip_height, FALSE); } /* Determine number of rows to emit. */ @@ -227,8 +227,8 @@ post_process_2pass (j_decompress_ptr cinfo, /* Quantize and emit data. */ (*cinfo->cquantize->color_quantize) (cinfo, - post->buffer + post->next_row, output_buf + *out_row_ctr, - (int) num_rows); + post->buffer + post->next_row, output_buf + *out_row_ctr, + (int) num_rows); *out_row_ctr += num_rows; /* Advance if we filled the strip. 
*/ @@ -253,11 +253,11 @@ jinit_d_post_controller (j_decompress_ptr cinfo, boolean need_full_buffer) post = (my_post_ptr) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(my_post_controller)); + SIZEOF(my_post_controller)); cinfo->post = (struct jpeg_d_post_controller *) post; post->pub.start_pass = start_pass_dpost; - post->whole_image = NULL; /* flag for no virtual arrays */ - post->buffer = NULL; /* flag for no strip buffer */ + post->whole_image = NULL; /* flag for no virtual arrays */ + post->buffer = NULL; /* flag for no strip buffer */ /* Create the quantization buffer, if needed */ if (cinfo->quantize_colors) { @@ -271,20 +271,20 @@ jinit_d_post_controller (j_decompress_ptr cinfo, boolean need_full_buffer) /* We round up the number of rows to a multiple of the strip height. */ #ifdef QUANT_2PASS_SUPPORTED post->whole_image = (*cinfo->mem->request_virt_sarray) - ((j_common_ptr) cinfo, JPOOL_IMAGE, FALSE, - cinfo->output_width * cinfo->out_color_components, - (JDIMENSION) jround_up((long) cinfo->output_height, - (long) post->strip_height), - post->strip_height); + ((j_common_ptr) cinfo, JPOOL_IMAGE, FALSE, + cinfo->output_width * cinfo->out_color_components, + (JDIMENSION) jround_up((long) cinfo->output_height, + (long) post->strip_height), + post->strip_height); #else ERREXIT(cinfo, JERR_BAD_BUFFER_MODE); #endif /* QUANT_2PASS_SUPPORTED */ } else { /* One-pass color quantization: just make a strip buffer. 
*/ post->buffer = (*cinfo->mem->alloc_sarray) - ((j_common_ptr) cinfo, JPOOL_IMAGE, - cinfo->output_width * cinfo->out_color_components, - post->strip_height); + ((j_common_ptr) cinfo, JPOOL_IMAGE, + cinfo->output_width * cinfo->out_color_components, + post->strip_height); } } } diff --git a/jdsample.c b/jdsample.c index 361b589e3..3da1d064a 100644 --- a/jdsample.c +++ b/jdsample.c @@ -30,13 +30,13 @@ /* Pointer to routine to upsample a single component */ typedef JMETHOD(void, upsample1_ptr, - (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)); + (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)); /* Private subobject */ typedef struct { - struct jpeg_upsampler pub; /* public fields */ + struct jpeg_upsampler pub; /* public fields */ /* Color conversion buffer. When using separate upsampling and color * conversion steps, this buffer holds one upsampled row group until it @@ -50,8 +50,8 @@ typedef struct { /* Per-component upsampling method pointers */ upsample1_ptr methods[MAX_COMPONENTS]; - int next_row_out; /* counts rows emitted from color_buf */ - JDIMENSION rows_to_go; /* counts rows remaining in image */ + int next_row_out; /* counts rows emitted from color_buf */ + JDIMENSION rows_to_go; /* counts rows remaining in image */ /* Height of an input row group for each component. 
*/ int rowgroup_height[MAX_COMPONENTS]; @@ -92,10 +92,10 @@ start_pass_upsample (j_decompress_ptr cinfo) METHODDEF(void) sep_upsample (j_decompress_ptr cinfo, - JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr, - JDIMENSION in_row_groups_avail, - JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, - JDIMENSION out_rows_avail) + JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr, + JDIMENSION in_row_groups_avail, + JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, + JDIMENSION out_rows_avail) { my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample; int ci; @@ -105,13 +105,13 @@ sep_upsample (j_decompress_ptr cinfo, /* Fill the conversion buffer, if it's empty */ if (upsample->next_row_out >= cinfo->max_v_samp_factor) { for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; - ci++, compptr++) { + ci++, compptr++) { /* Invoke per-component upsample method. Notice we pass a POINTER * to color_buf[ci], so that fullsize_upsample can change it. */ (*upsample->methods[ci]) (cinfo, compptr, - input_buf[ci] + (*in_row_group_ctr * upsample->rowgroup_height[ci]), - upsample->color_buf + ci); + input_buf[ci] + (*in_row_group_ctr * upsample->rowgroup_height[ci]), + upsample->color_buf + ci); } upsample->next_row_out = 0; } @@ -123,7 +123,7 @@ sep_upsample (j_decompress_ptr cinfo, /* Not more than the distance to the end of the image. 
Need this test * in case the image height is not a multiple of max_v_samp_factor: */ - if (num_rows > upsample->rows_to_go) + if (num_rows > upsample->rows_to_go) num_rows = upsample->rows_to_go; /* And not more than what the client can accept: */ out_rows_avail -= *out_row_ctr; @@ -131,9 +131,9 @@ sep_upsample (j_decompress_ptr cinfo, num_rows = out_rows_avail; (*cinfo->cconvert->color_convert) (cinfo, upsample->color_buf, - (JDIMENSION) upsample->next_row_out, - output_buf + *out_row_ctr, - (int) num_rows); + (JDIMENSION) upsample->next_row_out, + output_buf + *out_row_ctr, + (int) num_rows); /* Adjust counts */ *out_row_ctr += num_rows; @@ -160,7 +160,7 @@ sep_upsample (j_decompress_ptr cinfo, METHODDEF(void) fullsize_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr) + JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr) { *output_data_ptr = input_data; } @@ -173,9 +173,9 @@ fullsize_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr, METHODDEF(void) noop_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr) + JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr) { - *output_data_ptr = NULL; /* safety check */ + *output_data_ptr = NULL; /* safety check */ } @@ -192,7 +192,7 @@ noop_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr, METHODDEF(void) int_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr) + JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr) { my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample; JSAMPARRAY output_data = *output_data_ptr; @@ -213,15 +213,15 @@ int_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr, outptr = output_data[outrow]; outend = outptr + cinfo->output_width; while (outptr < outend) { - invalue = *inptr++; /* don't need GETJSAMPLE() here */ + invalue = *inptr++; /* don't need 
GETJSAMPLE() here */ for (h = h_expand; h > 0; h--) { - *outptr++ = invalue; + *outptr++ = invalue; } } /* Generate any additional output rows by duplicating the first one */ if (v_expand > 1) { jcopy_sample_rows(output_data, outrow, output_data, outrow+1, - v_expand-1, cinfo->output_width); + v_expand-1, cinfo->output_width); } inrow++; outrow += v_expand; @@ -236,7 +236,7 @@ int_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr, METHODDEF(void) h2v1_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr) + JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr) { JSAMPARRAY output_data = *output_data_ptr; register JSAMPROW inptr, outptr; @@ -249,7 +249,7 @@ h2v1_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr, outptr = output_data[inrow]; outend = outptr + cinfo->output_width; while (outptr < outend) { - invalue = *inptr++; /* don't need GETJSAMPLE() here */ + invalue = *inptr++; /* don't need GETJSAMPLE() here */ *outptr++ = invalue; *outptr++ = invalue; } @@ -264,7 +264,7 @@ h2v1_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr, METHODDEF(void) h2v2_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr) + JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr) { JSAMPARRAY output_data = *output_data_ptr; register JSAMPROW inptr, outptr; @@ -278,12 +278,12 @@ h2v2_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr, outptr = output_data[outrow]; outend = outptr + cinfo->output_width; while (outptr < outend) { - invalue = *inptr++; /* don't need GETJSAMPLE() here */ + invalue = *inptr++; /* don't need GETJSAMPLE() here */ *outptr++ = invalue; *outptr++ = invalue; } jcopy_sample_rows(output_data, outrow, output_data, outrow+1, - 1, cinfo->output_width); + 1, cinfo->output_width); inrow++; outrow += 2; } @@ -307,7 +307,7 @@ h2v2_upsample (j_decompress_ptr cinfo, jpeg_component_info * 
compptr, METHODDEF(void) h2v1_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr) + JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr) { JSAMPARRAY output_data = *output_data_ptr; register JSAMPROW inptr, outptr; @@ -348,7 +348,7 @@ h2v1_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr, METHODDEF(void) h2v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr) + JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr) { JSAMPARRAY output_data = *output_data_ptr; register JSAMPROW inptr0, inptr1, outptr; @@ -365,10 +365,10 @@ h2v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr, for (v = 0; v < 2; v++) { /* inptr0 points to nearest input row, inptr1 points to next nearest */ inptr0 = input_data[inrow]; - if (v == 0) /* next nearest is row above */ - inptr1 = input_data[inrow-1]; - else /* next nearest is row below */ - inptr1 = input_data[inrow+1]; + if (v == 0) /* next nearest is row above */ + inptr1 = input_data[inrow-1]; + else /* next nearest is row below */ + inptr1 = input_data[inrow+1]; outptr = output_data[outrow++]; /* Special case for first column */ @@ -379,12 +379,12 @@ h2v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr, lastcolsum = thiscolsum; thiscolsum = nextcolsum; for (colctr = compptr->downsampled_width - 2; colctr > 0; colctr--) { - /* General case: 3/4 * nearer pixel + 1/4 * further pixel in each */ - /* dimension, thus 9/16, 3/16, 3/16, 1/16 overall */ - nextcolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++); - *outptr++ = (JSAMPLE) ((thiscolsum * 3 + lastcolsum + 8) >> 4); - *outptr++ = (JSAMPLE) ((thiscolsum * 3 + nextcolsum + 7) >> 4); - lastcolsum = thiscolsum; thiscolsum = nextcolsum; + /* General case: 3/4 * nearer pixel + 1/4 * further pixel in each */ + /* dimension, thus 9/16, 3/16, 3/16, 1/16 overall */ + nextcolsum = 
GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++); + *outptr++ = (JSAMPLE) ((thiscolsum * 3 + lastcolsum + 8) >> 4); + *outptr++ = (JSAMPLE) ((thiscolsum * 3 + nextcolsum + 7) >> 4); + lastcolsum = thiscolsum; thiscolsum = nextcolsum; } /* Special case for last column */ @@ -411,13 +411,13 @@ jinit_upsampler (j_decompress_ptr cinfo) upsample = (my_upsample_ptr) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(my_upsampler)); + SIZEOF(my_upsampler)); cinfo->upsample = (struct jpeg_upsampler *) upsample; upsample->pub.start_pass = start_pass_upsample; upsample->pub.upsample = sep_upsample; upsample->pub.need_context_rows = FALSE; /* until we find out differently */ - if (cinfo->CCIR601_sampling) /* this isn't supported */ + if (cinfo->CCIR601_sampling) /* this isn't supported */ ERREXIT(cinfo, JERR_CCIR601_NOTIMPL); /* jdmainct.c doesn't support context rows when min_DCT_scaled_size = 1, @@ -434,9 +434,9 @@ jinit_upsampler (j_decompress_ptr cinfo) * are to be converted to max_h_samp_factor * max_v_samp_factor pixels. 
*/ h_in_group = (compptr->h_samp_factor * compptr->_DCT_scaled_size) / - cinfo->_min_DCT_scaled_size; + cinfo->_min_DCT_scaled_size; v_in_group = (compptr->v_samp_factor * compptr->_DCT_scaled_size) / - cinfo->_min_DCT_scaled_size; + cinfo->_min_DCT_scaled_size; h_out_group = cinfo->max_h_samp_factor; v_out_group = cinfo->max_v_samp_factor; upsample->rowgroup_height[ci] = v_in_group; /* save for use later */ @@ -450,36 +450,36 @@ jinit_upsampler (j_decompress_ptr cinfo) upsample->methods[ci] = fullsize_upsample; need_buffer = FALSE; } else if (h_in_group * 2 == h_out_group && - v_in_group == v_out_group) { + v_in_group == v_out_group) { /* Special cases for 2h1v upsampling */ if (do_fancy && compptr->downsampled_width > 2) { - if (jsimd_can_h2v1_fancy_upsample()) - upsample->methods[ci] = jsimd_h2v1_fancy_upsample; - else - upsample->methods[ci] = h2v1_fancy_upsample; + if (jsimd_can_h2v1_fancy_upsample()) + upsample->methods[ci] = jsimd_h2v1_fancy_upsample; + else + upsample->methods[ci] = h2v1_fancy_upsample; } else { - if (jsimd_can_h2v1_upsample()) - upsample->methods[ci] = jsimd_h2v1_upsample; - else - upsample->methods[ci] = h2v1_upsample; + if (jsimd_can_h2v1_upsample()) + upsample->methods[ci] = jsimd_h2v1_upsample; + else + upsample->methods[ci] = h2v1_upsample; } } else if (h_in_group * 2 == h_out_group && - v_in_group * 2 == v_out_group) { + v_in_group * 2 == v_out_group) { /* Special cases for 2h2v upsampling */ if (do_fancy && compptr->downsampled_width > 2) { - if (jsimd_can_h2v2_fancy_upsample()) - upsample->methods[ci] = jsimd_h2v2_fancy_upsample; - else - upsample->methods[ci] = h2v2_fancy_upsample; - upsample->pub.need_context_rows = TRUE; + if (jsimd_can_h2v2_fancy_upsample()) + upsample->methods[ci] = jsimd_h2v2_fancy_upsample; + else + upsample->methods[ci] = h2v2_fancy_upsample; + upsample->pub.need_context_rows = TRUE; } else { - if (jsimd_can_h2v2_upsample()) - upsample->methods[ci] = jsimd_h2v2_upsample; - else - upsample->methods[ci] = 
h2v2_upsample; + if (jsimd_can_h2v2_upsample()) + upsample->methods[ci] = jsimd_h2v2_upsample; + else + upsample->methods[ci] = h2v2_upsample; } } else if ((h_out_group % h_in_group) == 0 && - (v_out_group % v_in_group) == 0) { + (v_out_group % v_in_group) == 0) { /* Generic integral-factors upsampling method */ upsample->methods[ci] = int_upsample; upsample->h_expand[ci] = (UINT8) (h_out_group / h_in_group); @@ -488,10 +488,10 @@ jinit_upsampler (j_decompress_ptr cinfo) ERREXIT(cinfo, JERR_FRACT_SAMPLE_NOTIMPL); if (need_buffer) { upsample->color_buf[ci] = (*cinfo->mem->alloc_sarray) - ((j_common_ptr) cinfo, JPOOL_IMAGE, - (JDIMENSION) jround_up((long) cinfo->output_width, - (long) cinfo->max_h_samp_factor), - (JDIMENSION) cinfo->max_v_samp_factor); + ((j_common_ptr) cinfo, JPOOL_IMAGE, + (JDIMENSION) jround_up((long) cinfo->output_width, + (long) cinfo->max_h_samp_factor), + (JDIMENSION) cinfo->max_v_samp_factor); } } } diff --git a/jdtrans.c b/jdtrans.c index f0cd0aef9..0a163c045 100644 --- a/jdtrans.c +++ b/jdtrans.c @@ -55,20 +55,20 @@ jpeg_read_coefficients (j_decompress_ptr cinfo) int retcode; /* Call progress monitor hook if present */ if (cinfo->progress != NULL) - (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo); + (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo); /* Absorb some more input */ retcode = (*cinfo->inputctl->consume_input) (cinfo); if (retcode == JPEG_SUSPENDED) - return NULL; + return NULL; if (retcode == JPEG_REACHED_EOI) - break; + break; /* Advance progress counter if appropriate */ if (cinfo->progress != NULL && - (retcode == JPEG_ROW_COMPLETED || retcode == JPEG_REACHED_SOS)) { - if (++cinfo->progress->pass_counter >= cinfo->progress->pass_limit) { - /* startup underestimated number of scans; ratchet up one scan */ - cinfo->progress->pass_limit += (long) cinfo->total_iMCU_rows; - } + (retcode == JPEG_ROW_COMPLETED || retcode == JPEG_REACHED_SOS)) { + if (++cinfo->progress->pass_counter >= 
cinfo->progress->pass_limit) { + /* startup underestimated number of scans; ratchet up one scan */ + cinfo->progress->pass_limit += (long) cinfo->total_iMCU_rows; + } } } /* Set state so that jpeg_finish_decompress does the right thing */ @@ -84,7 +84,7 @@ jpeg_read_coefficients (j_decompress_ptr cinfo) } /* Oops, improper usage */ ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state); - return NULL; /* keep compiler happy */ + return NULL; /* keep compiler happy */ } diff --git a/jerror.c b/jerror.c index 3da7be86a..cd4c2a3c1 100644 --- a/jerror.c +++ b/jerror.c @@ -28,7 +28,7 @@ #include #endif -#ifndef EXIT_FAILURE /* define exit() codes if not provided */ +#ifndef EXIT_FAILURE /* define exit() codes if not provided */ #define EXIT_FAILURE 1 #endif @@ -42,10 +42,10 @@ */ #ifdef NEED_SHORT_EXTERNAL_NAMES -#define jpeg_std_message_table jMsgTable +#define jpeg_std_message_table jMsgTable #endif -#define JMESSAGE(code,string) string , +#define JMESSAGE(code,string) string , const char * const jpeg_std_message_table[] = { #include "jerror.h" @@ -105,7 +105,7 @@ output_message (j_common_ptr cinfo) #ifdef USE_WINDOWS_MESSAGEBOX /* Display it in a message dialog box */ MessageBox(GetActiveWindow(), buffer, "JPEG Library Error", - MB_OK | MB_ICONERROR); + MB_OK | MB_ICONERROR); #else /* Send it to stderr, adding a newline */ fprintf(stderr, "%s\n", buffer); @@ -167,8 +167,8 @@ format_message (j_common_ptr cinfo, char * buffer) if (msg_code > 0 && msg_code <= err->last_jpeg_message) { msgtext = err->jpeg_message_table[msg_code]; } else if (err->addon_message_table != NULL && - msg_code >= err->first_addon_message && - msg_code <= err->last_addon_message) { + msg_code >= err->first_addon_message && + msg_code <= err->last_addon_message) { msgtext = err->addon_message_table[msg_code - err->first_addon_message]; } @@ -193,10 +193,10 @@ format_message (j_common_ptr cinfo, char * buffer) sprintf(buffer, msgtext, err->msg_parm.s); else sprintf(buffer, msgtext, - 
err->msg_parm.i[0], err->msg_parm.i[1], - err->msg_parm.i[2], err->msg_parm.i[3], - err->msg_parm.i[4], err->msg_parm.i[5], - err->msg_parm.i[6], err->msg_parm.i[7]); + err->msg_parm.i[0], err->msg_parm.i[1], + err->msg_parm.i[2], err->msg_parm.i[3], + err->msg_parm.i[4], err->msg_parm.i[5], + err->msg_parm.i[6], err->msg_parm.i[7]); } @@ -213,17 +213,17 @@ reset_error_mgr (j_common_ptr cinfo) { cinfo->err->num_warnings = 0; /* trace_level is not reset since it is an application-supplied parameter */ - cinfo->err->msg_code = 0; /* may be useful as a flag for "no error" */ + cinfo->err->msg_code = 0; /* may be useful as a flag for "no error" */ } /* * Fill in the standard error-handling methods in a jpeg_error_mgr object. * Typical call is: - * struct jpeg_compress_struct cinfo; - * struct jpeg_error_mgr err; + * struct jpeg_compress_struct cinfo; + * struct jpeg_error_mgr err; * - * cinfo.err = jpeg_std_error(&err); + * cinfo.err = jpeg_std_error(&err); * after which the application may override some of the methods. 
*/ @@ -236,16 +236,16 @@ jpeg_std_error (struct jpeg_error_mgr * err) err->format_message = format_message; err->reset_error_mgr = reset_error_mgr; - err->trace_level = 0; /* default = no tracing */ - err->num_warnings = 0; /* no warnings emitted yet */ - err->msg_code = 0; /* may be useful as a flag for "no error" */ + err->trace_level = 0; /* default = no tracing */ + err->num_warnings = 0; /* no warnings emitted yet */ + err->msg_code = 0; /* may be useful as a flag for "no error" */ /* Initialize message table pointers */ err->jpeg_message_table = jpeg_std_message_table; err->last_jpeg_message = (int) JMSG_LASTMSGCODE - 1; err->addon_message_table = NULL; - err->first_addon_message = 0; /* for safety */ + err->first_addon_message = 0; /* for safety */ err->last_addon_message = 0; return err; diff --git a/jerror.h b/jerror.h index 275086e67..fab59b380 100644 --- a/jerror.h +++ b/jerror.h @@ -33,7 +33,7 @@ typedef enum { -#define JMESSAGE(code,string) code , +#define JMESSAGE(code,string) code , #endif /* JMAKE_ENUM_LIST */ @@ -42,7 +42,7 @@ JMESSAGE(JMSG_NOMESSAGE, "Bogus message code %d") /* Must be first entry! 
*/ /* For maintenance convenience, list is alphabetical by message code name */ #if JPEG_LIB_VERSION < 70 JMESSAGE(JERR_ARITH_NOTIMPL, - "Sorry, arithmetic coding is not implemented") + "Sorry, arithmetic coding is not implemented") #endif JMESSAGE(JERR_BAD_ALIGN_TYPE, "ALIGN_TYPE is wrong, please fix") JMESSAGE(JERR_BAD_ALLOC_CHUNK, "MAX_ALLOC_CHUNK is wrong, please fix") @@ -55,26 +55,26 @@ JMESSAGE(JERR_BAD_DCT_COEF, "DCT coefficient out of range") JMESSAGE(JERR_BAD_DCTSIZE, "IDCT output block size %d not supported") #if JPEG_LIB_VERSION >= 70 JMESSAGE(JERR_BAD_DROP_SAMPLING, - "Component index %d: mismatching sampling ratio %d:%d, %d:%d, %c") + "Component index %d: mismatching sampling ratio %d:%d, %d:%d, %c") #endif JMESSAGE(JERR_BAD_HUFF_TABLE, "Bogus Huffman table definition") JMESSAGE(JERR_BAD_IN_COLORSPACE, "Bogus input colorspace") JMESSAGE(JERR_BAD_J_COLORSPACE, "Bogus JPEG colorspace") JMESSAGE(JERR_BAD_LENGTH, "Bogus marker length") JMESSAGE(JERR_BAD_LIB_VERSION, - "Wrong JPEG library version: library is %d, caller expects %d") + "Wrong JPEG library version: library is %d, caller expects %d") JMESSAGE(JERR_BAD_MCU_SIZE, "Sampling factors too large for interleaved scan") JMESSAGE(JERR_BAD_POOL_ID, "Invalid memory pool code %d") JMESSAGE(JERR_BAD_PRECISION, "Unsupported JPEG data precision %d") JMESSAGE(JERR_BAD_PROGRESSION, - "Invalid progressive parameters Ss=%d Se=%d Ah=%d Al=%d") + "Invalid progressive parameters Ss=%d Se=%d Ah=%d Al=%d") JMESSAGE(JERR_BAD_PROG_SCRIPT, - "Invalid progressive parameters at scan script entry %d") + "Invalid progressive parameters at scan script entry %d") JMESSAGE(JERR_BAD_SAMPLING, "Bogus sampling factors") JMESSAGE(JERR_BAD_SCAN_SCRIPT, "Invalid scan script at entry %d") JMESSAGE(JERR_BAD_STATE, "Improper call to JPEG library in state %d") JMESSAGE(JERR_BAD_STRUCT_SIZE, - "JPEG parameter struct mismatch: library thinks size is %u, caller expects %u") + "JPEG parameter struct mismatch: library thinks size is %u, 
caller expects %u") JMESSAGE(JERR_BAD_VIRTUAL_ACCESS, "Bogus virtual array access") JMESSAGE(JERR_BUFFER_SIZE, "Buffer passed to JPEG library is too small") JMESSAGE(JERR_CANT_SUSPEND, "Suspension not allowed here") @@ -98,7 +98,7 @@ JMESSAGE(JERR_IMAGE_TOO_BIG, "Maximum supported image dimension is %u pixels") JMESSAGE(JERR_INPUT_EMPTY, "Empty input file") JMESSAGE(JERR_INPUT_EOF, "Premature end of input file") JMESSAGE(JERR_MISMATCHED_QUANT_TABLE, - "Cannot transcode due to multiple use of quantization table %d") + "Cannot transcode due to multiple use of quantization table %d") JMESSAGE(JERR_MISSING_DATA, "Scan script does not transmit all data") JMESSAGE(JERR_MODE_CHANGE, "Invalid color quantization mode change") JMESSAGE(JERR_NOTIMPL, "Not implemented yet") @@ -113,7 +113,7 @@ JMESSAGE(JERR_NO_QUANT_TABLE, "Quantization table 0x%02x was not defined") JMESSAGE(JERR_NO_SOI, "Not a JPEG file: starts with 0x%02x 0x%02x") JMESSAGE(JERR_OUT_OF_MEMORY, "Insufficient memory (case %d)") JMESSAGE(JERR_QUANT_COMPONENTS, - "Cannot quantize more than %d color components") + "Cannot quantize more than %d color components") JMESSAGE(JERR_QUANT_FEW_COLORS, "Cannot quantize to fewer than %d colors") JMESSAGE(JERR_QUANT_MANY_COLORS, "Cannot quantize to more than %d colors") JMESSAGE(JERR_SOF_DUPLICATE, "Invalid JPEG file structure: two SOF markers") @@ -125,7 +125,7 @@ JMESSAGE(JERR_TFILE_CREATE, "Failed to create temporary file %s") JMESSAGE(JERR_TFILE_READ, "Read failed on temporary file") JMESSAGE(JERR_TFILE_SEEK, "Seek failed on temporary file") JMESSAGE(JERR_TFILE_WRITE, - "Write failed on temporary file --- out of disk space?") + "Write failed on temporary file --- out of disk space?") JMESSAGE(JERR_TOO_LITTLE_DATA, "Application transferred too few scanlines") JMESSAGE(JERR_UNKNOWN_MARKER, "Unsupported marker type 0x%02x") JMESSAGE(JERR_VIRTUAL_BUG, "Virtual array controller messed up") @@ -135,9 +135,9 @@ JMESSAGE(JERR_XMS_WRITE, "Write to XMS failed") 
JMESSAGE(JMSG_COPYRIGHT, JCOPYRIGHT) JMESSAGE(JMSG_VERSION, JVERSION) JMESSAGE(JTRC_16BIT_TABLES, - "Caution: quantization tables are too coarse for baseline JPEG") + "Caution: quantization tables are too coarse for baseline JPEG") JMESSAGE(JTRC_ADOBE, - "Adobe APP14 marker: version %d, flags 0x%04x 0x%04x, transform %d") + "Adobe APP14 marker: version %d, flags 0x%04x 0x%04x, transform %d") JMESSAGE(JTRC_APP0, "Unknown APP0 marker (not JFIF), length %u") JMESSAGE(JTRC_APP14, "Unknown APP14 marker (not Adobe), length %u") JMESSAGE(JTRC_DAC, "Define Arithmetic Table 0x%02x: 0x%02x") @@ -150,9 +150,9 @@ JMESSAGE(JTRC_EOI, "End Of Image") JMESSAGE(JTRC_HUFFBITS, " %3d %3d %3d %3d %3d %3d %3d %3d") JMESSAGE(JTRC_JFIF, "JFIF APP0 marker: version %d.%02d, density %dx%d %d") JMESSAGE(JTRC_JFIF_BADTHUMBNAILSIZE, - "Warning: thumbnail image size does not match data length %u") + "Warning: thumbnail image size does not match data length %u") JMESSAGE(JTRC_JFIF_EXTENSION, - "JFIF extension marker: type 0x%02x, length %u") + "JFIF extension marker: type 0x%02x, length %u") JMESSAGE(JTRC_JFIF_THUMBNAIL, " with %d x %d thumbnail image") JMESSAGE(JTRC_MISC_MARKER, "Miscellaneous marker 0x%02x, length %u") JMESSAGE(JTRC_PARMLESS_MARKER, "Unexpected marker 0x%02x") @@ -163,7 +163,7 @@ JMESSAGE(JTRC_QUANT_SELECTED, "Selected %d colors for quantization") JMESSAGE(JTRC_RECOVERY_ACTION, "At marker 0x%02x, recovery action %d") JMESSAGE(JTRC_RST, "RST%d") JMESSAGE(JTRC_SMOOTH_NOTIMPL, - "Smoothing not supported with nonstandard sampling ratios") + "Smoothing not supported with nonstandard sampling ratios") JMESSAGE(JTRC_SOF, "Start Of Frame 0x%02x: width=%u, height=%u, components=%d") JMESSAGE(JTRC_SOF_COMPONENT, " Component %d: %dhx%dv q=%d") JMESSAGE(JTRC_SOI, "Start of Image") @@ -173,13 +173,13 @@ JMESSAGE(JTRC_SOS_PARAMS, " Ss=%d, Se=%d, Ah=%d, Al=%d") JMESSAGE(JTRC_TFILE_CLOSE, "Closed temporary file %s") JMESSAGE(JTRC_TFILE_OPEN, "Opened temporary file %s") 
JMESSAGE(JTRC_THUMB_JPEG, - "JFIF extension marker: JPEG-compressed thumbnail image, length %u") + "JFIF extension marker: JPEG-compressed thumbnail image, length %u") JMESSAGE(JTRC_THUMB_PALETTE, - "JFIF extension marker: palette thumbnail image, length %u") + "JFIF extension marker: palette thumbnail image, length %u") JMESSAGE(JTRC_THUMB_RGB, - "JFIF extension marker: RGB thumbnail image, length %u") + "JFIF extension marker: RGB thumbnail image, length %u") JMESSAGE(JTRC_UNKNOWN_IDS, - "Unrecognized component IDs %d %d %d, assuming YCbCr") + "Unrecognized component IDs %d %d %d, assuming YCbCr") JMESSAGE(JTRC_XMS_CLOSE, "Freed XMS handle %u") JMESSAGE(JTRC_XMS_OPEN, "Obtained XMS handle %u") JMESSAGE(JWRN_ADOBE_XFORM, "Unknown Adobe color transform code %d") @@ -187,15 +187,15 @@ JMESSAGE(JWRN_ADOBE_XFORM, "Unknown Adobe color transform code %d") JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt JPEG data: bad arithmetic code") #endif JMESSAGE(JWRN_BOGUS_PROGRESSION, - "Inconsistent progression sequence for component %d coefficient %d") + "Inconsistent progression sequence for component %d coefficient %d") JMESSAGE(JWRN_EXTRANEOUS_DATA, - "Corrupt JPEG data: %u extraneous bytes before marker 0x%02x") + "Corrupt JPEG data: %u extraneous bytes before marker 0x%02x") JMESSAGE(JWRN_HIT_MARKER, "Corrupt JPEG data: premature end of data segment") JMESSAGE(JWRN_HUFF_BAD_CODE, "Corrupt JPEG data: bad Huffman code") JMESSAGE(JWRN_JFIF_MAJOR, "Warning: unknown JFIF revision number %d.%02d") JMESSAGE(JWRN_JPEG_EOF, "Premature end of JPEG file") JMESSAGE(JWRN_MUST_RESYNC, - "Corrupt JPEG data: found marker 0x%02x instead of RST%d") + "Corrupt JPEG data: found marker 0x%02x instead of RST%d") JMESSAGE(JWRN_NOT_SEQUENTIAL, "Invalid SOS parameters for sequential JPEG") JMESSAGE(JWRN_TOO_MUCH_DATA, "Application transferred too many scanlines") #if JPEG_LIB_VERSION < 70 @@ -255,7 +255,7 @@ JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt JPEG data: bad arithmetic code") 
strncpy((cinfo)->err->msg_parm.s, (str), JMSG_STR_PARM_MAX), \ (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo))) -#define MAKESTMT(stuff) do { stuff } while (0) +#define MAKESTMT(stuff) do { stuff } while (0) /* Nonfatal errors (we can keep going, but the data is probably corrupt) */ #define WARNMS(cinfo,code) \ @@ -286,26 +286,26 @@ JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt JPEG data: bad arithmetic code") (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl))) #define TRACEMS3(cinfo,lvl,code,p1,p2,p3) \ MAKESTMT(int * _mp = (cinfo)->err->msg_parm.i; \ - _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); \ - (cinfo)->err->msg_code = (code); \ - (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); ) + _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); \ + (cinfo)->err->msg_code = (code); \ + (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); ) #define TRACEMS4(cinfo,lvl,code,p1,p2,p3,p4) \ MAKESTMT(int * _mp = (cinfo)->err->msg_parm.i; \ - _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \ - (cinfo)->err->msg_code = (code); \ - (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); ) + _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \ + (cinfo)->err->msg_code = (code); \ + (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); ) #define TRACEMS5(cinfo,lvl,code,p1,p2,p3,p4,p5) \ MAKESTMT(int * _mp = (cinfo)->err->msg_parm.i; \ - _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \ - _mp[4] = (p5); \ - (cinfo)->err->msg_code = (code); \ - (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); ) + _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \ + _mp[4] = (p5); \ + (cinfo)->err->msg_code = (code); \ + (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); ) #define TRACEMS8(cinfo,lvl,code,p1,p2,p3,p4,p5,p6,p7,p8) \ MAKESTMT(int * _mp = (cinfo)->err->msg_parm.i; \ - _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \ - _mp[4] = (p5); _mp[5] = (p6); _mp[6] = (p7); 
_mp[7] = (p8); \ - (cinfo)->err->msg_code = (code); \ - (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); ) + _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \ + _mp[4] = (p5); _mp[5] = (p6); _mp[6] = (p7); _mp[7] = (p8); \ + (cinfo)->err->msg_code = (code); \ + (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); ) #define TRACEMSS(cinfo,lvl,code,str) \ ((cinfo)->err->msg_code = (code), \ strncpy((cinfo)->err->msg_parm.s, (str), JMSG_STR_PARM_MAX), \ diff --git a/jfdctflt.c b/jfdctflt.c index 79d7a0078..a8367c68c 100644 --- a/jfdctflt.c +++ b/jfdctflt.c @@ -37,7 +37,7 @@ #define JPEG_INTERNALS #include "jinclude.h" #include "jpeglib.h" -#include "jdct.h" /* Private declarations for DCT subsystem */ +#include "jdct.h" /* Private declarations for DCT subsystem */ #ifdef DCT_FLOAT_SUPPORTED @@ -76,24 +76,24 @@ jpeg_fdct_float (FAST_FLOAT * data) tmp5 = dataptr[2] - dataptr[5]; tmp3 = dataptr[3] + dataptr[4]; tmp4 = dataptr[3] - dataptr[4]; - + /* Even part */ - - tmp10 = tmp0 + tmp3; /* phase 2 */ + + tmp10 = tmp0 + tmp3; /* phase 2 */ tmp13 = tmp0 - tmp3; tmp11 = tmp1 + tmp2; tmp12 = tmp1 - tmp2; - + dataptr[0] = tmp10 + tmp11; /* phase 3 */ dataptr[4] = tmp10 - tmp11; - + z1 = (tmp12 + tmp13) * ((FAST_FLOAT) 0.707106781); /* c4 */ - dataptr[2] = tmp13 + z1; /* phase 5 */ + dataptr[2] = tmp13 + z1; /* phase 5 */ dataptr[6] = tmp13 - z1; - + /* Odd part */ - tmp10 = tmp4 + tmp5; /* phase 2 */ + tmp10 = tmp4 + tmp5; /* phase 2 */ tmp11 = tmp5 + tmp6; tmp12 = tmp6 + tmp7; @@ -103,15 +103,15 @@ jpeg_fdct_float (FAST_FLOAT * data) z4 = ((FAST_FLOAT) 1.306562965) * tmp12 + z5; /* c2+c6 */ z3 = tmp11 * ((FAST_FLOAT) 0.707106781); /* c4 */ - z11 = tmp7 + z3; /* phase 5 */ + z11 = tmp7 + z3; /* phase 5 */ z13 = tmp7 - z3; - dataptr[5] = z13 + z2; /* phase 6 */ + dataptr[5] = z13 + z2; /* phase 6 */ dataptr[3] = z13 - z2; dataptr[1] = z11 + z4; dataptr[7] = z11 - z4; - dataptr += DCTSIZE; /* advance pointer to next row */ + dataptr += 
DCTSIZE; /* advance pointer to next row */ } /* Pass 2: process columns. */ @@ -126,24 +126,24 @@ jpeg_fdct_float (FAST_FLOAT * data) tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5]; tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4]; tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4]; - + /* Even part */ - - tmp10 = tmp0 + tmp3; /* phase 2 */ + + tmp10 = tmp0 + tmp3; /* phase 2 */ tmp13 = tmp0 - tmp3; tmp11 = tmp1 + tmp2; tmp12 = tmp1 - tmp2; - + dataptr[DCTSIZE*0] = tmp10 + tmp11; /* phase 3 */ dataptr[DCTSIZE*4] = tmp10 - tmp11; - + z1 = (tmp12 + tmp13) * ((FAST_FLOAT) 0.707106781); /* c4 */ dataptr[DCTSIZE*2] = tmp13 + z1; /* phase 5 */ dataptr[DCTSIZE*6] = tmp13 - z1; - + /* Odd part */ - tmp10 = tmp4 + tmp5; /* phase 2 */ + tmp10 = tmp4 + tmp5; /* phase 2 */ tmp11 = tmp5 + tmp6; tmp12 = tmp6 + tmp7; @@ -153,7 +153,7 @@ jpeg_fdct_float (FAST_FLOAT * data) z4 = ((FAST_FLOAT) 1.306562965) * tmp12 + z5; /* c2+c6 */ z3 = tmp11 * ((FAST_FLOAT) 0.707106781); /* c4 */ - z11 = tmp7 + z3; /* phase 5 */ + z11 = tmp7 + z3; /* phase 5 */ z13 = tmp7 - z3; dataptr[DCTSIZE*5] = z13 + z2; /* phase 6 */ @@ -161,7 +161,7 @@ jpeg_fdct_float (FAST_FLOAT * data) dataptr[DCTSIZE*1] = z11 + z4; dataptr[DCTSIZE*7] = z11 - z4; - dataptr++; /* advance pointer to next column */ + dataptr++; /* advance pointer to next column */ } } diff --git a/jfdctfst.c b/jfdctfst.c index ccb378a3b..4936d4763 100644 --- a/jfdctfst.c +++ b/jfdctfst.c @@ -33,7 +33,7 @@ #define JPEG_INTERNALS #include "jinclude.h" #include "jpeglib.h" -#include "jdct.h" /* Private declarations for DCT subsystem */ +#include "jdct.h" /* Private declarations for DCT subsystem */ #ifdef DCT_IFAST_SUPPORTED @@ -76,10 +76,10 @@ */ #if CONST_BITS == 8 -#define FIX_0_382683433 ((INT32) 98) /* FIX(0.382683433) */ -#define FIX_0_541196100 ((INT32) 139) /* FIX(0.541196100) */ -#define FIX_0_707106781 ((INT32) 181) /* FIX(0.707106781) */ -#define FIX_1_306562965 ((INT32) 334) /* FIX(1.306562965) */ +#define FIX_0_382683433 ((INT32) 98) /* 
FIX(0.382683433) */ +#define FIX_0_541196100 ((INT32) 139) /* FIX(0.541196100) */ +#define FIX_0_707106781 ((INT32) 181) /* FIX(0.707106781) */ +#define FIX_1_306562965 ((INT32) 334) /* FIX(1.306562965) */ #else #define FIX_0_382683433 FIX(0.382683433) #define FIX_0_541196100 FIX(0.541196100) @@ -132,24 +132,24 @@ jpeg_fdct_ifast (DCTELEM * data) tmp5 = dataptr[2] - dataptr[5]; tmp3 = dataptr[3] + dataptr[4]; tmp4 = dataptr[3] - dataptr[4]; - + /* Even part */ - - tmp10 = tmp0 + tmp3; /* phase 2 */ + + tmp10 = tmp0 + tmp3; /* phase 2 */ tmp13 = tmp0 - tmp3; tmp11 = tmp1 + tmp2; tmp12 = tmp1 - tmp2; - + dataptr[0] = tmp10 + tmp11; /* phase 3 */ dataptr[4] = tmp10 - tmp11; - + z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */ - dataptr[2] = tmp13 + z1; /* phase 5 */ + dataptr[2] = tmp13 + z1; /* phase 5 */ dataptr[6] = tmp13 - z1; - + /* Odd part */ - tmp10 = tmp4 + tmp5; /* phase 2 */ + tmp10 = tmp4 + tmp5; /* phase 2 */ tmp11 = tmp5 + tmp6; tmp12 = tmp6 + tmp7; @@ -159,15 +159,15 @@ jpeg_fdct_ifast (DCTELEM * data) z4 = MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */ z3 = MULTIPLY(tmp11, FIX_0_707106781); /* c4 */ - z11 = tmp7 + z3; /* phase 5 */ + z11 = tmp7 + z3; /* phase 5 */ z13 = tmp7 - z3; - dataptr[5] = z13 + z2; /* phase 6 */ + dataptr[5] = z13 + z2; /* phase 6 */ dataptr[3] = z13 - z2; dataptr[1] = z11 + z4; dataptr[7] = z11 - z4; - dataptr += DCTSIZE; /* advance pointer to next row */ + dataptr += DCTSIZE; /* advance pointer to next row */ } /* Pass 2: process columns. 
*/ @@ -182,24 +182,24 @@ jpeg_fdct_ifast (DCTELEM * data) tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5]; tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4]; tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4]; - + /* Even part */ - - tmp10 = tmp0 + tmp3; /* phase 2 */ + + tmp10 = tmp0 + tmp3; /* phase 2 */ tmp13 = tmp0 - tmp3; tmp11 = tmp1 + tmp2; tmp12 = tmp1 - tmp2; - + dataptr[DCTSIZE*0] = tmp10 + tmp11; /* phase 3 */ dataptr[DCTSIZE*4] = tmp10 - tmp11; - + z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */ dataptr[DCTSIZE*2] = tmp13 + z1; /* phase 5 */ dataptr[DCTSIZE*6] = tmp13 - z1; - + /* Odd part */ - tmp10 = tmp4 + tmp5; /* phase 2 */ + tmp10 = tmp4 + tmp5; /* phase 2 */ tmp11 = tmp5 + tmp6; tmp12 = tmp6 + tmp7; @@ -209,7 +209,7 @@ jpeg_fdct_ifast (DCTELEM * data) z4 = MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */ z3 = MULTIPLY(tmp11, FIX_0_707106781); /* c4 */ - z11 = tmp7 + z3; /* phase 5 */ + z11 = tmp7 + z3; /* phase 5 */ z13 = tmp7 - z3; dataptr[DCTSIZE*5] = z13 + z2; /* phase 6 */ @@ -217,7 +217,7 @@ jpeg_fdct_ifast (DCTELEM * data) dataptr[DCTSIZE*1] = z11 + z4; dataptr[DCTSIZE*7] = z11 - z4; - dataptr++; /* advance pointer to next column */ + dataptr++; /* advance pointer to next column */ } } diff --git a/jfdctint.c b/jfdctint.c index 0a78b64ae..14f486cdc 100644 --- a/jfdctint.c +++ b/jfdctint.c @@ -26,7 +26,7 @@ #define JPEG_INTERNALS #include "jinclude.h" #include "jpeglib.h" -#include "jdct.h" /* Private declarations for DCT subsystem */ +#include "jdct.h" /* Private declarations for DCT subsystem */ #ifdef DCT_ISLOW_SUPPORTED @@ -79,7 +79,7 @@ #define PASS1_BITS 2 #else #define CONST_BITS 13 -#define PASS1_BITS 1 /* lose a little precision to avoid overflow */ +#define PASS1_BITS 1 /* lose a little precision to avoid overflow */ #endif /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus @@ -90,18 +90,18 @@ */ #if CONST_BITS == 13 -#define FIX_0_298631336 ((INT32) 2446) /* FIX(0.298631336) */ -#define FIX_0_390180644 
((INT32) 3196) /* FIX(0.390180644) */ -#define FIX_0_541196100 ((INT32) 4433) /* FIX(0.541196100) */ -#define FIX_0_765366865 ((INT32) 6270) /* FIX(0.765366865) */ -#define FIX_0_899976223 ((INT32) 7373) /* FIX(0.899976223) */ -#define FIX_1_175875602 ((INT32) 9633) /* FIX(1.175875602) */ -#define FIX_1_501321110 ((INT32) 12299) /* FIX(1.501321110) */ -#define FIX_1_847759065 ((INT32) 15137) /* FIX(1.847759065) */ -#define FIX_1_961570560 ((INT32) 16069) /* FIX(1.961570560) */ -#define FIX_2_053119869 ((INT32) 16819) /* FIX(2.053119869) */ -#define FIX_2_562915447 ((INT32) 20995) /* FIX(2.562915447) */ -#define FIX_3_072711026 ((INT32) 25172) /* FIX(3.072711026) */ +#define FIX_0_298631336 ((INT32) 2446) /* FIX(0.298631336) */ +#define FIX_0_390180644 ((INT32) 3196) /* FIX(0.390180644) */ +#define FIX_0_541196100 ((INT32) 4433) /* FIX(0.541196100) */ +#define FIX_0_765366865 ((INT32) 6270) /* FIX(0.765366865) */ +#define FIX_0_899976223 ((INT32) 7373) /* FIX(0.899976223) */ +#define FIX_1_175875602 ((INT32) 9633) /* FIX(1.175875602) */ +#define FIX_1_501321110 ((INT32) 12299) /* FIX(1.501321110) */ +#define FIX_1_847759065 ((INT32) 15137) /* FIX(1.847759065) */ +#define FIX_1_961570560 ((INT32) 16069) /* FIX(1.961570560) */ +#define FIX_2_053119869 ((INT32) 16819) /* FIX(2.053119869) */ +#define FIX_2_562915447 ((INT32) 20995) /* FIX(2.562915447) */ +#define FIX_3_072711026 ((INT32) 25172) /* FIX(3.072711026) */ #else #define FIX_0_298631336 FIX(0.298631336) #define FIX_0_390180644 FIX(0.390180644) @@ -160,36 +160,36 @@ jpeg_fdct_islow (DCTELEM * data) tmp5 = dataptr[2] - dataptr[5]; tmp3 = dataptr[3] + dataptr[4]; tmp4 = dataptr[3] - dataptr[4]; - + /* Even part per LL&M figure 1 --- note that published figure is faulty; * rotator "sqrt(2)*c1" should be "sqrt(2)*c6". 
*/ - + tmp10 = tmp0 + tmp3; tmp13 = tmp0 - tmp3; tmp11 = tmp1 + tmp2; tmp12 = tmp1 - tmp2; - + dataptr[0] = (DCTELEM) ((tmp10 + tmp11) << PASS1_BITS); dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS); - + z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865), - CONST_BITS-PASS1_BITS); + CONST_BITS-PASS1_BITS); dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065), - CONST_BITS-PASS1_BITS); - + CONST_BITS-PASS1_BITS); + /* Odd part per figure 8 --- note paper omits factor of sqrt(2). * cK represents cos(K*pi/16). * i0..i3 in the paper are tmp4..tmp7 here. */ - + z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; z3 = tmp4 + tmp6; z4 = tmp5 + tmp7; z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ - + tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ @@ -198,16 +198,16 @@ jpeg_fdct_islow (DCTELEM * data) z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ - + z3 += z5; z4 += z5; - + dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); - - dataptr += DCTSIZE; /* advance pointer to next row */ + + dataptr += DCTSIZE; /* advance pointer to next row */ } /* Pass 2: process columns. 
@@ -225,36 +225,36 @@ jpeg_fdct_islow (DCTELEM * data) tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5]; tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4]; tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4]; - + /* Even part per LL&M figure 1 --- note that published figure is faulty; * rotator "sqrt(2)*c1" should be "sqrt(2)*c6". */ - + tmp10 = tmp0 + tmp3; tmp13 = tmp0 - tmp3; tmp11 = tmp1 + tmp2; tmp12 = tmp1 - tmp2; - + dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS); dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS); - + z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865), - CONST_BITS+PASS1_BITS); + CONST_BITS+PASS1_BITS); dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065), - CONST_BITS+PASS1_BITS); - + CONST_BITS+PASS1_BITS); + /* Odd part per figure 8 --- note paper omits factor of sqrt(2). * cK represents cos(K*pi/16). * i0..i3 in the paper are tmp4..tmp7 here. 
*/ - + z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; z3 = tmp4 + tmp6; z4 = tmp5 + tmp7; z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ - + tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ @@ -263,20 +263,20 @@ jpeg_fdct_islow (DCTELEM * data) z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ - + z3 += z5; z4 += z5; - + dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, - CONST_BITS+PASS1_BITS); + CONST_BITS+PASS1_BITS); dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, - CONST_BITS+PASS1_BITS); + CONST_BITS+PASS1_BITS); dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, - CONST_BITS+PASS1_BITS); + CONST_BITS+PASS1_BITS); dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, - CONST_BITS+PASS1_BITS); - - dataptr++; /* advance pointer to next column */ + CONST_BITS+PASS1_BITS); + + dataptr++; /* advance pointer to next column */ } } diff --git a/jidctflt.c b/jidctflt.c index 0188ce3df..c172ea1fe 100644 --- a/jidctflt.c +++ b/jidctflt.c @@ -39,7 +39,7 @@ #define JPEG_INTERNALS #include "jinclude.h" #include "jpeglib.h" -#include "jdct.h" /* Private declarations for DCT subsystem */ +#include "jdct.h" /* Private declarations for DCT subsystem */ #ifdef DCT_FLOAT_SUPPORTED @@ -66,8 +66,8 @@ GLOBAL(void) jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, - JSAMPARRAY output_buf, JDIMENSION output_col) + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) { FAST_FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; FAST_FLOAT tmp10, tmp11, tmp12, tmp13; @@ -95,14 +95,14 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr, * With typical images and quantization tables, half or 
more of the * column DCT calculations can be simplified this way. */ - + if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 && - inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 && - inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 && - inptr[DCTSIZE*7] == 0) { + inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 && + inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 && + inptr[DCTSIZE*7] == 0) { /* AC terms all zero */ FAST_FLOAT dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); - + wsptr[DCTSIZE*0] = dcval; wsptr[DCTSIZE*1] = dcval; wsptr[DCTSIZE*2] = dcval; @@ -111,13 +111,13 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr, wsptr[DCTSIZE*5] = dcval; wsptr[DCTSIZE*6] = dcval; wsptr[DCTSIZE*7] = dcval; - - inptr++; /* advance pointers to next column */ + + inptr++; /* advance pointers to next column */ quantptr++; wsptr++; continue; } - + /* Even part */ tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); @@ -125,17 +125,17 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr, tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); - tmp10 = tmp0 + tmp2; /* phase 3 */ + tmp10 = tmp0 + tmp2; /* phase 3 */ tmp11 = tmp0 - tmp2; - tmp13 = tmp1 + tmp3; /* phases 5-3 */ + tmp13 = tmp1 + tmp3; /* phases 5-3 */ tmp12 = (tmp1 - tmp3) * ((FAST_FLOAT) 1.414213562) - tmp13; /* 2*c4 */ - tmp0 = tmp10 + tmp13; /* phase 2 */ + tmp0 = tmp10 + tmp13; /* phase 2 */ tmp3 = tmp10 - tmp13; tmp1 = tmp11 + tmp12; tmp2 = tmp11 - tmp12; - + /* Odd part */ tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); @@ -143,19 +143,19 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr, tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); - z13 = tmp6 + tmp5; /* phase 6 */ + z13 = tmp6 + tmp5; /* phase 6 */ z10 = tmp6 - tmp5; z11 = tmp4 + tmp7; z12 = tmp4 - tmp7; - tmp7 = z11 + z13; /* phase 5 */ + 
tmp7 = z11 + z13; /* phase 5 */ tmp11 = (z11 - z13) * ((FAST_FLOAT) 1.414213562); /* 2*c4 */ z5 = (z10 + z12) * ((FAST_FLOAT) 1.847759065); /* 2*c2 */ tmp10 = ((FAST_FLOAT) 1.082392200) * z12 - z5; /* 2*(c2-c6) */ tmp12 = ((FAST_FLOAT) -2.613125930) * z10 + z5; /* -2*(c2+c6) */ - tmp6 = tmp12 - tmp7; /* phase 2 */ + tmp6 = tmp12 - tmp7; /* phase 2 */ tmp5 = tmp11 - tmp6; tmp4 = tmp10 + tmp5; @@ -168,11 +168,11 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr, wsptr[DCTSIZE*4] = tmp3 + tmp4; wsptr[DCTSIZE*3] = tmp3 - tmp4; - inptr++; /* advance pointers to next column */ + inptr++; /* advance pointers to next column */ quantptr++; wsptr++; } - + /* Pass 2: process rows from work array, store into output array. */ /* Note that we must descale the results by a factor of 8 == 2**3. */ @@ -184,7 +184,7 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr, * the simplification applies less often (typically 5% to 10% of the time). * And testing floats for zero is relatively expensive, so we don't bother. 
*/ - + /* Even part */ tmp10 = wsptr[0] + wsptr[4]; @@ -219,23 +219,23 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr, /* Final output stage: scale down by a factor of 8 and range-limit */ outptr[0] = range_limit[(int) DESCALE((INT32) (tmp0 + tmp7), 3) - & RANGE_MASK]; + & RANGE_MASK]; outptr[7] = range_limit[(int) DESCALE((INT32) (tmp0 - tmp7), 3) - & RANGE_MASK]; + & RANGE_MASK]; outptr[1] = range_limit[(int) DESCALE((INT32) (tmp1 + tmp6), 3) - & RANGE_MASK]; + & RANGE_MASK]; outptr[6] = range_limit[(int) DESCALE((INT32) (tmp1 - tmp6), 3) - & RANGE_MASK]; + & RANGE_MASK]; outptr[2] = range_limit[(int) DESCALE((INT32) (tmp2 + tmp5), 3) - & RANGE_MASK]; + & RANGE_MASK]; outptr[5] = range_limit[(int) DESCALE((INT32) (tmp2 - tmp5), 3) - & RANGE_MASK]; + & RANGE_MASK]; outptr[4] = range_limit[(int) DESCALE((INT32) (tmp3 + tmp4), 3) - & RANGE_MASK]; + & RANGE_MASK]; outptr[3] = range_limit[(int) DESCALE((INT32) (tmp3 - tmp4), 3) - & RANGE_MASK]; - - wsptr += DCTSIZE; /* advance pointer to next row */ + & RANGE_MASK]; + + wsptr += DCTSIZE; /* advance pointer to next row */ } } diff --git a/jidctfst.c b/jidctfst.c index dba4216fb..cae22b9a8 100644 --- a/jidctfst.c +++ b/jidctfst.c @@ -35,7 +35,7 @@ #define JPEG_INTERNALS #include "jinclude.h" #include "jpeglib.h" -#include "jdct.h" /* Private declarations for DCT subsystem */ +#include "jdct.h" /* Private declarations for DCT subsystem */ #ifdef DCT_IFAST_SUPPORTED @@ -78,7 +78,7 @@ #define PASS1_BITS 2 #else #define CONST_BITS 8 -#define PASS1_BITS 1 /* lose a little precision to avoid overflow */ +#define PASS1_BITS 1 /* lose a little precision to avoid overflow */ #endif /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus @@ -89,10 +89,10 @@ */ #if CONST_BITS == 8 -#define FIX_1_082392200 ((INT32) 277) /* FIX(1.082392200) */ -#define FIX_1_414213562 ((INT32) 362) /* FIX(1.414213562) */ -#define FIX_1_847759065 ((INT32) 473) /* FIX(1.847759065) */ -#define FIX_2_613125930 
((INT32) 669) /* FIX(2.613125930) */ +#define FIX_1_082392200 ((INT32) 277) /* FIX(1.082392200) */ +#define FIX_1_414213562 ((INT32) 362) /* FIX(1.414213562) */ +#define FIX_1_847759065 ((INT32) 473) /* FIX(1.847759065) */ +#define FIX_2_613125930 ((INT32) 669) /* FIX(2.613125930) */ #else #define FIX_1_082392200 FIX(1.082392200) #define FIX_1_414213562 FIX(1.414213562) @@ -129,7 +129,7 @@ #define DEQUANTIZE(coef,quantval) (((IFAST_MULT_TYPE) (coef)) * (quantval)) #else #define DEQUANTIZE(coef,quantval) \ - DESCALE((coef)*(quantval), IFAST_SCALE_BITS-PASS1_BITS) + DESCALE((coef)*(quantval), IFAST_SCALE_BITS-PASS1_BITS) #endif @@ -138,11 +138,11 @@ */ #ifdef RIGHT_SHIFT_IS_UNSIGNED -#define ISHIFT_TEMPS DCTELEM ishift_temp; +#define ISHIFT_TEMPS DCTELEM ishift_temp; #if BITS_IN_JSAMPLE == 8 -#define DCTELEMBITS 16 /* DCTELEM may be 16 or 32 bits */ +#define DCTELEMBITS 16 /* DCTELEM may be 16 or 32 bits */ #else -#define DCTELEMBITS 32 /* DCTELEM must be 32 bits */ +#define DCTELEMBITS 32 /* DCTELEM must be 32 bits */ #endif #define IRIGHT_SHIFT(x,shft) \ ((ishift_temp = (x)) < 0 ? 
\ @@ -150,7 +150,7 @@ (ishift_temp >> (shft))) #else #define ISHIFT_TEMPS -#define IRIGHT_SHIFT(x,shft) ((x) >> (shft)) +#define IRIGHT_SHIFT(x,shft) ((x) >> (shft)) #endif #ifdef USE_ACCURATE_ROUNDING @@ -166,8 +166,8 @@ GLOBAL(void) jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, - JSAMPARRAY output_buf, JDIMENSION output_col) + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) { DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; DCTELEM tmp10, tmp11, tmp12, tmp13; @@ -178,9 +178,9 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr, JSAMPROW outptr; JSAMPLE *range_limit = IDCT_range_limit(cinfo); int ctr; - int workspace[DCTSIZE2]; /* buffers data between passes */ - SHIFT_TEMPS /* for DESCALE */ - ISHIFT_TEMPS /* for IDESCALE */ + int workspace[DCTSIZE2]; /* buffers data between passes */ + SHIFT_TEMPS /* for DESCALE */ + ISHIFT_TEMPS /* for IDESCALE */ /* Pass 1: process columns from input, store into work array. */ @@ -196,11 +196,11 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr, * With typical images and quantization tables, half or more of the * column DCT calculations can be simplified this way. 
*/ - + if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 && - inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 && - inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 && - inptr[DCTSIZE*7] == 0) { + inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 && + inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 && + inptr[DCTSIZE*7] == 0) { /* AC terms all zero */ int dcval = (int) DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); @@ -212,13 +212,13 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr, wsptr[DCTSIZE*5] = dcval; wsptr[DCTSIZE*6] = dcval; wsptr[DCTSIZE*7] = dcval; - - inptr++; /* advance pointers to next column */ + + inptr++; /* advance pointers to next column */ quantptr++; wsptr++; continue; } - + /* Even part */ tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); @@ -226,17 +226,17 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr, tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); - tmp10 = tmp0 + tmp2; /* phase 3 */ + tmp10 = tmp0 + tmp2; /* phase 3 */ tmp11 = tmp0 - tmp2; - tmp13 = tmp1 + tmp3; /* phases 5-3 */ + tmp13 = tmp1 + tmp3; /* phases 5-3 */ tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */ - tmp0 = tmp10 + tmp13; /* phase 2 */ + tmp0 = tmp10 + tmp13; /* phase 2 */ tmp3 = tmp10 - tmp13; tmp1 = tmp11 + tmp12; tmp2 = tmp11 - tmp12; - + /* Odd part */ tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); @@ -244,19 +244,19 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr, tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); - z13 = tmp6 + tmp5; /* phase 6 */ + z13 = tmp6 + tmp5; /* phase 6 */ z10 = tmp6 - tmp5; z11 = tmp4 + tmp7; z12 = tmp4 - tmp7; - tmp7 = z11 + z13; /* phase 5 */ + tmp7 = z11 + z13; /* phase 5 */ tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */ z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */ tmp10 = 
MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */ tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; /* -2*(c2+c6) */ - tmp6 = tmp12 - tmp7; /* phase 2 */ + tmp6 = tmp12 - tmp7; /* phase 2 */ tmp5 = tmp11 - tmp6; tmp4 = tmp10 + tmp5; @@ -269,11 +269,11 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr, wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4); wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4); - inptr++; /* advance pointers to next column */ + inptr++; /* advance pointers to next column */ quantptr++; wsptr++; } - + /* Pass 2: process rows from work array, store into output array. */ /* Note that we must descale the results by a factor of 8 == 2**3, */ /* and also undo the PASS1_BITS scaling. */ @@ -288,14 +288,14 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr, * test takes more time than it's worth. In that case this section * may be commented out. */ - + #ifndef NO_ZERO_ROW_TEST if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 && - wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) { + wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) { /* AC terms all zero */ JSAMPLE dcval = range_limit[IDESCALE(wsptr[0], PASS1_BITS+3) - & RANGE_MASK]; - + & RANGE_MASK]; + outptr[0] = dcval; outptr[1] = dcval; outptr[2] = dcval; @@ -305,11 +305,11 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr, outptr[6] = dcval; outptr[7] = dcval; - wsptr += DCTSIZE; /* advance pointer to next row */ + wsptr += DCTSIZE; /* advance pointer to next row */ continue; } #endif - + /* Even part */ tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]); @@ -317,7 +317,7 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr, tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562) - - tmp13; + - tmp13; tmp0 = tmp10 + tmp13; tmp3 = tmp10 - tmp13; @@ -331,37 +331,37 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr, 
z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; - tmp7 = z11 + z13; /* phase 5 */ + tmp7 = z11 + z13; /* phase 5 */ tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */ z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */ tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */ tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; /* -2*(c2+c6) */ - tmp6 = tmp12 - tmp7; /* phase 2 */ + tmp6 = tmp12 - tmp7; /* phase 2 */ tmp5 = tmp11 - tmp6; tmp4 = tmp10 + tmp5; /* Final output stage: scale down by a factor of 8 and range-limit */ outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) - & RANGE_MASK]; + & RANGE_MASK]; outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) - & RANGE_MASK]; + & RANGE_MASK]; outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) - & RANGE_MASK]; + & RANGE_MASK]; outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) - & RANGE_MASK]; + & RANGE_MASK]; outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) - & RANGE_MASK]; + & RANGE_MASK]; outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) - & RANGE_MASK]; + & RANGE_MASK]; outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) - & RANGE_MASK]; + & RANGE_MASK]; outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) - & RANGE_MASK]; + & RANGE_MASK]; - wsptr += DCTSIZE; /* advance pointer to next row */ + wsptr += DCTSIZE; /* advance pointer to next row */ } } diff --git a/jidctint.c b/jidctint.c index 77d812153..688fd2243 100644 --- a/jidctint.c +++ b/jidctint.c @@ -50,7 +50,7 @@ #define JPEG_INTERNALS #include "jinclude.h" #include "jpeglib.h" -#include "jdct.h" /* Private declarations for DCT subsystem */ +#include "jdct.h" /* Private declarations for DCT subsystem */ #ifdef DCT_ISLOW_SUPPORTED @@ -101,7 +101,7 @@ #define PASS1_BITS 2 #else #define CONST_BITS 13 -#define PASS1_BITS 1 /* lose a little precision to avoid overflow */ +#define PASS1_BITS 1 /* lose a little precision to avoid overflow 
*/ #endif /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus @@ -112,18 +112,18 @@ */ #if CONST_BITS == 13 -#define FIX_0_298631336 ((INT32) 2446) /* FIX(0.298631336) */ -#define FIX_0_390180644 ((INT32) 3196) /* FIX(0.390180644) */ -#define FIX_0_541196100 ((INT32) 4433) /* FIX(0.541196100) */ -#define FIX_0_765366865 ((INT32) 6270) /* FIX(0.765366865) */ -#define FIX_0_899976223 ((INT32) 7373) /* FIX(0.899976223) */ -#define FIX_1_175875602 ((INT32) 9633) /* FIX(1.175875602) */ -#define FIX_1_501321110 ((INT32) 12299) /* FIX(1.501321110) */ -#define FIX_1_847759065 ((INT32) 15137) /* FIX(1.847759065) */ -#define FIX_1_961570560 ((INT32) 16069) /* FIX(1.961570560) */ -#define FIX_2_053119869 ((INT32) 16819) /* FIX(2.053119869) */ -#define FIX_2_562915447 ((INT32) 20995) /* FIX(2.562915447) */ -#define FIX_3_072711026 ((INT32) 25172) /* FIX(3.072711026) */ +#define FIX_0_298631336 ((INT32) 2446) /* FIX(0.298631336) */ +#define FIX_0_390180644 ((INT32) 3196) /* FIX(0.390180644) */ +#define FIX_0_541196100 ((INT32) 4433) /* FIX(0.541196100) */ +#define FIX_0_765366865 ((INT32) 6270) /* FIX(0.765366865) */ +#define FIX_0_899976223 ((INT32) 7373) /* FIX(0.899976223) */ +#define FIX_1_175875602 ((INT32) 9633) /* FIX(1.175875602) */ +#define FIX_1_501321110 ((INT32) 12299) /* FIX(1.501321110) */ +#define FIX_1_847759065 ((INT32) 15137) /* FIX(1.847759065) */ +#define FIX_1_961570560 ((INT32) 16069) /* FIX(1.961570560) */ +#define FIX_2_053119869 ((INT32) 16819) /* FIX(2.053119869) */ +#define FIX_2_562915447 ((INT32) 20995) /* FIX(2.562915447) */ +#define FIX_3_072711026 ((INT32) 25172) /* FIX(3.072711026) */ #else #define FIX_0_298631336 FIX(0.298631336) #define FIX_0_390180644 FIX(0.390180644) @@ -168,8 +168,8 @@ GLOBAL(void) jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, - JSAMPARRAY output_buf, JDIMENSION output_col) + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) { INT32 
tmp0, tmp1, tmp2, tmp3; INT32 tmp10, tmp11, tmp12, tmp13; @@ -180,7 +180,7 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr, JSAMPROW outptr; JSAMPLE *range_limit = IDCT_range_limit(cinfo); int ctr; - int workspace[DCTSIZE2]; /* buffers data between passes */ + int workspace[DCTSIZE2]; /* buffers data between passes */ SHIFT_TEMPS /* Pass 1: process columns from input, store into work array. */ @@ -199,14 +199,14 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr, * With typical images and quantization tables, half or more of the * column DCT calculations can be simplified this way. */ - + if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 && - inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 && - inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 && - inptr[DCTSIZE*7] == 0) { + inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 && + inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 && + inptr[DCTSIZE*7] == 0) { /* AC terms all zero */ int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS; - + wsptr[DCTSIZE*0] = dcval; wsptr[DCTSIZE*1] = dcval; wsptr[DCTSIZE*2] = dcval; @@ -215,49 +215,49 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr, wsptr[DCTSIZE*5] = dcval; wsptr[DCTSIZE*6] = dcval; wsptr[DCTSIZE*7] = dcval; - - inptr++; /* advance pointers to next column */ + + inptr++; /* advance pointers to next column */ quantptr++; wsptr++; continue; } - + /* Even part: reverse the even part of the forward DCT. */ /* The rotator is sqrt(2)*c(-6). 
*/ - + z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); - + z1 = MULTIPLY(z2 + z3, FIX_0_541196100); tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); - + z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); tmp0 = (z2 + z3) << CONST_BITS; tmp1 = (z2 - z3) << CONST_BITS; - + tmp10 = tmp0 + tmp3; tmp13 = tmp0 - tmp3; tmp11 = tmp1 + tmp2; tmp12 = tmp1 - tmp2; - + /* Odd part per figure 8; the matrix is unitary and hence its * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. */ - + tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); - + z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; z3 = tmp0 + tmp2; z4 = tmp1 + tmp3; z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ - + tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ @@ -266,17 +266,17 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr, z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ - + z3 += z5; z4 += z5; - + tmp0 += z1 + z3; tmp1 += z2 + z4; tmp2 += z2 + z3; tmp3 += z1 + z4; - + /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ - + wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS); wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS); wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS); @@ -285,12 +285,12 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr, 
wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS); wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS); wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS); - - inptr++; /* advance pointers to next column */ + + inptr++; /* advance pointers to next column */ quantptr++; wsptr++; } - + /* Pass 2: process rows from work array, store into output array. */ /* Note that we must descale the results by a factor of 8 == 2**3, */ /* and also undo the PASS1_BITS scaling. */ @@ -305,14 +305,14 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr, * test takes more time than it's worth. In that case this section * may be commented out. */ - + #ifndef NO_ZERO_ROW_TEST if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 && - wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) { + wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) { /* AC terms all zero */ JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3) - & RANGE_MASK]; - + & RANGE_MASK]; + outptr[0] = dcval; outptr[1] = dcval; outptr[2] = dcval; @@ -322,44 +322,44 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr, outptr[6] = dcval; outptr[7] = dcval; - wsptr += DCTSIZE; /* advance pointer to next row */ + wsptr += DCTSIZE; /* advance pointer to next row */ continue; } #endif - + /* Even part: reverse the even part of the forward DCT. */ /* The rotator is sqrt(2)*c(-6). */ - + z2 = (INT32) wsptr[2]; z3 = (INT32) wsptr[6]; - + z1 = MULTIPLY(z2 + z3, FIX_0_541196100); tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); - + tmp0 = ((INT32) wsptr[0] + (INT32) wsptr[4]) << CONST_BITS; tmp1 = ((INT32) wsptr[0] - (INT32) wsptr[4]) << CONST_BITS; - + tmp10 = tmp0 + tmp3; tmp13 = tmp0 - tmp3; tmp11 = tmp1 + tmp2; tmp12 = tmp1 - tmp2; - + /* Odd part per figure 8; the matrix is unitary and hence its * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. 
*/ - + tmp0 = (INT32) wsptr[7]; tmp1 = (INT32) wsptr[5]; tmp2 = (INT32) wsptr[3]; tmp3 = (INT32) wsptr[1]; - + z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; z3 = tmp0 + tmp2; z4 = tmp1 + tmp3; z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ - + tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ @@ -368,43 +368,43 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr, z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ - + z3 += z5; z4 += z5; - + tmp0 += z1 + z3; tmp1 += z2 + z4; tmp2 += z2 + z3; tmp3 += z1 + z4; - + /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ - + outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp3, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[7] = range_limit[(int) DESCALE(tmp10 - tmp3, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[1] = range_limit[(int) DESCALE(tmp11 + tmp2, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[6] = range_limit[(int) DESCALE(tmp11 - tmp2, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[2] = range_limit[(int) DESCALE(tmp12 + tmp1, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[5] = range_limit[(int) DESCALE(tmp12 - tmp1, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[3] = range_limit[(int) DESCALE(tmp13 + tmp0, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[4] = range_limit[(int) DESCALE(tmp13 - tmp0, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; - - wsptr += 
DCTSIZE; /* advance pointer to next row */ + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; + + wsptr += DCTSIZE; /* advance pointer to next row */ } } @@ -421,8 +421,8 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr, GLOBAL(void) jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, - JSAMPARRAY output_buf, JDIMENSION output_col) + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) { INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12, tmp13; INT32 z1, z2, z3; @@ -432,7 +432,7 @@ jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info * compptr, JSAMPROW outptr; JSAMPLE *range_limit = IDCT_range_limit(cinfo); int ctr; - int workspace[7*7]; /* buffers data between passes */ + int workspace[7*7]; /* buffers data between passes */ SHIFT_TEMPS /* Pass 1: process columns from input, store into work array. */ @@ -534,28 +534,28 @@ jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info * compptr, /* Final output stage */ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & 
RANGE_MASK]; - wsptr += 7; /* advance pointer to next row */ + wsptr += 7; /* advance pointer to next row */ } } @@ -570,8 +570,8 @@ jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info * compptr, GLOBAL(void) jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, - JSAMPARRAY output_buf, JDIMENSION output_col) + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) { INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12; INT32 z1, z2, z3; @@ -581,7 +581,7 @@ jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr, JSAMPROW outptr; JSAMPLE *range_limit = IDCT_range_limit(cinfo); int ctr; - int workspace[6*6]; /* buffers data between passes */ + int workspace[6*6]; /* buffers data between passes */ SHIFT_TEMPS /* Pass 1: process columns from input, store into work array. */ @@ -658,25 +658,25 @@ jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr, /* Final output stage */ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; - wsptr += 6; /* advance pointer to next row */ + wsptr += 6; /* advance pointer to next row */ } } @@ -691,8 +691,8 @@ jpeg_idct_6x6 (j_decompress_ptr cinfo, 
jpeg_component_info * compptr, GLOBAL(void) jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, - JSAMPARRAY output_buf, JDIMENSION output_col) + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) { INT32 tmp0, tmp1, tmp10, tmp11, tmp12; INT32 z1, z2, z3; @@ -702,7 +702,7 @@ jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info * compptr, JSAMPROW outptr; JSAMPLE *range_limit = IDCT_range_limit(cinfo); int ctr; - int workspace[5*5]; /* buffers data between passes */ + int workspace[5*5]; /* buffers data between passes */ SHIFT_TEMPS /* Pass 1: process columns from input, store into work array. */ @@ -776,22 +776,22 @@ jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info * compptr, /* Final output stage */ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; - wsptr += 5; /* advance pointer to next row */ + wsptr += 5; /* advance pointer to next row */ } } @@ -806,8 +806,8 @@ jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info * compptr, GLOBAL(void) jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, - JSAMPARRAY output_buf, JDIMENSION output_col) + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) { INT32 tmp0, tmp2, tmp10, tmp12; JCOEFPTR inptr; @@ -816,7 +816,7 @@ jpeg_idct_3x3 
(j_decompress_ptr cinfo, jpeg_component_info * compptr, JSAMPROW outptr; JSAMPLE *range_limit = IDCT_range_limit(cinfo); int ctr; - int workspace[3*3]; /* buffers data between passes */ + int workspace[3*3]; /* buffers data between passes */ SHIFT_TEMPS /* Pass 1: process columns from input, store into work array. */ @@ -872,16 +872,16 @@ jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info * compptr, /* Final output stage */ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; - wsptr += 3; /* advance pointer to next row */ + wsptr += 3; /* advance pointer to next row */ } } @@ -896,8 +896,8 @@ jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info * compptr, GLOBAL(void) jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, - JSAMPARRAY output_buf, JDIMENSION output_col) + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) { INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13, tmp14; INT32 z1, z2, z3, z4; @@ -907,7 +907,7 @@ jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info * compptr, JSAMPROW outptr; JSAMPLE *range_limit = IDCT_range_limit(cinfo); int ctr; - int workspace[8*9]; /* buffers data between passes */ + int workspace[8*9]; /* buffers data between passes */ SHIFT_TEMPS /* Pass 1: process columns from input, store into work array. 
*/ @@ -1025,34 +1025,34 @@ jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info * compptr, /* Final output stage */ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp3, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp3, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp14, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; - wsptr += 8; /* advance pointer to next row */ + wsptr += 8; /* advance pointer to next row */ } } @@ -1067,8 +1067,8 @@ jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info * compptr, GLOBAL(void) jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, - JSAMPARRAY output_buf, JDIMENSION output_col) + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) { INT32 tmp10, tmp11, tmp12, tmp13, tmp14; INT32 tmp20, tmp21, tmp22, tmp23, tmp24; @@ -1079,7 +1079,7 @@ jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info * 
compptr, JSAMPROW outptr; JSAMPLE *range_limit = IDCT_range_limit(cinfo); int ctr; - int workspace[8*10]; /* buffers data between passes */ + int workspace[8*10]; /* buffers data between passes */ SHIFT_TEMPS /* Pass 1: process columns from input, store into work array. */ @@ -1101,7 +1101,7 @@ jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info * compptr, tmp11 = z3 - z2; tmp22 = RIGHT_SHIFT(z3 - ((z1 - z2) << 1), /* c0 = (c4-c8)*2 */ - CONST_BITS-PASS1_BITS); + CONST_BITS-PASS1_BITS); z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); @@ -1217,37 +1217,37 @@ jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info * compptr, /* Final output stage */ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + 
CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; - wsptr += 8; /* advance pointer to next row */ + wsptr += 8; /* advance pointer to next row */ } } @@ -1262,8 +1262,8 @@ jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info * compptr, GLOBAL(void) jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, - JSAMPARRAY output_buf, JDIMENSION output_col) + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) { INT32 tmp10, tmp11, tmp12, tmp13, tmp14; INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25; @@ -1274,7 +1274,7 @@ jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info * compptr, JSAMPROW outptr; JSAMPLE *range_limit = IDCT_range_limit(cinfo); int ctr; - int workspace[8*11]; /* buffers data between passes */ + int workspace[8*11]; /* buffers data between passes */ SHIFT_TEMPS /* Pass 1: process columns from input, store into work array. 
*/ @@ -1301,13 +1301,13 @@ jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info * compptr, z4 -= z2; tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976)); /* c2 */ tmp21 = tmp20 + tmp23 + tmp25 - - MULTIPLY(z2, FIX(1.821790775)); /* c2+c4+c10-c6 */ + MULTIPLY(z2, FIX(1.821790775)); /* c2+c4+c10-c6 */ tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */ tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */ tmp24 += tmp25; tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120)); /* c8+c10 */ tmp24 += MULTIPLY(z2, FIX(1.944413522)) - /* c2+c8 */ - MULTIPLY(z1, FIX(1.390975730)); /* c4+c10 */ + MULTIPLY(z1, FIX(1.390975730)); /* c4+c10 */ tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562)); /* c0 */ /* Odd part */ @@ -1323,7 +1323,7 @@ jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info * compptr, tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295)); /* c5-c9 */ tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */ tmp10 = tmp11 + tmp12 + tmp13 - - MULTIPLY(z1, FIX(0.923107866)); /* c7+c5+c3-c1-2*c9 */ + MULTIPLY(z1, FIX(0.923107866)); /* c7+c5+c3-c1-2*c9 */ z1 = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */ tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588)); /* c1+c7+3*c9-c3 */ tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623)); /* c3+c5-c7-c9 */ @@ -1331,8 +1331,8 @@ jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info * compptr, tmp11 += z1; tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632)); /* c1+c5+c9-c7 */ tmp14 += MULTIPLY(z2, - FIX(1.467221301)) + /* -(c5+c9) */ - MULTIPLY(z3, FIX(1.001388905)) - /* c1-c9 */ - MULTIPLY(z4, FIX(1.684843907)); /* c3+c9 */ + MULTIPLY(z3, FIX(1.001388905)) - /* c1-c9 */ + MULTIPLY(z4, FIX(1.684843907)); /* c3+c9 */ /* Final output stage */ @@ -1372,13 +1372,13 @@ jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info * compptr, z4 -= z2; tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976)); /* c2 */ tmp21 = tmp20 + tmp23 + tmp25 - - MULTIPLY(z2, FIX(1.821790775)); /* c2+c4+c10-c6 */ + MULTIPLY(z2, FIX(1.821790775)); 
/* c2+c4+c10-c6 */ tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */ tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */ tmp24 += tmp25; tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120)); /* c8+c10 */ tmp24 += MULTIPLY(z2, FIX(1.944413522)) - /* c2+c8 */ - MULTIPLY(z1, FIX(1.390975730)); /* c4+c10 */ + MULTIPLY(z1, FIX(1.390975730)); /* c4+c10 */ tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562)); /* c0 */ /* Odd part */ @@ -1394,7 +1394,7 @@ jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info * compptr, tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295)); /* c5-c9 */ tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */ tmp10 = tmp11 + tmp12 + tmp13 - - MULTIPLY(z1, FIX(0.923107866)); /* c7+c5+c3-c1-2*c9 */ + MULTIPLY(z1, FIX(0.923107866)); /* c7+c5+c3-c1-2*c9 */ z1 = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */ tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588)); /* c1+c7+3*c9-c3 */ tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623)); /* c3+c5-c7-c9 */ @@ -1402,46 +1402,46 @@ jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info * compptr, tmp11 += z1; tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632)); /* c1+c5+c9-c7 */ tmp14 += MULTIPLY(z2, - FIX(1.467221301)) + /* -(c5+c9) */ - MULTIPLY(z3, FIX(1.001388905)) - /* c1-c9 */ - MULTIPLY(z4, FIX(1.684843907)); /* c3+c9 */ + MULTIPLY(z3, FIX(1.001388905)) - /* c1-c9 */ + MULTIPLY(z4, FIX(1.684843907)); /* c3+c9 */ /* Final output stage */ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) 
+ & RANGE_MASK]; outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; - wsptr += 8; /* advance pointer to next row */ + wsptr += 8; /* advance pointer to next row */ } } @@ -1456,8 +1456,8 @@ jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info * compptr, GLOBAL(void) jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, - JSAMPARRAY output_buf, JDIMENSION output_col) + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) { INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25; @@ -1468,7 +1468,7 @@ jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr, JSAMPROW outptr; JSAMPLE *range_limit = IDCT_range_limit(cinfo); int ctr; - int workspace[8*12]; /* buffers data between passes */ + int workspace[8*12]; /* buffers data between passes */ SHIFT_TEMPS /* Pass 1: process columns from input, store into work array. 
*/ @@ -1529,7 +1529,7 @@ jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr, tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */ tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */ tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) - /* c7-c11 */ - MULTIPLY(z4, FIX(1.982889723)); /* c5+c7 */ + MULTIPLY(z4, FIX(1.982889723)); /* c5+c7 */ z1 -= z4; z2 -= z3; @@ -1610,7 +1610,7 @@ jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr, tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */ tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */ tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) - /* c7-c11 */ - MULTIPLY(z4, FIX(1.982889723)); /* c5+c7 */ + MULTIPLY(z4, FIX(1.982889723)); /* c5+c7 */ z1 -= z4; z2 -= z3; @@ -1621,43 +1621,43 @@ jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr, /* Final output stage */ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[8] = 
range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; - wsptr += 8; /* advance pointer to next row */ + wsptr += 8; /* advance pointer to next row */ } } @@ -1672,8 +1672,8 @@ jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr, GLOBAL(void) jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, - JSAMPARRAY output_buf, JDIMENSION output_col) + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) { INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26; @@ -1684,7 +1684,7 @@ jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info * compptr, JSAMPROW outptr; JSAMPLE *range_limit = IDCT_range_limit(cinfo); int ctr; - int workspace[8*13]; /* buffers data between passes */ + int workspace[8*13]; /* buffers data between passes */ SHIFT_TEMPS /* Pass 1: process columns from input, store into work array. 
*/ @@ -1739,7 +1739,7 @@ jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info * compptr, tmp15 = z1 + z4; tmp13 = MULTIPLY(tmp15, FIX(0.937797057)); /* c7 */ tmp10 = tmp11 + tmp12 + tmp13 - - MULTIPLY(z1, FIX(2.020082300)); /* c7+c5+c3-c1 */ + MULTIPLY(z1, FIX(2.020082300)); /* c7+c5+c3-c1 */ tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458)); /* -c11 */ tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */ tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */ @@ -1751,11 +1751,11 @@ jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info * compptr, tmp13 += tmp14; tmp15 = MULTIPLY(tmp15, FIX(0.338443458)); /* c11 */ tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */ - MULTIPLY(z2, FIX(0.466105296)); /* c1-c7 */ + MULTIPLY(z2, FIX(0.466105296)); /* c1-c7 */ z1 = MULTIPLY(z3 - z2, FIX(0.937797057)); /* c7 */ tmp14 += z1; tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) - /* c3-c7 */ - MULTIPLY(z4, FIX(1.742345811)); /* c1+c11 */ + MULTIPLY(z4, FIX(1.742345811)); /* c1+c11 */ /* Final output stage */ @@ -1825,7 +1825,7 @@ jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info * compptr, tmp15 = z1 + z4; tmp13 = MULTIPLY(tmp15, FIX(0.937797057)); /* c7 */ tmp10 = tmp11 + tmp12 + tmp13 - - MULTIPLY(z1, FIX(2.020082300)); /* c7+c5+c3-c1 */ + MULTIPLY(z1, FIX(2.020082300)); /* c7+c5+c3-c1 */ tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458)); /* -c11 */ tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */ tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */ @@ -1837,55 +1837,55 @@ jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info * compptr, tmp13 += tmp14; tmp15 = MULTIPLY(tmp15, FIX(0.338443458)); /* c11 */ tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */ - MULTIPLY(z2, FIX(0.466105296)); /* c1-c7 */ + MULTIPLY(z2, FIX(0.466105296)); /* c1-c7 */ z1 = MULTIPLY(z3 - z2, FIX(0.937797057)); /* c7 */ tmp14 += z1; tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) - /* c3-c7 */ - 
MULTIPLY(z4, FIX(1.742345811)); /* c1+c11 */ + MULTIPLY(z4, FIX(1.742345811)); /* c1+c11 */ /* Final output stage */ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp26, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; - 
wsptr += 8; /* advance pointer to next row */ + wsptr += 8; /* advance pointer to next row */ } } @@ -1900,8 +1900,8 @@ jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info * compptr, GLOBAL(void) jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, - JSAMPARRAY output_buf, JDIMENSION output_col) + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) { INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26; @@ -1912,7 +1912,7 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info * compptr, JSAMPROW outptr; JSAMPLE *range_limit = IDCT_range_limit(cinfo); int ctr; - int workspace[8*14]; /* buffers data between passes */ + int workspace[8*14]; /* buffers data between passes */ SHIFT_TEMPS /* Pass 1: process columns from input, store into work array. */ @@ -1937,7 +1937,7 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info * compptr, tmp12 = z1 - z4; tmp23 = RIGHT_SHIFT(z1 - ((z2 + z3 - z4) << 1), /* c0 = (c4+c12-c8)*2 */ - CONST_BITS-PASS1_BITS); + CONST_BITS-PASS1_BITS); z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); @@ -1947,7 +1947,7 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info * compptr, tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */ tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */ tmp15 = MULTIPLY(z1, FIX(0.613604268)) - /* c10 */ - MULTIPLY(z2, FIX(1.378756276)); /* c2 */ + MULTIPLY(z2, FIX(1.378756276)); /* c2 */ tmp20 = tmp10 + tmp13; tmp26 = tmp10 - tmp13; @@ -2031,7 +2031,7 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info * compptr, tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */ tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */ tmp15 = MULTIPLY(z1, FIX(0.613604268)) - /* c10 */ - MULTIPLY(z2, FIX(1.378756276)); /* c2 */ + MULTIPLY(z2, FIX(1.378756276)); /* c2 */ tmp20 = tmp10 + tmp13; 
tmp26 = tmp10 - tmp13; @@ -2069,49 +2069,49 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info * compptr, /* Final output stage */ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + 
CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; - wsptr += 8; /* advance pointer to next row */ + wsptr += 8; /* advance pointer to next row */ } } @@ -2126,8 +2126,8 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info * compptr, GLOBAL(void) jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, - JSAMPARRAY output_buf, JDIMENSION output_col) + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) { INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27; @@ -2138,7 +2138,7 @@ jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info * compptr, JSAMPROW outptr; JSAMPLE *range_limit = IDCT_range_limit(cinfo); int ctr; - int workspace[8*15]; /* buffers data between passes */ + int workspace[8*15]; /* buffers data between passes */ SHIFT_TEMPS /* Pass 1: process columns from input, store into work array. 
*/ @@ -2308,52 +2308,52 @@ jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info * compptr, /* Final output stage */ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + 
& RANGE_MASK]; outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp27, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; - wsptr += 8; /* advance pointer to next row */ + wsptr += 8; /* advance pointer to next row */ } } @@ -2368,8 +2368,8 @@ jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info * compptr, GLOBAL(void) jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, - JSAMPARRAY output_buf, JDIMENSION output_col) + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) { INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13; INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27; @@ -2380,7 +2380,7 @@ jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info * compptr, JSAMPROW outptr; JSAMPLE *range_limit = IDCT_range_limit(cinfo); int ctr; - int workspace[8*16]; /* buffers data between passes */ + int workspace[8*16]; /* buffers data between passes */ SHIFT_TEMPS /* Pass 1: process columns from input, store into work array. 
*/ @@ -2441,9 +2441,9 @@ jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info * compptr, tmp11 = MULTIPLY(tmp11, FIX(0.666655658)); /* c11 */ tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528)); /* c13 */ tmp0 = tmp1 + tmp2 + tmp3 - - MULTIPLY(z1, FIX(2.286341144)); /* c7+c5+c3-c1 */ + MULTIPLY(z1, FIX(2.286341144)); /* c7+c5+c3-c1 */ tmp13 = tmp10 + tmp11 + tmp12 - - MULTIPLY(z1, FIX(1.835730603)); /* c9+c11+c13-c15 */ + MULTIPLY(z1, FIX(1.835730603)); /* c9+c11+c13-c15 */ z1 = MULTIPLY(z2 + z3, FIX(0.138617169)); /* c15 */ tmp1 += z1 + MULTIPLY(z2, FIX(0.071888074)); /* c9+c11-c3-c15 */ tmp2 += z1 - MULTIPLY(z3, FIX(1.125726048)); /* c5+c7+c15-c3 */ @@ -2541,9 +2541,9 @@ jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info * compptr, tmp11 = MULTIPLY(tmp11, FIX(0.666655658)); /* c11 */ tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528)); /* c13 */ tmp0 = tmp1 + tmp2 + tmp3 - - MULTIPLY(z1, FIX(2.286341144)); /* c7+c5+c3-c1 */ + MULTIPLY(z1, FIX(2.286341144)); /* c7+c5+c3-c1 */ tmp13 = tmp10 + tmp11 + tmp12 - - MULTIPLY(z1, FIX(1.835730603)); /* c9+c11+c13-c15 */ + MULTIPLY(z1, FIX(1.835730603)); /* c9+c11+c13-c15 */ z1 = MULTIPLY(z2 + z3, FIX(0.138617169)); /* c15 */ tmp1 += z1 + MULTIPLY(z2, FIX(0.071888074)); /* c9+c11-c3-c15 */ tmp2 += z1 - MULTIPLY(z3, FIX(1.125726048)); /* c5+c7+c15-c3 */ @@ -2567,55 +2567,55 @@ jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info * compptr, /* Final output stage */ outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp0, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[15] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp0, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp1, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp1, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) 
+ & RANGE_MASK]; outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp2, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp2, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp3, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp3, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp10, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp10, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp11, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp11, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp12, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp12, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp27 + tmp13, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp27 - tmp13, - CONST_BITS+PASS1_BITS+3) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3) + & RANGE_MASK]; - wsptr += 8; /* advance pointer to next row */ + wsptr += 8; /* advance pointer to next row */ } } diff --git a/jidctred.c b/jidctred.c index 421f3c7ca..2b385f86c 100644 --- a/jidctred.c +++ b/jidctred.c @@ -23,7 +23,7 @@ #define 
JPEG_INTERNALS #include "jinclude.h" #include "jpeglib.h" -#include "jdct.h" /* Private declarations for DCT subsystem */ +#include "jdct.h" /* Private declarations for DCT subsystem */ #ifdef IDCT_SCALING_SUPPORTED @@ -44,7 +44,7 @@ #define PASS1_BITS 2 #else #define CONST_BITS 13 -#define PASS1_BITS 1 /* lose a little precision to avoid overflow */ +#define PASS1_BITS 1 /* lose a little precision to avoid overflow */ #endif /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus @@ -55,20 +55,20 @@ */ #if CONST_BITS == 13 -#define FIX_0_211164243 ((INT32) 1730) /* FIX(0.211164243) */ -#define FIX_0_509795579 ((INT32) 4176) /* FIX(0.509795579) */ -#define FIX_0_601344887 ((INT32) 4926) /* FIX(0.601344887) */ -#define FIX_0_720959822 ((INT32) 5906) /* FIX(0.720959822) */ -#define FIX_0_765366865 ((INT32) 6270) /* FIX(0.765366865) */ -#define FIX_0_850430095 ((INT32) 6967) /* FIX(0.850430095) */ -#define FIX_0_899976223 ((INT32) 7373) /* FIX(0.899976223) */ -#define FIX_1_061594337 ((INT32) 8697) /* FIX(1.061594337) */ -#define FIX_1_272758580 ((INT32) 10426) /* FIX(1.272758580) */ -#define FIX_1_451774981 ((INT32) 11893) /* FIX(1.451774981) */ -#define FIX_1_847759065 ((INT32) 15137) /* FIX(1.847759065) */ -#define FIX_2_172734803 ((INT32) 17799) /* FIX(2.172734803) */ -#define FIX_2_562915447 ((INT32) 20995) /* FIX(2.562915447) */ -#define FIX_3_624509785 ((INT32) 29692) /* FIX(3.624509785) */ +#define FIX_0_211164243 ((INT32) 1730) /* FIX(0.211164243) */ +#define FIX_0_509795579 ((INT32) 4176) /* FIX(0.509795579) */ +#define FIX_0_601344887 ((INT32) 4926) /* FIX(0.601344887) */ +#define FIX_0_720959822 ((INT32) 5906) /* FIX(0.720959822) */ +#define FIX_0_765366865 ((INT32) 6270) /* FIX(0.765366865) */ +#define FIX_0_850430095 ((INT32) 6967) /* FIX(0.850430095) */ +#define FIX_0_899976223 ((INT32) 7373) /* FIX(0.899976223) */ +#define FIX_1_061594337 ((INT32) 8697) /* FIX(1.061594337) */ +#define FIX_1_272758580 ((INT32) 10426) /* FIX(1.272758580) 
*/ +#define FIX_1_451774981 ((INT32) 11893) /* FIX(1.451774981) */ +#define FIX_1_847759065 ((INT32) 15137) /* FIX(1.847759065) */ +#define FIX_2_172734803 ((INT32) 17799) /* FIX(2.172734803) */ +#define FIX_2_562915447 ((INT32) 20995) /* FIX(2.562915447) */ +#define FIX_3_624509785 ((INT32) 29692) /* FIX(3.624509785) */ #else #define FIX_0_211164243 FIX(0.211164243) #define FIX_0_509795579 FIX(0.509795579) @@ -116,8 +116,8 @@ GLOBAL(void) jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, - JSAMPARRAY output_buf, JDIMENSION output_col) + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) { INT32 tmp0, tmp2, tmp10, tmp12; INT32 z1, z2, z3, z4; @@ -127,7 +127,7 @@ jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr, JSAMPROW outptr; JSAMPLE *range_limit = IDCT_range_limit(cinfo); int ctr; - int workspace[DCTSIZE*4]; /* buffers data between passes */ + int workspace[DCTSIZE*4]; /* buffers data between passes */ SHIFT_TEMPS /* Pass 1: process columns from input, store into work array. 
*/ @@ -140,57 +140,57 @@ jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr, if (ctr == DCTSIZE-4) continue; if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 && - inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*5] == 0 && - inptr[DCTSIZE*6] == 0 && inptr[DCTSIZE*7] == 0) { + inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*5] == 0 && + inptr[DCTSIZE*6] == 0 && inptr[DCTSIZE*7] == 0) { /* AC terms all zero; we need not examine term 4 for 4x4 output */ int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS; - + wsptr[DCTSIZE*0] = dcval; wsptr[DCTSIZE*1] = dcval; wsptr[DCTSIZE*2] = dcval; wsptr[DCTSIZE*3] = dcval; - + continue; } - + /* Even part */ - + tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); tmp0 <<= (CONST_BITS+1); - + z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); tmp2 = MULTIPLY(z2, FIX_1_847759065) + MULTIPLY(z3, - FIX_0_765366865); - + tmp10 = tmp0 + tmp2; tmp12 = tmp0 - tmp2; - + /* Odd part */ - + z1 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); z2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); z4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); - + tmp0 = MULTIPLY(z1, - FIX_0_211164243) /* sqrt(2) * (c3-c1) */ - + MULTIPLY(z2, FIX_1_451774981) /* sqrt(2) * (c3+c7) */ - + MULTIPLY(z3, - FIX_2_172734803) /* sqrt(2) * (-c1-c5) */ - + MULTIPLY(z4, FIX_1_061594337); /* sqrt(2) * (c5+c7) */ - + + MULTIPLY(z2, FIX_1_451774981) /* sqrt(2) * (c3+c7) */ + + MULTIPLY(z3, - FIX_2_172734803) /* sqrt(2) * (-c1-c5) */ + + MULTIPLY(z4, FIX_1_061594337); /* sqrt(2) * (c5+c7) */ + tmp2 = MULTIPLY(z1, - FIX_0_509795579) /* sqrt(2) * (c7-c5) */ - + MULTIPLY(z2, - FIX_0_601344887) /* sqrt(2) * (c5-c1) */ - + MULTIPLY(z3, FIX_0_899976223) /* sqrt(2) * (c3-c7) */ - + MULTIPLY(z4, FIX_2_562915447); /* sqrt(2) * (c1+c3) */ + + MULTIPLY(z2, - FIX_0_601344887) /* sqrt(2) * (c5-c1) */ + + MULTIPLY(z3, 
FIX_0_899976223) /* sqrt(2) * (c3-c7) */ + + MULTIPLY(z4, FIX_2_562915447); /* sqrt(2) * (c1+c3) */ /* Final output stage */ - + wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp2, CONST_BITS-PASS1_BITS+1); wsptr[DCTSIZE*3] = (int) DESCALE(tmp10 - tmp2, CONST_BITS-PASS1_BITS+1); wsptr[DCTSIZE*1] = (int) DESCALE(tmp12 + tmp0, CONST_BITS-PASS1_BITS+1); wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 - tmp0, CONST_BITS-PASS1_BITS+1); } - + /* Pass 2: process 4 rows from work array, store into output array. */ wsptr = workspace; @@ -200,64 +200,64 @@ jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr, #ifndef NO_ZERO_ROW_TEST if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && - wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) { + wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) { /* AC terms all zero */ JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3) - & RANGE_MASK]; - + & RANGE_MASK]; + outptr[0] = dcval; outptr[1] = dcval; outptr[2] = dcval; outptr[3] = dcval; - - wsptr += DCTSIZE; /* advance pointer to next row */ + + wsptr += DCTSIZE; /* advance pointer to next row */ continue; } #endif - + /* Even part */ - + tmp0 = ((INT32) wsptr[0]) << (CONST_BITS+1); - + tmp2 = MULTIPLY((INT32) wsptr[2], FIX_1_847759065) - + MULTIPLY((INT32) wsptr[6], - FIX_0_765366865); - + + MULTIPLY((INT32) wsptr[6], - FIX_0_765366865); + tmp10 = tmp0 + tmp2; tmp12 = tmp0 - tmp2; - + /* Odd part */ - + z1 = (INT32) wsptr[7]; z2 = (INT32) wsptr[5]; z3 = (INT32) wsptr[3]; z4 = (INT32) wsptr[1]; - + tmp0 = MULTIPLY(z1, - FIX_0_211164243) /* sqrt(2) * (c3-c1) */ - + MULTIPLY(z2, FIX_1_451774981) /* sqrt(2) * (c3+c7) */ - + MULTIPLY(z3, - FIX_2_172734803) /* sqrt(2) * (-c1-c5) */ - + MULTIPLY(z4, FIX_1_061594337); /* sqrt(2) * (c5+c7) */ - + + MULTIPLY(z2, FIX_1_451774981) /* sqrt(2) * (c3+c7) */ + + MULTIPLY(z3, - FIX_2_172734803) /* sqrt(2) * (-c1-c5) */ + + MULTIPLY(z4, FIX_1_061594337); /* sqrt(2) * (c5+c7) */ + tmp2 = MULTIPLY(z1, - FIX_0_509795579) /* 
sqrt(2) * (c7-c5) */ - + MULTIPLY(z2, - FIX_0_601344887) /* sqrt(2) * (c5-c1) */ - + MULTIPLY(z3, FIX_0_899976223) /* sqrt(2) * (c3-c7) */ - + MULTIPLY(z4, FIX_2_562915447); /* sqrt(2) * (c1+c3) */ + + MULTIPLY(z2, - FIX_0_601344887) /* sqrt(2) * (c5-c1) */ + + MULTIPLY(z3, FIX_0_899976223) /* sqrt(2) * (c3-c7) */ + + MULTIPLY(z4, FIX_2_562915447); /* sqrt(2) * (c1+c3) */ /* Final output stage */ - + outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp2, - CONST_BITS+PASS1_BITS+3+1) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3+1) + & RANGE_MASK]; outptr[3] = range_limit[(int) DESCALE(tmp10 - tmp2, - CONST_BITS+PASS1_BITS+3+1) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3+1) + & RANGE_MASK]; outptr[1] = range_limit[(int) DESCALE(tmp12 + tmp0, - CONST_BITS+PASS1_BITS+3+1) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3+1) + & RANGE_MASK]; outptr[2] = range_limit[(int) DESCALE(tmp12 - tmp0, - CONST_BITS+PASS1_BITS+3+1) - & RANGE_MASK]; - - wsptr += DCTSIZE; /* advance pointer to next row */ + CONST_BITS+PASS1_BITS+3+1) + & RANGE_MASK]; + + wsptr += DCTSIZE; /* advance pointer to next row */ } } @@ -269,8 +269,8 @@ jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr, GLOBAL(void) jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, - JSAMPARRAY output_buf, JDIMENSION output_col) + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) { INT32 tmp0, tmp10, z1; JCOEFPTR inptr; @@ -279,7 +279,7 @@ jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr, JSAMPROW outptr; JSAMPLE *range_limit = IDCT_range_limit(cinfo); int ctr; - int workspace[DCTSIZE*2]; /* buffers data between passes */ + int workspace[DCTSIZE*2]; /* buffers data between passes */ SHIFT_TEMPS /* Pass 1: process columns from input, store into work array. 
*/ @@ -292,21 +292,21 @@ jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr, if (ctr == DCTSIZE-2 || ctr == DCTSIZE-4 || ctr == DCTSIZE-6) continue; if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*3] == 0 && - inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*7] == 0) { + inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*7] == 0) { /* AC terms all zero; we need not examine terms 2,4,6 for 2x2 output */ int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS; - + wsptr[DCTSIZE*0] = dcval; wsptr[DCTSIZE*1] = dcval; - + continue; } - + /* Even part */ - + z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); tmp10 = z1 << (CONST_BITS+2); - + /* Odd part */ z1 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); @@ -319,11 +319,11 @@ jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr, tmp0 += MULTIPLY(z1, FIX_3_624509785); /* sqrt(2) * (c1+c3+c5+c7) */ /* Final output stage */ - + wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp0, CONST_BITS-PASS1_BITS+2); wsptr[DCTSIZE*1] = (int) DESCALE(tmp10 - tmp0, CONST_BITS-PASS1_BITS+2); } - + /* Pass 2: process 2 rows from work array, store into output array. 
*/ wsptr = workspace; @@ -335,37 +335,37 @@ jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr, if (wsptr[1] == 0 && wsptr[3] == 0 && wsptr[5] == 0 && wsptr[7] == 0) { /* AC terms all zero */ JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3) - & RANGE_MASK]; - + & RANGE_MASK]; + outptr[0] = dcval; outptr[1] = dcval; - - wsptr += DCTSIZE; /* advance pointer to next row */ + + wsptr += DCTSIZE; /* advance pointer to next row */ continue; } #endif - + /* Even part */ - + tmp10 = ((INT32) wsptr[0]) << (CONST_BITS+2); - + /* Odd part */ tmp0 = MULTIPLY((INT32) wsptr[7], - FIX_0_720959822) /* sqrt(2) * (c7-c5+c3-c1) */ - + MULTIPLY((INT32) wsptr[5], FIX_0_850430095) /* sqrt(2) * (-c1+c3+c5+c7) */ - + MULTIPLY((INT32) wsptr[3], - FIX_1_272758580) /* sqrt(2) * (-c1+c3-c5-c7) */ - + MULTIPLY((INT32) wsptr[1], FIX_3_624509785); /* sqrt(2) * (c1+c3+c5+c7) */ + + MULTIPLY((INT32) wsptr[5], FIX_0_850430095) /* sqrt(2) * (-c1+c3+c5+c7) */ + + MULTIPLY((INT32) wsptr[3], - FIX_1_272758580) /* sqrt(2) * (-c1+c3-c5-c7) */ + + MULTIPLY((INT32) wsptr[1], FIX_3_624509785); /* sqrt(2) * (c1+c3+c5+c7) */ /* Final output stage */ - + outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp0, - CONST_BITS+PASS1_BITS+3+2) - & RANGE_MASK]; + CONST_BITS+PASS1_BITS+3+2) + & RANGE_MASK]; outptr[1] = range_limit[(int) DESCALE(tmp10 - tmp0, - CONST_BITS+PASS1_BITS+3+2) - & RANGE_MASK]; - - wsptr += DCTSIZE; /* advance pointer to next row */ + CONST_BITS+PASS1_BITS+3+2) + & RANGE_MASK]; + + wsptr += DCTSIZE; /* advance pointer to next row */ } } @@ -377,8 +377,8 @@ jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr, GLOBAL(void) jpeg_idct_1x1 (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, - JSAMPARRAY output_buf, JDIMENSION output_col) + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) { int dcval; ISLOW_MULT_TYPE * quantptr; diff --git a/jinclude.h b/jinclude.h index 0a4f15146..b14a3fc05 
100644 --- a/jinclude.h +++ b/jinclude.h @@ -17,8 +17,8 @@ /* Include auto-config file to find out which system include files we need. */ -#include "jconfig.h" /* auto configuration options */ -#define JCONFIG_INCLUDED /* so that jpeglib.h doesn't do it again */ +#include "jconfig.h" /* auto configuration options */ +#define JCONFIG_INCLUDED /* so that jpeglib.h doesn't do it again */ /* * We need the NULL macro and size_t typedef. @@ -58,14 +58,14 @@ #ifdef NEED_BSD_STRINGS #include -#define MEMZERO(target,size) bzero((void *)(target), (size_t)(size)) -#define MEMCOPY(dest,src,size) bcopy((const void *)(src), (void *)(dest), (size_t)(size)) +#define MEMZERO(target,size) bzero((void *)(target), (size_t)(size)) +#define MEMCOPY(dest,src,size) bcopy((const void *)(src), (void *)(dest), (size_t)(size)) #else /* not BSD, assume ANSI/SysV string lib */ #include -#define MEMZERO(target,size) memset((void *)(target), 0, (size_t)(size)) -#define MEMCOPY(dest,src,size) memcpy((void *)(dest), (const void *)(src), (size_t)(size)) +#define MEMZERO(target,size) memset((void *)(target), 0, (size_t)(size)) +#define MEMCOPY(dest,src,size) memcpy((void *)(dest), (const void *)(src), (size_t)(size)) #endif @@ -77,7 +77,7 @@ * we always use this SIZEOF() macro in place of using sizeof() directly. 
*/ -#define SIZEOF(object) ((size_t) sizeof(object)) +#define SIZEOF(object) ((size_t) sizeof(object)) /* * The modules that use fread() and fwrite() always invoke them through diff --git a/jmemmgr.c b/jmemmgr.c index cf32524ea..15b6325fe 100644 --- a/jmemmgr.c +++ b/jmemmgr.c @@ -25,13 +25,13 @@ */ #define JPEG_INTERNALS -#define AM_MEMORY_MANAGER /* we define jvirt_Xarray_control structs */ +#define AM_MEMORY_MANAGER /* we define jvirt_Xarray_control structs */ #include "jinclude.h" #include "jpeglib.h" -#include "jmemsys.h" /* import the system-dependent declarations */ +#include "jmemsys.h" /* import the system-dependent declarations */ #ifndef NO_GETENV -#ifndef HAVE_STDLIB_H /* should declare getenv() */ +#ifndef HAVE_STDLIB_H /* should declare getenv() */ extern char * getenv JPP((const char * name)); #endif #endif @@ -78,7 +78,7 @@ round_up_pow2 (size_t a, size_t b) * such a compiler. */ -#ifndef ALIGN_SIZE /* so can override from jconfig.h */ +#ifndef ALIGN_SIZE /* so can override from jconfig.h */ #ifndef WITH_SIMD #define ALIGN_SIZE SIZEOF(double) #else @@ -98,17 +98,17 @@ round_up_pow2 (size_t a, size_t b) typedef struct small_pool_struct * small_pool_ptr; typedef struct small_pool_struct { - small_pool_ptr next; /* next in list of pools */ - size_t bytes_used; /* how many bytes already used within pool */ - size_t bytes_left; /* bytes still available in this pool */ + small_pool_ptr next; /* next in list of pools */ + size_t bytes_used; /* how many bytes already used within pool */ + size_t bytes_left; /* bytes still available in this pool */ } small_pool_hdr; typedef struct large_pool_struct FAR * large_pool_ptr; typedef struct large_pool_struct { - large_pool_ptr next; /* next in list of pools */ - size_t bytes_used; /* how many bytes already used within pool */ - size_t bytes_left; /* bytes still available in this pool */ + large_pool_ptr next; /* next in list of pools */ + size_t bytes_used; /* how many bytes already used within pool */ + size_t 
bytes_left; /* bytes still available in this pool */ } large_pool_hdr; /* @@ -116,7 +116,7 @@ typedef struct large_pool_struct { */ typedef struct { - struct jpeg_memory_mgr pub; /* public fields */ + struct jpeg_memory_mgr pub; /* public fields */ /* Each pool identifier (lifetime class) names a linked list of pools. */ small_pool_ptr small_list[JPOOL_NUMPOOLS]; @@ -136,7 +136,7 @@ typedef struct { /* alloc_sarray and alloc_barray set this value for use by virtual * array routines. */ - JDIMENSION last_rowsperchunk; /* from most recent alloc_sarray/barray */ + JDIMENSION last_rowsperchunk; /* from most recent alloc_sarray/barray */ } my_memory_mgr; typedef my_memory_mgr * my_mem_ptr; @@ -150,39 +150,39 @@ typedef my_memory_mgr * my_mem_ptr; */ struct jvirt_sarray_control { - JSAMPARRAY mem_buffer; /* => the in-memory buffer */ - JDIMENSION rows_in_array; /* total virtual array height */ - JDIMENSION samplesperrow; /* width of array (and of memory buffer) */ - JDIMENSION maxaccess; /* max rows accessed by access_virt_sarray */ - JDIMENSION rows_in_mem; /* height of memory buffer */ - JDIMENSION rowsperchunk; /* allocation chunk size in mem_buffer */ - JDIMENSION cur_start_row; /* first logical row # in the buffer */ - JDIMENSION first_undef_row; /* row # of first uninitialized row */ - boolean pre_zero; /* pre-zero mode requested? */ - boolean dirty; /* do current buffer contents need written? */ - boolean b_s_open; /* is backing-store data valid? 
*/ - jvirt_sarray_ptr next; /* link to next virtual sarray control block */ - backing_store_info b_s_info; /* System-dependent control info */ + JSAMPARRAY mem_buffer; /* => the in-memory buffer */ + JDIMENSION rows_in_array; /* total virtual array height */ + JDIMENSION samplesperrow; /* width of array (and of memory buffer) */ + JDIMENSION maxaccess; /* max rows accessed by access_virt_sarray */ + JDIMENSION rows_in_mem; /* height of memory buffer */ + JDIMENSION rowsperchunk; /* allocation chunk size in mem_buffer */ + JDIMENSION cur_start_row; /* first logical row # in the buffer */ + JDIMENSION first_undef_row; /* row # of first uninitialized row */ + boolean pre_zero; /* pre-zero mode requested? */ + boolean dirty; /* do current buffer contents need written? */ + boolean b_s_open; /* is backing-store data valid? */ + jvirt_sarray_ptr next; /* link to next virtual sarray control block */ + backing_store_info b_s_info; /* System-dependent control info */ }; struct jvirt_barray_control { - JBLOCKARRAY mem_buffer; /* => the in-memory buffer */ - JDIMENSION rows_in_array; /* total virtual array height */ - JDIMENSION blocksperrow; /* width of array (and of memory buffer) */ - JDIMENSION maxaccess; /* max rows accessed by access_virt_barray */ - JDIMENSION rows_in_mem; /* height of memory buffer */ - JDIMENSION rowsperchunk; /* allocation chunk size in mem_buffer */ - JDIMENSION cur_start_row; /* first logical row # in the buffer */ - JDIMENSION first_undef_row; /* row # of first uninitialized row */ - boolean pre_zero; /* pre-zero mode requested? */ - boolean dirty; /* do current buffer contents need written? */ - boolean b_s_open; /* is backing-store data valid? 
*/ - jvirt_barray_ptr next; /* link to next virtual barray control block */ - backing_store_info b_s_info; /* System-dependent control info */ + JBLOCKARRAY mem_buffer; /* => the in-memory buffer */ + JDIMENSION rows_in_array; /* total virtual array height */ + JDIMENSION blocksperrow; /* width of array (and of memory buffer) */ + JDIMENSION maxaccess; /* max rows accessed by access_virt_barray */ + JDIMENSION rows_in_mem; /* height of memory buffer */ + JDIMENSION rowsperchunk; /* allocation chunk size in mem_buffer */ + JDIMENSION cur_start_row; /* first logical row # in the buffer */ + JDIMENSION first_undef_row; /* row # of first uninitialized row */ + boolean pre_zero; /* pre-zero mode requested? */ + boolean dirty; /* do current buffer contents need written? */ + boolean b_s_open; /* is backing-store data valid? */ + jvirt_barray_ptr next; /* link to next virtual barray control block */ + backing_store_info b_s_info; /* System-dependent control info */ }; -#ifdef MEM_STATS /* optional extra stuff for statistics */ +#ifdef MEM_STATS /* optional extra stuff for statistics */ LOCAL(void) print_mem_stats (j_common_ptr cinfo, int pool_id) @@ -196,19 +196,19 @@ print_mem_stats (j_common_ptr cinfo, int pool_id) * This is helpful because message parm array can't handle longs. 
*/ fprintf(stderr, "Freeing pool %d, total space = %ld\n", - pool_id, mem->total_space_allocated); + pool_id, mem->total_space_allocated); for (lhdr_ptr = mem->large_list[pool_id]; lhdr_ptr != NULL; lhdr_ptr = lhdr_ptr->next) { fprintf(stderr, " Large chunk used %ld\n", - (long) lhdr_ptr->bytes_used); + (long) lhdr_ptr->bytes_used); } for (shdr_ptr = mem->small_list[pool_id]; shdr_ptr != NULL; shdr_ptr = shdr_ptr->next) { fprintf(stderr, " Small chunk used %ld free %ld\n", - (long) shdr_ptr->bytes_used, - (long) shdr_ptr->bytes_left); + (long) shdr_ptr->bytes_used, + (long) shdr_ptr->bytes_left); } } @@ -221,7 +221,7 @@ out_of_memory (j_common_ptr cinfo, int which) /* If we compiled MEM_STATS support, report alloc requests before dying */ { #ifdef MEM_STATS - cinfo->err->trace_level = 2; /* force self_destruct to report stats */ + cinfo->err->trace_level = 2; /* force self_destruct to report stats */ #endif ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, which); } @@ -244,19 +244,19 @@ out_of_memory (j_common_ptr cinfo, int which) * adjustment. 
*/ -static const size_t first_pool_slop[JPOOL_NUMPOOLS] = +static const size_t first_pool_slop[JPOOL_NUMPOOLS] = { - 1600, /* first PERMANENT pool */ - 16000 /* first IMAGE pool */ + 1600, /* first PERMANENT pool */ + 16000 /* first IMAGE pool */ }; -static const size_t extra_pool_slop[JPOOL_NUMPOOLS] = +static const size_t extra_pool_slop[JPOOL_NUMPOOLS] = { - 0, /* additional PERMANENT pools */ - 5000 /* additional IMAGE pools */ + 0, /* additional PERMANENT pools */ + 5000 /* additional IMAGE pools */ }; -#define MIN_SLOP 50 /* greater than 0 to avoid futile looping */ +#define MIN_SLOP 50 /* greater than 0 to avoid futile looping */ METHODDEF(void *) @@ -278,16 +278,16 @@ alloc_small (j_common_ptr cinfo, int pool_id, size_t sizeofobject) /* Check for unsatisfiable request (do now to ensure no overflow below) */ if ((SIZEOF(small_pool_hdr) + sizeofobject + ALIGN_SIZE - 1) > MAX_ALLOC_CHUNK) - out_of_memory(cinfo, 1); /* request exceeds malloc's ability */ + out_of_memory(cinfo, 1); /* request exceeds malloc's ability */ /* See if space is available in any existing pool */ if (pool_id < 0 || pool_id >= JPOOL_NUMPOOLS) - ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */ + ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */ prev_hdr_ptr = NULL; hdr_ptr = mem->small_list[pool_id]; while (hdr_ptr != NULL) { if (hdr_ptr->bytes_left >= sizeofobject) - break; /* found pool with enough space */ + break; /* found pool with enough space */ prev_hdr_ptr = hdr_ptr; hdr_ptr = hdr_ptr->next; } @@ -296,7 +296,7 @@ alloc_small (j_common_ptr cinfo, int pool_id, size_t sizeofobject) if (hdr_ptr == NULL) { /* min_request is what we need now, slop is what will be leftover */ min_request = SIZEOF(small_pool_hdr) + sizeofobject + ALIGN_SIZE - 1; - if (prev_hdr_ptr == NULL) /* first pool in class? */ + if (prev_hdr_ptr == NULL) /* first pool in class? 
*/ slop = first_pool_slop[pool_id]; else slop = extra_pool_slop[pool_id]; @@ -307,17 +307,17 @@ alloc_small (j_common_ptr cinfo, int pool_id, size_t sizeofobject) for (;;) { hdr_ptr = (small_pool_ptr) jpeg_get_small(cinfo, min_request + slop); if (hdr_ptr != NULL) - break; + break; slop /= 2; - if (slop < MIN_SLOP) /* give up when it gets real small */ - out_of_memory(cinfo, 2); /* jpeg_get_small failed */ + if (slop < MIN_SLOP) /* give up when it gets real small */ + out_of_memory(cinfo, 2); /* jpeg_get_small failed */ } mem->total_space_allocated += min_request + slop; /* Success, initialize the new pool header and add to end of list */ hdr_ptr->next = NULL; hdr_ptr->bytes_used = 0; hdr_ptr->bytes_left = sizeofobject + slop; - if (prev_hdr_ptr == NULL) /* first pool in class? */ + if (prev_hdr_ptr == NULL) /* first pool in class? */ mem->small_list[pool_id] = hdr_ptr; else prev_hdr_ptr->next = hdr_ptr; @@ -367,17 +367,17 @@ alloc_large (j_common_ptr cinfo, int pool_id, size_t sizeofobject) /* Check for unsatisfiable request (do now to ensure no overflow below) */ if ((SIZEOF(large_pool_hdr) + sizeofobject + ALIGN_SIZE - 1) > MAX_ALLOC_CHUNK) - out_of_memory(cinfo, 3); /* request exceeds malloc's ability */ + out_of_memory(cinfo, 3); /* request exceeds malloc's ability */ /* Always make a new pool */ if (pool_id < 0 || pool_id >= JPOOL_NUMPOOLS) - ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */ + ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */ hdr_ptr = (large_pool_ptr) jpeg_get_large(cinfo, sizeofobject + - SIZEOF(large_pool_hdr) + - ALIGN_SIZE - 1); + SIZEOF(large_pool_hdr) + + ALIGN_SIZE - 1); if (hdr_ptr == NULL) - out_of_memory(cinfo, 4); /* jpeg_get_large failed */ + out_of_memory(cinfo, 4); /* jpeg_get_large failed */ mem->total_space_allocated += sizeofobject + SIZEOF(large_pool_hdr) + ALIGN_SIZE - 1; /* Success, initialize the new pool header and add to list */ @@ -417,7 +417,7 @@ alloc_large (j_common_ptr cinfo, int pool_id, 
size_t sizeofobject) METHODDEF(JSAMPARRAY) alloc_sarray (j_common_ptr cinfo, int pool_id, - JDIMENSION samplesperrow, JDIMENSION numrows) + JDIMENSION samplesperrow, JDIMENSION numrows) /* Allocate a 2-D sample array */ { my_mem_ptr mem = (my_mem_ptr) cinfo->mem; @@ -428,12 +428,12 @@ alloc_sarray (j_common_ptr cinfo, int pool_id, /* Make sure each row is properly aligned */ if ((ALIGN_SIZE % SIZEOF(JSAMPLE)) != 0) - out_of_memory(cinfo, 5); /* safety check */ + out_of_memory(cinfo, 5); /* safety check */ samplesperrow = (JDIMENSION)round_up_pow2(samplesperrow, (2 * ALIGN_SIZE) / SIZEOF(JSAMPLE)); /* Calculate max # of rows allowed in one allocation chunk */ ltemp = (MAX_ALLOC_CHUNK-SIZEOF(large_pool_hdr)) / - ((long) samplesperrow * SIZEOF(JSAMPLE)); + ((long) samplesperrow * SIZEOF(JSAMPLE)); if (ltemp <= 0) ERREXIT(cinfo, JERR_WIDTH_OVERFLOW); if (ltemp < (long) numrows) @@ -444,15 +444,15 @@ alloc_sarray (j_common_ptr cinfo, int pool_id, /* Get space for row pointers (small object) */ result = (JSAMPARRAY) alloc_small(cinfo, pool_id, - (size_t) (numrows * SIZEOF(JSAMPROW))); + (size_t) (numrows * SIZEOF(JSAMPROW))); /* Get the rows themselves (large objects) */ currow = 0; while (currow < numrows) { rowsperchunk = MIN(rowsperchunk, numrows - currow); workspace = (JSAMPROW) alloc_large(cinfo, pool_id, - (size_t) ((size_t) rowsperchunk * (size_t) samplesperrow - * SIZEOF(JSAMPLE))); + (size_t) ((size_t) rowsperchunk * (size_t) samplesperrow + * SIZEOF(JSAMPLE))); for (i = rowsperchunk; i > 0; i--) { result[currow++] = workspace; workspace += samplesperrow; @@ -470,7 +470,7 @@ alloc_sarray (j_common_ptr cinfo, int pool_id, METHODDEF(JBLOCKARRAY) alloc_barray (j_common_ptr cinfo, int pool_id, - JDIMENSION blocksperrow, JDIMENSION numrows) + JDIMENSION blocksperrow, JDIMENSION numrows) /* Allocate a 2-D coefficient-block array */ { my_mem_ptr mem = (my_mem_ptr) cinfo->mem; @@ -481,11 +481,11 @@ alloc_barray (j_common_ptr cinfo, int pool_id, /* Make sure each row is 
properly aligned */ if ((SIZEOF(JBLOCK) % ALIGN_SIZE) != 0) - out_of_memory(cinfo, 6); /* safety check */ + out_of_memory(cinfo, 6); /* safety check */ /* Calculate max # of rows allowed in one allocation chunk */ ltemp = (MAX_ALLOC_CHUNK-SIZEOF(large_pool_hdr)) / - ((long) blocksperrow * SIZEOF(JBLOCK)); + ((long) blocksperrow * SIZEOF(JBLOCK)); if (ltemp <= 0) ERREXIT(cinfo, JERR_WIDTH_OVERFLOW); if (ltemp < (long) numrows) @@ -496,15 +496,15 @@ alloc_barray (j_common_ptr cinfo, int pool_id, /* Get space for row pointers (small object) */ result = (JBLOCKARRAY) alloc_small(cinfo, pool_id, - (size_t) (numrows * SIZEOF(JBLOCKROW))); + (size_t) (numrows * SIZEOF(JBLOCKROW))); /* Get the rows themselves (large objects) */ currow = 0; while (currow < numrows) { rowsperchunk = MIN(rowsperchunk, numrows - currow); workspace = (JBLOCKROW) alloc_large(cinfo, pool_id, - (size_t) ((size_t) rowsperchunk * (size_t) blocksperrow - * SIZEOF(JBLOCK))); + (size_t) ((size_t) rowsperchunk * (size_t) blocksperrow + * SIZEOF(JBLOCK))); for (i = rowsperchunk; i > 0; i--) { result[currow++] = workspace; workspace += blocksperrow; @@ -554,8 +554,8 @@ alloc_barray (j_common_ptr cinfo, int pool_id, METHODDEF(jvirt_sarray_ptr) request_virt_sarray (j_common_ptr cinfo, int pool_id, boolean pre_zero, - JDIMENSION samplesperrow, JDIMENSION numrows, - JDIMENSION maxaccess) + JDIMENSION samplesperrow, JDIMENSION numrows, + JDIMENSION maxaccess) /* Request a virtual 2-D sample array */ { my_mem_ptr mem = (my_mem_ptr) cinfo->mem; @@ -563,18 +563,18 @@ request_virt_sarray (j_common_ptr cinfo, int pool_id, boolean pre_zero, /* Only IMAGE-lifetime virtual arrays are currently supported */ if (pool_id != JPOOL_IMAGE) - ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */ + ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */ /* get control block */ result = (jvirt_sarray_ptr) alloc_small(cinfo, pool_id, - SIZEOF(struct jvirt_sarray_control)); + SIZEOF(struct jvirt_sarray_control)); 
- result->mem_buffer = NULL; /* marks array not yet realized */ + result->mem_buffer = NULL; /* marks array not yet realized */ result->rows_in_array = numrows; result->samplesperrow = samplesperrow; result->maxaccess = maxaccess; result->pre_zero = pre_zero; - result->b_s_open = FALSE; /* no associated backing-store object */ + result->b_s_open = FALSE; /* no associated backing-store object */ result->next = mem->virt_sarray_list; /* add to list of virtual arrays */ mem->virt_sarray_list = result; @@ -584,8 +584,8 @@ request_virt_sarray (j_common_ptr cinfo, int pool_id, boolean pre_zero, METHODDEF(jvirt_barray_ptr) request_virt_barray (j_common_ptr cinfo, int pool_id, boolean pre_zero, - JDIMENSION blocksperrow, JDIMENSION numrows, - JDIMENSION maxaccess) + JDIMENSION blocksperrow, JDIMENSION numrows, + JDIMENSION maxaccess) /* Request a virtual 2-D coefficient-block array */ { my_mem_ptr mem = (my_mem_ptr) cinfo->mem; @@ -593,18 +593,18 @@ request_virt_barray (j_common_ptr cinfo, int pool_id, boolean pre_zero, /* Only IMAGE-lifetime virtual arrays are currently supported */ if (pool_id != JPOOL_IMAGE) - ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */ + ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */ /* get control block */ result = (jvirt_barray_ptr) alloc_small(cinfo, pool_id, - SIZEOF(struct jvirt_barray_control)); + SIZEOF(struct jvirt_barray_control)); - result->mem_buffer = NULL; /* marks array not yet realized */ + result->mem_buffer = NULL; /* marks array not yet realized */ result->rows_in_array = numrows; result->blocksperrow = blocksperrow; result->maxaccess = maxaccess; result->pre_zero = pre_zero; - result->b_s_open = FALSE; /* no associated backing-store object */ + result->b_s_open = FALSE; /* no associated backing-store object */ result->next = mem->virt_barray_list; /* add to list of virtual arrays */ mem->virt_barray_list = result; @@ -631,26 +631,26 @@ realize_virt_arrays (j_common_ptr cinfo) for (sptr = 
mem->virt_sarray_list; sptr != NULL; sptr = sptr->next) { if (sptr->mem_buffer == NULL) { /* if not realized yet */ space_per_minheight += (long) sptr->maxaccess * - (long) sptr->samplesperrow * SIZEOF(JSAMPLE); + (long) sptr->samplesperrow * SIZEOF(JSAMPLE); maximum_space += (long) sptr->rows_in_array * - (long) sptr->samplesperrow * SIZEOF(JSAMPLE); + (long) sptr->samplesperrow * SIZEOF(JSAMPLE); } } for (bptr = mem->virt_barray_list; bptr != NULL; bptr = bptr->next) { if (bptr->mem_buffer == NULL) { /* if not realized yet */ space_per_minheight += (long) bptr->maxaccess * - (long) bptr->blocksperrow * SIZEOF(JBLOCK); + (long) bptr->blocksperrow * SIZEOF(JBLOCK); maximum_space += (long) bptr->rows_in_array * - (long) bptr->blocksperrow * SIZEOF(JBLOCK); + (long) bptr->blocksperrow * SIZEOF(JBLOCK); } } if (space_per_minheight <= 0) - return; /* no unrealized arrays, no work */ + return; /* no unrealized arrays, no work */ /* Determine amount of memory to actually use; this is system-dependent. */ avail_mem = jpeg_mem_available(cinfo, space_per_minheight, maximum_space, - mem->total_space_allocated); + mem->total_space_allocated); /* If the maximum space needed is available, make all the buffers full * height; otherwise parcel it out with the same number of minheights @@ -673,19 +673,19 @@ realize_virt_arrays (j_common_ptr cinfo) if (sptr->mem_buffer == NULL) { /* if not realized yet */ minheights = ((long) sptr->rows_in_array - 1L) / sptr->maxaccess + 1L; if (minheights <= max_minheights) { - /* This buffer fits in memory */ - sptr->rows_in_mem = sptr->rows_in_array; + /* This buffer fits in memory */ + sptr->rows_in_mem = sptr->rows_in_array; } else { - /* It doesn't fit in memory, create backing store. 
*/ - sptr->rows_in_mem = (JDIMENSION) (max_minheights * sptr->maxaccess); - jpeg_open_backing_store(cinfo, & sptr->b_s_info, - (long) sptr->rows_in_array * - (long) sptr->samplesperrow * - (long) SIZEOF(JSAMPLE)); - sptr->b_s_open = TRUE; + /* It doesn't fit in memory, create backing store. */ + sptr->rows_in_mem = (JDIMENSION) (max_minheights * sptr->maxaccess); + jpeg_open_backing_store(cinfo, & sptr->b_s_info, + (long) sptr->rows_in_array * + (long) sptr->samplesperrow * + (long) SIZEOF(JSAMPLE)); + sptr->b_s_open = TRUE; } sptr->mem_buffer = alloc_sarray(cinfo, JPOOL_IMAGE, - sptr->samplesperrow, sptr->rows_in_mem); + sptr->samplesperrow, sptr->rows_in_mem); sptr->rowsperchunk = mem->last_rowsperchunk; sptr->cur_start_row = 0; sptr->first_undef_row = 0; @@ -697,19 +697,19 @@ realize_virt_arrays (j_common_ptr cinfo) if (bptr->mem_buffer == NULL) { /* if not realized yet */ minheights = ((long) bptr->rows_in_array - 1L) / bptr->maxaccess + 1L; if (minheights <= max_minheights) { - /* This buffer fits in memory */ - bptr->rows_in_mem = bptr->rows_in_array; + /* This buffer fits in memory */ + bptr->rows_in_mem = bptr->rows_in_array; } else { - /* It doesn't fit in memory, create backing store. */ - bptr->rows_in_mem = (JDIMENSION) (max_minheights * bptr->maxaccess); - jpeg_open_backing_store(cinfo, & bptr->b_s_info, - (long) bptr->rows_in_array * - (long) bptr->blocksperrow * - (long) SIZEOF(JBLOCK)); - bptr->b_s_open = TRUE; + /* It doesn't fit in memory, create backing store. 
*/ + bptr->rows_in_mem = (JDIMENSION) (max_minheights * bptr->maxaccess); + jpeg_open_backing_store(cinfo, & bptr->b_s_info, + (long) bptr->rows_in_array * + (long) bptr->blocksperrow * + (long) SIZEOF(JBLOCK)); + bptr->b_s_open = TRUE; } bptr->mem_buffer = alloc_barray(cinfo, JPOOL_IMAGE, - bptr->blocksperrow, bptr->rows_in_mem); + bptr->blocksperrow, bptr->rows_in_mem); bptr->rowsperchunk = mem->last_rowsperchunk; bptr->cur_start_row = 0; bptr->first_undef_row = 0; @@ -736,17 +736,17 @@ do_sarray_io (j_common_ptr cinfo, jvirt_sarray_ptr ptr, boolean writing) rows = MIN(rows, (long) ptr->first_undef_row - thisrow); /* Transfer no more than fits in file */ rows = MIN(rows, (long) ptr->rows_in_array - thisrow); - if (rows <= 0) /* this chunk might be past end of file! */ + if (rows <= 0) /* this chunk might be past end of file! */ break; byte_count = rows * bytesperrow; if (writing) (*ptr->b_s_info.write_backing_store) (cinfo, & ptr->b_s_info, - (void FAR *) ptr->mem_buffer[i], - file_offset, byte_count); + (void FAR *) ptr->mem_buffer[i], + file_offset, byte_count); else (*ptr->b_s_info.read_backing_store) (cinfo, & ptr->b_s_info, - (void FAR *) ptr->mem_buffer[i], - file_offset, byte_count); + (void FAR *) ptr->mem_buffer[i], + file_offset, byte_count); file_offset += byte_count; } } @@ -769,17 +769,17 @@ do_barray_io (j_common_ptr cinfo, jvirt_barray_ptr ptr, boolean writing) rows = MIN(rows, (long) ptr->first_undef_row - thisrow); /* Transfer no more than fits in file */ rows = MIN(rows, (long) ptr->rows_in_array - thisrow); - if (rows <= 0) /* this chunk might be past end of file! */ + if (rows <= 0) /* this chunk might be past end of file! 
*/ break; byte_count = rows * bytesperrow; if (writing) (*ptr->b_s_info.write_backing_store) (cinfo, & ptr->b_s_info, - (void FAR *) ptr->mem_buffer[i], - file_offset, byte_count); + (void FAR *) ptr->mem_buffer[i], + file_offset, byte_count); else (*ptr->b_s_info.read_backing_store) (cinfo, & ptr->b_s_info, - (void FAR *) ptr->mem_buffer[i], - file_offset, byte_count); + (void FAR *) ptr->mem_buffer[i], + file_offset, byte_count); file_offset += byte_count; } } @@ -787,8 +787,8 @@ do_barray_io (j_common_ptr cinfo, jvirt_barray_ptr ptr, boolean writing) METHODDEF(JSAMPARRAY) access_virt_sarray (j_common_ptr cinfo, jvirt_sarray_ptr ptr, - JDIMENSION start_row, JDIMENSION num_rows, - boolean writable) + JDIMENSION start_row, JDIMENSION num_rows, + boolean writable) /* Access the part of a virtual sample array starting at start_row */ /* and extending for num_rows rows. writable is true if */ /* caller intends to modify the accessed area. */ @@ -826,7 +826,7 @@ access_virt_sarray (j_common_ptr cinfo, jvirt_sarray_ptr ptr, ltemp = (long) end_row - (long) ptr->rows_in_mem; if (ltemp < 0) - ltemp = 0; /* don't fall off front end of file */ + ltemp = 0; /* don't fall off front end of file */ ptr->cur_start_row = (JDIMENSION) ltemp; } /* Read in the selected part of the array. 
@@ -841,9 +841,9 @@ access_virt_sarray (j_common_ptr cinfo, jvirt_sarray_ptr ptr, */ if (ptr->first_undef_row < end_row) { if (ptr->first_undef_row < start_row) { - if (writable) /* writer skipped over a section of array */ - ERREXIT(cinfo, JERR_BAD_VIRTUAL_ACCESS); - undef_row = start_row; /* but reader is allowed to read ahead */ + if (writable) /* writer skipped over a section of array */ + ERREXIT(cinfo, JERR_BAD_VIRTUAL_ACCESS); + undef_row = start_row; /* but reader is allowed to read ahead */ } else { undef_row = ptr->first_undef_row; } @@ -854,12 +854,12 @@ access_virt_sarray (j_common_ptr cinfo, jvirt_sarray_ptr ptr, undef_row -= ptr->cur_start_row; /* make indexes relative to buffer */ end_row -= ptr->cur_start_row; while (undef_row < end_row) { - jzero_far((void FAR *) ptr->mem_buffer[undef_row], bytesperrow); - undef_row++; + jzero_far((void FAR *) ptr->mem_buffer[undef_row], bytesperrow); + undef_row++; } } else { - if (! writable) /* reader looking at undefined data */ - ERREXIT(cinfo, JERR_BAD_VIRTUAL_ACCESS); + if (! writable) /* reader looking at undefined data */ + ERREXIT(cinfo, JERR_BAD_VIRTUAL_ACCESS); } } /* Flag the buffer dirty if caller will write in it */ @@ -872,8 +872,8 @@ access_virt_sarray (j_common_ptr cinfo, jvirt_sarray_ptr ptr, METHODDEF(JBLOCKARRAY) access_virt_barray (j_common_ptr cinfo, jvirt_barray_ptr ptr, - JDIMENSION start_row, JDIMENSION num_rows, - boolean writable) + JDIMENSION start_row, JDIMENSION num_rows, + boolean writable) /* Access the part of a virtual block array starting at start_row */ /* and extending for num_rows rows. writable is true if */ /* caller intends to modify the accessed area. 
*/ @@ -911,7 +911,7 @@ access_virt_barray (j_common_ptr cinfo, jvirt_barray_ptr ptr, ltemp = (long) end_row - (long) ptr->rows_in_mem; if (ltemp < 0) - ltemp = 0; /* don't fall off front end of file */ + ltemp = 0; /* don't fall off front end of file */ ptr->cur_start_row = (JDIMENSION) ltemp; } /* Read in the selected part of the array. @@ -926,9 +926,9 @@ access_virt_barray (j_common_ptr cinfo, jvirt_barray_ptr ptr, */ if (ptr->first_undef_row < end_row) { if (ptr->first_undef_row < start_row) { - if (writable) /* writer skipped over a section of array */ - ERREXIT(cinfo, JERR_BAD_VIRTUAL_ACCESS); - undef_row = start_row; /* but reader is allowed to read ahead */ + if (writable) /* writer skipped over a section of array */ + ERREXIT(cinfo, JERR_BAD_VIRTUAL_ACCESS); + undef_row = start_row; /* but reader is allowed to read ahead */ } else { undef_row = ptr->first_undef_row; } @@ -939,12 +939,12 @@ access_virt_barray (j_common_ptr cinfo, jvirt_barray_ptr ptr, undef_row -= ptr->cur_start_row; /* make indexes relative to buffer */ end_row -= ptr->cur_start_row; while (undef_row < end_row) { - jzero_far((void FAR *) ptr->mem_buffer[undef_row], bytesperrow); - undef_row++; + jzero_far((void FAR *) ptr->mem_buffer[undef_row], bytesperrow); + undef_row++; } } else { - if (! writable) /* reader looking at undefined data */ - ERREXIT(cinfo, JERR_BAD_VIRTUAL_ACCESS); + if (! 
writable) /* reader looking at undefined data */ + ERREXIT(cinfo, JERR_BAD_VIRTUAL_ACCESS); } } /* Flag the buffer dirty if caller will write in it */ @@ -968,7 +968,7 @@ free_pool (j_common_ptr cinfo, int pool_id) size_t space_freed; if (pool_id < 0 || pool_id >= JPOOL_NUMPOOLS) - ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */ + ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */ #ifdef MEM_STATS if (cinfo->err->trace_level > 1) @@ -981,16 +981,16 @@ free_pool (j_common_ptr cinfo, int pool_id) jvirt_barray_ptr bptr; for (sptr = mem->virt_sarray_list; sptr != NULL; sptr = sptr->next) { - if (sptr->b_s_open) { /* there may be no backing store */ - sptr->b_s_open = FALSE; /* prevent recursive close if error */ - (*sptr->b_s_info.close_backing_store) (cinfo, & sptr->b_s_info); + if (sptr->b_s_open) { /* there may be no backing store */ + sptr->b_s_open = FALSE; /* prevent recursive close if error */ + (*sptr->b_s_info.close_backing_store) (cinfo, & sptr->b_s_info); } } mem->virt_sarray_list = NULL; for (bptr = mem->virt_barray_list; bptr != NULL; bptr = bptr->next) { - if (bptr->b_s_open) { /* there may be no backing store */ - bptr->b_s_open = FALSE; /* prevent recursive close if error */ - (*bptr->b_s_info.close_backing_store) (cinfo, & bptr->b_s_info); + if (bptr->b_s_open) { /* there may be no backing store */ + bptr->b_s_open = FALSE; /* prevent recursive close if error */ + (*bptr->b_s_info.close_backing_store) (cinfo, & bptr->b_s_info); } } mem->virt_barray_list = NULL; @@ -1003,8 +1003,8 @@ free_pool (j_common_ptr cinfo, int pool_id) while (lhdr_ptr != NULL) { large_pool_ptr next_lhdr_ptr = lhdr_ptr->next; space_freed = lhdr_ptr->bytes_used + - lhdr_ptr->bytes_left + - SIZEOF(large_pool_hdr); + lhdr_ptr->bytes_left + + SIZEOF(large_pool_hdr); jpeg_free_large(cinfo, (void FAR *) lhdr_ptr, space_freed); mem->total_space_allocated -= space_freed; lhdr_ptr = next_lhdr_ptr; @@ -1017,8 +1017,8 @@ free_pool (j_common_ptr cinfo, int pool_id) 
while (shdr_ptr != NULL) { small_pool_ptr next_shdr_ptr = shdr_ptr->next; space_freed = shdr_ptr->bytes_used + - shdr_ptr->bytes_left + - SIZEOF(small_pool_hdr); + shdr_ptr->bytes_left + + SIZEOF(small_pool_hdr); jpeg_free_small(cinfo, (void *) shdr_ptr, space_freed); mem->total_space_allocated -= space_freed; shdr_ptr = next_shdr_ptr; @@ -1046,9 +1046,9 @@ self_destruct (j_common_ptr cinfo) /* Release the memory manager control block too. */ jpeg_free_small(cinfo, (void *) cinfo->mem, SIZEOF(my_memory_mgr)); - cinfo->mem = NULL; /* ensures I will be called only once */ + cinfo->mem = NULL; /* ensures I will be called only once */ - jpeg_mem_term(cinfo); /* system-dependent cleanup */ + jpeg_mem_term(cinfo); /* system-dependent cleanup */ } @@ -1065,7 +1065,7 @@ jinit_memory_mgr (j_common_ptr cinfo) int pool; size_t test_mac; - cinfo->mem = NULL; /* for safety if init fails */ + cinfo->mem = NULL; /* for safety if init fails */ /* Check for configuration errors. * SIZEOF(ALIGN_TYPE) should be a power of 2; otherwise, it probably @@ -1092,7 +1092,7 @@ jinit_memory_mgr (j_common_ptr cinfo) mem = (my_mem_ptr) jpeg_get_small(cinfo, SIZEOF(my_memory_mgr)); if (mem == NULL) { - jpeg_mem_term(cinfo); /* system-dependent cleanup */ + jpeg_mem_term(cinfo); /* system-dependent cleanup */ ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 0); } @@ -1140,9 +1140,9 @@ jinit_memory_mgr (j_common_ptr cinfo) char ch = 'x'; if (sscanf(memenv, "%ld%c", &max_to_use, &ch) > 0) { - if (ch == 'm' || ch == 'M') - max_to_use *= 1000L; - mem->pub.max_memory_to_use = max_to_use * 1000L; + if (ch == 'm' || ch == 'M') + max_to_use *= 1000L; + mem->pub.max_memory_to_use = max_to_use * 1000L; } } } diff --git a/jmemnobs.c b/jmemnobs.c index 34b189563..2e4de0900 100644 --- a/jmemnobs.c +++ b/jmemnobs.c @@ -18,9 +18,9 @@ #define JPEG_INTERNALS #include "jinclude.h" #include "jpeglib.h" -#include "jmemsys.h" /* import the system-dependent declarations */ +#include "jmemsys.h" /* import the system-dependent 
declarations */ -#ifndef HAVE_STDLIB_H /* should declare malloc(),free() */ +#ifndef HAVE_STDLIB_H /* should declare malloc(),free() */ extern void * malloc JPP((size_t size)); extern void free JPP((void *ptr)); #endif @@ -71,7 +71,7 @@ jpeg_free_large (j_common_ptr cinfo, void FAR * object, size_t sizeofobject) GLOBAL(size_t) jpeg_mem_available (j_common_ptr cinfo, size_t min_bytes_needed, - size_t max_bytes_needed, size_t already_allocated) + size_t max_bytes_needed, size_t already_allocated) { return max_bytes_needed; } @@ -85,7 +85,7 @@ jpeg_mem_available (j_common_ptr cinfo, size_t min_bytes_needed, GLOBAL(void) jpeg_open_backing_store (j_common_ptr cinfo, backing_store_ptr info, - long total_bytes_needed) + long total_bytes_needed) { ERREXIT(cinfo, JERR_NO_BACKING_STORE); } @@ -99,7 +99,7 @@ jpeg_open_backing_store (j_common_ptr cinfo, backing_store_ptr info, GLOBAL(long) jpeg_mem_init (j_common_ptr cinfo) { - return 0; /* just set max_memory_to_use to 0 */ + return 0; /* just set max_memory_to_use to 0 */ } GLOBAL(void) diff --git a/jmemsys.h b/jmemsys.h index b19094596..5b6b7c4e6 100644 --- a/jmemsys.h +++ b/jmemsys.h @@ -22,14 +22,14 @@ /* Short forms of external names for systems with brain-damaged linkers. 
*/ #ifdef NEED_SHORT_EXTERNAL_NAMES -#define jpeg_get_small jGetSmall -#define jpeg_free_small jFreeSmall -#define jpeg_get_large jGetLarge -#define jpeg_free_large jFreeLarge -#define jpeg_mem_available jMemAvail -#define jpeg_open_backing_store jOpenBackStore -#define jpeg_mem_init jMemInit -#define jpeg_mem_term jMemTerm +#define jpeg_get_small jGetSmall +#define jpeg_free_small jFreeSmall +#define jpeg_get_large jGetLarge +#define jpeg_free_large jFreeLarge +#define jpeg_mem_available jMemAvail +#define jpeg_open_backing_store jOpenBackStore +#define jpeg_mem_init jMemInit +#define jpeg_mem_term jMemTerm #endif /* NEED_SHORT_EXTERNAL_NAMES */ @@ -46,7 +46,7 @@ EXTERN(void *) jpeg_get_small JPP((j_common_ptr cinfo, size_t sizeofobject)); EXTERN(void) jpeg_free_small JPP((j_common_ptr cinfo, void * object, - size_t sizeofobject)); + size_t sizeofobject)); /* * These two functions are used to allocate and release large chunks of @@ -58,9 +58,9 @@ EXTERN(void) jpeg_free_small JPP((j_common_ptr cinfo, void * object, */ EXTERN(void FAR *) jpeg_get_large JPP((j_common_ptr cinfo, - size_t sizeofobject)); + size_t sizeofobject)); EXTERN(void) jpeg_free_large JPP((j_common_ptr cinfo, void FAR * object, - size_t sizeofobject)); + size_t sizeofobject)); /* * The macro MAX_ALLOC_CHUNK designates the maximum number of bytes that may @@ -74,7 +74,7 @@ EXTERN(void) jpeg_free_large JPP((j_common_ptr cinfo, void FAR * object, * size_t and will be a multiple of sizeof(align_type). 
*/ -#ifndef MAX_ALLOC_CHUNK /* may be overridden in jconfig.h */ +#ifndef MAX_ALLOC_CHUNK /* may be overridden in jconfig.h */ #define MAX_ALLOC_CHUNK 1000000000L #endif @@ -101,9 +101,9 @@ EXTERN(void) jpeg_free_large JPP((j_common_ptr cinfo, void FAR * object, */ EXTERN(size_t) jpeg_mem_available JPP((j_common_ptr cinfo, - size_t min_bytes_needed, - size_t max_bytes_needed, - size_t already_allocated)); + size_t min_bytes_needed, + size_t max_bytes_needed, + size_t already_allocated)); /* @@ -113,23 +113,23 @@ EXTERN(size_t) jpeg_mem_available JPP((j_common_ptr cinfo, * are private to the system-dependent backing store routines. */ -#define TEMP_NAME_LENGTH 64 /* max length of a temporary file's name */ +#define TEMP_NAME_LENGTH 64 /* max length of a temporary file's name */ -#ifdef USE_MSDOS_MEMMGR /* DOS-specific junk */ +#ifdef USE_MSDOS_MEMMGR /* DOS-specific junk */ -typedef unsigned short XMSH; /* type of extended-memory handles */ -typedef unsigned short EMSH; /* type of expanded-memory handles */ +typedef unsigned short XMSH; /* type of extended-memory handles */ +typedef unsigned short EMSH; /* type of expanded-memory handles */ typedef union { - short file_handle; /* DOS file handle if it's a temp file */ - XMSH xms_handle; /* handle if it's a chunk of XMS */ - EMSH ems_handle; /* handle if it's a chunk of EMS */ + short file_handle; /* DOS file handle if it's a temp file */ + XMSH xms_handle; /* handle if it's a chunk of XMS */ + EMSH ems_handle; /* handle if it's a chunk of EMS */ } handle_union; #endif /* USE_MSDOS_MEMMGR */ -#ifdef USE_MAC_MEMMGR /* Mac-specific junk */ +#ifdef USE_MAC_MEMMGR /* Mac-specific junk */ #include #endif /* USE_MAC_MEMMGR */ @@ -139,30 +139,30 @@ typedef struct backing_store_struct * backing_store_ptr; typedef struct backing_store_struct { /* Methods for reading/writing/closing this backing-store object */ JMETHOD(void, read_backing_store, (j_common_ptr cinfo, - backing_store_ptr info, - void FAR * buffer_address, - long 
file_offset, long byte_count)); + backing_store_ptr info, + void FAR * buffer_address, + long file_offset, long byte_count)); JMETHOD(void, write_backing_store, (j_common_ptr cinfo, - backing_store_ptr info, - void FAR * buffer_address, - long file_offset, long byte_count)); + backing_store_ptr info, + void FAR * buffer_address, + long file_offset, long byte_count)); JMETHOD(void, close_backing_store, (j_common_ptr cinfo, - backing_store_ptr info)); + backing_store_ptr info)); /* Private fields for system-dependent backing-store management */ #ifdef USE_MSDOS_MEMMGR /* For the MS-DOS manager (jmemdos.c), we need: */ - handle_union handle; /* reference to backing-store storage object */ + handle_union handle; /* reference to backing-store storage object */ char temp_name[TEMP_NAME_LENGTH]; /* name if it's a file */ #else #ifdef USE_MAC_MEMMGR /* For the Mac manager (jmemmac.c), we need: */ - short temp_file; /* file reference number to temp file */ - FSSpec tempSpec; /* the FSSpec for the temp file */ + short temp_file; /* file reference number to temp file */ + FSSpec tempSpec; /* the FSSpec for the temp file */ char temp_name[TEMP_NAME_LENGTH]; /* name if it's a file */ #else /* For a typical implementation with temp files, we need: */ - FILE * temp_file; /* stdio reference to temp file */ + FILE * temp_file; /* stdio reference to temp file */ char temp_name[TEMP_NAME_LENGTH]; /* name of temp file */ #endif #endif @@ -178,8 +178,8 @@ typedef struct backing_store_struct { */ EXTERN(void) jpeg_open_backing_store JPP((j_common_ptr cinfo, - backing_store_ptr info, - long total_bytes_needed)); + backing_store_ptr info, + long total_bytes_needed)); /* diff --git a/jmorecfg.h b/jmorecfg.h index 55af05680..f41eccf00 100644 --- a/jmorecfg.h +++ b/jmorecfg.h @@ -22,7 +22,7 @@ * We do not support run-time selection of data precision, sorry. 
*/ -#define BITS_IN_JSAMPLE 8 /* use 8 or 12 */ +#define BITS_IN_JSAMPLE 8 /* use 8 or 12 */ /* @@ -34,7 +34,7 @@ * bytes of storage, whether actually used in an image or not.) */ -#define MAX_COMPONENTS 10 /* maximum number of image components */ +#define MAX_COMPONENTS 10 /* maximum number of image components */ /* @@ -72,8 +72,8 @@ typedef char JSAMPLE; #endif /* HAVE_UNSIGNED_CHAR */ -#define MAXJSAMPLE 255 -#define CENTERJSAMPLE 128 +#define MAXJSAMPLE 255 +#define CENTERJSAMPLE 128 #endif /* BITS_IN_JSAMPLE == 8 */ @@ -86,8 +86,8 @@ typedef char JSAMPLE; typedef short JSAMPLE; #define GETJSAMPLE(value) ((int) (value)) -#define MAXJSAMPLE 4095 -#define CENTERJSAMPLE 2048 +#define MAXJSAMPLE 4095 +#define CENTERJSAMPLE 2048 #endif /* BITS_IN_JSAMPLE == 12 */ @@ -153,13 +153,13 @@ typedef unsigned int UINT16; /* INT16 must hold at least the values -32768..32767. */ -#ifndef XMD_H /* X11/xmd.h correctly defines INT16 */ +#ifndef XMD_H /* X11/xmd.h correctly defines INT16 */ typedef short INT16; #endif /* INT32 must hold at least signed 32-bit values. */ -#ifndef XMD_H /* X11/xmd.h correctly defines INT32 */ +#ifndef XMD_H /* X11/xmd.h correctly defines INT32 */ typedef long INT32; #endif @@ -183,13 +183,13 @@ typedef unsigned int JDIMENSION; */ /* a function called through method pointers: */ -#define METHODDEF(type) static type +#define METHODDEF(type) static type /* a function used only in its module: */ -#define LOCAL(type) static type +#define LOCAL(type) static type /* a function referenced thru EXTERNs: */ -#define GLOBAL(type) type +#define GLOBAL(type) type /* a reference to a GLOBAL function: */ -#define EXTERN(type) extern type +#define EXTERN(type) extern type /* This macro is used to declare a "method", that is, a function pointer. 
@@ -231,11 +231,11 @@ typedef unsigned int JDIMENSION; #ifndef HAVE_BOOLEAN typedef int boolean; #endif -#ifndef FALSE /* in case these macros already exist */ -#define FALSE 0 /* values of boolean */ +#ifndef FALSE /* in case these macros already exist */ +#define FALSE 0 /* values of boolean */ #endif #ifndef TRUE -#define TRUE 1 +#define TRUE 1 #endif @@ -263,15 +263,15 @@ typedef int boolean; /* Capability options common to encoder and decoder: */ -#define DCT_ISLOW_SUPPORTED /* slow but accurate integer algorithm */ -#define DCT_IFAST_SUPPORTED /* faster, less accurate integer method */ -#define DCT_FLOAT_SUPPORTED /* floating-point: accurate, fast on fast HW */ +#define DCT_ISLOW_SUPPORTED /* slow but accurate integer algorithm */ +#define DCT_IFAST_SUPPORTED /* faster, less accurate integer method */ +#define DCT_FLOAT_SUPPORTED /* floating-point: accurate, fast on fast HW */ /* Encoder capability options: */ #define C_MULTISCAN_FILES_SUPPORTED /* Multiple-scan JPEG files? */ -#define C_PROGRESSIVE_SUPPORTED /* Progressive JPEG? (Requires MULTISCAN)*/ -#define ENTROPY_OPT_SUPPORTED /* Optimization of entropy coding parms? */ +#define C_PROGRESSIVE_SUPPORTED /* Progressive JPEG? (Requires MULTISCAN)*/ +#define ENTROPY_OPT_SUPPORTED /* Optimization of entropy coding parms? */ /* Note: if you selected 12-bit data precision, it is dangerous to turn off * ENTROPY_OPT_SUPPORTED. The standard Huffman tables are only good for 8-bit * precision, so jchuff.c normally uses entropy optimization to compute @@ -285,14 +285,14 @@ typedef int boolean; /* Decoder capability options: */ #define D_MULTISCAN_FILES_SUPPORTED /* Multiple-scan JPEG files? */ -#define D_PROGRESSIVE_SUPPORTED /* Progressive JPEG? (Requires MULTISCAN)*/ -#define SAVE_MARKERS_SUPPORTED /* jpeg_save_markers() needed? */ +#define D_PROGRESSIVE_SUPPORTED /* Progressive JPEG? (Requires MULTISCAN)*/ +#define SAVE_MARKERS_SUPPORTED /* jpeg_save_markers() needed? 
*/ #define BLOCK_SMOOTHING_SUPPORTED /* Block smoothing? (Progressive only) */ -#define IDCT_SCALING_SUPPORTED /* Output rescaling via IDCT? */ +#define IDCT_SCALING_SUPPORTED /* Output rescaling via IDCT? */ #undef UPSAMPLE_SCALING_SUPPORTED /* Output rescaling at upsample stage? */ #define UPSAMPLE_MERGING_SUPPORTED /* Fast path for sloppy upsampling? */ -#define QUANT_1PASS_SUPPORTED /* 1-pass color quantization? */ -#define QUANT_2PASS_SUPPORTED /* 2-pass color quantization? */ +#define QUANT_1PASS_SUPPORTED /* 1-pass color quantization? */ +#define QUANT_2PASS_SUPPORTED /* 2-pass color quantization? */ /* more capability options later, no doubt */ @@ -312,10 +312,10 @@ typedef int boolean; * can't use color quantization if you change that value. */ -#define RGB_RED 0 /* Offset of Red in an RGB scanline element */ -#define RGB_GREEN 1 /* Offset of Green */ -#define RGB_BLUE 2 /* Offset of Blue */ -#define RGB_PIXELSIZE 3 /* JSAMPLEs per RGB scanline element */ +#define RGB_RED 0 /* Offset of Red in an RGB scanline element */ +#define RGB_GREEN 1 /* Offset of Green */ +#define RGB_BLUE 2 /* Offset of Blue */ +#define RGB_PIXELSIZE 3 /* JSAMPLEs per RGB scanline element */ #define JPEG_NUMCS 16 @@ -382,7 +382,7 @@ static const int rgb_pixelsize[JPEG_NUMCS] = { #ifndef MULTIPLIER #ifndef WITH_SIMD -#define MULTIPLIER int /* type for fastest integer multiply */ +#define MULTIPLIER int /* type for fastest integer multiply */ #else #define MULTIPLIER short /* prefer 16-bit with SIMD for parellelism */ #endif diff --git a/jpegint.h b/jpegint.h index 78717482e..fcacef39c 100644 --- a/jpegint.h +++ b/jpegint.h @@ -14,30 +14,30 @@ /* Declarations for both compression & decompression */ -typedef enum { /* Operating modes for buffer controllers */ - JBUF_PASS_THRU, /* Plain stripwise operation */ - /* Remaining modes require a full-image buffer to have been created */ - JBUF_SAVE_SOURCE, /* Run source subobject only, save output */ - JBUF_CRANK_DEST, /* Run dest subobject 
only, using saved data */ - JBUF_SAVE_AND_PASS /* Run both subobjects, save output */ +typedef enum { /* Operating modes for buffer controllers */ + JBUF_PASS_THRU, /* Plain stripwise operation */ + /* Remaining modes require a full-image buffer to have been created */ + JBUF_SAVE_SOURCE, /* Run source subobject only, save output */ + JBUF_CRANK_DEST, /* Run dest subobject only, using saved data */ + JBUF_SAVE_AND_PASS /* Run both subobjects, save output */ } J_BUF_MODE; /* Values of global_state field (jdapi.c has some dependencies on ordering!) */ -#define CSTATE_START 100 /* after create_compress */ -#define CSTATE_SCANNING 101 /* start_compress done, write_scanlines OK */ -#define CSTATE_RAW_OK 102 /* start_compress done, write_raw_data OK */ -#define CSTATE_WRCOEFS 103 /* jpeg_write_coefficients done */ -#define DSTATE_START 200 /* after create_decompress */ -#define DSTATE_INHEADER 201 /* reading header markers, no SOS yet */ -#define DSTATE_READY 202 /* found SOS, ready for start_decompress */ -#define DSTATE_PRELOAD 203 /* reading multiscan file in start_decompress*/ -#define DSTATE_PRESCAN 204 /* performing dummy pass for 2-pass quant */ -#define DSTATE_SCANNING 205 /* start_decompress done, read_scanlines OK */ -#define DSTATE_RAW_OK 206 /* start_decompress done, read_raw_data OK */ -#define DSTATE_BUFIMAGE 207 /* expecting jpeg_start_output */ -#define DSTATE_BUFPOST 208 /* looking for SOS/EOI in jpeg_finish_output */ -#define DSTATE_RDCOEFS 209 /* reading file in jpeg_read_coefficients */ -#define DSTATE_STOPPING 210 /* looking for EOI in jpeg_finish_decompress */ +#define CSTATE_START 100 /* after create_compress */ +#define CSTATE_SCANNING 101 /* start_compress done, write_scanlines OK */ +#define CSTATE_RAW_OK 102 /* start_compress done, write_raw_data OK */ +#define CSTATE_WRCOEFS 103 /* jpeg_write_coefficients done */ +#define DSTATE_START 200 /* after create_decompress */ +#define DSTATE_INHEADER 201 /* reading header markers, no SOS yet */ 
+#define DSTATE_READY 202 /* found SOS, ready for start_decompress */ +#define DSTATE_PRELOAD 203 /* reading multiscan file in start_decompress*/ +#define DSTATE_PRESCAN 204 /* performing dummy pass for 2-pass quant */ +#define DSTATE_SCANNING 205 /* start_decompress done, read_scanlines OK */ +#define DSTATE_RAW_OK 206 /* start_decompress done, read_raw_data OK */ +#define DSTATE_BUFIMAGE 207 /* expecting jpeg_start_output */ +#define DSTATE_BUFPOST 208 /* looking for SOS/EOI in jpeg_finish_output */ +#define DSTATE_RDCOEFS 209 /* reading file in jpeg_read_coefficients */ +#define DSTATE_STOPPING 210 /* looking for EOI in jpeg_finish_decompress */ /* Declarations for compression modules */ @@ -49,54 +49,54 @@ struct jpeg_comp_master { JMETHOD(void, finish_pass, (j_compress_ptr cinfo)); /* State variables made visible to other modules */ - boolean call_pass_startup; /* True if pass_startup must be called */ - boolean is_last_pass; /* True during last pass */ + boolean call_pass_startup; /* True if pass_startup must be called */ + boolean is_last_pass; /* True during last pass */ }; /* Main buffer control (downsampled-data buffer) */ struct jpeg_c_main_controller { JMETHOD(void, start_pass, (j_compress_ptr cinfo, J_BUF_MODE pass_mode)); JMETHOD(void, process_data, (j_compress_ptr cinfo, - JSAMPARRAY input_buf, JDIMENSION *in_row_ctr, - JDIMENSION in_rows_avail)); + JSAMPARRAY input_buf, JDIMENSION *in_row_ctr, + JDIMENSION in_rows_avail)); }; /* Compression preprocessing (downsampling input buffer control) */ struct jpeg_c_prep_controller { JMETHOD(void, start_pass, (j_compress_ptr cinfo, J_BUF_MODE pass_mode)); JMETHOD(void, pre_process_data, (j_compress_ptr cinfo, - JSAMPARRAY input_buf, - JDIMENSION *in_row_ctr, - JDIMENSION in_rows_avail, - JSAMPIMAGE output_buf, - JDIMENSION *out_row_group_ctr, - JDIMENSION out_row_groups_avail)); + JSAMPARRAY input_buf, + JDIMENSION *in_row_ctr, + JDIMENSION in_rows_avail, + JSAMPIMAGE output_buf, + JDIMENSION 
*out_row_group_ctr, + JDIMENSION out_row_groups_avail)); }; /* Coefficient buffer control */ struct jpeg_c_coef_controller { JMETHOD(void, start_pass, (j_compress_ptr cinfo, J_BUF_MODE pass_mode)); JMETHOD(boolean, compress_data, (j_compress_ptr cinfo, - JSAMPIMAGE input_buf)); + JSAMPIMAGE input_buf)); }; /* Colorspace conversion */ struct jpeg_color_converter { JMETHOD(void, start_pass, (j_compress_ptr cinfo)); JMETHOD(void, color_convert, (j_compress_ptr cinfo, - JSAMPARRAY input_buf, JSAMPIMAGE output_buf, - JDIMENSION output_row, int num_rows)); + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows)); }; /* Downsampling */ struct jpeg_downsampler { JMETHOD(void, start_pass, (j_compress_ptr cinfo)); JMETHOD(void, downsample, (j_compress_ptr cinfo, - JSAMPIMAGE input_buf, JDIMENSION in_row_index, - JSAMPIMAGE output_buf, - JDIMENSION out_row_group_index)); + JSAMPIMAGE input_buf, JDIMENSION in_row_index, + JSAMPIMAGE output_buf, + JDIMENSION out_row_group_index)); - boolean need_context_rows; /* TRUE if need rows above & below */ + boolean need_context_rows; /* TRUE if need rows above & below */ }; /* Forward DCT (also controls coefficient quantization) */ @@ -104,10 +104,10 @@ struct jpeg_forward_dct { JMETHOD(void, start_pass, (j_compress_ptr cinfo)); /* perhaps this should be an array??? 
*/ JMETHOD(void, forward_DCT, (j_compress_ptr cinfo, - jpeg_component_info * compptr, - JSAMPARRAY sample_data, JBLOCKROW coef_blocks, - JDIMENSION start_row, JDIMENSION start_col, - JDIMENSION num_blocks)); + jpeg_component_info * compptr, + JSAMPARRAY sample_data, JBLOCKROW coef_blocks, + JDIMENSION start_row, JDIMENSION start_col, + JDIMENSION num_blocks)); }; /* Entropy encoding */ @@ -127,7 +127,7 @@ struct jpeg_marker_writer { /* These routines are exported to allow insertion of extra markers */ /* Probably only COM and APPn markers should be written this way */ JMETHOD(void, write_marker_header, (j_compress_ptr cinfo, int marker, - unsigned int datalen)); + unsigned int datalen)); JMETHOD(void, write_marker_byte, (j_compress_ptr cinfo, int val)); }; @@ -140,7 +140,7 @@ struct jpeg_decomp_master { JMETHOD(void, finish_output_pass, (j_decompress_ptr cinfo)); /* State variables made visible to other modules */ - boolean is_dummy_pass; /* True during 1st pass for 2-pass quant */ + boolean is_dummy_pass; /* True during 1st pass for 2-pass quant */ }; /* Input control module */ @@ -151,16 +151,16 @@ struct jpeg_input_controller { JMETHOD(void, finish_input_pass, (j_decompress_ptr cinfo)); /* State variables made visible to other modules */ - boolean has_multiple_scans; /* True if file has multiple scans */ - boolean eoi_reached; /* True when EOI has been consumed */ + boolean has_multiple_scans; /* True if file has multiple scans */ + boolean eoi_reached; /* True when EOI has been consumed */ }; /* Main buffer control (downsampled-data buffer) */ struct jpeg_d_main_controller { JMETHOD(void, start_pass, (j_decompress_ptr cinfo, J_BUF_MODE pass_mode)); JMETHOD(void, process_data, (j_decompress_ptr cinfo, - JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, - JDIMENSION out_rows_avail)); + JSAMPARRAY output_buf, JDIMENSION *out_row_ctr, + JDIMENSION out_rows_avail)); }; /* Coefficient buffer control */ @@ -169,7 +169,7 @@ struct jpeg_d_coef_controller { JMETHOD(int, 
consume_data, (j_decompress_ptr cinfo)); JMETHOD(void, start_output_pass, (j_decompress_ptr cinfo)); JMETHOD(int, decompress_data, (j_decompress_ptr cinfo, - JSAMPIMAGE output_buf)); + JSAMPIMAGE output_buf)); /* Pointer to array of coefficient virtual arrays, or NULL if none */ jvirt_barray_ptr *coef_arrays; }; @@ -178,12 +178,12 @@ struct jpeg_d_coef_controller { struct jpeg_d_post_controller { JMETHOD(void, start_pass, (j_decompress_ptr cinfo, J_BUF_MODE pass_mode)); JMETHOD(void, post_process_data, (j_decompress_ptr cinfo, - JSAMPIMAGE input_buf, - JDIMENSION *in_row_group_ctr, - JDIMENSION in_row_groups_avail, - JSAMPARRAY output_buf, - JDIMENSION *out_row_ctr, - JDIMENSION out_rows_avail)); + JSAMPIMAGE input_buf, + JDIMENSION *in_row_group_ctr, + JDIMENSION in_row_groups_avail, + JSAMPARRAY output_buf, + JDIMENSION *out_row_ctr, + JDIMENSION out_rows_avail)); }; /* Marker reading & parsing */ @@ -200,28 +200,28 @@ struct jpeg_marker_reader { /* State of marker reader --- nominally internal, but applications * supplying COM or APPn handlers might like to know the state. */ - boolean saw_SOI; /* found SOI? */ - boolean saw_SOF; /* found SOF? */ - int next_restart_num; /* next restart number expected (0-7) */ - unsigned int discarded_bytes; /* # of bytes skipped looking for a marker */ + boolean saw_SOI; /* found SOI? */ + boolean saw_SOF; /* found SOF? 
*/ + int next_restart_num; /* next restart number expected (0-7) */ + unsigned int discarded_bytes; /* # of bytes skipped looking for a marker */ }; /* Entropy decoding */ struct jpeg_entropy_decoder { JMETHOD(void, start_pass, (j_decompress_ptr cinfo)); JMETHOD(boolean, decode_mcu, (j_decompress_ptr cinfo, - JBLOCKROW *MCU_data)); + JBLOCKROW *MCU_data)); /* This is here to share code between baseline and progressive decoders; */ /* other modules probably should not use it */ - boolean insufficient_data; /* set TRUE after emitting warning */ + boolean insufficient_data; /* set TRUE after emitting warning */ }; /* Inverse DCT (also performs dequantization) */ typedef JMETHOD(void, inverse_DCT_method_ptr, - (j_decompress_ptr cinfo, jpeg_component_info * compptr, - JCOEFPTR coef_block, - JSAMPARRAY output_buf, JDIMENSION output_col)); + (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col)); struct jpeg_inverse_dct { JMETHOD(void, start_pass, (j_decompress_ptr cinfo)); @@ -233,30 +233,30 @@ struct jpeg_inverse_dct { struct jpeg_upsampler { JMETHOD(void, start_pass, (j_decompress_ptr cinfo)); JMETHOD(void, upsample, (j_decompress_ptr cinfo, - JSAMPIMAGE input_buf, - JDIMENSION *in_row_group_ctr, - JDIMENSION in_row_groups_avail, - JSAMPARRAY output_buf, - JDIMENSION *out_row_ctr, - JDIMENSION out_rows_avail)); - - boolean need_context_rows; /* TRUE if need rows above & below */ + JSAMPIMAGE input_buf, + JDIMENSION *in_row_group_ctr, + JDIMENSION in_row_groups_avail, + JSAMPARRAY output_buf, + JDIMENSION *out_row_ctr, + JDIMENSION out_rows_avail)); + + boolean need_context_rows; /* TRUE if need rows above & below */ }; /* Colorspace conversion */ struct jpeg_color_deconverter { JMETHOD(void, start_pass, (j_decompress_ptr cinfo)); JMETHOD(void, color_convert, (j_decompress_ptr cinfo, - JSAMPIMAGE input_buf, JDIMENSION input_row, - JSAMPARRAY output_buf, int num_rows)); + JSAMPIMAGE input_buf, 
JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows)); }; /* Color quantization or color precision reduction */ struct jpeg_color_quantizer { JMETHOD(void, start_pass, (j_decompress_ptr cinfo, boolean is_pre_scan)); JMETHOD(void, color_quantize, (j_decompress_ptr cinfo, - JSAMPARRAY input_buf, JSAMPARRAY output_buf, - int num_rows)); + JSAMPARRAY input_buf, JSAMPARRAY output_buf, + int num_rows)); JMETHOD(void, finish_pass, (j_decompress_ptr cinfo)); JMETHOD(void, new_color_map, (j_decompress_ptr cinfo)); }; @@ -265,9 +265,9 @@ struct jpeg_color_quantizer { /* Miscellaneous useful macros */ #undef MAX -#define MAX(a,b) ((a) > (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? (a) : (b)) #undef MIN -#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MIN(a,b) ((a) < (b) ? (a) : (b)) /* We assume that right shift corresponds to signed division by 2 with @@ -281,69 +281,69 @@ struct jpeg_color_quantizer { */ #ifdef RIGHT_SHIFT_IS_UNSIGNED -#define SHIFT_TEMPS INT32 shift_temp; +#define SHIFT_TEMPS INT32 shift_temp; #define RIGHT_SHIFT(x,shft) \ - ((shift_temp = (x)) < 0 ? \ - (shift_temp >> (shft)) | ((~((INT32) 0)) << (32-(shft))) : \ - (shift_temp >> (shft))) + ((shift_temp = (x)) < 0 ? \ + (shift_temp >> (shft)) | ((~((INT32) 0)) << (32-(shft))) : \ + (shift_temp >> (shft))) #else #define SHIFT_TEMPS -#define RIGHT_SHIFT(x,shft) ((x) >> (shft)) +#define RIGHT_SHIFT(x,shft) ((x) >> (shft)) #endif /* Short forms of external names for systems with brain-damaged linkers. 
*/ #ifdef NEED_SHORT_EXTERNAL_NAMES -#define jinit_compress_master jICompress -#define jinit_c_master_control jICMaster -#define jinit_c_main_controller jICMainC -#define jinit_c_prep_controller jICPrepC -#define jinit_c_coef_controller jICCoefC -#define jinit_color_converter jICColor -#define jinit_downsampler jIDownsampler -#define jinit_forward_dct jIFDCT -#define jinit_huff_encoder jIHEncoder -#define jinit_phuff_encoder jIPHEncoder -#define jinit_arith_encoder jIAEncoder -#define jinit_marker_writer jIMWriter -#define jinit_master_decompress jIDMaster -#define jinit_d_main_controller jIDMainC -#define jinit_d_coef_controller jIDCoefC -#define jinit_d_post_controller jIDPostC -#define jinit_input_controller jIInCtlr -#define jinit_marker_reader jIMReader -#define jinit_huff_decoder jIHDecoder -#define jinit_phuff_decoder jIPHDecoder -#define jinit_arith_decoder jIADecoder -#define jinit_inverse_dct jIIDCT -#define jinit_upsampler jIUpsampler -#define jinit_color_deconverter jIDColor -#define jinit_1pass_quantizer jI1Quant -#define jinit_2pass_quantizer jI2Quant -#define jinit_merged_upsampler jIMUpsampler -#define jinit_memory_mgr jIMemMgr -#define jdiv_round_up jDivRound -#define jround_up jRound -#define jcopy_sample_rows jCopySamples -#define jcopy_block_row jCopyBlocks -#define jzero_far jZeroFar -#define jpeg_zigzag_order jZIGTable -#define jpeg_natural_order jZAGTable -#define jpeg_aritab jAriTab +#define jinit_compress_master jICompress +#define jinit_c_master_control jICMaster +#define jinit_c_main_controller jICMainC +#define jinit_c_prep_controller jICPrepC +#define jinit_c_coef_controller jICCoefC +#define jinit_color_converter jICColor +#define jinit_downsampler jIDownsampler +#define jinit_forward_dct jIFDCT +#define jinit_huff_encoder jIHEncoder +#define jinit_phuff_encoder jIPHEncoder +#define jinit_arith_encoder jIAEncoder +#define jinit_marker_writer jIMWriter +#define jinit_master_decompress jIDMaster +#define jinit_d_main_controller jIDMainC 
+#define jinit_d_coef_controller jIDCoefC +#define jinit_d_post_controller jIDPostC +#define jinit_input_controller jIInCtlr +#define jinit_marker_reader jIMReader +#define jinit_huff_decoder jIHDecoder +#define jinit_phuff_decoder jIPHDecoder +#define jinit_arith_decoder jIADecoder +#define jinit_inverse_dct jIIDCT +#define jinit_upsampler jIUpsampler +#define jinit_color_deconverter jIDColor +#define jinit_1pass_quantizer jI1Quant +#define jinit_2pass_quantizer jI2Quant +#define jinit_merged_upsampler jIMUpsampler +#define jinit_memory_mgr jIMemMgr +#define jdiv_round_up jDivRound +#define jround_up jRound +#define jcopy_sample_rows jCopySamples +#define jcopy_block_row jCopyBlocks +#define jzero_far jZeroFar +#define jpeg_zigzag_order jZIGTable +#define jpeg_natural_order jZAGTable +#define jpeg_aritab jAriTab #endif /* NEED_SHORT_EXTERNAL_NAMES */ /* Compression module initialization routines */ EXTERN(void) jinit_compress_master JPP((j_compress_ptr cinfo)); EXTERN(void) jinit_c_master_control JPP((j_compress_ptr cinfo, - boolean transcode_only)); + boolean transcode_only)); EXTERN(void) jinit_c_main_controller JPP((j_compress_ptr cinfo, - boolean need_full_buffer)); + boolean need_full_buffer)); EXTERN(void) jinit_c_prep_controller JPP((j_compress_ptr cinfo, - boolean need_full_buffer)); + boolean need_full_buffer)); EXTERN(void) jinit_c_coef_controller JPP((j_compress_ptr cinfo, - boolean need_full_buffer)); + boolean need_full_buffer)); EXTERN(void) jinit_color_converter JPP((j_compress_ptr cinfo)); EXTERN(void) jinit_downsampler JPP((j_compress_ptr cinfo)); EXTERN(void) jinit_forward_dct JPP((j_compress_ptr cinfo)); @@ -354,11 +354,11 @@ EXTERN(void) jinit_marker_writer JPP((j_compress_ptr cinfo)); /* Decompression module initialization routines */ EXTERN(void) jinit_master_decompress JPP((j_decompress_ptr cinfo)); EXTERN(void) jinit_d_main_controller JPP((j_decompress_ptr cinfo, - boolean need_full_buffer)); + boolean need_full_buffer)); EXTERN(void) 
jinit_d_coef_controller JPP((j_decompress_ptr cinfo, - boolean need_full_buffer)); + boolean need_full_buffer)); EXTERN(void) jinit_d_post_controller JPP((j_decompress_ptr cinfo, - boolean need_full_buffer)); + boolean need_full_buffer)); EXTERN(void) jinit_input_controller JPP((j_decompress_ptr cinfo)); EXTERN(void) jinit_marker_reader JPP((j_decompress_ptr cinfo)); EXTERN(void) jinit_huff_decoder JPP((j_decompress_ptr cinfo)); @@ -377,13 +377,13 @@ EXTERN(void) jinit_memory_mgr JPP((j_common_ptr cinfo)); EXTERN(long) jdiv_round_up JPP((long a, long b)); EXTERN(long) jround_up JPP((long a, long b)); EXTERN(void) jcopy_sample_rows JPP((JSAMPARRAY input_array, int source_row, - JSAMPARRAY output_array, int dest_row, - int num_rows, JDIMENSION num_cols)); + JSAMPARRAY output_array, int dest_row, + int num_rows, JDIMENSION num_cols)); EXTERN(void) jcopy_block_row JPP((JBLOCKROW input_row, JBLOCKROW output_row, - JDIMENSION num_blocks)); + JDIMENSION num_blocks)); EXTERN(void) jzero_far JPP((void FAR * target, size_t bytestozero)); /* Constant tables in jutils.c */ -#if 0 /* This table is not actually needed in v6a */ +#if 0 /* This table is not actually needed in v6a */ extern const int jpeg_zigzag_order[]; /* natural coef order to zigzag order */ #endif extern const int jpeg_natural_order[]; /* zigzag coef order to natural order */ @@ -394,7 +394,7 @@ extern const INT32 jpeg_aritab[]; /* Suppress undefined-structure complaints if necessary. */ #ifdef INCOMPLETE_TYPES_BROKEN -#ifndef AM_MEMORY_MANAGER /* only jmemmgr.c defines these */ +#ifndef AM_MEMORY_MANAGER /* only jmemmgr.c defines these */ struct jvirt_sarray_control { long dummy; }; struct jvirt_barray_control { long dummy; }; #endif diff --git a/jpeglib.h b/jpeglib.h index 7386a2ed7..e5329cff3 100644 --- a/jpeglib.h +++ b/jpeglib.h @@ -23,10 +23,10 @@ * manual configuration options that most people need not worry about. 
*/ -#ifndef JCONFIG_INCLUDED /* in case jinclude.h already did */ -#include "jconfig.h" /* widely used configuration options */ +#ifndef JCONFIG_INCLUDED /* in case jinclude.h already did */ +#include "jconfig.h" /* widely used configuration options */ #endif -#include "jmorecfg.h" /* seldom changed options */ +#include "jmorecfg.h" /* seldom changed options */ #ifdef __cplusplus @@ -41,13 +41,13 @@ extern "C" { * if you want to be compatible. */ -#define DCTSIZE 8 /* The basic DCT block is 8x8 samples */ -#define DCTSIZE2 64 /* DCTSIZE squared; # of elements in a block */ -#define NUM_QUANT_TBLS 4 /* Quantization tables are numbered 0..3 */ -#define NUM_HUFF_TBLS 4 /* Huffman tables are numbered 0..3 */ -#define NUM_ARITH_TBLS 16 /* Arith-coding tables are numbered 0..15 */ -#define MAX_COMPS_IN_SCAN 4 /* JPEG limit on # of components in one scan */ -#define MAX_SAMP_FACTOR 4 /* JPEG limit on sampling factors */ +#define DCTSIZE 8 /* The basic DCT block is 8x8 samples */ +#define DCTSIZE2 64 /* DCTSIZE squared; # of elements in a block */ +#define NUM_QUANT_TBLS 4 /* Quantization tables are numbered 0..3 */ +#define NUM_HUFF_TBLS 4 /* Huffman tables are numbered 0..3 */ +#define NUM_ARITH_TBLS 16 /* Arith-coding tables are numbered 0..15 */ +#define MAX_COMPS_IN_SCAN 4 /* JPEG limit on # of components in one scan */ +#define MAX_SAMP_FACTOR 4 /* JPEG limit on sampling factors */ /* Unfortunately, some bozo at Adobe saw no reason to be bound by the standard; * the PostScript DCT filter can emit files with many more than 10 blocks/MCU. * If you happen to run across such a file, you can up D_MAX_BLOCKS_IN_MCU @@ -66,16 +66,16 @@ extern "C" { * but the pointer arrays can fit in near memory. */ -typedef JSAMPLE FAR *JSAMPROW; /* ptr to one image row of pixel samples. 
*/ -typedef JSAMPROW *JSAMPARRAY; /* ptr to some rows (a 2-D sample array) */ -typedef JSAMPARRAY *JSAMPIMAGE; /* a 3-D sample array: top index is color */ +typedef JSAMPLE FAR *JSAMPROW; /* ptr to one image row of pixel samples. */ +typedef JSAMPROW *JSAMPARRAY; /* ptr to some rows (a 2-D sample array) */ +typedef JSAMPARRAY *JSAMPIMAGE; /* a 3-D sample array: top index is color */ -typedef JCOEF JBLOCK[DCTSIZE2]; /* one block of coefficients */ -typedef JBLOCK FAR *JBLOCKROW; /* pointer to one row of coefficient blocks */ -typedef JBLOCKROW *JBLOCKARRAY; /* a 2-D array of coefficient blocks */ -typedef JBLOCKARRAY *JBLOCKIMAGE; /* a 3-D array of coefficient blocks */ +typedef JCOEF JBLOCK[DCTSIZE2]; /* one block of coefficients */ +typedef JBLOCK FAR *JBLOCKROW; /* pointer to one row of coefficient blocks */ +typedef JBLOCKROW *JBLOCKARRAY; /* a 2-D array of coefficient blocks */ +typedef JBLOCKARRAY *JBLOCKIMAGE; /* a 3-D array of coefficient blocks */ -typedef JCOEF FAR *JCOEFPTR; /* useful in a couple of places */ +typedef JCOEF FAR *JCOEFPTR; /* useful in a couple of places */ /* Types for JPEG compression parameters and working tables. */ @@ -88,13 +88,13 @@ typedef struct { * (not the zigzag order in which they are stored in a JPEG DQT marker). * CAUTION: IJG versions prior to v6a kept this array in zigzag order. */ - UINT16 quantval[DCTSIZE2]; /* quantization step for each coefficient */ + UINT16 quantval[DCTSIZE2]; /* quantization step for each coefficient */ /* This field is used only during compression. It's initialized FALSE when * the table is created, and set TRUE when it's been output to the file. * You could suppress output of a table by setting this to TRUE. * (See jpeg_suppress_tables for an example.) 
*/ - boolean sent_table; /* TRUE when table has been output */ + boolean sent_table; /* TRUE when table has been output */ } JQUANT_TBL; @@ -102,15 +102,15 @@ typedef struct { typedef struct { /* These two fields directly represent the contents of a JPEG DHT marker */ - UINT8 bits[17]; /* bits[k] = # of symbols with codes of */ - /* length k bits; bits[0] is unused */ - UINT8 huffval[256]; /* The symbols, in order of incr code length */ + UINT8 bits[17]; /* bits[k] = # of symbols with codes of */ + /* length k bits; bits[0] is unused */ + UINT8 huffval[256]; /* The symbols, in order of incr code length */ /* This field is used only during compression. It's initialized FALSE when * the table is created, and set TRUE when it's been output to the file. * You could suppress output of a table by setting this to TRUE. * (See jpeg_suppress_tables for an example.) */ - boolean sent_table; /* TRUE when table has been output */ + boolean sent_table; /* TRUE when table has been output */ } JHUFF_TBL; @@ -120,20 +120,20 @@ typedef struct { /* These values are fixed over the whole image. */ /* For compression, they must be supplied by parameter setup; */ /* for decompression, they are read from the SOF marker. */ - int component_id; /* identifier for this component (0..255) */ - int component_index; /* its index in SOF or cinfo->comp_info[] */ - int h_samp_factor; /* horizontal sampling factor (1..4) */ - int v_samp_factor; /* vertical sampling factor (1..4) */ - int quant_tbl_no; /* quantization table selector (0..3) */ + int component_id; /* identifier for this component (0..255) */ + int component_index; /* its index in SOF or cinfo->comp_info[] */ + int h_samp_factor; /* horizontal sampling factor (1..4) */ + int v_samp_factor; /* vertical sampling factor (1..4) */ + int quant_tbl_no; /* quantization table selector (0..3) */ /* These values may vary between scans. 
*/ /* For compression, they must be supplied by parameter setup; */ /* for decompression, they are read from the SOS marker. */ /* The decompressor output side may not use these variables. */ - int dc_tbl_no; /* DC entropy table selector (0..3) */ - int ac_tbl_no; /* AC entropy table selector (0..3) */ - + int dc_tbl_no; /* DC entropy table selector (0..3) */ + int ac_tbl_no; /* AC entropy table selector (0..3) */ + /* Remaining fields should be treated as private by applications. */ - + /* These values are computed during compression or decompression startup: */ /* Component's size in DCT blocks. * Any dummy blocks added to complete an MCU are not counted; therefore @@ -159,22 +159,22 @@ typedef struct { * and similarly for height. For decompression, IDCT scaling is included, so * downsampled_width = ceil(image_width * Hi/Hmax * DCT_[h_]scaled_size/DCTSIZE) */ - JDIMENSION downsampled_width; /* actual width in samples */ + JDIMENSION downsampled_width; /* actual width in samples */ JDIMENSION downsampled_height; /* actual height in samples */ /* This flag is used only for decompression. In cases where some of the * components will be ignored (eg grayscale output from YCbCr image), * we can skip most computations for the unused components. */ - boolean component_needed; /* do we need the value of this component? */ + boolean component_needed; /* do we need the value of this component? */ /* These values are computed before starting a scan of the component. */ /* The decompressor output side may not use these variables. 
*/ - int MCU_width; /* number of blocks per MCU, horizontally */ - int MCU_height; /* number of blocks per MCU, vertically */ - int MCU_blocks; /* MCU_width * MCU_height */ - int MCU_sample_width; /* MCU width in samples, MCU_width*DCT_[h_]scaled_size */ - int last_col_width; /* # of non-dummy blocks across in last MCU */ - int last_row_height; /* # of non-dummy blocks down in last MCU */ + int MCU_width; /* number of blocks per MCU, horizontally */ + int MCU_height; /* number of blocks per MCU, vertically */ + int MCU_blocks; /* MCU_width * MCU_height */ + int MCU_sample_width; /* MCU width in samples, MCU_width*DCT_[h_]scaled_size */ + int last_col_width; /* # of non-dummy blocks across in last MCU */ + int last_row_height; /* # of non-dummy blocks down in last MCU */ /* Saved quantization table for component; NULL if none yet saved. * See jdinput.c comments about the need for this information. @@ -190,10 +190,10 @@ typedef struct { /* The script for encoding a multiple-scan file is an array of these: */ typedef struct { - int comps_in_scan; /* number of components encoded in this scan */ + int comps_in_scan; /* number of components encoded in this scan */ int component_index[MAX_COMPS_IN_SCAN]; /* their SOF/comp_info[] indexes */ - int Ss, Se; /* progressive JPEG spectral selection parms */ - int Ah, Al; /* progressive JPEG successive approx. parms */ + int Ss, Se; /* progressive JPEG spectral selection parms */ + int Ah, Al; /* progressive JPEG successive approx. 
parms */ } jpeg_scan_info; /* The decompressor can save APPn and COM markers in a list of these: */ @@ -201,11 +201,11 @@ typedef struct { typedef struct jpeg_marker_struct FAR * jpeg_saved_marker_ptr; struct jpeg_marker_struct { - jpeg_saved_marker_ptr next; /* next in list, or NULL */ - UINT8 marker; /* marker code: JPEG_COM, or JPEG_APP0+n */ - unsigned int original_length; /* # bytes of data in the file */ - unsigned int data_length; /* # bytes of data saved at data[] */ - JOCTET FAR * data; /* the data contained in the marker */ + jpeg_saved_marker_ptr next; /* next in list, or NULL */ + UINT8 marker; /* marker code: JPEG_COM, or JPEG_APP0+n */ + unsigned int original_length; /* # bytes of data in the file */ + unsigned int data_length; /* # bytes of data saved at data[] */ + JOCTET FAR * data; /* the data contained in the marker */ /* the marker length word is not counted in data_length or original_length */ }; @@ -215,72 +215,72 @@ struct jpeg_marker_struct { #define JCS_ALPHA_EXTENSIONS 1 typedef enum { - JCS_UNKNOWN, /* error/unspecified */ - JCS_GRAYSCALE, /* monochrome */ - JCS_RGB, /* red/green/blue as specified by the RGB_RED, RGB_GREEN, - RGB_BLUE, and RGB_PIXELSIZE macros */ - JCS_YCbCr, /* Y/Cb/Cr (also known as YUV) */ - JCS_CMYK, /* C/M/Y/K */ - JCS_YCCK, /* Y/Cb/Cr/K */ - JCS_EXT_RGB, /* red/green/blue */ - JCS_EXT_RGBX, /* red/green/blue/x */ - JCS_EXT_BGR, /* blue/green/red */ - JCS_EXT_BGRX, /* blue/green/red/x */ - JCS_EXT_XBGR, /* x/blue/green/red */ - JCS_EXT_XRGB, /* x/red/green/blue */ - /* When out_color_space it set to JCS_EXT_RGBX, JCS_EXT_BGRX, - JCS_EXT_XBGR, or JCS_EXT_XRGB during decompression, the X byte is - undefined, and in order to ensure the best performance, - libjpeg-turbo can set that byte to whatever value it wishes. Use - the following colorspace constants to ensure that the X byte is set - to 0xFF, so that it can be interpreted as an opaque alpha - channel. 
*/ - JCS_EXT_RGBA, /* red/green/blue/alpha */ - JCS_EXT_BGRA, /* blue/green/red/alpha */ - JCS_EXT_ABGR, /* alpha/blue/green/red */ - JCS_EXT_ARGB /* alpha/red/green/blue */ + JCS_UNKNOWN, /* error/unspecified */ + JCS_GRAYSCALE, /* monochrome */ + JCS_RGB, /* red/green/blue as specified by the RGB_RED, RGB_GREEN, + RGB_BLUE, and RGB_PIXELSIZE macros */ + JCS_YCbCr, /* Y/Cb/Cr (also known as YUV) */ + JCS_CMYK, /* C/M/Y/K */ + JCS_YCCK, /* Y/Cb/Cr/K */ + JCS_EXT_RGB, /* red/green/blue */ + JCS_EXT_RGBX, /* red/green/blue/x */ + JCS_EXT_BGR, /* blue/green/red */ + JCS_EXT_BGRX, /* blue/green/red/x */ + JCS_EXT_XBGR, /* x/blue/green/red */ + JCS_EXT_XRGB, /* x/red/green/blue */ + /* When out_color_space it set to JCS_EXT_RGBX, JCS_EXT_BGRX, + JCS_EXT_XBGR, or JCS_EXT_XRGB during decompression, the X byte is + undefined, and in order to ensure the best performance, + libjpeg-turbo can set that byte to whatever value it wishes. Use + the following colorspace constants to ensure that the X byte is set + to 0xFF, so that it can be interpreted as an opaque alpha + channel. */ + JCS_EXT_RGBA, /* red/green/blue/alpha */ + JCS_EXT_BGRA, /* blue/green/red/alpha */ + JCS_EXT_ABGR, /* alpha/blue/green/red */ + JCS_EXT_ARGB /* alpha/red/green/blue */ } J_COLOR_SPACE; /* DCT/IDCT algorithm options. 
*/ typedef enum { - JDCT_ISLOW, /* slow but accurate integer algorithm */ - JDCT_IFAST, /* faster, less accurate integer method */ - JDCT_FLOAT /* floating-point: accurate, fast on fast HW */ + JDCT_ISLOW, /* slow but accurate integer algorithm */ + JDCT_IFAST, /* faster, less accurate integer method */ + JDCT_FLOAT /* floating-point: accurate, fast on fast HW */ } J_DCT_METHOD; -#ifndef JDCT_DEFAULT /* may be overridden in jconfig.h */ +#ifndef JDCT_DEFAULT /* may be overridden in jconfig.h */ #define JDCT_DEFAULT JDCT_ISLOW #endif -#ifndef JDCT_FASTEST /* may be overridden in jconfig.h */ +#ifndef JDCT_FASTEST /* may be overridden in jconfig.h */ #define JDCT_FASTEST JDCT_IFAST #endif /* Dithering options for decompression. */ typedef enum { - JDITHER_NONE, /* no dithering */ - JDITHER_ORDERED, /* simple ordered dither */ - JDITHER_FS /* Floyd-Steinberg error diffusion dither */ + JDITHER_NONE, /* no dithering */ + JDITHER_ORDERED, /* simple ordered dither */ + JDITHER_FS /* Floyd-Steinberg error diffusion dither */ } J_DITHER_MODE; /* Common fields between JPEG compression and decompression master structs. */ #define jpeg_common_fields \ - struct jpeg_error_mgr * err; /* Error handler module */\ - struct jpeg_memory_mgr * mem; /* Memory manager module */\ + struct jpeg_error_mgr * err; /* Error handler module */\ + struct jpeg_memory_mgr * mem; /* Memory manager module */\ struct jpeg_progress_mgr * progress; /* Progress monitor, or NULL if none */\ - void * client_data; /* Available for use by application */\ - boolean is_decompressor; /* So common code can tell which is which */\ - int global_state /* For checking call sequence validity */ + void * client_data; /* Available for use by application */\ + boolean is_decompressor; /* So common code can tell which is which */\ + int global_state /* For checking call sequence validity */ /* Routines that are to be used by both halves of the library are declared * to receive a pointer to this structure. 
There are no actual instances of * jpeg_common_struct, only of jpeg_compress_struct and jpeg_decompress_struct. */ struct jpeg_common_struct { - jpeg_common_fields; /* Fields common to both master struct types */ + jpeg_common_fields; /* Fields common to both master struct types */ /* Additional fields follow in an actual jpeg_compress_struct or * jpeg_decompress_struct. All three structs must agree on these * initial fields! (This would be a lot cleaner in C++.) @@ -295,7 +295,7 @@ typedef struct jpeg_decompress_struct * j_decompress_ptr; /* Master record for a compression instance */ struct jpeg_compress_struct { - jpeg_common_fields; /* Fields shared with jpeg_decompress_struct */ + jpeg_common_fields; /* Fields shared with jpeg_decompress_struct */ /* Destination for compressed data */ struct jpeg_destination_mgr * dest; @@ -305,12 +305,12 @@ struct jpeg_compress_struct { * be correct before you can even call jpeg_set_defaults(). */ - JDIMENSION image_width; /* input image width */ - JDIMENSION image_height; /* input image height */ - int input_components; /* # of color components in input image */ - J_COLOR_SPACE in_color_space; /* colorspace of input image */ + JDIMENSION image_width; /* input image width */ + JDIMENSION image_height; /* input image height */ + int input_components; /* # of color components in input image */ + J_COLOR_SPACE in_color_space; /* colorspace of input image */ - double input_gamma; /* image gamma of input image */ + double input_gamma; /* image gamma of input image */ /* Compression parameters --- these fields must be set before calling * jpeg_start_compress(). 
We recommend calling jpeg_set_defaults() to @@ -323,8 +323,8 @@ struct jpeg_compress_struct { #if JPEG_LIB_VERSION >= 70 unsigned int scale_num, scale_denom; /* fraction by which to scale image */ - JDIMENSION jpeg_width; /* scaled JPEG image width */ - JDIMENSION jpeg_height; /* scaled JPEG image height */ + JDIMENSION jpeg_width; /* scaled JPEG image width */ + JDIMENSION jpeg_height; /* scaled JPEG image height */ /* Dimensions of actual JPEG image that will be written to file, * derived from input dimensions by scaling factors above. * These fields are computed by jpeg_start_compress(). @@ -333,9 +333,9 @@ struct jpeg_compress_struct { */ #endif - int data_precision; /* bits of precision in image data */ + int data_precision; /* bits of precision in image data */ - int num_components; /* # of color components in JPEG image */ + int num_components; /* # of color components in JPEG image */ J_COLOR_SPACE jpeg_color_space; /* colorspace of JPEG image */ jpeg_component_info * comp_info; @@ -357,22 +357,22 @@ struct jpeg_compress_struct { UINT8 arith_dc_U[NUM_ARITH_TBLS]; /* U values for DC arith-coding tables */ UINT8 arith_ac_K[NUM_ARITH_TBLS]; /* Kx values for AC arith-coding tables */ - int num_scans; /* # of entries in scan_info array */ + int num_scans; /* # of entries in scan_info array */ const jpeg_scan_info * scan_info; /* script for multi-scan file, or NULL */ /* The default value of scan_info is NULL, which causes a single-scan * sequential JPEG file to be emitted. To create a multi-scan file, * set num_scans and scan_info to point to an array of scan definitions. 
*/ - boolean raw_data_in; /* TRUE=caller supplies downsampled data */ - boolean arith_code; /* TRUE=arithmetic coding, FALSE=Huffman */ - boolean optimize_coding; /* TRUE=optimize entropy encoding parms */ - boolean CCIR601_sampling; /* TRUE=first samples are cosited */ + boolean raw_data_in; /* TRUE=caller supplies downsampled data */ + boolean arith_code; /* TRUE=arithmetic coding, FALSE=Huffman */ + boolean optimize_coding; /* TRUE=optimize entropy encoding parms */ + boolean CCIR601_sampling; /* TRUE=first samples are cosited */ #if JPEG_LIB_VERSION >= 70 boolean do_fancy_downsampling; /* TRUE=apply fancy downsampling */ #endif - int smoothing_factor; /* 1..100, or 0 for no input smoothing */ - J_DCT_METHOD dct_method; /* DCT algorithm selector */ + int smoothing_factor; /* 1..100, or 0 for no input smoothing */ + J_DCT_METHOD dct_method; /* DCT algorithm selector */ /* The restart interval can be specified in absolute MCUs by setting * restart_interval, or in MCU rows by setting restart_in_rows @@ -380,28 +380,28 @@ struct jpeg_compress_struct { * for each scan). */ unsigned int restart_interval; /* MCUs per restart, or 0 for no restart */ - int restart_in_rows; /* if > 0, MCU rows per restart interval */ + int restart_in_rows; /* if > 0, MCU rows per restart interval */ /* Parameters controlling emission of special markers. */ - boolean write_JFIF_header; /* should a JFIF marker be written? */ - UINT8 JFIF_major_version; /* What to write for the JFIF version number */ + boolean write_JFIF_header; /* should a JFIF marker be written? */ + UINT8 JFIF_major_version; /* What to write for the JFIF version number */ UINT8 JFIF_minor_version; /* These three values are not used by the JPEG code, merely copied */ /* into the JFIF APP0 marker. density_unit can be 0 for unknown, */ /* 1 for dots/inch, or 2 for dots/cm. Note that the pixel aspect */ /* ratio is defined by X_density/Y_density even when density_unit=0. 
*/ - UINT8 density_unit; /* JFIF code for pixel size units */ - UINT16 X_density; /* Horizontal pixel density */ - UINT16 Y_density; /* Vertical pixel density */ - boolean write_Adobe_marker; /* should an Adobe marker be written? */ - + UINT8 density_unit; /* JFIF code for pixel size units */ + UINT16 X_density; /* Horizontal pixel density */ + UINT16 Y_density; /* Vertical pixel density */ + boolean write_Adobe_marker; /* should an Adobe marker be written? */ + /* State variable: index of next scanline to be written to * jpeg_write_scanlines(). Application may use this to control its * processing loop, e.g., "while (next_scanline < image_height)". */ - JDIMENSION next_scanline; /* 0 .. image_height-1 */ + JDIMENSION next_scanline; /* 0 .. image_height-1 */ /* Remaining fields are known throughout compressor, but generally * should not be touched by a surrounding application. @@ -410,44 +410,44 @@ struct jpeg_compress_struct { /* * These fields are computed during compression startup */ - boolean progressive_mode; /* TRUE if scan script uses progressive mode */ - int max_h_samp_factor; /* largest h_samp_factor */ - int max_v_samp_factor; /* largest v_samp_factor */ + boolean progressive_mode; /* TRUE if scan script uses progressive mode */ + int max_h_samp_factor; /* largest h_samp_factor */ + int max_v_samp_factor; /* largest v_samp_factor */ #if JPEG_LIB_VERSION >= 70 - int min_DCT_h_scaled_size; /* smallest DCT_h_scaled_size of any component */ - int min_DCT_v_scaled_size; /* smallest DCT_v_scaled_size of any component */ + int min_DCT_h_scaled_size; /* smallest DCT_h_scaled_size of any component */ + int min_DCT_v_scaled_size; /* smallest DCT_v_scaled_size of any component */ #endif - JDIMENSION total_iMCU_rows; /* # of iMCU rows to be input to coef ctlr */ + JDIMENSION total_iMCU_rows; /* # of iMCU rows to be input to coef ctlr */ /* The coefficient controller receives data in units of MCU rows as defined * for fully interleaved scans (whether the JPEG file is 
interleaved or not). * There are v_samp_factor * DCTSIZE sample rows of each component in an * "iMCU" (interleaved MCU) row. */ - + /* * These fields are valid during any one scan. * They describe the components and MCUs actually appearing in the scan. */ - int comps_in_scan; /* # of JPEG components in this scan */ + int comps_in_scan; /* # of JPEG components in this scan */ jpeg_component_info * cur_comp_info[MAX_COMPS_IN_SCAN]; /* *cur_comp_info[i] describes component that appears i'th in SOS */ - - JDIMENSION MCUs_per_row; /* # of MCUs across the image */ - JDIMENSION MCU_rows_in_scan; /* # of MCU rows in the image */ - - int blocks_in_MCU; /* # of DCT blocks per MCU */ + + JDIMENSION MCUs_per_row; /* # of MCUs across the image */ + JDIMENSION MCU_rows_in_scan; /* # of MCU rows in the image */ + + int blocks_in_MCU; /* # of DCT blocks per MCU */ int MCU_membership[C_MAX_BLOCKS_IN_MCU]; /* MCU_membership[i] is index in cur_comp_info of component owning */ /* i'th block in an MCU */ - int Ss, Se, Ah, Al; /* progressive JPEG parameters for scan */ + int Ss, Se, Ah, Al; /* progressive JPEG parameters for scan */ #if JPEG_LIB_VERSION >= 80 - int block_size; /* the basic DCT block size: 1..16 */ - const int * natural_order; /* natural-order position array */ - int lim_Se; /* min( Se, DCTSIZE2-1 ) */ + int block_size; /* the basic DCT block size: 1..16 */ + const int * natural_order; /* natural-order position array */ + int lim_Se; /* min( Se, DCTSIZE2-1 ) */ #endif /* @@ -470,7 +470,7 @@ struct jpeg_compress_struct { /* Master record for a decompression instance */ struct jpeg_decompress_struct { - jpeg_common_fields; /* Fields shared with jpeg_compress_struct */ + jpeg_common_fields; /* Fields shared with jpeg_compress_struct */ /* Source of compressed data */ struct jpeg_source_mgr * src; @@ -478,9 +478,9 @@ struct jpeg_decompress_struct { /* Basic description of image --- filled in by jpeg_read_header(). 
*/ /* Application may inspect these values to decide how to process image. */ - JDIMENSION image_width; /* nominal image width (from SOF marker) */ - JDIMENSION image_height; /* nominal image height */ - int num_components; /* # of color components in JPEG image */ + JDIMENSION image_width; /* nominal image width (from SOF marker) */ + JDIMENSION image_height; /* nominal image height */ + int num_components; /* # of color components in JPEG image */ J_COLOR_SPACE jpeg_color_space; /* colorspace of JPEG image */ /* Decompression processing parameters --- these fields must be set before @@ -492,24 +492,24 @@ struct jpeg_decompress_struct { unsigned int scale_num, scale_denom; /* fraction by which to scale image */ - double output_gamma; /* image gamma wanted in output */ + double output_gamma; /* image gamma wanted in output */ - boolean buffered_image; /* TRUE=multiple output passes */ - boolean raw_data_out; /* TRUE=downsampled data wanted */ + boolean buffered_image; /* TRUE=multiple output passes */ + boolean raw_data_out; /* TRUE=downsampled data wanted */ - J_DCT_METHOD dct_method; /* IDCT algorithm selector */ - boolean do_fancy_upsampling; /* TRUE=apply fancy upsampling */ - boolean do_block_smoothing; /* TRUE=apply interblock smoothing */ + J_DCT_METHOD dct_method; /* IDCT algorithm selector */ + boolean do_fancy_upsampling; /* TRUE=apply fancy upsampling */ + boolean do_block_smoothing; /* TRUE=apply interblock smoothing */ - boolean quantize_colors; /* TRUE=colormapped output wanted */ + boolean quantize_colors; /* TRUE=colormapped output wanted */ /* the following are ignored if not quantize_colors: */ - J_DITHER_MODE dither_mode; /* type of color dithering to use */ - boolean two_pass_quantize; /* TRUE=use two-pass color quantization */ - int desired_number_of_colors; /* max # colors to use in created colormap */ + J_DITHER_MODE dither_mode; /* type of color dithering to use */ + boolean two_pass_quantize; /* TRUE=use two-pass color quantization */ + int 
desired_number_of_colors; /* max # colors to use in created colormap */ /* these are significant only in buffered-image mode: */ - boolean enable_1pass_quant; /* enable future use of 1-pass quantizer */ + boolean enable_1pass_quant; /* enable future use of 1-pass quantizer */ boolean enable_external_quant;/* enable future use of external colormap */ - boolean enable_2pass_quant; /* enable future use of 2-pass quantizer */ + boolean enable_2pass_quant; /* enable future use of 2-pass quantizer */ /* Description of actual output image that will be returned to application. * These fields are computed by jpeg_start_decompress(). @@ -517,14 +517,14 @@ struct jpeg_decompress_struct { * in advance of calling jpeg_start_decompress(). */ - JDIMENSION output_width; /* scaled image width */ - JDIMENSION output_height; /* scaled image height */ - int out_color_components; /* # of color components in out_color_space */ - int output_components; /* # of color components returned */ + JDIMENSION output_width; /* scaled image width */ + JDIMENSION output_height; /* scaled image height */ + int out_color_components; /* # of color components in out_color_space */ + int output_components; /* # of color components returned */ /* output_components is 1 (a colormap index) when quantizing colors; * otherwise it equals out_color_components. */ - int rec_outbuf_height; /* min recommended height of scanline buffer */ + int rec_outbuf_height; /* min recommended height of scanline buffer */ /* If the buffer passed to jpeg_read_scanlines() is less than this many rows * high, space and time will be wasted due to unnecessary data copying. * Usually rec_outbuf_height will be 1 or 2, at most 4. @@ -536,8 +536,8 @@ struct jpeg_decompress_struct { * jpeg_start_decompress or jpeg_start_output. * The map has out_color_components rows and actual_number_of_colors columns. 
*/ - int actual_number_of_colors; /* number of entries in use */ - JSAMPARRAY colormap; /* The color map as a 2-D pixel array */ + int actual_number_of_colors; /* number of entries in use */ + JSAMPARRAY colormap; /* The color map as a 2-D pixel array */ /* State variables: these variables indicate the progress of decompression. * The application may examine these but must not modify them. @@ -547,20 +547,20 @@ struct jpeg_decompress_struct { * Application may use this to control its processing loop, e.g., * "while (output_scanline < output_height)". */ - JDIMENSION output_scanline; /* 0 .. output_height-1 */ + JDIMENSION output_scanline; /* 0 .. output_height-1 */ /* Current input scan number and number of iMCU rows completed in scan. * These indicate the progress of the decompressor input side. */ - int input_scan_number; /* Number of SOS markers seen so far */ - JDIMENSION input_iMCU_row; /* Number of iMCU rows completed */ + int input_scan_number; /* Number of SOS markers seen so far */ + JDIMENSION input_iMCU_row; /* Number of iMCU rows completed */ /* The "output scan number" is the notional scan being displayed by the * output side. The decompressor will not allow output scan/row number * to get ahead of input scan/row, but it can fall arbitrarily far behind. */ - int output_scan_number; /* Nominal scan number being displayed */ - JDIMENSION output_iMCU_row; /* Number of iMCU rows read */ + int output_scan_number; /* Nominal scan number being displayed */ + JDIMENSION output_iMCU_row; /* Number of iMCU rows read */ /* Current progression status. coef_bits[c][i] indicates the precision * with which component c's DCT coefficient i (in zigzag order) is known. @@ -569,7 +569,7 @@ struct jpeg_decompress_struct { * (thus, 0 at completion of the progression). * This pointer is NULL when reading a non-progressive file. 
*/ - int (*coef_bits)[DCTSIZE2]; /* -1 or current Al value for each coef */ + int (*coef_bits)[DCTSIZE2]; /* -1 or current Al value for each coef */ /* Internal JPEG parameters --- the application usually need not look at * these fields. Note that the decompressor output side may not use @@ -591,16 +591,16 @@ struct jpeg_decompress_struct { * are given in SOF/SOS markers or defined to be reset by SOI. */ - int data_precision; /* bits of precision in image data */ + int data_precision; /* bits of precision in image data */ jpeg_component_info * comp_info; /* comp_info[i] describes component that appears i'th in SOF */ #if JPEG_LIB_VERSION >= 80 - boolean is_baseline; /* TRUE if Baseline SOF0 encountered */ + boolean is_baseline; /* TRUE if Baseline SOF0 encountered */ #endif - boolean progressive_mode; /* TRUE if SOFn specifies progressive mode */ - boolean arith_code; /* TRUE=arithmetic coding, FALSE=Huffman */ + boolean progressive_mode; /* TRUE if SOFn specifies progressive mode */ + boolean arith_code; /* TRUE=arithmetic coding, FALSE=Huffman */ UINT8 arith_dc_L[NUM_ARITH_TBLS]; /* L values for DC arith-coding tables */ UINT8 arith_dc_U[NUM_ARITH_TBLS]; /* U values for DC arith-coding tables */ @@ -611,17 +611,17 @@ struct jpeg_decompress_struct { /* These fields record data obtained from optional markers recognized by * the JPEG library. 
*/ - boolean saw_JFIF_marker; /* TRUE iff a JFIF APP0 marker was found */ + boolean saw_JFIF_marker; /* TRUE iff a JFIF APP0 marker was found */ /* Data copied from JFIF marker; only valid if saw_JFIF_marker is TRUE: */ - UINT8 JFIF_major_version; /* JFIF version number */ + UINT8 JFIF_major_version; /* JFIF version number */ UINT8 JFIF_minor_version; - UINT8 density_unit; /* JFIF code for pixel size units */ - UINT16 X_density; /* Horizontal pixel density */ - UINT16 Y_density; /* Vertical pixel density */ - boolean saw_Adobe_marker; /* TRUE iff an Adobe APP14 marker was found */ - UINT8 Adobe_transform; /* Color transform code from Adobe marker */ + UINT8 density_unit; /* JFIF code for pixel size units */ + UINT16 X_density; /* Horizontal pixel density */ + UINT16 Y_density; /* Vertical pixel density */ + boolean saw_Adobe_marker; /* TRUE iff an Adobe APP14 marker was found */ + UINT8 Adobe_transform; /* Color transform code from Adobe marker */ - boolean CCIR601_sampling; /* TRUE=first samples are cosited */ + boolean CCIR601_sampling; /* TRUE=first samples are cosited */ /* Aside from the specific data retained from APPn markers known to the * library, the uninterpreted contents of any or all APPn and COM markers @@ -636,17 +636,17 @@ struct jpeg_decompress_struct { /* * These fields are computed during decompression startup */ - int max_h_samp_factor; /* largest h_samp_factor */ - int max_v_samp_factor; /* largest v_samp_factor */ + int max_h_samp_factor; /* largest h_samp_factor */ + int max_v_samp_factor; /* largest v_samp_factor */ #if JPEG_LIB_VERSION >= 70 - int min_DCT_h_scaled_size; /* smallest DCT_h_scaled_size of any component */ - int min_DCT_v_scaled_size; /* smallest DCT_v_scaled_size of any component */ + int min_DCT_h_scaled_size; /* smallest DCT_h_scaled_size of any component */ + int min_DCT_v_scaled_size; /* smallest DCT_v_scaled_size of any component */ #else - int min_DCT_scaled_size; /* smallest DCT_scaled_size of any component */ + int 
min_DCT_scaled_size; /* smallest DCT_scaled_size of any component */ #endif - JDIMENSION total_iMCU_rows; /* # of iMCU rows in image */ + JDIMENSION total_iMCU_rows; /* # of iMCU rows in image */ /* The coefficient controller's input and output progress is measured in * units of "iMCU" (interleaved MCU) rows. These are the same as MCU rows * in fully interleaved JPEG scans, but are used whether the scan is @@ -662,26 +662,26 @@ struct jpeg_decompress_struct { * They describe the components and MCUs actually appearing in the scan. * Note that the decompressor output side must not use these fields. */ - int comps_in_scan; /* # of JPEG components in this scan */ + int comps_in_scan; /* # of JPEG components in this scan */ jpeg_component_info * cur_comp_info[MAX_COMPS_IN_SCAN]; /* *cur_comp_info[i] describes component that appears i'th in SOS */ - JDIMENSION MCUs_per_row; /* # of MCUs across the image */ - JDIMENSION MCU_rows_in_scan; /* # of MCU rows in the image */ + JDIMENSION MCUs_per_row; /* # of MCUs across the image */ + JDIMENSION MCU_rows_in_scan; /* # of MCU rows in the image */ - int blocks_in_MCU; /* # of DCT blocks per MCU */ + int blocks_in_MCU; /* # of DCT blocks per MCU */ int MCU_membership[D_MAX_BLOCKS_IN_MCU]; /* MCU_membership[i] is index in cur_comp_info of component owning */ /* i'th block in an MCU */ - int Ss, Se, Ah, Al; /* progressive JPEG parameters for scan */ + int Ss, Se, Ah, Al; /* progressive JPEG parameters for scan */ #if JPEG_LIB_VERSION >= 80 /* These fields are derived from Se of first SOS marker. */ - int block_size; /* the basic DCT block size: 1..16 */ + int block_size; /* the basic DCT block size: 1..16 */ const int * natural_order; /* natural-order position array for entropy decode */ - int lim_Se; /* min( Se, DCTSIZE2-1 ) for entropy decode */ + int lim_Se; /* min( Se, DCTSIZE2-1 ) for entropy decode */ #endif /* This field is shared between entropy decoder and marker parser. 
@@ -726,10 +726,10 @@ struct jpeg_error_mgr { JMETHOD(void, output_message, (j_common_ptr cinfo)); /* Format a message string for the most recent JPEG error or message */ JMETHOD(void, format_message, (j_common_ptr cinfo, char * buffer)); -#define JMSG_LENGTH_MAX 200 /* recommended size of format_message buffer */ +#define JMSG_LENGTH_MAX 200 /* recommended size of format_message buffer */ /* Reset error state variables at start of a new image */ JMETHOD(void, reset_error_mgr, (j_common_ptr cinfo)); - + /* The message ID code and any parameters are saved here. * A message can have one string parameter or up to 8 int parameters. */ @@ -739,18 +739,18 @@ struct jpeg_error_mgr { int i[8]; char s[JMSG_STR_PARM_MAX]; } msg_parm; - + /* Standard state variables for error facility */ - - int trace_level; /* max msg_level that will be displayed */ - + + int trace_level; /* max msg_level that will be displayed */ + /* For recoverable corrupt-data errors, we emit a warning message, * but keep going unless emit_message chooses to abort. emit_message * should count warnings in num_warnings. The surrounding application * can check for bad data by seeing if num_warnings is nonzero at the * end of processing. */ - long num_warnings; /* number of corrupt-data warnings */ + long num_warnings; /* number of corrupt-data warnings */ /* These fields point to the table(s) of error message strings. * An application can change the table pointer to switch to a different @@ -768,8 +768,8 @@ struct jpeg_error_mgr { * It contains strings numbered first_addon_message..last_addon_message. 
*/ const char * const * addon_message_table; /* Non-library errors */ - int first_addon_message; /* code for first string in addon table */ - int last_addon_message; /* code for last string in addon table */ + int first_addon_message; /* code for first string in addon table */ + int last_addon_message; /* code for last string in addon table */ }; @@ -778,18 +778,18 @@ struct jpeg_error_mgr { struct jpeg_progress_mgr { JMETHOD(void, progress_monitor, (j_common_ptr cinfo)); - long pass_counter; /* work units completed in this pass */ - long pass_limit; /* total number of work units in this pass */ - int completed_passes; /* passes completed so far */ - int total_passes; /* total number of passes expected */ + long pass_counter; /* work units completed in this pass */ + long pass_limit; /* total number of work units in this pass */ + int completed_passes; /* passes completed so far */ + int total_passes; /* total number of passes expected */ }; /* Data destination object for compression */ struct jpeg_destination_mgr { - JOCTET * next_output_byte; /* => next byte to write in buffer */ - size_t free_in_buffer; /* # of byte spaces remaining in buffer */ + JOCTET * next_output_byte; /* => next byte to write in buffer */ + size_t free_in_buffer; /* # of byte spaces remaining in buffer */ JMETHOD(void, init_destination, (j_compress_ptr cinfo)); JMETHOD(boolean, empty_output_buffer, (j_compress_ptr cinfo)); @@ -801,7 +801,7 @@ struct jpeg_destination_mgr { struct jpeg_source_mgr { const JOCTET * next_input_byte; /* => next byte to read from buffer */ - size_t bytes_in_buffer; /* # of bytes remaining in buffer */ + size_t bytes_in_buffer; /* # of bytes remaining in buffer */ JMETHOD(void, init_source, (j_decompress_ptr cinfo)); JMETHOD(boolean, fill_input_buffer, (j_decompress_ptr cinfo)); @@ -822,9 +822,9 @@ struct jpeg_source_mgr { * successful. 
*/ -#define JPOOL_PERMANENT 0 /* lasts until master record is destroyed */ -#define JPOOL_IMAGE 1 /* lasts until done with image/datastream */ -#define JPOOL_NUMPOOLS 2 +#define JPOOL_PERMANENT 0 /* lasts until master record is destroyed */ +#define JPOOL_IMAGE 1 /* lasts until done with image/datastream */ +#define JPOOL_NUMPOOLS 2 typedef struct jvirt_sarray_control * jvirt_sarray_ptr; typedef struct jvirt_barray_control * jvirt_barray_ptr; @@ -833,38 +833,38 @@ typedef struct jvirt_barray_control * jvirt_barray_ptr; struct jpeg_memory_mgr { /* Method pointers */ JMETHOD(void *, alloc_small, (j_common_ptr cinfo, int pool_id, - size_t sizeofobject)); + size_t sizeofobject)); JMETHOD(void FAR *, alloc_large, (j_common_ptr cinfo, int pool_id, - size_t sizeofobject)); + size_t sizeofobject)); JMETHOD(JSAMPARRAY, alloc_sarray, (j_common_ptr cinfo, int pool_id, - JDIMENSION samplesperrow, - JDIMENSION numrows)); + JDIMENSION samplesperrow, + JDIMENSION numrows)); JMETHOD(JBLOCKARRAY, alloc_barray, (j_common_ptr cinfo, int pool_id, - JDIMENSION blocksperrow, - JDIMENSION numrows)); + JDIMENSION blocksperrow, + JDIMENSION numrows)); JMETHOD(jvirt_sarray_ptr, request_virt_sarray, (j_common_ptr cinfo, - int pool_id, - boolean pre_zero, - JDIMENSION samplesperrow, - JDIMENSION numrows, - JDIMENSION maxaccess)); + int pool_id, + boolean pre_zero, + JDIMENSION samplesperrow, + JDIMENSION numrows, + JDIMENSION maxaccess)); JMETHOD(jvirt_barray_ptr, request_virt_barray, (j_common_ptr cinfo, - int pool_id, - boolean pre_zero, - JDIMENSION blocksperrow, - JDIMENSION numrows, - JDIMENSION maxaccess)); + int pool_id, + boolean pre_zero, + JDIMENSION blocksperrow, + JDIMENSION numrows, + JDIMENSION maxaccess)); JMETHOD(void, realize_virt_arrays, (j_common_ptr cinfo)); JMETHOD(JSAMPARRAY, access_virt_sarray, (j_common_ptr cinfo, - jvirt_sarray_ptr ptr, - JDIMENSION start_row, - JDIMENSION num_rows, - boolean writable)); + jvirt_sarray_ptr ptr, + JDIMENSION start_row, + JDIMENSION 
num_rows, + boolean writable)); JMETHOD(JBLOCKARRAY, access_virt_barray, (j_common_ptr cinfo, - jvirt_barray_ptr ptr, - JDIMENSION start_row, - JDIMENSION num_rows, - boolean writable)); + jvirt_barray_ptr ptr, + JDIMENSION start_row, + JDIMENSION num_rows, + boolean writable)); JMETHOD(void, free_pool, (j_common_ptr cinfo, int pool_id)); JMETHOD(void, self_destruct, (j_common_ptr cinfo)); @@ -892,87 +892,87 @@ typedef JMETHOD(boolean, jpeg_marker_parser_method, (j_decompress_ptr cinfo)); */ #ifdef HAVE_PROTOTYPES -#define JPP(arglist) arglist +#define JPP(arglist) arglist #else -#define JPP(arglist) () +#define JPP(arglist) () #endif /* Short forms of external names for systems with brain-damaged linkers. * We shorten external names to be unique in the first six letters, which * is good enough for all known systems. - * (If your compiler itself needs names to be unique in less than 15 + * (If your compiler itself needs names to be unique in less than 15 * characters, you are out of luck. Get a better compiler.) 
*/ #ifdef NEED_SHORT_EXTERNAL_NAMES -#define jpeg_std_error jStdError -#define jpeg_CreateCompress jCreaCompress -#define jpeg_CreateDecompress jCreaDecompress -#define jpeg_destroy_compress jDestCompress -#define jpeg_destroy_decompress jDestDecompress -#define jpeg_stdio_dest jStdDest -#define jpeg_stdio_src jStdSrc +#define jpeg_std_error jStdError +#define jpeg_CreateCompress jCreaCompress +#define jpeg_CreateDecompress jCreaDecompress +#define jpeg_destroy_compress jDestCompress +#define jpeg_destroy_decompress jDestDecompress +#define jpeg_stdio_dest jStdDest +#define jpeg_stdio_src jStdSrc #if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED) -#define jpeg_mem_dest jMemDest -#define jpeg_mem_src jMemSrc +#define jpeg_mem_dest jMemDest +#define jpeg_mem_src jMemSrc #endif -#define jpeg_set_defaults jSetDefaults -#define jpeg_set_colorspace jSetColorspace -#define jpeg_default_colorspace jDefColorspace -#define jpeg_set_quality jSetQuality -#define jpeg_set_linear_quality jSetLQuality +#define jpeg_set_defaults jSetDefaults +#define jpeg_set_colorspace jSetColorspace +#define jpeg_default_colorspace jDefColorspace +#define jpeg_set_quality jSetQuality +#define jpeg_set_linear_quality jSetLQuality #if JPEG_LIB_VERSION >= 70 -#define jpeg_default_qtables jDefQTables +#define jpeg_default_qtables jDefQTables #endif -#define jpeg_add_quant_table jAddQuantTable -#define jpeg_quality_scaling jQualityScaling -#define jpeg_simple_progression jSimProgress -#define jpeg_suppress_tables jSuppressTables -#define jpeg_alloc_quant_table jAlcQTable -#define jpeg_alloc_huff_table jAlcHTable -#define jpeg_start_compress jStrtCompress -#define jpeg_write_scanlines jWrtScanlines -#define jpeg_finish_compress jFinCompress +#define jpeg_add_quant_table jAddQuantTable +#define jpeg_quality_scaling jQualityScaling +#define jpeg_simple_progression jSimProgress +#define jpeg_suppress_tables jSuppressTables +#define jpeg_alloc_quant_table jAlcQTable +#define jpeg_alloc_huff_table 
jAlcHTable +#define jpeg_start_compress jStrtCompress +#define jpeg_write_scanlines jWrtScanlines +#define jpeg_finish_compress jFinCompress #if JPEG_LIB_VERSION >= 70 -#define jpeg_calc_jpeg_dimensions jCjpegDimensions +#define jpeg_calc_jpeg_dimensions jCjpegDimensions #endif -#define jpeg_write_raw_data jWrtRawData -#define jpeg_write_marker jWrtMarker -#define jpeg_write_m_header jWrtMHeader -#define jpeg_write_m_byte jWrtMByte -#define jpeg_write_tables jWrtTables -#define jpeg_read_header jReadHeader -#define jpeg_start_decompress jStrtDecompress -#define jpeg_read_scanlines jReadScanlines -#define jpeg_finish_decompress jFinDecompress -#define jpeg_read_raw_data jReadRawData -#define jpeg_has_multiple_scans jHasMultScn -#define jpeg_start_output jStrtOutput -#define jpeg_finish_output jFinOutput -#define jpeg_input_complete jInComplete -#define jpeg_new_colormap jNewCMap -#define jpeg_consume_input jConsumeInput +#define jpeg_write_raw_data jWrtRawData +#define jpeg_write_marker jWrtMarker +#define jpeg_write_m_header jWrtMHeader +#define jpeg_write_m_byte jWrtMByte +#define jpeg_write_tables jWrtTables +#define jpeg_read_header jReadHeader +#define jpeg_start_decompress jStrtDecompress +#define jpeg_read_scanlines jReadScanlines +#define jpeg_finish_decompress jFinDecompress +#define jpeg_read_raw_data jReadRawData +#define jpeg_has_multiple_scans jHasMultScn +#define jpeg_start_output jStrtOutput +#define jpeg_finish_output jFinOutput +#define jpeg_input_complete jInComplete +#define jpeg_new_colormap jNewCMap +#define jpeg_consume_input jConsumeInput #if JPEG_LIB_VERSION >= 80 -#define jpeg_core_output_dimensions jCoreDimensions +#define jpeg_core_output_dimensions jCoreDimensions #endif -#define jpeg_calc_output_dimensions jCalcDimensions -#define jpeg_save_markers jSaveMarkers -#define jpeg_set_marker_processor jSetMarker -#define jpeg_read_coefficients jReadCoefs -#define jpeg_write_coefficients jWrtCoefs -#define jpeg_copy_critical_parameters 
jCopyCrit -#define jpeg_abort_compress jAbrtCompress -#define jpeg_abort_decompress jAbrtDecompress -#define jpeg_abort jAbort -#define jpeg_destroy jDestroy -#define jpeg_resync_to_restart jResyncRestart +#define jpeg_calc_output_dimensions jCalcDimensions +#define jpeg_save_markers jSaveMarkers +#define jpeg_set_marker_processor jSetMarker +#define jpeg_read_coefficients jReadCoefs +#define jpeg_write_coefficients jWrtCoefs +#define jpeg_copy_critical_parameters jCopyCrit +#define jpeg_abort_compress jAbrtCompress +#define jpeg_abort_decompress jAbrtDecompress +#define jpeg_abort jAbort +#define jpeg_destroy jDestroy +#define jpeg_resync_to_restart jResyncRestart #endif /* NEED_SHORT_EXTERNAL_NAMES */ /* Default error-management setup */ EXTERN(struct jpeg_error_mgr *) jpeg_std_error - JPP((struct jpeg_error_mgr * err)); + JPP((struct jpeg_error_mgr * err)); /* Initialization of JPEG compression objects. * jpeg_create_compress() and jpeg_create_decompress() are the exported @@ -983,14 +983,14 @@ EXTERN(struct jpeg_error_mgr *) jpeg_std_error */ #define jpeg_create_compress(cinfo) \ jpeg_CreateCompress((cinfo), JPEG_LIB_VERSION, \ - (size_t) sizeof(struct jpeg_compress_struct)) + (size_t) sizeof(struct jpeg_compress_struct)) #define jpeg_create_decompress(cinfo) \ jpeg_CreateDecompress((cinfo), JPEG_LIB_VERSION, \ - (size_t) sizeof(struct jpeg_decompress_struct)) + (size_t) sizeof(struct jpeg_decompress_struct)) EXTERN(void) jpeg_CreateCompress JPP((j_compress_ptr cinfo, - int version, size_t structsize)); + int version, size_t structsize)); EXTERN(void) jpeg_CreateDecompress JPP((j_decompress_ptr cinfo, - int version, size_t structsize)); + int version, size_t structsize)); /* Destruction of JPEG compression objects */ EXTERN(void) jpeg_destroy_compress JPP((j_compress_ptr cinfo)); EXTERN(void) jpeg_destroy_decompress JPP((j_decompress_ptr cinfo)); @@ -1003,45 +1003,45 @@ EXTERN(void) jpeg_stdio_src JPP((j_decompress_ptr cinfo, FILE * infile)); #if 
JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED) /* Data source and destination managers: memory buffers. */ EXTERN(void) jpeg_mem_dest JPP((j_compress_ptr cinfo, - unsigned char ** outbuffer, - unsigned long * outsize)); + unsigned char ** outbuffer, + unsigned long * outsize)); EXTERN(void) jpeg_mem_src JPP((j_decompress_ptr cinfo, - unsigned char * inbuffer, - unsigned long insize)); + unsigned char * inbuffer, + unsigned long insize)); #endif /* Default parameter setup for compression */ EXTERN(void) jpeg_set_defaults JPP((j_compress_ptr cinfo)); /* Compression parameter setup aids */ EXTERN(void) jpeg_set_colorspace JPP((j_compress_ptr cinfo, - J_COLOR_SPACE colorspace)); + J_COLOR_SPACE colorspace)); EXTERN(void) jpeg_default_colorspace JPP((j_compress_ptr cinfo)); EXTERN(void) jpeg_set_quality JPP((j_compress_ptr cinfo, int quality, - boolean force_baseline)); + boolean force_baseline)); EXTERN(void) jpeg_set_linear_quality JPP((j_compress_ptr cinfo, - int scale_factor, - boolean force_baseline)); + int scale_factor, + boolean force_baseline)); #if JPEG_LIB_VERSION >= 70 EXTERN(void) jpeg_default_qtables JPP((j_compress_ptr cinfo, - boolean force_baseline)); + boolean force_baseline)); #endif EXTERN(void) jpeg_add_quant_table JPP((j_compress_ptr cinfo, int which_tbl, - const unsigned int *basic_table, - int scale_factor, - boolean force_baseline)); + const unsigned int *basic_table, + int scale_factor, + boolean force_baseline)); EXTERN(int) jpeg_quality_scaling JPP((int quality)); EXTERN(void) jpeg_simple_progression JPP((j_compress_ptr cinfo)); EXTERN(void) jpeg_suppress_tables JPP((j_compress_ptr cinfo, - boolean suppress)); + boolean suppress)); EXTERN(JQUANT_TBL *) jpeg_alloc_quant_table JPP((j_common_ptr cinfo)); EXTERN(JHUFF_TBL *) jpeg_alloc_huff_table JPP((j_common_ptr cinfo)); /* Main entry points for compression */ EXTERN(void) jpeg_start_compress JPP((j_compress_ptr cinfo, - boolean write_all_tables)); + boolean write_all_tables)); 
EXTERN(JDIMENSION) jpeg_write_scanlines JPP((j_compress_ptr cinfo, - JSAMPARRAY scanlines, - JDIMENSION num_lines)); + JSAMPARRAY scanlines, + JDIMENSION num_lines)); EXTERN(void) jpeg_finish_compress JPP((j_compress_ptr cinfo)); #if JPEG_LIB_VERSION >= 70 @@ -1051,29 +1051,29 @@ EXTERN(void) jpeg_calc_jpeg_dimensions JPP((j_compress_ptr cinfo)); /* Replaces jpeg_write_scanlines when writing raw downsampled data. */ EXTERN(JDIMENSION) jpeg_write_raw_data JPP((j_compress_ptr cinfo, - JSAMPIMAGE data, - JDIMENSION num_lines)); + JSAMPIMAGE data, + JDIMENSION num_lines)); /* Write a special marker. See libjpeg.txt concerning safe usage. */ EXTERN(void) jpeg_write_marker - JPP((j_compress_ptr cinfo, int marker, - const JOCTET * dataptr, unsigned int datalen)); + JPP((j_compress_ptr cinfo, int marker, + const JOCTET * dataptr, unsigned int datalen)); /* Same, but piecemeal. */ EXTERN(void) jpeg_write_m_header - JPP((j_compress_ptr cinfo, int marker, unsigned int datalen)); + JPP((j_compress_ptr cinfo, int marker, unsigned int datalen)); EXTERN(void) jpeg_write_m_byte - JPP((j_compress_ptr cinfo, int val)); + JPP((j_compress_ptr cinfo, int val)); /* Alternate compression function: just write an abbreviated table file */ EXTERN(void) jpeg_write_tables JPP((j_compress_ptr cinfo)); /* Decompression startup: read start of JPEG datastream to see what's there */ EXTERN(int) jpeg_read_header JPP((j_decompress_ptr cinfo, - boolean require_image)); + boolean require_image)); /* Return value is one of: */ -#define JPEG_SUSPENDED 0 /* Suspended due to lack of input data */ -#define JPEG_HEADER_OK 1 /* Found valid image datastream */ -#define JPEG_HEADER_TABLES_ONLY 2 /* Found valid table-specs-only datastream */ +#define JPEG_SUSPENDED 0 /* Suspended due to lack of input data */ +#define JPEG_HEADER_OK 1 /* Found valid image datastream */ +#define JPEG_HEADER_TABLES_ONLY 2 /* Found valid table-specs-only datastream */ /* If you pass require_image = TRUE (normal case), you need not 
check for * a TABLES_ONLY return code; an abbreviated file will cause an error exit. * JPEG_SUSPENDED is only possible if you use a data source module that can @@ -1083,29 +1083,29 @@ EXTERN(int) jpeg_read_header JPP((j_decompress_ptr cinfo, /* Main entry points for decompression */ EXTERN(boolean) jpeg_start_decompress JPP((j_decompress_ptr cinfo)); EXTERN(JDIMENSION) jpeg_read_scanlines JPP((j_decompress_ptr cinfo, - JSAMPARRAY scanlines, - JDIMENSION max_lines)); + JSAMPARRAY scanlines, + JDIMENSION max_lines)); EXTERN(boolean) jpeg_finish_decompress JPP((j_decompress_ptr cinfo)); /* Replaces jpeg_read_scanlines when reading raw downsampled data. */ EXTERN(JDIMENSION) jpeg_read_raw_data JPP((j_decompress_ptr cinfo, - JSAMPIMAGE data, - JDIMENSION max_lines)); + JSAMPIMAGE data, + JDIMENSION max_lines)); /* Additional entry points for buffered-image mode. */ EXTERN(boolean) jpeg_has_multiple_scans JPP((j_decompress_ptr cinfo)); EXTERN(boolean) jpeg_start_output JPP((j_decompress_ptr cinfo, - int scan_number)); + int scan_number)); EXTERN(boolean) jpeg_finish_output JPP((j_decompress_ptr cinfo)); EXTERN(boolean) jpeg_input_complete JPP((j_decompress_ptr cinfo)); EXTERN(void) jpeg_new_colormap JPP((j_decompress_ptr cinfo)); EXTERN(int) jpeg_consume_input JPP((j_decompress_ptr cinfo)); /* Return value is one of: */ -/* #define JPEG_SUSPENDED 0 Suspended due to lack of input data */ -#define JPEG_REACHED_SOS 1 /* Reached start of new scan */ -#define JPEG_REACHED_EOI 2 /* Reached end of image */ -#define JPEG_ROW_COMPLETED 3 /* Completed one iMCU row */ -#define JPEG_SCAN_COMPLETED 4 /* Completed last iMCU row of a scan */ +/* #define JPEG_SUSPENDED 0 Suspended due to lack of input data */ +#define JPEG_REACHED_SOS 1 /* Reached start of new scan */ +#define JPEG_REACHED_EOI 2 /* Reached end of image */ +#define JPEG_ROW_COMPLETED 3 /* Completed one iMCU row */ +#define JPEG_SCAN_COMPLETED 4 /* Completed last iMCU row of a scan */ /* Precalculate output dimensions for 
current decompression parameters. */ #if JPEG_LIB_VERSION >= 80 @@ -1115,20 +1115,20 @@ EXTERN(void) jpeg_calc_output_dimensions JPP((j_decompress_ptr cinfo)); /* Control saving of COM and APPn markers into marker_list. */ EXTERN(void) jpeg_save_markers - JPP((j_decompress_ptr cinfo, int marker_code, - unsigned int length_limit)); + JPP((j_decompress_ptr cinfo, int marker_code, + unsigned int length_limit)); /* Install a special processing method for COM or APPn markers. */ EXTERN(void) jpeg_set_marker_processor - JPP((j_decompress_ptr cinfo, int marker_code, - jpeg_marker_parser_method routine)); + JPP((j_decompress_ptr cinfo, int marker_code, + jpeg_marker_parser_method routine)); /* Read or write raw DCT coefficients --- useful for lossless transcoding. */ EXTERN(jvirt_barray_ptr *) jpeg_read_coefficients JPP((j_decompress_ptr cinfo)); EXTERN(void) jpeg_write_coefficients JPP((j_compress_ptr cinfo, - jvirt_barray_ptr * coef_arrays)); + jvirt_barray_ptr * coef_arrays)); EXTERN(void) jpeg_copy_critical_parameters JPP((j_decompress_ptr srcinfo, - j_compress_ptr dstinfo)); + j_compress_ptr dstinfo)); /* If you choose to abort compression or decompression before completing * jpeg_finish_(de)compress, then you need to clean up to release memory, @@ -1147,17 +1147,17 @@ EXTERN(void) jpeg_destroy JPP((j_common_ptr cinfo)); /* Default restart-marker-resync procedure for use by data source modules */ EXTERN(boolean) jpeg_resync_to_restart JPP((j_decompress_ptr cinfo, - int desired)); + int desired)); /* These marker codes are exported since applications and data source modules * are likely to want to use them. 
*/ -#define JPEG_RST0 0xD0 /* RST0 marker code */ -#define JPEG_EOI 0xD9 /* EOI marker code */ -#define JPEG_APP0 0xE0 /* APP0 marker code */ -#define JPEG_COM 0xFE /* COM marker code */ +#define JPEG_RST0 0xD0 /* RST0 marker code */ +#define JPEG_EOI 0xD9 /* EOI marker code */ +#define JPEG_APP0 0xE0 /* APP0 marker code */ +#define JPEG_COM 0xFE /* COM marker code */ /* If we have a brain-damaged compiler that emits warnings (or worse, errors) @@ -1166,7 +1166,7 @@ EXTERN(boolean) jpeg_resync_to_restart JPP((j_decompress_ptr cinfo, */ #ifdef INCOMPLETE_TYPES_BROKEN -#ifndef JPEG_INTERNALS /* will be defined in jpegint.h */ +#ifndef JPEG_INTERNALS /* will be defined in jpegint.h */ struct jvirt_sarray_control { long dummy; }; struct jvirt_barray_control { long dummy; }; struct jpeg_comp_master { long dummy; }; @@ -1201,8 +1201,8 @@ struct jpeg_color_quantizer { long dummy; }; */ #ifdef JPEG_INTERNALS -#include "jpegint.h" /* fetch private declarations */ -#include "jerror.h" /* fetch error codes too */ +#include "jpegint.h" /* fetch private declarations */ +#include "jerror.h" /* fetch error codes too */ #endif #ifdef __cplusplus diff --git a/jpegtran.c b/jpegtran.c index 5bfab483e..c7906f4d2 100644 --- a/jpegtran.c +++ b/jpegtran.c @@ -13,18 +13,18 @@ * provides some lossless and sort-of-lossless transformations of JPEG data. */ -#include "cdjpeg.h" /* Common decls for cjpeg/djpeg applications */ -#include "transupp.h" /* Support routines for jpegtran */ -#include "jversion.h" /* for version message */ +#include "cdjpeg.h" /* Common decls for cjpeg/djpeg applications */ +#include "transupp.h" /* Support routines for jpegtran */ +#include "jversion.h" /* for version message */ #include "jconfigint.h" -#ifdef USE_CCOMMAND /* command-line reader for Macintosh */ +#ifdef USE_CCOMMAND /* command-line reader for Macintosh */ #ifdef __MWERKS__ #include /* Metrowerks needs this */ -#include /* ... and this */ +#include /* ... 
and this */ #endif #ifdef THINK_C -#include /* Think declares it here */ +#include /* Think declares it here */ #endif #endif @@ -38,9 +38,9 @@ */ -static const char * progname; /* program name for error messages */ -static char * outfilename; /* for -outfile switch */ -static JCOPY_OPTION copyoption; /* -copy switch */ +static const char * progname; /* program name for error messages */ +static char * outfilename; /* for -outfile switch */ +static JCOPY_OPTION copyoption; /* -copy switch */ static jpeg_transform_info transformoption; /* image transformation options */ @@ -106,12 +106,12 @@ select_transform (JXFORM_CODE transform) transformoption.transform = transform; } else { fprintf(stderr, "%s: can only do one image transformation at a time\n", - progname); + progname); usage(); } #else fprintf(stderr, "%s: sorry, image transformation was not compiled\n", - progname); + progname); exit(EXIT_FAILURE); #endif } @@ -119,7 +119,7 @@ select_transform (JXFORM_CODE transform) LOCAL(int) parse_switches (j_compress_ptr cinfo, int argc, char **argv, - int last_file_arg_seen, boolean for_real) + int last_file_arg_seen, boolean for_real) /* Parse optional switches. * Returns argv[] index of first file-name argument (== argc if none). * Any file names with indexes <= last_file_arg_seen are ignored; @@ -132,7 +132,7 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv, int argn; char * arg; boolean simple_progressive; - char * scansarg = NULL; /* saves -scans parm if any */ + char * scansarg = NULL; /* saves -scans parm if any */ /* Set up default JPEG parameters. 
*/ simple_progressive = FALSE; @@ -153,12 +153,12 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv, if (*arg != '-') { /* Not a switch, must be a file name argument */ if (argn <= last_file_arg_seen) { - outfilename = NULL; /* -outfile applies to just one input file */ - continue; /* ignore this name if previously processed */ + outfilename = NULL; /* -outfile applies to just one input file */ + continue; /* ignore this name if previously processed */ } - break; /* else done parsing switches */ + break; /* else done parsing switches */ } - arg++; /* advance past switch marker character */ + arg++; /* advance past switch marker character */ if (keymatch(arg, "arithmetic", 1)) { /* Use arithmetic coding. */ @@ -166,35 +166,35 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv, cinfo->arith_code = TRUE; #else fprintf(stderr, "%s: sorry, arithmetic coding not supported\n", - progname); + progname); exit(EXIT_FAILURE); #endif } else if (keymatch(arg, "copy", 2)) { /* Select which extra markers to copy. */ - if (++argn >= argc) /* advance to next argument */ - usage(); + if (++argn >= argc) /* advance to next argument */ + usage(); if (keymatch(argv[argn], "none", 1)) { - copyoption = JCOPYOPT_NONE; + copyoption = JCOPYOPT_NONE; } else if (keymatch(argv[argn], "comments", 1)) { - copyoption = JCOPYOPT_COMMENTS; + copyoption = JCOPYOPT_COMMENTS; } else if (keymatch(argv[argn], "all", 1)) { - copyoption = JCOPYOPT_ALL; + copyoption = JCOPYOPT_ALL; } else - usage(); + usage(); } else if (keymatch(arg, "crop", 2)) { /* Perform lossless cropping. */ #if TRANSFORMS_SUPPORTED - if (++argn >= argc) /* advance to next argument */ - usage(); + if (++argn >= argc) /* advance to next argument */ + usage(); if (! 
jtransform_parse_crop_spec(&transformoption, argv[argn])) { - fprintf(stderr, "%s: bogus -crop argument '%s'\n", - progname, argv[argn]); - exit(EXIT_FAILURE); + fprintf(stderr, "%s: bogus -crop argument '%s'\n", + progname, argv[argn]); + exit(EXIT_FAILURE); } #else - select_transform(JXFORM_NONE); /* force an error */ + select_transform(JXFORM_NONE); /* force an error */ #endif } else if (keymatch(arg, "debug", 1) || keymatch(arg, "verbose", 1)) { @@ -203,32 +203,32 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv, static boolean printed_version = FALSE; if (! printed_version) { - fprintf(stderr, "%s version %s (build %s)\n", - PACKAGE_NAME, VERSION, BUILD); - fprintf(stderr, "%s\n\n", JCOPYRIGHT); - fprintf(stderr, "Emulating The Independent JPEG Group's software, version %s\n\n", - JVERSION); - printed_version = TRUE; + fprintf(stderr, "%s version %s (build %s)\n", + PACKAGE_NAME, VERSION, BUILD); + fprintf(stderr, "%s\n\n", JCOPYRIGHT); + fprintf(stderr, "Emulating The Independent JPEG Group's software, version %s\n\n", + JVERSION); + printed_version = TRUE; } cinfo->err->trace_level++; } else if (keymatch(arg, "flip", 1)) { /* Mirror left-right or top-bottom. */ - if (++argn >= argc) /* advance to next argument */ - usage(); + if (++argn >= argc) /* advance to next argument */ + usage(); if (keymatch(argv[argn], "horizontal", 1)) - select_transform(JXFORM_FLIP_H); + select_transform(JXFORM_FLIP_H); else if (keymatch(argv[argn], "vertical", 1)) - select_transform(JXFORM_FLIP_V); + select_transform(JXFORM_FLIP_V); else - usage(); + usage(); } else if (keymatch(arg, "grayscale", 1) || keymatch(arg, "greyscale",1)) { /* Force to grayscale. 
*/ #if TRANSFORMS_SUPPORTED transformoption.force_grayscale = TRUE; #else - select_transform(JXFORM_NONE); /* force an error */ + select_transform(JXFORM_NONE); /* force an error */ #endif } else if (keymatch(arg, "maxmemory", 3)) { @@ -236,12 +236,12 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv, long lval; char ch = 'x'; - if (++argn >= argc) /* advance to next argument */ - usage(); + if (++argn >= argc) /* advance to next argument */ + usage(); if (sscanf(argv[argn], "%ld%c", &lval, &ch) < 1) - usage(); + usage(); if (ch == 'm' || ch == 'M') - lval *= 1000L; + lval *= 1000L; cinfo->mem->max_memory_to_use = lval * 1000L; } else if (keymatch(arg, "optimize", 1) || keymatch(arg, "optimise", 1)) { @@ -250,15 +250,15 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv, cinfo->optimize_coding = TRUE; #else fprintf(stderr, "%s: sorry, entropy optimization was not compiled\n", - progname); + progname); exit(EXIT_FAILURE); #endif } else if (keymatch(arg, "outfile", 4)) { /* Set output file name. */ - if (++argn >= argc) /* advance to next argument */ - usage(); - outfilename = argv[argn]; /* save it away for later use */ + if (++argn >= argc) /* advance to next argument */ + usage(); + outfilename = argv[argn]; /* save it away for later use */ } else if (keymatch(arg, "perfect", 2)) { /* Fail if there is any partial edge MCUs that the transform can't @@ -272,7 +272,7 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv, /* We must postpone execution until num_components is known. 
*/ #else fprintf(stderr, "%s: sorry, progressive output was not compiled\n", - progname); + progname); exit(EXIT_FAILURE); #endif @@ -281,43 +281,43 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv, long lval; char ch = 'x'; - if (++argn >= argc) /* advance to next argument */ - usage(); + if (++argn >= argc) /* advance to next argument */ + usage(); if (sscanf(argv[argn], "%ld%c", &lval, &ch) < 1) - usage(); + usage(); if (lval < 0 || lval > 65535L) - usage(); + usage(); if (ch == 'b' || ch == 'B') { - cinfo->restart_interval = (unsigned int) lval; - cinfo->restart_in_rows = 0; /* else prior '-restart n' overrides me */ + cinfo->restart_interval = (unsigned int) lval; + cinfo->restart_in_rows = 0; /* else prior '-restart n' overrides me */ } else { - cinfo->restart_in_rows = (int) lval; - /* restart_interval will be computed during startup */ + cinfo->restart_in_rows = (int) lval; + /* restart_interval will be computed during startup */ } } else if (keymatch(arg, "rotate", 2)) { /* Rotate 90, 180, or 270 degrees (measured clockwise). */ - if (++argn >= argc) /* advance to next argument */ - usage(); + if (++argn >= argc) /* advance to next argument */ + usage(); if (keymatch(argv[argn], "90", 2)) - select_transform(JXFORM_ROT_90); + select_transform(JXFORM_ROT_90); else if (keymatch(argv[argn], "180", 3)) - select_transform(JXFORM_ROT_180); + select_transform(JXFORM_ROT_180); else if (keymatch(argv[argn], "270", 3)) - select_transform(JXFORM_ROT_270); + select_transform(JXFORM_ROT_270); else - usage(); + usage(); } else if (keymatch(arg, "scans", 1)) { /* Set scan script. */ #ifdef C_MULTISCAN_FILES_SUPPORTED - if (++argn >= argc) /* advance to next argument */ - usage(); + if (++argn >= argc) /* advance to next argument */ + usage(); scansarg = argv[argn]; /* We must postpone reading the file in case -progressive appears. 
*/ #else fprintf(stderr, "%s: sorry, multi-scan output was not compiled\n", - progname); + progname); exit(EXIT_FAILURE); #endif @@ -334,7 +334,7 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv, transformoption.trim = TRUE; } else { - usage(); /* bogus switch */ + usage(); /* bogus switch */ } } @@ -343,18 +343,18 @@ parse_switches (j_compress_ptr cinfo, int argc, char **argv, if (for_real) { #ifdef C_PROGRESSIVE_SUPPORTED - if (simple_progressive) /* process -progressive; -scans can override */ + if (simple_progressive) /* process -progressive; -scans can override */ jpeg_simple_progression(cinfo); #endif #ifdef C_MULTISCAN_FILES_SUPPORTED - if (scansarg != NULL) /* process -scans if it was present */ + if (scansarg != NULL) /* process -scans if it was present */ if (! read_scan_script(cinfo, scansarg)) - usage(); + usage(); #endif } - return argn; /* return index of next arg (file name) */ + return argn; /* return index of next arg (file name) */ } @@ -386,7 +386,7 @@ main (int argc, char **argv) progname = argv[0]; if (progname == NULL || progname[0] == 0) - progname = "jpegtran"; /* in case C library doesn't provide it */ + progname = "jpegtran"; /* in case C library doesn't provide it */ /* Initialize the JPEG decompression object with default error handling. 
*/ srcinfo.err = jpeg_std_error(&jsrcerr); @@ -419,14 +419,14 @@ main (int argc, char **argv) if (outfilename == NULL) { if (file_index != argc-2) { fprintf(stderr, "%s: must name one input and one output file\n", - progname); + progname); usage(); } outfilename = argv[file_index+1]; } else { if (file_index != argc-1) { fprintf(stderr, "%s: must name one input and one output file\n", - progname); + progname); usage(); } } @@ -485,8 +485,8 @@ main (int argc, char **argv) */ #if TRANSFORMS_SUPPORTED dst_coef_arrays = jtransform_adjust_parameters(&srcinfo, &dstinfo, - src_coef_arrays, - &transformoption); + src_coef_arrays, + &transformoption); #else dst_coef_arrays = src_coef_arrays; #endif @@ -527,8 +527,8 @@ main (int argc, char **argv) /* Execute image transformation, if any */ #if TRANSFORMS_SUPPORTED jtransform_execute_transformation(&srcinfo, &dstinfo, - src_coef_arrays, - &transformoption); + src_coef_arrays, + &transformoption); #endif /* Finish compression and release memory */ @@ -547,5 +547,5 @@ main (int argc, char **argv) /* All done. */ exit(jsrcerr.num_warnings + jdsterr.num_warnings ?EXIT_WARNING:EXIT_SUCCESS); - return 0; /* suppress no-return-value warnings */ + return 0; /* suppress no-return-value warnings */ } diff --git a/jquant1.c b/jquant1.c index aa2c59a4b..dbcdd2776 100644 --- a/jquant1.c +++ b/jquant1.c @@ -70,9 +70,9 @@ * table in both directions. 
*/ -#define ODITHER_SIZE 16 /* dimension of dither matrix */ +#define ODITHER_SIZE 16 /* dimension of dither matrix */ /* NB: if ODITHER_SIZE is not a power of 2, ODITHER_MASK uses will break */ -#define ODITHER_CELLS (ODITHER_SIZE*ODITHER_SIZE) /* # cells in matrix */ +#define ODITHER_CELLS (ODITHER_SIZE*ODITHER_SIZE) /* # cells in matrix */ #define ODITHER_MASK (ODITHER_SIZE-1) /* mask for wrapping around counters */ typedef int ODITHER_MATRIX[ODITHER_SIZE][ODITHER_SIZE]; @@ -107,8 +107,8 @@ static const UINT8 base_dither_matrix[ODITHER_SIZE][ODITHER_SIZE] = { * Errors are accumulated into the array fserrors[], at a resolution of * 1/16th of a pixel count. The error at a given pixel is propagated * to its not-yet-processed neighbors using the standard F-S fractions, - * ... (here) 7/16 - * 3/16 5/16 1/16 + * ... (here) 7/16 + * 3/16 5/16 1/16 * We work left-to-right on even rows, right-to-left on odd rows. * * We can get away with a single array (holding one row's worth of errors) @@ -127,43 +127,43 @@ static const UINT8 base_dither_matrix[ODITHER_SIZE][ODITHER_SIZE] = { */ #if BITS_IN_JSAMPLE == 8 -typedef INT16 FSERROR; /* 16 bits should be enough */ -typedef int LOCFSERROR; /* use 'int' for calculation temps */ +typedef INT16 FSERROR; /* 16 bits should be enough */ +typedef int LOCFSERROR; /* use 'int' for calculation temps */ #else -typedef INT32 FSERROR; /* may need more than 16 bits */ -typedef INT32 LOCFSERROR; /* be sure calculation temps are big enough */ +typedef INT32 FSERROR; /* may need more than 16 bits */ +typedef INT32 LOCFSERROR; /* be sure calculation temps are big enough */ #endif -typedef FSERROR FAR *FSERRPTR; /* pointer to error array (in FAR storage!) */ +typedef FSERROR FAR *FSERRPTR; /* pointer to error array (in FAR storage!) 
*/ /* Private subobject */ -#define MAX_Q_COMPS 4 /* max components I can handle */ +#define MAX_Q_COMPS 4 /* max components I can handle */ typedef struct { struct jpeg_color_quantizer pub; /* public fields */ /* Initially allocated colormap is saved here */ - JSAMPARRAY sv_colormap; /* The color map as a 2-D pixel array */ - int sv_actual; /* number of entries in use */ + JSAMPARRAY sv_colormap; /* The color map as a 2-D pixel array */ + int sv_actual; /* number of entries in use */ - JSAMPARRAY colorindex; /* Precomputed mapping for speed */ + JSAMPARRAY colorindex; /* Precomputed mapping for speed */ /* colorindex[i][j] = index of color closest to pixel value j in component i, * premultiplied as described above. Since colormap indexes must fit into * JSAMPLEs, the entries of this array will too. */ - boolean is_padded; /* is the colorindex padded for odither? */ + boolean is_padded; /* is the colorindex padded for odither? */ - int Ncolors[MAX_Q_COMPS]; /* # of values alloced to each component */ + int Ncolors[MAX_Q_COMPS]; /* # of values alloced to each component */ /* Variables for ordered dithering */ - int row_index; /* cur row's vertical index in dither matrix */ + int row_index; /* cur row's vertical index in dither matrix */ ODITHER_MATRIX_PTR odither[MAX_Q_COMPS]; /* one dither array per component */ /* Variables for Floyd-Steinberg dithering */ FSERRPTR fserrors[MAX_Q_COMPS]; /* accumulated errors */ - boolean on_odd_row; /* flag to remember which row we are on */ + boolean on_odd_row; /* flag to remember which row we are on */ } my_cquantizer; typedef my_cquantizer * my_cquantize_ptr; @@ -205,11 +205,11 @@ select_ncolors (j_decompress_ptr cinfo, int Ncolors[]) iroot = 1; do { iroot++; - temp = iroot; /* set temp = iroot ** nc */ + temp = iroot; /* set temp = iroot ** nc */ for (i = 1; i < nc; i++) temp *= iroot; } while (temp <= (long) max_colors); /* repeat till iroot exceeds root */ - iroot--; /* now iroot = floor(root) */ + iroot--; /* now iroot = 
floor(root) */ /* Must have at least 2 color values per component */ if (iroot < 2) @@ -233,10 +233,10 @@ select_ncolors (j_decompress_ptr cinfo, int Ncolors[]) j = (cinfo->out_color_space == JCS_RGB ? RGB_order[i] : i); /* calculate new total_colors if Ncolors[j] is incremented */ temp = total_colors / Ncolors[j]; - temp *= Ncolors[j]+1; /* done in long arith to avoid oflo */ + temp *= Ncolors[j]+1; /* done in long arith to avoid oflo */ if (temp > (long) max_colors) - break; /* won't fit, done with this pass */ - Ncolors[j]++; /* OK, apply the increment */ + break; /* won't fit, done with this pass */ + Ncolors[j]++; /* OK, apply the increment */ total_colors = (int) temp; changed = TRUE; } @@ -278,8 +278,8 @@ LOCAL(void) create_colormap (j_decompress_ptr cinfo) { my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize; - JSAMPARRAY colormap; /* Created colormap */ - int total_colors; /* Number of distinct output colors */ + JSAMPARRAY colormap; /* Created colormap */ + int total_colors; /* Number of distinct output colors */ int i,j,k, nci, blksize, blkdist, ptr, val; /* Select number of colors for each component */ @@ -288,8 +288,8 @@ create_colormap (j_decompress_ptr cinfo) /* Report selected color counts */ if (cinfo->out_color_components == 3) TRACEMS4(cinfo, 1, JTRC_QUANT_3_NCOLORS, - total_colors, cquantize->Ncolors[0], - cquantize->Ncolors[1], cquantize->Ncolors[2]); + total_colors, cquantize->Ncolors[0], + cquantize->Ncolors[1], cquantize->Ncolors[2]); else TRACEMS1(cinfo, 1, JTRC_QUANT_NCOLORS, total_colors); @@ -314,12 +314,12 @@ create_colormap (j_decompress_ptr cinfo) val = output_value(cinfo, i, j, nci-1); /* Fill in all colormap entries that have this value of this component */ for (ptr = j * blksize; ptr < total_colors; ptr += blkdist) { - /* fill in blksize entries beginning at ptr */ - for (k = 0; k < blksize; k++) - colormap[i][ptr+k] = (JSAMPLE) val; + /* fill in blksize entries beginning at ptr */ + for (k = 0; k < blksize; k++) + 
colormap[i][ptr+k] = (JSAMPLE) val; } } - blkdist = blksize; /* blksize of this color is blkdist of next */ + blkdist = blksize; /* blksize of this color is blkdist of next */ } /* Save the colormap in private storage, @@ -377,16 +377,16 @@ create_colorindex (j_decompress_ptr cinfo) val = 0; k = largest_input_value(cinfo, i, 0, nci-1); for (j = 0; j <= MAXJSAMPLE; j++) { - while (j > k) /* advance val if past boundary */ - k = largest_input_value(cinfo, i, ++val, nci-1); + while (j > k) /* advance val if past boundary */ + k = largest_input_value(cinfo, i, ++val, nci-1); /* premultiply so that no multiplication needed in main processing */ indexptr[j] = (JSAMPLE) (val * blksize); } /* Pad at both ends if necessary */ if (pad) for (j = 1; j <= MAXJSAMPLE; j++) { - indexptr[-j] = indexptr[0]; - indexptr[MAXJSAMPLE+j] = indexptr[MAXJSAMPLE]; + indexptr[-j] = indexptr[0]; + indexptr[MAXJSAMPLE+j] = indexptr[MAXJSAMPLE]; } } } @@ -406,7 +406,7 @@ make_odither_array (j_decompress_ptr cinfo, int ncolors) odither = (ODITHER_MATRIX_PTR) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(ODITHER_MATRIX)); + SIZEOF(ODITHER_MATRIX)); /* The inter-value distance for this color is MAXJSAMPLE/(ncolors-1). * Hence the dither value for the matrix cell with fill order f * (f=0..N-1) should be (N-1-2*f)/(2*N) * MAXJSAMPLE/(ncolors-1). @@ -416,7 +416,7 @@ make_odither_array (j_decompress_ptr cinfo, int ncolors) for (j = 0; j < ODITHER_SIZE; j++) { for (k = 0; k < ODITHER_SIZE; k++) { num = ((INT32) (ODITHER_CELLS-1 - 2*((int)base_dither_matrix[j][k]))) - * MAXJSAMPLE; + * MAXJSAMPLE; /* Ensure round towards zero despite C's lack of consistency * about rounding negative values in integer division... */ @@ -429,7 +429,7 @@ make_odither_array (j_decompress_ptr cinfo, int ncolors) /* * Create the ordered-dither tables. 
- * Components having the same number of representative colors may + * Components having the same number of representative colors may * share a dither table. */ @@ -442,14 +442,14 @@ create_odither_tables (j_decompress_ptr cinfo) for (i = 0; i < cinfo->out_color_components; i++) { nci = cquantize->Ncolors[i]; /* # of distinct values for this color */ - odither = NULL; /* search for matching prior component */ + odither = NULL; /* search for matching prior component */ for (j = 0; j < i; j++) { if (nci == cquantize->Ncolors[j]) { - odither = cquantize->odither[j]; - break; + odither = cquantize->odither[j]; + break; } } - if (odither == NULL) /* need a new table? */ + if (odither == NULL) /* need a new table? */ odither = make_odither_array(cinfo, nci); cquantize->odither[i] = odither; } @@ -462,7 +462,7 @@ create_odither_tables (j_decompress_ptr cinfo) METHODDEF(void) color_quantize (j_decompress_ptr cinfo, JSAMPARRAY input_buf, - JSAMPARRAY output_buf, int num_rows) + JSAMPARRAY output_buf, int num_rows) /* General case, no dithering */ { my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize; @@ -480,7 +480,7 @@ color_quantize (j_decompress_ptr cinfo, JSAMPARRAY input_buf, for (col = width; col > 0; col--) { pixcode = 0; for (ci = 0; ci < nc; ci++) { - pixcode += GETJSAMPLE(colorindex[ci][GETJSAMPLE(*ptrin++)]); + pixcode += GETJSAMPLE(colorindex[ci][GETJSAMPLE(*ptrin++)]); } *ptrout++ = (JSAMPLE) pixcode; } @@ -490,7 +490,7 @@ color_quantize (j_decompress_ptr cinfo, JSAMPARRAY input_buf, METHODDEF(void) color_quantize3 (j_decompress_ptr cinfo, JSAMPARRAY input_buf, - JSAMPARRAY output_buf, int num_rows) + JSAMPARRAY output_buf, int num_rows) /* Fast path for out_color_components==3, no dithering */ { my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize; @@ -518,15 +518,15 @@ color_quantize3 (j_decompress_ptr cinfo, JSAMPARRAY input_buf, METHODDEF(void) quantize_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf, - JSAMPARRAY 
output_buf, int num_rows) + JSAMPARRAY output_buf, int num_rows) /* General case, with ordered dithering */ { my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize; register JSAMPROW input_ptr; register JSAMPROW output_ptr; JSAMPROW colorindex_ci; - int * dither; /* points to active row of dither matrix */ - int row_index, col_index; /* current indexes into dither matrix */ + int * dither; /* points to active row of dither matrix */ + int row_index, col_index; /* current indexes into dither matrix */ int nc = cinfo->out_color_components; int ci; int row; @@ -536,7 +536,7 @@ quantize_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf, for (row = 0; row < num_rows; row++) { /* Initialize output values to 0 so can process components separately */ jzero_far((void FAR *) output_buf[row], - (size_t) (width * SIZEOF(JSAMPLE))); + (size_t) (width * SIZEOF(JSAMPLE))); row_index = cquantize->row_index; for (ci = 0; ci < nc; ci++) { input_ptr = input_buf[row] + ci; @@ -546,17 +546,17 @@ quantize_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf, col_index = 0; for (col = width; col > 0; col--) { - /* Form pixel value + dither, range-limit to 0..MAXJSAMPLE, - * select output value, accumulate into output code for this pixel. - * Range-limiting need not be done explicitly, as we have extended - * the colorindex table to produce the right answers for out-of-range - * inputs. The maximum dither is +- MAXJSAMPLE; this sets the - * required amount of padding. - */ - *output_ptr += colorindex_ci[GETJSAMPLE(*input_ptr)+dither[col_index]]; - input_ptr += nc; - output_ptr++; - col_index = (col_index + 1) & ODITHER_MASK; + /* Form pixel value + dither, range-limit to 0..MAXJSAMPLE, + * select output value, accumulate into output code for this pixel. + * Range-limiting need not be done explicitly, as we have extended + * the colorindex table to produce the right answers for out-of-range + * inputs. 
The maximum dither is +- MAXJSAMPLE; this sets the + * required amount of padding. + */ + *output_ptr += colorindex_ci[GETJSAMPLE(*input_ptr)+dither[col_index]]; + input_ptr += nc; + output_ptr++; + col_index = (col_index + 1) & ODITHER_MASK; } } /* Advance row index for next row */ @@ -568,7 +568,7 @@ quantize_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf, METHODDEF(void) quantize3_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf, - JSAMPARRAY output_buf, int num_rows) + JSAMPARRAY output_buf, int num_rows) /* Fast path for out_color_components==3, with ordered dithering */ { my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize; @@ -578,10 +578,10 @@ quantize3_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf, JSAMPROW colorindex0 = cquantize->colorindex[0]; JSAMPROW colorindex1 = cquantize->colorindex[1]; JSAMPROW colorindex2 = cquantize->colorindex[2]; - int * dither0; /* points to active row of dither matrix */ + int * dither0; /* points to active row of dither matrix */ int * dither1; int * dither2; - int row_index, col_index; /* current indexes into dither matrix */ + int row_index, col_index; /* current indexes into dither matrix */ int row; JDIMENSION col; JDIMENSION width = cinfo->output_width; @@ -597,11 +597,11 @@ quantize3_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf, for (col = width; col > 0; col--) { pixcode = GETJSAMPLE(colorindex0[GETJSAMPLE(*input_ptr++) + - dither0[col_index]]); + dither0[col_index]]); pixcode += GETJSAMPLE(colorindex1[GETJSAMPLE(*input_ptr++) + - dither1[col_index]]); + dither1[col_index]]); pixcode += GETJSAMPLE(colorindex2[GETJSAMPLE(*input_ptr++) + - dither2[col_index]]); + dither2[col_index]]); *output_ptr++ = (JSAMPLE) pixcode; col_index = (col_index + 1) & ODITHER_MASK; } @@ -613,24 +613,24 @@ quantize3_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf, METHODDEF(void) quantize_fs_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf, - JSAMPARRAY output_buf, int 
num_rows) + JSAMPARRAY output_buf, int num_rows) /* General case, with Floyd-Steinberg dithering */ { my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize; - register LOCFSERROR cur; /* current error or pixel value */ - LOCFSERROR belowerr; /* error for pixel below cur */ - LOCFSERROR bpreverr; /* error for below/prev col */ - LOCFSERROR bnexterr; /* error for below/next col */ + register LOCFSERROR cur; /* current error or pixel value */ + LOCFSERROR belowerr; /* error for pixel below cur */ + LOCFSERROR bpreverr; /* error for below/prev col */ + LOCFSERROR bnexterr; /* error for below/next col */ LOCFSERROR delta; - register FSERRPTR errorptr; /* => fserrors[] at column before current */ + register FSERRPTR errorptr; /* => fserrors[] at column before current */ register JSAMPROW input_ptr; register JSAMPROW output_ptr; JSAMPROW colorindex_ci; JSAMPROW colormap_ci; int pixcode; int nc = cinfo->out_color_components; - int dir; /* 1 for left-to-right, -1 for right-to-left */ - int dirnc; /* dir * nc */ + int dir; /* 1 for left-to-right, -1 for right-to-left */ + int dirnc; /* dir * nc */ int ci; int row; JDIMENSION col; @@ -641,22 +641,22 @@ quantize_fs_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf, for (row = 0; row < num_rows; row++) { /* Initialize output values to 0 so can process components separately */ jzero_far((void FAR *) output_buf[row], - (size_t) (width * SIZEOF(JSAMPLE))); + (size_t) (width * SIZEOF(JSAMPLE))); for (ci = 0; ci < nc; ci++) { input_ptr = input_buf[row] + ci; output_ptr = output_buf[row]; if (cquantize->on_odd_row) { - /* work right to left in this row */ - input_ptr += (width-1) * nc; /* so point to rightmost pixel */ - output_ptr += width-1; - dir = -1; - dirnc = -nc; - errorptr = cquantize->fserrors[ci] + (width+1); /* => entry after last column */ + /* work right to left in this row */ + input_ptr += (width-1) * nc; /* so point to rightmost pixel */ + output_ptr += width-1; + dir = -1; + dirnc = -nc; + errorptr = 
cquantize->fserrors[ci] + (width+1); /* => entry after last column */ } else { - /* work left to right in this row */ - dir = 1; - dirnc = nc; - errorptr = cquantize->fserrors[ci]; /* => entry before first column */ + /* work left to right in this row */ + dir = 1; + dirnc = nc; + errorptr = cquantize->fserrors[ci]; /* => entry before first column */ } colorindex_ci = cquantize->colorindex[ci]; colormap_ci = cquantize->sv_colormap[ci]; @@ -666,47 +666,47 @@ quantize_fs_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf, belowerr = bpreverr = 0; for (col = width; col > 0; col--) { - /* cur holds the error propagated from the previous pixel on the - * current line. Add the error propagated from the previous line - * to form the complete error correction term for this pixel, and - * round the error term (which is expressed * 16) to an integer. - * RIGHT_SHIFT rounds towards minus infinity, so adding 8 is correct - * for either sign of the error value. - * Note: errorptr points to *previous* column's array entry. - */ - cur = RIGHT_SHIFT(cur + errorptr[dir] + 8, 4); - /* Form pixel value + error, and range-limit to 0..MAXJSAMPLE. - * The maximum error is +- MAXJSAMPLE; this sets the required size - * of the range_limit array. - */ - cur += GETJSAMPLE(*input_ptr); - cur = GETJSAMPLE(range_limit[cur]); - /* Select output value, accumulate into output code for this pixel */ - pixcode = GETJSAMPLE(colorindex_ci[cur]); - *output_ptr += (JSAMPLE) pixcode; - /* Compute actual representation error at this pixel */ - /* Note: we can do this even though we don't have the final */ - /* pixel code, because the colormap is orthogonal. */ - cur -= GETJSAMPLE(colormap_ci[pixcode]); - /* Compute error fractions to be propagated to adjacent pixels. - * Add these into the running sums, and simultaneously shift the - * next-line error sums left by 1 column. 
- */ - bnexterr = cur; - delta = cur * 2; - cur += delta; /* form error * 3 */ - errorptr[0] = (FSERROR) (bpreverr + cur); - cur += delta; /* form error * 5 */ - bpreverr = belowerr + cur; - belowerr = bnexterr; - cur += delta; /* form error * 7 */ - /* At this point cur contains the 7/16 error value to be propagated - * to the next pixel on the current line, and all the errors for the - * next line have been shifted over. We are therefore ready to move on. - */ - input_ptr += dirnc; /* advance input ptr to next column */ - output_ptr += dir; /* advance output ptr to next column */ - errorptr += dir; /* advance errorptr to current column */ + /* cur holds the error propagated from the previous pixel on the + * current line. Add the error propagated from the previous line + * to form the complete error correction term for this pixel, and + * round the error term (which is expressed * 16) to an integer. + * RIGHT_SHIFT rounds towards minus infinity, so adding 8 is correct + * for either sign of the error value. + * Note: errorptr points to *previous* column's array entry. + */ + cur = RIGHT_SHIFT(cur + errorptr[dir] + 8, 4); + /* Form pixel value + error, and range-limit to 0..MAXJSAMPLE. + * The maximum error is +- MAXJSAMPLE; this sets the required size + * of the range_limit array. + */ + cur += GETJSAMPLE(*input_ptr); + cur = GETJSAMPLE(range_limit[cur]); + /* Select output value, accumulate into output code for this pixel */ + pixcode = GETJSAMPLE(colorindex_ci[cur]); + *output_ptr += (JSAMPLE) pixcode; + /* Compute actual representation error at this pixel */ + /* Note: we can do this even though we don't have the final */ + /* pixel code, because the colormap is orthogonal. */ + cur -= GETJSAMPLE(colormap_ci[pixcode]); + /* Compute error fractions to be propagated to adjacent pixels. + * Add these into the running sums, and simultaneously shift the + * next-line error sums left by 1 column. 
+ */ + bnexterr = cur; + delta = cur * 2; + cur += delta; /* form error * 3 */ + errorptr[0] = (FSERROR) (bpreverr + cur); + cur += delta; /* form error * 5 */ + bpreverr = belowerr + cur; + belowerr = bnexterr; + cur += delta; /* form error * 7 */ + /* At this point cur contains the 7/16 error value to be propagated + * to the next pixel on the current line, and all the errors for the + * next line have been shifted over. We are therefore ready to move on. + */ + input_ptr += dirnc; /* advance input ptr to next column */ + output_ptr += dir; /* advance output ptr to next column */ + errorptr += dir; /* advance errorptr to current column */ } /* Post-loop cleanup: we must unload the final error value into the * final fserrors[] entry. Note we need not unload belowerr because @@ -766,7 +766,7 @@ start_pass_1_quant (j_decompress_ptr cinfo, boolean is_pre_scan) cquantize->pub.color_quantize = quantize3_ord_dither; else cquantize->pub.color_quantize = quantize_ord_dither; - cquantize->row_index = 0; /* initialize state for ordered dither */ + cquantize->row_index = 0; /* initialize state for ordered dither */ /* If user changed to ordered dither from another mode, * we must recreate the color index table with padding. * This will cost extra space, but probably isn't very likely. 
@@ -829,13 +829,13 @@ jinit_1pass_quantizer (j_decompress_ptr cinfo) cquantize = (my_cquantize_ptr) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(my_cquantizer)); + SIZEOF(my_cquantizer)); cinfo->cquantize = (struct jpeg_color_quantizer *) cquantize; cquantize->pub.start_pass = start_pass_1_quant; cquantize->pub.finish_pass = finish_pass_1_quant; cquantize->pub.new_color_map = new_color_map_1_quant; cquantize->fserrors[0] = NULL; /* Flag FS workspace not allocated */ - cquantize->odither[0] = NULL; /* Also flag odither arrays not allocated */ + cquantize->odither[0] = NULL; /* Also flag odither arrays not allocated */ /* Make sure my internal arrays won't overflow */ if (cinfo->out_color_components > MAX_Q_COMPS) diff --git a/jquant2.c b/jquant2.c index 47a1f90aa..0fdb1975d 100644 --- a/jquant2.c +++ b/jquant2.c @@ -72,9 +72,9 @@ * probably need to change these scale factors. */ -#define R_SCALE 2 /* scale R distances by this much */ -#define G_SCALE 3 /* scale G distances by this much */ -#define B_SCALE 1 /* and B by this much */ +#define R_SCALE 2 /* scale R distances by this much */ +#define G_SCALE 3 /* scale G distances by this much */ +#define B_SCALE 1 /* and B by this much */ static const int c_scales[3]={R_SCALE, G_SCALE, B_SCALE}; #define C0_SCALE c_scales[rgb_red[cinfo->out_color_space]] @@ -112,9 +112,9 @@ static const int c_scales[3]={R_SCALE, G_SCALE, B_SCALE}; /* These will do the right thing for either R,G,B or B,G,R color order, * but you may not like the results for other color orders. */ -#define HIST_C0_BITS 5 /* bits of precision in R/B histogram */ -#define HIST_C1_BITS 6 /* bits of precision in G histogram */ -#define HIST_C2_BITS 5 /* bits of precision in B/R histogram */ +#define HIST_C0_BITS 5 /* bits of precision in R/B histogram */ +#define HIST_C1_BITS 6 /* bits of precision in G histogram */ +#define HIST_C2_BITS 5 /* bits of precision in B/R histogram */ /* Number of elements along histogram axes. 
*/ #define HIST_C0_ELEMS (1<cquantize; register JSAMPROW ptr; @@ -220,11 +220,11 @@ prescan_quantize (j_decompress_ptr cinfo, JSAMPARRAY input_buf, for (col = width; col > 0; col--) { /* get pixel value and index into the histogram */ histp = & histogram[GETJSAMPLE(ptr[0]) >> C0_SHIFT] - [GETJSAMPLE(ptr[1]) >> C1_SHIFT] - [GETJSAMPLE(ptr[2]) >> C2_SHIFT]; + [GETJSAMPLE(ptr[1]) >> C1_SHIFT] + [GETJSAMPLE(ptr[2]) >> C2_SHIFT]; /* increment, check for overflow and undo increment if so. */ if (++(*histp) <= 0) - (*histp)--; + (*histp)--; ptr += 3; } } @@ -312,67 +312,67 @@ update_box (j_decompress_ptr cinfo, boxptr boxp) if (c0max > c0min) for (c0 = c0min; c0 <= c0max; c0++) for (c1 = c1min; c1 <= c1max; c1++) { - histp = & histogram[c0][c1][c2min]; - for (c2 = c2min; c2 <= c2max; c2++) - if (*histp++ != 0) { - boxp->c0min = c0min = c0; - goto have_c0min; - } + histp = & histogram[c0][c1][c2min]; + for (c2 = c2min; c2 <= c2max; c2++) + if (*histp++ != 0) { + boxp->c0min = c0min = c0; + goto have_c0min; + } } have_c0min: if (c0max > c0min) for (c0 = c0max; c0 >= c0min; c0--) for (c1 = c1min; c1 <= c1max; c1++) { - histp = & histogram[c0][c1][c2min]; - for (c2 = c2min; c2 <= c2max; c2++) - if (*histp++ != 0) { - boxp->c0max = c0max = c0; - goto have_c0max; - } + histp = & histogram[c0][c1][c2min]; + for (c2 = c2min; c2 <= c2max; c2++) + if (*histp++ != 0) { + boxp->c0max = c0max = c0; + goto have_c0max; + } } have_c0max: if (c1max > c1min) for (c1 = c1min; c1 <= c1max; c1++) for (c0 = c0min; c0 <= c0max; c0++) { - histp = & histogram[c0][c1][c2min]; - for (c2 = c2min; c2 <= c2max; c2++) - if (*histp++ != 0) { - boxp->c1min = c1min = c1; - goto have_c1min; - } + histp = & histogram[c0][c1][c2min]; + for (c2 = c2min; c2 <= c2max; c2++) + if (*histp++ != 0) { + boxp->c1min = c1min = c1; + goto have_c1min; + } } have_c1min: if (c1max > c1min) for (c1 = c1max; c1 >= c1min; c1--) for (c0 = c0min; c0 <= c0max; c0++) { - histp = & histogram[c0][c1][c2min]; - for (c2 = c2min; c2 
<= c2max; c2++) - if (*histp++ != 0) { - boxp->c1max = c1max = c1; - goto have_c1max; - } + histp = & histogram[c0][c1][c2min]; + for (c2 = c2min; c2 <= c2max; c2++) + if (*histp++ != 0) { + boxp->c1max = c1max = c1; + goto have_c1max; + } } have_c1max: if (c2max > c2min) for (c2 = c2min; c2 <= c2max; c2++) for (c0 = c0min; c0 <= c0max; c0++) { - histp = & histogram[c0][c1min][c2]; - for (c1 = c1min; c1 <= c1max; c1++, histp += HIST_C2_ELEMS) - if (*histp != 0) { - boxp->c2min = c2min = c2; - goto have_c2min; - } + histp = & histogram[c0][c1min][c2]; + for (c1 = c1min; c1 <= c1max; c1++, histp += HIST_C2_ELEMS) + if (*histp != 0) { + boxp->c2min = c2min = c2; + goto have_c2min; + } } have_c2min: if (c2max > c2min) for (c2 = c2max; c2 >= c2min; c2--) for (c0 = c0min; c0 <= c0max; c0++) { - histp = & histogram[c0][c1min][c2]; - for (c1 = c1min; c1 <= c1max; c1++, histp += HIST_C2_ELEMS) - if (*histp != 0) { - boxp->c2max = c2max = c2; - goto have_c2max; - } + histp = & histogram[c0][c1min][c2]; + for (c1 = c1min; c1 <= c1max; c1++, histp += HIST_C2_ELEMS) + if (*histp != 0) { + boxp->c2max = c2max = c2; + goto have_c2max; + } } have_c2max: @@ -395,9 +395,9 @@ update_box (j_decompress_ptr cinfo, boxptr boxp) for (c1 = c1min; c1 <= c1max; c1++) { histp = & histogram[c0][c1][c2min]; for (c2 = c2min; c2 <= c2max; c2++, histp++) - if (*histp != 0) { - ccount++; - } + if (*histp != 0) { + ccount++; + } } boxp->colorcount = ccount; } @@ -405,7 +405,7 @@ update_box (j_decompress_ptr cinfo, boxptr boxp) LOCAL(int) median_cut (j_decompress_ptr cinfo, boxptr boxlist, int numboxes, - int desired_colors) + int desired_colors) /* Repeatedly select and split the largest box until we have enough boxes */ { int n,lb; @@ -421,9 +421,9 @@ median_cut (j_decompress_ptr cinfo, boxptr boxlist, int numboxes, } else { b1 = find_biggest_volume(boxlist, numboxes); } - if (b1 == NULL) /* no splittable boxes left! */ + if (b1 == NULL) /* no splittable boxes left! 
*/ break; - b2 = &boxlist[numboxes]; /* where new box will go */ + b2 = &boxlist[numboxes]; /* where new box will go */ /* Copy the color bounds to the new box. */ b2->c0max = b1->c0max; b2->c1max = b1->c1max; b2->c2max = b1->c2max; b2->c0min = b1->c0min; b2->c1min = b1->c1min; b2->c2min = b1->c2min; @@ -504,12 +504,12 @@ compute_color (j_decompress_ptr cinfo, boxptr boxp, int icolor) for (c1 = c1min; c1 <= c1max; c1++) { histp = & histogram[c0][c1][c2min]; for (c2 = c2min; c2 <= c2max; c2++) { - if ((count = *histp++) != 0) { - total += count; - c0total += ((c0 << C0_SHIFT) + ((1<<C0_SHIFT)>>1)) * count; - c1total += ((c1 << C1_SHIFT) + ((1<<C1_SHIFT)>>1)) * count; - c2total += ((c2 << C2_SHIFT) + ((1<<C2_SHIFT)>>1)) * count; - } + if ((count = *histp++) != 0) { + total += count; + c0total += ((c0 << C0_SHIFT) + ((1<<C0_SHIFT)>>1)) * count; + c1total += ((c1 << C1_SHIFT) + ((1<<C1_SHIFT)>>1)) * count; + c2total += ((c2 << C2_SHIFT) + ((1<<C2_SHIFT)>>1)) * count; + } } } @@ -628,7 +628,7 @@ select_colors (j_decompress_ptr cinfo, int desired_colors) LOCAL(int) find_nearby_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2, - JSAMPLE colorlist[]) + JSAMPLE colorlist[]) /* Locate the colormap entries close enough to an update box to be candidates * for the nearest entry to some cell(s) in the update box. The update box * is specified by the center coordinates of its first cell. The number of @@ -643,7 +643,7 @@ find_nearby_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2, int centerc0, centerc1, centerc2; int i, x, ncolors; INT32 minmaxdist, min_dist, max_dist, tdist; - INT32 mindist[MAXNUMCOLORS]; /* min distance to colormap entry i */ + INT32 mindist[MAXNUMCOLORS]; /* min distance to colormap entry i */ /* Compute true coordinates of update box's upper corner and center.
* Actually we compute the coordinates of the center of the upper-corner @@ -685,11 +685,11 @@ find_nearby_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2, /* within cell range so no contribution to min_dist */ min_dist = 0; if (x <= centerc0) { - tdist = (x - maxc0) * C0_SCALE; - max_dist = tdist*tdist; + tdist = (x - maxc0) * C0_SCALE; + max_dist = tdist*tdist; } else { - tdist = (x - minc0) * C0_SCALE; - max_dist = tdist*tdist; + tdist = (x - minc0) * C0_SCALE; + max_dist = tdist*tdist; } } @@ -707,11 +707,11 @@ find_nearby_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2, } else { /* within cell range so no contribution to min_dist */ if (x <= centerc1) { - tdist = (x - maxc1) * C1_SCALE; - max_dist += tdist*tdist; + tdist = (x - maxc1) * C1_SCALE; + max_dist += tdist*tdist; } else { - tdist = (x - minc1) * C1_SCALE; - max_dist += tdist*tdist; + tdist = (x - minc1) * C1_SCALE; + max_dist += tdist*tdist; } } @@ -729,15 +729,15 @@ find_nearby_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2, } else { /* within cell range so no contribution to min_dist */ if (x <= centerc2) { - tdist = (x - maxc2) * C2_SCALE; - max_dist += tdist*tdist; + tdist = (x - maxc2) * C2_SCALE; + max_dist += tdist*tdist; } else { - tdist = (x - minc2) * C2_SCALE; - max_dist += tdist*tdist; + tdist = (x - minc2) * C2_SCALE; + max_dist += tdist*tdist; } } - mindist[i] = min_dist; /* save away the results */ + mindist[i] = min_dist; /* save away the results */ if (max_dist < minmaxdist) minmaxdist = max_dist; } @@ -757,7 +757,7 @@ find_nearby_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2, LOCAL(void) find_best_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2, - int numcolors, JSAMPLE colorlist[], JSAMPLE bestcolor[]) + int numcolors, JSAMPLE colorlist[], JSAMPLE bestcolor[]) /* Find the closest colormap entry for each cell in the update box, * given the list of candidate colors prepared by find_nearby_colors. 
* Return the indexes of the closest entries in the bestcolor[] array. @@ -767,13 +767,13 @@ find_best_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2, { int ic0, ic1, ic2; int i, icolor; - register INT32 * bptr; /* pointer into bestdist[] array */ - JSAMPLE * cptr; /* pointer into bestcolor[] array */ - INT32 dist0, dist1; /* initial distance values */ - register INT32 dist2; /* current distance in inner loop */ - INT32 xx0, xx1; /* distance increments */ + register INT32 * bptr; /* pointer into bestdist[] array */ + JSAMPLE * cptr; /* pointer into bestcolor[] array */ + INT32 dist0, dist1; /* initial distance values */ + register INT32 dist2; /* current distance in inner loop */ + INT32 xx0, xx1; /* distance increments */ register INT32 xx2; - INT32 inc0, inc1, inc2; /* initial values for increments */ + INT32 inc0, inc1, inc2; /* initial values for increments */ /* This array holds the distance to the nearest-so-far color for each cell */ INT32 bestdist[BOX_C0_ELEMS * BOX_C1_ELEMS * BOX_C2_ELEMS]; @@ -813,20 +813,20 @@ find_best_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2, dist1 = dist0; xx1 = inc1; for (ic1 = BOX_C1_ELEMS-1; ic1 >= 0; ic1--) { - dist2 = dist1; - xx2 = inc2; - for (ic2 = BOX_C2_ELEMS-1; ic2 >= 0; ic2--) { - if (dist2 < *bptr) { - *bptr = dist2; - *cptr = (JSAMPLE) icolor; - } - dist2 += xx2; - xx2 += 2 * STEP_C2 * STEP_C2; - bptr++; - cptr++; - } - dist1 += xx1; - xx1 += 2 * STEP_C1 * STEP_C1; + dist2 = dist1; + xx2 = inc2; + for (ic2 = BOX_C2_ELEMS-1; ic2 >= 0; ic2--) { + if (dist2 < *bptr) { + *bptr = dist2; + *cptr = (JSAMPLE) icolor; + } + dist2 += xx2; + xx2 += 2 * STEP_C2 * STEP_C2; + bptr++; + cptr++; + } + dist1 += xx1; + xx1 += 2 * STEP_C1 * STEP_C1; } dist0 += xx0; xx0 += 2 * STEP_C0 * STEP_C0; @@ -843,13 +843,13 @@ fill_inverse_cmap (j_decompress_ptr cinfo, int c0, int c1, int c2) { my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize; hist3d histogram = cquantize->histogram; - int minc0, 
minc1, minc2; /* lower left corner of update box */ + int minc0, minc1, minc2; /* lower left corner of update box */ int ic0, ic1, ic2; - register JSAMPLE * cptr; /* pointer into bestcolor[] array */ - register histptr cachep; /* pointer into main cache array */ + register JSAMPLE * cptr; /* pointer into bestcolor[] array */ + register histptr cachep; /* pointer into main cache array */ /* This array lists the candidate colormap indexes. */ JSAMPLE colorlist[MAXNUMCOLORS]; - int numcolors; /* number of candidate colors */ + int numcolors; /* number of candidate colors */ /* This array holds the actually closest colormap index for each cell. */ JSAMPLE bestcolor[BOX_C0_ELEMS * BOX_C1_ELEMS * BOX_C2_ELEMS]; @@ -873,10 +873,10 @@ fill_inverse_cmap (j_decompress_ptr cinfo, int c0, int c1, int c2) /* Determine the actually nearest colors. */ find_best_colors(cinfo, minc0, minc1, minc2, numcolors, colorlist, - bestcolor); + bestcolor); /* Save the best color numbers (plus 1) in the main cache array */ - c0 <<= BOX_C0_LOG; /* convert ID back to base cell indexes */ + c0 <<= BOX_C0_LOG; /* convert ID back to base cell indexes */ c1 <<= BOX_C1_LOG; c2 <<= BOX_C2_LOG; cptr = bestcolor; @@ -884,7 +884,7 @@ fill_inverse_cmap (j_decompress_ptr cinfo, int c0, int c1, int c2) for (ic1 = 0; ic1 < BOX_C1_ELEMS; ic1++) { cachep = & histogram[c0+ic0][c1+ic1][c2]; for (ic2 = 0; ic2 < BOX_C2_ELEMS; ic2++) { - *cachep++ = (histcell) (GETJSAMPLE(*cptr++) + 1); + *cachep++ = (histcell) (GETJSAMPLE(*cptr++) + 1); } } } @@ -897,7 +897,7 @@ fill_inverse_cmap (j_decompress_ptr cinfo, int c0, int c1, int c2) METHODDEF(void) pass2_no_dither (j_decompress_ptr cinfo, - JSAMPARRAY input_buf, JSAMPARRAY output_buf, int num_rows) + JSAMPARRAY input_buf, JSAMPARRAY output_buf, int num_rows) /* This version performs no dithering */ { my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize; @@ -921,7 +921,7 @@ pass2_no_dither (j_decompress_ptr cinfo, /* If we have not seen this color before, 
find nearest colormap entry */ /* and update the cache */ if (*cachep == 0) - fill_inverse_cmap(cinfo, c0,c1,c2); + fill_inverse_cmap(cinfo, c0,c1,c2); /* Now emit the colormap index for this cell */ *outptr++ = (JSAMPLE) (*cachep - 1); } @@ -931,20 +931,20 @@ pass2_no_dither (j_decompress_ptr cinfo, METHODDEF(void) pass2_fs_dither (j_decompress_ptr cinfo, - JSAMPARRAY input_buf, JSAMPARRAY output_buf, int num_rows) + JSAMPARRAY input_buf, JSAMPARRAY output_buf, int num_rows) /* This version performs Floyd-Steinberg dithering */ { my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize; hist3d histogram = cquantize->histogram; - register LOCFSERROR cur0, cur1, cur2; /* current error or pixel value */ + register LOCFSERROR cur0, cur1, cur2; /* current error or pixel value */ LOCFSERROR belowerr0, belowerr1, belowerr2; /* error for pixel below cur */ LOCFSERROR bpreverr0, bpreverr1, bpreverr2; /* error for below/prev col */ - register FSERRPTR errorptr; /* => fserrors[] at column before current */ - JSAMPROW inptr; /* => current input pixel */ - JSAMPROW outptr; /* => current output pixel */ + register FSERRPTR errorptr; /* => fserrors[] at column before current */ + JSAMPROW inptr; /* => current input pixel */ + JSAMPROW outptr; /* => current output pixel */ histptr cachep; - int dir; /* +1 or -1 depending on direction */ - int dir3; /* 3*dir, for advancing inptr & errorptr */ + int dir; /* +1 or -1 depending on direction */ + int dir3; /* 3*dir, for advancing inptr & errorptr */ int row; JDIMENSION col; JDIMENSION width = cinfo->output_width; @@ -960,7 +960,7 @@ pass2_fs_dither (j_decompress_ptr cinfo, outptr = output_buf[row]; if (cquantize->on_odd_row) { /* work right to left in this row */ - inptr += (width-1) * 3; /* so point to rightmost pixel */ + inptr += (width-1) * 3; /* so point to rightmost pixel */ outptr += width-1; dir = -1; dir3 = -3; @@ -1012,14 +1012,14 @@ pass2_fs_dither (j_decompress_ptr cinfo, /* If we have not seen this color before, 
find nearest colormap */ /* entry and update the cache */ if (*cachep == 0) - fill_inverse_cmap(cinfo, cur0>>C0_SHIFT,cur1>>C1_SHIFT,cur2>>C2_SHIFT); + fill_inverse_cmap(cinfo, cur0>>C0_SHIFT,cur1>>C1_SHIFT,cur2>>C2_SHIFT); /* Now emit the colormap index for this cell */ { register int pixcode = *cachep - 1; - *outptr = (JSAMPLE) pixcode; - /* Compute representation error for this pixel */ - cur0 -= GETJSAMPLE(colormap0[pixcode]); - cur1 -= GETJSAMPLE(colormap1[pixcode]); - cur2 -= GETJSAMPLE(colormap2[pixcode]); + *outptr = (JSAMPLE) pixcode; + /* Compute representation error for this pixel */ + cur0 -= GETJSAMPLE(colormap0[pixcode]); + cur1 -= GETJSAMPLE(colormap1[pixcode]); + cur2 -= GETJSAMPLE(colormap2[pixcode]); } /* Compute error fractions to be propagated to adjacent pixels. * Add these into the running sums, and simultaneously shift the @@ -1027,29 +1027,29 @@ pass2_fs_dither (j_decompress_ptr cinfo, */ { register LOCFSERROR bnexterr; - bnexterr = cur0; /* Process component 0 */ - errorptr[0] = (FSERROR) (bpreverr0 + cur0 * 3); - bpreverr0 = belowerr0 + cur0 * 5; - belowerr0 = bnexterr; - cur0 *= 7; - bnexterr = cur1; /* Process component 1 */ - errorptr[1] = (FSERROR) (bpreverr1 + cur1 * 3); - bpreverr1 = belowerr1 + cur1 * 5; - belowerr1 = bnexterr; - cur1 *= 7; - bnexterr = cur2; /* Process component 2 */ - errorptr[2] = (FSERROR) (bpreverr2 + cur2 * 3); - bpreverr2 = belowerr2 + cur2 * 5; - belowerr2 = bnexterr; - cur2 *= 7; + bnexterr = cur0; /* Process component 0 */ + errorptr[0] = (FSERROR) (bpreverr0 + cur0 * 3); + bpreverr0 = belowerr0 + cur0 * 5; + belowerr0 = bnexterr; + cur0 *= 7; + bnexterr = cur1; /* Process component 1 */ + errorptr[1] = (FSERROR) (bpreverr1 + cur1 * 3); + bpreverr1 = belowerr1 + cur1 * 5; + belowerr1 = bnexterr; + cur1 *= 7; + bnexterr = cur2; /* Process component 2 */ + errorptr[2] = (FSERROR) (bpreverr2 + cur2 * 3); + bpreverr2 = belowerr2 + cur2 * 5; + belowerr2 = bnexterr; + cur2 *= 7; } /* At this point curN contains 
the 7/16 error value to be propagated * to the next pixel on the current line, and all the errors for the * next line have been shifted over. We are therefore ready to move on. */ - inptr += dir3; /* Advance pixel pointers to next column */ + inptr += dir3; /* Advance pixel pointers to next column */ outptr += dir; - errorptr += dir3; /* advance errorptr to current column */ + errorptr += dir3; /* advance errorptr to current column */ } /* Post-loop cleanup: we must unload the final error values into the * final fserrors[] entry. Note we need not unload belowerrN because @@ -1089,7 +1089,7 @@ init_error_limit (j_decompress_ptr cinfo) table = (int *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, (MAXJSAMPLE*2+1) * SIZEOF(int)); - table += MAXJSAMPLE; /* so can index -MAXJSAMPLE .. +MAXJSAMPLE */ + table += MAXJSAMPLE; /* so can index -MAXJSAMPLE .. +MAXJSAMPLE */ cquantize->error_limiter = table; #define STEPSIZE ((MAXJSAMPLE+1)/16) @@ -1172,16 +1172,16 @@ start_pass_2_quant (j_decompress_ptr cinfo, boolean is_pre_scan) if (cinfo->dither_mode == JDITHER_FS) { size_t arraysize = (size_t) ((cinfo->output_width + 2) * - (3 * SIZEOF(FSERROR))); + (3 * SIZEOF(FSERROR))); /* Allocate Floyd-Steinberg workspace if we didn't already. */ if (cquantize->fserrors == NULL) - cquantize->fserrors = (FSERRPTR) (*cinfo->mem->alloc_large) - ((j_common_ptr) cinfo, JPOOL_IMAGE, arraysize); + cquantize->fserrors = (FSERRPTR) (*cinfo->mem->alloc_large) + ((j_common_ptr) cinfo, JPOOL_IMAGE, arraysize); /* Initialize the propagated errors to zero. */ jzero_far((void FAR *) cquantize->fserrors, arraysize); /* Make the error-limit table if we didn't already. 
*/ if (cquantize->error_limiter == NULL) - init_error_limit(cinfo); + init_error_limit(cinfo); cquantize->on_odd_row = FALSE; } @@ -1190,7 +1190,7 @@ start_pass_2_quant (j_decompress_ptr cinfo, boolean is_pre_scan) if (cquantize->needs_zeroed) { for (i = 0; i < HIST_C0_ELEMS; i++) { jzero_far((void FAR *) histogram[i], - HIST_C1_ELEMS*HIST_C2_ELEMS * SIZEOF(histcell)); + HIST_C1_ELEMS*HIST_C2_ELEMS * SIZEOF(histcell)); } cquantize->needs_zeroed = FALSE; } @@ -1223,11 +1223,11 @@ jinit_2pass_quantizer (j_decompress_ptr cinfo) cquantize = (my_cquantize_ptr) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(my_cquantizer)); + SIZEOF(my_cquantizer)); cinfo->cquantize = (struct jpeg_color_quantizer *) cquantize; cquantize->pub.start_pass = start_pass_2_quant; cquantize->pub.new_color_map = new_color_map_2_quant; - cquantize->fserrors = NULL; /* flag optional arrays not allocated */ + cquantize->fserrors = NULL; /* flag optional arrays not allocated */ cquantize->error_limiter = NULL; /* Make sure jdmaster didn't give me a case I can't handle */ diff --git a/jstdhuff.c b/jstdhuff.c index 7fec6cafa..0e9aff421 100644 --- a/jstdhuff.c +++ b/jstdhuff.c @@ -17,7 +17,7 @@ LOCAL(void) add_huff_table (j_common_ptr cinfo, - JHUFF_TBL **htblptr, const UINT8 *bits, const UINT8 *val) + JHUFF_TBL **htblptr, const UINT8 *bits, const UINT8 *val) /* Define a Huffman table */ { int nsymbols, len; diff --git a/jutils.c b/jutils.c index d18a95556..b9997a394 100644 --- a/jutils.c +++ b/jutils.c @@ -21,7 +21,7 @@ * of a DCT block read in natural order (left to right, top to bottom). */ -#if 0 /* This table is not actually needed in v6a */ +#if 0 /* This table is not actually needed in v6a */ const int jpeg_zigzag_order[DCTSIZE2] = { 0, 1, 5, 6, 14, 15, 27, 28, @@ -96,21 +96,21 @@ jround_up (long a, long b) * is not all that great, because these routines aren't very heavily used.) 
*/ -#ifndef NEED_FAR_POINTERS /* normal case, same as regular macros */ -#define FMEMCOPY(dest,src,size) MEMCOPY(dest,src,size) -#define FMEMZERO(target,size) MEMZERO(target,size) -#else /* 80x86 case, define if we can */ +#ifndef NEED_FAR_POINTERS /* normal case, same as regular macros */ +#define FMEMCOPY(dest,src,size) MEMCOPY(dest,src,size) +#define FMEMZERO(target,size) MEMZERO(target,size) +#else /* 80x86 case, define if we can */ #ifdef USE_FMEM -#define FMEMCOPY(dest,src,size) _fmemcpy((void FAR *)(dest), (const void FAR *)(src), (size_t)(size)) -#define FMEMZERO(target,size) _fmemset((void FAR *)(target), 0, (size_t)(size)) +#define FMEMCOPY(dest,src,size) _fmemcpy((void FAR *)(dest), (const void FAR *)(src), (size_t)(size)) +#define FMEMZERO(target,size) _fmemset((void FAR *)(target), 0, (size_t)(size)) #endif #endif GLOBAL(void) jcopy_sample_rows (JSAMPARRAY input_array, int source_row, - JSAMPARRAY output_array, int dest_row, - int num_rows, JDIMENSION num_cols) + JSAMPARRAY output_array, int dest_row, + int num_rows, JDIMENSION num_cols) /* Copy some rows of samples from one place to another. * num_rows rows are copied from input_array[source_row++] * to output_array[dest_row++]; these areas may overlap for duplication. @@ -135,7 +135,7 @@ jcopy_sample_rows (JSAMPARRAY input_array, int source_row, FMEMCOPY(outptr, inptr, count); #else for (count = num_cols; count > 0; count--) - *outptr++ = *inptr++; /* needn't bother with GETJSAMPLE() here */ + *outptr++ = *inptr++; /* needn't bother with GETJSAMPLE() here */ #endif } } @@ -143,7 +143,7 @@ jcopy_sample_rows (JSAMPARRAY input_array, int source_row, GLOBAL(void) jcopy_block_row (JBLOCKROW input_row, JBLOCKROW output_row, - JDIMENSION num_blocks) + JDIMENSION num_blocks) /* Copy a row of coefficient blocks from one place to another. 
*/ { #ifdef FMEMCOPY diff --git a/jversion.h b/jversion.h index e1c4f499d..5ddadc810 100644 --- a/jversion.h +++ b/jversion.h @@ -13,22 +13,22 @@ #if JPEG_LIB_VERSION >= 80 -#define JVERSION "8d 15-Jan-2012" +#define JVERSION "8d 15-Jan-2012" #elif JPEG_LIB_VERSION >= 70 -#define JVERSION "7 27-Jun-2009" +#define JVERSION "7 27-Jun-2009" #else -#define JVERSION "6b 27-Mar-1998" +#define JVERSION "6b 27-Mar-1998" #endif -#define JCOPYRIGHT "Copyright (C) 1991-2012 Thomas G. Lane, Guido Vollbeding\n" \ - "Copyright (C) 1999-2006 MIYASAKA Masaru\n" \ - "Copyright (C) 2009 Pierre Ossman for Cendio AB\n" \ - "Copyright (C) 2009-2014 D. R. Commander\n" \ - "Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies)\n" \ - "Copyright (C) 2013-2014 MIPS Technologies, Inc.\n" \ - "Copyright (C) 2013 Linaro Limited" +#define JCOPYRIGHT "Copyright (C) 1991-2012 Thomas G. Lane, Guido Vollbeding\n" \ + "Copyright (C) 1999-2006 MIYASAKA Masaru\n" \ + "Copyright (C) 2009 Pierre Ossman for Cendio AB\n" \ + "Copyright (C) 2009-2014 D. R. 
Commander\n" \ + "Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies)\n" \ + "Copyright (C) 2013-2014 MIPS Technologies, Inc.\n" \ + "Copyright (C) 2013 Linaro Limited" diff --git a/libjpeg.txt b/libjpeg.txt index 88d3a5d17..d11073801 100644 --- a/libjpeg.txt +++ b/libjpeg.txt @@ -27,32 +27,32 @@ TABLE OF CONTENTS ----------------- Overview: - Functions provided by the library - Outline of typical usage + Functions provided by the library + Outline of typical usage Basic library usage: - Data formats - Compression details - Decompression details - Mechanics of usage: include files, linking, etc + Data formats + Compression details + Decompression details + Mechanics of usage: include files, linking, etc Advanced features: - Compression parameter selection - Decompression parameter selection - Special color spaces - Error handling - Compressed data handling (source and destination managers) - I/O suspension - Progressive JPEG support - Buffered-image mode - Abbreviated datastreams and multiple images - Special markers - Raw (downsampled) image data - Really raw data: DCT coefficients - Progress monitoring - Memory management - Memory usage - Library compile-time options - Portability considerations - Notes for MS-DOS implementors + Compression parameter selection + Decompression parameter selection + Special color spaces + Error handling + Compressed data handling (source and destination managers) + I/O suspension + Progressive JPEG support + Buffered-image mode + Abbreviated datastreams and multiple images + Special markers + Raw (downsampled) image data + Really raw data: DCT coefficients + Progress monitoring + Memory management + Memory usage + Library compile-time options + Portability considerations + Notes for MS-DOS implementors You should read at least the overview and basic usage sections before trying to program with the library. 
The sections on advanced features can be read @@ -93,10 +93,10 @@ A word about functions *not* provided by the library. We handle a subset of the ISO JPEG standard; most baseline, extended-sequential, and progressive JPEG processes are supported. (Our subset includes all features now in common use.) Unsupported ISO options include: - * Hierarchical storage - * Lossless JPEG - * DNL marker - * Nonintegral subsampling ratios + * Hierarchical storage + * Lossless JPEG + * DNL marker + * Nonintegral subsampling ratios We support both 8- and 12-bit data precision, but this is a compile-time choice rather than a run-time choice; hence it is difficult to use both precisions in a single application. @@ -113,14 +113,14 @@ Outline of typical usage The rough outline of a JPEG compression operation is: - Allocate and initialize a JPEG compression object - Specify the destination for the compressed data (eg, a file) - Set parameters for compression, including image size & colorspace - jpeg_start_compress(...); - while (scan lines remain to be written) - jpeg_write_scanlines(...); - jpeg_finish_compress(...); - Release the JPEG compression object + Allocate and initialize a JPEG compression object + Specify the destination for the compressed data (eg, a file) + Set parameters for compression, including image size & colorspace + jpeg_start_compress(...); + while (scan lines remain to be written) + jpeg_write_scanlines(...); + jpeg_finish_compress(...); + Release the JPEG compression object A JPEG compression object holds parameters and working state for the JPEG library. We make creation/destruction of the object separate from starting @@ -139,15 +139,15 @@ provide its own destination manager to do something else. 
Similarly, the rough outline of a JPEG decompression operation is: - Allocate and initialize a JPEG decompression object - Specify the source of the compressed data (eg, a file) - Call jpeg_read_header() to obtain image info - Set parameters for decompression - jpeg_start_decompress(...); - while (scan lines remain to be read) - jpeg_read_scanlines(...); - jpeg_finish_decompress(...); - Release the JPEG decompression object + Allocate and initialize a JPEG decompression object + Specify the source of the compressed data (eg, a file) + Call jpeg_read_header() to obtain image info + Set parameters for decompression + jpeg_start_decompress(...); + while (scan lines remain to be read) + jpeg_read_scanlines(...); + jpeg_finish_decompress(...); + Release the JPEG decompression object This is comparable to the compression outline except that reading the datastream header is a separate step. This is helpful because information @@ -272,11 +272,11 @@ initialize the rest of the JPEG object. Typical code for this step, if you are using the default error handler, is - struct jpeg_compress_struct cinfo; - struct jpeg_error_mgr jerr; - ... - cinfo.err = jpeg_std_error(&jerr); - jpeg_create_compress(&cinfo); + struct jpeg_compress_struct cinfo; + struct jpeg_error_mgr jerr; + ... + cinfo.err = jpeg_std_error(&jerr); + jpeg_create_compress(&cinfo); jpeg_create_compress allocates a small amount of memory, so it could fail if you are out of memory. In that case it will exit via the error handler; @@ -293,13 +293,13 @@ destination module if you want to do something else, as discussed later. If you use the standard destination module, you must open the target stdio stream beforehand. Typical code for this step looks like: - FILE * outfile; - ... - if ((outfile = fopen(filename, "wb")) == NULL) { - fprintf(stderr, "can't open %s\n", filename); - exit(1); - } - jpeg_stdio_dest(&cinfo, outfile); + FILE * outfile; + ... 
+ if ((outfile = fopen(filename, "wb")) == NULL) { + fprintf(stderr, "can't open %s\n", filename); + exit(1); + } + jpeg_stdio_dest(&cinfo, outfile); where the last line invokes the standard destination module. @@ -320,10 +320,10 @@ calling jpeg_start_compress() and jpeg_finish_compress(). You must supply information about the source image by setting the following fields in the JPEG object (cinfo structure): - image_width Width of image, in pixels - image_height Height of image, in pixels - input_components Number of color channels (samples per pixel) - in_color_space Color space of source image + image_width Width of image, in pixels + image_height Height of image, in pixels + input_components Number of color channels (samples per pixel) + in_color_space Color space of source image The image dimensions are, hopefully, obvious. JPEG supports image dimensions of 1 to 64K pixels in either direction. The input color space is typically @@ -347,13 +347,13 @@ than once, if that happens to be convenient. Typical code for a 24-bit RGB source image is - cinfo.image_width = Width; /* image width and height, in pixels */ - cinfo.image_height = Height; - cinfo.input_components = 3; /* # of color components per pixel */ - cinfo.in_color_space = JCS_RGB; /* colorspace of input image */ + cinfo.image_width = Width; /* image width and height, in pixels */ + cinfo.image_height = Height; + cinfo.input_components = 3; /* # of color components per pixel */ + cinfo.in_color_space = JCS_RGB; /* colorspace of input image */ - jpeg_set_defaults(&cinfo); - /* Make optional parameter settings here */ + jpeg_set_defaults(&cinfo); + /* Make optional parameter settings here */ 4. jpeg_start_compress(...); @@ -365,7 +365,7 @@ storage, and emit the first few bytes of the JPEG datastream header. Typical code: - jpeg_start_compress(&cinfo, TRUE); + jpeg_start_compress(&cinfo, TRUE); The "TRUE" parameter ensures that a complete JPEG interchange datastream will be written. 
This is appropriate in most cases. If you think you might @@ -378,7 +378,7 @@ the compression cycle. 5. while (scan lines remain to be written) - jpeg_write_scanlines(...); + jpeg_write_scanlines(...); Now write all the required image data by calling jpeg_write_scanlines() one or more times. You can pass one or more scanlines in each call, up @@ -403,15 +403,15 @@ Code for this step depends heavily on the way that you store the source data. example.c shows the following code for the case of a full-size 2-D source array containing 3-byte RGB pixels: - JSAMPROW row_pointer[1]; /* pointer to a single row */ - int row_stride; /* physical row width in buffer */ + JSAMPROW row_pointer[1]; /* pointer to a single row */ + int row_stride; /* physical row width in buffer */ - row_stride = image_width * 3; /* JSAMPLEs per row in image_buffer */ + row_stride = image_width * 3; /* JSAMPLEs per row in image_buffer */ - while (cinfo.next_scanline < cinfo.image_height) { - row_pointer[0] = & image_buffer[cinfo.next_scanline * row_stride]; - jpeg_write_scanlines(&cinfo, row_pointer, 1); - } + while (cinfo.next_scanline < cinfo.image_height) { + row_pointer[0] = & image_buffer[cinfo.next_scanline * row_stride]; + jpeg_write_scanlines(&cinfo, row_pointer, 1); + } jpeg_write_scanlines() returns the number of scanlines actually written. This will normally be equal to the number passed in, so you can usually @@ -436,7 +436,7 @@ object. Typical code: - jpeg_finish_compress(&cinfo); + jpeg_finish_compress(&cinfo); If using the stdio destination manager, don't forget to close the output stdio stream (if necessary) afterwards. @@ -479,7 +479,7 @@ handler structure. Typical code: - jpeg_destroy_compress(&cinfo); + jpeg_destroy_compress(&cinfo); 8. Aborting. @@ -520,11 +520,11 @@ call jpeg_create_decompress(). Error handling is exactly the same. Typical code: - struct jpeg_decompress_struct cinfo; - struct jpeg_error_mgr jerr; - ... 
- cinfo.err = jpeg_std_error(&jerr); - jpeg_create_decompress(&cinfo); + struct jpeg_decompress_struct cinfo; + struct jpeg_error_mgr jerr; + ... + cinfo.err = jpeg_std_error(&jerr); + jpeg_create_decompress(&cinfo); (Both here and in the IJG code, we usually use variable name "cinfo" for both compression and decompression objects.) @@ -540,13 +540,13 @@ to do something else, as discussed later. If you use the standard source module, you must open the source stdio stream beforehand. Typical code for this step looks like: - FILE * infile; - ... - if ((infile = fopen(filename, "rb")) == NULL) { - fprintf(stderr, "can't open %s\n", filename); - exit(1); - } - jpeg_stdio_src(&cinfo, infile); + FILE * infile; + ... + if ((infile = fopen(filename, "rb")) == NULL) { + fprintf(stderr, "can't open %s\n", filename); + exit(1); + } + jpeg_stdio_src(&cinfo, infile); where the last line invokes the standard source module. @@ -569,7 +569,7 @@ being discarded. Typical code for this step is just - jpeg_read_header(&cinfo, TRUE); + jpeg_read_header(&cinfo, TRUE); This will read the source datastream header markers, up to the beginning of the compressed data proper. On return, the image dimensions and other @@ -617,7 +617,7 @@ memory, and prepare for returning data. Typical code is just - jpeg_start_decompress(&cinfo); + jpeg_start_decompress(&cinfo); If you have requested a multi-pass operating mode, such as 2-pass color quantization, jpeg_start_decompress() will do everything needed before data @@ -630,12 +630,12 @@ After this call, the final output image dimensions, including any requested scaling, are available in the JPEG object; so is the selected colormap, if colormapped output has been requested. 
Useful fields include - output_width image width and height, as scaled - output_height - out_color_components # of color components in out_color_space - output_components # of color components returned per pixel - colormap the selected colormap, if any - actual_number_of_colors number of entries in colormap + output_width image width and height, as scaled + output_height + out_color_components # of color components in out_color_space + output_components # of color components returned per pixel + colormap the selected colormap, if any + actual_number_of_colors number of entries in colormap output_components is 1 (a colormap index) when quantizing colors; otherwise it equals out_color_components. It is the number of JSAMPLE values that will be @@ -654,7 +654,7 @@ relevant parameters (scaling, output color space, and quantization flag). 6. while (scan lines remain to be read) - jpeg_read_scanlines(...); + jpeg_read_scanlines(...); Now you can read the decompressed image data by calling jpeg_read_scanlines() one or more times. At each call, you pass in the maximum number of scanlines @@ -696,7 +696,7 @@ with the JPEG object to be released. Typical code: - jpeg_finish_decompress(&cinfo); + jpeg_finish_decompress(&cinfo); If using the stdio source manager, don't forget to close the source stdio stream if necessary. @@ -719,7 +719,7 @@ destroying compression objects applies here too. Typical code: - jpeg_destroy_decompress(&cinfo); + jpeg_destroy_decompress(&cinfo); 9. Aborting. @@ -800,220 +800,220 @@ cinfo fields directly. The helper routines are: jpeg_set_defaults (j_compress_ptr cinfo) - This routine sets all JPEG parameters to reasonable defaults, using - only the input image's color space (field in_color_space, which must - already be set in cinfo). Many applications will only need to use - this routine and perhaps jpeg_set_quality(). 
+ This routine sets all JPEG parameters to reasonable defaults, using + only the input image's color space (field in_color_space, which must + already be set in cinfo). Many applications will only need to use + this routine and perhaps jpeg_set_quality(). jpeg_set_colorspace (j_compress_ptr cinfo, J_COLOR_SPACE colorspace) - Sets the JPEG file's colorspace (field jpeg_color_space) as specified, - and sets other color-space-dependent parameters appropriately. See - "Special color spaces", below, before using this. A large number of - parameters, including all per-component parameters, are set by this - routine; if you want to twiddle individual parameters you should call - jpeg_set_colorspace() before rather than after. + Sets the JPEG file's colorspace (field jpeg_color_space) as specified, + and sets other color-space-dependent parameters appropriately. See + "Special color spaces", below, before using this. A large number of + parameters, including all per-component parameters, are set by this + routine; if you want to twiddle individual parameters you should call + jpeg_set_colorspace() before rather than after. jpeg_default_colorspace (j_compress_ptr cinfo) - Selects an appropriate JPEG colorspace based on cinfo->in_color_space, - and calls jpeg_set_colorspace(). This is actually a subroutine of - jpeg_set_defaults(). It's broken out in case you want to change - just the colorspace-dependent JPEG parameters. + Selects an appropriate JPEG colorspace based on cinfo->in_color_space, + and calls jpeg_set_colorspace(). This is actually a subroutine of + jpeg_set_defaults(). It's broken out in case you want to change + just the colorspace-dependent JPEG parameters. jpeg_set_quality (j_compress_ptr cinfo, int quality, boolean force_baseline) - Constructs JPEG quantization tables appropriate for the indicated - quality setting. The quality value is expressed on the 0..100 scale - recommended by IJG (cjpeg's "-quality" switch uses this routine). 
- Note that the exact mapping from quality values to tables may change - in future IJG releases as more is learned about DCT quantization. - If the force_baseline parameter is TRUE, then the quantization table - entries are constrained to the range 1..255 for full JPEG baseline - compatibility. In the current implementation, this only makes a - difference for quality settings below 25, and it effectively prevents - very small/low quality files from being generated. The IJG decoder - is capable of reading the non-baseline files generated at low quality - settings when force_baseline is FALSE, but other decoders may not be. + Constructs JPEG quantization tables appropriate for the indicated + quality setting. The quality value is expressed on the 0..100 scale + recommended by IJG (cjpeg's "-quality" switch uses this routine). + Note that the exact mapping from quality values to tables may change + in future IJG releases as more is learned about DCT quantization. + If the force_baseline parameter is TRUE, then the quantization table + entries are constrained to the range 1..255 for full JPEG baseline + compatibility. In the current implementation, this only makes a + difference for quality settings below 25, and it effectively prevents + very small/low quality files from being generated. The IJG decoder + is capable of reading the non-baseline files generated at low quality + settings when force_baseline is FALSE, but other decoders may not be. jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor, - boolean force_baseline) - Same as jpeg_set_quality() except that the generated tables are the - sample tables given in the JPEC spec section K.1, multiplied by the - specified scale factor (which is expressed as a percentage; thus - scale_factor = 100 reproduces the spec's tables). Note that larger - scale factors give lower quality. 
This entry point is useful for - conforming to the Adobe PostScript DCT conventions, but we do not - recommend linear scaling as a user-visible quality scale otherwise. - force_baseline again constrains the computed table entries to 1..255. + boolean force_baseline) + Same as jpeg_set_quality() except that the generated tables are the + sample tables given in the JPEG spec section K.1, multiplied by the + specified scale factor (which is expressed as a percentage; thus + scale_factor = 100 reproduces the spec's tables). Note that larger + scale factors give lower quality. This entry point is useful for + conforming to the Adobe PostScript DCT conventions, but we do not + recommend linear scaling as a user-visible quality scale otherwise. + force_baseline again constrains the computed table entries to 1..255. int jpeg_quality_scaling (int quality) - Converts a value on the IJG-recommended quality scale to a linear - scaling percentage. Note that this routine may change or go away - in future releases --- IJG may choose to adopt a scaling method that - can't be expressed as a simple scalar multiplier, in which case the - premise of this routine collapses. Caveat user. + Converts a value on the IJG-recommended quality scale to a linear + scaling percentage. Note that this routine may change or go away + in future releases --- IJG may choose to adopt a scaling method that + can't be expressed as a simple scalar multiplier, in which case the + premise of this routine collapses. Caveat user. jpeg_default_qtables (j_compress_ptr cinfo, boolean force_baseline) - [libjpeg v7+ API/ABI emulation only] - Set default quantization tables with linear q_scale_factor[] values - (see below). + [libjpeg v7+ API/ABI emulation only] + Set default quantization tables with linear q_scale_factor[] values + (see below).
jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl, - const unsigned int *basic_table, - int scale_factor, boolean force_baseline) - Allows an arbitrary quantization table to be created. which_tbl - indicates which table slot to fill. basic_table points to an array - of 64 unsigned ints given in normal array order. These values are - multiplied by scale_factor/100 and then clamped to the range 1..65535 - (or to 1..255 if force_baseline is TRUE). - CAUTION: prior to library version 6a, jpeg_add_quant_table expected - the basic table to be given in JPEG zigzag order. If you need to - write code that works with either older or newer versions of this - routine, you must check the library version number. Something like - "#if JPEG_LIB_VERSION >= 61" is the right test. + const unsigned int *basic_table, + int scale_factor, boolean force_baseline) + Allows an arbitrary quantization table to be created. which_tbl + indicates which table slot to fill. basic_table points to an array + of 64 unsigned ints given in normal array order. These values are + multiplied by scale_factor/100 and then clamped to the range 1..65535 + (or to 1..255 if force_baseline is TRUE). + CAUTION: prior to library version 6a, jpeg_add_quant_table expected + the basic table to be given in JPEG zigzag order. If you need to + write code that works with either older or newer versions of this + routine, you must check the library version number. Something like + "#if JPEG_LIB_VERSION >= 61" is the right test. jpeg_simple_progression (j_compress_ptr cinfo) - Generates a default scan script for writing a progressive-JPEG file. - This is the recommended method of creating a progressive file, - unless you want to make a custom scan sequence. You must ensure that - the JPEG color space is set correctly before calling this routine. + Generates a default scan script for writing a progressive-JPEG file. 
+ This is the recommended method of creating a progressive file, + unless you want to make a custom scan sequence. You must ensure that + the JPEG color space is set correctly before calling this routine. Compression parameters (cinfo fields) include: J_DCT_METHOD dct_method - Selects the algorithm used for the DCT step. Choices are: - JDCT_ISLOW: slow but accurate integer algorithm - JDCT_IFAST: faster, less accurate integer method - JDCT_FLOAT: floating-point method - JDCT_DEFAULT: default method (normally JDCT_ISLOW) - JDCT_FASTEST: fastest method (normally JDCT_IFAST) - The FLOAT method is very slightly more accurate than the ISLOW method, - but may give different results on different machines due to varying - roundoff behavior. The integer methods should give the same results - on all machines. On machines with sufficiently fast FP hardware, the - floating-point method may also be the fastest. The IFAST method is - considerably less accurate than the other two; its use is not - recommended if high quality is a concern. JDCT_DEFAULT and - JDCT_FASTEST are macros configurable by each installation. + Selects the algorithm used for the DCT step. Choices are: + JDCT_ISLOW: slow but accurate integer algorithm + JDCT_IFAST: faster, less accurate integer method + JDCT_FLOAT: floating-point method + JDCT_DEFAULT: default method (normally JDCT_ISLOW) + JDCT_FASTEST: fastest method (normally JDCT_IFAST) + The FLOAT method is very slightly more accurate than the ISLOW method, + but may give different results on different machines due to varying + roundoff behavior. The integer methods should give the same results + on all machines. On machines with sufficiently fast FP hardware, the + floating-point method may also be the fastest. The IFAST method is + considerably less accurate than the other two; its use is not + recommended if high quality is a concern. JDCT_DEFAULT and + JDCT_FASTEST are macros configurable by each installation. 
J_COLOR_SPACE jpeg_color_space int num_components - The JPEG color space and corresponding number of components; see - "Special color spaces", below, for more info. We recommend using - jpeg_set_color_space() if you want to change these. + The JPEG color space and corresponding number of components; see + "Special color spaces", below, for more info. We recommend using + jpeg_set_colorspace() if you want to change these. boolean optimize_coding - TRUE causes the compressor to compute optimal Huffman coding tables - for the image. This requires an extra pass over the data and - therefore costs a good deal of space and time. The default is - FALSE, which tells the compressor to use the supplied or default - Huffman tables. In most cases optimal tables save only a few percent - of file size compared to the default tables. Note that when this is - TRUE, you need not supply Huffman tables at all, and any you do - supply will be overwritten. + TRUE causes the compressor to compute optimal Huffman coding tables + for the image. This requires an extra pass over the data and + therefore costs a good deal of space and time. The default is + FALSE, which tells the compressor to use the supplied or default + Huffman tables. In most cases optimal tables save only a few percent + of file size compared to the default tables. Note that when this is + TRUE, you need not supply Huffman tables at all, and any you do + supply will be overwritten. unsigned int restart_interval int restart_in_rows - To emit restart markers in the JPEG file, set one of these nonzero. - Set restart_interval to specify the exact interval in MCU blocks. - Set restart_in_rows to specify the interval in MCU rows. (If - restart_in_rows is not 0, then restart_interval is set after the - image width in MCUs is computed.) Defaults are zero (no restarts). - One restart marker per MCU row is often a good choice.
- NOTE: the overhead of restart markers is higher in grayscale JPEG - files than in color files, and MUCH higher in progressive JPEGs. - If you use restarts, you may want to use larger intervals in those - cases. + To emit restart markers in the JPEG file, set one of these nonzero. + Set restart_interval to specify the exact interval in MCU blocks. + Set restart_in_rows to specify the interval in MCU rows. (If + restart_in_rows is not 0, then restart_interval is set after the + image width in MCUs is computed.) Defaults are zero (no restarts). + One restart marker per MCU row is often a good choice. + NOTE: the overhead of restart markers is higher in grayscale JPEG + files than in color files, and MUCH higher in progressive JPEGs. + If you use restarts, you may want to use larger intervals in those + cases. const jpeg_scan_info * scan_info int num_scans - By default, scan_info is NULL; this causes the compressor to write a - single-scan sequential JPEG file. If not NULL, scan_info points to - an array of scan definition records of length num_scans. The - compressor will then write a JPEG file having one scan for each scan - definition record. This is used to generate noninterleaved or - progressive JPEG files. The library checks that the scan array - defines a valid JPEG scan sequence. (jpeg_simple_progression creates - a suitable scan definition array for progressive JPEG.) This is - discussed further under "Progressive JPEG support". + By default, scan_info is NULL; this causes the compressor to write a + single-scan sequential JPEG file. If not NULL, scan_info points to + an array of scan definition records of length num_scans. The + compressor will then write a JPEG file having one scan for each scan + definition record. This is used to generate noninterleaved or + progressive JPEG files. The library checks that the scan array + defines a valid JPEG scan sequence. (jpeg_simple_progression creates + a suitable scan definition array for progressive JPEG.) 
This is + discussed further under "Progressive JPEG support". int smoothing_factor - If non-zero, the input image is smoothed; the value should be 1 for - minimal smoothing to 100 for maximum smoothing. Consult jcsample.c - for details of the smoothing algorithm. The default is zero. + If non-zero, the input image is smoothed; the value should be 1 for + minimal smoothing to 100 for maximum smoothing. Consult jcsample.c + for details of the smoothing algorithm. The default is zero. boolean write_JFIF_header - If TRUE, a JFIF APP0 marker is emitted. jpeg_set_defaults() and - jpeg_set_colorspace() set this TRUE if a JFIF-legal JPEG color space - (ie, YCbCr or grayscale) is selected, otherwise FALSE. + If TRUE, a JFIF APP0 marker is emitted. jpeg_set_defaults() and + jpeg_set_colorspace() set this TRUE if a JFIF-legal JPEG color space + (ie, YCbCr or grayscale) is selected, otherwise FALSE. UINT8 JFIF_major_version UINT8 JFIF_minor_version - The version number to be written into the JFIF marker. - jpeg_set_defaults() initializes the version to 1.01 (major=minor=1). - You should set it to 1.02 (major=1, minor=2) if you plan to write - any JFIF 1.02 extension markers. + The version number to be written into the JFIF marker. + jpeg_set_defaults() initializes the version to 1.01 (major=minor=1). + You should set it to 1.02 (major=1, minor=2) if you plan to write + any JFIF 1.02 extension markers. UINT8 density_unit UINT16 X_density UINT16 Y_density - The resolution information to be written into the JFIF marker; - not used otherwise. density_unit may be 0 for unknown, - 1 for dots/inch, or 2 for dots/cm. The default values are 0,1,1 - indicating square pixels of unknown size. + The resolution information to be written into the JFIF marker; + not used otherwise. density_unit may be 0 for unknown, + 1 for dots/inch, or 2 for dots/cm. The default values are 0,1,1 + indicating square pixels of unknown size. 
boolean write_Adobe_marker - If TRUE, an Adobe APP14 marker is emitted. jpeg_set_defaults() and - jpeg_set_colorspace() set this TRUE if JPEG color space RGB, CMYK, - or YCCK is selected, otherwise FALSE. It is generally a bad idea - to set both write_JFIF_header and write_Adobe_marker. In fact, - you probably shouldn't change the default settings at all --- the - default behavior ensures that the JPEG file's color space can be - recognized by the decoder. + If TRUE, an Adobe APP14 marker is emitted. jpeg_set_defaults() and + jpeg_set_colorspace() set this TRUE if JPEG color space RGB, CMYK, + or YCCK is selected, otherwise FALSE. It is generally a bad idea + to set both write_JFIF_header and write_Adobe_marker. In fact, + you probably shouldn't change the default settings at all --- the + default behavior ensures that the JPEG file's color space can be + recognized by the decoder. JQUANT_TBL * quant_tbl_ptrs[NUM_QUANT_TBLS] - Pointers to coefficient quantization tables, one per table slot, - or NULL if no table is defined for a slot. Usually these should - be set via one of the above helper routines; jpeg_add_quant_table() - is general enough to define any quantization table. The other - routines will set up table slot 0 for luminance quality and table - slot 1 for chrominance. + Pointers to coefficient quantization tables, one per table slot, + or NULL if no table is defined for a slot. Usually these should + be set via one of the above helper routines; jpeg_add_quant_table() + is general enough to define any quantization table. The other + routines will set up table slot 0 for luminance quality and table + slot 1 for chrominance. int q_scale_factor[NUM_QUANT_TBLS] - [libjpeg v7+ API/ABI emulation only] - Linear quantization scaling factors (0-100, default 100) - for use with jpeg_default_qtables(). - See rdswitch.c and cjpeg.c for an example of usage. 
- Note that the q_scale_factor[] values use "linear" scales, so JPEG - quality levels chosen by the user must be converted to these scales - using jpeg_quality_scaling(). Here is an example that corresponds to - cjpeg -quality 90,70: + [libjpeg v7+ API/ABI emulation only] + Linear quantization scaling factors (0-100, default 100) + for use with jpeg_default_qtables(). + See rdswitch.c and cjpeg.c for an example of usage. + Note that the q_scale_factor[] values use "linear" scales, so JPEG + quality levels chosen by the user must be converted to these scales + using jpeg_quality_scaling(). Here is an example that corresponds to + cjpeg -quality 90,70: - jpeg_set_defaults(cinfo); + jpeg_set_defaults(cinfo); - /* Set luminance quality 90. */ - cinfo->q_scale_factor[0] = jpeg_quality_scaling(90); - /* Set chrominance quality 70. */ - cinfo->q_scale_factor[1] = jpeg_quality_scaling(70); + /* Set luminance quality 90. */ + cinfo->q_scale_factor[0] = jpeg_quality_scaling(90); + /* Set chrominance quality 70. */ + cinfo->q_scale_factor[1] = jpeg_quality_scaling(70); - jpeg_default_qtables(cinfo, force_baseline); + jpeg_default_qtables(cinfo, force_baseline); - CAUTION: Setting separate quality levels for chrominance and luminance - is mainly only useful if chrominance subsampling is disabled. 2x2 - chrominance subsampling (AKA "4:2:0") is the default, but you can - explicitly disable subsampling as follows: + CAUTION: Setting separate quality levels for chrominance and luminance + is mainly only useful if chrominance subsampling is disabled. 
2x2 + chrominance subsampling (AKA "4:2:0") is the default, but you can + explicitly disable subsampling as follows: - cinfo->comp_info[0].v_samp_factor = 1; - cinfo->comp_info[0].h_samp_factor = 1; + cinfo->comp_info[0].v_samp_factor = 1; + cinfo->comp_info[0].h_samp_factor = 1; JHUFF_TBL * dc_huff_tbl_ptrs[NUM_HUFF_TBLS] JHUFF_TBL * ac_huff_tbl_ptrs[NUM_HUFF_TBLS] - Pointers to Huffman coding tables, one per table slot, or NULL if - no table is defined for a slot. Slots 0 and 1 are filled with the - JPEG sample tables by jpeg_set_defaults(). If you need to allocate - more table structures, jpeg_alloc_huff_table() may be used. - Note that optimal Huffman tables can be computed for an image - by setting optimize_coding, as discussed above; there's seldom - any need to mess with providing your own Huffman tables. + Pointers to Huffman coding tables, one per table slot, or NULL if + no table is defined for a slot. Slots 0 and 1 are filled with the + JPEG sample tables by jpeg_set_defaults(). If you need to allocate + more table structures, jpeg_alloc_huff_table() may be used. + Note that optimal Huffman tables can be computed for an image + by setting optimize_coding, as discussed above; there's seldom + any need to mess with providing your own Huffman tables. [libjpeg v7+ API/ABI emulation only] @@ -1024,7 +1024,7 @@ also call jpeg_calc_jpeg_dimensions() to obtain the values that will result from the current parameter settings. This can be useful if you are trying to pick a scaling ratio that will get close to a desired target size. -JDIMENSION jpeg_width Actual dimensions of output image. +JDIMENSION jpeg_width Actual dimensions of output image. JDIMENSION jpeg_height @@ -1035,32 +1035,32 @@ comp_info[] array is allocated by jpeg_set_defaults(); if you choose not to use that routine, it's up to you to allocate the array. int component_id - The one-byte identifier code to be recorded in the JPEG file for - this component. 
For the standard color spaces, we recommend you - leave the default values alone. + The one-byte identifier code to be recorded in the JPEG file for + this component. For the standard color spaces, we recommend you + leave the default values alone. int h_samp_factor int v_samp_factor - Horizontal and vertical sampling factors for the component; must - be 1..4 according to the JPEG standard. Note that larger sampling - factors indicate a higher-resolution component; many people find - this behavior quite unintuitive. The default values are 2,2 for - luminance components and 1,1 for chrominance components, except - for grayscale where 1,1 is used. + Horizontal and vertical sampling factors for the component; must + be 1..4 according to the JPEG standard. Note that larger sampling + factors indicate a higher-resolution component; many people find + this behavior quite unintuitive. The default values are 2,2 for + luminance components and 1,1 for chrominance components, except + for grayscale where 1,1 is used. int quant_tbl_no - Quantization table number for component. The default value is - 0 for luminance components and 1 for chrominance components. + Quantization table number for component. The default value is + 0 for luminance components and 1 for chrominance components. int dc_tbl_no int ac_tbl_no - DC and AC entropy coding table numbers. The default values are - 0 for luminance components and 1 for chrominance components. + DC and AC entropy coding table numbers. The default values are + 0 for luminance components and 1 for chrominance components. int component_index - Must equal the component's index in comp_info[]. (Beginning in - release v6, the compressor library will fill this in automatically; - you don't have to.) + Must equal the component's index in comp_info[]. (Beginning in + release v6, the compressor library will fill this in automatically; + you don't have to.) Decompression parameter selection @@ -1080,18 +1080,18 @@ processing. 
The following fields in the JPEG object are set by jpeg_read_header() and may be useful to the application in choosing decompression parameters: -JDIMENSION image_width Width and height of image +JDIMENSION image_width Width and height of image JDIMENSION image_height -int num_components Number of color components -J_COLOR_SPACE jpeg_color_space Colorspace of image -boolean saw_JFIF_marker TRUE if a JFIF APP0 marker was seen - UINT8 JFIF_major_version Version information from JFIF marker +int num_components Number of color components +J_COLOR_SPACE jpeg_color_space Colorspace of image +boolean saw_JFIF_marker TRUE if a JFIF APP0 marker was seen + UINT8 JFIF_major_version Version information from JFIF marker UINT8 JFIF_minor_version - UINT8 density_unit Resolution data from JFIF marker + UINT8 density_unit Resolution data from JFIF marker UINT16 X_density UINT16 Y_density -boolean saw_Adobe_marker TRUE if an Adobe APP14 marker was seen - UINT8 Adobe_transform Color transform code from Adobe marker +boolean saw_Adobe_marker TRUE if an Adobe APP14 marker was seen + UINT8 Adobe_transform Color transform code from Adobe marker The JPEG color space, unfortunately, is something of a guess since the JPEG standard proper does not provide a way to record it. In practice most files @@ -1103,51 +1103,51 @@ The decompression parameters that determine the basic properties of the returned image are: J_COLOR_SPACE out_color_space - Output color space. jpeg_read_header() sets an appropriate default - based on jpeg_color_space; typically it will be RGB or grayscale. - The application can change this field to request output in a different - colorspace. For example, set it to JCS_GRAYSCALE to get grayscale - output from a color file. (This is useful for previewing: grayscale - output is faster than full color since the color components need not - be processed.) 
Note that not all possible color space transforms are - currently implemented; you may need to extend jdcolor.c if you want an - unusual conversion. + Output color space. jpeg_read_header() sets an appropriate default + based on jpeg_color_space; typically it will be RGB or grayscale. + The application can change this field to request output in a different + colorspace. For example, set it to JCS_GRAYSCALE to get grayscale + output from a color file. (This is useful for previewing: grayscale + output is faster than full color since the color components need not + be processed.) Note that not all possible color space transforms are + currently implemented; you may need to extend jdcolor.c if you want an + unusual conversion. unsigned int scale_num, scale_denom - Scale the image by the fraction scale_num/scale_denom. Default is - 1/1, or no scaling. Currently, the only supported scaling ratios - are M/8 with all M from 1 to 16, or any reduced fraction thereof (such - as 1/2, 3/4, etc.) (The library design allows for arbitrary - scaling ratios but this is not likely to be implemented any time soon.) - Smaller scaling ratios permit significantly faster decoding since - fewer pixels need be processed and a simpler IDCT method can be used. + Scale the image by the fraction scale_num/scale_denom. Default is + 1/1, or no scaling. Currently, the only supported scaling ratios + are M/8 with all M from 1 to 16, or any reduced fraction thereof (such + as 1/2, 3/4, etc.) (The library design allows for arbitrary + scaling ratios but this is not likely to be implemented any time soon.) + Smaller scaling ratios permit significantly faster decoding since + fewer pixels need be processed and a simpler IDCT method can be used. boolean quantize_colors - If set TRUE, colormapped output will be delivered. Default is FALSE, - meaning that full-color output will be delivered. + If set TRUE, colormapped output will be delivered. 
Default is FALSE, + meaning that full-color output will be delivered. The next three parameters are relevant only if quantize_colors is TRUE. int desired_number_of_colors - Maximum number of colors to use in generating a library-supplied color - map (the actual number of colors is returned in a different field). - Default 256. Ignored when the application supplies its own color map. + Maximum number of colors to use in generating a library-supplied color + map (the actual number of colors is returned in a different field). + Default 256. Ignored when the application supplies its own color map. boolean two_pass_quantize - If TRUE, an extra pass over the image is made to select a custom color - map for the image. This usually looks a lot better than the one-size- - fits-all colormap that is used otherwise. Default is TRUE. Ignored - when the application supplies its own color map. + If TRUE, an extra pass over the image is made to select a custom color + map for the image. This usually looks a lot better than the one-size- + fits-all colormap that is used otherwise. Default is TRUE. Ignored + when the application supplies its own color map. J_DITHER_MODE dither_mode - Selects color dithering method. Supported values are: - JDITHER_NONE no dithering: fast, very low quality - JDITHER_ORDERED ordered dither: moderate speed and quality - JDITHER_FS Floyd-Steinberg dither: slow, high quality - Default is JDITHER_FS. (At present, ordered dither is implemented - only in the single-pass, standard-colormap case. If you ask for - ordered dither when two_pass_quantize is TRUE or when you supply - an external color map, you'll get F-S dithering.) + Selects color dithering method. Supported values are: + JDITHER_NONE no dithering: fast, very low quality + JDITHER_ORDERED ordered dither: moderate speed and quality + JDITHER_FS Floyd-Steinberg dither: slow, high quality + Default is JDITHER_FS. 
(At present, ordered dither is implemented + only in the single-pass, standard-colormap case. If you ask for + ordered dither when two_pass_quantize is TRUE or when you supply + an external color map, you'll get F-S dithering.) When quantize_colors is TRUE, the target color map is described by the next two fields. colormap is set to NULL by jpeg_read_header(). The application @@ -1158,39 +1158,39 @@ selects a suitable color map and sets these two fields itself. only accepted for 3-component output color spaces.] JSAMPARRAY colormap - The color map, represented as a 2-D pixel array of out_color_components - rows and actual_number_of_colors columns. Ignored if not quantizing. - CAUTION: if the JPEG library creates its own colormap, the storage - pointed to by this field is released by jpeg_finish_decompress(). - Copy the colormap somewhere else first, if you want to save it. + The color map, represented as a 2-D pixel array of out_color_components + rows and actual_number_of_colors columns. Ignored if not quantizing. + CAUTION: if the JPEG library creates its own colormap, the storage + pointed to by this field is released by jpeg_finish_decompress(). + Copy the colormap somewhere else first, if you want to save it. int actual_number_of_colors - The number of colors in the color map. + The number of colors in the color map. Additional decompression parameters that the application may set include: J_DCT_METHOD dct_method - Selects the algorithm used for the DCT step. Choices are the same - as described above for compression. + Selects the algorithm used for the DCT step. Choices are the same + as described above for compression. boolean do_fancy_upsampling - If TRUE, do careful upsampling of chroma components. If FALSE, - a faster but sloppier method is used. Default is TRUE. The visual - impact of the sloppier method is often very small. + If TRUE, do careful upsampling of chroma components. If FALSE, + a faster but sloppier method is used. Default is TRUE. 
The visual + impact of the sloppier method is often very small. boolean do_block_smoothing - If TRUE, interblock smoothing is applied in early stages of decoding - progressive JPEG files; if FALSE, not. Default is TRUE. Early - progression stages look "fuzzy" with smoothing, "blocky" without. - In any case, block smoothing ceases to be applied after the first few - AC coefficients are known to full accuracy, so it is relevant only - when using buffered-image mode for progressive images. + If TRUE, interblock smoothing is applied in early stages of decoding + progressive JPEG files; if FALSE, not. Default is TRUE. Early + progression stages look "fuzzy" with smoothing, "blocky" without. + In any case, block smoothing ceases to be applied after the first few + AC coefficients are known to full accuracy, so it is relevant only + when using buffered-image mode for progressive images. boolean enable_1pass_quant boolean enable_external_quant boolean enable_2pass_quant - These are significant only in buffered-image mode, which is - described in its own section below. + These are significant only in buffered-image mode, which is + described in its own section below. The output image dimensions are given by the following fields. These are @@ -1202,11 +1202,11 @@ close to a desired target size. It's also important if you are using the JPEG library's memory manager to allocate output buffer space, because you are supposed to request such buffers *before* jpeg_start_decompress(). -JDIMENSION output_width Actual dimensions of output image. +JDIMENSION output_width Actual dimensions of output image. JDIMENSION output_height -int out_color_components Number of color components in out_color_space. -int output_components Number of color components returned. -int rec_outbuf_height Recommended height of scanline buffer. +int out_color_components Number of color components in out_color_space. +int output_components Number of color components returned. 
+int rec_outbuf_height Recommended height of scanline buffer. When quantizing colors, output_components is 1, indicating a single color map index per pixel. Otherwise it equals out_color_components. The output arrays @@ -1246,10 +1246,10 @@ by jpeg_color_space. jpeg_set_defaults() chooses a reasonable JPEG color space depending on in_color_space, but you can override this by calling jpeg_set_colorspace(). Of course you must select a supported transformation. jccolor.c currently supports the following transformations: - RGB => YCbCr - RGB => GRAYSCALE - YCbCr => GRAYSCALE - CMYK => YCCK + RGB => YCbCr + RGB => GRAYSCALE + YCbCr => GRAYSCALE + CMYK => YCCK plus the null transforms: GRAYSCALE => GRAYSCALE, RGB => RGB, YCbCr => YCbCr, CMYK => CMYK, YCCK => YCCK, and UNKNOWN => UNKNOWN. @@ -1279,11 +1279,11 @@ jpeg_read_header's guess by setting jpeg_color_space. jpeg_read_header also selects a default output color space based on (its guess of) jpeg_color_space; set out_color_space to override this. Again, you must select a supported transformation. jdcolor.c currently supports - YCbCr => RGB - YCbCr => GRAYSCALE - RGB => GRAYSCALE - GRAYSCALE => RGB - YCCK => CMYK + YCbCr => RGB + YCbCr => GRAYSCALE + RGB => GRAYSCALE + GRAYSCALE => RGB + YCCK => CMYK as well as the null transforms. (Since GRAYSCALE=>RGB is provided, an application can force grayscale JPEGs to look like color JPEGs if it only wants to handle one case.) @@ -1353,31 +1353,31 @@ The library does not touch client_data at all.) The individual methods that you might wish to override are: error_exit (j_common_ptr cinfo) - Receives control for a fatal error. Information sufficient to - generate the error message has been stored in cinfo->err; call - output_message to display it. Control must NOT return to the caller; - generally this routine will exit() or longjmp() somewhere. - Typically you would override this routine to get rid of the exit() - default behavior. 
Note that if you continue processing, you should - clean up the JPEG object with jpeg_abort() or jpeg_destroy(). + Receives control for a fatal error. Information sufficient to + generate the error message has been stored in cinfo->err; call + output_message to display it. Control must NOT return to the caller; + generally this routine will exit() or longjmp() somewhere. + Typically you would override this routine to get rid of the exit() + default behavior. Note that if you continue processing, you should + clean up the JPEG object with jpeg_abort() or jpeg_destroy(). output_message (j_common_ptr cinfo) - Actual output of any JPEG message. Override this to send messages - somewhere other than stderr. Note that this method does not know - how to generate a message, only where to send it. + Actual output of any JPEG message. Override this to send messages + somewhere other than stderr. Note that this method does not know + how to generate a message, only where to send it. format_message (j_common_ptr cinfo, char * buffer) - Constructs a readable error message string based on the error info - stored in cinfo->err. This method is called by output_message. Few - applications should need to override this method. One possible - reason for doing so is to implement dynamic switching of error message - language. + Constructs a readable error message string based on the error info + stored in cinfo->err. This method is called by output_message. Few + applications should need to override this method. One possible + reason for doing so is to implement dynamic switching of error message + language. emit_message (j_common_ptr cinfo, int msg_level) - Decide whether or not to emit a warning or trace message; if so, - calls output_message. The main reason for overriding this method - would be to abort on warnings. msg_level is -1 for warnings, - 0 and up for trace messages. + Decide whether or not to emit a warning or trace message; if so, + calls output_message. 
The main reason for overriding this method + would be to abort on warnings. msg_level is -1 for warnings, + 0 and up for trace messages. Only error_exit() and emit_message() are called from the rest of the JPEG library; the other two are internal to the error handler. @@ -1400,9 +1400,9 @@ messages. See the sample applications cjpeg/djpeg for an example of using addon messages (the addon messages are defined in cderror.h). Actual invocation of the error handler is done via macros defined in jerror.h: - ERREXITn(...) for fatal errors - WARNMSn(...) for corrupt-data warnings - TRACEMSn(...) for trace and informational messages. + ERREXITn(...) for fatal errors + WARNMSn(...) for corrupt-data warnings + TRACEMSn(...) for trace and informational messages. These macros store the message code and any additional parameters into the error handler struct, then invoke the error_exit() or emit_message() method. The variants of each macro are for varying numbers of additional parameters. @@ -1443,8 +1443,8 @@ on external storage. A data destination manager struct contains a pointer and count defining the next byte to write in the work buffer and the remaining free space: - JOCTET * next_output_byte; /* => next byte to write in buffer */ - size_t free_in_buffer; /* # of byte spaces remaining in buffer */ + JOCTET * next_output_byte; /* => next byte to write in buffer */ + size_t free_in_buffer; /* # of byte spaces remaining in buffer */ The library increments the pointer and decrements the count until the buffer is filled. The manager's empty_output_buffer method must reset the pointer @@ -1454,27 +1454,27 @@ and total size in private fields not visible to the library. A data destination manager provides three methods: init_destination (j_compress_ptr cinfo) - Initialize destination. This is called by jpeg_start_compress() - before any data is actually written. It must initialize - next_output_byte and free_in_buffer. free_in_buffer must be - initialized to a positive value. 
+ Initialize destination. This is called by jpeg_start_compress() + before any data is actually written. It must initialize + next_output_byte and free_in_buffer. free_in_buffer must be + initialized to a positive value. empty_output_buffer (j_compress_ptr cinfo) - This is called whenever the buffer has filled (free_in_buffer - reaches zero). In typical applications, it should write out the - *entire* buffer (use the saved start address and buffer length; - ignore the current state of next_output_byte and free_in_buffer). - Then reset the pointer & count to the start of the buffer, and - return TRUE indicating that the buffer has been dumped. - free_in_buffer must be set to a positive value when TRUE is - returned. A FALSE return should only be used when I/O suspension is - desired (this operating mode is discussed in the next section). + This is called whenever the buffer has filled (free_in_buffer + reaches zero). In typical applications, it should write out the + *entire* buffer (use the saved start address and buffer length; + ignore the current state of next_output_byte and free_in_buffer). + Then reset the pointer & count to the start of the buffer, and + return TRUE indicating that the buffer has been dumped. + free_in_buffer must be set to a positive value when TRUE is + returned. A FALSE return should only be used when I/O suspension is + desired (this operating mode is discussed in the next section). term_destination (j_compress_ptr cinfo) - Terminate destination --- called by jpeg_finish_compress() after all - data has been written. In most applications, this must flush any - data remaining in the buffer. Use either next_output_byte or - free_in_buffer to determine how much data is in the buffer. + Terminate destination --- called by jpeg_finish_compress() after all + data has been written. In most applications, this must flush any + data remaining in the buffer. Use either next_output_byte or + free_in_buffer to determine how much data is in the buffer. 
term_destination() is NOT called by jpeg_abort() or jpeg_destroy(). If you want the destination manager to be cleaned up during an abort, you must do it @@ -1492,8 +1492,8 @@ additional frammishes. The source manager struct contains a pointer and count defining the next byte to read from the work buffer and the number of bytes remaining: - const JOCTET * next_input_byte; /* => next byte to read from buffer */ - size_t bytes_in_buffer; /* # of bytes remaining in buffer */ + const JOCTET * next_input_byte; /* => next byte to read from buffer */ + size_t bytes_in_buffer; /* # of bytes remaining in buffer */ The library increments the pointer and decrements the count until the buffer is emptied. The manager's fill_input_buffer method must reset the pointer and @@ -1503,47 +1503,47 @@ address and total size in private fields not visible to the library. A data source manager provides five methods: init_source (j_decompress_ptr cinfo) - Initialize source. This is called by jpeg_read_header() before any - data is actually read. Unlike init_destination(), it may leave - bytes_in_buffer set to 0 (in which case a fill_input_buffer() call - will occur immediately). + Initialize source. This is called by jpeg_read_header() before any + data is actually read. Unlike init_destination(), it may leave + bytes_in_buffer set to 0 (in which case a fill_input_buffer() call + will occur immediately). fill_input_buffer (j_decompress_ptr cinfo) - This is called whenever bytes_in_buffer has reached zero and more - data is wanted. In typical applications, it should read fresh data - into the buffer (ignoring the current state of next_input_byte and - bytes_in_buffer), reset the pointer & count to the start of the - buffer, and return TRUE indicating that the buffer has been reloaded. - It is not necessary to fill the buffer entirely, only to obtain at - least one more byte. bytes_in_buffer MUST be set to a positive value - if TRUE is returned. 
A FALSE return should only be used when I/O - suspension is desired (this mode is discussed in the next section). + This is called whenever bytes_in_buffer has reached zero and more + data is wanted. In typical applications, it should read fresh data + into the buffer (ignoring the current state of next_input_byte and + bytes_in_buffer), reset the pointer & count to the start of the + buffer, and return TRUE indicating that the buffer has been reloaded. + It is not necessary to fill the buffer entirely, only to obtain at + least one more byte. bytes_in_buffer MUST be set to a positive value + if TRUE is returned. A FALSE return should only be used when I/O + suspension is desired (this mode is discussed in the next section). skip_input_data (j_decompress_ptr cinfo, long num_bytes) - Skip num_bytes worth of data. The buffer pointer and count should - be advanced over num_bytes input bytes, refilling the buffer as - needed. This is used to skip over a potentially large amount of - uninteresting data (such as an APPn marker). In some applications - it may be possible to optimize away the reading of the skipped data, - but it's not clear that being smart is worth much trouble; large - skips are uncommon. bytes_in_buffer may be zero on return. - A zero or negative skip count should be treated as a no-op. + Skip num_bytes worth of data. The buffer pointer and count should + be advanced over num_bytes input bytes, refilling the buffer as + needed. This is used to skip over a potentially large amount of + uninteresting data (such as an APPn marker). In some applications + it may be possible to optimize away the reading of the skipped data, + but it's not clear that being smart is worth much trouble; large + skips are uncommon. bytes_in_buffer may be zero on return. + A zero or negative skip count should be treated as a no-op. 
resync_to_restart (j_decompress_ptr cinfo, int desired) - This routine is called only when the decompressor has failed to find - a restart (RSTn) marker where one is expected. Its mission is to - find a suitable point for resuming decompression. For most - applications, we recommend that you just use the default resync - procedure, jpeg_resync_to_restart(). However, if you are able to back - up in the input data stream, or if you have a-priori knowledge about - the likely location of restart markers, you may be able to do better. - Read the read_restart_marker() and jpeg_resync_to_restart() routines - in jdmarker.c if you think you'd like to implement your own resync - procedure. + This routine is called only when the decompressor has failed to find + a restart (RSTn) marker where one is expected. Its mission is to + find a suitable point for resuming decompression. For most + applications, we recommend that you just use the default resync + procedure, jpeg_resync_to_restart(). However, if you are able to back + up in the input data stream, or if you have a-priori knowledge about + the likely location of restart markers, you may be able to do better. + Read the read_restart_marker() and jpeg_resync_to_restart() routines + in jdmarker.c if you think you'd like to implement your own resync + procedure. term_source (j_decompress_ptr cinfo) - Terminate source --- called by jpeg_finish_decompress() after all - data has been read. Often a no-op. + Terminate source --- called by jpeg_finish_decompress() after all + data has been read. Often a no-op. For both fill_input_buffer() and skip_input_data(), there is no such thing as an EOF return. If the end of the file has been reached, the routine has @@ -1651,7 +1651,7 @@ that suspension has occurred. This can happen at four places: * jpeg_read_header(): will return JPEG_SUSPENDED. * jpeg_start_decompress(): will return FALSE, rather than its usual TRUE. 
* jpeg_read_scanlines(): will return the number of scanlines already - completed (possibly 0). + completed (possibly 0). * jpeg_finish_decompress(): will return FALSE, rather than its usual TRUE. The surrounding application must recognize these cases, load more data into the input buffer, and repeat the call. In the case of jpeg_read_scanlines(), @@ -1829,23 +1829,23 @@ rates. The basic control flow for buffered-image decoding is - jpeg_create_decompress() - set data source - jpeg_read_header() - set overall decompression parameters - cinfo.buffered_image = TRUE; /* select buffered-image mode */ - jpeg_start_decompress() - for (each output pass) { - adjust output decompression parameters if required - jpeg_start_output() /* start a new output pass */ - for (all scanlines in image) { - jpeg_read_scanlines() - display scanlines - } - jpeg_finish_output() /* terminate output pass */ - } - jpeg_finish_decompress() - jpeg_destroy_decompress() + jpeg_create_decompress() + set data source + jpeg_read_header() + set overall decompression parameters + cinfo.buffered_image = TRUE; /* select buffered-image mode */ + jpeg_start_decompress() + for (each output pass) { + adjust output decompression parameters if required + jpeg_start_output() /* start a new output pass */ + for (all scanlines in image) { + jpeg_read_scanlines() + display scanlines + } + jpeg_finish_output() /* terminate output pass */ + } + jpeg_finish_decompress() + jpeg_destroy_decompress() This differs from ordinary unbuffered decoding in that there is an additional level of looping. The application can choose how many output passes to make @@ -1854,9 +1854,9 @@ and how to display each pass. The simplest approach to displaying progressive images is to do one display pass for each scan appearing in the input file. In this case the outer loop condition is typically - while (! jpeg_input_complete(&cinfo)) + while (! 
jpeg_input_complete(&cinfo)) and the start-output call should read - jpeg_start_output(&cinfo, cinfo.input_scan_number); + jpeg_start_output(&cinfo, cinfo.input_scan_number); The second parameter to jpeg_start_output() indicates which scan of the input file is to be displayed; the scans are numbered starting at 1 for this purpose. (You can use a loop counter starting at 1 if you like, but using @@ -1887,11 +1887,11 @@ If input data arrives faster than it can be displayed, the application can cause the library to decode input data in advance of what's needed to produce output. This is done by calling the routine jpeg_consume_input(). The return value is one of the following: - JPEG_REACHED_SOS: reached an SOS marker (the start of a new scan) - JPEG_REACHED_EOI: reached the EOI marker (end of image) - JPEG_ROW_COMPLETED: completed reading one MCU row of compressed data - JPEG_SCAN_COMPLETED: completed reading last MCU row of current scan - JPEG_SUSPENDED: suspended before completing any of the above + JPEG_REACHED_SOS: reached an SOS marker (the start of a new scan) + JPEG_REACHED_EOI: reached the EOI marker (end of image) + JPEG_ROW_COMPLETED: completed reading one MCU row of compressed data + JPEG_SCAN_COMPLETED: completed reading last MCU row of current scan + JPEG_SUSPENDED: suspended before completing any of the above (JPEG_SUSPENDED can occur only if a suspending data source is used.) This routine can be called at any time after initializing the JPEG object. It reads some additional data and returns when one of the indicated significant @@ -1968,27 +1968,27 @@ When using this strategy, you'll want to be sure that you perform a final output pass after receiving all the data; otherwise your last display may not be full quality across the whole screen. 
So the right outer loop logic is something like this: - do { - absorb any waiting input by calling jpeg_consume_input() - final_pass = jpeg_input_complete(&cinfo); - adjust output decompression parameters if required - jpeg_start_output(&cinfo, cinfo.input_scan_number); - ... - jpeg_finish_output() - } while (! final_pass); + do { + absorb any waiting input by calling jpeg_consume_input() + final_pass = jpeg_input_complete(&cinfo); + adjust output decompression parameters if required + jpeg_start_output(&cinfo, cinfo.input_scan_number); + ... + jpeg_finish_output() + } while (! final_pass); rather than quitting as soon as jpeg_input_complete() returns TRUE. This arrangement makes it simple to use higher-quality decoding parameters for the final pass. But if you don't want to use special parameters for the final pass, the right loop logic is like this: - for (;;) { - absorb any waiting input by calling jpeg_consume_input() - jpeg_start_output(&cinfo, cinfo.input_scan_number); - ... - jpeg_finish_output() - if (jpeg_input_complete(&cinfo) && - cinfo.input_scan_number == cinfo.output_scan_number) - break; - } + for (;;) { + absorb any waiting input by calling jpeg_consume_input() + jpeg_start_output(&cinfo, cinfo.input_scan_number); + ... + jpeg_finish_output() + if (jpeg_input_complete(&cinfo) && + cinfo.input_scan_number == cinfo.output_scan_number) + break; + } In this case you don't need to know in advance whether an output pass is to be the last one, so it's not necessary to have reached EOF before starting the final output pass; rather, what you want to test is whether the output @@ -2097,9 +2097,9 @@ working-storage requirements, the library requires you to indicate which one(s) you intend to use before you call jpeg_start_decompress(). (If we did not require this, the max_memory_to_use setting would be a complete fiction.) 
You do this by setting one or more of these three cinfo fields to TRUE: - enable_1pass_quant Fixed color cube colormap - enable_external_quant Externally-supplied colormap - enable_2pass_quant Two-pass custom colormap + enable_1pass_quant Fixed color cube colormap + enable_external_quant Externally-supplied colormap + enable_2pass_quant Two-pass custom colormap All three are initialized FALSE by jpeg_read_header(). But jpeg_start_decompress() automatically sets TRUE the one selected by the current two_pass_quantize and colormap settings, so you only need to set the @@ -2250,14 +2250,14 @@ sent_tables flags will be set TRUE. A sure-fire way to create matching tables-only and abbreviated image files is to proceed as follows: - create JPEG compression object - set JPEG parameters - set destination to tables-only file - jpeg_write_tables(&cinfo); - set destination to image file - jpeg_start_compress(&cinfo, FALSE); - write data... - jpeg_finish_compress(&cinfo); + create JPEG compression object + set JPEG parameters + set destination to tables-only file + jpeg_write_tables(&cinfo); + set destination to image file + jpeg_start_compress(&cinfo, FALSE); + write data... + jpeg_finish_compress(&cinfo); Since the JPEG parameters are not altered between writing the table file and the abbreviated image file, the same tables are sure to be used. 
Of course, @@ -2285,7 +2285,7 @@ to load a fixed quantization table into table slot "n": if (cinfo.quant_tbl_ptrs[n] == NULL) cinfo.quant_tbl_ptrs[n] = jpeg_alloc_quant_table((j_common_ptr) &cinfo); - quant_ptr = cinfo.quant_tbl_ptrs[n]; /* quant_ptr is JQUANT_TBL* */ + quant_ptr = cinfo.quant_tbl_ptrs[n]; /* quant_ptr is JQUANT_TBL* */ for (i = 0; i < 64; i++) { /* Qtable[] is desired quantization table, in natural array order */ quant_ptr->quantval[i] = Qtable[i]; @@ -2295,7 +2295,7 @@ Code to load a fixed Huffman table is typically (for AC table "n"): if (cinfo.ac_huff_tbl_ptrs[n] == NULL) cinfo.ac_huff_tbl_ptrs[n] = jpeg_alloc_huff_table((j_common_ptr) &cinfo); - huff_ptr = cinfo.ac_huff_tbl_ptrs[n]; /* huff_ptr is JHUFF_TBL* */ + huff_ptr = cinfo.ac_huff_tbl_ptrs[n]; /* huff_ptr is JHUFF_TBL* */ for (i = 1; i <= 16; i++) { /* counts[i] is number of Huffman codes of length i bits, i=1..16 */ huff_ptr->bits[i] = counts[i]; @@ -2317,15 +2317,15 @@ sufficient to read a tables-only file. You must pass a second parameter of FALSE to indicate that you do not require an image to be present. Thus, the typical scenario is - create JPEG decompression object - set source to tables-only file - jpeg_read_header(&cinfo, FALSE); - set source to abbreviated image file - jpeg_read_header(&cinfo, TRUE); - set decompression parameters - jpeg_start_decompress(&cinfo); - read data... - jpeg_finish_decompress(&cinfo); + create JPEG decompression object + set source to tables-only file + jpeg_read_header(&cinfo, FALSE); + set source to abbreviated image file + jpeg_read_header(&cinfo, TRUE); + set decompression parameters + jpeg_start_decompress(&cinfo); + read data... + jpeg_finish_decompress(&cinfo); In some cases, you may want to read a file without knowing whether it contains an image or just tables. In that case, pass FALSE and check the return value @@ -2398,7 +2398,7 @@ all else. Specify the marker type parameter as "JPEG_COM" for COM or "JPEG_APP0 + n" for APPn. 
(Actually, jpeg_write_marker will let you write any marker type, but we don't recommend writing any other kinds of marker.) For example, to write a user comment string pointed to by comment_text: - jpeg_write_marker(cinfo, JPEG_COM, comment_text, strlen(comment_text)); + jpeg_write_marker(cinfo, JPEG_COM, comment_text, strlen(comment_text)); If it's not convenient to store all the marker data in memory at once, you can instead call jpeg_write_m_header() followed by multiple calls to @@ -2444,7 +2444,7 @@ determined separately for COM markers and for each APPn marker code. To save the contents of special markers in memory, call - jpeg_save_markers(cinfo, marker_code, length_limit) + jpeg_save_markers(cinfo, marker_code, length_limit) where marker_code is the marker type to save, JPEG_COM or JPEG_APP0+n. (To arrange to save all the special marker types, you need to call this routine 17 times, for COM and APP0-APP15.) If the incoming marker is longer @@ -2489,7 +2489,7 @@ effective length limit is exactly what you set it to be. If you want to supply your own marker-reading routine, you do it by calling jpeg_set_marker_processor(). A marker processor routine must have the signature - boolean jpeg_marker_parser_method (j_decompress_ptr cinfo) + boolean jpeg_marker_parser_method (j_decompress_ptr cinfo) Although the marker code is not explicitly passed, the routine can find it in cinfo->unread_marker. At the time of call, the marker proper has been read from the data source module. The processor routine is responsible for @@ -2576,8 +2576,8 @@ image; don't forget to pad your data as necessary. 
The required dimensions of the supplied data can be computed for each component as - cinfo->comp_info[i].width_in_blocks*DCTSIZE samples per row - cinfo->comp_info[i].height_in_blocks*DCTSIZE rows in image + cinfo->comp_info[i].width_in_blocks*DCTSIZE samples per row + cinfo->comp_info[i].height_in_blocks*DCTSIZE rows in image after jpeg_start_compress() has initialized those fields. If the valid data is smaller than this, it must be padded appropriately. For some sampling factors and image sizes, additional dummy DCT blocks are inserted to make @@ -2585,12 +2585,12 @@ the image a multiple of the MCU dimensions. The library creates such dummy blocks itself; it does not read them from your supplied data. Therefore you need never pad by more than DCTSIZE samples. An example may help here. Assume 2h2v downsampling of YCbCr data, that is - cinfo->comp_info[0].h_samp_factor = 2 for Y - cinfo->comp_info[0].v_samp_factor = 2 - cinfo->comp_info[1].h_samp_factor = 1 for Cb - cinfo->comp_info[1].v_samp_factor = 1 - cinfo->comp_info[2].h_samp_factor = 1 for Cr - cinfo->comp_info[2].v_samp_factor = 1 + cinfo->comp_info[0].h_samp_factor = 2 for Y + cinfo->comp_info[0].v_samp_factor = 2 + cinfo->comp_info[1].h_samp_factor = 1 for Cb + cinfo->comp_info[1].v_samp_factor = 1 + cinfo->comp_info[2].h_samp_factor = 1 for Cr + cinfo->comp_info[2].v_samp_factor = 1 and suppose that the nominal image dimensions (cinfo->image_width and cinfo->image_height) are 101x101 pixels. Then jpeg_start_compress() will compute downsampled_width = 101 and width_in_blocks = 13 for Y, @@ -2761,18 +2761,18 @@ JPEG memory manager with lifetime JPOOL_PERMANENT will work nicely.) You can use the same callback routine for both compression and decompression. 
The jpeg_progress_mgr struct contains four fields which are set by the library: - long pass_counter; /* work units completed in this pass */ - long pass_limit; /* total number of work units in this pass */ - int completed_passes; /* passes completed so far */ - int total_passes; /* total number of passes expected */ + long pass_counter; /* work units completed in this pass */ + long pass_limit; /* total number of work units in this pass */ + int completed_passes; /* passes completed so far */ + int total_passes; /* total number of passes expected */ During any one pass, pass_counter increases from 0 up to (not including) pass_limit; the step size is usually but not necessarily 1. The pass_limit value may change from one pass to another. The expected total number of passes is in total_passes, and the number of passes already completed is in completed_passes. Thus the fraction of work completed may be estimated as - completed_passes + (pass_counter/pass_limit) - -------------------------------------------- - total_passes + completed_passes + (pass_counter/pass_limit) + -------------------------------------------- + total_passes ignoring the fact that the passes may not be equal amounts of work. When decompressing, pass_limit can even change within a pass, because it @@ -2945,7 +2945,7 @@ functions. To do this, undefine xxx_SUPPORTED symbols as necessary. You can also save a few K by not having text error messages in the library; the standard error message table occupies about 5Kb. This is particularly -reasonable for embedded applications where there's no good way to display +reasonable for embedded applications where there's no good way to display a message anyway. To do this, remove the creation of the message table (jpeg_std_message_table[]) from jerror.c, and alter format_message to do something reasonable without it. You could output the numeric value of the @@ -2969,10 +2969,10 @@ See install.txt for configuration procedures. 
The code is not dependent on the exact sizes of the C data types. As distributed, we make the assumptions that - char is at least 8 bits wide - short is at least 16 bits wide - int is at least 16 bits wide - long is at least 32 bits wide + char is at least 8 bits wide + short is at least 16 bits wide + int is at least 16 bits wide + long is at least 32 bits wide (These are the minimum requirements of the ANSI C standard.) Wider types will work fine, although memory may be used inefficiently if char is much larger than 8 bits or short is much bigger than 16 bits. The code should work diff --git a/rdbmp.c b/rdbmp.c index 7c2e04411..0b2351db8 100644 --- a/rdbmp.c +++ b/rdbmp.c @@ -24,7 +24,7 @@ * This code contributed by James Arthur Boucher. */ -#include "cdjpeg.h" /* Common decls for cjpeg/djpeg applications */ +#include "cdjpeg.h" /* Common decls for cjpeg/djpeg applications */ #ifdef BMP_SUPPORTED @@ -33,19 +33,19 @@ #ifdef HAVE_UNSIGNED_CHAR typedef unsigned char U_CHAR; -#define UCH(x) ((int) (x)) +#define UCH(x) ((int) (x)) #else /* !HAVE_UNSIGNED_CHAR */ #ifdef __CHAR_UNSIGNED__ typedef char U_CHAR; -#define UCH(x) ((int) (x)) +#define UCH(x) ((int) (x)) #else typedef char U_CHAR; -#define UCH(x) ((int) (x) & 0xFF) +#define UCH(x) ((int) (x) & 0xFF) #endif #endif /* HAVE_UNSIGNED_CHAR */ -#define ReadOK(file,buffer,len) (JFREAD(file,buffer,len) == ((size_t) (len))) +#define ReadOK(file,buffer,len) (JFREAD(file,buffer,len) == ((size_t) (len))) /* Private version of data source object */ @@ -55,15 +55,15 @@ typedef struct _bmp_source_struct * bmp_source_ptr; typedef struct _bmp_source_struct { struct cjpeg_source_struct pub; /* public fields */ - j_compress_ptr cinfo; /* back link saves passing separate parm */ + j_compress_ptr cinfo; /* back link saves passing separate parm */ - JSAMPARRAY colormap; /* BMP colormap (converted to my format) */ + JSAMPARRAY colormap; /* BMP colormap (converted to my format) */ - jvirt_sarray_ptr whole_image; /* Needed to reverse 
row order */ - JDIMENSION source_row; /* Current source row number */ - JDIMENSION row_width; /* Physical width of scanlines in file */ + jvirt_sarray_ptr whole_image; /* Needed to reverse row order */ + JDIMENSION source_row; /* Current source row number */ + JDIMENSION row_width; /* Physical width of scanlines in file */ - int bits_per_pixel; /* remembers 8- or 24-bit format */ + int bits_per_pixel; /* remembers 8- or 24-bit format */ } bmp_source_struct; @@ -140,7 +140,7 @@ get_8bit_row (j_compress_ptr cinfo, cjpeg_source_ptr sinfo) outptr = source->pub.buffer[0]; for (col = cinfo->image_width; col > 0; col--) { t = GETJSAMPLE(*inptr++); - *outptr++ = colormap[0][t]; /* can omit GETJSAMPLE() safely */ + *outptr++ = colormap[0][t]; /* can omit GETJSAMPLE() safely */ *outptr++ = colormap[1][t]; *outptr++ = colormap[2][t]; } @@ -170,7 +170,7 @@ get_24bit_row (j_compress_ptr cinfo, cjpeg_source_ptr sinfo) inptr = image_ptr[0]; outptr = source->pub.buffer[0]; for (col = cinfo->image_width; col > 0; col--) { - outptr[2] = *inptr++; /* can omit GETJSAMPLE() safely */ + outptr[2] = *inptr++; /* can omit GETJSAMPLE() safely */ outptr[1] = *inptr++; outptr[0] = *inptr++; outptr += 3; @@ -200,10 +200,10 @@ get_32bit_row (j_compress_ptr cinfo, cjpeg_source_ptr sinfo) inptr = image_ptr[0]; outptr = source->pub.buffer[0]; for (col = cinfo->image_width; col > 0; col--) { - outptr[2] = *inptr++; /* can omit GETJSAMPLE() safely */ + outptr[2] = *inptr++; /* can omit GETJSAMPLE() safely */ outptr[1] = *inptr++; outptr[0] = *inptr++; - inptr++; /* skip the 4th byte (Alpha channel) */ + inptr++; /* skip the 4th byte (Alpha channel) */ outptr += 3; } @@ -280,11 +280,11 @@ start_input_bmp (j_compress_ptr cinfo, cjpeg_source_ptr sinfo) U_CHAR bmpfileheader[14]; U_CHAR bmpinfoheader[64]; #define GET_2B(array,offset) ((unsigned int) UCH(array[offset]) + \ - (((unsigned int) UCH(array[offset+1])) << 8)) + (((unsigned int) UCH(array[offset+1])) << 8)) #define GET_4B(array,offset) ((INT32) 
UCH(array[offset]) + \ - (((INT32) UCH(array[offset+1])) << 8) + \ - (((INT32) UCH(array[offset+2])) << 16) + \ - (((INT32) UCH(array[offset+3])) << 24)) + (((INT32) UCH(array[offset+1])) << 8) + \ + (((INT32) UCH(array[offset+2])) << 16) + \ + (((INT32) UCH(array[offset+3])) << 24)) INT32 bfOffBits; INT32 headerSize; INT32 biWidth; @@ -293,7 +293,7 @@ start_input_bmp (j_compress_ptr cinfo, cjpeg_source_ptr sinfo) INT32 biCompression; INT32 biXPelsPerMeter,biYPelsPerMeter; INT32 biClrUsed = 0; - int mapentrysize = 0; /* 0 indicates no colormap */ + int mapentrysize = 0; /* 0 indicates no colormap */ INT32 bPad; JDIMENSION row_width; @@ -325,11 +325,11 @@ start_input_bmp (j_compress_ptr cinfo, cjpeg_source_ptr sinfo) source->bits_per_pixel = (int) GET_2B(bmpinfoheader,10); switch (source->bits_per_pixel) { - case 8: /* colormapped image */ - mapentrysize = 3; /* OS/2 uses RGBTRIPLE colormap */ + case 8: /* colormapped image */ + mapentrysize = 3; /* OS/2 uses RGBTRIPLE colormap */ TRACEMS2(cinfo, 1, JTRC_BMP_OS2_MAPPED, (int) biWidth, (int) biHeight); break; - case 24: /* RGB image */ + case 24: /* RGB image */ TRACEMS2(cinfo, 1, JTRC_BMP_OS2, (int) biWidth, (int) biHeight); break; default: @@ -352,14 +352,14 @@ start_input_bmp (j_compress_ptr cinfo, cjpeg_source_ptr sinfo) /* biSizeImage, biClrImportant fields are ignored */ switch (source->bits_per_pixel) { - case 8: /* colormapped image */ - mapentrysize = 4; /* Windows uses RGBQUAD colormap */ + case 8: /* colormapped image */ + mapentrysize = 4; /* Windows uses RGBQUAD colormap */ TRACEMS2(cinfo, 1, JTRC_BMP_MAPPED, (int) biWidth, (int) biHeight); break; - case 24: /* RGB image */ + case 24: /* RGB image */ TRACEMS2(cinfo, 1, JTRC_BMP, (int) biWidth, (int) biHeight); break; - case 32: /* RGB image + Alpha channel */ + case 32: /* RGB image + Alpha channel */ TRACEMS2(cinfo, 1, JTRC_BMP, (int) biWidth, (int) biHeight); break; default: @@ -373,7 +373,7 @@ start_input_bmp (j_compress_ptr cinfo, cjpeg_source_ptr 
sinfo) /* Set JFIF density parameters from the BMP data */ cinfo->X_density = (UINT16) (biXPelsPerMeter/100); /* 100 cm per meter */ cinfo->Y_density = (UINT16) (biYPelsPerMeter/100); - cinfo->density_unit = 2; /* dots/cm */ + cinfo->density_unit = 2; /* dots/cm */ } break; default: @@ -392,7 +392,7 @@ start_input_bmp (j_compress_ptr cinfo, cjpeg_source_ptr sinfo) /* Read the colormap, if any */ if (mapentrysize > 0) { if (biClrUsed <= 0) - biClrUsed = 256; /* assume it's 256 */ + biClrUsed = 256; /* assume it's 256 */ else if (biClrUsed > 256) ERREXIT(cinfo, JERR_BMP_BADCMAP); /* Allocate space to store the colormap */ @@ -406,7 +406,7 @@ start_input_bmp (j_compress_ptr cinfo, cjpeg_source_ptr sinfo) } /* Skip any remaining pad bytes */ - if (bPad < 0) /* incorrect bfOffBits value? */ + if (bPad < 0) /* incorrect bfOffBits value? */ ERREXIT(cinfo, JERR_BMP_BADHEADER); while (--bPad >= 0) { (void) read_byte(source); @@ -469,8 +469,8 @@ jinit_read_bmp (j_compress_ptr cinfo) /* Create module interface object */ source = (bmp_source_ptr) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(bmp_source_struct)); - source->cinfo = cinfo; /* make back link for subroutines */ + SIZEOF(bmp_source_struct)); + source->cinfo = cinfo; /* make back link for subroutines */ /* Fill in method ptrs, except get_pixel_rows which start_input sets */ source->pub.start_input = start_input_bmp; source->pub.finish_input = finish_input_bmp; diff --git a/rdcolmap.c b/rdcolmap.c index 42b343763..ac6f50ee1 100644 --- a/rdcolmap.c +++ b/rdcolmap.c @@ -21,9 +21,9 @@ * currently implemented. 
*/ -#include "cdjpeg.h" /* Common decls for cjpeg/djpeg applications */ +#include "cdjpeg.h" /* Common decls for cjpeg/djpeg applications */ -#ifdef QUANT_2PASS_SUPPORTED /* otherwise can't quantize to supplied map */ +#ifdef QUANT_2PASS_SUPPORTED /* otherwise can't quantize to supplied map */ /* Portions of this code are based on the PBMPLUS library, which is: ** @@ -54,9 +54,9 @@ add_map_entry (j_decompress_ptr cinfo, int R, int G, int B) /* Check for duplicate color. */ for (index = 0; index < ncolors; index++) { if (GETJSAMPLE(colormap0[index]) == R && - GETJSAMPLE(colormap1[index]) == G && - GETJSAMPLE(colormap2[index]) == B) - return; /* color is already in map */ + GETJSAMPLE(colormap1[index]) == G && + GETJSAMPLE(colormap2[index]) == B) + return; /* color is already in map */ } /* Check for map overflow. */ @@ -107,9 +107,9 @@ read_gif_map (j_decompress_ptr cinfo, FILE * infile) if (R == EOF || G == EOF || B == EOF) ERREXIT(cinfo, JERR_BAD_CMAP_FILE); add_map_entry(cinfo, - R << (BITS_IN_JSAMPLE-8), - G << (BITS_IN_JSAMPLE-8), - B << (BITS_IN_JSAMPLE-8)); + R << (BITS_IN_JSAMPLE-8), + G << (BITS_IN_JSAMPLE-8), + B << (BITS_IN_JSAMPLE-8)); } } @@ -123,7 +123,7 @@ pbm_getc (FILE * infile) /* A comment/newline sequence is returned as a newline */ { register int ch; - + ch = getc(infile); if (ch == '#') { do { @@ -143,17 +143,17 @@ read_pbm_integer (j_decompress_ptr cinfo, FILE * infile) { register int ch; register unsigned int val; - + /* Skip any leading whitespace */ do { ch = pbm_getc(infile); if (ch == EOF) ERREXIT(cinfo, JERR_BAD_CMAP_FILE); } while (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r'); - + if (ch < '0' || ch > '9') ERREXIT(cinfo, JERR_BAD_CMAP_FILE); - + val = ch - '0'; while ((ch = pbm_getc(infile)) >= '0' && ch <= '9') { val *= 10; @@ -175,7 +175,7 @@ read_ppm_map (j_decompress_ptr cinfo, FILE * infile) int R, G, B; /* Initial 'P' has already been read by read_color_map */ - c = getc(infile); /* save format discriminator for a sec */ + 
c = getc(infile); /* save format discriminator for a sec */ /* while we fetch the remaining header info */ w = read_pbm_integer(cinfo, infile); @@ -190,26 +190,26 @@ read_ppm_map (j_decompress_ptr cinfo, FILE * infile) ERREXIT(cinfo, JERR_BAD_CMAP_FILE); switch (c) { - case '3': /* it's a text-format PPM file */ + case '3': /* it's a text-format PPM file */ for (row = 0; row < h; row++) { for (col = 0; col < w; col++) { - R = read_pbm_integer(cinfo, infile); - G = read_pbm_integer(cinfo, infile); - B = read_pbm_integer(cinfo, infile); - add_map_entry(cinfo, R, G, B); + R = read_pbm_integer(cinfo, infile); + G = read_pbm_integer(cinfo, infile); + B = read_pbm_integer(cinfo, infile); + add_map_entry(cinfo, R, G, B); } } break; - case '6': /* it's a raw-format PPM file */ + case '6': /* it's a raw-format PPM file */ for (row = 0; row < h; row++) { for (col = 0; col < w; col++) { - R = getc(infile); - G = getc(infile); - B = getc(infile); - if (R == EOF || G == EOF || B == EOF) - ERREXIT(cinfo, JERR_BAD_CMAP_FILE); - add_map_entry(cinfo, R, G, B); + R = getc(infile); + G = getc(infile); + B = getc(infile); + if (R == EOF || G == EOF || B == EOF) + ERREXIT(cinfo, JERR_BAD_CMAP_FILE); + add_map_entry(cinfo, R, G, B); } } break; diff --git a/rdgif.c b/rdgif.c index b27c1675d..5caad8a08 100644 --- a/rdgif.c +++ b/rdgif.c @@ -19,7 +19,7 @@ * CompuServe Incorporated." */ -#include "cdjpeg.h" /* Common decls for cjpeg/djpeg applications */ +#include "cdjpeg.h" /* Common decls for cjpeg/djpeg applications */ #ifdef GIF_SUPPORTED @@ -32,7 +32,7 @@ jinit_read_gif (j_compress_ptr cinfo) { fprintf(stderr, "GIF input is unsupported for legal reasons. Sorry.\n"); exit(EXIT_FAILURE); - return NULL; /* keep compiler happy */ + return NULL; /* keep compiler happy */ } #endif /* GIF_SUPPORTED */ diff --git a/rdjpgcom.c b/rdjpgcom.c index 371915474..02ce90f2b 100644 --- a/rdjpgcom.c +++ b/rdjpgcom.c @@ -12,45 +12,45 @@ * JPEG markers. 
*/ -#define JPEG_CJPEG_DJPEG /* to get the command-line config symbols */ -#include "jinclude.h" /* get auto-config symbols, */ +#define JPEG_CJPEG_DJPEG /* to get the command-line config symbols */ +#include "jinclude.h" /* get auto-config symbols, */ #ifdef HAVE_LOCALE_H -#include /* Bill Allombert: use locale for isprint */ +#include /* Bill Allombert: use locale for isprint */ #endif -#include /* to declare isupper(), tolower() */ +#include /* to declare isupper(), tolower() */ #ifdef USE_SETMODE -#include /* to declare setmode()'s parameter macros */ +#include /* to declare setmode()'s parameter macros */ /* If you have setmode() but not , just delete this line: */ -#include /* to declare setmode() */ +#include /* to declare setmode() */ #endif -#ifdef USE_CCOMMAND /* command-line reader for Macintosh */ +#ifdef USE_CCOMMAND /* command-line reader for Macintosh */ #ifdef __MWERKS__ #include /* Metrowerks needs this */ -#include /* ... and this */ +#include /* ... and this */ #endif #ifdef THINK_C -#include /* Think declares it here */ +#include /* Think declares it here */ #endif #endif -#ifdef DONT_USE_B_MODE /* define mode parameters for fopen() */ -#define READ_BINARY "r" +#ifdef DONT_USE_B_MODE /* define mode parameters for fopen() */ +#define READ_BINARY "r" #else -#ifdef VMS /* VMS is very nonstandard */ -#define READ_BINARY "rb", "ctx=stm" -#else /* standard ANSI-compliant case */ -#define READ_BINARY "rb" +#ifdef VMS /* VMS is very nonstandard */ +#define READ_BINARY "rb", "ctx=stm" +#else /* standard ANSI-compliant case */ +#define READ_BINARY "rb" #endif #endif -#ifndef EXIT_FAILURE /* define exit() codes if not provided */ +#ifndef EXIT_FAILURE /* define exit() codes if not provided */ #define EXIT_FAILURE 1 #endif #ifndef EXIT_SUCCESS #ifdef VMS -#define EXIT_SUCCESS 1 /* VMS is very nonstandard */ +#define EXIT_SUCCESS 1 /* VMS is very nonstandard */ #else #define EXIT_SUCCESS 0 #endif @@ -62,7 +62,7 @@ * To reuse this code in another application, 
you might need to change these. */ -static FILE * infile; /* input JPEG file */ +static FILE * infile; /* input JPEG file */ /* Return next input byte, or EOF if no more */ #define NEXTBYTE() getc(infile) @@ -107,11 +107,11 @@ read_2_bytes (void) * in this program. (See jdmarker.c for a more complete list.) */ -#define M_SOF0 0xC0 /* Start Of Frame N */ -#define M_SOF1 0xC1 /* N indicates which compression process */ -#define M_SOF2 0xC2 /* Only SOF0-SOF2 are now in common use */ +#define M_SOF0 0xC0 /* Start Of Frame N */ +#define M_SOF1 0xC1 /* N indicates which compression process */ +#define M_SOF2 0xC2 /* Only SOF0-SOF2 are now in common use */ #define M_SOF3 0xC3 -#define M_SOF5 0xC5 /* NB: codes C4 and CC are NOT SOF markers */ +#define M_SOF5 0xC5 /* NB: codes C4 and CC are NOT SOF markers */ #define M_SOF6 0xC6 #define M_SOF7 0xC7 #define M_SOF9 0xC9 @@ -120,12 +120,12 @@ read_2_bytes (void) #define M_SOF13 0xCD #define M_SOF14 0xCE #define M_SOF15 0xCF -#define M_SOI 0xD8 /* Start Of Image (beginning of datastream) */ -#define M_EOI 0xD9 /* End Of Image (end of datastream) */ -#define M_SOS 0xDA /* Start Of Scan (begins compressed data) */ -#define M_APP0 0xE0 /* Application-specific marker, type N */ -#define M_APP12 0xEC /* (we don't bother to list all 16 APPn's) */ -#define M_COM 0xFE /* COMment */ +#define M_SOI 0xD8 /* Start Of Image (beginning of datastream) */ +#define M_EOI 0xD9 /* End Of Image (end of datastream) */ +#define M_SOS 0xDA /* Start Of Scan (begins compressed data) */ +#define M_APP0 0xE0 /* Application-specific marker, type N */ +#define M_APP12 0xEC /* (we don't bother to list all 16 APPn's) */ +#define M_COM 0xFE /* COMment */ /* @@ -253,7 +253,7 @@ process_COM (int raw) printf("\n"); } else if (ch == '\n') { if (lastch != '\r') - printf("\n"); + printf("\n"); } else if (ch == '\\') { printf("\\\\"); } else if (isprint(ch)) { @@ -287,7 +287,7 @@ process_SOFn (int marker) const char * process; int ci; - length = read_2_bytes(); /* 
usual parameter length count */ + length = read_2_bytes(); /* usual parameter length count */ data_precision = read_1_byte(); image_height = read_2_bytes(); @@ -295,33 +295,33 @@ process_SOFn (int marker) num_components = read_1_byte(); switch (marker) { - case M_SOF0: process = "Baseline"; break; - case M_SOF1: process = "Extended sequential"; break; - case M_SOF2: process = "Progressive"; break; - case M_SOF3: process = "Lossless"; break; - case M_SOF5: process = "Differential sequential"; break; - case M_SOF6: process = "Differential progressive"; break; - case M_SOF7: process = "Differential lossless"; break; - case M_SOF9: process = "Extended sequential, arithmetic coding"; break; - case M_SOF10: process = "Progressive, arithmetic coding"; break; - case M_SOF11: process = "Lossless, arithmetic coding"; break; - case M_SOF13: process = "Differential sequential, arithmetic coding"; break; - case M_SOF14: process = "Differential progressive, arithmetic coding"; break; - case M_SOF15: process = "Differential lossless, arithmetic coding"; break; - default: process = "Unknown"; break; + case M_SOF0: process = "Baseline"; break; + case M_SOF1: process = "Extended sequential"; break; + case M_SOF2: process = "Progressive"; break; + case M_SOF3: process = "Lossless"; break; + case M_SOF5: process = "Differential sequential"; break; + case M_SOF6: process = "Differential progressive"; break; + case M_SOF7: process = "Differential lossless"; break; + case M_SOF9: process = "Extended sequential, arithmetic coding"; break; + case M_SOF10: process = "Progressive, arithmetic coding"; break; + case M_SOF11: process = "Lossless, arithmetic coding"; break; + case M_SOF13: process = "Differential sequential, arithmetic coding"; break; + case M_SOF14: process = "Differential progressive, arithmetic coding"; break; + case M_SOF15: process = "Differential lossless, arithmetic coding"; break; + default: process = "Unknown"; break; } printf("JPEG image is %uw * %uh, %d color 
components, %d bits per sample\n", - image_width, image_height, num_components, data_precision); + image_width, image_height, num_components, data_precision); printf("JPEG process: %s\n", process); if (length != (unsigned int) (8 + num_components * 3)) ERREXIT("Bogus SOF marker length"); for (ci = 0; ci < num_components; ci++) { - (void) read_1_byte(); /* Component ID code */ - (void) read_1_byte(); /* H, V sampling factors */ - (void) read_1_byte(); /* Quantization table number */ + (void) read_1_byte(); /* Component ID code */ + (void) read_1_byte(); /* H, V sampling factors */ + (void) read_1_byte(); /* Quantization table number */ } } @@ -352,29 +352,29 @@ scan_JPEG_header (int verbose, int raw) /* Note that marker codes 0xC4, 0xC8, 0xCC are not, and must not be, * treated as SOFn. C4 in particular is actually DHT. */ - case M_SOF0: /* Baseline */ - case M_SOF1: /* Extended sequential, Huffman */ - case M_SOF2: /* Progressive, Huffman */ - case M_SOF3: /* Lossless, Huffman */ - case M_SOF5: /* Differential sequential, Huffman */ - case M_SOF6: /* Differential progressive, Huffman */ - case M_SOF7: /* Differential lossless, Huffman */ - case M_SOF9: /* Extended sequential, arithmetic */ - case M_SOF10: /* Progressive, arithmetic */ - case M_SOF11: /* Lossless, arithmetic */ - case M_SOF13: /* Differential sequential, arithmetic */ - case M_SOF14: /* Differential progressive, arithmetic */ - case M_SOF15: /* Differential lossless, arithmetic */ + case M_SOF0: /* Baseline */ + case M_SOF1: /* Extended sequential, Huffman */ + case M_SOF2: /* Progressive, Huffman */ + case M_SOF3: /* Lossless, Huffman */ + case M_SOF5: /* Differential sequential, Huffman */ + case M_SOF6: /* Differential progressive, Huffman */ + case M_SOF7: /* Differential lossless, Huffman */ + case M_SOF9: /* Extended sequential, arithmetic */ + case M_SOF10: /* Progressive, arithmetic */ + case M_SOF11: /* Lossless, arithmetic */ + case M_SOF13: /* Differential sequential, arithmetic */ + case 
M_SOF14: /* Differential progressive, arithmetic */ + case M_SOF15: /* Differential lossless, arithmetic */ if (verbose) - process_SOFn(marker); + process_SOFn(marker); else - skip_variable(); + skip_variable(); break; - case M_SOS: /* stop before hitting compressed data */ + case M_SOS: /* stop before hitting compressed data */ return marker; - case M_EOI: /* in case it's a tables-only JPEG stream */ + case M_EOI: /* in case it's a tables-only JPEG stream */ return marker; case M_COM: @@ -386,14 +386,14 @@ scan_JPEG_header (int verbose, int raw) * APP12 markers, so we print those out too when in -verbose mode. */ if (verbose) { - printf("APP12 contains:\n"); - process_COM(raw); + printf("APP12 contains:\n"); + process_COM(raw); } else - skip_variable(); + skip_variable(); break; - default: /* Anything else just gets skipped */ - skip_variable(); /* we assume it has a parameter count... */ + default: /* Anything else just gets skipped */ + skip_variable(); /* we assume it has a parameter count... 
*/ break; } } /* end loop */ @@ -402,7 +402,7 @@ scan_JPEG_header (int verbose, int raw) /* Command line parsing code */ -static const char * progname; /* program name for error messages */ +static const char * progname; /* program name for error messages */ static void @@ -432,17 +432,17 @@ keymatch (char * arg, const char * keyword, int minchars) while ((ca = *arg++) != '\0') { if ((ck = *keyword++) == '\0') - return 0; /* arg longer than keyword, no good */ - if (isupper(ca)) /* force arg to lcase (assume ck is already) */ + return 0; /* arg longer than keyword, no good */ + if (isupper(ca)) /* force arg to lcase (assume ck is already) */ ca = tolower(ca); if (ca != ck) - return 0; /* no good */ - nmatched++; /* count matched characters */ + return 0; /* no good */ + nmatched++; /* count matched characters */ } /* reached end of argument; fail if it's too short for unique abbrev */ if (nmatched < minchars) return 0; - return 1; /* A-OK */ + return 1; /* A-OK */ } @@ -464,14 +464,14 @@ main (int argc, char **argv) progname = argv[0]; if (progname == NULL || progname[0] == 0) - progname = "rdjpgcom"; /* in case C library doesn't provide it */ + progname = "rdjpgcom"; /* in case C library doesn't provide it */ /* Parse switches, if any */ for (argn = 1; argn < argc; argn++) { arg = argv[argn]; if (arg[0] != '-') - break; /* not switch, must be file name */ - arg++; /* advance over '-' */ + break; /* not switch, must be file name */ + arg++; /* advance over '-' */ if (keymatch(arg, "verbose", 1)) { verbose++; } else if (keymatch(arg, "raw", 1)) { @@ -493,10 +493,10 @@ main (int argc, char **argv) } } else { /* default input file is stdin */ -#ifdef USE_SETMODE /* need to hack file mode? */ +#ifdef USE_SETMODE /* need to hack file mode? */ setmode(fileno(stdin), O_BINARY); #endif -#ifdef USE_FDOPEN /* need to re-open in binary mode? */ +#ifdef USE_FDOPEN /* need to re-open in binary mode? 
*/ if ((infile = fdopen(fileno(stdin), READ_BINARY)) == NULL) { fprintf(stderr, "%s: can't open stdin\n", progname); exit(EXIT_FAILURE); @@ -511,5 +511,5 @@ main (int argc, char **argv) /* All done. */ exit(EXIT_SUCCESS); - return 0; /* suppress no-return-value warnings */ + return 0; /* suppress no-return-value warnings */ } diff --git a/rdppm.c b/rdppm.c index 0f3994d29..c55ab2b64 100644 --- a/rdppm.c +++ b/rdppm.c @@ -19,7 +19,7 @@ * the file is indeed PPM format). */ -#include "cdjpeg.h" /* Common decls for cjpeg/djpeg applications */ +#include "cdjpeg.h" /* Common decls for cjpeg/djpeg applications */ #ifdef PPM_SUPPORTED @@ -41,19 +41,19 @@ #ifdef HAVE_UNSIGNED_CHAR typedef unsigned char U_CHAR; -#define UCH(x) ((int) (x)) +#define UCH(x) ((int) (x)) #else /* !HAVE_UNSIGNED_CHAR */ #ifdef __CHAR_UNSIGNED__ typedef char U_CHAR; -#define UCH(x) ((int) (x)) +#define UCH(x) ((int) (x)) #else typedef char U_CHAR; -#define UCH(x) ((int) (x) & 0xFF) +#define UCH(x) ((int) (x) & 0xFF) #endif #endif /* HAVE_UNSIGNED_CHAR */ -#define ReadOK(file,buffer,len) (JFREAD(file,buffer,len) == ((size_t) (len))) +#define ReadOK(file,buffer,len) (JFREAD(file,buffer,len) == ((size_t) (len))) /* @@ -72,10 +72,10 @@ typedef char U_CHAR; typedef struct { struct cjpeg_source_struct pub; /* public fields */ - U_CHAR *iobuffer; /* non-FAR pointer to I/O buffer */ - JSAMPROW pixrow; /* FAR pointer to same */ - size_t buffer_width; /* width of I/O buffer */ - JSAMPLE *rescale; /* => maxval-remapping array, or NULL */ + U_CHAR *iobuffer; /* non-FAR pointer to I/O buffer */ + JSAMPROW pixrow; /* FAR pointer to same */ + size_t buffer_width; /* width of I/O buffer */ + JSAMPLE *rescale; /* => maxval-remapping array, or NULL */ } ppm_source_struct; typedef ppm_source_struct * ppm_source_ptr; @@ -308,10 +308,10 @@ start_input_ppm (j_compress_ptr cinfo, cjpeg_source_ptr sinfo) /* detect unsupported variants (ie, PBM) before trying to read header */ switch (c) { - case '2': /* it's a text-format 
PGM file */ - case '3': /* it's a text-format PPM file */ - case '5': /* it's a raw-format PGM file */ - case '6': /* it's a raw-format PPM file */ + case '2': /* it's a text-format PGM file */ + case '3': /* it's a text-format PPM file */ + case '5': /* it's a raw-format PGM file */ + case '6': /* it's a raw-format PPM file */ break; default: ERREXIT(cinfo, JERR_PPM_NOT); @@ -331,12 +331,12 @@ start_input_ppm (j_compress_ptr cinfo, cjpeg_source_ptr sinfo) cinfo->image_height = (JDIMENSION) h; /* initialize flags to most common settings */ - need_iobuffer = TRUE; /* do we need an I/O buffer? */ - use_raw_buffer = FALSE; /* do we map input buffer onto I/O buffer? */ - need_rescale = TRUE; /* do we need a rescale array? */ + need_iobuffer = TRUE; /* do we need an I/O buffer? */ + use_raw_buffer = FALSE; /* do we map input buffer onto I/O buffer? */ + need_rescale = TRUE; /* do we need a rescale array? */ switch (c) { - case '2': /* it's a text-format PGM file */ + case '2': /* it's a text-format PGM file */ cinfo->input_components = 1; cinfo->in_color_space = JCS_GRAYSCALE; TRACEMS2(cinfo, 1, JTRC_PGM_TEXT, w, h); @@ -344,7 +344,7 @@ start_input_ppm (j_compress_ptr cinfo, cjpeg_source_ptr sinfo) need_iobuffer = FALSE; break; - case '3': /* it's a text-format PPM file */ + case '3': /* it's a text-format PPM file */ cinfo->input_components = 3; cinfo->in_color_space = JCS_RGB; TRACEMS2(cinfo, 1, JTRC_PPM_TEXT, w, h); @@ -352,7 +352,7 @@ start_input_ppm (j_compress_ptr cinfo, cjpeg_source_ptr sinfo) need_iobuffer = FALSE; break; - case '5': /* it's a raw-format PGM file */ + case '5': /* it's a raw-format PGM file */ cinfo->input_components = 1; cinfo->in_color_space = JCS_GRAYSCALE; TRACEMS2(cinfo, 1, JTRC_PGM, w, h); @@ -367,7 +367,7 @@ start_input_ppm (j_compress_ptr cinfo, cjpeg_source_ptr sinfo) } break; - case '6': /* it's a raw-format PPM file */ + case '6': /* it's a raw-format PPM file */ cinfo->input_components = 3; cinfo->in_color_space = JCS_RGB; 
TRACEMS2(cinfo, 1, JTRC_PPM, w, h); @@ -389,7 +389,7 @@ start_input_ppm (j_compress_ptr cinfo, cjpeg_source_ptr sinfo) ((maxval<=255) ? SIZEOF(U_CHAR) : (2*SIZEOF(U_CHAR))); source->iobuffer = (U_CHAR *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - source->buffer_width); + source->buffer_width); } /* Create compressor input buffer. */ @@ -415,7 +415,7 @@ start_input_ppm (j_compress_ptr cinfo, cjpeg_source_ptr sinfo) /* On 16-bit-int machines we have to be careful of maxval = 65535 */ source->rescale = (JSAMPLE *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - (size_t) (((long) maxval + 1L) * SIZEOF(JSAMPLE))); + (size_t) (((long) maxval + 1L) * SIZEOF(JSAMPLE))); half_maxval = maxval / 2; for (val = 0; val <= (INT32) maxval; val++) { /* The multiplication here must be done in 32 bits to avoid overflow */ @@ -448,7 +448,7 @@ jinit_read_ppm (j_compress_ptr cinfo) /* Create module interface object */ source = (ppm_source_ptr) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(ppm_source_struct)); + SIZEOF(ppm_source_struct)); /* Fill in method ptrs, except get_pixel_rows which start_input sets */ source->pub.start_input = start_input_ppm; source->pub.finish_input = finish_input_ppm; diff --git a/rdrle.c b/rdrle.c index 542bc3749..f8b3587fb 100644 --- a/rdrle.c +++ b/rdrle.c @@ -19,7 +19,7 @@ * with updates from Robert Hutchinson. 
*/ -#include "cdjpeg.h" /* Common decls for cjpeg/djpeg applications */ +#include "cdjpeg.h" /* Common decls for cjpeg/djpeg applications */ #ifdef RLE_SUPPORTED @@ -38,7 +38,7 @@ /* * We support the following types of RLE files: - * + * * GRAYSCALE - 8 bits, no colormap * MAPPEDGRAY - 8 bits, 1 channel colomap * PSEUDOCOLOR - 8 bits, 3 channel colormap @@ -66,7 +66,7 @@ typedef struct _rle_source_struct { rle_kind visual; /* actual type of input file */ jvirt_sarray_ptr image; /* virtual array to hold the image */ - JDIMENSION row; /* current row # in the virtual array */ + JDIMENSION row; /* current row # in the virtual array */ rle_hdr header; /* Input file information */ rle_pixel** rle_row; /* holds a row returned by rle_getrow() */ @@ -111,10 +111,10 @@ start_input_rle (j_compress_ptr cinfo, cjpeg_source_ptr sinfo) } /* Figure out what we have, set private vars and return values accordingly */ - + width = source->header.xmax - source->header.xmin + 1; height = source->header.ymax - source->header.ymin + 1; - source->header.xmin = 0; /* realign horizontally */ + source->header.xmin = 0; /* realign horizontally */ source->header.xmax = width-1; cinfo->image_width = width; @@ -131,17 +131,17 @@ start_input_rle (j_compress_ptr cinfo, cjpeg_source_ptr sinfo) } else if (source->header.ncolors == 1 && source->header.ncmap == 3) { source->visual = PSEUDOCOLOR; TRACEMS3(cinfo, 1, JTRC_RLE_MAPPED, width, height, - 1 << source->header.cmaplen); + 1 << source->header.cmaplen); } else if (source->header.ncolors == 3 && source->header.ncmap == 3) { source->visual = TRUECOLOR; TRACEMS3(cinfo, 1, JTRC_RLE_FULLMAP, width, height, - 1 << source->header.cmaplen); + 1 << source->header.cmaplen); } else if (source->header.ncolors == 3 && source->header.ncmap == 0) { source->visual = DIRECTCOLOR; TRACEMS2(cinfo, 1, JTRC_RLE, width, height); } else ERREXIT(cinfo, JERR_RLE_UNSUPPORTED); - + if (source->visual == GRAYSCALE || source->visual == MAPPEDGRAY) { cinfo->in_color_space = 
JCS_GRAYSCALE; cinfo->input_components = 1; @@ -348,7 +348,7 @@ load_image (j_compress_ptr cinfo, cjpeg_source_ptr sinfo) source->row = cinfo->image_height; /* And fetch the topmost (bottommost) row */ - return (*source->pub.get_pixel_rows) (cinfo, sinfo); + return (*source->pub.get_pixel_rows) (cinfo, sinfo); } diff --git a/rdswitch.c b/rdswitch.c index fc0727a55..654997403 100644 --- a/rdswitch.c +++ b/rdswitch.c @@ -9,15 +9,15 @@ * * This file contains routines to process some of cjpeg's more complicated * command-line switches. Switches processed here are: - * -qtables file Read quantization tables from text file - * -scans file Read scan script from text file - * -quality N[,N,...] Set quality ratings - * -qslots N[,N,...] Set component quantization table selectors - * -sample HxV[,HxV,...] Set component sampling factors + * -qtables file Read quantization tables from text file + * -scans file Read scan script from text file + * -quality N[,N,...] Set quality ratings + * -qslots N[,N,...] Set component quantization table selectors + * -sample HxV[,HxV,...] Set component sampling factors */ -#include "cdjpeg.h" /* Common decls for cjpeg/djpeg applications */ -#include /* to declare isdigit(), isspace() */ +#include "cdjpeg.h" /* Common decls for cjpeg/djpeg applications */ +#include /* to declare isdigit(), isspace() */ LOCAL(int) @@ -26,7 +26,7 @@ text_getc (FILE * file) /* A comment/newline sequence is returned as a newline */ { register int ch; - + ch = getc(file); if (ch == '#') { do { @@ -44,7 +44,7 @@ read_text_integer (FILE * file, long * result, int * termchar) { register int ch; register long val; - + /* Skip any leading whitespace, detect EOF */ do { ch = text_getc(file); @@ -53,7 +53,7 @@ read_text_integer (FILE * file, long * result, int * termchar) return FALSE; } } while (isspace(ch)); - + if (! 
isdigit(ch)) { *termchar = ch; return FALSE; @@ -108,15 +108,15 @@ read_quant_tables (j_compress_ptr cinfo, char * filename, boolean force_baseline table[0] = (unsigned int) val; for (i = 1; i < DCTSIZE2; i++) { if (! read_text_integer(fp, &val, &termchar)) { - fprintf(stderr, "Invalid table data in file %s\n", filename); - fclose(fp); - return FALSE; + fprintf(stderr, "Invalid table data in file %s\n", filename); + fclose(fp); + return FALSE; } table[i] = (unsigned int) val; } #if JPEG_LIB_VERSION >= 70 jpeg_add_quant_table(cinfo, tblno, table, cinfo->q_scale_factor[tblno], - force_baseline); + force_baseline); #else jpeg_add_quant_table(cinfo, tblno, table, q_scale_factor[tblno], force_baseline); @@ -150,7 +150,7 @@ read_scan_integer (FILE * file, long * result, int * termchar) ch = *termchar; while (ch != EOF && isspace(ch)) ch = text_getc(file); - if (isdigit(ch)) { /* oops, put it back */ + if (isdigit(ch)) { /* oops, put it back */ if (ungetc(ch, file) == EOF) return FALSE; ch = ' '; @@ -188,7 +188,7 @@ read_scan_script (j_compress_ptr cinfo, char * filename) int scanno, ncomps, termchar; long val; jpeg_scan_info * scanptr; -#define MAX_SCANS 100 /* quite arbitrary limit */ +#define MAX_SCANS 100 /* quite arbitrary limit */ jpeg_scan_info scans[MAX_SCANS]; if ((fp = fopen(filename, "r")) == NULL) { @@ -208,29 +208,29 @@ read_scan_script (j_compress_ptr cinfo, char * filename) ncomps = 1; while (termchar == ' ') { if (ncomps >= MAX_COMPS_IN_SCAN) { - fprintf(stderr, "Too many components in one scan in file %s\n", - filename); - fclose(fp); - return FALSE; + fprintf(stderr, "Too many components in one scan in file %s\n", + filename); + fclose(fp); + return FALSE; } if (! read_scan_integer(fp, &val, &termchar)) - goto bogus; + goto bogus; scanptr->component_index[ncomps] = (int) val; ncomps++; } scanptr->comps_in_scan = ncomps; if (termchar == ':') { if (! 
read_scan_integer(fp, &val, &termchar) || termchar != ' ') - goto bogus; + goto bogus; scanptr->Ss = (int) val; if (! read_scan_integer(fp, &val, &termchar) || termchar != ' ') - goto bogus; + goto bogus; scanptr->Se = (int) val; if (! read_scan_integer(fp, &val, &termchar) || termchar != ' ') - goto bogus; + goto bogus; scanptr->Ah = (int) val; if (! read_scan_integer(fp, &val, &termchar)) - goto bogus; + goto bogus; scanptr->Al = (int) val; } else { /* set non-progressive parameters */ @@ -261,7 +261,7 @@ read_scan_script (j_compress_ptr cinfo, char * filename) */ scanptr = (jpeg_scan_info *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - scanno * SIZEOF(jpeg_scan_info)); + scanno * SIZEOF(jpeg_scan_info)); MEMCOPY(scanptr, scans, scanno * SIZEOF(jpeg_scan_info)); cinfo->scan_info = scanptr; cinfo->num_scans = scanno; @@ -305,9 +305,9 @@ LOCAL(void) jpeg_default_qtables (j_compress_ptr cinfo, boolean force_baseline) { jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl, - q_scale_factor[0], force_baseline); + q_scale_factor[0], force_baseline); jpeg_add_quant_table(cinfo, 1, std_chrominance_quant_tbl, - q_scale_factor[1], force_baseline); + q_scale_factor[1], force_baseline); } #endif @@ -319,17 +319,17 @@ set_quality_ratings (j_compress_ptr cinfo, char *arg, boolean force_baseline) * If there are more q-table slots than parameters, the last value is replicated. 
*/ { - int val = 75; /* default value */ + int val = 75; /* default value */ int tblno; char ch; for (tblno = 0; tblno < NUM_QUANT_TBLS; tblno++) { if (*arg) { - ch = ','; /* if not set by sscanf, will be ',' */ + ch = ','; /* if not set by sscanf, will be ',' */ if (sscanf(arg, "%d%c", &val, &ch) < 1) - return FALSE; - if (ch != ',') /* syntax check */ - return FALSE; + return FALSE; + if (ch != ',') /* syntax check */ + return FALSE; /* Convert user 0-100 rating to percentage scaling */ #if JPEG_LIB_VERSION >= 70 cinfo->q_scale_factor[tblno] = jpeg_quality_scaling(val); @@ -337,7 +337,7 @@ set_quality_ratings (j_compress_ptr cinfo, char *arg, boolean force_baseline) q_scale_factor[tblno] = jpeg_quality_scaling(val); #endif while (*arg && *arg++ != ',') /* advance to next segment of arg string */ - ; + ; } else { /* reached end of parameter, set remaining factors to last value */ #if JPEG_LIB_VERSION >= 70 @@ -359,25 +359,25 @@ set_quant_slots (j_compress_ptr cinfo, char *arg) * If there are more components than parameters, the last value is replicated. 
*/ { - int val = 0; /* default table # */ + int val = 0; /* default table # */ int ci; char ch; for (ci = 0; ci < MAX_COMPONENTS; ci++) { if (*arg) { - ch = ','; /* if not set by sscanf, will be ',' */ + ch = ','; /* if not set by sscanf, will be ',' */ if (sscanf(arg, "%d%c", &val, &ch) < 1) - return FALSE; - if (ch != ',') /* syntax check */ - return FALSE; + return FALSE; + if (ch != ',') /* syntax check */ + return FALSE; if (val < 0 || val >= NUM_QUANT_TBLS) { - fprintf(stderr, "JPEG quantization tables are numbered 0..%d\n", - NUM_QUANT_TBLS-1); - return FALSE; + fprintf(stderr, "JPEG quantization tables are numbered 0..%d\n", + NUM_QUANT_TBLS-1); + return FALSE; } cinfo->comp_info[ci].quant_tbl_no = val; while (*arg && *arg++ != ',') /* advance to next segment of arg string */ - ; + ; } else { /* reached end of parameter, set remaining components to last table */ cinfo->comp_info[ci].quant_tbl_no = val; @@ -399,19 +399,19 @@ set_sample_factors (j_compress_ptr cinfo, char *arg) for (ci = 0; ci < MAX_COMPONENTS; ci++) { if (*arg) { - ch2 = ','; /* if not set by sscanf, will be ',' */ + ch2 = ','; /* if not set by sscanf, will be ',' */ if (sscanf(arg, "%d%c%d%c", &val1, &ch1, &val2, &ch2) < 3) - return FALSE; + return FALSE; if ((ch1 != 'x' && ch1 != 'X') || ch2 != ',') /* syntax check */ - return FALSE; + return FALSE; if (val1 <= 0 || val1 > 4 || val2 <= 0 || val2 > 4) { - fprintf(stderr, "JPEG sampling factors must be 1..4\n"); - return FALSE; + fprintf(stderr, "JPEG sampling factors must be 1..4\n"); + return FALSE; } cinfo->comp_info[ci].h_samp_factor = val1; cinfo->comp_info[ci].v_samp_factor = val2; while (*arg && *arg++ != ',') /* advance to next segment of arg string */ - ; + ; } else { /* reached end of parameter, set remaining components to 1x1 sampling */ cinfo->comp_info[ci].h_samp_factor = 1; diff --git a/rdtarga.c b/rdtarga.c index 0ad4642e5..e8bbaf647 100644 --- a/rdtarga.c +++ b/rdtarga.c @@ -17,7 +17,7 @@ * Based on code contributed by Lee 
Daniel Crocker. */ -#include "cdjpeg.h" /* Common decls for cjpeg/djpeg applications */ +#include "cdjpeg.h" /* Common decls for cjpeg/djpeg applications */ #ifdef TARGA_SUPPORTED @@ -26,19 +26,19 @@ #ifdef HAVE_UNSIGNED_CHAR typedef unsigned char U_CHAR; -#define UCH(x) ((int) (x)) +#define UCH(x) ((int) (x)) #else /* !HAVE_UNSIGNED_CHAR */ #ifdef __CHAR_UNSIGNED__ typedef char U_CHAR; -#define UCH(x) ((int) (x)) +#define UCH(x) ((int) (x)) #else typedef char U_CHAR; -#define UCH(x) ((int) (x) & 0xFF) +#define UCH(x) ((int) (x) & 0xFF) #endif #endif /* HAVE_UNSIGNED_CHAR */ -#define ReadOK(file,buffer,len) (JFREAD(file,buffer,len) == ((size_t) (len))) +#define ReadOK(file,buffer,len) (JFREAD(file,buffer,len) == ((size_t) (len))) /* Private version of data source object */ @@ -48,12 +48,12 @@ typedef struct _tga_source_struct * tga_source_ptr; typedef struct _tga_source_struct { struct cjpeg_source_struct pub; /* public fields */ - j_compress_ptr cinfo; /* back link saves passing separate parm */ + j_compress_ptr cinfo; /* back link saves passing separate parm */ - JSAMPARRAY colormap; /* Targa colormap (converted to my format) */ + JSAMPARRAY colormap; /* Targa colormap (converted to my format) */ - jvirt_sarray_ptr whole_image; /* Needed if funny input row order */ - JDIMENSION current_row; /* Current logical row number to read */ + jvirt_sarray_ptr whole_image; /* Needed if funny input row order */ + JDIMENSION current_row; /* Current logical row number to read */ /* Pointer to routine to extract next Targa pixel from input file */ JMETHOD(void, read_pixel, (tga_source_ptr sinfo)); @@ -61,15 +61,15 @@ typedef struct _tga_source_struct { /* Result of read_pixel is delivered here: */ U_CHAR tga_pixel[4]; - int pixel_size; /* Bytes per Targa pixel (1 to 4) */ + int pixel_size; /* Bytes per Targa pixel (1 to 4) */ /* State info for reading RLE-coded pixels; both counts must be init to 0 */ - int block_count; /* # of pixels remaining in RLE block */ - int 
dup_pixel_count; /* # of times to duplicate previous pixel */ + int block_count; /* # of pixels remaining in RLE block */ + int dup_pixel_count; /* # of times to duplicate previous pixel */ /* This saves the correct pixel-row-expansion method for preload_image */ JMETHOD(JDIMENSION, get_pixel_rows, (j_compress_ptr cinfo, - cjpeg_source_ptr sinfo)); + cjpeg_source_ptr sinfo)); } tga_source_struct; @@ -148,9 +148,9 @@ read_rle_pixel (tga_source_ptr sinfo) /* Time to read RLE block header? */ if (--sinfo->block_count < 0) { /* decrement pixels remaining in block */ i = read_byte(sinfo); - if (i & 0x80) { /* Start of duplicate-pixel block? */ + if (i & 0x80) { /* Start of duplicate-pixel block? */ sinfo->dup_pixel_count = i & 0x7F; /* number of dups after this one */ - sinfo->block_count = 0; /* then read new block header */ + sinfo->block_count = 0; /* then read new block header */ } else { sinfo->block_count = i & 0x7F; /* number of pixels after this one */ } @@ -338,8 +338,8 @@ start_input_tga (j_compress_ptr cinfo, cjpeg_source_ptr sinfo) unsigned int width, height, maplen; boolean is_bottom_up; -#define GET_2B(offset) ((unsigned int) UCH(targaheader[offset]) + \ - (((unsigned int) UCH(targaheader[offset+1])) << 8)) +#define GET_2B(offset) ((unsigned int) UCH(targaheader[offset]) + \ + (((unsigned int) UCH(targaheader[offset+1])) << 8)) if (! 
ReadOK(source->pub.input_file, targaheader, 18)) ERREXIT(cinfo, JERR_INPUT_EOF); @@ -355,15 +355,15 @@ start_input_tga (j_compress_ptr cinfo, cjpeg_source_ptr sinfo) width = GET_2B(12); height = GET_2B(14); source->pixel_size = UCH(targaheader[16]) >> 3; - flags = UCH(targaheader[17]); /* Image Descriptor byte */ + flags = UCH(targaheader[17]); /* Image Descriptor byte */ - is_bottom_up = ((flags & 0x20) == 0); /* bit 5 set => top-down */ - interlace_type = flags >> 6; /* bits 6/7 are interlace code */ + is_bottom_up = ((flags & 0x20) == 0); /* bit 5 set => top-down */ + interlace_type = flags >> 6; /* bits 6/7 are interlace code */ - if (cmaptype > 1 || /* cmaptype must be 0 or 1 */ + if (cmaptype > 1 || /* cmaptype must be 0 or 1 */ source->pixel_size < 1 || source->pixel_size > 4 || (UCH(targaheader[16]) & 7) != 0 || /* bits/pixel must be multiple of 8 */ - interlace_type != 0) /* currently don't allow interlaced image */ + interlace_type != 0) /* currently don't allow interlaced image */ ERREXIT(cinfo, JERR_TGA_BADPARMS); if (subtype > 8) { @@ -377,18 +377,18 @@ start_input_tga (j_compress_ptr cinfo, cjpeg_source_ptr sinfo) } /* Now should have subtype 1, 2, or 3 */ - components = 3; /* until proven different */ + components = 3; /* until proven different */ cinfo->in_color_space = JCS_RGB; switch (subtype) { - case 1: /* Colormapped image */ + case 1: /* Colormapped image */ if (source->pixel_size == 1 && cmaptype == 1) source->get_pixel_rows = get_8bit_row; else ERREXIT(cinfo, JERR_TGA_BADPARMS); TRACEMS2(cinfo, 1, JTRC_TGA_MAPPED, width, height); break; - case 2: /* RGB image */ + case 2: /* RGB image */ switch (source->pixel_size) { case 2: source->get_pixel_rows = get_16bit_row; @@ -405,7 +405,7 @@ start_input_tga (j_compress_ptr cinfo, cjpeg_source_ptr sinfo) } TRACEMS2(cinfo, 1, JTRC_TGA, width, height); break; - case 3: /* Grayscale image */ + case 3: /* Grayscale image */ components = 1; cinfo->in_color_space = JCS_GRAYSCALE; if (source->pixel_size == 
1) @@ -441,7 +441,7 @@ start_input_tga (j_compress_ptr cinfo, cjpeg_source_ptr sinfo) source->pub.get_pixel_rows = source->get_pixel_rows; } - while (idlen--) /* Throw away ID field */ + while (idlen--) /* Throw away ID field */ (void) read_byte(source); if (maplen > 0) { @@ -453,7 +453,7 @@ start_input_tga (j_compress_ptr cinfo, cjpeg_source_ptr sinfo) /* and read it from the file */ read_colormap(source, (int) maplen, UCH(targaheader[7])); } else { - if (cmaptype) /* but you promised a cmap! */ + if (cmaptype) /* but you promised a cmap! */ ERREXIT(cinfo, JERR_TGA_BADPARMS); source->colormap = NULL; } @@ -488,8 +488,8 @@ jinit_read_targa (j_compress_ptr cinfo) /* Create module interface object */ source = (tga_source_ptr) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(tga_source_struct)); - source->cinfo = cinfo; /* make back link for subroutines */ + SIZEOF(tga_source_struct)); + source->cinfo = cinfo; /* make back link for subroutines */ /* Fill in method ptrs, except get_pixel_rows which start_input sets */ source->pub.start_input = start_input_tga; source->pub.finish_input = finish_input_tga; diff --git a/simd/jcclrmmx.asm b/simd/jcclrmmx.asm index e09525310..f34104f32 100644 --- a/simd/jcclrmmx.asm +++ b/simd/jcclrmmx.asm @@ -28,450 +28,450 @@ ; JDIMENSION output_row, int num_rows); ; -%define img_width(b) (b)+8 ; JDIMENSION img_width -%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf -%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf -%define output_row(b) (b)+20 ; JDIMENSION output_row -%define num_rows(b) (b)+24 ; int num_rows +%define img_width(b) (b)+8 ; JDIMENSION img_width +%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf +%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf +%define output_row(b) (b)+20 ; JDIMENSION output_row +%define num_rows(b) (b)+24 ; int num_rows -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] -%define WK_NUM 8 -%define gotptr wk(0)-SIZEOF_POINTER ; 
void * gotptr +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 8 +%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr - align 16 - global EXTN(jsimd_rgb_ycc_convert_mmx) + align 16 + global EXTN(jsimd_rgb_ycc_convert_mmx) EXTN(jsimd_rgb_ycc_convert_mmx): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic eax ; make a room for GOT address - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address - - mov ecx, JDIMENSION [img_width(eax)] ; num_cols - test ecx,ecx - jz near .return - - push ecx - - mov esi, JSAMPIMAGE [output_buf(eax)] - mov ecx, JDIMENSION [output_row(eax)] - mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] - mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY] - mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY] - lea edi, [edi+ecx*SIZEOF_JSAMPROW] - lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] - lea edx, [edx+ecx*SIZEOF_JSAMPROW] - - pop ecx - - mov esi, JSAMPARRAY [input_buf(eax)] - mov eax, INT [num_rows(eax)] - test eax,eax - jle near .return - alignx 16,7 + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [img_width(eax)] ; num_cols + test ecx,ecx + jz near .return + + push ecx + + mov esi, JSAMPIMAGE [output_buf(eax)] + mov ecx, JDIMENSION [output_row(eax)] + mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY] + 
mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY] + lea edi, [edi+ecx*SIZEOF_JSAMPROW] + lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] + lea edx, [edx+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov esi, JSAMPARRAY [input_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax,eax + jle near .return + alignx 16,7 .rowloop: - pushpic eax - push edx - push ebx - push edi - push esi - push ecx ; col - - mov esi, JSAMPROW [esi] ; inptr - mov edi, JSAMPROW [edi] ; outptr0 - mov ebx, JSAMPROW [ebx] ; outptr1 - mov edx, JSAMPROW [edx] ; outptr2 - movpic eax, POINTER [gotptr] ; load GOT address (eax) - - cmp ecx, byte SIZEOF_MMWORD - jae short .columnloop - alignx 16,7 + pushpic eax + push edx + push ebx + push edi + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr0 + mov ebx, JSAMPROW [ebx] ; outptr1 + mov edx, JSAMPROW [edx] ; outptr2 + movpic eax, POINTER [gotptr] ; load GOT address (eax) + + cmp ecx, byte SIZEOF_MMWORD + jae short .columnloop + alignx 16,7 %if RGB_PIXELSIZE == 3 ; --------------- .column_ld1: - push eax - push edx - lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE - test cl, SIZEOF_BYTE - jz short .column_ld2 - sub ecx, byte SIZEOF_BYTE - xor eax,eax - mov al, BYTE [esi+ecx] + push eax + push edx + lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub ecx, byte SIZEOF_BYTE + xor eax,eax + mov al, BYTE [esi+ecx] .column_ld2: - test cl, SIZEOF_WORD - jz short .column_ld4 - sub ecx, byte SIZEOF_WORD - xor edx,edx - mov dx, WORD [esi+ecx] - shl eax, WORD_BIT - or eax,edx + test cl, SIZEOF_WORD + jz short .column_ld4 + sub ecx, byte SIZEOF_WORD + xor edx,edx + mov dx, WORD [esi+ecx] + shl eax, WORD_BIT + or eax,edx .column_ld4: - movd mmA,eax - pop edx - pop eax - test cl, SIZEOF_DWORD - jz short .column_ld8 - sub ecx, byte SIZEOF_DWORD - movd mmG, DWORD [esi+ecx] - psllq mmA, DWORD_BIT - por mmA,mmG + movd mmA,eax + pop edx + pop eax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub ecx, byte 
SIZEOF_DWORD + movd mmG, DWORD [esi+ecx] + psllq mmA, DWORD_BIT + por mmA,mmG .column_ld8: - test cl, SIZEOF_MMWORD - jz short .column_ld16 - movq mmG,mmA - movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] - mov ecx, SIZEOF_MMWORD - jmp short .rgb_ycc_cnv + test cl, SIZEOF_MMWORD + jz short .column_ld16 + movq mmG,mmA + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + mov ecx, SIZEOF_MMWORD + jmp short .rgb_ycc_cnv .column_ld16: - test cl, 2*SIZEOF_MMWORD - mov ecx, SIZEOF_MMWORD - jz short .rgb_ycc_cnv - movq mmF,mmA - movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] - movq mmG, MMWORD [esi+1*SIZEOF_MMWORD] - jmp short .rgb_ycc_cnv - alignx 16,7 + test cl, 2*SIZEOF_MMWORD + mov ecx, SIZEOF_MMWORD + jz short .rgb_ycc_cnv + movq mmF,mmA + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmG, MMWORD [esi+1*SIZEOF_MMWORD] + jmp short .rgb_ycc_cnv + alignx 16,7 .columnloop: - movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] - movq mmG, MMWORD [esi+1*SIZEOF_MMWORD] - movq mmF, MMWORD [esi+2*SIZEOF_MMWORD] + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmG, MMWORD [esi+1*SIZEOF_MMWORD] + movq mmF, MMWORD [esi+2*SIZEOF_MMWORD] .rgb_ycc_cnv: - ; mmA=(00 10 20 01 11 21 02 12) - ; mmG=(22 03 13 23 04 14 24 05) - ; mmF=(15 25 06 16 26 07 17 27) + ; mmA=(00 10 20 01 11 21 02 12) + ; mmG=(22 03 13 23 04 14 24 05) + ; mmF=(15 25 06 16 26 07 17 27) - movq mmD,mmA - psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 10 20 01) - psrlq mmD,4*BYTE_BIT ; mmD=(11 21 02 12 -- -- -- --) + movq mmD,mmA + psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 10 20 01) + psrlq mmD,4*BYTE_BIT ; mmD=(11 21 02 12 -- -- -- --) - punpckhbw mmA,mmG ; mmA=(00 04 10 14 20 24 01 05) - psllq mmG,4*BYTE_BIT ; mmG=(-- -- -- -- 22 03 13 23) + punpckhbw mmA,mmG ; mmA=(00 04 10 14 20 24 01 05) + psllq mmG,4*BYTE_BIT ; mmG=(-- -- -- -- 22 03 13 23) - punpcklbw mmD,mmF ; mmD=(11 15 21 25 02 06 12 16) - punpckhbw mmG,mmF ; mmG=(22 26 03 07 13 17 23 27) + punpcklbw mmD,mmF ; mmD=(11 15 21 25 02 06 12 16) + punpckhbw mmG,mmF ; mmG=(22 26 03 07 13 17 23 27) - movq 
mmE,mmA - psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 04 10 14) - psrlq mmE,4*BYTE_BIT ; mmE=(20 24 01 05 -- -- -- --) + movq mmE,mmA + psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 04 10 14) + psrlq mmE,4*BYTE_BIT ; mmE=(20 24 01 05 -- -- -- --) - punpckhbw mmA,mmD ; mmA=(00 02 04 06 10 12 14 16) - psllq mmD,4*BYTE_BIT ; mmD=(-- -- -- -- 11 15 21 25) + punpckhbw mmA,mmD ; mmA=(00 02 04 06 10 12 14 16) + psllq mmD,4*BYTE_BIT ; mmD=(-- -- -- -- 11 15 21 25) - punpcklbw mmE,mmG ; mmE=(20 22 24 26 01 03 05 07) - punpckhbw mmD,mmG ; mmD=(11 13 15 17 21 23 25 27) + punpcklbw mmE,mmG ; mmE=(20 22 24 26 01 03 05 07) + punpckhbw mmD,mmG ; mmD=(11 13 15 17 21 23 25 27) - pxor mmH,mmH + pxor mmH,mmH - movq mmC,mmA - punpcklbw mmA,mmH ; mmA=(00 02 04 06) - punpckhbw mmC,mmH ; mmC=(10 12 14 16) + movq mmC,mmA + punpcklbw mmA,mmH ; mmA=(00 02 04 06) + punpckhbw mmC,mmH ; mmC=(10 12 14 16) - movq mmB,mmE - punpcklbw mmE,mmH ; mmE=(20 22 24 26) - punpckhbw mmB,mmH ; mmB=(01 03 05 07) + movq mmB,mmE + punpcklbw mmE,mmH ; mmE=(20 22 24 26) + punpckhbw mmB,mmH ; mmB=(01 03 05 07) - movq mmF,mmD - punpcklbw mmD,mmH ; mmD=(11 13 15 17) - punpckhbw mmF,mmH ; mmF=(21 23 25 27) + movq mmF,mmD + punpcklbw mmD,mmH ; mmD=(11 13 15 17) + punpckhbw mmF,mmH ; mmF=(21 23 25 27) %else ; RGB_PIXELSIZE == 4 ; ----------- .column_ld1: - test cl, SIZEOF_MMWORD/8 - jz short .column_ld2 - sub ecx, byte SIZEOF_MMWORD/8 - movd mmA, DWORD [esi+ecx*RGB_PIXELSIZE] + test cl, SIZEOF_MMWORD/8 + jz short .column_ld2 + sub ecx, byte SIZEOF_MMWORD/8 + movd mmA, DWORD [esi+ecx*RGB_PIXELSIZE] .column_ld2: - test cl, SIZEOF_MMWORD/4 - jz short .column_ld4 - sub ecx, byte SIZEOF_MMWORD/4 - movq mmF,mmA - movq mmA, MMWORD [esi+ecx*RGB_PIXELSIZE] + test cl, SIZEOF_MMWORD/4 + jz short .column_ld4 + sub ecx, byte SIZEOF_MMWORD/4 + movq mmF,mmA + movq mmA, MMWORD [esi+ecx*RGB_PIXELSIZE] .column_ld4: - test cl, SIZEOF_MMWORD/2 - mov ecx, SIZEOF_MMWORD - jz short .rgb_ycc_cnv - movq mmD,mmA - movq mmC,mmF - movq mmA, MMWORD 
[esi+0*SIZEOF_MMWORD] - movq mmF, MMWORD [esi+1*SIZEOF_MMWORD] - jmp short .rgb_ycc_cnv - alignx 16,7 + test cl, SIZEOF_MMWORD/2 + mov ecx, SIZEOF_MMWORD + jz short .rgb_ycc_cnv + movq mmD,mmA + movq mmC,mmF + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmF, MMWORD [esi+1*SIZEOF_MMWORD] + jmp short .rgb_ycc_cnv + alignx 16,7 .columnloop: - movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] - movq mmF, MMWORD [esi+1*SIZEOF_MMWORD] - movq mmD, MMWORD [esi+2*SIZEOF_MMWORD] - movq mmC, MMWORD [esi+3*SIZEOF_MMWORD] + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmF, MMWORD [esi+1*SIZEOF_MMWORD] + movq mmD, MMWORD [esi+2*SIZEOF_MMWORD] + movq mmC, MMWORD [esi+3*SIZEOF_MMWORD] .rgb_ycc_cnv: - ; mmA=(00 10 20 30 01 11 21 31) - ; mmF=(02 12 22 32 03 13 23 33) - ; mmD=(04 14 24 34 05 15 25 35) - ; mmC=(06 16 26 36 07 17 27 37) + ; mmA=(00 10 20 30 01 11 21 31) + ; mmF=(02 12 22 32 03 13 23 33) + ; mmD=(04 14 24 34 05 15 25 35) + ; mmC=(06 16 26 36 07 17 27 37) - movq mmB,mmA - punpcklbw mmA,mmF ; mmA=(00 02 10 12 20 22 30 32) - punpckhbw mmB,mmF ; mmB=(01 03 11 13 21 23 31 33) + movq mmB,mmA + punpcklbw mmA,mmF ; mmA=(00 02 10 12 20 22 30 32) + punpckhbw mmB,mmF ; mmB=(01 03 11 13 21 23 31 33) - movq mmG,mmD - punpcklbw mmD,mmC ; mmD=(04 06 14 16 24 26 34 36) - punpckhbw mmG,mmC ; mmG=(05 07 15 17 25 27 35 37) + movq mmG,mmD + punpcklbw mmD,mmC ; mmD=(04 06 14 16 24 26 34 36) + punpckhbw mmG,mmC ; mmG=(05 07 15 17 25 27 35 37) - movq mmE,mmA - punpcklwd mmA,mmD ; mmA=(00 02 04 06 10 12 14 16) - punpckhwd mmE,mmD ; mmE=(20 22 24 26 30 32 34 36) + movq mmE,mmA + punpcklwd mmA,mmD ; mmA=(00 02 04 06 10 12 14 16) + punpckhwd mmE,mmD ; mmE=(20 22 24 26 30 32 34 36) - movq mmH,mmB - punpcklwd mmB,mmG ; mmB=(01 03 05 07 11 13 15 17) - punpckhwd mmH,mmG ; mmH=(21 23 25 27 31 33 35 37) + movq mmH,mmB + punpcklwd mmB,mmG ; mmB=(01 03 05 07 11 13 15 17) + punpckhwd mmH,mmG ; mmH=(21 23 25 27 31 33 35 37) - pxor mmF,mmF + pxor mmF,mmF - movq mmC,mmA - punpcklbw mmA,mmF ; mmA=(00 02 04 06) - 
punpckhbw mmC,mmF ; mmC=(10 12 14 16) + movq mmC,mmA + punpcklbw mmA,mmF ; mmA=(00 02 04 06) + punpckhbw mmC,mmF ; mmC=(10 12 14 16) - movq mmD,mmB - punpcklbw mmB,mmF ; mmB=(01 03 05 07) - punpckhbw mmD,mmF ; mmD=(11 13 15 17) + movq mmD,mmB + punpcklbw mmB,mmF ; mmB=(01 03 05 07) + punpckhbw mmD,mmF ; mmD=(11 13 15 17) - movq mmG,mmE - punpcklbw mmE,mmF ; mmE=(20 22 24 26) - punpckhbw mmG,mmF ; mmG=(30 32 34 36) + movq mmG,mmE + punpcklbw mmE,mmF ; mmE=(20 22 24 26) + punpckhbw mmG,mmF ; mmG=(30 32 34 36) - punpcklbw mmF,mmH - punpckhbw mmH,mmH - psrlw mmF,BYTE_BIT ; mmF=(21 23 25 27) - psrlw mmH,BYTE_BIT ; mmH=(31 33 35 37) + punpcklbw mmF,mmH + punpckhbw mmH,mmH + psrlw mmF,BYTE_BIT ; mmF=(21 23 25 27) + psrlw mmH,BYTE_BIT ; mmH=(31 33 35 37) %endif ; RGB_PIXELSIZE ; --------------- - ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE - ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO - - ; (Original) - ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B - ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE - ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE - ; - ; (This implementation) - ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G - ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE - ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE - - movq MMWORD [wk(0)], mm0 ; wk(0)=RE - movq MMWORD [wk(1)], mm1 ; wk(1)=RO - movq MMWORD [wk(2)], mm4 ; wk(2)=BE - movq MMWORD [wk(3)], mm5 ; wk(3)=BO - - movq mm6,mm1 - punpcklwd mm1,mm3 - punpckhwd mm6,mm3 - movq mm7,mm1 - movq mm4,mm6 - pmaddwd mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337) - pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337) - pmaddwd mm7,[GOTOFF(eax,PW_MF016_MF033)] ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331) - pmaddwd mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331) - - movq MMWORD [wk(4)], mm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337) - movq MMWORD 
[wk(5)], mm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337) - - pxor mm1,mm1 - pxor mm6,mm6 - punpcklwd mm1,mm5 ; mm1=BOL - punpckhwd mm6,mm5 ; mm6=BOH - psrld mm1,1 ; mm1=BOL*FIX(0.500) - psrld mm6,1 ; mm6=BOH*FIX(0.500) - - movq mm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm5=[PD_ONEHALFM1_CJ] - - paddd mm7,mm1 - paddd mm4,mm6 - paddd mm7,mm5 - paddd mm4,mm5 - psrld mm7,SCALEBITS ; mm7=CbOL - psrld mm4,SCALEBITS ; mm4=CbOH - packssdw mm7,mm4 ; mm7=CbO - - movq mm1, MMWORD [wk(2)] ; mm1=BE - - movq mm6,mm0 - punpcklwd mm0,mm2 - punpckhwd mm6,mm2 - movq mm5,mm0 - movq mm4,mm6 - pmaddwd mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337) - pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337) - pmaddwd mm5,[GOTOFF(eax,PW_MF016_MF033)] ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331) - pmaddwd mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331) - - movq MMWORD [wk(6)], mm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337) - movq MMWORD [wk(7)], mm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337) - - pxor mm0,mm0 - pxor mm6,mm6 - punpcklwd mm0,mm1 ; mm0=BEL - punpckhwd mm6,mm1 ; mm6=BEH - psrld mm0,1 ; mm0=BEL*FIX(0.500) - psrld mm6,1 ; mm6=BEH*FIX(0.500) - - movq mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ] - - paddd mm5,mm0 - paddd mm4,mm6 - paddd mm5,mm1 - paddd mm4,mm1 - psrld mm5,SCALEBITS ; mm5=CbEL - psrld mm4,SCALEBITS ; mm4=CbEH - packssdw mm5,mm4 ; mm5=CbE - - psllw mm7,BYTE_BIT - por mm5,mm7 ; mm5=Cb - movq MMWORD [ebx], mm5 ; Save Cb - - movq mm0, MMWORD [wk(3)] ; mm0=BO - movq mm6, MMWORD [wk(2)] ; mm6=BE - movq mm1, MMWORD [wk(1)] ; mm1=RO - - movq mm4,mm0 - punpcklwd mm0,mm3 - punpckhwd mm4,mm3 - movq mm7,mm0 - movq mm5,mm4 - pmaddwd mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250) - pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250) - pmaddwd mm7,[GOTOFF(eax,PW_MF008_MF041)] ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418) - pmaddwd mm5,[GOTOFF(eax,PW_MF008_MF041)] ; 
mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418) - - movq mm3,[GOTOFF(eax,PD_ONEHALF)] ; mm3=[PD_ONEHALF] - - paddd mm0, MMWORD [wk(4)] - paddd mm4, MMWORD [wk(5)] - paddd mm0,mm3 - paddd mm4,mm3 - psrld mm0,SCALEBITS ; mm0=YOL - psrld mm4,SCALEBITS ; mm4=YOH - packssdw mm0,mm4 ; mm0=YO - - pxor mm3,mm3 - pxor mm4,mm4 - punpcklwd mm3,mm1 ; mm3=ROL - punpckhwd mm4,mm1 ; mm4=ROH - psrld mm3,1 ; mm3=ROL*FIX(0.500) - psrld mm4,1 ; mm4=ROH*FIX(0.500) - - movq mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ] - - paddd mm7,mm3 - paddd mm5,mm4 - paddd mm7,mm1 - paddd mm5,mm1 - psrld mm7,SCALEBITS ; mm7=CrOL - psrld mm5,SCALEBITS ; mm5=CrOH - packssdw mm7,mm5 ; mm7=CrO - - movq mm3, MMWORD [wk(0)] ; mm3=RE - - movq mm4,mm6 - punpcklwd mm6,mm2 - punpckhwd mm4,mm2 - movq mm1,mm6 - movq mm5,mm4 - pmaddwd mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250) - pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250) - pmaddwd mm1,[GOTOFF(eax,PW_MF008_MF041)] ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418) - pmaddwd mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418) - - movq mm2,[GOTOFF(eax,PD_ONEHALF)] ; mm2=[PD_ONEHALF] - - paddd mm6, MMWORD [wk(6)] - paddd mm4, MMWORD [wk(7)] - paddd mm6,mm2 - paddd mm4,mm2 - psrld mm6,SCALEBITS ; mm6=YEL - psrld mm4,SCALEBITS ; mm4=YEH - packssdw mm6,mm4 ; mm6=YE - - psllw mm0,BYTE_BIT - por mm6,mm0 ; mm6=Y - movq MMWORD [edi], mm6 ; Save Y - - pxor mm2,mm2 - pxor mm4,mm4 - punpcklwd mm2,mm3 ; mm2=REL - punpckhwd mm4,mm3 ; mm4=REH - psrld mm2,1 ; mm2=REL*FIX(0.500) - psrld mm4,1 ; mm4=REH*FIX(0.500) - - movq mm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm0=[PD_ONEHALFM1_CJ] - - paddd mm1,mm2 - paddd mm5,mm4 - paddd mm1,mm0 - paddd mm5,mm0 - psrld mm1,SCALEBITS ; mm1=CrEL - psrld mm5,SCALEBITS ; mm5=CrEH - packssdw mm1,mm5 ; mm1=CrE - - psllw mm7,BYTE_BIT - por mm1,mm7 ; mm1=Cr - movq MMWORD [edx], mm1 ; Save Cr - - sub ecx, byte SIZEOF_MMWORD - add esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; inptr - add edi, byte 
SIZEOF_MMWORD ; outptr0 - add ebx, byte SIZEOF_MMWORD ; outptr1 - add edx, byte SIZEOF_MMWORD ; outptr2 - cmp ecx, byte SIZEOF_MMWORD - jae near .columnloop - test ecx,ecx - jnz near .column_ld1 - - pop ecx ; col - pop esi - pop edi - pop ebx - pop edx - poppic eax - - add esi, byte SIZEOF_JSAMPROW ; input_buf - add edi, byte SIZEOF_JSAMPROW - add ebx, byte SIZEOF_JSAMPROW - add edx, byte SIZEOF_JSAMPROW - dec eax ; num_rows - jg near .rowloop - - emms ; empty MMX state + ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE + ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + + movq MMWORD [wk(0)], mm0 ; wk(0)=RE + movq MMWORD [wk(1)], mm1 ; wk(1)=RO + movq MMWORD [wk(2)], mm4 ; wk(2)=BE + movq MMWORD [wk(3)], mm5 ; wk(3)=BO + + movq mm6,mm1 + punpcklwd mm1,mm3 + punpckhwd mm6,mm3 + movq mm7,mm1 + movq mm4,mm6 + pmaddwd mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337) + pmaddwd mm7,[GOTOFF(eax,PW_MF016_MF033)] ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331) + pmaddwd mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331) + + movq MMWORD [wk(4)], mm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337) + movq MMWORD [wk(5)], mm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337) + + pxor mm1,mm1 + pxor mm6,mm6 + punpcklwd mm1,mm5 ; mm1=BOL + punpckhwd mm6,mm5 ; mm6=BOH + psrld mm1,1 ; mm1=BOL*FIX(0.500) + psrld mm6,1 ; mm6=BOH*FIX(0.500) + + movq mm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm5=[PD_ONEHALFM1_CJ] + + paddd mm7,mm1 + paddd mm4,mm6 + paddd 
mm7,mm5 + paddd mm4,mm5 + psrld mm7,SCALEBITS ; mm7=CbOL + psrld mm4,SCALEBITS ; mm4=CbOH + packssdw mm7,mm4 ; mm7=CbO + + movq mm1, MMWORD [wk(2)] ; mm1=BE + + movq mm6,mm0 + punpcklwd mm0,mm2 + punpckhwd mm6,mm2 + movq mm5,mm0 + movq mm4,mm6 + pmaddwd mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337) + pmaddwd mm5,[GOTOFF(eax,PW_MF016_MF033)] ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331) + pmaddwd mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331) + + movq MMWORD [wk(6)], mm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337) + movq MMWORD [wk(7)], mm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337) + + pxor mm0,mm0 + pxor mm6,mm6 + punpcklwd mm0,mm1 ; mm0=BEL + punpckhwd mm6,mm1 ; mm6=BEH + psrld mm0,1 ; mm0=BEL*FIX(0.500) + psrld mm6,1 ; mm6=BEH*FIX(0.500) + + movq mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ] + + paddd mm5,mm0 + paddd mm4,mm6 + paddd mm5,mm1 + paddd mm4,mm1 + psrld mm5,SCALEBITS ; mm5=CbEL + psrld mm4,SCALEBITS ; mm4=CbEH + packssdw mm5,mm4 ; mm5=CbE + + psllw mm7,BYTE_BIT + por mm5,mm7 ; mm5=Cb + movq MMWORD [ebx], mm5 ; Save Cb + + movq mm0, MMWORD [wk(3)] ; mm0=BO + movq mm6, MMWORD [wk(2)] ; mm6=BE + movq mm1, MMWORD [wk(1)] ; mm1=RO + + movq mm4,mm0 + punpcklwd mm0,mm3 + punpckhwd mm4,mm3 + movq mm7,mm0 + movq mm5,mm4 + pmaddwd mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250) + pmaddwd mm7,[GOTOFF(eax,PW_MF008_MF041)] ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418) + pmaddwd mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418) + + movq mm3,[GOTOFF(eax,PD_ONEHALF)] ; mm3=[PD_ONEHALF] + + paddd mm0, MMWORD [wk(4)] + paddd mm4, MMWORD [wk(5)] + paddd mm0,mm3 + paddd mm4,mm3 + psrld mm0,SCALEBITS ; mm0=YOL + psrld mm4,SCALEBITS ; mm4=YOH + packssdw mm0,mm4 ; mm0=YO + + pxor mm3,mm3 + pxor mm4,mm4 + punpcklwd mm3,mm1 ; mm3=ROL + punpckhwd 
mm4,mm1 ; mm4=ROH + psrld mm3,1 ; mm3=ROL*FIX(0.500) + psrld mm4,1 ; mm4=ROH*FIX(0.500) + + movq mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ] + + paddd mm7,mm3 + paddd mm5,mm4 + paddd mm7,mm1 + paddd mm5,mm1 + psrld mm7,SCALEBITS ; mm7=CrOL + psrld mm5,SCALEBITS ; mm5=CrOH + packssdw mm7,mm5 ; mm7=CrO + + movq mm3, MMWORD [wk(0)] ; mm3=RE + + movq mm4,mm6 + punpcklwd mm6,mm2 + punpckhwd mm4,mm2 + movq mm1,mm6 + movq mm5,mm4 + pmaddwd mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250) + pmaddwd mm1,[GOTOFF(eax,PW_MF008_MF041)] ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418) + pmaddwd mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418) + + movq mm2,[GOTOFF(eax,PD_ONEHALF)] ; mm2=[PD_ONEHALF] + + paddd mm6, MMWORD [wk(6)] + paddd mm4, MMWORD [wk(7)] + paddd mm6,mm2 + paddd mm4,mm2 + psrld mm6,SCALEBITS ; mm6=YEL + psrld mm4,SCALEBITS ; mm4=YEH + packssdw mm6,mm4 ; mm6=YE + + psllw mm0,BYTE_BIT + por mm6,mm0 ; mm6=Y + movq MMWORD [edi], mm6 ; Save Y + + pxor mm2,mm2 + pxor mm4,mm4 + punpcklwd mm2,mm3 ; mm2=REL + punpckhwd mm4,mm3 ; mm4=REH + psrld mm2,1 ; mm2=REL*FIX(0.500) + psrld mm4,1 ; mm4=REH*FIX(0.500) + + movq mm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm0=[PD_ONEHALFM1_CJ] + + paddd mm1,mm2 + paddd mm5,mm4 + paddd mm1,mm0 + paddd mm5,mm0 + psrld mm1,SCALEBITS ; mm1=CrEL + psrld mm5,SCALEBITS ; mm5=CrEH + packssdw mm1,mm5 ; mm1=CrE + + psllw mm7,BYTE_BIT + por mm1,mm7 ; mm1=Cr + movq MMWORD [edx], mm1 ; Save Cr + + sub ecx, byte SIZEOF_MMWORD + add esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; inptr + add edi, byte SIZEOF_MMWORD ; outptr0 + add ebx, byte SIZEOF_MMWORD ; outptr1 + add edx, byte SIZEOF_MMWORD ; outptr2 + cmp ecx, byte SIZEOF_MMWORD + jae near .columnloop + test ecx,ecx + jnz near .column_ld1 + + pop ecx ; col + pop esi + pop edi + pop ebx + pop edx + poppic eax + + add esi, byte SIZEOF_JSAMPROW ; input_buf + add edi, byte SIZEOF_JSAMPROW + add 
ebx, byte SIZEOF_JSAMPROW + add edx, byte SIZEOF_JSAMPROW + dec eax ; num_rows + jg near .rowloop + + emms ; empty MMX state .return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/jcclrss2-64.asm b/simd/jcclrss2-64.asm index f5d6bed95..1cdae27f8 100644 --- a/simd/jcclrss2-64.asm +++ b/simd/jcclrss2-64.asm @@ -32,454 +32,454 @@ ; r13 = JDIMENSION output_row ; r14 = int num_rows -%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 8 +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 8 - align 16 + align 16 - global EXTN(jsimd_rgb_ycc_convert_sse2) + global EXTN(jsimd_rgb_ycc_convert_sse2) EXTN(jsimd_rgb_ycc_convert_sse2): - push rbp - mov rax,rsp ; rax = original rbp - sub rsp, byte 4 - and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp],rax - mov rbp,rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - collect_args - push rbx - - mov rcx, r10 - test rcx,rcx - jz near .return - - push rcx - - mov rsi, r12 - mov rcx, r13 - mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY] - mov rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY] - mov rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY] - lea rdi, [rdi+rcx*SIZEOF_JSAMPROW] - lea rbx, [rbx+rcx*SIZEOF_JSAMPROW] - lea rdx, [rdx+rcx*SIZEOF_JSAMPROW] - - pop rcx - - mov rsi, r11 - mov eax, r14d - test rax,rax - jle near .return + push rbp + mov rax,rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args 
+ push rbx + + mov rcx, r10 + test rcx,rcx + jz near .return + + push rcx + + mov rsi, r12 + mov rcx, r13 + mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY] + mov rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY] + mov rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY] + lea rdi, [rdi+rcx*SIZEOF_JSAMPROW] + lea rbx, [rbx+rcx*SIZEOF_JSAMPROW] + lea rdx, [rdx+rcx*SIZEOF_JSAMPROW] + + pop rcx + + mov rsi, r11 + mov eax, r14d + test rax,rax + jle near .return .rowloop: - push rdx - push rbx - push rdi - push rsi - push rcx ; col + push rdx + push rbx + push rdi + push rsi + push rcx ; col - mov rsi, JSAMPROW [rsi] ; inptr - mov rdi, JSAMPROW [rdi] ; outptr0 - mov rbx, JSAMPROW [rbx] ; outptr1 - mov rdx, JSAMPROW [rdx] ; outptr2 + mov rsi, JSAMPROW [rsi] ; inptr + mov rdi, JSAMPROW [rdi] ; outptr0 + mov rbx, JSAMPROW [rbx] ; outptr1 + mov rdx, JSAMPROW [rdx] ; outptr2 - cmp rcx, byte SIZEOF_XMMWORD - jae near .columnloop + cmp rcx, byte SIZEOF_XMMWORD + jae near .columnloop %if RGB_PIXELSIZE == 3 ; --------------- .column_ld1: - push rax - push rdx - lea rcx,[rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE - test cl, SIZEOF_BYTE - jz short .column_ld2 - sub rcx, byte SIZEOF_BYTE - movzx rax, BYTE [rsi+rcx] + push rax + push rdx + lea rcx,[rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub rcx, byte SIZEOF_BYTE + movzx rax, BYTE [rsi+rcx] .column_ld2: - test cl, SIZEOF_WORD - jz short .column_ld4 - sub rcx, byte SIZEOF_WORD - movzx rdx, WORD [rsi+rcx] - shl rax, WORD_BIT - or rax,rdx + test cl, SIZEOF_WORD + jz short .column_ld4 + sub rcx, byte SIZEOF_WORD + movzx rdx, WORD [rsi+rcx] + shl rax, WORD_BIT + or rax,rdx .column_ld4: - movd xmmA,eax - pop rdx - pop rax - test cl, SIZEOF_DWORD - jz short .column_ld8 - sub rcx, byte SIZEOF_DWORD - movd xmmF, XMM_DWORD [rsi+rcx] - pslldq xmmA, SIZEOF_DWORD - por xmmA,xmmF + movd xmmA,eax + pop rdx + pop rax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub rcx, byte SIZEOF_DWORD + movd xmmF, XMM_DWORD [rsi+rcx] + pslldq 
xmmA, SIZEOF_DWORD + por xmmA,xmmF .column_ld8: - test cl, SIZEOF_MMWORD - jz short .column_ld16 - sub rcx, byte SIZEOF_MMWORD - movq xmmB, XMM_MMWORD [rsi+rcx] - pslldq xmmA, SIZEOF_MMWORD - por xmmA,xmmB + test cl, SIZEOF_MMWORD + jz short .column_ld16 + sub rcx, byte SIZEOF_MMWORD + movq xmmB, XMM_MMWORD [rsi+rcx] + pslldq xmmA, SIZEOF_MMWORD + por xmmA,xmmB .column_ld16: - test cl, SIZEOF_XMMWORD - jz short .column_ld32 - movdqa xmmF,xmmA - movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] - mov rcx, SIZEOF_XMMWORD - jmp short .rgb_ycc_cnv + test cl, SIZEOF_XMMWORD + jz short .column_ld32 + movdqa xmmF,xmmA + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + mov rcx, SIZEOF_XMMWORD + jmp short .rgb_ycc_cnv .column_ld32: - test cl, 2*SIZEOF_XMMWORD - mov rcx, SIZEOF_XMMWORD - jz short .rgb_ycc_cnv - movdqa xmmB,xmmA - movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] - movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] - jmp short .rgb_ycc_cnv + test cl, 2*SIZEOF_XMMWORD + mov rcx, SIZEOF_XMMWORD + jz short .rgb_ycc_cnv + movdqa xmmB,xmmA + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] + jmp short .rgb_ycc_cnv .columnloop: - movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] - movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] - movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD] + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] + movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD] .rgb_ycc_cnv: - ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) - ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) - ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) - movdqa xmmG,xmmA - pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) - psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) + movdqa xmmG,xmmA + 
pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) + psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) - punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) - pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) + punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) + pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) - punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) - punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) + punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) + punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) - movdqa xmmD,xmmA - pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) - psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) + movdqa xmmD,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) + psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) - punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) - pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) + punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) + pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) - punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) - punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) + punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) + punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) - movdqa xmmE,xmmA - pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) - psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) + movdqa xmmE,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) + psrldq xmmE,8 ; xmmE=(20 
24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) - punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) - pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) + punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) - punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) - punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) + punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) - pxor xmmH,xmmH + pxor xmmH,xmmH - movdqa xmmC,xmmA - punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) - punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) + movdqa xmmC,xmmA + punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) - movdqa xmmB,xmmE - punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) - punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) + movdqa xmmB,xmmE + punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) - movdqa xmmF,xmmD - punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) - punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) + movdqa xmmF,xmmD + punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) + punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) %else ; RGB_PIXELSIZE == 4 ; ----------- .column_ld1: - test cl, SIZEOF_XMMWORD/16 - jz short .column_ld2 - sub rcx, byte SIZEOF_XMMWORD/16 - movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE] + test cl, SIZEOF_XMMWORD/16 + jz short .column_ld2 + sub rcx, byte SIZEOF_XMMWORD/16 + movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE] .column_ld2: - test cl, SIZEOF_XMMWORD/8 - jz short .column_ld4 - sub rcx, byte SIZEOF_XMMWORD/8 - movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE] - pslldq xmmA, SIZEOF_MMWORD - por xmmA,xmmE + test cl, 
SIZEOF_XMMWORD/8 + jz short .column_ld4 + sub rcx, byte SIZEOF_XMMWORD/8 + movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE] + pslldq xmmA, SIZEOF_MMWORD + por xmmA,xmmE .column_ld4: - test cl, SIZEOF_XMMWORD/4 - jz short .column_ld8 - sub rcx, byte SIZEOF_XMMWORD/4 - movdqa xmmE,xmmA - movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE] + test cl, SIZEOF_XMMWORD/4 + jz short .column_ld8 + sub rcx, byte SIZEOF_XMMWORD/4 + movdqa xmmE,xmmA + movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE] .column_ld8: - test cl, SIZEOF_XMMWORD/2 - mov rcx, SIZEOF_XMMWORD - jz short .rgb_ycc_cnv - movdqa xmmF,xmmA - movdqa xmmH,xmmE - movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] - movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] - jmp short .rgb_ycc_cnv + test cl, SIZEOF_XMMWORD/2 + mov rcx, SIZEOF_XMMWORD + jz short .rgb_ycc_cnv + movdqa xmmF,xmmA + movdqa xmmH,xmmE + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] + jmp short .rgb_ycc_cnv .columnloop: - movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] - movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] - movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD] - movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD] + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD] + movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD] .rgb_ycc_cnv: - ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) - ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) - ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) - ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) - movdqa xmmD,xmmA - punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) - punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 
37) + movdqa xmmD,xmmA + punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) + punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) - movdqa xmmC,xmmF - punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) - punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) + movdqa xmmC,xmmF + punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) + punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) - movdqa xmmB,xmmA - punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) - punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) + movdqa xmmB,xmmA + punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) + punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) - movdqa xmmG,xmmD - punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) - punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) + movdqa xmmG,xmmD + punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) + punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) - movdqa xmmE,xmmA - punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) - punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) + movdqa xmmE,xmmA + punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) - movdqa xmmH,xmmB - punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) - punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) + movdqa xmmH,xmmB + punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) + punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) - pxor xmmF,xmmF + pxor xmmF,xmmF - movdqa 
xmmC,xmmA - punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) - punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) + movdqa xmmC,xmmA + punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) - movdqa xmmD,xmmB - punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) - punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) + movdqa xmmD,xmmB + punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) - movdqa xmmG,xmmE - punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) - punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) + movdqa xmmG,xmmE + punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) - punpcklbw xmmF,xmmH - punpckhbw xmmH,xmmH - psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) - psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) + punpcklbw xmmF,xmmH + punpckhbw xmmH,xmmH + psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) + psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) %endif ; RGB_PIXELSIZE ; --------------- - ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE - ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO - - ; (Original) - ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B - ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE - ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE - ; - ; (This implementation) - ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G - ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE - ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE - - movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE - movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO - movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE - movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO - - movdqa xmm6,xmm1 - punpcklwd xmm1,xmm3 - punpckhwd xmm6,xmm3 - movdqa xmm7,xmm1 - movdqa xmm4,xmm6 - pmaddwd xmm1,[rel PW_F0299_F0337] ; 
xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) - pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) - pmaddwd xmm7,[rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331) - pmaddwd xmm4,[rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331) - - movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337) - movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337) - - pxor xmm1,xmm1 - pxor xmm6,xmm6 - punpcklwd xmm1,xmm5 ; xmm1=BOL - punpckhwd xmm6,xmm5 ; xmm6=BOH - psrld xmm1,1 ; xmm1=BOL*FIX(0.500) - psrld xmm6,1 ; xmm6=BOH*FIX(0.500) - - movdqa xmm5,[rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ] - - paddd xmm7,xmm1 - paddd xmm4,xmm6 - paddd xmm7,xmm5 - paddd xmm4,xmm5 - psrld xmm7,SCALEBITS ; xmm7=CbOL - psrld xmm4,SCALEBITS ; xmm4=CbOH - packssdw xmm7,xmm4 ; xmm7=CbO - - movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE - - movdqa xmm6,xmm0 - punpcklwd xmm0,xmm2 - punpckhwd xmm6,xmm2 - movdqa xmm5,xmm0 - movdqa xmm4,xmm6 - pmaddwd xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) - pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) - pmaddwd xmm5,[rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331) - pmaddwd xmm4,[rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331) - - movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337) - movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337) - - pxor xmm0,xmm0 - pxor xmm6,xmm6 - punpcklwd xmm0,xmm1 ; xmm0=BEL - punpckhwd xmm6,xmm1 ; xmm6=BEH - psrld xmm0,1 ; xmm0=BEL*FIX(0.500) - psrld xmm6,1 ; xmm6=BEH*FIX(0.500) - - movdqa xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ] - - paddd xmm5,xmm0 - paddd xmm4,xmm6 - paddd xmm5,xmm1 - paddd xmm4,xmm1 - psrld xmm5,SCALEBITS ; xmm5=CbEL - psrld xmm4,SCALEBITS ; xmm4=CbEH - packssdw xmm5,xmm4 ; xmm5=CbE - - psllw xmm7,BYTE_BIT - por xmm5,xmm7 ; xmm5=Cb - movdqa XMMWORD [rbx], xmm5 ; Save Cb - - movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO - movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE - movdqa xmm1, 
XMMWORD [wk(1)] ; xmm1=RO - - movdqa xmm4,xmm0 - punpcklwd xmm0,xmm3 - punpckhwd xmm4,xmm3 - movdqa xmm7,xmm0 - movdqa xmm5,xmm4 - pmaddwd xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) - pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) - pmaddwd xmm7,[rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418) - pmaddwd xmm5,[rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418) - - movdqa xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF] - - paddd xmm0, XMMWORD [wk(4)] - paddd xmm4, XMMWORD [wk(5)] - paddd xmm0,xmm3 - paddd xmm4,xmm3 - psrld xmm0,SCALEBITS ; xmm0=YOL - psrld xmm4,SCALEBITS ; xmm4=YOH - packssdw xmm0,xmm4 ; xmm0=YO - - pxor xmm3,xmm3 - pxor xmm4,xmm4 - punpcklwd xmm3,xmm1 ; xmm3=ROL - punpckhwd xmm4,xmm1 ; xmm4=ROH - psrld xmm3,1 ; xmm3=ROL*FIX(0.500) - psrld xmm4,1 ; xmm4=ROH*FIX(0.500) - - movdqa xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ] - - paddd xmm7,xmm3 - paddd xmm5,xmm4 - paddd xmm7,xmm1 - paddd xmm5,xmm1 - psrld xmm7,SCALEBITS ; xmm7=CrOL - psrld xmm5,SCALEBITS ; xmm5=CrOH - packssdw xmm7,xmm5 ; xmm7=CrO - - movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE - - movdqa xmm4,xmm6 - punpcklwd xmm6,xmm2 - punpckhwd xmm4,xmm2 - movdqa xmm1,xmm6 - movdqa xmm5,xmm4 - pmaddwd xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) - pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) - pmaddwd xmm1,[rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418) - pmaddwd xmm5,[rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418) - - movdqa xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF] - - paddd xmm6, XMMWORD [wk(6)] - paddd xmm4, XMMWORD [wk(7)] - paddd xmm6,xmm2 - paddd xmm4,xmm2 - psrld xmm6,SCALEBITS ; xmm6=YEL - psrld xmm4,SCALEBITS ; xmm4=YEH - packssdw xmm6,xmm4 ; xmm6=YE - - psllw xmm0,BYTE_BIT - por xmm6,xmm0 ; xmm6=Y - movdqa XMMWORD [rdi], xmm6 ; Save Y - - pxor xmm2,xmm2 - pxor xmm4,xmm4 - punpcklwd xmm2,xmm3 ; xmm2=REL - punpckhwd xmm4,xmm3 ; xmm4=REH - psrld xmm2,1 ; 
xmm2=REL*FIX(0.500) - psrld xmm4,1 ; xmm4=REH*FIX(0.500) - - movdqa xmm0,[rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ] - - paddd xmm1,xmm2 - paddd xmm5,xmm4 - paddd xmm1,xmm0 - paddd xmm5,xmm0 - psrld xmm1,SCALEBITS ; xmm1=CrEL - psrld xmm5,SCALEBITS ; xmm5=CrEH - packssdw xmm1,xmm5 ; xmm1=CrE - - psllw xmm7,BYTE_BIT - por xmm1,xmm7 ; xmm1=Cr - movdqa XMMWORD [rdx], xmm1 ; Save Cr - - sub rcx, byte SIZEOF_XMMWORD - add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr - add rdi, byte SIZEOF_XMMWORD ; outptr0 - add rbx, byte SIZEOF_XMMWORD ; outptr1 - add rdx, byte SIZEOF_XMMWORD ; outptr2 - cmp rcx, byte SIZEOF_XMMWORD - jae near .columnloop - test rcx,rcx - jnz near .column_ld1 - - pop rcx ; col - pop rsi - pop rdi - pop rbx - pop rdx - - add rsi, byte SIZEOF_JSAMPROW ; input_buf - add rdi, byte SIZEOF_JSAMPROW - add rbx, byte SIZEOF_JSAMPROW - add rdx, byte SIZEOF_JSAMPROW - dec rax ; num_rows - jg near .rowloop + ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE + ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE + movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO + movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE + movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO + + movdqa xmm6,xmm1 + punpcklwd xmm1,xmm3 + punpckhwd xmm6,xmm3 + movdqa xmm7,xmm1 + movdqa xmm4,xmm6 + pmaddwd xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) + pmaddwd xmm7,[rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331) + pmaddwd xmm4,[rel 
PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331) + + movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337) + movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337) + + pxor xmm1,xmm1 + pxor xmm6,xmm6 + punpcklwd xmm1,xmm5 ; xmm1=BOL + punpckhwd xmm6,xmm5 ; xmm6=BOH + psrld xmm1,1 ; xmm1=BOL*FIX(0.500) + psrld xmm6,1 ; xmm6=BOH*FIX(0.500) + + movdqa xmm5,[rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ] + + paddd xmm7,xmm1 + paddd xmm4,xmm6 + paddd xmm7,xmm5 + paddd xmm4,xmm5 + psrld xmm7,SCALEBITS ; xmm7=CbOL + psrld xmm4,SCALEBITS ; xmm4=CbOH + packssdw xmm7,xmm4 ; xmm7=CbO + + movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE + + movdqa xmm6,xmm0 + punpcklwd xmm0,xmm2 + punpckhwd xmm6,xmm2 + movdqa xmm5,xmm0 + movdqa xmm4,xmm6 + pmaddwd xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) + pmaddwd xmm5,[rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331) + pmaddwd xmm4,[rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331) + + movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337) + movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337) + + pxor xmm0,xmm0 + pxor xmm6,xmm6 + punpcklwd xmm0,xmm1 ; xmm0=BEL + punpckhwd xmm6,xmm1 ; xmm6=BEH + psrld xmm0,1 ; xmm0=BEL*FIX(0.500) + psrld xmm6,1 ; xmm6=BEH*FIX(0.500) + + movdqa xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ] + + paddd xmm5,xmm0 + paddd xmm4,xmm6 + paddd xmm5,xmm1 + paddd xmm4,xmm1 + psrld xmm5,SCALEBITS ; xmm5=CbEL + psrld xmm4,SCALEBITS ; xmm4=CbEH + packssdw xmm5,xmm4 ; xmm5=CbE + + psllw xmm7,BYTE_BIT + por xmm5,xmm7 ; xmm5=Cb + movdqa XMMWORD [rbx], xmm5 ; Save Cb + + movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO + movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO + + movdqa xmm4,xmm0 + punpcklwd xmm0,xmm3 + punpckhwd xmm4,xmm3 + movdqa xmm7,xmm0 + movdqa xmm5,xmm4 + pmaddwd xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) + 
pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) + pmaddwd xmm7,[rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418) + pmaddwd xmm5,[rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418) + + movdqa xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF] + + paddd xmm0, XMMWORD [wk(4)] + paddd xmm4, XMMWORD [wk(5)] + paddd xmm0,xmm3 + paddd xmm4,xmm3 + psrld xmm0,SCALEBITS ; xmm0=YOL + psrld xmm4,SCALEBITS ; xmm4=YOH + packssdw xmm0,xmm4 ; xmm0=YO + + pxor xmm3,xmm3 + pxor xmm4,xmm4 + punpcklwd xmm3,xmm1 ; xmm3=ROL + punpckhwd xmm4,xmm1 ; xmm4=ROH + psrld xmm3,1 ; xmm3=ROL*FIX(0.500) + psrld xmm4,1 ; xmm4=ROH*FIX(0.500) + + movdqa xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ] + + paddd xmm7,xmm3 + paddd xmm5,xmm4 + paddd xmm7,xmm1 + paddd xmm5,xmm1 + psrld xmm7,SCALEBITS ; xmm7=CrOL + psrld xmm5,SCALEBITS ; xmm5=CrOH + packssdw xmm7,xmm5 ; xmm7=CrO + + movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE + + movdqa xmm4,xmm6 + punpcklwd xmm6,xmm2 + punpckhwd xmm4,xmm2 + movdqa xmm1,xmm6 + movdqa xmm5,xmm4 + pmaddwd xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) + pmaddwd xmm1,[rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418) + pmaddwd xmm5,[rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418) + + movdqa xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF] + + paddd xmm6, XMMWORD [wk(6)] + paddd xmm4, XMMWORD [wk(7)] + paddd xmm6,xmm2 + paddd xmm4,xmm2 + psrld xmm6,SCALEBITS ; xmm6=YEL + psrld xmm4,SCALEBITS ; xmm4=YEH + packssdw xmm6,xmm4 ; xmm6=YE + + psllw xmm0,BYTE_BIT + por xmm6,xmm0 ; xmm6=Y + movdqa XMMWORD [rdi], xmm6 ; Save Y + + pxor xmm2,xmm2 + pxor xmm4,xmm4 + punpcklwd xmm2,xmm3 ; xmm2=REL + punpckhwd xmm4,xmm3 ; xmm4=REH + psrld xmm2,1 ; xmm2=REL*FIX(0.500) + psrld xmm4,1 ; xmm4=REH*FIX(0.500) + + movdqa xmm0,[rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ] + + paddd xmm1,xmm2 + paddd xmm5,xmm4 + paddd xmm1,xmm0 + paddd xmm5,xmm0 + psrld 
xmm1,SCALEBITS ; xmm1=CrEL + psrld xmm5,SCALEBITS ; xmm5=CrEH + packssdw xmm1,xmm5 ; xmm1=CrE + + psllw xmm7,BYTE_BIT + por xmm1,xmm7 ; xmm1=Cr + movdqa XMMWORD [rdx], xmm1 ; Save Cr + + sub rcx, byte SIZEOF_XMMWORD + add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr + add rdi, byte SIZEOF_XMMWORD ; outptr0 + add rbx, byte SIZEOF_XMMWORD ; outptr1 + add rdx, byte SIZEOF_XMMWORD ; outptr2 + cmp rcx, byte SIZEOF_XMMWORD + jae near .columnloop + test rcx,rcx + jnz near .column_ld1 + + pop rcx ; col + pop rsi + pop rdi + pop rbx + pop rdx + + add rsi, byte SIZEOF_JSAMPROW ; input_buf + add rdi, byte SIZEOF_JSAMPROW + add rbx, byte SIZEOF_JSAMPROW + add rdx, byte SIZEOF_JSAMPROW + dec rax ; num_rows + jg near .rowloop .return: - pop rbx - uncollect_args - mov rsp,rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp - pop rbp - ret + pop rbx + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jcclrss2.asm b/simd/jcclrss2.asm index 517b70563..220d016fc 100644 --- a/simd/jcclrss2.asm +++ b/simd/jcclrss2.asm @@ -25,479 +25,479 @@ ; JDIMENSION output_row, int num_rows); ; -%define img_width(b) (b)+8 ; JDIMENSION img_width -%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf -%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf -%define output_row(b) (b)+20 ; JDIMENSION output_row -%define num_rows(b) (b)+24 ; int num_rows +%define img_width(b) (b)+8 ; JDIMENSION img_width +%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf +%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf +%define output_row(b) (b)+20 ; JDIMENSION output_row +%define num_rows(b) (b)+24 ; int num_rows -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 8 -%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 8 +%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr - align 16 + align 16 - global EXTN(jsimd_rgb_ycc_convert_sse2) + global EXTN(jsimd_rgb_ycc_convert_sse2) EXTN(jsimd_rgb_ycc_convert_sse2): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic eax ; make a room for GOT address - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address - - mov ecx, JDIMENSION [img_width(eax)] - test ecx,ecx - jz near .return - - push ecx - - mov esi, JSAMPIMAGE [output_buf(eax)] - mov ecx, JDIMENSION [output_row(eax)] - mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] - mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY] - mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY] - lea edi, [edi+ecx*SIZEOF_JSAMPROW] - lea ebx, 
[ebx+ecx*SIZEOF_JSAMPROW] - lea edx, [edx+ecx*SIZEOF_JSAMPROW] - - pop ecx - - mov esi, JSAMPARRAY [input_buf(eax)] - mov eax, INT [num_rows(eax)] - test eax,eax - jle near .return - alignx 16,7 + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [img_width(eax)] + test ecx,ecx + jz near .return + + push ecx + + mov esi, JSAMPIMAGE [output_buf(eax)] + mov ecx, JDIMENSION [output_row(eax)] + mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY] + lea edi, [edi+ecx*SIZEOF_JSAMPROW] + lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] + lea edx, [edx+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov esi, JSAMPARRAY [input_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax,eax + jle near .return + alignx 16,7 .rowloop: - pushpic eax - push edx - push ebx - push edi - push esi - push ecx ; col - - mov esi, JSAMPROW [esi] ; inptr - mov edi, JSAMPROW [edi] ; outptr0 - mov ebx, JSAMPROW [ebx] ; outptr1 - mov edx, JSAMPROW [edx] ; outptr2 - movpic eax, POINTER [gotptr] ; load GOT address (eax) - - cmp ecx, byte SIZEOF_XMMWORD - jae near .columnloop - alignx 16,7 + pushpic eax + push edx + push ebx + push edi + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr0 + mov ebx, JSAMPROW [ebx] ; outptr1 + mov edx, JSAMPROW [edx] ; outptr2 + movpic eax, POINTER [gotptr] ; load GOT address (eax) + + cmp ecx, byte SIZEOF_XMMWORD + jae near .columnloop + alignx 16,7 %if RGB_PIXELSIZE == 3 ; --------------- .column_ld1: - push eax - push edx - lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE - 
test cl, SIZEOF_BYTE - jz short .column_ld2 - sub ecx, byte SIZEOF_BYTE - movzx eax, BYTE [esi+ecx] + push eax + push edx + lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub ecx, byte SIZEOF_BYTE + movzx eax, BYTE [esi+ecx] .column_ld2: - test cl, SIZEOF_WORD - jz short .column_ld4 - sub ecx, byte SIZEOF_WORD - movzx edx, WORD [esi+ecx] - shl eax, WORD_BIT - or eax,edx + test cl, SIZEOF_WORD + jz short .column_ld4 + sub ecx, byte SIZEOF_WORD + movzx edx, WORD [esi+ecx] + shl eax, WORD_BIT + or eax,edx .column_ld4: - movd xmmA,eax - pop edx - pop eax - test cl, SIZEOF_DWORD - jz short .column_ld8 - sub ecx, byte SIZEOF_DWORD - movd xmmF, XMM_DWORD [esi+ecx] - pslldq xmmA, SIZEOF_DWORD - por xmmA,xmmF + movd xmmA,eax + pop edx + pop eax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub ecx, byte SIZEOF_DWORD + movd xmmF, XMM_DWORD [esi+ecx] + pslldq xmmA, SIZEOF_DWORD + por xmmA,xmmF .column_ld8: - test cl, SIZEOF_MMWORD - jz short .column_ld16 - sub ecx, byte SIZEOF_MMWORD - movq xmmB, XMM_MMWORD [esi+ecx] - pslldq xmmA, SIZEOF_MMWORD - por xmmA,xmmB + test cl, SIZEOF_MMWORD + jz short .column_ld16 + sub ecx, byte SIZEOF_MMWORD + movq xmmB, XMM_MMWORD [esi+ecx] + pslldq xmmA, SIZEOF_MMWORD + por xmmA,xmmB .column_ld16: - test cl, SIZEOF_XMMWORD - jz short .column_ld32 - movdqa xmmF,xmmA - movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] - mov ecx, SIZEOF_XMMWORD - jmp short .rgb_ycc_cnv + test cl, SIZEOF_XMMWORD + jz short .column_ld32 + movdqa xmmF,xmmA + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + mov ecx, SIZEOF_XMMWORD + jmp short .rgb_ycc_cnv .column_ld32: - test cl, 2*SIZEOF_XMMWORD - mov ecx, SIZEOF_XMMWORD - jz short .rgb_ycc_cnv - movdqa xmmB,xmmA - movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] - movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] - jmp short .rgb_ycc_cnv - alignx 16,7 + test cl, 2*SIZEOF_XMMWORD + mov ecx, SIZEOF_XMMWORD + jz short .rgb_ycc_cnv + movdqa xmmB,xmmA + movdqu xmmA, XMMWORD 
[esi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] + jmp short .rgb_ycc_cnv + alignx 16,7 .columnloop: - movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] - movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] - movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD] + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD] .rgb_ycc_cnv: - ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) - ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) - ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) - movdqa xmmG,xmmA - pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) - psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) + movdqa xmmG,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) + psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) - punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) - pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) + punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) + pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) - punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) - punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) + punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) + punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) - movdqa xmmD,xmmA - pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) - psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) + movdqa xmmD,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) + psrldq xmmD,8 ; xmmD=(11 
19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) - punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) - pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) + punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) + pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) - punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) - punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) + punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) + punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) - movdqa xmmE,xmmA - pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) - psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) + movdqa xmmE,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) + psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) - punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) - pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) + punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) - punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) - punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) + punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) - pxor xmmH,xmmH + pxor xmmH,xmmH - movdqa xmmC,xmmA - punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) - punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) + movdqa xmmC,xmmA + punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) - movdqa xmmB,xmmE - punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) - punpckhbw 
xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) + movdqa xmmB,xmmE + punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) - movdqa xmmF,xmmD - punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) - punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) + movdqa xmmF,xmmD + punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) + punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) %else ; RGB_PIXELSIZE == 4 ; ----------- .column_ld1: - test cl, SIZEOF_XMMWORD/16 - jz short .column_ld2 - sub ecx, byte SIZEOF_XMMWORD/16 - movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE] + test cl, SIZEOF_XMMWORD/16 + jz short .column_ld2 + sub ecx, byte SIZEOF_XMMWORD/16 + movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE] .column_ld2: - test cl, SIZEOF_XMMWORD/8 - jz short .column_ld4 - sub ecx, byte SIZEOF_XMMWORD/8 - movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE] - pslldq xmmA, SIZEOF_MMWORD - por xmmA,xmmE + test cl, SIZEOF_XMMWORD/8 + jz short .column_ld4 + sub ecx, byte SIZEOF_XMMWORD/8 + movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE] + pslldq xmmA, SIZEOF_MMWORD + por xmmA,xmmE .column_ld4: - test cl, SIZEOF_XMMWORD/4 - jz short .column_ld8 - sub ecx, byte SIZEOF_XMMWORD/4 - movdqa xmmE,xmmA - movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE] + test cl, SIZEOF_XMMWORD/4 + jz short .column_ld8 + sub ecx, byte SIZEOF_XMMWORD/4 + movdqa xmmE,xmmA + movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE] .column_ld8: - test cl, SIZEOF_XMMWORD/2 - mov ecx, SIZEOF_XMMWORD - jz short .rgb_ycc_cnv - movdqa xmmF,xmmA - movdqa xmmH,xmmE - movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] - movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] - jmp short .rgb_ycc_cnv - alignx 16,7 + test cl, SIZEOF_XMMWORD/2 + mov ecx, SIZEOF_XMMWORD + jz short .rgb_ycc_cnv + movdqa xmmF,xmmA + movdqa xmmH,xmmE + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] + jmp short .rgb_ycc_cnv + alignx 16,7 .columnloop: - movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] - 
movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] - movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD] - movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD] + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD] + movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD] .rgb_ycc_cnv: - ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) - ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) - ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) - ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) - movdqa xmmD,xmmA - punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) - punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) + movdqa xmmD,xmmA + punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) + punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) - movdqa xmmC,xmmF - punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) - punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) + movdqa xmmC,xmmF + punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) + punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) - movdqa xmmB,xmmA - punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) - punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) + movdqa xmmB,xmmA + punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) + punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) - movdqa xmmG,xmmD - punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) - punpckhwd xmmG,xmmC ; xmmG=(03 
07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) + movdqa xmmG,xmmD + punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) + punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) - movdqa xmmE,xmmA - punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) - punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) + movdqa xmmE,xmmA + punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) - movdqa xmmH,xmmB - punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) - punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) + movdqa xmmH,xmmB + punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) + punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) - pxor xmmF,xmmF + pxor xmmF,xmmF - movdqa xmmC,xmmA - punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) - punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) + movdqa xmmC,xmmA + punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) - movdqa xmmD,xmmB - punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) - punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) + movdqa xmmD,xmmB + punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) - movdqa xmmG,xmmE - punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) - punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) + movdqa xmmG,xmmE + punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) - punpcklbw xmmF,xmmH - punpckhbw xmmH,xmmH - psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) - psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) + punpcklbw xmmF,xmmH + punpckhbw xmmH,xmmH + psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) + psrlw 
xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) %endif ; RGB_PIXELSIZE ; --------------- - ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE - ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO - - ; (Original) - ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B - ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE - ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE - ; - ; (This implementation) - ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G - ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE - ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE - - movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE - movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO - movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE - movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO - - movdqa xmm6,xmm1 - punpcklwd xmm1,xmm3 - punpckhwd xmm6,xmm3 - movdqa xmm7,xmm1 - movdqa xmm4,xmm6 - pmaddwd xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) - pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) - pmaddwd xmm7,[GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331) - pmaddwd xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331) - - movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337) - movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337) - - pxor xmm1,xmm1 - pxor xmm6,xmm6 - punpcklwd xmm1,xmm5 ; xmm1=BOL - punpckhwd xmm6,xmm5 ; xmm6=BOH - psrld xmm1,1 ; xmm1=BOL*FIX(0.500) - psrld xmm6,1 ; xmm6=BOH*FIX(0.500) - - movdqa xmm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ] - - paddd xmm7,xmm1 - paddd xmm4,xmm6 - paddd xmm7,xmm5 - paddd xmm4,xmm5 - psrld xmm7,SCALEBITS ; xmm7=CbOL - psrld xmm4,SCALEBITS ; xmm4=CbOH - packssdw xmm7,xmm4 ; xmm7=CbO - - movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE - - movdqa xmm6,xmm0 - punpcklwd xmm0,xmm2 - punpckhwd xmm6,xmm2 - movdqa xmm5,xmm0 - movdqa xmm4,xmm6 - pmaddwd xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; 
xmm0=REL*FIX(0.299)+GEL*FIX(0.337) - pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) - pmaddwd xmm5,[GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331) - pmaddwd xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331) - - movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337) - movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337) - - pxor xmm0,xmm0 - pxor xmm6,xmm6 - punpcklwd xmm0,xmm1 ; xmm0=BEL - punpckhwd xmm6,xmm1 ; xmm6=BEH - psrld xmm0,1 ; xmm0=BEL*FIX(0.500) - psrld xmm6,1 ; xmm6=BEH*FIX(0.500) - - movdqa xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ] - - paddd xmm5,xmm0 - paddd xmm4,xmm6 - paddd xmm5,xmm1 - paddd xmm4,xmm1 - psrld xmm5,SCALEBITS ; xmm5=CbEL - psrld xmm4,SCALEBITS ; xmm4=CbEH - packssdw xmm5,xmm4 ; xmm5=CbE - - psllw xmm7,BYTE_BIT - por xmm5,xmm7 ; xmm5=Cb - movdqa XMMWORD [ebx], xmm5 ; Save Cb - - movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO - movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE - movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO - - movdqa xmm4,xmm0 - punpcklwd xmm0,xmm3 - punpckhwd xmm4,xmm3 - movdqa xmm7,xmm0 - movdqa xmm5,xmm4 - pmaddwd xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) - pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) - pmaddwd xmm7,[GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418) - pmaddwd xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418) - - movdqa xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF] - - paddd xmm0, XMMWORD [wk(4)] - paddd xmm4, XMMWORD [wk(5)] - paddd xmm0,xmm3 - paddd xmm4,xmm3 - psrld xmm0,SCALEBITS ; xmm0=YOL - psrld xmm4,SCALEBITS ; xmm4=YOH - packssdw xmm0,xmm4 ; xmm0=YO - - pxor xmm3,xmm3 - pxor xmm4,xmm4 - punpcklwd xmm3,xmm1 ; xmm3=ROL - punpckhwd xmm4,xmm1 ; xmm4=ROH - psrld xmm3,1 ; xmm3=ROL*FIX(0.500) - psrld xmm4,1 ; xmm4=ROH*FIX(0.500) - - movdqa xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ] - - 
paddd xmm7,xmm3 - paddd xmm5,xmm4 - paddd xmm7,xmm1 - paddd xmm5,xmm1 - psrld xmm7,SCALEBITS ; xmm7=CrOL - psrld xmm5,SCALEBITS ; xmm5=CrOH - packssdw xmm7,xmm5 ; xmm7=CrO - - movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE - - movdqa xmm4,xmm6 - punpcklwd xmm6,xmm2 - punpckhwd xmm4,xmm2 - movdqa xmm1,xmm6 - movdqa xmm5,xmm4 - pmaddwd xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) - pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) - pmaddwd xmm1,[GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418) - pmaddwd xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418) - - movdqa xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF] - - paddd xmm6, XMMWORD [wk(6)] - paddd xmm4, XMMWORD [wk(7)] - paddd xmm6,xmm2 - paddd xmm4,xmm2 - psrld xmm6,SCALEBITS ; xmm6=YEL - psrld xmm4,SCALEBITS ; xmm4=YEH - packssdw xmm6,xmm4 ; xmm6=YE - - psllw xmm0,BYTE_BIT - por xmm6,xmm0 ; xmm6=Y - movdqa XMMWORD [edi], xmm6 ; Save Y - - pxor xmm2,xmm2 - pxor xmm4,xmm4 - punpcklwd xmm2,xmm3 ; xmm2=REL - punpckhwd xmm4,xmm3 ; xmm4=REH - psrld xmm2,1 ; xmm2=REL*FIX(0.500) - psrld xmm4,1 ; xmm4=REH*FIX(0.500) - - movdqa xmm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ] - - paddd xmm1,xmm2 - paddd xmm5,xmm4 - paddd xmm1,xmm0 - paddd xmm5,xmm0 - psrld xmm1,SCALEBITS ; xmm1=CrEL - psrld xmm5,SCALEBITS ; xmm5=CrEH - packssdw xmm1,xmm5 ; xmm1=CrE - - psllw xmm7,BYTE_BIT - por xmm1,xmm7 ; xmm1=Cr - movdqa XMMWORD [edx], xmm1 ; Save Cr - - sub ecx, byte SIZEOF_XMMWORD - add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr - add edi, byte SIZEOF_XMMWORD ; outptr0 - add ebx, byte SIZEOF_XMMWORD ; outptr1 - add edx, byte SIZEOF_XMMWORD ; outptr2 - cmp ecx, byte SIZEOF_XMMWORD - jae near .columnloop - test ecx,ecx - jnz near .column_ld1 - - pop ecx ; col - pop esi - pop edi - pop ebx - pop edx - poppic eax - - add esi, byte SIZEOF_JSAMPROW ; input_buf - add edi, byte SIZEOF_JSAMPROW - add ebx, byte SIZEOF_JSAMPROW - add edx, 
byte SIZEOF_JSAMPROW - dec eax ; num_rows - jg near .rowloop + ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE + ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE + movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO + movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE + movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO + + movdqa xmm6,xmm1 + punpcklwd xmm1,xmm3 + punpckhwd xmm6,xmm3 + movdqa xmm7,xmm1 + movdqa xmm4,xmm6 + pmaddwd xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) + pmaddwd xmm7,[GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331) + pmaddwd xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331) + + movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337) + movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337) + + pxor xmm1,xmm1 + pxor xmm6,xmm6 + punpcklwd xmm1,xmm5 ; xmm1=BOL + punpckhwd xmm6,xmm5 ; xmm6=BOH + psrld xmm1,1 ; xmm1=BOL*FIX(0.500) + psrld xmm6,1 ; xmm6=BOH*FIX(0.500) + + movdqa xmm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ] + + paddd xmm7,xmm1 + paddd xmm4,xmm6 + paddd xmm7,xmm5 + paddd xmm4,xmm5 + psrld xmm7,SCALEBITS ; xmm7=CbOL + psrld xmm4,SCALEBITS ; xmm4=CbOH + packssdw xmm7,xmm4 ; xmm7=CbO + + movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE + + movdqa xmm6,xmm0 + punpcklwd xmm0,xmm2 + punpckhwd xmm6,xmm2 + movdqa xmm5,xmm0 + movdqa xmm4,xmm6 + pmaddwd xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; 
xmm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) + pmaddwd xmm5,[GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331) + pmaddwd xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331) + + movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337) + movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337) + + pxor xmm0,xmm0 + pxor xmm6,xmm6 + punpcklwd xmm0,xmm1 ; xmm0=BEL + punpckhwd xmm6,xmm1 ; xmm6=BEH + psrld xmm0,1 ; xmm0=BEL*FIX(0.500) + psrld xmm6,1 ; xmm6=BEH*FIX(0.500) + + movdqa xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ] + + paddd xmm5,xmm0 + paddd xmm4,xmm6 + paddd xmm5,xmm1 + paddd xmm4,xmm1 + psrld xmm5,SCALEBITS ; xmm5=CbEL + psrld xmm4,SCALEBITS ; xmm4=CbEH + packssdw xmm5,xmm4 ; xmm5=CbE + + psllw xmm7,BYTE_BIT + por xmm5,xmm7 ; xmm5=Cb + movdqa XMMWORD [ebx], xmm5 ; Save Cb + + movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO + movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO + + movdqa xmm4,xmm0 + punpcklwd xmm0,xmm3 + punpckhwd xmm4,xmm3 + movdqa xmm7,xmm0 + movdqa xmm5,xmm4 + pmaddwd xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) + pmaddwd xmm7,[GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418) + pmaddwd xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418) + + movdqa xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF] + + paddd xmm0, XMMWORD [wk(4)] + paddd xmm4, XMMWORD [wk(5)] + paddd xmm0,xmm3 + paddd xmm4,xmm3 + psrld xmm0,SCALEBITS ; xmm0=YOL + psrld xmm4,SCALEBITS ; xmm4=YOH + packssdw xmm0,xmm4 ; xmm0=YO + + pxor xmm3,xmm3 + pxor xmm4,xmm4 + punpcklwd xmm3,xmm1 ; xmm3=ROL + punpckhwd xmm4,xmm1 ; xmm4=ROH + psrld xmm3,1 ; xmm3=ROL*FIX(0.500) + psrld xmm4,1 ; xmm4=ROH*FIX(0.500) + + movdqa xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ] + + 
paddd xmm7,xmm3 + paddd xmm5,xmm4 + paddd xmm7,xmm1 + paddd xmm5,xmm1 + psrld xmm7,SCALEBITS ; xmm7=CrOL + psrld xmm5,SCALEBITS ; xmm5=CrOH + packssdw xmm7,xmm5 ; xmm7=CrO + + movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE + + movdqa xmm4,xmm6 + punpcklwd xmm6,xmm2 + punpckhwd xmm4,xmm2 + movdqa xmm1,xmm6 + movdqa xmm5,xmm4 + pmaddwd xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) + pmaddwd xmm1,[GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418) + pmaddwd xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418) + + movdqa xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF] + + paddd xmm6, XMMWORD [wk(6)] + paddd xmm4, XMMWORD [wk(7)] + paddd xmm6,xmm2 + paddd xmm4,xmm2 + psrld xmm6,SCALEBITS ; xmm6=YEL + psrld xmm4,SCALEBITS ; xmm4=YEH + packssdw xmm6,xmm4 ; xmm6=YE + + psllw xmm0,BYTE_BIT + por xmm6,xmm0 ; xmm6=Y + movdqa XMMWORD [edi], xmm6 ; Save Y + + pxor xmm2,xmm2 + pxor xmm4,xmm4 + punpcklwd xmm2,xmm3 ; xmm2=REL + punpckhwd xmm4,xmm3 ; xmm4=REH + psrld xmm2,1 ; xmm2=REL*FIX(0.500) + psrld xmm4,1 ; xmm4=REH*FIX(0.500) + + movdqa xmm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ] + + paddd xmm1,xmm2 + paddd xmm5,xmm4 + paddd xmm1,xmm0 + paddd xmm5,xmm0 + psrld xmm1,SCALEBITS ; xmm1=CrEL + psrld xmm5,SCALEBITS ; xmm5=CrEH + packssdw xmm1,xmm5 ; xmm1=CrE + + psllw xmm7,BYTE_BIT + por xmm1,xmm7 ; xmm1=Cr + movdqa XMMWORD [edx], xmm1 ; Save Cr + + sub ecx, byte SIZEOF_XMMWORD + add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr + add edi, byte SIZEOF_XMMWORD ; outptr0 + add ebx, byte SIZEOF_XMMWORD ; outptr1 + add edx, byte SIZEOF_XMMWORD ; outptr2 + cmp ecx, byte SIZEOF_XMMWORD + jae near .columnloop + test ecx,ecx + jnz near .column_ld1 + + pop ecx ; col + pop esi + pop edi + pop ebx + pop edx + poppic eax + + add esi, byte SIZEOF_JSAMPROW ; input_buf + add edi, byte SIZEOF_JSAMPROW + add ebx, byte SIZEOF_JSAMPROW + add edx, 
byte SIZEOF_JSAMPROW + dec eax ; num_rows + jg near .rowloop .return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/jccolmmx.asm b/simd/jccolmmx.asm index 9650e47d4..87058ee1f 100644 --- a/simd/jccolmmx.asm +++ b/simd/jccolmmx.asm @@ -21,38 +21,38 @@ ; -------------------------------------------------------------------------- -%define SCALEBITS 16 - -F_0_081 equ 5329 ; FIX(0.08131) -F_0_114 equ 7471 ; FIX(0.11400) -F_0_168 equ 11059 ; FIX(0.16874) -F_0_250 equ 16384 ; FIX(0.25000) -F_0_299 equ 19595 ; FIX(0.29900) -F_0_331 equ 21709 ; FIX(0.33126) -F_0_418 equ 27439 ; FIX(0.41869) -F_0_587 equ 38470 ; FIX(0.58700) -F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) +%define SCALEBITS 16 + +F_0_081 equ 5329 ; FIX(0.08131) +F_0_114 equ 7471 ; FIX(0.11400) +F_0_168 equ 11059 ; FIX(0.16874) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_331 equ 21709 ; FIX(0.33126) +F_0_418 equ 27439 ; FIX(0.41869) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_rgb_ycc_convert_mmx) + alignz 16 + global EXTN(jconst_rgb_ycc_convert_mmx) EXTN(jconst_rgb_ycc_convert_mmx): -PW_F0299_F0337 times 2 dw F_0_299, F_0_337 -PW_F0114_F0250 times 2 dw F_0_114, F_0_250 -PW_MF016_MF033 times 2 dw -F_0_168,-F_0_331 -PW_MF008_MF041 times 2 dw -F_0_081,-F_0_418 -PD_ONEHALFM1_CJ times 2 dd (1 << (SCALEBITS-1)) - 1 + 
(CENTERJSAMPLE << SCALEBITS) -PD_ONEHALF times 2 dd (1 << (SCALEBITS-1)) +PW_F0299_F0337 times 2 dw F_0_299, F_0_337 +PW_F0114_F0250 times 2 dw F_0_114, F_0_250 +PW_MF016_MF033 times 2 dw -F_0_168,-F_0_331 +PW_MF008_MF041 times 2 dw -F_0_081,-F_0_418 +PD_ONEHALFM1_CJ times 2 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS) +PD_ONEHALF times 2 dd (1 << (SCALEBITS-1)) - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 %include "jcclrmmx.asm" diff --git a/simd/jccolss2-64.asm b/simd/jccolss2-64.asm index ae6014885..5b1ee78a6 100644 --- a/simd/jccolss2-64.asm +++ b/simd/jccolss2-64.asm @@ -18,38 +18,38 @@ ; -------------------------------------------------------------------------- -%define SCALEBITS 16 - -F_0_081 equ 5329 ; FIX(0.08131) -F_0_114 equ 7471 ; FIX(0.11400) -F_0_168 equ 11059 ; FIX(0.16874) -F_0_250 equ 16384 ; FIX(0.25000) -F_0_299 equ 19595 ; FIX(0.29900) -F_0_331 equ 21709 ; FIX(0.33126) -F_0_418 equ 27439 ; FIX(0.41869) -F_0_587 equ 38470 ; FIX(0.58700) -F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) +%define SCALEBITS 16 + +F_0_081 equ 5329 ; FIX(0.08131) +F_0_114 equ 7471 ; FIX(0.11400) +F_0_168 equ 11059 ; FIX(0.16874) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_331 equ 21709 ; FIX(0.33126) +F_0_418 equ 27439 ; FIX(0.41869) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_rgb_ycc_convert_sse2) + alignz 16 + global EXTN(jconst_rgb_ycc_convert_sse2) EXTN(jconst_rgb_ycc_convert_sse2): -PW_F0299_F0337 times 4 dw F_0_299, F_0_337 -PW_F0114_F0250 times 4 dw F_0_114, F_0_250 -PW_MF016_MF033 times 4 dw -F_0_168,-F_0_331 -PW_MF008_MF041 times 4 dw -F_0_081,-F_0_418 -PD_ONEHALFM1_CJ times 4 dd (1 << 
(SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS) -PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) +PW_F0299_F0337 times 4 dw F_0_299, F_0_337 +PW_F0114_F0250 times 4 dw F_0_114, F_0_250 +PW_MF016_MF033 times 4 dw -F_0_168,-F_0_331 +PW_MF008_MF041 times 4 dw -F_0_081,-F_0_418 +PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS) +PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 64 + SECTION SEG_TEXT + BITS 64 %include "jcclrss2-64.asm" diff --git a/simd/jccolss2.asm b/simd/jccolss2.asm index ac001d186..2b8faef79 100644 --- a/simd/jccolss2.asm +++ b/simd/jccolss2.asm @@ -18,38 +18,38 @@ ; -------------------------------------------------------------------------- -%define SCALEBITS 16 - -F_0_081 equ 5329 ; FIX(0.08131) -F_0_114 equ 7471 ; FIX(0.11400) -F_0_168 equ 11059 ; FIX(0.16874) -F_0_250 equ 16384 ; FIX(0.25000) -F_0_299 equ 19595 ; FIX(0.29900) -F_0_331 equ 21709 ; FIX(0.33126) -F_0_418 equ 27439 ; FIX(0.41869) -F_0_587 equ 38470 ; FIX(0.58700) -F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) +%define SCALEBITS 16 + +F_0_081 equ 5329 ; FIX(0.08131) +F_0_114 equ 7471 ; FIX(0.11400) +F_0_168 equ 11059 ; FIX(0.16874) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_331 equ 21709 ; FIX(0.33126) +F_0_418 equ 27439 ; FIX(0.41869) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_rgb_ycc_convert_sse2) + alignz 16 + global EXTN(jconst_rgb_ycc_convert_sse2) EXTN(jconst_rgb_ycc_convert_sse2): -PW_F0299_F0337 times 4 dw F_0_299, F_0_337 -PW_F0114_F0250 times 4 dw F_0_114, F_0_250 -PW_MF016_MF033 times 4 dw -F_0_168,-F_0_331 -PW_MF008_MF041 times 4 dw -F_0_081,-F_0_418 -PD_ONEHALFM1_CJ times 4 dd 
(1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS) -PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) +PW_F0299_F0337 times 4 dw F_0_299, F_0_337 +PW_F0114_F0250 times 4 dw F_0_114, F_0_250 +PW_MF016_MF033 times 4 dw -F_0_168,-F_0_331 +PW_MF008_MF041 times 4 dw -F_0_081,-F_0_418 +PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS) +PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 %include "jcclrss2.asm" diff --git a/simd/jcgrammx.asm b/simd/jcgrammx.asm index b8b8dd3ad..43ffd0f3f 100644 --- a/simd/jcgrammx.asm +++ b/simd/jcgrammx.asm @@ -21,31 +21,31 @@ ; -------------------------------------------------------------------------- -%define SCALEBITS 16 +%define SCALEBITS 16 -F_0_114 equ 7471 ; FIX(0.11400) -F_0_250 equ 16384 ; FIX(0.25000) -F_0_299 equ 19595 ; FIX(0.29900) -F_0_587 equ 38470 ; FIX(0.58700) -F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) +F_0_114 equ 7471 ; FIX(0.11400) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_rgb_gray_convert_mmx) + alignz 16 + global EXTN(jconst_rgb_gray_convert_mmx) EXTN(jconst_rgb_gray_convert_mmx): -PW_F0299_F0337 times 2 dw F_0_299, F_0_337 -PW_F0114_F0250 times 2 dw F_0_114, F_0_250 -PD_ONEHALF times 2 dd (1 << (SCALEBITS-1)) +PW_F0299_F0337 times 2 dw F_0_299, F_0_337 +PW_F0114_F0250 times 2 dw F_0_114, F_0_250 +PD_ONEHALF times 2 dd (1 << (SCALEBITS-1)) - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 %include "jcgrymmx.asm" diff --git 
a/simd/jcgrass2-64.asm b/simd/jcgrass2-64.asm index ba28cc31d..39236ffdd 100644 --- a/simd/jcgrass2-64.asm +++ b/simd/jcgrass2-64.asm @@ -18,31 +18,31 @@ ; -------------------------------------------------------------------------- -%define SCALEBITS 16 +%define SCALEBITS 16 -F_0_114 equ 7471 ; FIX(0.11400) -F_0_250 equ 16384 ; FIX(0.25000) -F_0_299 equ 19595 ; FIX(0.29900) -F_0_587 equ 38470 ; FIX(0.58700) -F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) +F_0_114 equ 7471 ; FIX(0.11400) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_587 equ 38470 ; FIX(0.58700) +F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_rgb_gray_convert_sse2) + alignz 16 + global EXTN(jconst_rgb_gray_convert_sse2) EXTN(jconst_rgb_gray_convert_sse2): -PW_F0299_F0337 times 4 dw F_0_299, F_0_337 -PW_F0114_F0250 times 4 dw F_0_114, F_0_250 -PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) +PW_F0299_F0337 times 4 dw F_0_299, F_0_337 +PW_F0114_F0250 times 4 dw F_0_114, F_0_250 +PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 64 + SECTION SEG_TEXT + BITS 64 %include "jcgryss2-64.asm" diff --git a/simd/jcgrass2.asm b/simd/jcgrass2.asm index 998968e76..f5bd93dd8 100644 --- a/simd/jcgrass2.asm +++ b/simd/jcgrass2.asm @@ -18,31 +18,31 @@ ; -------------------------------------------------------------------------- -%define SCALEBITS 16 +%define SCALEBITS 16 -F_0_114 equ 7471 ; FIX(0.11400) -F_0_250 equ 16384 ; FIX(0.25000) -F_0_299 equ 19595 ; FIX(0.29900) -F_0_587 equ 38470 ; FIX(0.58700) -F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) +F_0_114 equ 7471 ; FIX(0.11400) +F_0_250 equ 16384 ; FIX(0.25000) +F_0_299 equ 19595 ; FIX(0.29900) +F_0_587 equ 38470 ; FIX(0.58700) 
+F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_rgb_gray_convert_sse2) + alignz 16 + global EXTN(jconst_rgb_gray_convert_sse2) EXTN(jconst_rgb_gray_convert_sse2): -PW_F0299_F0337 times 4 dw F_0_299, F_0_337 -PW_F0114_F0250 times 4 dw F_0_114, F_0_250 -PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) +PW_F0299_F0337 times 4 dw F_0_299, F_0_337 +PW_F0114_F0250 times 4 dw F_0_114, F_0_250 +PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 %include "jcgryss2.asm" diff --git a/simd/jcgrymmx.asm b/simd/jcgrymmx.asm index bbeea09be..cbe56222c 100644 --- a/simd/jcgrymmx.asm +++ b/simd/jcgrymmx.asm @@ -29,329 +29,329 @@ ; JDIMENSION output_row, int num_rows); ; -%define img_width(b) (b)+8 ; JDIMENSION img_width -%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf -%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf -%define output_row(b) (b)+20 ; JDIMENSION output_row -%define num_rows(b) (b)+24 ; int num_rows +%define img_width(b) (b)+8 ; JDIMENSION img_width +%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf +%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf +%define output_row(b) (b)+20 ; JDIMENSION output_row +%define num_rows(b) (b)+24 ; int num_rows -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] -%define WK_NUM 2 -%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 2 +%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr - align 16 - global EXTN(jsimd_rgb_gray_convert_mmx) + align 16 + global EXTN(jsimd_rgb_gray_convert_mmx) EXTN(jsimd_rgb_gray_convert_mmx): - push ebp - mov eax,esp ; eax = original ebp - 
sub esp, byte 4 - and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic eax ; make a room for GOT address - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address - - mov ecx, JDIMENSION [img_width(eax)] ; num_cols - test ecx,ecx - jz near .return - - push ecx - - mov esi, JSAMPIMAGE [output_buf(eax)] - mov ecx, JDIMENSION [output_row(eax)] - mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] - lea edi, [edi+ecx*SIZEOF_JSAMPROW] - - pop ecx - - mov esi, JSAMPARRAY [input_buf(eax)] - mov eax, INT [num_rows(eax)] - test eax,eax - jle near .return - alignx 16,7 + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [img_width(eax)] ; num_cols + test ecx,ecx + jz near .return + + push ecx + + mov esi, JSAMPIMAGE [output_buf(eax)] + mov ecx, JDIMENSION [output_row(eax)] + mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] + lea edi, [edi+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov esi, JSAMPARRAY [input_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax,eax + jle near .return + alignx 16,7 .rowloop: - pushpic eax - push edi - push esi - push ecx ; col + pushpic eax + push edi + push esi + push ecx ; col - mov esi, JSAMPROW [esi] ; inptr - mov edi, JSAMPROW [edi] ; outptr0 - movpic eax, POINTER [gotptr] ; load GOT address (eax) + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr0 + movpic eax, POINTER [gotptr] ; load GOT address (eax) - cmp ecx, byte SIZEOF_MMWORD - jae short 
.columnloop - alignx 16,7 + cmp ecx, byte SIZEOF_MMWORD + jae short .columnloop + alignx 16,7 %if RGB_PIXELSIZE == 3 ; --------------- .column_ld1: - push eax - push edx - lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE - test cl, SIZEOF_BYTE - jz short .column_ld2 - sub ecx, byte SIZEOF_BYTE - xor eax,eax - mov al, BYTE [esi+ecx] + push eax + push edx + lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub ecx, byte SIZEOF_BYTE + xor eax,eax + mov al, BYTE [esi+ecx] .column_ld2: - test cl, SIZEOF_WORD - jz short .column_ld4 - sub ecx, byte SIZEOF_WORD - xor edx,edx - mov dx, WORD [esi+ecx] - shl eax, WORD_BIT - or eax,edx + test cl, SIZEOF_WORD + jz short .column_ld4 + sub ecx, byte SIZEOF_WORD + xor edx,edx + mov dx, WORD [esi+ecx] + shl eax, WORD_BIT + or eax,edx .column_ld4: - movd mmA,eax - pop edx - pop eax - test cl, SIZEOF_DWORD - jz short .column_ld8 - sub ecx, byte SIZEOF_DWORD - movd mmG, DWORD [esi+ecx] - psllq mmA, DWORD_BIT - por mmA,mmG + movd mmA,eax + pop edx + pop eax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub ecx, byte SIZEOF_DWORD + movd mmG, DWORD [esi+ecx] + psllq mmA, DWORD_BIT + por mmA,mmG .column_ld8: - test cl, SIZEOF_MMWORD - jz short .column_ld16 - movq mmG,mmA - movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] - mov ecx, SIZEOF_MMWORD - jmp short .rgb_gray_cnv + test cl, SIZEOF_MMWORD + jz short .column_ld16 + movq mmG,mmA + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + mov ecx, SIZEOF_MMWORD + jmp short .rgb_gray_cnv .column_ld16: - test cl, 2*SIZEOF_MMWORD - mov ecx, SIZEOF_MMWORD - jz short .rgb_gray_cnv - movq mmF,mmA - movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] - movq mmG, MMWORD [esi+1*SIZEOF_MMWORD] - jmp short .rgb_gray_cnv - alignx 16,7 + test cl, 2*SIZEOF_MMWORD + mov ecx, SIZEOF_MMWORD + jz short .rgb_gray_cnv + movq mmF,mmA + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmG, MMWORD [esi+1*SIZEOF_MMWORD] + jmp short .rgb_gray_cnv + alignx 16,7 .columnloop: - movq mmA, MMWORD 
[esi+0*SIZEOF_MMWORD] - movq mmG, MMWORD [esi+1*SIZEOF_MMWORD] - movq mmF, MMWORD [esi+2*SIZEOF_MMWORD] + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmG, MMWORD [esi+1*SIZEOF_MMWORD] + movq mmF, MMWORD [esi+2*SIZEOF_MMWORD] .rgb_gray_cnv: - ; mmA=(00 10 20 01 11 21 02 12) - ; mmG=(22 03 13 23 04 14 24 05) - ; mmF=(15 25 06 16 26 07 17 27) + ; mmA=(00 10 20 01 11 21 02 12) + ; mmG=(22 03 13 23 04 14 24 05) + ; mmF=(15 25 06 16 26 07 17 27) - movq mmD,mmA - psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 10 20 01) - psrlq mmD,4*BYTE_BIT ; mmD=(11 21 02 12 -- -- -- --) + movq mmD,mmA + psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 10 20 01) + psrlq mmD,4*BYTE_BIT ; mmD=(11 21 02 12 -- -- -- --) - punpckhbw mmA,mmG ; mmA=(00 04 10 14 20 24 01 05) - psllq mmG,4*BYTE_BIT ; mmG=(-- -- -- -- 22 03 13 23) + punpckhbw mmA,mmG ; mmA=(00 04 10 14 20 24 01 05) + psllq mmG,4*BYTE_BIT ; mmG=(-- -- -- -- 22 03 13 23) - punpcklbw mmD,mmF ; mmD=(11 15 21 25 02 06 12 16) - punpckhbw mmG,mmF ; mmG=(22 26 03 07 13 17 23 27) + punpcklbw mmD,mmF ; mmD=(11 15 21 25 02 06 12 16) + punpckhbw mmG,mmF ; mmG=(22 26 03 07 13 17 23 27) - movq mmE,mmA - psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 04 10 14) - psrlq mmE,4*BYTE_BIT ; mmE=(20 24 01 05 -- -- -- --) + movq mmE,mmA + psllq mmA,4*BYTE_BIT ; mmA=(-- -- -- -- 00 04 10 14) + psrlq mmE,4*BYTE_BIT ; mmE=(20 24 01 05 -- -- -- --) - punpckhbw mmA,mmD ; mmA=(00 02 04 06 10 12 14 16) - psllq mmD,4*BYTE_BIT ; mmD=(-- -- -- -- 11 15 21 25) + punpckhbw mmA,mmD ; mmA=(00 02 04 06 10 12 14 16) + psllq mmD,4*BYTE_BIT ; mmD=(-- -- -- -- 11 15 21 25) - punpcklbw mmE,mmG ; mmE=(20 22 24 26 01 03 05 07) - punpckhbw mmD,mmG ; mmD=(11 13 15 17 21 23 25 27) + punpcklbw mmE,mmG ; mmE=(20 22 24 26 01 03 05 07) + punpckhbw mmD,mmG ; mmD=(11 13 15 17 21 23 25 27) - pxor mmH,mmH + pxor mmH,mmH - movq mmC,mmA - punpcklbw mmA,mmH ; mmA=(00 02 04 06) - punpckhbw mmC,mmH ; mmC=(10 12 14 16) + movq mmC,mmA + punpcklbw mmA,mmH ; mmA=(00 02 04 06) + punpckhbw mmC,mmH ; mmC=(10 
12 14 16) - movq mmB,mmE - punpcklbw mmE,mmH ; mmE=(20 22 24 26) - punpckhbw mmB,mmH ; mmB=(01 03 05 07) + movq mmB,mmE + punpcklbw mmE,mmH ; mmE=(20 22 24 26) + punpckhbw mmB,mmH ; mmB=(01 03 05 07) - movq mmF,mmD - punpcklbw mmD,mmH ; mmD=(11 13 15 17) - punpckhbw mmF,mmH ; mmF=(21 23 25 27) + movq mmF,mmD + punpcklbw mmD,mmH ; mmD=(11 13 15 17) + punpckhbw mmF,mmH ; mmF=(21 23 25 27) %else ; RGB_PIXELSIZE == 4 ; ----------- .column_ld1: - test cl, SIZEOF_MMWORD/8 - jz short .column_ld2 - sub ecx, byte SIZEOF_MMWORD/8 - movd mmA, DWORD [esi+ecx*RGB_PIXELSIZE] + test cl, SIZEOF_MMWORD/8 + jz short .column_ld2 + sub ecx, byte SIZEOF_MMWORD/8 + movd mmA, DWORD [esi+ecx*RGB_PIXELSIZE] .column_ld2: - test cl, SIZEOF_MMWORD/4 - jz short .column_ld4 - sub ecx, byte SIZEOF_MMWORD/4 - movq mmF,mmA - movq mmA, MMWORD [esi+ecx*RGB_PIXELSIZE] + test cl, SIZEOF_MMWORD/4 + jz short .column_ld4 + sub ecx, byte SIZEOF_MMWORD/4 + movq mmF,mmA + movq mmA, MMWORD [esi+ecx*RGB_PIXELSIZE] .column_ld4: - test cl, SIZEOF_MMWORD/2 - mov ecx, SIZEOF_MMWORD - jz short .rgb_gray_cnv - movq mmD,mmA - movq mmC,mmF - movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] - movq mmF, MMWORD [esi+1*SIZEOF_MMWORD] - jmp short .rgb_gray_cnv - alignx 16,7 + test cl, SIZEOF_MMWORD/2 + mov ecx, SIZEOF_MMWORD + jz short .rgb_gray_cnv + movq mmD,mmA + movq mmC,mmF + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmF, MMWORD [esi+1*SIZEOF_MMWORD] + jmp short .rgb_gray_cnv + alignx 16,7 .columnloop: - movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] - movq mmF, MMWORD [esi+1*SIZEOF_MMWORD] - movq mmD, MMWORD [esi+2*SIZEOF_MMWORD] - movq mmC, MMWORD [esi+3*SIZEOF_MMWORD] + movq mmA, MMWORD [esi+0*SIZEOF_MMWORD] + movq mmF, MMWORD [esi+1*SIZEOF_MMWORD] + movq mmD, MMWORD [esi+2*SIZEOF_MMWORD] + movq mmC, MMWORD [esi+3*SIZEOF_MMWORD] .rgb_gray_cnv: - ; mmA=(00 10 20 30 01 11 21 31) - ; mmF=(02 12 22 32 03 13 23 33) - ; mmD=(04 14 24 34 05 15 25 35) - ; mmC=(06 16 26 36 07 17 27 37) + ; mmA=(00 10 20 30 01 11 21 31) + ; mmF=(02 12 22 
32 03 13 23 33) + ; mmD=(04 14 24 34 05 15 25 35) + ; mmC=(06 16 26 36 07 17 27 37) - movq mmB,mmA - punpcklbw mmA,mmF ; mmA=(00 02 10 12 20 22 30 32) - punpckhbw mmB,mmF ; mmB=(01 03 11 13 21 23 31 33) + movq mmB,mmA + punpcklbw mmA,mmF ; mmA=(00 02 10 12 20 22 30 32) + punpckhbw mmB,mmF ; mmB=(01 03 11 13 21 23 31 33) - movq mmG,mmD - punpcklbw mmD,mmC ; mmD=(04 06 14 16 24 26 34 36) - punpckhbw mmG,mmC ; mmG=(05 07 15 17 25 27 35 37) + movq mmG,mmD + punpcklbw mmD,mmC ; mmD=(04 06 14 16 24 26 34 36) + punpckhbw mmG,mmC ; mmG=(05 07 15 17 25 27 35 37) - movq mmE,mmA - punpcklwd mmA,mmD ; mmA=(00 02 04 06 10 12 14 16) - punpckhwd mmE,mmD ; mmE=(20 22 24 26 30 32 34 36) + movq mmE,mmA + punpcklwd mmA,mmD ; mmA=(00 02 04 06 10 12 14 16) + punpckhwd mmE,mmD ; mmE=(20 22 24 26 30 32 34 36) - movq mmH,mmB - punpcklwd mmB,mmG ; mmB=(01 03 05 07 11 13 15 17) - punpckhwd mmH,mmG ; mmH=(21 23 25 27 31 33 35 37) + movq mmH,mmB + punpcklwd mmB,mmG ; mmB=(01 03 05 07 11 13 15 17) + punpckhwd mmH,mmG ; mmH=(21 23 25 27 31 33 35 37) - pxor mmF,mmF + pxor mmF,mmF - movq mmC,mmA - punpcklbw mmA,mmF ; mmA=(00 02 04 06) - punpckhbw mmC,mmF ; mmC=(10 12 14 16) + movq mmC,mmA + punpcklbw mmA,mmF ; mmA=(00 02 04 06) + punpckhbw mmC,mmF ; mmC=(10 12 14 16) - movq mmD,mmB - punpcklbw mmB,mmF ; mmB=(01 03 05 07) - punpckhbw mmD,mmF ; mmD=(11 13 15 17) + movq mmD,mmB + punpcklbw mmB,mmF ; mmB=(01 03 05 07) + punpckhbw mmD,mmF ; mmD=(11 13 15 17) - movq mmG,mmE - punpcklbw mmE,mmF ; mmE=(20 22 24 26) - punpckhbw mmG,mmF ; mmG=(30 32 34 36) + movq mmG,mmE + punpcklbw mmE,mmF ; mmE=(20 22 24 26) + punpckhbw mmG,mmF ; mmG=(30 32 34 36) - punpcklbw mmF,mmH - punpckhbw mmH,mmH - psrlw mmF,BYTE_BIT ; mmF=(21 23 25 27) - psrlw mmH,BYTE_BIT ; mmH=(31 33 35 37) + punpcklbw mmF,mmH + punpckhbw mmH,mmH + psrlw mmF,BYTE_BIT ; mmF=(21 23 25 27) + psrlw mmH,BYTE_BIT ; mmH=(31 33 35 37) %endif ; RGB_PIXELSIZE ; --------------- - ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE - ; 
mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO - - ; (Original) - ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B - ; - ; (This implementation) - ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G - - movq mm6,mm1 - punpcklwd mm1,mm3 - punpckhwd mm6,mm3 - pmaddwd mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337) - pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337) - - movq mm7, mm6 ; mm7=ROH*FIX(0.299)+GOH*FIX(0.337) - - movq mm6,mm0 - punpcklwd mm0,mm2 - punpckhwd mm6,mm2 - pmaddwd mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337) - pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337) - - movq MMWORD [wk(0)], mm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) - movq MMWORD [wk(1)], mm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) - - movq mm0, mm5 ; mm0=BO - movq mm6, mm4 ; mm6=BE - - movq mm4,mm0 - punpcklwd mm0,mm3 - punpckhwd mm4,mm3 - pmaddwd mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250) - pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250) - - movq mm3,[GOTOFF(eax,PD_ONEHALF)] ; mm3=[PD_ONEHALF] - - paddd mm0, mm1 - paddd mm4, mm7 - paddd mm0,mm3 - paddd mm4,mm3 - psrld mm0,SCALEBITS ; mm0=YOL - psrld mm4,SCALEBITS ; mm4=YOH - packssdw mm0,mm4 ; mm0=YO - - movq mm4,mm6 - punpcklwd mm6,mm2 - punpckhwd mm4,mm2 - pmaddwd mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250) - pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250) - - movq mm2,[GOTOFF(eax,PD_ONEHALF)] ; mm2=[PD_ONEHALF] - - paddd mm6, MMWORD [wk(0)] - paddd mm4, MMWORD [wk(1)] - paddd mm6,mm2 - paddd mm4,mm2 - psrld mm6,SCALEBITS ; mm6=YEL - psrld mm4,SCALEBITS ; mm4=YEH - packssdw mm6,mm4 ; mm6=YE - - psllw mm0,BYTE_BIT - por mm6,mm0 ; mm6=Y - movq MMWORD [edi], mm6 ; Save Y - - sub ecx, byte SIZEOF_MMWORD - add esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; inptr - add edi, byte SIZEOF_MMWORD ; outptr0 - cmp ecx, byte 
SIZEOF_MMWORD - jae near .columnloop - test ecx,ecx - jnz near .column_ld1 - - pop ecx ; col - pop esi - pop edi - poppic eax - - add esi, byte SIZEOF_JSAMPROW ; input_buf - add edi, byte SIZEOF_JSAMPROW - dec eax ; num_rows - jg near .rowloop - - emms ; empty MMX state + ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE + ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + + movq mm6,mm1 + punpcklwd mm1,mm3 + punpckhwd mm6,mm3 + pmaddwd mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337) + + movq mm7, mm6 ; mm7=ROH*FIX(0.299)+GOH*FIX(0.337) + + movq mm6,mm0 + punpcklwd mm0,mm2 + punpckhwd mm6,mm2 + pmaddwd mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337) + + movq MMWORD [wk(0)], mm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) + movq MMWORD [wk(1)], mm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) + + movq mm0, mm5 ; mm0=BO + movq mm6, mm4 ; mm6=BE + + movq mm4,mm0 + punpcklwd mm0,mm3 + punpckhwd mm4,mm3 + pmaddwd mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250) + + movq mm3,[GOTOFF(eax,PD_ONEHALF)] ; mm3=[PD_ONEHALF] + + paddd mm0, mm1 + paddd mm4, mm7 + paddd mm0,mm3 + paddd mm4,mm3 + psrld mm0,SCALEBITS ; mm0=YOL + psrld mm4,SCALEBITS ; mm4=YOH + packssdw mm0,mm4 ; mm0=YO + + movq mm4,mm6 + punpcklwd mm6,mm2 + punpckhwd mm4,mm2 + pmaddwd mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250) + + movq mm2,[GOTOFF(eax,PD_ONEHALF)] ; mm2=[PD_ONEHALF] + + paddd mm6, MMWORD [wk(0)] + paddd mm4, MMWORD [wk(1)] + 
paddd mm6,mm2 + paddd mm4,mm2 + psrld mm6,SCALEBITS ; mm6=YEL + psrld mm4,SCALEBITS ; mm4=YEH + packssdw mm6,mm4 ; mm6=YE + + psllw mm0,BYTE_BIT + por mm6,mm0 ; mm6=Y + movq MMWORD [edi], mm6 ; Save Y + + sub ecx, byte SIZEOF_MMWORD + add esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; inptr + add edi, byte SIZEOF_MMWORD ; outptr0 + cmp ecx, byte SIZEOF_MMWORD + jae near .columnloop + test ecx,ecx + jnz near .column_ld1 + + pop ecx ; col + pop esi + pop edi + poppic eax + + add esi, byte SIZEOF_JSAMPROW ; input_buf + add edi, byte SIZEOF_JSAMPROW + dec eax ; num_rows + jg near .rowloop + + emms ; empty MMX state .return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jcgryss2-64.asm b/simd/jcgryss2-64.asm index 23ae8afb1..2308129e1 100644 --- a/simd/jcgryss2-64.asm +++ b/simd/jcgryss2-64.asm @@ -32,333 +32,333 @@ ; r13 = JDIMENSION output_row ; r14 = int num_rows -%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 2 +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 - align 16 + align 16 - global EXTN(jsimd_rgb_gray_convert_sse2) + global EXTN(jsimd_rgb_gray_convert_sse2) EXTN(jsimd_rgb_gray_convert_sse2): - push rbp - mov rax,rsp ; rax = original rbp - sub rsp, byte 4 - and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp],rax - mov rbp,rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - collect_args - push rbx - - mov rcx, r10 - test rcx,rcx - jz near .return - - push rcx - - mov rsi, r12 - mov rcx, r13 - mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY] - lea rdi, [rdi+rcx*SIZEOF_JSAMPROW] - - pop rcx - - mov rsi, r11 - mov eax, r14d - test rax,rax - jle near .return + push rbp + mov rax,rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args + push rbx + + mov rcx, r10 + test rcx,rcx + jz near .return + + push rcx + + mov rsi, r12 + mov rcx, r13 + mov rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY] + lea rdi, [rdi+rcx*SIZEOF_JSAMPROW] + + pop rcx + + mov rsi, r11 + mov eax, r14d + test rax,rax + jle near .return .rowloop: - push rdi - push rsi - push rcx ; col + push rdi + push rsi + push rcx ; col - mov rsi, JSAMPROW [rsi] ; inptr - mov rdi, JSAMPROW [rdi] ; outptr0 + mov rsi, JSAMPROW [rsi] ; inptr + mov rdi, JSAMPROW [rdi] ; outptr0 - cmp rcx, byte SIZEOF_XMMWORD - jae near .columnloop + cmp rcx, byte SIZEOF_XMMWORD + jae near .columnloop %if RGB_PIXELSIZE == 3 ; --------------- .column_ld1: - push rax - push rdx - lea rcx,[rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE - test cl, SIZEOF_BYTE - jz short 
.column_ld2 - sub rcx, byte SIZEOF_BYTE - movzx rax, BYTE [rsi+rcx] + push rax + push rdx + lea rcx,[rcx+rcx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub rcx, byte SIZEOF_BYTE + movzx rax, BYTE [rsi+rcx] .column_ld2: - test cl, SIZEOF_WORD - jz short .column_ld4 - sub rcx, byte SIZEOF_WORD - movzx rdx, WORD [rsi+rcx] - shl rax, WORD_BIT - or rax,rdx + test cl, SIZEOF_WORD + jz short .column_ld4 + sub rcx, byte SIZEOF_WORD + movzx rdx, WORD [rsi+rcx] + shl rax, WORD_BIT + or rax,rdx .column_ld4: - movd xmmA,eax - pop rdx - pop rax - test cl, SIZEOF_DWORD - jz short .column_ld8 - sub rcx, byte SIZEOF_DWORD - movd xmmF, XMM_DWORD [rsi+rcx] - pslldq xmmA, SIZEOF_DWORD - por xmmA,xmmF + movd xmmA,eax + pop rdx + pop rax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub rcx, byte SIZEOF_DWORD + movd xmmF, XMM_DWORD [rsi+rcx] + pslldq xmmA, SIZEOF_DWORD + por xmmA,xmmF .column_ld8: - test cl, SIZEOF_MMWORD - jz short .column_ld16 - sub rcx, byte SIZEOF_MMWORD - movq xmmB, XMM_MMWORD [rsi+rcx] - pslldq xmmA, SIZEOF_MMWORD - por xmmA,xmmB + test cl, SIZEOF_MMWORD + jz short .column_ld16 + sub rcx, byte SIZEOF_MMWORD + movq xmmB, XMM_MMWORD [rsi+rcx] + pslldq xmmA, SIZEOF_MMWORD + por xmmA,xmmB .column_ld16: - test cl, SIZEOF_XMMWORD - jz short .column_ld32 - movdqa xmmF,xmmA - movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] - mov rcx, SIZEOF_XMMWORD - jmp short .rgb_gray_cnv + test cl, SIZEOF_XMMWORD + jz short .column_ld32 + movdqa xmmF,xmmA + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + mov rcx, SIZEOF_XMMWORD + jmp short .rgb_gray_cnv .column_ld32: - test cl, 2*SIZEOF_XMMWORD - mov rcx, SIZEOF_XMMWORD - jz short .rgb_gray_cnv - movdqa xmmB,xmmA - movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] - movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] - jmp short .rgb_gray_cnv + test cl, 2*SIZEOF_XMMWORD + mov rcx, SIZEOF_XMMWORD + jz short .rgb_gray_cnv + movdqa xmmB,xmmA + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD 
[rsi+1*SIZEOF_XMMWORD] + jmp short .rgb_gray_cnv .columnloop: - movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] - movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] - movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD] + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD] + movdqu xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD] .rgb_gray_cnv: - ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) - ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) - ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) - movdqa xmmG,xmmA - pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) - psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) + movdqa xmmG,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) + psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) - punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) - pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) + punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) + pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) - punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) - punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) + punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) + punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) - movdqa xmmD,xmmA - pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) - psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) + movdqa xmmD,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) + psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) - punpckhbw 
xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) - pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) + punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) + pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) - punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) - punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) + punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) + punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) - movdqa xmmE,xmmA - pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) - psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) + movdqa xmmE,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) + psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) - punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) - pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) + punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) - punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) - punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) + punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) - pxor xmmH,xmmH + pxor xmmH,xmmH - movdqa xmmC,xmmA - punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) - punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) + movdqa xmmC,xmmA + punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) - movdqa xmmB,xmmE - punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) - punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) + movdqa xmmB,xmmE + 
punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) - movdqa xmmF,xmmD - punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) - punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) + movdqa xmmF,xmmD + punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) + punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) %else ; RGB_PIXELSIZE == 4 ; ----------- .column_ld1: - test cl, SIZEOF_XMMWORD/16 - jz short .column_ld2 - sub rcx, byte SIZEOF_XMMWORD/16 - movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE] + test cl, SIZEOF_XMMWORD/16 + jz short .column_ld2 + sub rcx, byte SIZEOF_XMMWORD/16 + movd xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE] .column_ld2: - test cl, SIZEOF_XMMWORD/8 - jz short .column_ld4 - sub rcx, byte SIZEOF_XMMWORD/8 - movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE] - pslldq xmmA, SIZEOF_MMWORD - por xmmA,xmmE + test cl, SIZEOF_XMMWORD/8 + jz short .column_ld4 + sub rcx, byte SIZEOF_XMMWORD/8 + movq xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE] + pslldq xmmA, SIZEOF_MMWORD + por xmmA,xmmE .column_ld4: - test cl, SIZEOF_XMMWORD/4 - jz short .column_ld8 - sub rcx, byte SIZEOF_XMMWORD/4 - movdqa xmmE,xmmA - movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE] + test cl, SIZEOF_XMMWORD/4 + jz short .column_ld8 + sub rcx, byte SIZEOF_XMMWORD/4 + movdqa xmmE,xmmA + movdqu xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE] .column_ld8: - test cl, SIZEOF_XMMWORD/2 - mov rcx, SIZEOF_XMMWORD - jz short .rgb_gray_cnv - movdqa xmmF,xmmA - movdqa xmmH,xmmE - movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] - movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] - jmp short .rgb_gray_cnv + test cl, SIZEOF_XMMWORD/2 + mov rcx, SIZEOF_XMMWORD + jz short .rgb_gray_cnv + movdqa xmmF,xmmA + movdqa xmmH,xmmE + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] + jmp short .rgb_gray_cnv .columnloop: - movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] - movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] - movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD] 
- movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD] + movdqu xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD] + movdqu xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD] .rgb_gray_cnv: - ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) - ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) - ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) - ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) - movdqa xmmD,xmmA - punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) - punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) + movdqa xmmD,xmmA + punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) + punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) - movdqa xmmC,xmmF - punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) - punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) + movdqa xmmC,xmmF + punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) + punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) - movdqa xmmB,xmmA - punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) - punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) + movdqa xmmB,xmmA + punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) + punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) - movdqa xmmG,xmmD - punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) - punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) + movdqa xmmG,xmmD + punpcklwd xmmD,xmmC ; 
xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) + punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) - movdqa xmmE,xmmA - punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) - punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) + movdqa xmmE,xmmA + punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) - movdqa xmmH,xmmB - punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) - punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) + movdqa xmmH,xmmB + punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) + punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) - pxor xmmF,xmmF + pxor xmmF,xmmF - movdqa xmmC,xmmA - punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) - punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) + movdqa xmmC,xmmA + punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) - movdqa xmmD,xmmB - punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) - punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) + movdqa xmmD,xmmB + punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) - movdqa xmmG,xmmE - punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) - punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) + movdqa xmmG,xmmE + punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) - punpcklbw xmmF,xmmH - punpckhbw xmmH,xmmH - psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) - psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) + punpcklbw xmmF,xmmH + punpckhbw xmmH,xmmH + psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) + psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) %endif ; RGB_PIXELSIZE ; --------------- 
- ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE - ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO - - ; (Original) - ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B - ; - ; (This implementation) - ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G - - movdqa xmm6,xmm1 - punpcklwd xmm1,xmm3 - punpckhwd xmm6,xmm3 - pmaddwd xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) - pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) - - movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337) - - movdqa xmm6,xmm0 - punpcklwd xmm0,xmm2 - punpckhwd xmm6,xmm2 - pmaddwd xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) - pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) - - movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) - movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) - - movdqa xmm0, xmm5 ; xmm0=BO - movdqa xmm6, xmm4 ; xmm6=BE - - movdqa xmm4,xmm0 - punpcklwd xmm0,xmm3 - punpckhwd xmm4,xmm3 - pmaddwd xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) - pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) - - movdqa xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF] - - paddd xmm0, xmm1 - paddd xmm4, xmm7 - paddd xmm0,xmm3 - paddd xmm4,xmm3 - psrld xmm0,SCALEBITS ; xmm0=YOL - psrld xmm4,SCALEBITS ; xmm4=YOH - packssdw xmm0,xmm4 ; xmm0=YO - - movdqa xmm4,xmm6 - punpcklwd xmm6,xmm2 - punpckhwd xmm4,xmm2 - pmaddwd xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) - pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) - - movdqa xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF] - - paddd xmm6, XMMWORD [wk(0)] - paddd xmm4, XMMWORD [wk(1)] - paddd xmm6,xmm2 - paddd xmm4,xmm2 - psrld xmm6,SCALEBITS ; xmm6=YEL - psrld xmm4,SCALEBITS ; xmm4=YEH - packssdw xmm6,xmm4 ; xmm6=YE - - psllw xmm0,BYTE_BIT - por xmm6,xmm0 ; xmm6=Y - movdqa XMMWORD [rdi], xmm6 ; Save Y - - sub rcx, byte SIZEOF_XMMWORD - add 
rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr - add rdi, byte SIZEOF_XMMWORD ; outptr0 - cmp rcx, byte SIZEOF_XMMWORD - jae near .columnloop - test rcx,rcx - jnz near .column_ld1 - - pop rcx ; col - pop rsi - pop rdi - - add rsi, byte SIZEOF_JSAMPROW ; input_buf - add rdi, byte SIZEOF_JSAMPROW - dec rax ; num_rows - jg near .rowloop + ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE + ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + + movdqa xmm6,xmm1 + punpcklwd xmm1,xmm3 + punpckhwd xmm6,xmm3 + pmaddwd xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) + + movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337) + + movdqa xmm6,xmm0 + punpcklwd xmm0,xmm2 + punpckhwd xmm6,xmm2 + pmaddwd xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) + + movdqa xmm0, xmm5 ; xmm0=BO + movdqa xmm6, xmm4 ; xmm6=BE + + movdqa xmm4,xmm0 + punpcklwd xmm0,xmm3 + punpckhwd xmm4,xmm3 + pmaddwd xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) + + movdqa xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF] + + paddd xmm0, xmm1 + paddd xmm4, xmm7 + paddd xmm0,xmm3 + paddd xmm4,xmm3 + psrld xmm0,SCALEBITS ; xmm0=YOL + psrld xmm4,SCALEBITS ; xmm4=YOH + packssdw xmm0,xmm4 ; xmm0=YO + + movdqa xmm4,xmm6 + punpcklwd xmm6,xmm2 + punpckhwd xmm4,xmm2 + pmaddwd xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) + + movdqa xmm2,[rel PD_ONEHALF] ; 
xmm2=[PD_ONEHALF] + + paddd xmm6, XMMWORD [wk(0)] + paddd xmm4, XMMWORD [wk(1)] + paddd xmm6,xmm2 + paddd xmm4,xmm2 + psrld xmm6,SCALEBITS ; xmm6=YEL + psrld xmm4,SCALEBITS ; xmm4=YEH + packssdw xmm6,xmm4 ; xmm6=YE + + psllw xmm0,BYTE_BIT + por xmm6,xmm0 ; xmm6=Y + movdqa XMMWORD [rdi], xmm6 ; Save Y + + sub rcx, byte SIZEOF_XMMWORD + add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr + add rdi, byte SIZEOF_XMMWORD ; outptr0 + cmp rcx, byte SIZEOF_XMMWORD + jae near .columnloop + test rcx,rcx + jnz near .column_ld1 + + pop rcx ; col + pop rsi + pop rdi + + add rsi, byte SIZEOF_JSAMPROW ; input_buf + add rdi, byte SIZEOF_JSAMPROW + dec rax ; num_rows + jg near .rowloop .return: - pop rbx - uncollect_args - mov rsp,rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp - pop rbp - ret + pop rbx + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jcgryss2.asm b/simd/jcgryss2.asm index c29428793..b5125c1ba 100644 --- a/simd/jcgryss2.asm +++ b/simd/jcgryss2.asm @@ -26,358 +26,358 @@ ; JDIMENSION output_row, int num_rows); ; -%define img_width(b) (b)+8 ; JDIMENSION img_width -%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf -%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf -%define output_row(b) (b)+20 ; JDIMENSION output_row -%define num_rows(b) (b)+24 ; int num_rows +%define img_width(b) (b)+8 ; JDIMENSION img_width +%define input_buf(b) (b)+12 ; JSAMPARRAY input_buf +%define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf +%define output_row(b) (b)+20 ; JDIMENSION output_row +%define num_rows(b) (b)+24 ; int num_rows -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 2 -%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 +%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr - align 16 + align 16 - global EXTN(jsimd_rgb_gray_convert_sse2) + global EXTN(jsimd_rgb_gray_convert_sse2) EXTN(jsimd_rgb_gray_convert_sse2): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic eax ; make a room for GOT address - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address - - mov ecx, JDIMENSION [img_width(eax)] - test ecx,ecx - jz near .return - - push ecx - - mov esi, JSAMPIMAGE [output_buf(eax)] - mov ecx, JDIMENSION [output_row(eax)] - mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] - lea edi, [edi+ecx*SIZEOF_JSAMPROW] - - pop ecx - - mov esi, JSAMPARRAY [input_buf(eax)] - mov eax, INT [num_rows(eax)] - test eax,eax - jle near .return - 
alignx 16,7 + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [img_width(eax)] + test ecx,ecx + jz near .return + + push ecx + + mov esi, JSAMPIMAGE [output_buf(eax)] + mov ecx, JDIMENSION [output_row(eax)] + mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] + lea edi, [edi+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov esi, JSAMPARRAY [input_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax,eax + jle near .return + alignx 16,7 .rowloop: - pushpic eax - push edi - push esi - push ecx ; col + pushpic eax + push edi + push esi + push ecx ; col - mov esi, JSAMPROW [esi] ; inptr - mov edi, JSAMPROW [edi] ; outptr0 - movpic eax, POINTER [gotptr] ; load GOT address (eax) + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr0 + movpic eax, POINTER [gotptr] ; load GOT address (eax) - cmp ecx, byte SIZEOF_XMMWORD - jae near .columnloop - alignx 16,7 + cmp ecx, byte SIZEOF_XMMWORD + jae near .columnloop + alignx 16,7 %if RGB_PIXELSIZE == 3 ; --------------- .column_ld1: - push eax - push edx - lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE - test cl, SIZEOF_BYTE - jz short .column_ld2 - sub ecx, byte SIZEOF_BYTE - movzx eax, BYTE [esi+ecx] + push eax + push edx + lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub ecx, byte SIZEOF_BYTE + movzx eax, BYTE [esi+ecx] .column_ld2: - test cl, SIZEOF_WORD - jz short .column_ld4 - sub ecx, byte SIZEOF_WORD - movzx edx, WORD [esi+ecx] - shl eax, WORD_BIT - or eax,edx + test cl, SIZEOF_WORD + jz short .column_ld4 + sub ecx, byte SIZEOF_WORD + movzx edx, WORD [esi+ecx] + shl eax, WORD_BIT + or 
eax,edx .column_ld4: - movd xmmA,eax - pop edx - pop eax - test cl, SIZEOF_DWORD - jz short .column_ld8 - sub ecx, byte SIZEOF_DWORD - movd xmmF, XMM_DWORD [esi+ecx] - pslldq xmmA, SIZEOF_DWORD - por xmmA,xmmF + movd xmmA,eax + pop edx + pop eax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub ecx, byte SIZEOF_DWORD + movd xmmF, XMM_DWORD [esi+ecx] + pslldq xmmA, SIZEOF_DWORD + por xmmA,xmmF .column_ld8: - test cl, SIZEOF_MMWORD - jz short .column_ld16 - sub ecx, byte SIZEOF_MMWORD - movq xmmB, XMM_MMWORD [esi+ecx] - pslldq xmmA, SIZEOF_MMWORD - por xmmA,xmmB + test cl, SIZEOF_MMWORD + jz short .column_ld16 + sub ecx, byte SIZEOF_MMWORD + movq xmmB, XMM_MMWORD [esi+ecx] + pslldq xmmA, SIZEOF_MMWORD + por xmmA,xmmB .column_ld16: - test cl, SIZEOF_XMMWORD - jz short .column_ld32 - movdqa xmmF,xmmA - movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] - mov ecx, SIZEOF_XMMWORD - jmp short .rgb_gray_cnv + test cl, SIZEOF_XMMWORD + jz short .column_ld32 + movdqa xmmF,xmmA + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + mov ecx, SIZEOF_XMMWORD + jmp short .rgb_gray_cnv .column_ld32: - test cl, 2*SIZEOF_XMMWORD - mov ecx, SIZEOF_XMMWORD - jz short .rgb_gray_cnv - movdqa xmmB,xmmA - movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] - movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] - jmp short .rgb_gray_cnv - alignx 16,7 + test cl, 2*SIZEOF_XMMWORD + mov ecx, SIZEOF_XMMWORD + jz short .rgb_gray_cnv + movdqa xmmB,xmmA + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] + jmp short .rgb_gray_cnv + alignx 16,7 .columnloop: - movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] - movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] - movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD] + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD] .rgb_gray_cnv: - ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) - ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) - ; xmmB=(2A 0B 1B 2B 0C 1C 
2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) - movdqa xmmG,xmmA - pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) - psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) + movdqa xmmG,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) + psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) - punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) - pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) + punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) + pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) - punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) - punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) + punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) + punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) - movdqa xmmD,xmmA - pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) - psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) + movdqa xmmD,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) + psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) - punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) - pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) + punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) + pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) - punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) - punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) + punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 
29 2D 02 06 0A 0E 12 16 1A 1E) + punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) - movdqa xmmE,xmmA - pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) - psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) + movdqa xmmE,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) + psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) - punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) - pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) + punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) - punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) - punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) + punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) - pxor xmmH,xmmH + pxor xmmH,xmmH - movdqa xmmC,xmmA - punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) - punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) + movdqa xmmC,xmmA + punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) - movdqa xmmB,xmmE - punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) - punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) + movdqa xmmB,xmmE + punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) - movdqa xmmF,xmmD - punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) - punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) + movdqa xmmF,xmmD + punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) + punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) %else ; RGB_PIXELSIZE == 4 ; ----------- .column_ld1: - test cl, SIZEOF_XMMWORD/16 - jz short .column_ld2 - sub ecx, byte 
SIZEOF_XMMWORD/16 - movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE] + test cl, SIZEOF_XMMWORD/16 + jz short .column_ld2 + sub ecx, byte SIZEOF_XMMWORD/16 + movd xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE] .column_ld2: - test cl, SIZEOF_XMMWORD/8 - jz short .column_ld4 - sub ecx, byte SIZEOF_XMMWORD/8 - movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE] - pslldq xmmA, SIZEOF_MMWORD - por xmmA,xmmE + test cl, SIZEOF_XMMWORD/8 + jz short .column_ld4 + sub ecx, byte SIZEOF_XMMWORD/8 + movq xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE] + pslldq xmmA, SIZEOF_MMWORD + por xmmA,xmmE .column_ld4: - test cl, SIZEOF_XMMWORD/4 - jz short .column_ld8 - sub ecx, byte SIZEOF_XMMWORD/4 - movdqa xmmE,xmmA - movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE] + test cl, SIZEOF_XMMWORD/4 + jz short .column_ld8 + sub ecx, byte SIZEOF_XMMWORD/4 + movdqa xmmE,xmmA + movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE] .column_ld8: - test cl, SIZEOF_XMMWORD/2 - mov ecx, SIZEOF_XMMWORD - jz short .rgb_gray_cnv - movdqa xmmF,xmmA - movdqa xmmH,xmmE - movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] - movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] - jmp short .rgb_gray_cnv - alignx 16,7 + test cl, SIZEOF_XMMWORD/2 + mov ecx, SIZEOF_XMMWORD + jz short .rgb_gray_cnv + movdqa xmmF,xmmA + movdqa xmmH,xmmE + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] + jmp short .rgb_gray_cnv + alignx 16,7 .columnloop: - movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] - movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] - movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD] - movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD] + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD] + movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD] .rgb_gray_cnv: - ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) - ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) - ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) - ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 
1E 2E 3E 0F 1F 2F 3F) + ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) - movdqa xmmD,xmmA - punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) - punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) + movdqa xmmD,xmmA + punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) + punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) - movdqa xmmC,xmmF - punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) - punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) + movdqa xmmC,xmmF + punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) + punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) - movdqa xmmB,xmmA - punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) - punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) + movdqa xmmB,xmmA + punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) + punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) - movdqa xmmG,xmmD - punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) - punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) + movdqa xmmG,xmmD + punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) + punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) - movdqa xmmE,xmmA - punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) - punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) + movdqa xmmE,xmmA + punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 
2E 30 32 34 36 38 3A 3C 3E) - movdqa xmmH,xmmB - punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) - punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) + movdqa xmmH,xmmB + punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) + punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) - pxor xmmF,xmmF + pxor xmmF,xmmF - movdqa xmmC,xmmA - punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) - punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) + movdqa xmmC,xmmA + punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) - movdqa xmmD,xmmB - punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) - punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) + movdqa xmmD,xmmB + punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) - movdqa xmmG,xmmE - punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) - punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) + movdqa xmmG,xmmE + punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) - punpcklbw xmmF,xmmH - punpckhbw xmmH,xmmH - psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) - psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) + punpcklbw xmmF,xmmH + punpckhbw xmmH,xmmH + psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) + psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) %endif ; RGB_PIXELSIZE ; --------------- - ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE - ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO - - ; (Original) - ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B - ; - ; (This implementation) - ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G - - movdqa xmm6,xmm1 - punpcklwd xmm1,xmm3 - punpckhwd xmm6,xmm3 - pmaddwd xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) - pmaddwd 
xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) - - movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337) - - movdqa xmm6,xmm0 - punpcklwd xmm0,xmm2 - punpckhwd xmm6,xmm2 - pmaddwd xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) - pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) - - movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) - movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) - - movdqa xmm0, xmm5 ; xmm0=BO - movdqa xmm6, xmm4 ; xmm6=BE - - movdqa xmm4,xmm0 - punpcklwd xmm0,xmm3 - punpckhwd xmm4,xmm3 - pmaddwd xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) - pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) - - movdqa xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF] - - paddd xmm0, xmm1 - paddd xmm4, xmm7 - paddd xmm0,xmm3 - paddd xmm4,xmm3 - psrld xmm0,SCALEBITS ; xmm0=YOL - psrld xmm4,SCALEBITS ; xmm4=YOH - packssdw xmm0,xmm4 ; xmm0=YO - - movdqa xmm4,xmm6 - punpcklwd xmm6,xmm2 - punpckhwd xmm4,xmm2 - pmaddwd xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) - pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) - - movdqa xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF] - - paddd xmm6, XMMWORD [wk(0)] - paddd xmm4, XMMWORD [wk(1)] - paddd xmm6,xmm2 - paddd xmm4,xmm2 - psrld xmm6,SCALEBITS ; xmm6=YEL - psrld xmm4,SCALEBITS ; xmm4=YEH - packssdw xmm6,xmm4 ; xmm6=YE - - psllw xmm0,BYTE_BIT - por xmm6,xmm0 ; xmm6=Y - movdqa XMMWORD [edi], xmm6 ; Save Y - - sub ecx, byte SIZEOF_XMMWORD - add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr - add edi, byte SIZEOF_XMMWORD ; outptr0 - cmp ecx, byte SIZEOF_XMMWORD - jae near .columnloop - test ecx,ecx - jnz near .column_ld1 - - pop ecx ; col - pop esi - pop edi - poppic eax - - add esi, byte SIZEOF_JSAMPROW ; input_buf - add edi, byte SIZEOF_JSAMPROW - dec eax ; num_rows - jg near .rowloop + ; 
xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE + ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; + ; (This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + + movdqa xmm6,xmm1 + punpcklwd xmm1,xmm3 + punpckhwd xmm6,xmm3 + pmaddwd xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) + + movdqa xmm7, xmm6 ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337) + + movdqa xmm6,xmm0 + punpcklwd xmm0,xmm2 + punpckhwd xmm6,xmm2 + pmaddwd xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337) + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337) + + movdqa xmm0, xmm5 ; xmm0=BO + movdqa xmm6, xmm4 ; xmm6=BE + + movdqa xmm4,xmm0 + punpcklwd xmm0,xmm3 + punpckhwd xmm4,xmm3 + pmaddwd xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) + + movdqa xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF] + + paddd xmm0, xmm1 + paddd xmm4, xmm7 + paddd xmm0,xmm3 + paddd xmm4,xmm3 + psrld xmm0,SCALEBITS ; xmm0=YOL + psrld xmm4,SCALEBITS ; xmm4=YOH + packssdw xmm0,xmm4 ; xmm0=YO + + movdqa xmm4,xmm6 + punpcklwd xmm6,xmm2 + punpckhwd xmm4,xmm2 + pmaddwd xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) + + movdqa xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF] + + paddd xmm6, XMMWORD [wk(0)] + paddd xmm4, XMMWORD [wk(1)] + paddd xmm6,xmm2 + paddd xmm4,xmm2 + psrld xmm6,SCALEBITS ; xmm6=YEL + psrld xmm4,SCALEBITS ; xmm4=YEH + packssdw xmm6,xmm4 ; xmm6=YE + + psllw xmm0,BYTE_BIT + por xmm6,xmm0 ; xmm6=Y + 
movdqa XMMWORD [edi], xmm6 ; Save Y + + sub ecx, byte SIZEOF_XMMWORD + add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr + add edi, byte SIZEOF_XMMWORD ; outptr0 + cmp ecx, byte SIZEOF_XMMWORD + jae near .columnloop + test ecx,ecx + jnz near .column_ld1 + + pop ecx ; col + pop esi + pop edi + poppic eax + + add esi, byte SIZEOF_JSAMPROW ; input_buf + add edi, byte SIZEOF_JSAMPROW + dec eax ; num_rows + jg near .rowloop .return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/jcqnt3dn.asm b/simd/jcqnt3dn.asm index 182c86952..0b8ec17ed 100644 --- a/simd/jcqnt3dn.asm +++ b/simd/jcqnt3dn.asm @@ -20,8 +20,8 @@ %include "jdct.inc" ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 ; ; Load data into workspace, applying unsigned->signed conversion ; @@ -30,98 +30,98 @@ ; FAST_FLOAT * workspace); ; -%define sample_data ebp+8 ; JSAMPARRAY sample_data -%define start_col ebp+12 ; JDIMENSION start_col -%define workspace ebp+16 ; FAST_FLOAT * workspace +%define sample_data ebp+8 ; JSAMPARRAY sample_data +%define start_col ebp+12 ; JDIMENSION start_col +%define workspace ebp+16 ; FAST_FLOAT * workspace - align 16 - global EXTN(jsimd_convsamp_float_3dnow) + align 16 + global EXTN(jsimd_convsamp_float_3dnow) EXTN(jsimd_convsamp_float_3dnow): - push ebp - mov ebp,esp - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - pcmpeqw mm7,mm7 - psllw mm7,7 - packsswb mm7,mm7 ; mm7 = 
PB_CENTERJSAMPLE (0x808080..) - - mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) - mov eax, JDIMENSION [start_col] - mov edi, POINTER [workspace] ; (DCTELEM *) - mov ecx, DCTSIZE/2 - alignx 16,7 + push ebp + mov ebp,esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + pcmpeqw mm7,mm7 + psllw mm7,7 + packsswb mm7,mm7 ; mm7 = PB_CENTERJSAMPLE (0x808080..) + + mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) + mov eax, JDIMENSION [start_col] + mov edi, POINTER [workspace] ; (DCTELEM *) + mov ecx, DCTSIZE/2 + alignx 16,7 .convloop: - mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) - mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) - - movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] - movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] - - psubb mm0,mm7 ; mm0=(01234567) - psubb mm1,mm7 ; mm1=(89ABCDEF) - - punpcklbw mm2,mm0 ; mm2=(*0*1*2*3) - punpckhbw mm0,mm0 ; mm0=(*4*5*6*7) - punpcklbw mm3,mm1 ; mm3=(*8*9*A*B) - punpckhbw mm1,mm1 ; mm1=(*C*D*E*F) - - punpcklwd mm4,mm2 ; mm4=(***0***1) - punpckhwd mm2,mm2 ; mm2=(***2***3) - punpcklwd mm5,mm0 ; mm5=(***4***5) - punpckhwd mm0,mm0 ; mm0=(***6***7) - - psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(01) - psrad mm2,(DWORD_BIT-BYTE_BIT) ; mm2=(23) - pi2fd mm4,mm4 - pi2fd mm2,mm2 - psrad mm5,(DWORD_BIT-BYTE_BIT) ; mm5=(45) - psrad mm0,(DWORD_BIT-BYTE_BIT) ; mm0=(67) - pi2fd mm5,mm5 - pi2fd mm0,mm0 - - movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm4 - movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm2 - movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5 - movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0 - - punpcklwd mm6,mm3 ; mm6=(***8***9) - punpckhwd mm3,mm3 ; mm3=(***A***B) - punpcklwd mm4,mm1 ; mm4=(***C***D) - punpckhwd mm1,mm1 ; mm1=(***E***F) - - psrad mm6,(DWORD_BIT-BYTE_BIT) ; mm6=(89) - psrad mm3,(DWORD_BIT-BYTE_BIT) ; mm3=(AB) - pi2fd mm6,mm6 - pi2fd mm3,mm3 - psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(CD) - psrad mm1,(DWORD_BIT-BYTE_BIT) ; mm1=(EF) - 
pi2fd mm4,mm4 - pi2fd mm1,mm1 - - movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm6 - movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm3 - movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm4 - movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1 - - add esi, byte 2*SIZEOF_JSAMPROW - add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT - dec ecx - jnz near .convloop - - femms ; empty MMX/3DNow! state - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - pop ebp - ret + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] + movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] + + psubb mm0,mm7 ; mm0=(01234567) + psubb mm1,mm7 ; mm1=(89ABCDEF) + + punpcklbw mm2,mm0 ; mm2=(*0*1*2*3) + punpckhbw mm0,mm0 ; mm0=(*4*5*6*7) + punpcklbw mm3,mm1 ; mm3=(*8*9*A*B) + punpckhbw mm1,mm1 ; mm1=(*C*D*E*F) + + punpcklwd mm4,mm2 ; mm4=(***0***1) + punpckhwd mm2,mm2 ; mm2=(***2***3) + punpcklwd mm5,mm0 ; mm5=(***4***5) + punpckhwd mm0,mm0 ; mm0=(***6***7) + + psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(01) + psrad mm2,(DWORD_BIT-BYTE_BIT) ; mm2=(23) + pi2fd mm4,mm4 + pi2fd mm2,mm2 + psrad mm5,(DWORD_BIT-BYTE_BIT) ; mm5=(45) + psrad mm0,(DWORD_BIT-BYTE_BIT) ; mm0=(67) + pi2fd mm5,mm5 + pi2fd mm0,mm0 + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm4 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm2 + movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5 + movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0 + + punpcklwd mm6,mm3 ; mm6=(***8***9) + punpckhwd mm3,mm3 ; mm3=(***A***B) + punpcklwd mm4,mm1 ; mm4=(***C***D) + punpckhwd mm1,mm1 ; mm1=(***E***F) + + psrad mm6,(DWORD_BIT-BYTE_BIT) ; mm6=(89) + psrad mm3,(DWORD_BIT-BYTE_BIT) ; mm3=(AB) + pi2fd mm6,mm6 + pi2fd mm3,mm3 + psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(CD) + psrad mm1,(DWORD_BIT-BYTE_BIT) ; mm1=(EF) + pi2fd mm4,mm4 + pi2fd mm1,mm1 + + movq MMWORD 
[MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm6 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm3 + movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm4 + movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1 + + add esi, byte 2*SIZEOF_JSAMPROW + add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT + dec ecx + jnz near .convloop + + femms ; empty MMX/3DNow! state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret ; -------------------------------------------------------------------------- @@ -133,101 +133,101 @@ EXTN(jsimd_convsamp_float_3dnow): ; FAST_FLOAT * workspace); ; -%define coef_block ebp+8 ; JCOEFPTR coef_block -%define divisors ebp+12 ; FAST_FLOAT * divisors -%define workspace ebp+16 ; FAST_FLOAT * workspace +%define coef_block ebp+8 ; JCOEFPTR coef_block +%define divisors ebp+12 ; FAST_FLOAT * divisors +%define workspace ebp+16 ; FAST_FLOAT * workspace - align 16 - global EXTN(jsimd_quantize_float_3dnow) + align 16 + global EXTN(jsimd_quantize_float_3dnow) EXTN(jsimd_quantize_float_3dnow): - push ebp - mov ebp,esp -; push ebx ; unused -; push ecx ; unused -; push edx ; need not be preserved - push esi - push edi - - mov eax, 0x4B400000 ; (float)0x00C00000 (rndint_magic) - movd mm7,eax - punpckldq mm7,mm7 ; mm7={12582912.0F 12582912.0F} - - mov esi, POINTER [workspace] - mov edx, POINTER [divisors] - mov edi, JCOEFPTR [coef_block] - mov eax, DCTSIZE2/16 - alignx 16,7 + push ebp + mov ebp,esp +; push ebx ; unused +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + mov eax, 0x4B400000 ; (float)0x00C00000 (rndint_magic) + movd mm7,eax + punpckldq mm7,mm7 ; mm7={12582912.0F 12582912.0F} + + mov esi, POINTER [workspace] + mov edx, POINTER [divisors] + mov edi, JCOEFPTR [coef_block] + mov eax, DCTSIZE2/16 + alignx 16,7 .quantloop: - movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] - movq mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)] - pfmul mm0, MMWORD 
[MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] - pfmul mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] - movq mm2, MMWORD [MMBLOCK(0,2,esi,SIZEOF_FAST_FLOAT)] - movq mm3, MMWORD [MMBLOCK(0,3,esi,SIZEOF_FAST_FLOAT)] - pfmul mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)] - pfmul mm3, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)] - - pfadd mm0,mm7 ; mm0=(00 ** 01 **) - pfadd mm1,mm7 ; mm1=(02 ** 03 **) - pfadd mm2,mm7 ; mm0=(04 ** 05 **) - pfadd mm3,mm7 ; mm1=(06 ** 07 **) - - movq mm4,mm0 - punpcklwd mm0,mm1 ; mm0=(00 02 ** **) - punpckhwd mm4,mm1 ; mm4=(01 03 ** **) - movq mm5,mm2 - punpcklwd mm2,mm3 ; mm2=(04 06 ** **) - punpckhwd mm5,mm3 ; mm5=(05 07 ** **) - - punpcklwd mm0,mm4 ; mm0=(00 01 02 03) - punpcklwd mm2,mm5 ; mm2=(04 05 06 07) - - movq mm6, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] - movq mm1, MMWORD [MMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)] - pfmul mm6, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] - pfmul mm1, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] - movq mm3, MMWORD [MMBLOCK(1,2,esi,SIZEOF_FAST_FLOAT)] - movq mm4, MMWORD [MMBLOCK(1,3,esi,SIZEOF_FAST_FLOAT)] - pfmul mm3, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)] - pfmul mm4, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)] - - pfadd mm6,mm7 ; mm0=(10 ** 11 **) - pfadd mm1,mm7 ; mm4=(12 ** 13 **) - pfadd mm3,mm7 ; mm0=(14 ** 15 **) - pfadd mm4,mm7 ; mm4=(16 ** 17 **) - - movq mm5,mm6 - punpcklwd mm6,mm1 ; mm6=(10 12 ** **) - punpckhwd mm5,mm1 ; mm5=(11 13 ** **) - movq mm1,mm3 - punpcklwd mm3,mm4 ; mm3=(14 16 ** **) - punpckhwd mm1,mm4 ; mm1=(15 17 ** **) - - punpcklwd mm6,mm5 ; mm6=(10 11 12 13) - punpcklwd mm3,mm1 ; mm3=(14 15 16 17) - - movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 - movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm2 - movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm6 - movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3 - - add esi, byte 16*SIZEOF_FAST_FLOAT - add edx, byte 16*SIZEOF_FAST_FLOAT - add edi, byte 16*SIZEOF_JCOEF - dec eax - jnz near .quantloop - - femms ; empty MMX/3DNow! 
state - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; unused -; pop ebx ; unused - pop ebp - ret + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] + movq mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)] + pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] + pfmul mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] + movq mm2, MMWORD [MMBLOCK(0,2,esi,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(0,3,esi,SIZEOF_FAST_FLOAT)] + pfmul mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)] + pfmul mm3, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)] + + pfadd mm0,mm7 ; mm0=(00 ** 01 **) + pfadd mm1,mm7 ; mm1=(02 ** 03 **) + pfadd mm2,mm7 ; mm0=(04 ** 05 **) + pfadd mm3,mm7 ; mm1=(06 ** 07 **) + + movq mm4,mm0 + punpcklwd mm0,mm1 ; mm0=(00 02 ** **) + punpckhwd mm4,mm1 ; mm4=(01 03 ** **) + movq mm5,mm2 + punpcklwd mm2,mm3 ; mm2=(04 06 ** **) + punpckhwd mm5,mm3 ; mm5=(05 07 ** **) + + punpcklwd mm0,mm4 ; mm0=(00 01 02 03) + punpcklwd mm2,mm5 ; mm2=(04 05 06 07) + + movq mm6, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] + movq mm1, MMWORD [MMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)] + pfmul mm6, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] + pfmul mm1, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(1,2,esi,SIZEOF_FAST_FLOAT)] + movq mm4, MMWORD [MMBLOCK(1,3,esi,SIZEOF_FAST_FLOAT)] + pfmul mm3, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)] + pfmul mm4, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)] + + pfadd mm6,mm7 ; mm0=(10 ** 11 **) + pfadd mm1,mm7 ; mm4=(12 ** 13 **) + pfadd mm3,mm7 ; mm0=(14 ** 15 **) + pfadd mm4,mm7 ; mm4=(16 ** 17 **) + + movq mm5,mm6 + punpcklwd mm6,mm1 ; mm6=(10 12 ** **) + punpckhwd mm5,mm1 ; mm5=(11 13 ** **) + movq mm1,mm3 + punpcklwd mm3,mm4 ; mm3=(14 16 ** **) + punpckhwd mm1,mm4 ; mm1=(15 17 ** **) + + punpcklwd mm6,mm5 ; mm6=(10 11 12 13) + punpcklwd mm3,mm1 ; mm3=(14 15 16 17) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD 
[MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm6 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3 + + add esi, byte 16*SIZEOF_FAST_FLOAT + add edx, byte 16*SIZEOF_FAST_FLOAT + add edi, byte 16*SIZEOF_JCOEF + dec eax + jnz near .quantloop + + femms ; empty MMX/3DNow! state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused +; pop ebx ; unused + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/jcqntmmx.asm b/simd/jcqntmmx.asm index 08b08b79e..ae837d2b8 100644 --- a/simd/jcqntmmx.asm +++ b/simd/jcqntmmx.asm @@ -20,8 +20,8 @@ %include "jdct.inc" ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 ; ; Load data into workspace, applying unsigned->signed conversion ; @@ -30,92 +30,92 @@ ; DCTELEM * workspace); ; -%define sample_data ebp+8 ; JSAMPARRAY sample_data -%define start_col ebp+12 ; JDIMENSION start_col -%define workspace ebp+16 ; DCTELEM * workspace +%define sample_data ebp+8 ; JSAMPARRAY sample_data +%define start_col ebp+12 ; JDIMENSION start_col +%define workspace ebp+16 ; DCTELEM * workspace - align 16 - global EXTN(jsimd_convsamp_mmx) + align 16 + global EXTN(jsimd_convsamp_mmx) EXTN(jsimd_convsamp_mmx): - push ebp - mov ebp,esp - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - pxor mm6,mm6 ; mm6=(all 0's) - pcmpeqw mm7,mm7 - psllw mm7,7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80} - - mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) - mov eax, JDIMENSION [start_col] - mov edi, POINTER [workspace] ; (DCTELEM *) - mov ecx, DCTSIZE/4 - alignx 16,7 + push ebp + mov ebp,esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + pxor mm6,mm6 ; mm6=(all 0's) + pcmpeqw mm7,mm7 + psllw mm7,7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80} + + mov esi, JSAMPARRAY 
[sample_data] ; (JSAMPROW *) + mov eax, JDIMENSION [start_col] + mov edi, POINTER [workspace] ; (DCTELEM *) + mov ecx, DCTSIZE/4 + alignx 16,7 .convloop: - mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) - mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) - - movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm0=(01234567) - movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; mm1=(89ABCDEF) - - mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *) - mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *) - - movq mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm2=(GHIJKLMN) - movq mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; mm3=(OPQRSTUV) - - movq mm4,mm0 - punpcklbw mm0,mm6 ; mm0=(0123) - punpckhbw mm4,mm6 ; mm4=(4567) - movq mm5,mm1 - punpcklbw mm1,mm6 ; mm1=(89AB) - punpckhbw mm5,mm6 ; mm5=(CDEF) - - paddw mm0,mm7 - paddw mm4,mm7 - paddw mm1,mm7 - paddw mm5,mm7 - - movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0 - movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4 - movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1 - movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5 - - movq mm0,mm2 - punpcklbw mm2,mm6 ; mm2=(GHIJ) - punpckhbw mm0,mm6 ; mm0=(KLMN) - movq mm4,mm3 - punpcklbw mm3,mm6 ; mm3=(OPQR) - punpckhbw mm4,mm6 ; mm4=(STUV) - - paddw mm2,mm7 - paddw mm0,mm7 - paddw mm3,mm7 - paddw mm4,mm7 - - movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2 - movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0 - movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3 - movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4 - - add esi, byte 4*SIZEOF_JSAMPROW - add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM - dec ecx - jnz short .convloop - - emms ; empty MMX state - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - pop ebp - ret + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm0=(01234567) + movq mm1, MMWORD 
[edx+eax*SIZEOF_JSAMPLE] ; mm1=(89ABCDEF) + + mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm2=(GHIJKLMN) + movq mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; mm3=(OPQRSTUV) + + movq mm4,mm0 + punpcklbw mm0,mm6 ; mm0=(0123) + punpckhbw mm4,mm6 ; mm4=(4567) + movq mm5,mm1 + punpcklbw mm1,mm6 ; mm1=(89AB) + punpckhbw mm5,mm6 ; mm5=(CDEF) + + paddw mm0,mm7 + paddw mm4,mm7 + paddw mm1,mm7 + paddw mm5,mm7 + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5 + + movq mm0,mm2 + punpcklbw mm2,mm6 ; mm2=(GHIJ) + punpckhbw mm0,mm6 ; mm0=(KLMN) + movq mm4,mm3 + punpcklbw mm3,mm6 ; mm3=(OPQR) + punpckhbw mm4,mm6 ; mm4=(STUV) + + paddw mm2,mm7 + paddw mm0,mm7 + paddw mm3,mm7 + paddw mm4,mm7 + + movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2 + movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0 + movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3 + movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4 + + add esi, byte 4*SIZEOF_JSAMPROW + add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM + dec ecx + jnz short .convloop + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret ; -------------------------------------------------------------------------- ; @@ -135,140 +135,140 @@ EXTN(jsimd_convsamp_mmx): %define SCALE(m,n,b) MMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM) %define SHIFT(m,n,b) MMBLOCK(DCTSIZE*3+(m),(n),(b),SIZEOF_DCTELEM) -%define coef_block ebp+8 ; JCOEFPTR coef_block -%define divisors ebp+12 ; DCTELEM * divisors -%define workspace ebp+16 ; DCTELEM * workspace +%define coef_block ebp+8 ; JCOEFPTR coef_block +%define divisors ebp+12 ; DCTELEM * divisors +%define workspace ebp+16 ; DCTELEM * workspace - align 16 - global 
EXTN(jsimd_quantize_mmx) + align 16 + global EXTN(jsimd_quantize_mmx) EXTN(jsimd_quantize_mmx): - push ebp - mov ebp,esp -; push ebx ; unused -; push ecx ; unused -; push edx ; need not be preserved - push esi - push edi - - mov esi, POINTER [workspace] - mov edx, POINTER [divisors] - mov edi, JCOEFPTR [coef_block] - mov ah, 2 - alignx 16,7 + push ebp + mov ebp,esp +; push ebx ; unused +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + mov esi, POINTER [workspace] + mov edx, POINTER [divisors] + mov edi, JCOEFPTR [coef_block] + mov ah, 2 + alignx 16,7 .quantloop1: - mov al, DCTSIZE2/8/2 - alignx 16,7 + mov al, DCTSIZE2/8/2 + alignx 16,7 .quantloop2: - movq mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)] - movq mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)] - - movq mm0,mm2 - movq mm1,mm3 - - psraw mm2,(WORD_BIT-1) ; -1 if value < 0, 0 otherwise - psraw mm3,(WORD_BIT-1) - - pxor mm0,mm2 ; val = -val - pxor mm1,mm3 - psubw mm0,mm2 - psubw mm1,mm3 - - ; - ; MMX is an annoyingly crappy instruction set. It has two - ; misfeatures that are causing problems here: - ; - ; - All multiplications are signed. - ; - ; - The second operand for the shifts is not treated as packed. - ; - ; - ; We work around the first problem by implementing this algorithm: - ; - ; unsigned long unsigned_multiply(unsigned short x, unsigned short y) - ; { - ; enum { SHORT_BIT = 16 }; - ; signed short sx = (signed short) x; - ; signed short sy = (signed short) y; - ; signed long sz; - ; - ; sz = (long) sx * (long) sy; /* signed multiply */ - ; - ; if (sx < 0) sz += (long) sy << SHORT_BIT; - ; if (sy < 0) sz += (long) sx << SHORT_BIT; - ; - ; return (unsigned long) sz; - ; } - ; - ; (note that a negative sx adds _sy_ and vice versa) - ; - ; For the second problem, we replace the shift by a multiplication. - ; Unfortunately that means we have to deal with the signed issue again. 
- ; - - paddw mm0, MMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor - paddw mm1, MMWORD [CORRECTION(0,1,edx)] - - movq mm4,mm0 ; store current value for later - movq mm5,mm1 - pmulhw mm0, MMWORD [RECIPROCAL(0,0,edx)] ; reciprocal - pmulhw mm1, MMWORD [RECIPROCAL(0,1,edx)] - paddw mm0,mm4 ; reciprocal is always negative (MSB=1), - paddw mm1,mm5 ; so we always need to add the initial value - ; (input value is never negative as we - ; inverted it at the start of this routine) - - ; here it gets a bit tricky as both scale - ; and mm0/mm1 can be negative - movq mm6, MMWORD [SCALE(0,0,edx)] ; scale - movq mm7, MMWORD [SCALE(0,1,edx)] - movq mm4,mm0 - movq mm5,mm1 - pmulhw mm0,mm6 - pmulhw mm1,mm7 - - psraw mm6,(WORD_BIT-1) ; determine if scale is negative - psraw mm7,(WORD_BIT-1) - - pand mm6,mm4 ; and add input if it is - pand mm7,mm5 - paddw mm0,mm6 - paddw mm1,mm7 - - psraw mm4,(WORD_BIT-1) ; then check if negative input - psraw mm5,(WORD_BIT-1) - - pand mm4, MMWORD [SCALE(0,0,edx)] ; and add scale if it is - pand mm5, MMWORD [SCALE(0,1,edx)] - paddw mm0,mm4 - paddw mm1,mm5 - - pxor mm0,mm2 ; val = -val - pxor mm1,mm3 - psubw mm0,mm2 - psubw mm1,mm3 - - movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0 - movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1 - - add esi, byte 8*SIZEOF_DCTELEM - add edx, byte 8*SIZEOF_DCTELEM - add edi, byte 8*SIZEOF_JCOEF - dec al - jnz near .quantloop2 - dec ah - jnz near .quantloop1 ; to avoid branch misprediction - - emms ; empty MMX state - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; unused -; pop ebx ; unused - pop ebp - ret + movq mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)] + + movq mm0,mm2 + movq mm1,mm3 + + psraw mm2,(WORD_BIT-1) ; -1 if value < 0, 0 otherwise + psraw mm3,(WORD_BIT-1) + + pxor mm0,mm2 ; val = -val + pxor mm1,mm3 + psubw mm0,mm2 + psubw mm1,mm3 + + ; + ; MMX is an annoyingly crappy instruction set. 
It has two + ; misfeatures that are causing problems here: + ; + ; - All multiplications are signed. + ; + ; - The second operand for the shifts is not treated as packed. + ; + ; + ; We work around the first problem by implementing this algorithm: + ; + ; unsigned long unsigned_multiply(unsigned short x, unsigned short y) + ; { + ; enum { SHORT_BIT = 16 }; + ; signed short sx = (signed short) x; + ; signed short sy = (signed short) y; + ; signed long sz; + ; + ; sz = (long) sx * (long) sy; /* signed multiply */ + ; + ; if (sx < 0) sz += (long) sy << SHORT_BIT; + ; if (sy < 0) sz += (long) sx << SHORT_BIT; + ; + ; return (unsigned long) sz; + ; } + ; + ; (note that a negative sx adds _sy_ and vice versa) + ; + ; For the second problem, we replace the shift by a multiplication. + ; Unfortunately that means we have to deal with the signed issue again. + ; + + paddw mm0, MMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor + paddw mm1, MMWORD [CORRECTION(0,1,edx)] + + movq mm4,mm0 ; store current value for later + movq mm5,mm1 + pmulhw mm0, MMWORD [RECIPROCAL(0,0,edx)] ; reciprocal + pmulhw mm1, MMWORD [RECIPROCAL(0,1,edx)] + paddw mm0,mm4 ; reciprocal is always negative (MSB=1), + paddw mm1,mm5 ; so we always need to add the initial value + ; (input value is never negative as we + ; inverted it at the start of this routine) + + ; here it gets a bit tricky as both scale + ; and mm0/mm1 can be negative + movq mm6, MMWORD [SCALE(0,0,edx)] ; scale + movq mm7, MMWORD [SCALE(0,1,edx)] + movq mm4,mm0 + movq mm5,mm1 + pmulhw mm0,mm6 + pmulhw mm1,mm7 + + psraw mm6,(WORD_BIT-1) ; determine if scale is negative + psraw mm7,(WORD_BIT-1) + + pand mm6,mm4 ; and add input if it is + pand mm7,mm5 + paddw mm0,mm6 + paddw mm1,mm7 + + psraw mm4,(WORD_BIT-1) ; then check if negative input + psraw mm5,(WORD_BIT-1) + + pand mm4, MMWORD [SCALE(0,0,edx)] ; and add scale if it is + pand mm5, MMWORD [SCALE(0,1,edx)] + paddw mm0,mm4 + paddw mm1,mm5 + + pxor mm0,mm2 ; val = -val + pxor 
mm1,mm3 + psubw mm0,mm2 + psubw mm1,mm3 + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1 + + add esi, byte 8*SIZEOF_DCTELEM + add edx, byte 8*SIZEOF_DCTELEM + add edi, byte 8*SIZEOF_JCOEF + dec al + jnz near .quantloop2 + dec ah + jnz near .quantloop1 ; to avoid branch misprediction + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused +; pop ebx ; unused + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/jcqnts2f-64.asm b/simd/jcqnts2f-64.asm index d0efa1b90..0752542f3 100644 --- a/simd/jcqnts2f-64.asm +++ b/simd/jcqnts2f-64.asm @@ -21,8 +21,8 @@ %include "jdct.inc" ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 64 + SECTION SEG_TEXT + BITS 64 ; ; Load data into workspace, applying unsigned->signed conversion ; @@ -35,65 +35,65 @@ ; r11 = JDIMENSION start_col ; r12 = FAST_FLOAT * workspace - align 16 - global EXTN(jsimd_convsamp_float_sse2) + align 16 + global EXTN(jsimd_convsamp_float_sse2) EXTN(jsimd_convsamp_float_sse2): - push rbp - mov rax,rsp - mov rbp,rsp - collect_args - push rbx - - pcmpeqw xmm7,xmm7 - psllw xmm7,7 - packsswb xmm7,xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..) - - mov rsi, r10 - mov rax, r11 - mov rdi, r12 - mov rcx, DCTSIZE/2 + push rbp + mov rax,rsp + mov rbp,rsp + collect_args + push rbx + + pcmpeqw xmm7,xmm7 + psllw xmm7,7 + packsswb xmm7,xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..) 
+ + mov rsi, r10 + mov rax, r11 + mov rdi, r12 + mov rcx, DCTSIZE/2 .convloop: - mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) - mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) - movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] - movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] + movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] + movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] - psubb xmm0,xmm7 ; xmm0=(01234567) - psubb xmm1,xmm7 ; xmm1=(89ABCDEF) + psubb xmm0,xmm7 ; xmm0=(01234567) + psubb xmm1,xmm7 ; xmm1=(89ABCDEF) - punpcklbw xmm0,xmm0 ; xmm0=(*0*1*2*3*4*5*6*7) - punpcklbw xmm1,xmm1 ; xmm1=(*8*9*A*B*C*D*E*F) + punpcklbw xmm0,xmm0 ; xmm0=(*0*1*2*3*4*5*6*7) + punpcklbw xmm1,xmm1 ; xmm1=(*8*9*A*B*C*D*E*F) - punpcklwd xmm2,xmm0 ; xmm2=(***0***1***2***3) - punpckhwd xmm0,xmm0 ; xmm0=(***4***5***6***7) - punpcklwd xmm3,xmm1 ; xmm3=(***8***9***A***B) - punpckhwd xmm1,xmm1 ; xmm1=(***C***D***E***F) + punpcklwd xmm2,xmm0 ; xmm2=(***0***1***2***3) + punpckhwd xmm0,xmm0 ; xmm0=(***4***5***6***7) + punpcklwd xmm3,xmm1 ; xmm3=(***8***9***A***B) + punpckhwd xmm1,xmm1 ; xmm1=(***C***D***E***F) - psrad xmm2,(DWORD_BIT-BYTE_BIT) ; xmm2=(0123) - psrad xmm0,(DWORD_BIT-BYTE_BIT) ; xmm0=(4567) - cvtdq2ps xmm2,xmm2 ; xmm2=(0123) - cvtdq2ps xmm0,xmm0 ; xmm0=(4567) - psrad xmm3,(DWORD_BIT-BYTE_BIT) ; xmm3=(89AB) - psrad xmm1,(DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF) - cvtdq2ps xmm3,xmm3 ; xmm3=(89AB) - cvtdq2ps xmm1,xmm1 ; xmm1=(CDEF) + psrad xmm2,(DWORD_BIT-BYTE_BIT) ; xmm2=(0123) + psrad xmm0,(DWORD_BIT-BYTE_BIT) ; xmm0=(4567) + cvtdq2ps xmm2,xmm2 ; xmm2=(0123) + cvtdq2ps xmm0,xmm0 ; xmm0=(4567) + psrad xmm3,(DWORD_BIT-BYTE_BIT) ; xmm3=(89AB) + psrad xmm1,(DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF) + cvtdq2ps xmm3,xmm3 ; xmm3=(89AB) + cvtdq2ps xmm1,xmm1 ; xmm1=(CDEF) - movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2 - movaps XMMWORD 
[XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0 - movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3 - movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1 - add rsi, byte 2*SIZEOF_JSAMPROW - add rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT - dec rcx - jnz short .convloop + add rsi, byte 2*SIZEOF_JSAMPROW + add rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT + dec rcx + jnz short .convloop - pop rbx - uncollect_args - pop rbp - ret + pop rbx + uncollect_args + pop rbp + ret ; -------------------------------------------------------------------------- @@ -109,50 +109,50 @@ EXTN(jsimd_convsamp_float_sse2): ; r11 = FAST_FLOAT * divisors ; r12 = FAST_FLOAT * workspace - align 16 - global EXTN(jsimd_quantize_float_sse2) + align 16 + global EXTN(jsimd_quantize_float_sse2) EXTN(jsimd_quantize_float_sse2): - push rbp - mov rax,rsp - mov rbp,rsp - collect_args - - mov rsi, r12 - mov rdx, r11 - mov rdi, r10 - mov rax, DCTSIZE2/16 + push rbp + mov rax,rsp + mov rbp,rsp + collect_args + + mov rsi, r12 + mov rdx, r11 + mov rdi, r10 + mov rax, DCTSIZE2/16 .quantloop: - movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)] - mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)] - mulps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)] - movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)] - mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)] - mulps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)] - - cvtps2dq xmm0,xmm0 - cvtps2dq xmm1,xmm1 - cvtps2dq xmm2,xmm2 - cvtps2dq xmm3,xmm3 - - packssdw xmm0,xmm1 - packssdw xmm2,xmm3 - - movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0 - movdqa XMMWORD 
[XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2 - - add rsi, byte 16*SIZEOF_FAST_FLOAT - add rdx, byte 16*SIZEOF_FAST_FLOAT - add rdi, byte 16*SIZEOF_JCOEF - dec rax - jnz short .quantloop - - uncollect_args - pop rbp - ret + movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)] + mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)] + mulps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)] + mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)] + mulps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)] + + cvtps2dq xmm0,xmm0 + cvtps2dq xmm1,xmm1 + cvtps2dq xmm2,xmm2 + cvtps2dq xmm3,xmm3 + + packssdw xmm0,xmm1 + packssdw xmm2,xmm3 + + movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2 + + add rsi, byte 16*SIZEOF_FAST_FLOAT + add rdx, byte 16*SIZEOF_FAST_FLOAT + add rdi, byte 16*SIZEOF_JCOEF + dec rax + jnz short .quantloop + + uncollect_args + pop rbp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jcqnts2f.asm b/simd/jcqnts2f.asm index d80ae5dc9..0df2df78a 100644 --- a/simd/jcqnts2f.asm +++ b/simd/jcqnts2f.asm @@ -20,8 +20,8 @@ %include "jdct.inc" ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 ; ; Load data into workspace, applying unsigned->signed conversion ; @@ -30,75 +30,75 @@ ; FAST_FLOAT * workspace); ; -%define sample_data ebp+8 ; JSAMPARRAY sample_data -%define start_col ebp+12 ; JDIMENSION start_col -%define workspace ebp+16 ; FAST_FLOAT * workspace +%define sample_data ebp+8 ; JSAMPARRAY sample_data +%define start_col ebp+12 ; JDIMENSION start_col +%define workspace ebp+16 ; FAST_FLOAT * workspace - align 16 - global EXTN(jsimd_convsamp_float_sse2) + align 16 + global EXTN(jsimd_convsamp_float_sse2) EXTN(jsimd_convsamp_float_sse2): - push ebp - mov ebp,esp - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - pcmpeqw xmm7,xmm7 - psllw xmm7,7 - packsswb xmm7,xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..) - - mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) - mov eax, JDIMENSION [start_col] - mov edi, POINTER [workspace] ; (DCTELEM *) - mov ecx, DCTSIZE/2 - alignx 16,7 + push ebp + mov ebp,esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + pcmpeqw xmm7,xmm7 + psllw xmm7,7 + packsswb xmm7,xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..) 
+ + mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) + mov eax, JDIMENSION [start_col] + mov edi, POINTER [workspace] ; (DCTELEM *) + mov ecx, DCTSIZE/2 + alignx 16,7 .convloop: - mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) - mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) - - movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] - movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] - - psubb xmm0,xmm7 ; xmm0=(01234567) - psubb xmm1,xmm7 ; xmm1=(89ABCDEF) - - punpcklbw xmm0,xmm0 ; xmm0=(*0*1*2*3*4*5*6*7) - punpcklbw xmm1,xmm1 ; xmm1=(*8*9*A*B*C*D*E*F) - - punpcklwd xmm2,xmm0 ; xmm2=(***0***1***2***3) - punpckhwd xmm0,xmm0 ; xmm0=(***4***5***6***7) - punpcklwd xmm3,xmm1 ; xmm3=(***8***9***A***B) - punpckhwd xmm1,xmm1 ; xmm1=(***C***D***E***F) - - psrad xmm2,(DWORD_BIT-BYTE_BIT) ; xmm2=(0123) - psrad xmm0,(DWORD_BIT-BYTE_BIT) ; xmm0=(4567) - cvtdq2ps xmm2,xmm2 ; xmm2=(0123) - cvtdq2ps xmm0,xmm0 ; xmm0=(4567) - psrad xmm3,(DWORD_BIT-BYTE_BIT) ; xmm3=(89AB) - psrad xmm1,(DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF) - cvtdq2ps xmm3,xmm3 ; xmm3=(89AB) - cvtdq2ps xmm1,xmm1 ; xmm1=(CDEF) - - movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2 - movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0 - movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3 - movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1 - - add esi, byte 2*SIZEOF_JSAMPROW - add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT - dec ecx - jnz short .convloop - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - pop ebp - ret + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] + movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] + + psubb xmm0,xmm7 ; xmm0=(01234567) + psubb xmm1,xmm7 ; xmm1=(89ABCDEF) + + punpcklbw xmm0,xmm0 ; xmm0=(*0*1*2*3*4*5*6*7) + punpcklbw xmm1,xmm1 ; xmm1=(*8*9*A*B*C*D*E*F) + + punpcklwd xmm2,xmm0 ; 
xmm2=(***0***1***2***3) + punpckhwd xmm0,xmm0 ; xmm0=(***4***5***6***7) + punpcklwd xmm3,xmm1 ; xmm3=(***8***9***A***B) + punpckhwd xmm1,xmm1 ; xmm1=(***C***D***E***F) + + psrad xmm2,(DWORD_BIT-BYTE_BIT) ; xmm2=(0123) + psrad xmm0,(DWORD_BIT-BYTE_BIT) ; xmm0=(4567) + cvtdq2ps xmm2,xmm2 ; xmm2=(0123) + cvtdq2ps xmm0,xmm0 ; xmm0=(4567) + psrad xmm3,(DWORD_BIT-BYTE_BIT) ; xmm3=(89AB) + psrad xmm1,(DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF) + cvtdq2ps xmm3,xmm3 ; xmm3=(89AB) + cvtdq2ps xmm1,xmm1 ; xmm1=(CDEF) + + movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1 + + add esi, byte 2*SIZEOF_JSAMPROW + add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT + dec ecx + jnz short .convloop + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret ; -------------------------------------------------------------------------- @@ -110,62 +110,62 @@ EXTN(jsimd_convsamp_float_sse2): ; FAST_FLOAT * workspace); ; -%define coef_block ebp+8 ; JCOEFPTR coef_block -%define divisors ebp+12 ; FAST_FLOAT * divisors -%define workspace ebp+16 ; FAST_FLOAT * workspace +%define coef_block ebp+8 ; JCOEFPTR coef_block +%define divisors ebp+12 ; FAST_FLOAT * divisors +%define workspace ebp+16 ; FAST_FLOAT * workspace - align 16 - global EXTN(jsimd_quantize_float_sse2) + align 16 + global EXTN(jsimd_quantize_float_sse2) EXTN(jsimd_quantize_float_sse2): - push ebp - mov ebp,esp -; push ebx ; unused -; push ecx ; unused -; push edx ; need not be preserved - push esi - push edi - - mov esi, POINTER [workspace] - mov edx, POINTER [divisors] - mov edi, JCOEFPTR [coef_block] - mov eax, DCTSIZE2/16 - alignx 16,7 + push ebp + mov ebp,esp +; push ebx ; unused +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + mov esi, POINTER [workspace] + mov edx, 
POINTER [divisors] + mov edi, JCOEFPTR [coef_block] + mov eax, DCTSIZE2/16 + alignx 16,7 .quantloop: - movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)] - mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] - mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] - movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)] - mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] - mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] - - cvtps2dq xmm0,xmm0 - cvtps2dq xmm1,xmm1 - cvtps2dq xmm2,xmm2 - cvtps2dq xmm3,xmm3 - - packssdw xmm0,xmm1 - packssdw xmm2,xmm3 - - movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0 - movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2 - - add esi, byte 16*SIZEOF_FAST_FLOAT - add edx, byte 16*SIZEOF_FAST_FLOAT - add edi, byte 16*SIZEOF_JCOEF - dec eax - jnz short .quantloop - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; unused -; pop ebx ; unused - pop ebp - ret + movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)] + mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] + mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)] + mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] + mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] + + cvtps2dq xmm0,xmm0 + cvtps2dq xmm1,xmm1 + cvtps2dq xmm2,xmm2 + cvtps2dq xmm3,xmm3 + + packssdw xmm0,xmm1 + packssdw xmm2,xmm3 + + movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2 + + add esi, byte 16*SIZEOF_FAST_FLOAT + add edx, byte 16*SIZEOF_FAST_FLOAT + add edi, byte 16*SIZEOF_JCOEF + dec eax + jnz short .quantloop + + pop edi + pop esi +; pop edx ; need not be 
preserved +; pop ecx ; unused +; pop ebx ; unused + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/jcqnts2i-64.asm b/simd/jcqnts2i-64.asm index cc33d59f9..5de8062ff 100644 --- a/simd/jcqnts2i-64.asm +++ b/simd/jcqnts2i-64.asm @@ -21,8 +21,8 @@ %include "jdct.inc" ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 64 + SECTION SEG_TEXT + BITS 64 ; ; Load data into workspace, applying unsigned->signed conversion ; @@ -35,60 +35,60 @@ ; r11 = JDIMENSION start_col ; r12 = DCTELEM * workspace - align 16 - global EXTN(jsimd_convsamp_sse2) + align 16 + global EXTN(jsimd_convsamp_sse2) EXTN(jsimd_convsamp_sse2): - push rbp - mov rax,rsp - mov rbp,rsp - collect_args - push rbx - - pxor xmm6,xmm6 ; xmm6=(all 0's) - pcmpeqw xmm7,xmm7 - psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} - - mov rsi, r10 - mov rax, r11 - mov rdi, r12 - mov rcx, DCTSIZE/4 + push rbp + mov rax,rsp + mov rbp,rsp + collect_args + push rbx + + pxor xmm6,xmm6 ; xmm6=(all 0's) + pcmpeqw xmm7,xmm7 + psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} + + mov rsi, r10 + mov rax, r11 + mov rdi, r12 + mov rcx, DCTSIZE/4 .convloop: - mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) - mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) - - movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm0=(01234567) - movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF) - - mov rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *) - mov rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *) - - movq xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN) - movq xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV) - - punpcklbw xmm0,xmm6 ; xmm0=(01234567) - punpcklbw xmm1,xmm6 ; xmm1=(89ABCDEF) - paddw xmm0,xmm7 - paddw xmm1,xmm7 - punpcklbw xmm2,xmm6 ; xmm2=(GHIJKLMN) - punpcklbw xmm3,xmm6 ; xmm3=(OPQRSTUV) - paddw xmm2,xmm7 - 
paddw xmm3,xmm7 - - movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0 - movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1 - movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2 - movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3 - - add rsi, byte 4*SIZEOF_JSAMPROW - add rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM - dec rcx - jnz short .convloop - - pop rbx - uncollect_args - pop rbp - ret + mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm0=(01234567) + movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF) + + mov rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN) + movq xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV) + + punpcklbw xmm0,xmm6 ; xmm0=(01234567) + punpcklbw xmm1,xmm6 ; xmm1=(89ABCDEF) + paddw xmm0,xmm7 + paddw xmm1,xmm7 + punpcklbw xmm2,xmm6 ; xmm2=(GHIJKLMN) + punpcklbw xmm3,xmm6 ; xmm3=(OPQRSTUV) + paddw xmm2,xmm7 + paddw xmm3,xmm7 + + movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2 + movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3 + + add rsi, byte 4*SIZEOF_JSAMPROW + add rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM + dec rcx + jnz short .convloop + + pop rbx + uncollect_args + pop rbp + ret ; -------------------------------------------------------------------------- ; @@ -111,77 +111,77 @@ EXTN(jsimd_convsamp_sse2): ; r11 = DCTELEM * divisors ; r12 = DCTELEM * workspace - align 16 - global EXTN(jsimd_quantize_sse2) + align 16 + global EXTN(jsimd_quantize_sse2) EXTN(jsimd_quantize_sse2): - push rbp - mov rax,rsp - mov rbp,rsp - collect_args - - mov rsi, r12 - mov rdx, r11 - mov rdi, r10 - mov rax, DCTSIZE2/32 + push rbp + mov rax,rsp + mov rbp,rsp + 
collect_args + + mov rsi, r12 + mov rdx, r11 + mov rdi, r10 + mov rax, DCTSIZE2/32 .quantloop: - movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)] - movdqa xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)] - movdqa xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)] - movdqa xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)] - movdqa xmm0,xmm4 - movdqa xmm1,xmm5 - movdqa xmm2,xmm6 - movdqa xmm3,xmm7 - psraw xmm4,(WORD_BIT-1) - psraw xmm5,(WORD_BIT-1) - psraw xmm6,(WORD_BIT-1) - psraw xmm7,(WORD_BIT-1) - pxor xmm0,xmm4 - pxor xmm1,xmm5 - pxor xmm2,xmm6 - pxor xmm3,xmm7 - psubw xmm0,xmm4 ; if (xmm0 < 0) xmm0 = -xmm0; - psubw xmm1,xmm5 ; if (xmm1 < 0) xmm1 = -xmm1; - psubw xmm2,xmm6 ; if (xmm2 < 0) xmm2 = -xmm2; - psubw xmm3,xmm7 ; if (xmm3 < 0) xmm3 = -xmm3; - - paddw xmm0, XMMWORD [CORRECTION(0,0,rdx)] ; correction + roundfactor - paddw xmm1, XMMWORD [CORRECTION(1,0,rdx)] - paddw xmm2, XMMWORD [CORRECTION(2,0,rdx)] - paddw xmm3, XMMWORD [CORRECTION(3,0,rdx)] - pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,rdx)] ; reciprocal - pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,rdx)] - pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,rdx)] - pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,rdx)] - pmulhuw xmm0, XMMWORD [SCALE(0,0,rdx)] ; scale - pmulhuw xmm1, XMMWORD [SCALE(1,0,rdx)] - pmulhuw xmm2, XMMWORD [SCALE(2,0,rdx)] - pmulhuw xmm3, XMMWORD [SCALE(3,0,rdx)] - - pxor xmm0,xmm4 - pxor xmm1,xmm5 - pxor xmm2,xmm6 - pxor xmm3,xmm7 - psubw xmm0,xmm4 - psubw xmm1,xmm5 - psubw xmm2,xmm6 - psubw xmm3,xmm7 - movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0 - movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1 - movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2 - movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3 - - add rsi, byte 32*SIZEOF_DCTELEM - add rdx, byte 32*SIZEOF_DCTELEM - add rdi, byte 32*SIZEOF_JCOEF - dec rax - jnz near .quantloop - - uncollect_args - pop rbp - ret + movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)] + movdqa xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)] 
+ movdqa xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)] + movdqa xmm0,xmm4 + movdqa xmm1,xmm5 + movdqa xmm2,xmm6 + movdqa xmm3,xmm7 + psraw xmm4,(WORD_BIT-1) + psraw xmm5,(WORD_BIT-1) + psraw xmm6,(WORD_BIT-1) + psraw xmm7,(WORD_BIT-1) + pxor xmm0,xmm4 + pxor xmm1,xmm5 + pxor xmm2,xmm6 + pxor xmm3,xmm7 + psubw xmm0,xmm4 ; if (xmm0 < 0) xmm0 = -xmm0; + psubw xmm1,xmm5 ; if (xmm1 < 0) xmm1 = -xmm1; + psubw xmm2,xmm6 ; if (xmm2 < 0) xmm2 = -xmm2; + psubw xmm3,xmm7 ; if (xmm3 < 0) xmm3 = -xmm3; + + paddw xmm0, XMMWORD [CORRECTION(0,0,rdx)] ; correction + roundfactor + paddw xmm1, XMMWORD [CORRECTION(1,0,rdx)] + paddw xmm2, XMMWORD [CORRECTION(2,0,rdx)] + paddw xmm3, XMMWORD [CORRECTION(3,0,rdx)] + pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,rdx)] ; reciprocal + pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,rdx)] + pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,rdx)] + pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,rdx)] + pmulhuw xmm0, XMMWORD [SCALE(0,0,rdx)] ; scale + pmulhuw xmm1, XMMWORD [SCALE(1,0,rdx)] + pmulhuw xmm2, XMMWORD [SCALE(2,0,rdx)] + pmulhuw xmm3, XMMWORD [SCALE(3,0,rdx)] + + pxor xmm0,xmm4 + pxor xmm1,xmm5 + pxor xmm2,xmm6 + pxor xmm3,xmm7 + psubw xmm0,xmm4 + psubw xmm1,xmm5 + psubw xmm2,xmm6 + psubw xmm3,xmm7 + movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2 + movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3 + + add rsi, byte 32*SIZEOF_DCTELEM + add rdx, byte 32*SIZEOF_DCTELEM + add rdi, byte 32*SIZEOF_JCOEF + dec rax + jnz near .quantloop + + uncollect_args + pop rbp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jcqnts2i.asm b/simd/jcqnts2i.asm index 0864d6ed4..07cdc6854 100644 --- a/simd/jcqnts2i.asm +++ b/simd/jcqnts2i.asm @@ -20,8 +20,8 @@ %include "jdct.inc" ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 ; ; Load data into workspace, applying unsigned->signed conversion ; @@ -30,70 +30,70 @@ ; DCTELEM * workspace); ; -%define sample_data ebp+8 ; JSAMPARRAY sample_data -%define start_col ebp+12 ; JDIMENSION start_col -%define workspace ebp+16 ; DCTELEM * workspace +%define sample_data ebp+8 ; JSAMPARRAY sample_data +%define start_col ebp+12 ; JDIMENSION start_col +%define workspace ebp+16 ; DCTELEM * workspace - align 16 - global EXTN(jsimd_convsamp_sse2) + align 16 + global EXTN(jsimd_convsamp_sse2) EXTN(jsimd_convsamp_sse2): - push ebp - mov ebp,esp - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - pxor xmm6,xmm6 ; xmm6=(all 0's) - pcmpeqw xmm7,xmm7 - psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} - - mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) - mov eax, JDIMENSION [start_col] - mov edi, POINTER [workspace] ; (DCTELEM *) - mov ecx, DCTSIZE/4 - alignx 16,7 + push ebp + mov ebp,esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + pxor xmm6,xmm6 ; xmm6=(all 0's) + pcmpeqw xmm7,xmm7 + psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} + + mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) + mov eax, JDIMENSION [start_col] + mov edi, POINTER [workspace] ; (DCTELEM *) + mov ecx, DCTSIZE/4 + alignx 16,7 .convloop: - mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) - mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) - - movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm0=(01234567) - movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF) - - mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *) 
- mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *) - - movq xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN) - movq xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV) - - punpcklbw xmm0,xmm6 ; xmm0=(01234567) - punpcklbw xmm1,xmm6 ; xmm1=(89ABCDEF) - paddw xmm0,xmm7 - paddw xmm1,xmm7 - punpcklbw xmm2,xmm6 ; xmm2=(GHIJKLMN) - punpcklbw xmm3,xmm6 ; xmm3=(OPQRSTUV) - paddw xmm2,xmm7 - paddw xmm3,xmm7 - - movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0 - movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1 - movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2 - movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3 - - add esi, byte 4*SIZEOF_JSAMPROW - add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM - dec ecx - jnz short .convloop - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - pop ebp - ret + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm0=(01234567) + movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF) + + mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN) + movq xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV) + + punpcklbw xmm0,xmm6 ; xmm0=(01234567) + punpcklbw xmm1,xmm6 ; xmm1=(89ABCDEF) + paddw xmm0,xmm7 + paddw xmm1,xmm7 + punpcklbw xmm2,xmm6 ; xmm2=(GHIJKLMN) + punpcklbw xmm3,xmm6 ; xmm3=(OPQRSTUV) + paddw xmm2,xmm7 + paddw xmm3,xmm7 + + movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2 + movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3 + + add esi, byte 4*SIZEOF_JSAMPROW + add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM + dec ecx + jnz short .convloop + + pop edi + pop esi +; 
pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret ; -------------------------------------------------------------------------- ; @@ -112,89 +112,89 @@ EXTN(jsimd_convsamp_sse2): %define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM) %define SCALE(m,n,b) XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM) -%define coef_block ebp+8 ; JCOEFPTR coef_block -%define divisors ebp+12 ; DCTELEM * divisors -%define workspace ebp+16 ; DCTELEM * workspace +%define coef_block ebp+8 ; JCOEFPTR coef_block +%define divisors ebp+12 ; DCTELEM * divisors +%define workspace ebp+16 ; DCTELEM * workspace - align 16 - global EXTN(jsimd_quantize_sse2) + align 16 + global EXTN(jsimd_quantize_sse2) EXTN(jsimd_quantize_sse2): - push ebp - mov ebp,esp -; push ebx ; unused -; push ecx ; unused -; push edx ; need not be preserved - push esi - push edi - - mov esi, POINTER [workspace] - mov edx, POINTER [divisors] - mov edi, JCOEFPTR [coef_block] - mov eax, DCTSIZE2/32 - alignx 16,7 + push ebp + mov ebp,esp +; push ebx ; unused +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + mov esi, POINTER [workspace] + mov edx, POINTER [divisors] + mov edi, JCOEFPTR [coef_block] + mov eax, DCTSIZE2/32 + alignx 16,7 .quantloop: - movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)] - movdqa xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)] - movdqa xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)] - movdqa xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)] - movdqa xmm0,xmm4 - movdqa xmm1,xmm5 - movdqa xmm2,xmm6 - movdqa xmm3,xmm7 - psraw xmm4,(WORD_BIT-1) - psraw xmm5,(WORD_BIT-1) - psraw xmm6,(WORD_BIT-1) - psraw xmm7,(WORD_BIT-1) - pxor xmm0,xmm4 - pxor xmm1,xmm5 - pxor xmm2,xmm6 - pxor xmm3,xmm7 - psubw xmm0,xmm4 ; if (xmm0 < 0) xmm0 = -xmm0; - psubw xmm1,xmm5 ; if (xmm1 < 0) xmm1 = -xmm1; - psubw xmm2,xmm6 ; if (xmm2 < 0) xmm2 = -xmm2; - psubw xmm3,xmm7 ; if (xmm3 < 0) xmm3 = -xmm3; - - paddw xmm0, XMMWORD 
[CORRECTION(0,0,edx)] ; correction + roundfactor - paddw xmm1, XMMWORD [CORRECTION(1,0,edx)] - paddw xmm2, XMMWORD [CORRECTION(2,0,edx)] - paddw xmm3, XMMWORD [CORRECTION(3,0,edx)] - pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,edx)] ; reciprocal - pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,edx)] - pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,edx)] - pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,edx)] - pmulhuw xmm0, XMMWORD [SCALE(0,0,edx)] ; scale - pmulhuw xmm1, XMMWORD [SCALE(1,0,edx)] - pmulhuw xmm2, XMMWORD [SCALE(2,0,edx)] - pmulhuw xmm3, XMMWORD [SCALE(3,0,edx)] - - pxor xmm0,xmm4 - pxor xmm1,xmm5 - pxor xmm2,xmm6 - pxor xmm3,xmm7 - psubw xmm0,xmm4 - psubw xmm1,xmm5 - psubw xmm2,xmm6 - psubw xmm3,xmm7 - movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0 - movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1 - movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2 - movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3 - - add esi, byte 32*SIZEOF_DCTELEM - add edx, byte 32*SIZEOF_DCTELEM - add edi, byte 32*SIZEOF_JCOEF - dec eax - jnz near .quantloop - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; unused -; pop ebx ; unused - pop ebp - ret + movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)] + movdqa xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)] + movdqa xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)] + movdqa xmm0,xmm4 + movdqa xmm1,xmm5 + movdqa xmm2,xmm6 + movdqa xmm3,xmm7 + psraw xmm4,(WORD_BIT-1) + psraw xmm5,(WORD_BIT-1) + psraw xmm6,(WORD_BIT-1) + psraw xmm7,(WORD_BIT-1) + pxor xmm0,xmm4 + pxor xmm1,xmm5 + pxor xmm2,xmm6 + pxor xmm3,xmm7 + psubw xmm0,xmm4 ; if (xmm0 < 0) xmm0 = -xmm0; + psubw xmm1,xmm5 ; if (xmm1 < 0) xmm1 = -xmm1; + psubw xmm2,xmm6 ; if (xmm2 < 0) xmm2 = -xmm2; + psubw xmm3,xmm7 ; if (xmm3 < 0) xmm3 = -xmm3; + + paddw xmm0, XMMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor + paddw xmm1, XMMWORD [CORRECTION(1,0,edx)] + paddw xmm2, XMMWORD 
[CORRECTION(2,0,edx)] + paddw xmm3, XMMWORD [CORRECTION(3,0,edx)] + pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,edx)] ; reciprocal + pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,edx)] + pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,edx)] + pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,edx)] + pmulhuw xmm0, XMMWORD [SCALE(0,0,edx)] ; scale + pmulhuw xmm1, XMMWORD [SCALE(1,0,edx)] + pmulhuw xmm2, XMMWORD [SCALE(2,0,edx)] + pmulhuw xmm3, XMMWORD [SCALE(3,0,edx)] + + pxor xmm0,xmm4 + pxor xmm1,xmm5 + pxor xmm2,xmm6 + pxor xmm3,xmm7 + psubw xmm0,xmm4 + psubw xmm1,xmm5 + psubw xmm2,xmm6 + psubw xmm3,xmm7 + movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2 + movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3 + + add esi, byte 32*SIZEOF_DCTELEM + add edx, byte 32*SIZEOF_DCTELEM + add edi, byte 32*SIZEOF_JCOEF + dec eax + jnz near .quantloop + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused +; pop ebx ; unused + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jcqntsse.asm b/simd/jcqntsse.asm index 3065eca81..2e32d0cb1 100644 --- a/simd/jcqntsse.asm +++ b/simd/jcqntsse.asm @@ -20,8 +20,8 @@ %include "jdct.inc" ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 ; ; Load data into workspace, applying unsigned->signed conversion ; @@ -30,98 +30,98 @@ ; FAST_FLOAT * workspace); ; -%define sample_data ebp+8 ; JSAMPARRAY sample_data -%define start_col ebp+12 ; JDIMENSION start_col -%define workspace ebp+16 ; FAST_FLOAT * workspace +%define sample_data ebp+8 ; JSAMPARRAY sample_data +%define start_col ebp+12 ; JDIMENSION start_col +%define workspace ebp+16 ; FAST_FLOAT * workspace - align 16 - global EXTN(jsimd_convsamp_float_sse) + align 16 + global EXTN(jsimd_convsamp_float_sse) EXTN(jsimd_convsamp_float_sse): - push ebp - mov ebp,esp - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - pcmpeqw mm7,mm7 - psllw mm7,7 - packsswb mm7,mm7 ; mm7 = PB_CENTERJSAMPLE (0x808080..) - - mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) - mov eax, JDIMENSION [start_col] - mov edi, POINTER [workspace] ; (DCTELEM *) - mov ecx, DCTSIZE/2 - alignx 16,7 + push ebp + mov ebp,esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + pcmpeqw mm7,mm7 + psllw mm7,7 + packsswb mm7,mm7 ; mm7 = PB_CENTERJSAMPLE (0x808080..) 
+ + mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) + mov eax, JDIMENSION [start_col] + mov edi, POINTER [workspace] ; (DCTELEM *) + mov ecx, DCTSIZE/2 + alignx 16,7 .convloop: - mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) - mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) - - movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] - movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] - - psubb mm0,mm7 ; mm0=(01234567) - psubb mm1,mm7 ; mm1=(89ABCDEF) - - punpcklbw mm2,mm0 ; mm2=(*0*1*2*3) - punpckhbw mm0,mm0 ; mm0=(*4*5*6*7) - punpcklbw mm3,mm1 ; mm3=(*8*9*A*B) - punpckhbw mm1,mm1 ; mm1=(*C*D*E*F) - - punpcklwd mm4,mm2 ; mm4=(***0***1) - punpckhwd mm2,mm2 ; mm2=(***2***3) - punpcklwd mm5,mm0 ; mm5=(***4***5) - punpckhwd mm0,mm0 ; mm0=(***6***7) - - psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(01) - psrad mm2,(DWORD_BIT-BYTE_BIT) ; mm2=(23) - cvtpi2ps xmm0,mm4 ; xmm0=(01**) - cvtpi2ps xmm1,mm2 ; xmm1=(23**) - psrad mm5,(DWORD_BIT-BYTE_BIT) ; mm5=(45) - psrad mm0,(DWORD_BIT-BYTE_BIT) ; mm0=(67) - cvtpi2ps xmm2,mm5 ; xmm2=(45**) - cvtpi2ps xmm3,mm0 ; xmm3=(67**) - - punpcklwd mm6,mm3 ; mm6=(***8***9) - punpckhwd mm3,mm3 ; mm3=(***A***B) - punpcklwd mm4,mm1 ; mm4=(***C***D) - punpckhwd mm1,mm1 ; mm1=(***E***F) - - psrad mm6,(DWORD_BIT-BYTE_BIT) ; mm6=(89) - psrad mm3,(DWORD_BIT-BYTE_BIT) ; mm3=(AB) - cvtpi2ps xmm4,mm6 ; xmm4=(89**) - cvtpi2ps xmm5,mm3 ; xmm5=(AB**) - psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(CD) - psrad mm1,(DWORD_BIT-BYTE_BIT) ; mm1=(EF) - cvtpi2ps xmm6,mm4 ; xmm6=(CD**) - cvtpi2ps xmm7,mm1 ; xmm7=(EF**) - - movlhps xmm0,xmm1 ; xmm0=(0123) - movlhps xmm2,xmm3 ; xmm2=(4567) - movlhps xmm4,xmm5 ; xmm4=(89AB) - movlhps xmm6,xmm7 ; xmm6=(CDEF) - - movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0 - movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2 - movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4 - movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 - - add esi, byte 2*SIZEOF_JSAMPROW - add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT - dec 
ecx - jnz near .convloop - - emms ; empty MMX state - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - pop ebp - ret + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + + movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] + movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] + + psubb mm0,mm7 ; mm0=(01234567) + psubb mm1,mm7 ; mm1=(89ABCDEF) + + punpcklbw mm2,mm0 ; mm2=(*0*1*2*3) + punpckhbw mm0,mm0 ; mm0=(*4*5*6*7) + punpcklbw mm3,mm1 ; mm3=(*8*9*A*B) + punpckhbw mm1,mm1 ; mm1=(*C*D*E*F) + + punpcklwd mm4,mm2 ; mm4=(***0***1) + punpckhwd mm2,mm2 ; mm2=(***2***3) + punpcklwd mm5,mm0 ; mm5=(***4***5) + punpckhwd mm0,mm0 ; mm0=(***6***7) + + psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(01) + psrad mm2,(DWORD_BIT-BYTE_BIT) ; mm2=(23) + cvtpi2ps xmm0,mm4 ; xmm0=(01**) + cvtpi2ps xmm1,mm2 ; xmm1=(23**) + psrad mm5,(DWORD_BIT-BYTE_BIT) ; mm5=(45) + psrad mm0,(DWORD_BIT-BYTE_BIT) ; mm0=(67) + cvtpi2ps xmm2,mm5 ; xmm2=(45**) + cvtpi2ps xmm3,mm0 ; xmm3=(67**) + + punpcklwd mm6,mm3 ; mm6=(***8***9) + punpckhwd mm3,mm3 ; mm3=(***A***B) + punpcklwd mm4,mm1 ; mm4=(***C***D) + punpckhwd mm1,mm1 ; mm1=(***E***F) + + psrad mm6,(DWORD_BIT-BYTE_BIT) ; mm6=(89) + psrad mm3,(DWORD_BIT-BYTE_BIT) ; mm3=(AB) + cvtpi2ps xmm4,mm6 ; xmm4=(89**) + cvtpi2ps xmm5,mm3 ; xmm5=(AB**) + psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(CD) + psrad mm1,(DWORD_BIT-BYTE_BIT) ; mm1=(EF) + cvtpi2ps xmm6,mm4 ; xmm6=(CD**) + cvtpi2ps xmm7,mm1 ; xmm7=(EF**) + + movlhps xmm0,xmm1 ; xmm0=(0123) + movlhps xmm2,xmm3 ; xmm2=(4567) + movlhps xmm4,xmm5 ; xmm4=(89AB) + movlhps xmm6,xmm7 ; xmm6=(CDEF) + + movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4 + movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 + + add esi, byte 2*SIZEOF_JSAMPROW + add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT + dec ecx + 
jnz near .convloop + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret ; -------------------------------------------------------------------------- @@ -133,79 +133,79 @@ EXTN(jsimd_convsamp_float_sse): ; FAST_FLOAT * workspace); ; -%define coef_block ebp+8 ; JCOEFPTR coef_block -%define divisors ebp+12 ; FAST_FLOAT * divisors -%define workspace ebp+16 ; FAST_FLOAT * workspace +%define coef_block ebp+8 ; JCOEFPTR coef_block +%define divisors ebp+12 ; FAST_FLOAT * divisors +%define workspace ebp+16 ; FAST_FLOAT * workspace - align 16 - global EXTN(jsimd_quantize_float_sse) + align 16 + global EXTN(jsimd_quantize_float_sse) EXTN(jsimd_quantize_float_sse): - push ebp - mov ebp,esp -; push ebx ; unused -; push ecx ; unused -; push edx ; need not be preserved - push esi - push edi - - mov esi, POINTER [workspace] - mov edx, POINTER [divisors] - mov edi, JCOEFPTR [coef_block] - mov eax, DCTSIZE2/16 - alignx 16,7 + push ebp + mov ebp,esp +; push ebx ; unused +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + mov esi, POINTER [workspace] + mov edx, POINTER [divisors] + mov edi, JCOEFPTR [coef_block] + mov eax, DCTSIZE2/16 + alignx 16,7 .quantloop: - movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)] - mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] - mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] - movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)] - mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] - mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] - - movhlps xmm4,xmm0 - movhlps xmm5,xmm1 - - cvtps2pi mm0,xmm0 - cvtps2pi mm1,xmm1 - cvtps2pi mm4,xmm4 - cvtps2pi mm5,xmm5 - - movhlps xmm6,xmm2 - movhlps xmm7,xmm3 - - cvtps2pi mm2,xmm2 - cvtps2pi mm3,xmm3 - cvtps2pi mm6,xmm6 - 
cvtps2pi mm7,xmm7 - - packssdw mm0,mm4 - packssdw mm1,mm5 - packssdw mm2,mm6 - packssdw mm3,mm7 - - movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 - movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1 - movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2 - movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3 - - add esi, byte 16*SIZEOF_FAST_FLOAT - add edx, byte 16*SIZEOF_FAST_FLOAT - add edi, byte 16*SIZEOF_JCOEF - dec eax - jnz short .quantloop - - emms ; empty MMX state - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; unused -; pop ebx ; unused - pop ebp - ret + movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)] + mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] + mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)] + mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] + mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] + + movhlps xmm4,xmm0 + movhlps xmm5,xmm1 + + cvtps2pi mm0,xmm0 + cvtps2pi mm1,xmm1 + cvtps2pi mm4,xmm4 + cvtps2pi mm5,xmm5 + + movhlps xmm6,xmm2 + movhlps xmm7,xmm3 + + cvtps2pi mm2,xmm2 + cvtps2pi mm3,xmm3 + cvtps2pi mm6,xmm6 + cvtps2pi mm7,xmm7 + + packssdw mm0,mm4 + packssdw mm1,mm5 + packssdw mm2,mm6 + packssdw mm3,mm7 + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3 + + add esi, byte 16*SIZEOF_FAST_FLOAT + add edx, byte 16*SIZEOF_FAST_FLOAT + add edi, byte 16*SIZEOF_JCOEF + dec eax + jnz short .quantloop + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused +; pop ebx ; unused + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jcsammmx.asm b/simd/jcsammmx.asm index 9e43b2f85..9ab1518f2 100644 --- a/simd/jcsammmx.asm +++ b/simd/jcsammmx.asm @@ -19,8 +19,8 @@ %include "jsimdext.inc" ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 ; ; Downsample pixel values of a single component. ; This version handles the common case of 2:1 horizontal and 1:1 vertical, @@ -32,135 +32,135 @@ ; JSAMPARRAY input_data, JSAMPARRAY output_data); ; -%define img_width(b) (b)+8 ; JDIMENSION image_width -%define max_v_samp(b) (b)+12 ; int max_v_samp_factor -%define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor -%define width_blks(b) (b)+20 ; JDIMENSION width_blocks -%define input_data(b) (b)+24 ; JSAMPARRAY input_data -%define output_data(b) (b)+28 ; JSAMPARRAY output_data +%define img_width(b) (b)+8 ; JDIMENSION image_width +%define max_v_samp(b) (b)+12 ; int max_v_samp_factor +%define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor +%define width_blks(b) (b)+20 ; JDIMENSION width_blocks +%define input_data(b) (b)+24 ; JSAMPARRAY input_data +%define output_data(b) (b)+28 ; JSAMPARRAY output_data - align 16 - global EXTN(jsimd_h2v1_downsample_mmx) + align 16 + global EXTN(jsimd_h2v1_downsample_mmx) EXTN(jsimd_h2v1_downsample_mmx): - push ebp - mov ebp,esp -; push ebx ; unused -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - mov ecx, JDIMENSION [width_blks(ebp)] - shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols) - jz near .return - - mov edx, JDIMENSION [img_width(ebp)] - - ; -- expand_right_edge - - push ecx - shl ecx,1 ; output_cols * 2 - sub ecx,edx - jle short .expand_end - - mov eax, INT [max_v_samp(ebp)] - test eax,eax - jle short .expand_end - - cld - mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - alignx 16,7 + push ebp + mov ebp,esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi 
+ push edi + + mov ecx, JDIMENSION [width_blks(ebp)] + shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols) + jz near .return + + mov edx, JDIMENSION [img_width(ebp)] + + ; -- expand_right_edge + + push ecx + shl ecx,1 ; output_cols * 2 + sub ecx,edx + jle short .expand_end + + mov eax, INT [max_v_samp(ebp)] + test eax,eax + jle short .expand_end + + cld + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + alignx 16,7 .expandloop: - push eax - push ecx + push eax + push ecx - mov edi, JSAMPROW [esi] - add edi,edx - mov al, JSAMPLE [edi-1] + mov edi, JSAMPROW [esi] + add edi,edx + mov al, JSAMPLE [edi-1] - rep stosb + rep stosb - pop ecx - pop eax + pop ecx + pop eax - add esi, byte SIZEOF_JSAMPROW - dec eax - jg short .expandloop + add esi, byte SIZEOF_JSAMPROW + dec eax + jg short .expandloop .expand_end: - pop ecx ; output_cols + pop ecx ; output_cols - ; -- h2v1_downsample + ; -- h2v1_downsample - mov eax, JDIMENSION [v_samp(ebp)] ; rowctr - test eax,eax - jle near .return + mov eax, JDIMENSION [v_samp(ebp)] ; rowctr + test eax,eax + jle near .return - mov edx, 0x00010000 ; bias pattern - movd mm7,edx - pcmpeqw mm6,mm6 - punpckldq mm7,mm7 ; mm7={0, 1, 0, 1} - psrlw mm6,BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..} + mov edx, 0x00010000 ; bias pattern + movd mm7,edx + pcmpeqw mm6,mm6 + punpckldq mm7,mm7 ; mm7={0, 1, 0, 1} + psrlw mm6,BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..} - mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - mov edi, JSAMPARRAY [output_data(ebp)] ; output_data - alignx 16,7 + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, JSAMPARRAY [output_data(ebp)] ; output_data + alignx 16,7 .rowloop: - push ecx - push edi - push esi + push ecx + push edi + push esi - mov esi, JSAMPROW [esi] ; inptr - mov edi, JSAMPROW [edi] ; outptr - alignx 16,7 + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr + alignx 16,7 .columnloop: - movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] - movq mm1, MMWORD [esi+1*SIZEOF_MMWORD] - movq mm2,mm0 - movq 
mm3,mm1 + movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] + movq mm1, MMWORD [esi+1*SIZEOF_MMWORD] + movq mm2,mm0 + movq mm3,mm1 - pand mm0,mm6 - psrlw mm2,BYTE_BIT - pand mm1,mm6 - psrlw mm3,BYTE_BIT + pand mm0,mm6 + psrlw mm2,BYTE_BIT + pand mm1,mm6 + psrlw mm3,BYTE_BIT - paddw mm0,mm2 - paddw mm1,mm3 - paddw mm0,mm7 - paddw mm1,mm7 - psrlw mm0,1 - psrlw mm1,1 + paddw mm0,mm2 + paddw mm1,mm3 + paddw mm0,mm7 + paddw mm1,mm7 + psrlw mm0,1 + psrlw mm1,1 - packuswb mm0,mm1 + packuswb mm0,mm1 - movq MMWORD [edi+0*SIZEOF_MMWORD], mm0 + movq MMWORD [edi+0*SIZEOF_MMWORD], mm0 - add esi, byte 2*SIZEOF_MMWORD ; inptr - add edi, byte 1*SIZEOF_MMWORD ; outptr - sub ecx, byte SIZEOF_MMWORD ; outcol - jnz short .columnloop + add esi, byte 2*SIZEOF_MMWORD ; inptr + add edi, byte 1*SIZEOF_MMWORD ; outptr + sub ecx, byte SIZEOF_MMWORD ; outcol + jnz short .columnloop - pop esi - pop edi - pop ecx + pop esi + pop edi + pop ecx - add esi, byte SIZEOF_JSAMPROW ; input_data - add edi, byte SIZEOF_JSAMPROW ; output_data - dec eax ; rowctr - jg short .rowloop + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec eax ; rowctr + jg short .rowloop - emms ; empty MMX state + emms ; empty MMX state .return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved -; pop ebx ; unused - pop ebp - ret + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret ; -------------------------------------------------------------------------- ; @@ -174,151 +174,151 @@ EXTN(jsimd_h2v1_downsample_mmx): ; JSAMPARRAY input_data, JSAMPARRAY output_data); ; -%define img_width(b) (b)+8 ; JDIMENSION image_width -%define max_v_samp(b) (b)+12 ; int max_v_samp_factor -%define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor -%define width_blks(b) (b)+20 ; JDIMENSION width_blocks -%define input_data(b) (b)+24 ; JSAMPARRAY input_data -%define output_data(b) (b)+28 ; JSAMPARRAY 
output_data +%define img_width(b) (b)+8 ; JDIMENSION image_width +%define max_v_samp(b) (b)+12 ; int max_v_samp_factor +%define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor +%define width_blks(b) (b)+20 ; JDIMENSION width_blocks +%define input_data(b) (b)+24 ; JSAMPARRAY input_data +%define output_data(b) (b)+28 ; JSAMPARRAY output_data - align 16 - global EXTN(jsimd_h2v2_downsample_mmx) + align 16 + global EXTN(jsimd_h2v2_downsample_mmx) EXTN(jsimd_h2v2_downsample_mmx): - push ebp - mov ebp,esp -; push ebx ; unused -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - mov ecx, JDIMENSION [width_blks(ebp)] - shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols) - jz near .return - - mov edx, JDIMENSION [img_width(ebp)] - - ; -- expand_right_edge - - push ecx - shl ecx,1 ; output_cols * 2 - sub ecx,edx - jle short .expand_end - - mov eax, INT [max_v_samp(ebp)] - test eax,eax - jle short .expand_end - - cld - mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - alignx 16,7 + push ebp + mov ebp,esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov ecx, JDIMENSION [width_blks(ebp)] + shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols) + jz near .return + + mov edx, JDIMENSION [img_width(ebp)] + + ; -- expand_right_edge + + push ecx + shl ecx,1 ; output_cols * 2 + sub ecx,edx + jle short .expand_end + + mov eax, INT [max_v_samp(ebp)] + test eax,eax + jle short .expand_end + + cld + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + alignx 16,7 .expandloop: - push eax - push ecx + push eax + push ecx - mov edi, JSAMPROW [esi] - add edi,edx - mov al, JSAMPLE [edi-1] + mov edi, JSAMPROW [esi] + add edi,edx + mov al, JSAMPLE [edi-1] - rep stosb + rep stosb - pop ecx - pop eax + pop ecx + pop eax - add esi, byte SIZEOF_JSAMPROW - dec eax - jg short .expandloop + add esi, byte SIZEOF_JSAMPROW + dec eax + jg short .expandloop .expand_end: - pop ecx ; output_cols + pop 
ecx ; output_cols - ; -- h2v2_downsample + ; -- h2v2_downsample - mov eax, JDIMENSION [v_samp(ebp)] ; rowctr - test eax,eax - jle near .return + mov eax, JDIMENSION [v_samp(ebp)] ; rowctr + test eax,eax + jle near .return - mov edx, 0x00020001 ; bias pattern - movd mm7,edx - pcmpeqw mm6,mm6 - punpckldq mm7,mm7 ; mm7={1, 2, 1, 2} - psrlw mm6,BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..} + mov edx, 0x00020001 ; bias pattern + movd mm7,edx + pcmpeqw mm6,mm6 + punpckldq mm7,mm7 ; mm7={1, 2, 1, 2} + psrlw mm6,BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..} - mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - mov edi, JSAMPARRAY [output_data(ebp)] ; output_data - alignx 16,7 + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, JSAMPARRAY [output_data(ebp)] ; output_data + alignx 16,7 .rowloop: - push ecx - push edi - push esi - - mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 - mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1 - mov edi, JSAMPROW [edi] ; outptr - alignx 16,7 + push ecx + push edi + push esi + + mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 + mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1 + mov edi, JSAMPROW [edi] ; outptr + alignx 16,7 .columnloop: - movq mm0, MMWORD [edx+0*SIZEOF_MMWORD] - movq mm1, MMWORD [esi+0*SIZEOF_MMWORD] - movq mm2, MMWORD [edx+1*SIZEOF_MMWORD] - movq mm3, MMWORD [esi+1*SIZEOF_MMWORD] - - movq mm4,mm0 - movq mm5,mm1 - pand mm0,mm6 - psrlw mm4,BYTE_BIT - pand mm1,mm6 - psrlw mm5,BYTE_BIT - paddw mm0,mm4 - paddw mm1,mm5 - - movq mm4,mm2 - movq mm5,mm3 - pand mm2,mm6 - psrlw mm4,BYTE_BIT - pand mm3,mm6 - psrlw mm5,BYTE_BIT - paddw mm2,mm4 - paddw mm3,mm5 - - paddw mm0,mm1 - paddw mm2,mm3 - paddw mm0,mm7 - paddw mm2,mm7 - psrlw mm0,2 - psrlw mm2,2 - - packuswb mm0,mm2 - - movq MMWORD [edi+0*SIZEOF_MMWORD], mm0 - - add edx, byte 2*SIZEOF_MMWORD ; inptr0 - add esi, byte 2*SIZEOF_MMWORD ; inptr1 - add edi, byte 1*SIZEOF_MMWORD ; outptr - sub ecx, byte SIZEOF_MMWORD ; outcol - jnz near .columnloop - - pop esi - pop edi - pop 
ecx - - add esi, byte 2*SIZEOF_JSAMPROW ; input_data - add edi, byte 1*SIZEOF_JSAMPROW ; output_data - dec eax ; rowctr - jg near .rowloop - - emms ; empty MMX state + movq mm0, MMWORD [edx+0*SIZEOF_MMWORD] + movq mm1, MMWORD [esi+0*SIZEOF_MMWORD] + movq mm2, MMWORD [edx+1*SIZEOF_MMWORD] + movq mm3, MMWORD [esi+1*SIZEOF_MMWORD] + + movq mm4,mm0 + movq mm5,mm1 + pand mm0,mm6 + psrlw mm4,BYTE_BIT + pand mm1,mm6 + psrlw mm5,BYTE_BIT + paddw mm0,mm4 + paddw mm1,mm5 + + movq mm4,mm2 + movq mm5,mm3 + pand mm2,mm6 + psrlw mm4,BYTE_BIT + pand mm3,mm6 + psrlw mm5,BYTE_BIT + paddw mm2,mm4 + paddw mm3,mm5 + + paddw mm0,mm1 + paddw mm2,mm3 + paddw mm0,mm7 + paddw mm2,mm7 + psrlw mm0,2 + psrlw mm2,2 + + packuswb mm0,mm2 + + movq MMWORD [edi+0*SIZEOF_MMWORD], mm0 + + add edx, byte 2*SIZEOF_MMWORD ; inptr0 + add esi, byte 2*SIZEOF_MMWORD ; inptr1 + add edi, byte 1*SIZEOF_MMWORD ; outptr + sub ecx, byte SIZEOF_MMWORD ; outcol + jnz near .columnloop + + pop esi + pop edi + pop ecx + + add esi, byte 2*SIZEOF_JSAMPROW ; input_data + add edi, byte 1*SIZEOF_JSAMPROW ; output_data + dec eax ; rowctr + jg near .rowloop + + emms ; empty MMX state .return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved -; pop ebx ; unused - pop ebp - ret + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/jcsamss2-64.asm b/simd/jcsamss2-64.asm index 6a16dc5f7..6c50d9c54 100644 --- a/simd/jcsamss2-64.asm +++ b/simd/jcsamss2-64.asm @@ -20,8 +20,8 @@ %include "jsimdext.inc" ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 64 + SECTION SEG_TEXT + BITS 64 ; ; Downsample pixel values of a single component. 
; This version handles the common case of 2:1 horizontal and 1:1 vertical, @@ -40,130 +40,130 @@ ; r14 = JSAMPARRAY input_data ; r15 = JSAMPARRAY output_data - align 16 - global EXTN(jsimd_h2v1_downsample_sse2) + align 16 + global EXTN(jsimd_h2v1_downsample_sse2) EXTN(jsimd_h2v1_downsample_sse2): - push rbp - mov rax,rsp - mov rbp,rsp - collect_args + push rbp + mov rax,rsp + mov rbp,rsp + collect_args - mov rcx, r13 - shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols) - jz near .return + mov rcx, r13 + shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols) + jz near .return - mov rdx, r10 + mov rdx, r10 - ; -- expand_right_edge + ; -- expand_right_edge - push rcx - shl rcx,1 ; output_cols * 2 - sub rcx,rdx - jle short .expand_end + push rcx + shl rcx,1 ; output_cols * 2 + sub rcx,rdx + jle short .expand_end - mov rax, r11 - test rax,rax - jle short .expand_end + mov rax, r11 + test rax,rax + jle short .expand_end - cld - mov rsi, r14 ; input_data + cld + mov rsi, r14 ; input_data .expandloop: - push rax - push rcx + push rax + push rcx - mov rdi, JSAMPROW [rsi] - add rdi,rdx - mov al, JSAMPLE [rdi-1] + mov rdi, JSAMPROW [rsi] + add rdi,rdx + mov al, JSAMPLE [rdi-1] - rep stosb + rep stosb - pop rcx - pop rax + pop rcx + pop rax - add rsi, byte SIZEOF_JSAMPROW - dec rax - jg short .expandloop + add rsi, byte SIZEOF_JSAMPROW + dec rax + jg short .expandloop .expand_end: - pop rcx ; output_cols + pop rcx ; output_cols - ; -- h2v1_downsample + ; -- h2v1_downsample - mov rax, r12 ; rowctr - test eax,eax - jle near .return + mov rax, r12 ; rowctr + test eax,eax + jle near .return - mov rdx, 0x00010000 ; bias pattern - movd xmm7,edx - pcmpeqw xmm6,xmm6 - pshufd xmm7,xmm7,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1} - psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} + mov rdx, 0x00010000 ; bias pattern + movd xmm7,edx + pcmpeqw xmm6,xmm6 + pshufd xmm7,xmm7,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1} + psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} - mov rsi, r14 ; input_data - mov rdi, 
r15 ; output_data + mov rsi, r14 ; input_data + mov rdi, r15 ; output_data .rowloop: - push rcx - push rdi - push rsi + push rcx + push rdi + push rsi - mov rsi, JSAMPROW [rsi] ; inptr - mov rdi, JSAMPROW [rdi] ; outptr + mov rsi, JSAMPROW [rsi] ; inptr + mov rdi, JSAMPROW [rdi] ; outptr - cmp rcx, byte SIZEOF_XMMWORD - jae short .columnloop + cmp rcx, byte SIZEOF_XMMWORD + jae short .columnloop .columnloop_r8: - movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] - pxor xmm1,xmm1 - mov rcx, SIZEOF_XMMWORD - jmp short .downsample + movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] + pxor xmm1,xmm1 + mov rcx, SIZEOF_XMMWORD + jmp short .downsample .columnloop: - movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] - movdqa xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD] + movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD] .downsample: - movdqa xmm2,xmm0 - movdqa xmm3,xmm1 - - pand xmm0,xmm6 - psrlw xmm2,BYTE_BIT - pand xmm1,xmm6 - psrlw xmm3,BYTE_BIT - - paddw xmm0,xmm2 - paddw xmm1,xmm3 - paddw xmm0,xmm7 - paddw xmm1,xmm7 - psrlw xmm0,1 - psrlw xmm1,1 - - packuswb xmm0,xmm1 - - movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 - - sub rcx, byte SIZEOF_XMMWORD ; outcol - add rsi, byte 2*SIZEOF_XMMWORD ; inptr - add rdi, byte 1*SIZEOF_XMMWORD ; outptr - cmp rcx, byte SIZEOF_XMMWORD - jae short .columnloop - test rcx,rcx - jnz short .columnloop_r8 - - pop rsi - pop rdi - pop rcx - - add rsi, byte SIZEOF_JSAMPROW ; input_data - add rdi, byte SIZEOF_JSAMPROW ; output_data - dec rax ; rowctr - jg near .rowloop + movdqa xmm2,xmm0 + movdqa xmm3,xmm1 + + pand xmm0,xmm6 + psrlw xmm2,BYTE_BIT + pand xmm1,xmm6 + psrlw xmm3,BYTE_BIT + + paddw xmm0,xmm2 + paddw xmm1,xmm3 + paddw xmm0,xmm7 + paddw xmm1,xmm7 + psrlw xmm0,1 + psrlw xmm1,1 + + packuswb xmm0,xmm1 + + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 + + sub rcx, byte SIZEOF_XMMWORD ; outcol + add rsi, byte 2*SIZEOF_XMMWORD ; inptr + add rdi, byte 1*SIZEOF_XMMWORD ; outptr + cmp rcx, byte SIZEOF_XMMWORD + jae short 
.columnloop + test rcx,rcx + jnz short .columnloop_r8 + + pop rsi + pop rdi + pop rcx + + add rsi, byte SIZEOF_JSAMPROW ; input_data + add rdi, byte SIZEOF_JSAMPROW ; output_data + dec rax ; rowctr + jg near .rowloop .return: - uncollect_args - pop rbp - ret + uncollect_args + pop rbp + ret ; -------------------------------------------------------------------------- ; @@ -184,147 +184,147 @@ EXTN(jsimd_h2v1_downsample_sse2): ; r14 = JSAMPARRAY input_data ; r15 = JSAMPARRAY output_data - align 16 - global EXTN(jsimd_h2v2_downsample_sse2) + align 16 + global EXTN(jsimd_h2v2_downsample_sse2) EXTN(jsimd_h2v2_downsample_sse2): - push rbp - mov rax,rsp - mov rbp,rsp - collect_args + push rbp + mov rax,rsp + mov rbp,rsp + collect_args - mov rcx, r13 - shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols) - jz near .return + mov rcx, r13 + shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols) + jz near .return - mov rdx, r10 + mov rdx, r10 - ; -- expand_right_edge + ; -- expand_right_edge - push rcx - shl rcx,1 ; output_cols * 2 - sub rcx,rdx - jle short .expand_end + push rcx + shl rcx,1 ; output_cols * 2 + sub rcx,rdx + jle short .expand_end - mov rax, r11 - test rax,rax - jle short .expand_end + mov rax, r11 + test rax,rax + jle short .expand_end - cld - mov rsi, r14 ; input_data + cld + mov rsi, r14 ; input_data .expandloop: - push rax - push rcx + push rax + push rcx - mov rdi, JSAMPROW [rsi] - add rdi,rdx - mov al, JSAMPLE [rdi-1] + mov rdi, JSAMPROW [rsi] + add rdi,rdx + mov al, JSAMPLE [rdi-1] - rep stosb + rep stosb - pop rcx - pop rax + pop rcx + pop rax - add rsi, byte SIZEOF_JSAMPROW - dec rax - jg short .expandloop + add rsi, byte SIZEOF_JSAMPROW + dec rax + jg short .expandloop .expand_end: - pop rcx ; output_cols + pop rcx ; output_cols - ; -- h2v2_downsample + ; -- h2v2_downsample - mov rax, r12 ; rowctr - test rax,rax - jle near .return + mov rax, r12 ; rowctr + test rax,rax + jle near .return - mov rdx, 0x00020001 ; bias pattern - movd xmm7,edx - pcmpeqw xmm6,xmm6 
- pshufd xmm7,xmm7,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2} - psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} + mov rdx, 0x00020001 ; bias pattern + movd xmm7,edx + pcmpeqw xmm6,xmm6 + pshufd xmm7,xmm7,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2} + psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} - mov rsi, r14 ; input_data - mov rdi, r15 ; output_data + mov rsi, r14 ; input_data + mov rdi, r15 ; output_data .rowloop: - push rcx - push rdi - push rsi + push rcx + push rdi + push rsi - mov rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0 - mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1 - mov rdi, JSAMPROW [rdi] ; outptr + mov rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0 + mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1 + mov rdi, JSAMPROW [rdi] ; outptr - cmp rcx, byte SIZEOF_XMMWORD - jae short .columnloop + cmp rcx, byte SIZEOF_XMMWORD + jae short .columnloop .columnloop_r8: - movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD] - movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD] - pxor xmm2,xmm2 - pxor xmm3,xmm3 - mov rcx, SIZEOF_XMMWORD - jmp short .downsample + movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD] + pxor xmm2,xmm2 + pxor xmm3,xmm3 + mov rcx, SIZEOF_XMMWORD + jmp short .downsample .columnloop: - movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD] - movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD] - movdqa xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD] - movdqa xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD] + movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqa xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD] + movdqa xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD] .downsample: - movdqa xmm4,xmm0 - movdqa xmm5,xmm1 - pand xmm0,xmm6 - psrlw xmm4,BYTE_BIT - pand xmm1,xmm6 - psrlw xmm5,BYTE_BIT - paddw xmm0,xmm4 - paddw xmm1,xmm5 - - movdqa xmm4,xmm2 - movdqa xmm5,xmm3 - pand xmm2,xmm6 - psrlw xmm4,BYTE_BIT - pand xmm3,xmm6 - psrlw xmm5,BYTE_BIT - paddw xmm2,xmm4 - paddw xmm3,xmm5 - - paddw xmm0,xmm1 - paddw xmm2,xmm3 - paddw xmm0,xmm7 
- paddw xmm2,xmm7 - psrlw xmm0,2 - psrlw xmm2,2 - - packuswb xmm0,xmm2 - - movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 - - sub rcx, byte SIZEOF_XMMWORD ; outcol - add rdx, byte 2*SIZEOF_XMMWORD ; inptr0 - add rsi, byte 2*SIZEOF_XMMWORD ; inptr1 - add rdi, byte 1*SIZEOF_XMMWORD ; outptr - cmp rcx, byte SIZEOF_XMMWORD - jae near .columnloop - test rcx,rcx - jnz near .columnloop_r8 - - pop rsi - pop rdi - pop rcx - - add rsi, byte 2*SIZEOF_JSAMPROW ; input_data - add rdi, byte 1*SIZEOF_JSAMPROW ; output_data - dec rax ; rowctr - jg near .rowloop + movdqa xmm4,xmm0 + movdqa xmm5,xmm1 + pand xmm0,xmm6 + psrlw xmm4,BYTE_BIT + pand xmm1,xmm6 + psrlw xmm5,BYTE_BIT + paddw xmm0,xmm4 + paddw xmm1,xmm5 + + movdqa xmm4,xmm2 + movdqa xmm5,xmm3 + pand xmm2,xmm6 + psrlw xmm4,BYTE_BIT + pand xmm3,xmm6 + psrlw xmm5,BYTE_BIT + paddw xmm2,xmm4 + paddw xmm3,xmm5 + + paddw xmm0,xmm1 + paddw xmm2,xmm3 + paddw xmm0,xmm7 + paddw xmm2,xmm7 + psrlw xmm0,2 + psrlw xmm2,2 + + packuswb xmm0,xmm2 + + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 + + sub rcx, byte SIZEOF_XMMWORD ; outcol + add rdx, byte 2*SIZEOF_XMMWORD ; inptr0 + add rsi, byte 2*SIZEOF_XMMWORD ; inptr1 + add rdi, byte 1*SIZEOF_XMMWORD ; outptr + cmp rcx, byte SIZEOF_XMMWORD + jae near .columnloop + test rcx,rcx + jnz near .columnloop_r8 + + pop rsi + pop rdi + pop rcx + + add rsi, byte 2*SIZEOF_JSAMPROW ; input_data + add rdi, byte 1*SIZEOF_JSAMPROW ; output_data + dec rax ; rowctr + jg near .rowloop .return: - uncollect_args - pop rbp - ret + uncollect_args + pop rbp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jcsamss2.asm b/simd/jcsamss2.asm index 818e911df..736184357 100644 --- a/simd/jcsamss2.asm +++ b/simd/jcsamss2.asm @@ -19,8 +19,8 @@ %include "jsimdext.inc" ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 ; ; Downsample pixel values of a single component. ; This version handles the common case of 2:1 horizontal and 1:1 vertical, @@ -32,148 +32,148 @@ ; JSAMPARRAY input_data, JSAMPARRAY output_data); ; -%define img_width(b) (b)+8 ; JDIMENSION image_width -%define max_v_samp(b) (b)+12 ; int max_v_samp_factor -%define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor -%define width_blks(b) (b)+20 ; JDIMENSION width_blocks -%define input_data(b) (b)+24 ; JSAMPARRAY input_data -%define output_data(b) (b)+28 ; JSAMPARRAY output_data +%define img_width(b) (b)+8 ; JDIMENSION image_width +%define max_v_samp(b) (b)+12 ; int max_v_samp_factor +%define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor +%define width_blks(b) (b)+20 ; JDIMENSION width_blocks +%define input_data(b) (b)+24 ; JSAMPARRAY input_data +%define output_data(b) (b)+28 ; JSAMPARRAY output_data - align 16 - global EXTN(jsimd_h2v1_downsample_sse2) + align 16 + global EXTN(jsimd_h2v1_downsample_sse2) EXTN(jsimd_h2v1_downsample_sse2): - push ebp - mov ebp,esp -; push ebx ; unused -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - mov ecx, JDIMENSION [width_blks(ebp)] - shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols) - jz near .return - - mov edx, JDIMENSION [img_width(ebp)] - - ; -- expand_right_edge - - push ecx - shl ecx,1 ; output_cols * 2 - sub ecx,edx - jle short .expand_end - - mov eax, INT [max_v_samp(ebp)] - test eax,eax - jle short .expand_end - - cld - mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - alignx 16,7 + push ebp + mov ebp,esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push 
esi + push edi + + mov ecx, JDIMENSION [width_blks(ebp)] + shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols) + jz near .return + + mov edx, JDIMENSION [img_width(ebp)] + + ; -- expand_right_edge + + push ecx + shl ecx,1 ; output_cols * 2 + sub ecx,edx + jle short .expand_end + + mov eax, INT [max_v_samp(ebp)] + test eax,eax + jle short .expand_end + + cld + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + alignx 16,7 .expandloop: - push eax - push ecx + push eax + push ecx - mov edi, JSAMPROW [esi] - add edi,edx - mov al, JSAMPLE [edi-1] + mov edi, JSAMPROW [esi] + add edi,edx + mov al, JSAMPLE [edi-1] - rep stosb + rep stosb - pop ecx - pop eax + pop ecx + pop eax - add esi, byte SIZEOF_JSAMPROW - dec eax - jg short .expandloop + add esi, byte SIZEOF_JSAMPROW + dec eax + jg short .expandloop .expand_end: - pop ecx ; output_cols + pop ecx ; output_cols - ; -- h2v1_downsample + ; -- h2v1_downsample - mov eax, JDIMENSION [v_samp(ebp)] ; rowctr - test eax,eax - jle near .return + mov eax, JDIMENSION [v_samp(ebp)] ; rowctr + test eax,eax + jle near .return - mov edx, 0x00010000 ; bias pattern - movd xmm7,edx - pcmpeqw xmm6,xmm6 - pshufd xmm7,xmm7,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1} - psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} + mov edx, 0x00010000 ; bias pattern + movd xmm7,edx + pcmpeqw xmm6,xmm6 + pshufd xmm7,xmm7,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1} + psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} - mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - mov edi, JSAMPARRAY [output_data(ebp)] ; output_data - alignx 16,7 + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, JSAMPARRAY [output_data(ebp)] ; output_data + alignx 16,7 .rowloop: - push ecx - push edi - push esi + push ecx + push edi + push esi - mov esi, JSAMPROW [esi] ; inptr - mov edi, JSAMPROW [edi] ; outptr + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr - cmp ecx, byte SIZEOF_XMMWORD - jae short .columnloop - alignx 16,7 + cmp ecx, byte SIZEOF_XMMWORD + 
jae short .columnloop + alignx 16,7 .columnloop_r8: - movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] - pxor xmm1,xmm1 - mov ecx, SIZEOF_XMMWORD - jmp short .downsample - alignx 16,7 + movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] + pxor xmm1,xmm1 + mov ecx, SIZEOF_XMMWORD + jmp short .downsample + alignx 16,7 .columnloop: - movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] - movdqa xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD] .downsample: - movdqa xmm2,xmm0 - movdqa xmm3,xmm1 - - pand xmm0,xmm6 - psrlw xmm2,BYTE_BIT - pand xmm1,xmm6 - psrlw xmm3,BYTE_BIT - - paddw xmm0,xmm2 - paddw xmm1,xmm3 - paddw xmm0,xmm7 - paddw xmm1,xmm7 - psrlw xmm0,1 - psrlw xmm1,1 - - packuswb xmm0,xmm1 - - movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 - - sub ecx, byte SIZEOF_XMMWORD ; outcol - add esi, byte 2*SIZEOF_XMMWORD ; inptr - add edi, byte 1*SIZEOF_XMMWORD ; outptr - cmp ecx, byte SIZEOF_XMMWORD - jae short .columnloop - test ecx,ecx - jnz short .columnloop_r8 - - pop esi - pop edi - pop ecx - - add esi, byte SIZEOF_JSAMPROW ; input_data - add edi, byte SIZEOF_JSAMPROW ; output_data - dec eax ; rowctr - jg near .rowloop + movdqa xmm2,xmm0 + movdqa xmm3,xmm1 + + pand xmm0,xmm6 + psrlw xmm2,BYTE_BIT + pand xmm1,xmm6 + psrlw xmm3,BYTE_BIT + + paddw xmm0,xmm2 + paddw xmm1,xmm3 + paddw xmm0,xmm7 + paddw xmm1,xmm7 + psrlw xmm0,1 + psrlw xmm1,1 + + packuswb xmm0,xmm1 + + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 + + sub ecx, byte SIZEOF_XMMWORD ; outcol + add esi, byte 2*SIZEOF_XMMWORD ; inptr + add edi, byte 1*SIZEOF_XMMWORD ; outptr + cmp ecx, byte SIZEOF_XMMWORD + jae short .columnloop + test ecx,ecx + jnz short .columnloop_r8 + + pop esi + pop edi + pop ecx + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec eax ; rowctr + jg near .rowloop .return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved -; pop ebx ; unused - pop 
ebp - ret + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret ; -------------------------------------------------------------------------- ; @@ -187,165 +187,165 @@ EXTN(jsimd_h2v1_downsample_sse2): ; JSAMPARRAY input_data, JSAMPARRAY output_data); ; -%define img_width(b) (b)+8 ; JDIMENSION image_width -%define max_v_samp(b) (b)+12 ; int max_v_samp_factor -%define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor -%define width_blks(b) (b)+20 ; JDIMENSION width_blocks -%define input_data(b) (b)+24 ; JSAMPARRAY input_data -%define output_data(b) (b)+28 ; JSAMPARRAY output_data +%define img_width(b) (b)+8 ; JDIMENSION image_width +%define max_v_samp(b) (b)+12 ; int max_v_samp_factor +%define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor +%define width_blks(b) (b)+20 ; JDIMENSION width_blocks +%define input_data(b) (b)+24 ; JSAMPARRAY input_data +%define output_data(b) (b)+28 ; JSAMPARRAY output_data - align 16 - global EXTN(jsimd_h2v2_downsample_sse2) + align 16 + global EXTN(jsimd_h2v2_downsample_sse2) EXTN(jsimd_h2v2_downsample_sse2): - push ebp - mov ebp,esp -; push ebx ; unused -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - mov ecx, JDIMENSION [width_blks(ebp)] - shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols) - jz near .return - - mov edx, JDIMENSION [img_width(ebp)] - - ; -- expand_right_edge - - push ecx - shl ecx,1 ; output_cols * 2 - sub ecx,edx - jle short .expand_end - - mov eax, INT [max_v_samp(ebp)] - test eax,eax - jle short .expand_end - - cld - mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - alignx 16,7 + push ebp + mov ebp,esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov ecx, JDIMENSION [width_blks(ebp)] + shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols) + jz near .return + + mov edx, JDIMENSION [img_width(ebp)] + + ; -- expand_right_edge + + push ecx + 
shl ecx,1 ; output_cols * 2 + sub ecx,edx + jle short .expand_end + + mov eax, INT [max_v_samp(ebp)] + test eax,eax + jle short .expand_end + + cld + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + alignx 16,7 .expandloop: - push eax - push ecx + push eax + push ecx - mov edi, JSAMPROW [esi] - add edi,edx - mov al, JSAMPLE [edi-1] + mov edi, JSAMPROW [esi] + add edi,edx + mov al, JSAMPLE [edi-1] - rep stosb + rep stosb - pop ecx - pop eax + pop ecx + pop eax - add esi, byte SIZEOF_JSAMPROW - dec eax - jg short .expandloop + add esi, byte SIZEOF_JSAMPROW + dec eax + jg short .expandloop .expand_end: - pop ecx ; output_cols + pop ecx ; output_cols - ; -- h2v2_downsample + ; -- h2v2_downsample - mov eax, JDIMENSION [v_samp(ebp)] ; rowctr - test eax,eax - jle near .return + mov eax, JDIMENSION [v_samp(ebp)] ; rowctr + test eax,eax + jle near .return - mov edx, 0x00020001 ; bias pattern - movd xmm7,edx - pcmpeqw xmm6,xmm6 - pshufd xmm7,xmm7,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2} - psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} + mov edx, 0x00020001 ; bias pattern + movd xmm7,edx + pcmpeqw xmm6,xmm6 + pshufd xmm7,xmm7,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2} + psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} - mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - mov edi, JSAMPARRAY [output_data(ebp)] ; output_data - alignx 16,7 + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, JSAMPARRAY [output_data(ebp)] ; output_data + alignx 16,7 .rowloop: - push ecx - push edi - push esi + push ecx + push edi + push esi - mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 - mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1 - mov edi, JSAMPROW [edi] ; outptr + mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 + mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1 + mov edi, JSAMPROW [edi] ; outptr - cmp ecx, byte SIZEOF_XMMWORD - jae short .columnloop - alignx 16,7 + cmp ecx, byte SIZEOF_XMMWORD + jae short .columnloop + alignx 16,7 .columnloop_r8: - movdqa xmm0, 
XMMWORD [edx+0*SIZEOF_XMMWORD] - movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] - pxor xmm2,xmm2 - pxor xmm3,xmm3 - mov ecx, SIZEOF_XMMWORD - jmp short .downsample - alignx 16,7 + movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] + pxor xmm2,xmm2 + pxor xmm3,xmm3 + mov ecx, SIZEOF_XMMWORD + jmp short .downsample + alignx 16,7 .columnloop: - movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] - movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] - movdqa xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD] - movdqa xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqa xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD] + movdqa xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD] .downsample: - movdqa xmm4,xmm0 - movdqa xmm5,xmm1 - pand xmm0,xmm6 - psrlw xmm4,BYTE_BIT - pand xmm1,xmm6 - psrlw xmm5,BYTE_BIT - paddw xmm0,xmm4 - paddw xmm1,xmm5 - - movdqa xmm4,xmm2 - movdqa xmm5,xmm3 - pand xmm2,xmm6 - psrlw xmm4,BYTE_BIT - pand xmm3,xmm6 - psrlw xmm5,BYTE_BIT - paddw xmm2,xmm4 - paddw xmm3,xmm5 - - paddw xmm0,xmm1 - paddw xmm2,xmm3 - paddw xmm0,xmm7 - paddw xmm2,xmm7 - psrlw xmm0,2 - psrlw xmm2,2 - - packuswb xmm0,xmm2 - - movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 - - sub ecx, byte SIZEOF_XMMWORD ; outcol - add edx, byte 2*SIZEOF_XMMWORD ; inptr0 - add esi, byte 2*SIZEOF_XMMWORD ; inptr1 - add edi, byte 1*SIZEOF_XMMWORD ; outptr - cmp ecx, byte SIZEOF_XMMWORD - jae near .columnloop - test ecx,ecx - jnz near .columnloop_r8 - - pop esi - pop edi - pop ecx - - add esi, byte 2*SIZEOF_JSAMPROW ; input_data - add edi, byte 1*SIZEOF_JSAMPROW ; output_data - dec eax ; rowctr - jg near .rowloop + movdqa xmm4,xmm0 + movdqa xmm5,xmm1 + pand xmm0,xmm6 + psrlw xmm4,BYTE_BIT + pand xmm1,xmm6 + psrlw xmm5,BYTE_BIT + paddw xmm0,xmm4 + paddw xmm1,xmm5 + + movdqa xmm4,xmm2 + movdqa xmm5,xmm3 + pand xmm2,xmm6 + psrlw xmm4,BYTE_BIT + pand xmm3,xmm6 + psrlw xmm5,BYTE_BIT + paddw xmm2,xmm4 + paddw xmm3,xmm5 + + paddw xmm0,xmm1 + 
paddw xmm2,xmm3 + paddw xmm0,xmm7 + paddw xmm2,xmm7 + psrlw xmm0,2 + psrlw xmm2,2 + + packuswb xmm0,xmm2 + + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 + + sub ecx, byte SIZEOF_XMMWORD ; outcol + add edx, byte 2*SIZEOF_XMMWORD ; inptr0 + add esi, byte 2*SIZEOF_XMMWORD ; inptr1 + add edi, byte 1*SIZEOF_XMMWORD ; outptr + cmp ecx, byte SIZEOF_XMMWORD + jae near .columnloop + test ecx,ecx + jnz near .columnloop_r8 + + pop esi + pop edi + pop ecx + + add esi, byte 2*SIZEOF_JSAMPROW ; input_data + add edi, byte 1*SIZEOF_JSAMPROW ; output_data + dec eax ; rowctr + jg near .rowloop .return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved -; pop ebx ; unused - pop ebp - ret + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/jdclrmmx.asm b/simd/jdclrmmx.asm index 1c255e802..bb1d2fa06 100644 --- a/simd/jdclrmmx.asm +++ b/simd/jdclrmmx.asm @@ -28,378 +28,378 @@ ; JSAMPARRAY output_buf, int num_rows) ; -%define out_width(b) (b)+8 ; JDIMENSION out_width -%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf -%define input_row(b) (b)+16 ; JDIMENSION input_row -%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf -%define num_rows(b) (b)+24 ; int num_rows +%define out_width(b) (b)+8 ; JDIMENSION out_width +%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf +%define input_row(b) (b)+16 ; JDIMENSION input_row +%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf +%define num_rows(b) (b)+24 ; int num_rows -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] -%define WK_NUM 2 -%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 2 +%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr 
- align 16 - global EXTN(jsimd_ycc_rgb_convert_mmx) + align 16 + global EXTN(jsimd_ycc_rgb_convert_mmx) EXTN(jsimd_ycc_rgb_convert_mmx): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic eax ; make a room for GOT address - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address - - mov ecx, JDIMENSION [out_width(eax)] ; num_cols - test ecx,ecx - jz near .return - - push ecx - - mov edi, JSAMPIMAGE [input_buf(eax)] - mov ecx, JDIMENSION [input_row(eax)] - mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] - mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] - mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] - lea esi, [esi+ecx*SIZEOF_JSAMPROW] - lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] - lea edx, [edx+ecx*SIZEOF_JSAMPROW] - - pop ecx - - mov edi, JSAMPARRAY [output_buf(eax)] - mov eax, INT [num_rows(eax)] - test eax,eax - jle near .return - alignx 16,7 + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [out_width(eax)] ; num_cols + test ecx,ecx + jz near .return + + push ecx + + mov edi, JSAMPIMAGE [input_buf(eax)] + mov ecx, JDIMENSION [input_row(eax)] + mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] + lea esi, [esi+ecx*SIZEOF_JSAMPROW] + lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] + lea edx, [edx+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov edi, 
JSAMPARRAY [output_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax,eax + jle near .return + alignx 16,7 .rowloop: - push eax - push edi - push edx - push ebx - push esi - push ecx ; col - - mov esi, JSAMPROW [esi] ; inptr0 - mov ebx, JSAMPROW [ebx] ; inptr1 - mov edx, JSAMPROW [edx] ; inptr2 - mov edi, JSAMPROW [edi] ; outptr - movpic eax, POINTER [gotptr] ; load GOT address (eax) - alignx 16,7 + push eax + push edi + push edx + push ebx + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr0 + mov ebx, JSAMPROW [ebx] ; inptr1 + mov edx, JSAMPROW [edx] ; inptr2 + mov edi, JSAMPROW [edi] ; outptr + movpic eax, POINTER [gotptr] ; load GOT address (eax) + alignx 16,7 .columnloop: - movq mm5, MMWORD [ebx] ; mm5=Cb(01234567) - movq mm1, MMWORD [edx] ; mm1=Cr(01234567) - - pcmpeqw mm4,mm4 - pcmpeqw mm7,mm7 - psrlw mm4,BYTE_BIT - psllw mm7,7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80} - movq mm0,mm4 ; mm0=mm4={0xFF 0x00 0xFF 0x00 ..} - - pand mm4,mm5 ; mm4=Cb(0246)=CbE - psrlw mm5,BYTE_BIT ; mm5=Cb(1357)=CbO - pand mm0,mm1 ; mm0=Cr(0246)=CrE - psrlw mm1,BYTE_BIT ; mm1=Cr(1357)=CrO - - paddw mm4,mm7 - paddw mm5,mm7 - paddw mm0,mm7 - paddw mm1,mm7 - - ; (Original) - ; R = Y + 1.40200 * Cr - ; G = Y - 0.34414 * Cb - 0.71414 * Cr - ; B = Y + 1.77200 * Cb - ; - ; (This implementation) - ; R = Y + 0.40200 * Cr + Cr - ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr - ; B = Y - 0.22800 * Cb + Cb + Cb - - movq mm2,mm4 ; mm2=CbE - movq mm3,mm5 ; mm3=CbO - paddw mm4,mm4 ; mm4=2*CbE - paddw mm5,mm5 ; mm5=2*CbO - movq mm6,mm0 ; mm6=CrE - movq mm7,mm1 ; mm7=CrO - paddw mm0,mm0 ; mm0=2*CrE - paddw mm1,mm1 ; mm1=2*CrO - - pmulhw mm4,[GOTOFF(eax,PW_MF0228)] ; mm4=(2*CbE * -FIX(0.22800)) - pmulhw mm5,[GOTOFF(eax,PW_MF0228)] ; mm5=(2*CbO * -FIX(0.22800)) - pmulhw mm0,[GOTOFF(eax,PW_F0402)] ; mm0=(2*CrE * FIX(0.40200)) - pmulhw mm1,[GOTOFF(eax,PW_F0402)] ; mm1=(2*CrO * FIX(0.40200)) - - paddw mm4,[GOTOFF(eax,PW_ONE)] - paddw mm5,[GOTOFF(eax,PW_ONE)] - psraw mm4,1 ; mm4=(CbE * -FIX(0.22800)) 
- psraw mm5,1 ; mm5=(CbO * -FIX(0.22800)) - paddw mm0,[GOTOFF(eax,PW_ONE)] - paddw mm1,[GOTOFF(eax,PW_ONE)] - psraw mm0,1 ; mm0=(CrE * FIX(0.40200)) - psraw mm1,1 ; mm1=(CrO * FIX(0.40200)) - - paddw mm4,mm2 - paddw mm5,mm3 - paddw mm4,mm2 ; mm4=(CbE * FIX(1.77200))=(B-Y)E - paddw mm5,mm3 ; mm5=(CbO * FIX(1.77200))=(B-Y)O - paddw mm0,mm6 ; mm0=(CrE * FIX(1.40200))=(R-Y)E - paddw mm1,mm7 ; mm1=(CrO * FIX(1.40200))=(R-Y)O - - movq MMWORD [wk(0)], mm4 ; wk(0)=(B-Y)E - movq MMWORD [wk(1)], mm5 ; wk(1)=(B-Y)O - - movq mm4,mm2 - movq mm5,mm3 - punpcklwd mm2,mm6 - punpckhwd mm4,mm6 - pmaddwd mm2,[GOTOFF(eax,PW_MF0344_F0285)] - pmaddwd mm4,[GOTOFF(eax,PW_MF0344_F0285)] - punpcklwd mm3,mm7 - punpckhwd mm5,mm7 - pmaddwd mm3,[GOTOFF(eax,PW_MF0344_F0285)] - pmaddwd mm5,[GOTOFF(eax,PW_MF0344_F0285)] - - paddd mm2,[GOTOFF(eax,PD_ONEHALF)] - paddd mm4,[GOTOFF(eax,PD_ONEHALF)] - psrad mm2,SCALEBITS - psrad mm4,SCALEBITS - paddd mm3,[GOTOFF(eax,PD_ONEHALF)] - paddd mm5,[GOTOFF(eax,PD_ONEHALF)] - psrad mm3,SCALEBITS - psrad mm5,SCALEBITS - - packssdw mm2,mm4 ; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285) - packssdw mm3,mm5 ; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285) - psubw mm2,mm6 ; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E - psubw mm3,mm7 ; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O - - movq mm5, MMWORD [esi] ; mm5=Y(01234567) - - pcmpeqw mm4,mm4 - psrlw mm4,BYTE_BIT ; mm4={0xFF 0x00 0xFF 0x00 ..} - pand mm4,mm5 ; mm4=Y(0246)=YE - psrlw mm5,BYTE_BIT ; mm5=Y(1357)=YO - - paddw mm0,mm4 ; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6) - paddw mm1,mm5 ; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7) - packuswb mm0,mm0 ; mm0=(R0 R2 R4 R6 ** ** ** **) - packuswb mm1,mm1 ; mm1=(R1 R3 R5 R7 ** ** ** **) - - paddw mm2,mm4 ; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6) - paddw mm3,mm5 ; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7) - packuswb mm2,mm2 ; mm2=(G0 G2 G4 G6 ** ** ** **) - packuswb mm3,mm3 ; mm3=(G1 G3 G5 G7 ** ** ** **) - - paddw mm4, MMWORD [wk(0)] ; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6) - paddw mm5, MMWORD [wk(1)] ; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7) 
- packuswb mm4,mm4 ; mm4=(B0 B2 B4 B6 ** ** ** **) - packuswb mm5,mm5 ; mm5=(B1 B3 B5 B7 ** ** ** **) + movq mm5, MMWORD [ebx] ; mm5=Cb(01234567) + movq mm1, MMWORD [edx] ; mm1=Cr(01234567) + + pcmpeqw mm4,mm4 + pcmpeqw mm7,mm7 + psrlw mm4,BYTE_BIT + psllw mm7,7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80} + movq mm0,mm4 ; mm0=mm4={0xFF 0x00 0xFF 0x00 ..} + + pand mm4,mm5 ; mm4=Cb(0246)=CbE + psrlw mm5,BYTE_BIT ; mm5=Cb(1357)=CbO + pand mm0,mm1 ; mm0=Cr(0246)=CrE + psrlw mm1,BYTE_BIT ; mm1=Cr(1357)=CrO + + paddw mm4,mm7 + paddw mm5,mm7 + paddw mm0,mm7 + paddw mm1,mm7 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + movq mm2,mm4 ; mm2=CbE + movq mm3,mm5 ; mm3=CbO + paddw mm4,mm4 ; mm4=2*CbE + paddw mm5,mm5 ; mm5=2*CbO + movq mm6,mm0 ; mm6=CrE + movq mm7,mm1 ; mm7=CrO + paddw mm0,mm0 ; mm0=2*CrE + paddw mm1,mm1 ; mm1=2*CrO + + pmulhw mm4,[GOTOFF(eax,PW_MF0228)] ; mm4=(2*CbE * -FIX(0.22800)) + pmulhw mm5,[GOTOFF(eax,PW_MF0228)] ; mm5=(2*CbO * -FIX(0.22800)) + pmulhw mm0,[GOTOFF(eax,PW_F0402)] ; mm0=(2*CrE * FIX(0.40200)) + pmulhw mm1,[GOTOFF(eax,PW_F0402)] ; mm1=(2*CrO * FIX(0.40200)) + + paddw mm4,[GOTOFF(eax,PW_ONE)] + paddw mm5,[GOTOFF(eax,PW_ONE)] + psraw mm4,1 ; mm4=(CbE * -FIX(0.22800)) + psraw mm5,1 ; mm5=(CbO * -FIX(0.22800)) + paddw mm0,[GOTOFF(eax,PW_ONE)] + paddw mm1,[GOTOFF(eax,PW_ONE)] + psraw mm0,1 ; mm0=(CrE * FIX(0.40200)) + psraw mm1,1 ; mm1=(CrO * FIX(0.40200)) + + paddw mm4,mm2 + paddw mm5,mm3 + paddw mm4,mm2 ; mm4=(CbE * FIX(1.77200))=(B-Y)E + paddw mm5,mm3 ; mm5=(CbO * FIX(1.77200))=(B-Y)O + paddw mm0,mm6 ; mm0=(CrE * FIX(1.40200))=(R-Y)E + paddw mm1,mm7 ; mm1=(CrO * FIX(1.40200))=(R-Y)O + + movq MMWORD [wk(0)], mm4 ; wk(0)=(B-Y)E + movq MMWORD [wk(1)], mm5 ; wk(1)=(B-Y)O + + movq mm4,mm2 + movq mm5,mm3 + punpcklwd mm2,mm6 + punpckhwd mm4,mm6 + pmaddwd 
mm2,[GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd mm4,[GOTOFF(eax,PW_MF0344_F0285)] + punpcklwd mm3,mm7 + punpckhwd mm5,mm7 + pmaddwd mm3,[GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd mm5,[GOTOFF(eax,PW_MF0344_F0285)] + + paddd mm2,[GOTOFF(eax,PD_ONEHALF)] + paddd mm4,[GOTOFF(eax,PD_ONEHALF)] + psrad mm2,SCALEBITS + psrad mm4,SCALEBITS + paddd mm3,[GOTOFF(eax,PD_ONEHALF)] + paddd mm5,[GOTOFF(eax,PD_ONEHALF)] + psrad mm3,SCALEBITS + psrad mm5,SCALEBITS + + packssdw mm2,mm4 ; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285) + packssdw mm3,mm5 ; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285) + psubw mm2,mm6 ; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E + psubw mm3,mm7 ; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O + + movq mm5, MMWORD [esi] ; mm5=Y(01234567) + + pcmpeqw mm4,mm4 + psrlw mm4,BYTE_BIT ; mm4={0xFF 0x00 0xFF 0x00 ..} + pand mm4,mm5 ; mm4=Y(0246)=YE + psrlw mm5,BYTE_BIT ; mm5=Y(1357)=YO + + paddw mm0,mm4 ; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6) + paddw mm1,mm5 ; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7) + packuswb mm0,mm0 ; mm0=(R0 R2 R4 R6 ** ** ** **) + packuswb mm1,mm1 ; mm1=(R1 R3 R5 R7 ** ** ** **) + + paddw mm2,mm4 ; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6) + paddw mm3,mm5 ; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7) + packuswb mm2,mm2 ; mm2=(G0 G2 G4 G6 ** ** ** **) + packuswb mm3,mm3 ; mm3=(G1 G3 G5 G7 ** ** ** **) + + paddw mm4, MMWORD [wk(0)] ; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6) + paddw mm5, MMWORD [wk(1)] ; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7) + packuswb mm4,mm4 ; mm4=(B0 B2 B4 B6 ** ** ** **) + packuswb mm5,mm5 ; mm5=(B1 B3 B5 B7 ** ** ** **) %if RGB_PIXELSIZE == 3 ; --------------- - ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) - ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) - ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) - ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **) + ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) + ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) + ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** 
** **) + ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **) - punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16) - punpcklbw mmE,mmB ; mmE=(20 01 22 03 24 05 26 07) - punpcklbw mmD,mmF ; mmD=(11 21 13 23 15 25 17 27) + punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16) + punpcklbw mmE,mmB ; mmE=(20 01 22 03 24 05 26 07) + punpcklbw mmD,mmF ; mmD=(11 21 13 23 15 25 17 27) - movq mmG,mmA - movq mmH,mmA - punpcklwd mmA,mmE ; mmA=(00 10 20 01 02 12 22 03) - punpckhwd mmG,mmE ; mmG=(04 14 24 05 06 16 26 07) + movq mmG,mmA + movq mmH,mmA + punpcklwd mmA,mmE ; mmA=(00 10 20 01 02 12 22 03) + punpckhwd mmG,mmE ; mmG=(04 14 24 05 06 16 26 07) - psrlq mmH,2*BYTE_BIT ; mmH=(02 12 04 14 06 16 -- --) - psrlq mmE,2*BYTE_BIT ; mmE=(22 03 24 05 26 07 -- --) + psrlq mmH,2*BYTE_BIT ; mmH=(02 12 04 14 06 16 -- --) + psrlq mmE,2*BYTE_BIT ; mmE=(22 03 24 05 26 07 -- --) - movq mmC,mmD - movq mmB,mmD - punpcklwd mmD,mmH ; mmD=(11 21 02 12 13 23 04 14) - punpckhwd mmC,mmH ; mmC=(15 25 06 16 17 27 -- --) + movq mmC,mmD + movq mmB,mmD + punpcklwd mmD,mmH ; mmD=(11 21 02 12 13 23 04 14) + punpckhwd mmC,mmH ; mmC=(15 25 06 16 17 27 -- --) - psrlq mmB,2*BYTE_BIT ; mmB=(13 23 15 25 17 27 -- --) + psrlq mmB,2*BYTE_BIT ; mmB=(13 23 15 25 17 27 -- --) - movq mmF,mmE - punpcklwd mmE,mmB ; mmE=(22 03 13 23 24 05 15 25) - punpckhwd mmF,mmB ; mmF=(26 07 17 27 -- -- -- --) + movq mmF,mmE + punpcklwd mmE,mmB ; mmE=(22 03 13 23 24 05 15 25) + punpckhwd mmF,mmB ; mmF=(26 07 17 27 -- -- -- --) - punpckldq mmA,mmD ; mmA=(00 10 20 01 11 21 02 12) - punpckldq mmE,mmG ; mmE=(22 03 13 23 04 14 24 05) - punpckldq mmC,mmF ; mmC=(15 25 06 16 26 07 17 27) + punpckldq mmA,mmD ; mmA=(00 10 20 01 11 21 02 12) + punpckldq mmE,mmG ; mmE=(22 03 13 23 04 14 24 05) + punpckldq mmC,mmF ; mmC=(15 25 06 16 26 07 17 27) - cmp ecx, byte SIZEOF_MMWORD - jb short .column_st16 + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st16 - movq MMWORD [edi+0*SIZEOF_MMWORD], mmA - movq MMWORD [edi+1*SIZEOF_MMWORD], mmE - movq 
MMWORD [edi+2*SIZEOF_MMWORD], mmC + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq MMWORD [edi+1*SIZEOF_MMWORD], mmE + movq MMWORD [edi+2*SIZEOF_MMWORD], mmC - sub ecx, byte SIZEOF_MMWORD - jz short .nextrow + sub ecx, byte SIZEOF_MMWORD + jz short .nextrow - add esi, byte SIZEOF_MMWORD ; inptr0 - add ebx, byte SIZEOF_MMWORD ; inptr1 - add edx, byte SIZEOF_MMWORD ; inptr2 - add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr - jmp near .columnloop - alignx 16,7 + add esi, byte SIZEOF_MMWORD ; inptr0 + add ebx, byte SIZEOF_MMWORD ; inptr1 + add edx, byte SIZEOF_MMWORD ; inptr2 + add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr + jmp near .columnloop + alignx 16,7 .column_st16: - lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE - cmp ecx, byte 2*SIZEOF_MMWORD - jb short .column_st8 - movq MMWORD [edi+0*SIZEOF_MMWORD], mmA - movq MMWORD [edi+1*SIZEOF_MMWORD], mmE - movq mmA,mmC - sub ecx, byte 2*SIZEOF_MMWORD - add edi, byte 2*SIZEOF_MMWORD - jmp short .column_st4 + lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE + cmp ecx, byte 2*SIZEOF_MMWORD + jb short .column_st8 + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq MMWORD [edi+1*SIZEOF_MMWORD], mmE + movq mmA,mmC + sub ecx, byte 2*SIZEOF_MMWORD + add edi, byte 2*SIZEOF_MMWORD + jmp short .column_st4 .column_st8: - cmp ecx, byte SIZEOF_MMWORD - jb short .column_st4 - movq MMWORD [edi+0*SIZEOF_MMWORD], mmA - movq mmA,mmE - sub ecx, byte SIZEOF_MMWORD - add edi, byte SIZEOF_MMWORD + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st4 + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq mmA,mmE + sub ecx, byte SIZEOF_MMWORD + add edi, byte SIZEOF_MMWORD .column_st4: - movd eax,mmA - cmp ecx, byte SIZEOF_DWORD - jb short .column_st2 - mov DWORD [edi+0*SIZEOF_DWORD], eax - psrlq mmA,DWORD_BIT - movd eax,mmA - sub ecx, byte SIZEOF_DWORD - add edi, byte SIZEOF_DWORD + movd eax,mmA + cmp ecx, byte SIZEOF_DWORD + jb short .column_st2 + mov DWORD [edi+0*SIZEOF_DWORD], eax + psrlq mmA,DWORD_BIT + movd eax,mmA + sub ecx, byte SIZEOF_DWORD + 
add edi, byte SIZEOF_DWORD .column_st2: - cmp ecx, byte SIZEOF_WORD - jb short .column_st1 - mov WORD [edi+0*SIZEOF_WORD], ax - shr eax,WORD_BIT - sub ecx, byte SIZEOF_WORD - add edi, byte SIZEOF_WORD + cmp ecx, byte SIZEOF_WORD + jb short .column_st1 + mov WORD [edi+0*SIZEOF_WORD], ax + shr eax,WORD_BIT + sub ecx, byte SIZEOF_WORD + add edi, byte SIZEOF_WORD .column_st1: - cmp ecx, byte SIZEOF_BYTE - jb short .nextrow - mov BYTE [edi+0*SIZEOF_BYTE], al + cmp ecx, byte SIZEOF_BYTE + jb short .nextrow + mov BYTE [edi+0*SIZEOF_BYTE], al %else ; RGB_PIXELSIZE == 4 ; ----------- %ifdef RGBX_FILLER_0XFF - pcmpeqb mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **) - pcmpeqb mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **) + pcmpeqb mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **) + pcmpeqb mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **) %else - pxor mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **) - pxor mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **) + pxor mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **) + pxor mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **) %endif - ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) - ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) - ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) - ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **) - - punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16) - punpcklbw mmE,mmG ; mmE=(20 30 22 32 24 34 26 36) - punpcklbw mmB,mmD ; mmB=(01 11 03 13 05 15 07 17) - punpcklbw mmF,mmH ; mmF=(21 31 23 33 25 35 27 37) - - movq mmC,mmA - punpcklwd mmA,mmE ; mmA=(00 10 20 30 02 12 22 32) - punpckhwd mmC,mmE ; mmC=(04 14 24 34 06 16 26 36) - movq mmG,mmB - punpcklwd mmB,mmF ; mmB=(01 11 21 31 03 13 23 33) - punpckhwd mmG,mmF ; mmG=(05 15 25 35 07 17 27 37) - - movq mmD,mmA - punpckldq mmA,mmB ; mmA=(00 10 20 30 01 11 21 31) - punpckhdq mmD,mmB ; mmD=(02 12 22 32 03 13 23 33) - movq mmH,mmC - punpckldq mmC,mmG ; mmC=(04 14 24 34 05 15 25 35) - punpckhdq mmH,mmG ; mmH=(06 16 26 36 07 17 27 37) - - cmp ecx, byte SIZEOF_MMWORD - jb short 
.column_st16 - - movq MMWORD [edi+0*SIZEOF_MMWORD], mmA - movq MMWORD [edi+1*SIZEOF_MMWORD], mmD - movq MMWORD [edi+2*SIZEOF_MMWORD], mmC - movq MMWORD [edi+3*SIZEOF_MMWORD], mmH - - sub ecx, byte SIZEOF_MMWORD - jz short .nextrow - - add esi, byte SIZEOF_MMWORD ; inptr0 - add ebx, byte SIZEOF_MMWORD ; inptr1 - add edx, byte SIZEOF_MMWORD ; inptr2 - add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr - jmp near .columnloop - alignx 16,7 + ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) + ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) + ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) + ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **) + + punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16) + punpcklbw mmE,mmG ; mmE=(20 30 22 32 24 34 26 36) + punpcklbw mmB,mmD ; mmB=(01 11 03 13 05 15 07 17) + punpcklbw mmF,mmH ; mmF=(21 31 23 33 25 35 27 37) + + movq mmC,mmA + punpcklwd mmA,mmE ; mmA=(00 10 20 30 02 12 22 32) + punpckhwd mmC,mmE ; mmC=(04 14 24 34 06 16 26 36) + movq mmG,mmB + punpcklwd mmB,mmF ; mmB=(01 11 21 31 03 13 23 33) + punpckhwd mmG,mmF ; mmG=(05 15 25 35 07 17 27 37) + + movq mmD,mmA + punpckldq mmA,mmB ; mmA=(00 10 20 30 01 11 21 31) + punpckhdq mmD,mmB ; mmD=(02 12 22 32 03 13 23 33) + movq mmH,mmC + punpckldq mmC,mmG ; mmC=(04 14 24 34 05 15 25 35) + punpckhdq mmH,mmG ; mmH=(06 16 26 36 07 17 27 37) + + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st16 + + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq MMWORD [edi+1*SIZEOF_MMWORD], mmD + movq MMWORD [edi+2*SIZEOF_MMWORD], mmC + movq MMWORD [edi+3*SIZEOF_MMWORD], mmH + + sub ecx, byte SIZEOF_MMWORD + jz short .nextrow + + add esi, byte SIZEOF_MMWORD ; inptr0 + add ebx, byte SIZEOF_MMWORD ; inptr1 + add edx, byte SIZEOF_MMWORD ; inptr2 + add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr + jmp near .columnloop + alignx 16,7 .column_st16: - cmp ecx, byte SIZEOF_MMWORD/2 - jb short .column_st8 - movq MMWORD [edi+0*SIZEOF_MMWORD], mmA - movq MMWORD 
[edi+1*SIZEOF_MMWORD], mmD - movq mmA,mmC - movq mmD,mmH - sub ecx, byte SIZEOF_MMWORD/2 - add edi, byte 2*SIZEOF_MMWORD + cmp ecx, byte SIZEOF_MMWORD/2 + jb short .column_st8 + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq MMWORD [edi+1*SIZEOF_MMWORD], mmD + movq mmA,mmC + movq mmD,mmH + sub ecx, byte SIZEOF_MMWORD/2 + add edi, byte 2*SIZEOF_MMWORD .column_st8: - cmp ecx, byte SIZEOF_MMWORD/4 - jb short .column_st4 - movq MMWORD [edi+0*SIZEOF_MMWORD], mmA - movq mmA,mmD - sub ecx, byte SIZEOF_MMWORD/4 - add edi, byte 1*SIZEOF_MMWORD + cmp ecx, byte SIZEOF_MMWORD/4 + jb short .column_st4 + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq mmA,mmD + sub ecx, byte SIZEOF_MMWORD/4 + add edi, byte 1*SIZEOF_MMWORD .column_st4: - cmp ecx, byte SIZEOF_MMWORD/8 - jb short .nextrow - movd DWORD [edi+0*SIZEOF_DWORD], mmA + cmp ecx, byte SIZEOF_MMWORD/8 + jb short .nextrow + movd DWORD [edi+0*SIZEOF_DWORD], mmA %endif ; RGB_PIXELSIZE ; --------------- - alignx 16,7 + alignx 16,7 .nextrow: - pop ecx - pop esi - pop ebx - pop edx - pop edi - pop eax - - add esi, byte SIZEOF_JSAMPROW - add ebx, byte SIZEOF_JSAMPROW - add edx, byte SIZEOF_JSAMPROW - add edi, byte SIZEOF_JSAMPROW ; output_buf - dec eax ; num_rows - jg near .rowloop - - emms ; empty MMX state + pop ecx + pop esi + pop ebx + pop edx + pop edi + pop eax + + add esi, byte SIZEOF_JSAMPROW + add ebx, byte SIZEOF_JSAMPROW + add edx, byte SIZEOF_JSAMPROW + add edi, byte SIZEOF_JSAMPROW ; output_buf + dec eax ; num_rows + jg near .rowloop + + emms ; empty MMX state .return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we 
do this. - align 16 + align 16 diff --git a/simd/jdclrss2-64.asm b/simd/jdclrss2-64.asm index 7d17c52b8..88a9f1e4a 100644 --- a/simd/jdclrss2-64.asm +++ b/simd/jdclrss2-64.asm @@ -18,7 +18,7 @@ ; [TAB8] %include "jcolsamp.inc" - + ; -------------------------------------------------------------------------- ; ; Convert some rows of samples to the output colorspace. @@ -35,407 +35,407 @@ ; r13 = JSAMPARRAY output_buf ; r14 = int num_rows -%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 2 +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 - align 16 - global EXTN(jsimd_ycc_rgb_convert_sse2) + align 16 + global EXTN(jsimd_ycc_rgb_convert_sse2) EXTN(jsimd_ycc_rgb_convert_sse2): - push rbp - mov rax,rsp ; rax = original rbp - sub rsp, byte 4 - and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp],rax - mov rbp,rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - collect_args - push rbx - - mov rcx, r10 ; num_cols - test rcx,rcx - jz near .return - - push rcx - - mov rdi, r11 - mov rcx, r12 - mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] - mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] - mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] - lea rsi, [rsi+rcx*SIZEOF_JSAMPROW] - lea rbx, [rbx+rcx*SIZEOF_JSAMPROW] - lea rdx, [rdx+rcx*SIZEOF_JSAMPROW] - - pop rcx - - mov rdi, r13 - mov eax, r14d - test rax,rax - jle near .return + push rbp + mov rax,rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args + push rbx + + mov rcx, r10 ; num_cols + test rcx,rcx + jz near .return + + push rcx + + mov rdi, r11 + mov rcx, r12 + mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] + mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] + mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] + lea rsi, [rsi+rcx*SIZEOF_JSAMPROW] + lea rbx, [rbx+rcx*SIZEOF_JSAMPROW] + lea rdx, [rdx+rcx*SIZEOF_JSAMPROW] + + pop rcx + + mov 
rdi, r13 + mov eax, r14d + test rax,rax + jle near .return .rowloop: - push rax - push rdi - push rdx - push rbx - push rsi - push rcx ; col - - mov rsi, JSAMPROW [rsi] ; inptr0 - mov rbx, JSAMPROW [rbx] ; inptr1 - mov rdx, JSAMPROW [rdx] ; inptr2 - mov rdi, JSAMPROW [rdi] ; outptr + push rax + push rdi + push rdx + push rbx + push rsi + push rcx ; col + + mov rsi, JSAMPROW [rsi] ; inptr0 + mov rbx, JSAMPROW [rbx] ; inptr1 + mov rdx, JSAMPROW [rdx] ; inptr2 + mov rdi, JSAMPROW [rdi] ; outptr .columnloop: - movdqa xmm5, XMMWORD [rbx] ; xmm5=Cb(0123456789ABCDEF) - movdqa xmm1, XMMWORD [rdx] ; xmm1=Cr(0123456789ABCDEF) - - pcmpeqw xmm4,xmm4 - pcmpeqw xmm7,xmm7 - psrlw xmm4,BYTE_BIT - psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} - movdqa xmm0,xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..} - - pand xmm4,xmm5 ; xmm4=Cb(02468ACE)=CbE - psrlw xmm5,BYTE_BIT ; xmm5=Cb(13579BDF)=CbO - pand xmm0,xmm1 ; xmm0=Cr(02468ACE)=CrE - psrlw xmm1,BYTE_BIT ; xmm1=Cr(13579BDF)=CrO - - paddw xmm4,xmm7 - paddw xmm5,xmm7 - paddw xmm0,xmm7 - paddw xmm1,xmm7 - - ; (Original) - ; R = Y + 1.40200 * Cr - ; G = Y - 0.34414 * Cb - 0.71414 * Cr - ; B = Y + 1.77200 * Cb - ; - ; (This implementation) - ; R = Y + 0.40200 * Cr + Cr - ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr - ; B = Y - 0.22800 * Cb + Cb + Cb - - movdqa xmm2,xmm4 ; xmm2=CbE - movdqa xmm3,xmm5 ; xmm3=CbO - paddw xmm4,xmm4 ; xmm4=2*CbE - paddw xmm5,xmm5 ; xmm5=2*CbO - movdqa xmm6,xmm0 ; xmm6=CrE - movdqa xmm7,xmm1 ; xmm7=CrO - paddw xmm0,xmm0 ; xmm0=2*CrE - paddw xmm1,xmm1 ; xmm1=2*CrO - - pmulhw xmm4,[rel PW_MF0228] ; xmm4=(2*CbE * -FIX(0.22800)) - pmulhw xmm5,[rel PW_MF0228] ; xmm5=(2*CbO * -FIX(0.22800)) - pmulhw xmm0,[rel PW_F0402] ; xmm0=(2*CrE * FIX(0.40200)) - pmulhw xmm1,[rel PW_F0402] ; xmm1=(2*CrO * FIX(0.40200)) - - paddw xmm4,[rel PW_ONE] - paddw xmm5,[rel PW_ONE] - psraw xmm4,1 ; xmm4=(CbE * -FIX(0.22800)) - psraw xmm5,1 ; xmm5=(CbO * -FIX(0.22800)) - paddw xmm0,[rel PW_ONE] - paddw xmm1,[rel PW_ONE] - psraw xmm0,1 ; 
xmm0=(CrE * FIX(0.40200)) - psraw xmm1,1 ; xmm1=(CrO * FIX(0.40200)) - - paddw xmm4,xmm2 - paddw xmm5,xmm3 - paddw xmm4,xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E - paddw xmm5,xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O - paddw xmm0,xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E - paddw xmm1,xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O - - movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E - movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O - - movdqa xmm4,xmm2 - movdqa xmm5,xmm3 - punpcklwd xmm2,xmm6 - punpckhwd xmm4,xmm6 - pmaddwd xmm2,[rel PW_MF0344_F0285] - pmaddwd xmm4,[rel PW_MF0344_F0285] - punpcklwd xmm3,xmm7 - punpckhwd xmm5,xmm7 - pmaddwd xmm3,[rel PW_MF0344_F0285] - pmaddwd xmm5,[rel PW_MF0344_F0285] - - paddd xmm2,[rel PD_ONEHALF] - paddd xmm4,[rel PD_ONEHALF] - psrad xmm2,SCALEBITS - psrad xmm4,SCALEBITS - paddd xmm3,[rel PD_ONEHALF] - paddd xmm5,[rel PD_ONEHALF] - psrad xmm3,SCALEBITS - psrad xmm5,SCALEBITS - - packssdw xmm2,xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285) - packssdw xmm3,xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285) - psubw xmm2,xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E - psubw xmm3,xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O - - movdqa xmm5, XMMWORD [rsi] ; xmm5=Y(0123456789ABCDEF) - - pcmpeqw xmm4,xmm4 - psrlw xmm4,BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..} - pand xmm4,xmm5 ; xmm4=Y(02468ACE)=YE - psrlw xmm5,BYTE_BIT ; xmm5=Y(13579BDF)=YO - - paddw xmm0,xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE) - paddw xmm1,xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF) - packuswb xmm0,xmm0 ; xmm0=R(02468ACE********) - packuswb xmm1,xmm1 ; xmm1=R(13579BDF********) - - paddw xmm2,xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE) - paddw xmm3,xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF) - packuswb xmm2,xmm2 ; xmm2=G(02468ACE********) - packuswb xmm3,xmm3 ; xmm3=G(13579BDF********) - - paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE) - paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF) - packuswb xmm4,xmm4 ; xmm4=B(02468ACE********) - packuswb xmm5,xmm5 ; xmm5=B(13579BDF********) + 
movdqa xmm5, XMMWORD [rbx] ; xmm5=Cb(0123456789ABCDEF) + movdqa xmm1, XMMWORD [rdx] ; xmm1=Cr(0123456789ABCDEF) + + pcmpeqw xmm4,xmm4 + pcmpeqw xmm7,xmm7 + psrlw xmm4,BYTE_BIT + psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} + movdqa xmm0,xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..} + + pand xmm4,xmm5 ; xmm4=Cb(02468ACE)=CbE + psrlw xmm5,BYTE_BIT ; xmm5=Cb(13579BDF)=CbO + pand xmm0,xmm1 ; xmm0=Cr(02468ACE)=CrE + psrlw xmm1,BYTE_BIT ; xmm1=Cr(13579BDF)=CrO + + paddw xmm4,xmm7 + paddw xmm5,xmm7 + paddw xmm0,xmm7 + paddw xmm1,xmm7 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + movdqa xmm2,xmm4 ; xmm2=CbE + movdqa xmm3,xmm5 ; xmm3=CbO + paddw xmm4,xmm4 ; xmm4=2*CbE + paddw xmm5,xmm5 ; xmm5=2*CbO + movdqa xmm6,xmm0 ; xmm6=CrE + movdqa xmm7,xmm1 ; xmm7=CrO + paddw xmm0,xmm0 ; xmm0=2*CrE + paddw xmm1,xmm1 ; xmm1=2*CrO + + pmulhw xmm4,[rel PW_MF0228] ; xmm4=(2*CbE * -FIX(0.22800)) + pmulhw xmm5,[rel PW_MF0228] ; xmm5=(2*CbO * -FIX(0.22800)) + pmulhw xmm0,[rel PW_F0402] ; xmm0=(2*CrE * FIX(0.40200)) + pmulhw xmm1,[rel PW_F0402] ; xmm1=(2*CrO * FIX(0.40200)) + + paddw xmm4,[rel PW_ONE] + paddw xmm5,[rel PW_ONE] + psraw xmm4,1 ; xmm4=(CbE * -FIX(0.22800)) + psraw xmm5,1 ; xmm5=(CbO * -FIX(0.22800)) + paddw xmm0,[rel PW_ONE] + paddw xmm1,[rel PW_ONE] + psraw xmm0,1 ; xmm0=(CrE * FIX(0.40200)) + psraw xmm1,1 ; xmm1=(CrO * FIX(0.40200)) + + paddw xmm4,xmm2 + paddw xmm5,xmm3 + paddw xmm4,xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E + paddw xmm5,xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O + paddw xmm0,xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E + paddw xmm1,xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O + + movdqa xmm4,xmm2 + movdqa xmm5,xmm3 + punpcklwd xmm2,xmm6 + punpckhwd xmm4,xmm6 + pmaddwd xmm2,[rel 
PW_MF0344_F0285] + pmaddwd xmm4,[rel PW_MF0344_F0285] + punpcklwd xmm3,xmm7 + punpckhwd xmm5,xmm7 + pmaddwd xmm3,[rel PW_MF0344_F0285] + pmaddwd xmm5,[rel PW_MF0344_F0285] + + paddd xmm2,[rel PD_ONEHALF] + paddd xmm4,[rel PD_ONEHALF] + psrad xmm2,SCALEBITS + psrad xmm4,SCALEBITS + paddd xmm3,[rel PD_ONEHALF] + paddd xmm5,[rel PD_ONEHALF] + psrad xmm3,SCALEBITS + psrad xmm5,SCALEBITS + + packssdw xmm2,xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285) + packssdw xmm3,xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285) + psubw xmm2,xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E + psubw xmm3,xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O + + movdqa xmm5, XMMWORD [rsi] ; xmm5=Y(0123456789ABCDEF) + + pcmpeqw xmm4,xmm4 + psrlw xmm4,BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..} + pand xmm4,xmm5 ; xmm4=Y(02468ACE)=YE + psrlw xmm5,BYTE_BIT ; xmm5=Y(13579BDF)=YO + + paddw xmm0,xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE) + paddw xmm1,xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF) + packuswb xmm0,xmm0 ; xmm0=R(02468ACE********) + packuswb xmm1,xmm1 ; xmm1=R(13579BDF********) + + paddw xmm2,xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE) + paddw xmm3,xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF) + packuswb xmm2,xmm2 ; xmm2=G(02468ACE********) + packuswb xmm3,xmm3 ; xmm3=G(13579BDF********) + + paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE) + paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF) + packuswb xmm4,xmm4 ; xmm4=B(02468ACE********) + packuswb xmm5,xmm5 ; xmm5=B(13579BDF********) %if RGB_PIXELSIZE == 3 ; --------------- - ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) - ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) - ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) - ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) - - punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) - punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) - punpcklbw xmmD,xmmF ; 
xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) - - movdqa xmmG,xmmA - movdqa xmmH,xmmA - punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) - punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) - - psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) - psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) - - movdqa xmmC,xmmD - movdqa xmmB,xmmD - punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) - punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) - - psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) - - movdqa xmmF,xmmE - punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) - punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) - - pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) - movdqa xmmB,xmmE - punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) - punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) - punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) - - pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) - movdqa xmmB,xmmF - punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) - punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) - punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) - - punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) - punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) - punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) - - cmp rcx, byte SIZEOF_XMMWORD - jb short .column_st32 - - test rdi, SIZEOF_XMMWORD-1 - jnz short .out1 - ; --(aligned)------------------- - movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA - movntdq XMMWORD 
[rdi+1*SIZEOF_XMMWORD], xmmD - movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF - jmp short .out0 -.out1: ; --(unaligned)----------------- - movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD - movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) + + punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) + punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) + + movdqa xmmG,xmmA + movdqa xmmH,xmmA + punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) + punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) + + psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) + psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) + + movdqa xmmC,xmmD + movdqa xmmB,xmmD + punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) + punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) + + psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) + + movdqa xmmF,xmmE + punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) + punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) + + pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) + movdqa xmmB,xmmE + punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) + punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) + punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) + + pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 
18 28 09 0A 1A 2A 0B) + movdqa xmmB,xmmF + punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) + punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) + punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) + + punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + cmp rcx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test rdi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF + jmp short .out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF .out0: - add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr - sub rcx, byte SIZEOF_XMMWORD - jz near .nextrow + add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub rcx, byte SIZEOF_XMMWORD + jz near .nextrow - add rsi, byte SIZEOF_XMMWORD ; inptr0 - add rbx, byte SIZEOF_XMMWORD ; inptr1 - add rdx, byte SIZEOF_XMMWORD ; inptr2 - jmp near .columnloop + add rsi, byte SIZEOF_XMMWORD ; inptr0 + add rbx, byte SIZEOF_XMMWORD ; inptr1 + add rdx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop .column_st32: - lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE - cmp rcx, byte 2*SIZEOF_XMMWORD - jb short .column_st16 - movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD - add rdi, byte 2*SIZEOF_XMMWORD ; outptr - movdqa xmmA,xmmF - sub rcx, byte 2*SIZEOF_XMMWORD - jmp short .column_st15 + lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE + cmp rcx, byte 2*SIZEOF_XMMWORD + jb short .column_st16 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD 
[rdi+1*SIZEOF_XMMWORD], xmmD + add rdi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmF + sub rcx, byte 2*SIZEOF_XMMWORD + jmp short .column_st15 .column_st16: - cmp rcx, byte SIZEOF_XMMWORD - jb short .column_st15 - movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA - add rdi, byte SIZEOF_XMMWORD ; outptr - movdqa xmmA,xmmD - sub rcx, byte SIZEOF_XMMWORD + cmp rcx, byte SIZEOF_XMMWORD + jb short .column_st15 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + add rdi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmD + sub rcx, byte SIZEOF_XMMWORD .column_st15: - ; Store the lower 8 bytes of xmmA to the output when it has enough - ; space. - cmp rcx, byte SIZEOF_MMWORD - jb short .column_st7 - movq XMM_MMWORD [rdi], xmmA - add rdi, byte SIZEOF_MMWORD - sub rcx, byte SIZEOF_MMWORD - psrldq xmmA, SIZEOF_MMWORD + ; Store the lower 8 bytes of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_MMWORD + jb short .column_st7 + movq XMM_MMWORD [rdi], xmmA + add rdi, byte SIZEOF_MMWORD + sub rcx, byte SIZEOF_MMWORD + psrldq xmmA, SIZEOF_MMWORD .column_st7: - ; Store the lower 4 bytes of xmmA to the output when it has enough - ; space. - cmp rcx, byte SIZEOF_DWORD - jb short .column_st3 - movd XMM_DWORD [rdi], xmmA - add rdi, byte SIZEOF_DWORD - sub rcx, byte SIZEOF_DWORD - psrldq xmmA, SIZEOF_DWORD + ; Store the lower 4 bytes of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_DWORD + jb short .column_st3 + movd XMM_DWORD [rdi], xmmA + add rdi, byte SIZEOF_DWORD + sub rcx, byte SIZEOF_DWORD + psrldq xmmA, SIZEOF_DWORD .column_st3: - ; Store the lower 2 bytes of rax to the output when it has enough - ; space. - movd eax, xmmA - cmp rcx, byte SIZEOF_WORD - jb short .column_st1 - mov WORD [rdi], ax - add rdi, byte SIZEOF_WORD - sub rcx, byte SIZEOF_WORD - shr rax, 16 + ; Store the lower 2 bytes of rax to the output when it has enough + ; space. 
+ movd eax, xmmA + cmp rcx, byte SIZEOF_WORD + jb short .column_st1 + mov WORD [rdi], ax + add rdi, byte SIZEOF_WORD + sub rcx, byte SIZEOF_WORD + shr rax, 16 .column_st1: - ; Store the lower 1 byte of rax to the output when it has enough - ; space. - test rcx, rcx - jz short .nextrow - mov BYTE [rdi], al + ; Store the lower 1 byte of rax to the output when it has enough + ; space. + test rcx, rcx + jz short .nextrow + mov BYTE [rdi], al %else ; RGB_PIXELSIZE == 4 ; ----------- %ifdef RGBX_FILLER_0XFF - pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) - pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) + pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) + pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) %else - pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) - pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) + pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) + pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) %endif - ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) - ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) - ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) - ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) - - punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) - punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) - punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) - punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) - - movdqa xmmC,xmmA - punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) - punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) - movdqa xmmG,xmmB - punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) - punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) - - movdqa xmmD,xmmA - punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) - punpckhdq xmmD,xmmB ; 
xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) - movdqa xmmH,xmmC - punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) - punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) - - cmp rcx, byte SIZEOF_XMMWORD - jb short .column_st32 - - test rdi, SIZEOF_XMMWORD-1 - jnz short .out1 - ; --(aligned)------------------- - movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA - movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD - movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC - movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH - jmp short .out0 -.out1: ; --(unaligned)----------------- - movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD - movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC - movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) + + punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) + punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) + punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) + + movdqa xmmC,xmmA + punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) + punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) + movdqa xmmG,xmmB + punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) + punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) + + movdqa xmmD,xmmA + punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + movdqa xmmH,xmmC + punpckldq xmmC,xmmG ; 
xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + cmp rcx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test rdi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC + movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH + jmp short .out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC + movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH .out0: - add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr - sub rcx, byte SIZEOF_XMMWORD - jz near .nextrow + add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub rcx, byte SIZEOF_XMMWORD + jz near .nextrow - add rsi, byte SIZEOF_XMMWORD ; inptr0 - add rbx, byte SIZEOF_XMMWORD ; inptr1 - add rdx, byte SIZEOF_XMMWORD ; inptr2 - jmp near .columnloop + add rsi, byte SIZEOF_XMMWORD ; inptr0 + add rbx, byte SIZEOF_XMMWORD ; inptr1 + add rdx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop .column_st32: - cmp rcx, byte SIZEOF_XMMWORD/2 - jb short .column_st16 - movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD - add rdi, byte 2*SIZEOF_XMMWORD ; outptr - movdqa xmmA,xmmC - movdqa xmmD,xmmH - sub rcx, byte SIZEOF_XMMWORD/2 + cmp rcx, byte SIZEOF_XMMWORD/2 + jb short .column_st16 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + add rdi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmC + movdqa xmmD,xmmH + sub rcx, byte SIZEOF_XMMWORD/2 .column_st16: - cmp rcx, byte SIZEOF_XMMWORD/4 - jb short .column_st15 - movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA - add rdi, byte SIZEOF_XMMWORD ; outptr - movdqa xmmA,xmmD - sub rcx, byte SIZEOF_XMMWORD/4 + cmp rcx, byte SIZEOF_XMMWORD/4 + jb short 
.column_st15 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + add rdi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmD + sub rcx, byte SIZEOF_XMMWORD/4 .column_st15: - ; Store two pixels (8 bytes) of xmmA to the output when it has enough - ; space. - cmp rcx, byte SIZEOF_XMMWORD/8 - jb short .column_st7 - movq MMWORD [rdi], xmmA - add rdi, byte SIZEOF_XMMWORD/8*4 - sub rcx, byte SIZEOF_XMMWORD/8 - psrldq xmmA, SIZEOF_XMMWORD/8*4 + ; Store two pixels (8 bytes) of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_XMMWORD/8 + jb short .column_st7 + movq MMWORD [rdi], xmmA + add rdi, byte SIZEOF_XMMWORD/8*4 + sub rcx, byte SIZEOF_XMMWORD/8 + psrldq xmmA, SIZEOF_XMMWORD/8*4 .column_st7: - ; Store one pixel (4 bytes) of xmmA to the output when it has enough - ; space. - test rcx, rcx - jz short .nextrow - movd XMM_DWORD [rdi], xmmA + ; Store one pixel (4 bytes) of xmmA to the output when it has enough + ; space. + test rcx, rcx + jz short .nextrow + movd XMM_DWORD [rdi], xmmA %endif ; RGB_PIXELSIZE ; --------------- .nextrow: - pop rcx - pop rsi - pop rbx - pop rdx - pop rdi - pop rax - - add rsi, byte SIZEOF_JSAMPROW - add rbx, byte SIZEOF_JSAMPROW - add rdx, byte SIZEOF_JSAMPROW - add rdi, byte SIZEOF_JSAMPROW ; output_buf - dec rax ; num_rows - jg near .rowloop - - sfence ; flush the write buffer + pop rcx + pop rsi + pop rbx + pop rdx + pop rdi + pop rax + + add rsi, byte SIZEOF_JSAMPROW + add rbx, byte SIZEOF_JSAMPROW + add rdx, byte SIZEOF_JSAMPROW + add rdi, byte SIZEOF_JSAMPROW ; output_buf + dec rax ; num_rows + jg near .rowloop + + sfence ; flush the write buffer .return: - pop rbx - uncollect_args - mov rsp,rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp - pop rbp - ret + pop rbx + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jdclrss2.asm b/simd/jdclrss2.asm index 97754cb43..07b4fcf43 100644 --- a/simd/jdclrss2.asm +++ b/simd/jdclrss2.asm @@ -18,7 +18,7 @@ ; [TAB8] %include "jcolsamp.inc" - + ; -------------------------------------------------------------------------- ; ; Convert some rows of samples to the output colorspace. @@ -29,432 +29,432 @@ ; JSAMPARRAY output_buf, int num_rows) ; -%define out_width(b) (b)+8 ; JDIMENSION out_width -%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf -%define input_row(b) (b)+16 ; JDIMENSION input_row -%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf -%define num_rows(b) (b)+24 ; int num_rows +%define out_width(b) (b)+8 ; JDIMENSION out_width +%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf +%define input_row(b) (b)+16 ; JDIMENSION input_row +%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf +%define num_rows(b) (b)+24 ; int num_rows -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 2 -%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 +%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr - align 16 - global EXTN(jsimd_ycc_rgb_convert_sse2) + align 16 + global EXTN(jsimd_ycc_rgb_convert_sse2) EXTN(jsimd_ycc_rgb_convert_sse2): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic eax ; make a room for GOT address - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address - - mov ecx, JDIMENSION [out_width(eax)] ; num_cols - test ecx,ecx - jz near .return - - push ecx - - mov edi, JSAMPIMAGE [input_buf(eax)] - mov ecx, JDIMENSION [input_row(eax)] - mov esi, 
JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] - mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] - mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] - lea esi, [esi+ecx*SIZEOF_JSAMPROW] - lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] - lea edx, [edx+ecx*SIZEOF_JSAMPROW] - - pop ecx - - mov edi, JSAMPARRAY [output_buf(eax)] - mov eax, INT [num_rows(eax)] - test eax,eax - jle near .return - alignx 16,7 + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [out_width(eax)] ; num_cols + test ecx,ecx + jz near .return + + push ecx + + mov edi, JSAMPIMAGE [input_buf(eax)] + mov ecx, JDIMENSION [input_row(eax)] + mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] + lea esi, [esi+ecx*SIZEOF_JSAMPROW] + lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] + lea edx, [edx+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov edi, JSAMPARRAY [output_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax,eax + jle near .return + alignx 16,7 .rowloop: - push eax - push edi - push edx - push ebx - push esi - push ecx ; col - - mov esi, JSAMPROW [esi] ; inptr0 - mov ebx, JSAMPROW [ebx] ; inptr1 - mov edx, JSAMPROW [edx] ; inptr2 - mov edi, JSAMPROW [edi] ; outptr - movpic eax, POINTER [gotptr] ; load GOT address (eax) - alignx 16,7 + push eax + push edi + push edx + push ebx + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr0 + mov ebx, JSAMPROW [ebx] ; inptr1 + mov edx, JSAMPROW [edx] ; inptr2 + mov edi, JSAMPROW [edi] ; outptr + movpic eax, POINTER [gotptr] ; load GOT address (eax) + alignx 16,7 .columnloop: - movdqa xmm5, XMMWORD [ebx] ; 
xmm5=Cb(0123456789ABCDEF) - movdqa xmm1, XMMWORD [edx] ; xmm1=Cr(0123456789ABCDEF) - - pcmpeqw xmm4,xmm4 - pcmpeqw xmm7,xmm7 - psrlw xmm4,BYTE_BIT - psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} - movdqa xmm0,xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..} - - pand xmm4,xmm5 ; xmm4=Cb(02468ACE)=CbE - psrlw xmm5,BYTE_BIT ; xmm5=Cb(13579BDF)=CbO - pand xmm0,xmm1 ; xmm0=Cr(02468ACE)=CrE - psrlw xmm1,BYTE_BIT ; xmm1=Cr(13579BDF)=CrO - - paddw xmm4,xmm7 - paddw xmm5,xmm7 - paddw xmm0,xmm7 - paddw xmm1,xmm7 - - ; (Original) - ; R = Y + 1.40200 * Cr - ; G = Y - 0.34414 * Cb - 0.71414 * Cr - ; B = Y + 1.77200 * Cb - ; - ; (This implementation) - ; R = Y + 0.40200 * Cr + Cr - ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr - ; B = Y - 0.22800 * Cb + Cb + Cb - - movdqa xmm2,xmm4 ; xmm2=CbE - movdqa xmm3,xmm5 ; xmm3=CbO - paddw xmm4,xmm4 ; xmm4=2*CbE - paddw xmm5,xmm5 ; xmm5=2*CbO - movdqa xmm6,xmm0 ; xmm6=CrE - movdqa xmm7,xmm1 ; xmm7=CrO - paddw xmm0,xmm0 ; xmm0=2*CrE - paddw xmm1,xmm1 ; xmm1=2*CrO - - pmulhw xmm4,[GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbE * -FIX(0.22800)) - pmulhw xmm5,[GOTOFF(eax,PW_MF0228)] ; xmm5=(2*CbO * -FIX(0.22800)) - pmulhw xmm0,[GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrE * FIX(0.40200)) - pmulhw xmm1,[GOTOFF(eax,PW_F0402)] ; xmm1=(2*CrO * FIX(0.40200)) - - paddw xmm4,[GOTOFF(eax,PW_ONE)] - paddw xmm5,[GOTOFF(eax,PW_ONE)] - psraw xmm4,1 ; xmm4=(CbE * -FIX(0.22800)) - psraw xmm5,1 ; xmm5=(CbO * -FIX(0.22800)) - paddw xmm0,[GOTOFF(eax,PW_ONE)] - paddw xmm1,[GOTOFF(eax,PW_ONE)] - psraw xmm0,1 ; xmm0=(CrE * FIX(0.40200)) - psraw xmm1,1 ; xmm1=(CrO * FIX(0.40200)) - - paddw xmm4,xmm2 - paddw xmm5,xmm3 - paddw xmm4,xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E - paddw xmm5,xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O - paddw xmm0,xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E - paddw xmm1,xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O - - movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E - movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O - - movdqa xmm4,xmm2 - movdqa xmm5,xmm3 - punpcklwd xmm2,xmm6 - 
punpckhwd xmm4,xmm6 - pmaddwd xmm2,[GOTOFF(eax,PW_MF0344_F0285)] - pmaddwd xmm4,[GOTOFF(eax,PW_MF0344_F0285)] - punpcklwd xmm3,xmm7 - punpckhwd xmm5,xmm7 - pmaddwd xmm3,[GOTOFF(eax,PW_MF0344_F0285)] - pmaddwd xmm5,[GOTOFF(eax,PW_MF0344_F0285)] - - paddd xmm2,[GOTOFF(eax,PD_ONEHALF)] - paddd xmm4,[GOTOFF(eax,PD_ONEHALF)] - psrad xmm2,SCALEBITS - psrad xmm4,SCALEBITS - paddd xmm3,[GOTOFF(eax,PD_ONEHALF)] - paddd xmm5,[GOTOFF(eax,PD_ONEHALF)] - psrad xmm3,SCALEBITS - psrad xmm5,SCALEBITS - - packssdw xmm2,xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285) - packssdw xmm3,xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285) - psubw xmm2,xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E - psubw xmm3,xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O - - movdqa xmm5, XMMWORD [esi] ; xmm5=Y(0123456789ABCDEF) - - pcmpeqw xmm4,xmm4 - psrlw xmm4,BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..} - pand xmm4,xmm5 ; xmm4=Y(02468ACE)=YE - psrlw xmm5,BYTE_BIT ; xmm5=Y(13579BDF)=YO - - paddw xmm0,xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE) - paddw xmm1,xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF) - packuswb xmm0,xmm0 ; xmm0=R(02468ACE********) - packuswb xmm1,xmm1 ; xmm1=R(13579BDF********) - - paddw xmm2,xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE) - paddw xmm3,xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF) - packuswb xmm2,xmm2 ; xmm2=G(02468ACE********) - packuswb xmm3,xmm3 ; xmm3=G(13579BDF********) - - paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE) - paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF) - packuswb xmm4,xmm4 ; xmm4=B(02468ACE********) - packuswb xmm5,xmm5 ; xmm5=B(13579BDF********) + movdqa xmm5, XMMWORD [ebx] ; xmm5=Cb(0123456789ABCDEF) + movdqa xmm1, XMMWORD [edx] ; xmm1=Cr(0123456789ABCDEF) + + pcmpeqw xmm4,xmm4 + pcmpeqw xmm7,xmm7 + psrlw xmm4,BYTE_BIT + psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} + movdqa xmm0,xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..} + + pand xmm4,xmm5 ; xmm4=Cb(02468ACE)=CbE + psrlw xmm5,BYTE_BIT ; xmm5=Cb(13579BDF)=CbO + pand xmm0,xmm1 ; 
xmm0=Cr(02468ACE)=CrE + psrlw xmm1,BYTE_BIT ; xmm1=Cr(13579BDF)=CrO + + paddw xmm4,xmm7 + paddw xmm5,xmm7 + paddw xmm0,xmm7 + paddw xmm1,xmm7 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + movdqa xmm2,xmm4 ; xmm2=CbE + movdqa xmm3,xmm5 ; xmm3=CbO + paddw xmm4,xmm4 ; xmm4=2*CbE + paddw xmm5,xmm5 ; xmm5=2*CbO + movdqa xmm6,xmm0 ; xmm6=CrE + movdqa xmm7,xmm1 ; xmm7=CrO + paddw xmm0,xmm0 ; xmm0=2*CrE + paddw xmm1,xmm1 ; xmm1=2*CrO + + pmulhw xmm4,[GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbE * -FIX(0.22800)) + pmulhw xmm5,[GOTOFF(eax,PW_MF0228)] ; xmm5=(2*CbO * -FIX(0.22800)) + pmulhw xmm0,[GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrE * FIX(0.40200)) + pmulhw xmm1,[GOTOFF(eax,PW_F0402)] ; xmm1=(2*CrO * FIX(0.40200)) + + paddw xmm4,[GOTOFF(eax,PW_ONE)] + paddw xmm5,[GOTOFF(eax,PW_ONE)] + psraw xmm4,1 ; xmm4=(CbE * -FIX(0.22800)) + psraw xmm5,1 ; xmm5=(CbO * -FIX(0.22800)) + paddw xmm0,[GOTOFF(eax,PW_ONE)] + paddw xmm1,[GOTOFF(eax,PW_ONE)] + psraw xmm0,1 ; xmm0=(CrE * FIX(0.40200)) + psraw xmm1,1 ; xmm1=(CrO * FIX(0.40200)) + + paddw xmm4,xmm2 + paddw xmm5,xmm3 + paddw xmm4,xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E + paddw xmm5,xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O + paddw xmm0,xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E + paddw xmm1,xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O + + movdqa xmm4,xmm2 + movdqa xmm5,xmm3 + punpcklwd xmm2,xmm6 + punpckhwd xmm4,xmm6 + pmaddwd xmm2,[GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd xmm4,[GOTOFF(eax,PW_MF0344_F0285)] + punpcklwd xmm3,xmm7 + punpckhwd xmm5,xmm7 + pmaddwd xmm3,[GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd xmm5,[GOTOFF(eax,PW_MF0344_F0285)] + + paddd xmm2,[GOTOFF(eax,PD_ONEHALF)] + paddd xmm4,[GOTOFF(eax,PD_ONEHALF)] + psrad xmm2,SCALEBITS + psrad xmm4,SCALEBITS + 
paddd xmm3,[GOTOFF(eax,PD_ONEHALF)] + paddd xmm5,[GOTOFF(eax,PD_ONEHALF)] + psrad xmm3,SCALEBITS + psrad xmm5,SCALEBITS + + packssdw xmm2,xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285) + packssdw xmm3,xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285) + psubw xmm2,xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E + psubw xmm3,xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O + + movdqa xmm5, XMMWORD [esi] ; xmm5=Y(0123456789ABCDEF) + + pcmpeqw xmm4,xmm4 + psrlw xmm4,BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..} + pand xmm4,xmm5 ; xmm4=Y(02468ACE)=YE + psrlw xmm5,BYTE_BIT ; xmm5=Y(13579BDF)=YO + + paddw xmm0,xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE) + paddw xmm1,xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF) + packuswb xmm0,xmm0 ; xmm0=R(02468ACE********) + packuswb xmm1,xmm1 ; xmm1=R(13579BDF********) + + paddw xmm2,xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE) + paddw xmm3,xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF) + packuswb xmm2,xmm2 ; xmm2=G(02468ACE********) + packuswb xmm3,xmm3 ; xmm3=G(13579BDF********) + + paddw xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE) + paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF) + packuswb xmm4,xmm4 ; xmm4=B(02468ACE********) + packuswb xmm5,xmm5 ; xmm5=B(13579BDF********) %if RGB_PIXELSIZE == 3 ; --------------- - ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) - ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) - ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) - ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) - - punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) - punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) - punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) - - movdqa xmmG,xmmA - movdqa xmmH,xmmA - punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) - punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) - - psrldq 
xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) - psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) - - movdqa xmmC,xmmD - movdqa xmmB,xmmD - punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) - punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) - - psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) - - movdqa xmmF,xmmE - punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) - punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) - - pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) - movdqa xmmB,xmmE - punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) - punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) - punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) - - pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) - movdqa xmmB,xmmF - punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) - punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) - punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) - - punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) - punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) - punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) - - cmp ecx, byte SIZEOF_XMMWORD - jb short .column_st32 - - test edi, SIZEOF_XMMWORD-1 - jnz short .out1 - ; --(aligned)------------------- - movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD - movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF - jmp short .out0 -.out1: ; --(unaligned)----------------- - movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD - movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF + ; xmmA=(00 
02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) + + punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) + punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) + + movdqa xmmG,xmmA + movdqa xmmH,xmmA + punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) + punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) + + psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) + psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) + + movdqa xmmC,xmmD + movdqa xmmB,xmmD + punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) + punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) + + psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) + + movdqa xmmF,xmmE + punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) + punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) + + pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) + movdqa xmmB,xmmE + punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) + punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) + punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) + + pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) + movdqa xmmB,xmmF + punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) + punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) + punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) + + 
punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test edi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF + jmp short .out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF .out0: - add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr - sub ecx, byte SIZEOF_XMMWORD - jz near .nextrow + add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub ecx, byte SIZEOF_XMMWORD + jz near .nextrow - add esi, byte SIZEOF_XMMWORD ; inptr0 - add ebx, byte SIZEOF_XMMWORD ; inptr1 - add edx, byte SIZEOF_XMMWORD ; inptr2 - jmp near .columnloop - alignx 16,7 + add esi, byte SIZEOF_XMMWORD ; inptr0 + add ebx, byte SIZEOF_XMMWORD ; inptr1 + add edx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + alignx 16,7 .column_st32: - lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE - cmp ecx, byte 2*SIZEOF_XMMWORD - jb short .column_st16 - movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD - add edi, byte 2*SIZEOF_XMMWORD ; outptr - movdqa xmmA,xmmF - sub ecx, byte 2*SIZEOF_XMMWORD - jmp short .column_st15 + lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE + cmp ecx, byte 2*SIZEOF_XMMWORD + jb short .column_st16 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + add edi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmF + sub ecx, byte 2*SIZEOF_XMMWORD + jmp short .column_st15 .column_st16: - cmp ecx, byte SIZEOF_XMMWORD - jb short .column_st15 - movdqu XMMWORD 
[edi+0*SIZEOF_XMMWORD], xmmA - add edi, byte SIZEOF_XMMWORD ; outptr - movdqa xmmA,xmmD - sub ecx, byte SIZEOF_XMMWORD + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st15 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + add edi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmD + sub ecx, byte SIZEOF_XMMWORD .column_st15: - ; Store the lower 8 bytes of xmmA to the output when it has enough - ; space. - cmp ecx, byte SIZEOF_MMWORD - jb short .column_st7 - movq XMM_MMWORD [edi], xmmA - add edi, byte SIZEOF_MMWORD - sub ecx, byte SIZEOF_MMWORD - psrldq xmmA, SIZEOF_MMWORD + ; Store the lower 8 bytes of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st7 + movq XMM_MMWORD [edi], xmmA + add edi, byte SIZEOF_MMWORD + sub ecx, byte SIZEOF_MMWORD + psrldq xmmA, SIZEOF_MMWORD .column_st7: - ; Store the lower 4 bytes of xmmA to the output when it has enough - ; space. - cmp ecx, byte SIZEOF_DWORD - jb short .column_st3 - movd XMM_DWORD [edi], xmmA - add edi, byte SIZEOF_DWORD - sub ecx, byte SIZEOF_DWORD - psrldq xmmA, SIZEOF_DWORD + ; Store the lower 4 bytes of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_DWORD + jb short .column_st3 + movd XMM_DWORD [edi], xmmA + add edi, byte SIZEOF_DWORD + sub ecx, byte SIZEOF_DWORD + psrldq xmmA, SIZEOF_DWORD .column_st3: - ; Store the lower 2 bytes of eax to the output when it has enough - ; space. - movd eax, xmmA - cmp ecx, byte SIZEOF_WORD - jb short .column_st1 - mov WORD [edi], ax - add edi, byte SIZEOF_WORD - sub ecx, byte SIZEOF_WORD - shr eax, 16 + ; Store the lower 2 bytes of eax to the output when it has enough + ; space. + movd eax, xmmA + cmp ecx, byte SIZEOF_WORD + jb short .column_st1 + mov WORD [edi], ax + add edi, byte SIZEOF_WORD + sub ecx, byte SIZEOF_WORD + shr eax, 16 .column_st1: - ; Store the lower 1 byte of eax to the output when it has enough - ; space. 
- test ecx, ecx - jz short .nextrow - mov BYTE [edi], al + ; Store the lower 1 byte of eax to the output when it has enough + ; space. + test ecx, ecx + jz short .nextrow + mov BYTE [edi], al %else ; RGB_PIXELSIZE == 4 ; ----------- %ifdef RGBX_FILLER_0XFF - pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) - pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) + pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) + pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) %else - pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) - pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) + pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) + pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) %endif - ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) - ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) - ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) - ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) - - punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) - punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) - punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) - punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) - - movdqa xmmC,xmmA - punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) - punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) - movdqa xmmG,xmmB - punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) - punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) - - movdqa xmmD,xmmA - punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) - punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) - movdqa xmmH,xmmC - punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) - punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) - - cmp ecx, byte 
SIZEOF_XMMWORD - jb short .column_st32 - - test edi, SIZEOF_XMMWORD-1 - jnz short .out1 - ; --(aligned)------------------- - movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD - movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC - movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH - jmp short .out0 -.out1: ; --(unaligned)----------------- - movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD - movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC - movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) + + punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) + punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) + punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) + + movdqa xmmC,xmmA + punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) + punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) + movdqa xmmG,xmmB + punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) + punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) + + movdqa xmmD,xmmA + punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + movdqa xmmH,xmmC + punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test edi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; 
--(aligned)------------------- + movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC + movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH + jmp short .out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC + movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH .out0: - add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr - sub ecx, byte SIZEOF_XMMWORD - jz near .nextrow + add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub ecx, byte SIZEOF_XMMWORD + jz near .nextrow - add esi, byte SIZEOF_XMMWORD ; inptr0 - add ebx, byte SIZEOF_XMMWORD ; inptr1 - add edx, byte SIZEOF_XMMWORD ; inptr2 - jmp near .columnloop - alignx 16,7 + add esi, byte SIZEOF_XMMWORD ; inptr0 + add ebx, byte SIZEOF_XMMWORD ; inptr1 + add edx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + alignx 16,7 .column_st32: - cmp ecx, byte SIZEOF_XMMWORD/2 - jb short .column_st16 - movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD - add edi, byte 2*SIZEOF_XMMWORD ; outptr - movdqa xmmA,xmmC - movdqa xmmD,xmmH - sub ecx, byte SIZEOF_XMMWORD/2 + cmp ecx, byte SIZEOF_XMMWORD/2 + jb short .column_st16 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + add edi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmC + movdqa xmmD,xmmH + sub ecx, byte SIZEOF_XMMWORD/2 .column_st16: - cmp ecx, byte SIZEOF_XMMWORD/4 - jb short .column_st15 - movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - add edi, byte SIZEOF_XMMWORD ; outptr - movdqa xmmA,xmmD - sub ecx, byte SIZEOF_XMMWORD/4 + cmp ecx, byte SIZEOF_XMMWORD/4 + jb short .column_st15 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + add edi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmD + sub ecx, byte SIZEOF_XMMWORD/4 .column_st15: - ; Store two pixels (8 bytes) of xmmA to the output 
when it has enough - ; space. - cmp ecx, byte SIZEOF_XMMWORD/8 - jb short .column_st7 - movq XMM_MMWORD [edi], xmmA - add edi, byte SIZEOF_XMMWORD/8*4 - sub ecx, byte SIZEOF_XMMWORD/8 - psrldq xmmA, SIZEOF_XMMWORD/8*4 + ; Store two pixels (8 bytes) of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_XMMWORD/8 + jb short .column_st7 + movq XMM_MMWORD [edi], xmmA + add edi, byte SIZEOF_XMMWORD/8*4 + sub ecx, byte SIZEOF_XMMWORD/8 + psrldq xmmA, SIZEOF_XMMWORD/8*4 .column_st7: - ; Store one pixel (4 bytes) of xmmA to the output when it has enough - ; space. - test ecx, ecx - jz short .nextrow - movd XMM_DWORD [edi], xmmA + ; Store one pixel (4 bytes) of xmmA to the output when it has enough + ; space. + test ecx, ecx + jz short .nextrow + movd XMM_DWORD [edi], xmmA %endif ; RGB_PIXELSIZE ; --------------- - alignx 16,7 + alignx 16,7 .nextrow: - pop ecx - pop esi - pop ebx - pop edx - pop edi - pop eax - - add esi, byte SIZEOF_JSAMPROW - add ebx, byte SIZEOF_JSAMPROW - add edx, byte SIZEOF_JSAMPROW - add edi, byte SIZEOF_JSAMPROW ; output_buf - dec eax ; num_rows - jg near .rowloop - - sfence ; flush the write buffer + pop ecx + pop esi + pop ebx + pop edx + pop edi + pop eax + + add esi, byte SIZEOF_JSAMPROW + add ebx, byte SIZEOF_JSAMPROW + add edx, byte SIZEOF_JSAMPROW + add edi, byte SIZEOF_JSAMPROW ; output_buf + dec eax ; num_rows + jg near .rowloop + + sfence ; flush the write buffer .return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jdcolmmx.asm b/simd/jdcolmmx.asm index 5e4e47d42..d2966fe23 100644 --- a/simd/jdcolmmx.asm +++ b/simd/jdcolmmx.asm @@ -21,35 +21,35 @@ ; -------------------------------------------------------------------------- -%define SCALEBITS 16 +%define SCALEBITS 16 -F_0_344 equ 22554 ; FIX(0.34414) -F_0_714 equ 46802 ; FIX(0.71414) -F_1_402 equ 91881 ; FIX(1.40200) -F_1_772 equ 116130 ; FIX(1.77200) -F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) -F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) -F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) +F_0_344 equ 22554 ; FIX(0.34414) +F_0_714 equ 46802 ; FIX(0.71414) +F_1_402 equ 91881 ; FIX(1.40200) +F_1_772 equ 116130 ; FIX(1.77200) +F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) +F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) +F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_ycc_rgb_convert_mmx) + alignz 16 + global EXTN(jconst_ycc_rgb_convert_mmx) EXTN(jconst_ycc_rgb_convert_mmx): -PW_F0402 times 4 dw F_0_402 -PW_MF0228 times 4 dw -F_0_228 -PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285 -PW_ONE times 4 dw 1 -PD_ONEHALF times 2 dd 1 << (SCALEBITS-1) +PW_F0402 times 4 dw F_0_402 +PW_MF0228 times 4 dw -F_0_228 +PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285 +PW_ONE times 4 dw 1 +PD_ONEHALF times 2 dd 1 << (SCALEBITS-1) - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 %include "jdclrmmx.asm" diff --git a/simd/jdcolss2-64.asm b/simd/jdcolss2-64.asm index 01b3dce69..b484618da 100644 --- a/simd/jdcolss2-64.asm +++ b/simd/jdcolss2-64.asm @@ -21,35 +21,35 @@ ; -------------------------------------------------------------------------- -%define SCALEBITS 16 +%define SCALEBITS 16 -F_0_344 equ 22554 ; 
FIX(0.34414) -F_0_714 equ 46802 ; FIX(0.71414) -F_1_402 equ 91881 ; FIX(1.40200) -F_1_772 equ 116130 ; FIX(1.77200) -F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) -F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) -F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) +F_0_344 equ 22554 ; FIX(0.34414) +F_0_714 equ 46802 ; FIX(0.71414) +F_1_402 equ 91881 ; FIX(1.40200) +F_1_772 equ 116130 ; FIX(1.77200) +F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) +F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) +F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_ycc_rgb_convert_sse2) + alignz 16 + global EXTN(jconst_ycc_rgb_convert_sse2) EXTN(jconst_ycc_rgb_convert_sse2): -PW_F0402 times 8 dw F_0_402 -PW_MF0228 times 8 dw -F_0_228 -PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 -PW_ONE times 8 dw 1 -PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) +PW_F0402 times 8 dw F_0_402 +PW_MF0228 times 8 dw -F_0_228 +PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 +PW_ONE times 8 dw 1 +PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 64 + SECTION SEG_TEXT + BITS 64 %include "jdclrss2-64.asm" diff --git a/simd/jdcolss2.asm b/simd/jdcolss2.asm index 1912d92e4..38ed4164a 100644 --- a/simd/jdcolss2.asm +++ b/simd/jdcolss2.asm @@ -21,35 +21,35 @@ ; -------------------------------------------------------------------------- -%define SCALEBITS 16 +%define SCALEBITS 16 -F_0_344 equ 22554 ; FIX(0.34414) -F_0_714 equ 46802 ; FIX(0.71414) -F_1_402 equ 91881 ; FIX(1.40200) -F_1_772 equ 116130 ; FIX(1.77200) -F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) -F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) -F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) +F_0_344 equ 22554 ; FIX(0.34414) +F_0_714 equ 
46802 ; FIX(0.71414) +F_1_402 equ 91881 ; FIX(1.40200) +F_1_772 equ 116130 ; FIX(1.77200) +F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) +F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) +F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_ycc_rgb_convert_sse2) + alignz 16 + global EXTN(jconst_ycc_rgb_convert_sse2) EXTN(jconst_ycc_rgb_convert_sse2): -PW_F0402 times 8 dw F_0_402 -PW_MF0228 times 8 dw -F_0_228 -PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 -PW_ONE times 8 dw 1 -PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) +PW_F0402 times 8 dw F_0_402 +PW_MF0228 times 8 dw -F_0_228 +PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 +PW_ONE times 8 dw 1 +PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 %include "jdclrss2.asm" diff --git a/simd/jdct.inc b/simd/jdct.inc index cc6270425..ad5890c6c 100644 --- a/simd/jdct.inc +++ b/simd/jdct.inc @@ -18,11 +18,11 @@ ; %define RANGE_MASK (MAXJSAMPLE * 4 + 3) ; 2 bits wider than legal samples -%define ROW(n,b,s) ((b)+(n)*(s)) -%define COL(n,b,s) ((b)+(n)*(s)*DCTSIZE) +%define ROW(n,b,s) ((b)+(n)*(s)) +%define COL(n,b,s) ((b)+(n)*(s)*DCTSIZE) -%define DWBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_DWORD) -%define MMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_MMWORD) -%define XMMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_XMMWORD) +%define DWBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_DWORD) +%define MMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_MMWORD) +%define XMMBLOCK(m,n,b,s) ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_XMMWORD) ; -------------------------------------------------------------------------- diff --git a/simd/jdmermmx.asm b/simd/jdmermmx.asm index 7b86c7493..c2093c2cd 100644 --- a/simd/jdmermmx.asm +++ 
b/simd/jdmermmx.asm @@ -21,35 +21,35 @@ ; -------------------------------------------------------------------------- -%define SCALEBITS 16 +%define SCALEBITS 16 -F_0_344 equ 22554 ; FIX(0.34414) -F_0_714 equ 46802 ; FIX(0.71414) -F_1_402 equ 91881 ; FIX(1.40200) -F_1_772 equ 116130 ; FIX(1.77200) -F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) -F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) -F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) +F_0_344 equ 22554 ; FIX(0.34414) +F_0_714 equ 46802 ; FIX(0.71414) +F_1_402 equ 91881 ; FIX(1.40200) +F_1_772 equ 116130 ; FIX(1.77200) +F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) +F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) +F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_merged_upsample_mmx) + alignz 16 + global EXTN(jconst_merged_upsample_mmx) EXTN(jconst_merged_upsample_mmx): -PW_F0402 times 4 dw F_0_402 -PW_MF0228 times 4 dw -F_0_228 -PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285 -PW_ONE times 4 dw 1 -PD_ONEHALF times 2 dd 1 << (SCALEBITS-1) +PW_F0402 times 4 dw F_0_402 +PW_MF0228 times 4 dw -F_0_228 +PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285 +PW_ONE times 4 dw 1 +PD_ONEHALF times 2 dd 1 << (SCALEBITS-1) - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 %include "jdmrgmmx.asm" diff --git a/simd/jdmerss2-64.asm b/simd/jdmerss2-64.asm index a184ea69b..1f0b33985 100644 --- a/simd/jdmerss2-64.asm +++ b/simd/jdmerss2-64.asm @@ -21,35 +21,35 @@ ; -------------------------------------------------------------------------- -%define SCALEBITS 16 +%define SCALEBITS 16 -F_0_344 equ 22554 ; FIX(0.34414) -F_0_714 equ 46802 ; FIX(0.71414) -F_1_402 equ 91881 ; FIX(1.40200) -F_1_772 equ 116130 ; FIX(1.77200) -F_0_402 equ (F_1_402 - 
65536) ; FIX(1.40200) - FIX(1) -F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) -F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) +F_0_344 equ 22554 ; FIX(0.34414) +F_0_714 equ 46802 ; FIX(0.71414) +F_1_402 equ 91881 ; FIX(1.40200) +F_1_772 equ 116130 ; FIX(1.77200) +F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) +F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) +F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_merged_upsample_sse2) + alignz 16 + global EXTN(jconst_merged_upsample_sse2) EXTN(jconst_merged_upsample_sse2): -PW_F0402 times 8 dw F_0_402 -PW_MF0228 times 8 dw -F_0_228 -PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 -PW_ONE times 8 dw 1 -PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) +PW_F0402 times 8 dw F_0_402 +PW_MF0228 times 8 dw -F_0_228 +PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 +PW_ONE times 8 dw 1 +PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 64 + SECTION SEG_TEXT + BITS 64 %include "jdmrgss2-64.asm" diff --git a/simd/jdmerss2.asm b/simd/jdmerss2.asm index e536c802e..60d0ebb7f 100644 --- a/simd/jdmerss2.asm +++ b/simd/jdmerss2.asm @@ -21,35 +21,35 @@ ; -------------------------------------------------------------------------- -%define SCALEBITS 16 +%define SCALEBITS 16 -F_0_344 equ 22554 ; FIX(0.34414) -F_0_714 equ 46802 ; FIX(0.71414) -F_1_402 equ 91881 ; FIX(1.40200) -F_1_772 equ 116130 ; FIX(1.77200) -F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) -F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) -F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) +F_0_344 equ 22554 ; FIX(0.34414) +F_0_714 equ 46802 ; FIX(0.71414) +F_1_402 equ 91881 ; FIX(1.40200) +F_1_772 equ 116130 ; FIX(1.77200) +F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - 
FIX(1) +F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) +F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_merged_upsample_sse2) + alignz 16 + global EXTN(jconst_merged_upsample_sse2) EXTN(jconst_merged_upsample_sse2): -PW_F0402 times 8 dw F_0_402 -PW_MF0228 times 8 dw -F_0_228 -PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 -PW_ONE times 8 dw 1 -PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) +PW_F0402 times 8 dw F_0_402 +PW_MF0228 times 8 dw -F_0_228 +PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 +PW_ONE times 8 dw 1 +PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 %include "jdmrgss2.asm" diff --git a/simd/jdmrgmmx.asm b/simd/jdmrgmmx.asm index d0800a737..136f12cf1 100644 --- a/simd/jdmrgmmx.asm +++ b/simd/jdmrgmmx.asm @@ -29,368 +29,368 @@ ; JSAMPARRAY output_buf); ; -%define output_width(b) (b)+8 ; JDIMENSION output_width -%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf -%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr -%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf +%define output_width(b) (b)+8 ; JDIMENSION output_width +%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf +%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr +%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] -%define WK_NUM 3 -%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 3 +%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr - align 16 - global EXTN(jsimd_h2v1_merged_upsample_mmx) + align 16 + global EXTN(jsimd_h2v1_merged_upsample_mmx) 
EXTN(jsimd_h2v1_merged_upsample_mmx): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic eax ; make a room for GOT address - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address - - mov ecx, JDIMENSION [output_width(eax)] ; col - test ecx,ecx - jz near .return - - push ecx - - mov edi, JSAMPIMAGE [input_buf(eax)] - mov ecx, JDIMENSION [in_row_group_ctr(eax)] - mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] - mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] - mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] - mov edi, JSAMPARRAY [output_buf(eax)] - mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0 - mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1 - mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2 - mov edi, JSAMPROW [edi] ; outptr - - pop ecx ; col - - alignx 16,7 + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [output_width(eax)] ; col + test ecx,ecx + jz near .return + + push ecx + + mov edi, JSAMPIMAGE [input_buf(eax)] + mov ecx, JDIMENSION [in_row_group_ctr(eax)] + mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] + mov edi, JSAMPARRAY [output_buf(eax)] + mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0 + mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1 + mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; 
inptr2 + mov edi, JSAMPROW [edi] ; outptr + + pop ecx ; col + + alignx 16,7 .columnloop: - movpic eax, POINTER [gotptr] ; load GOT address (eax) - - movq mm6, MMWORD [ebx] ; mm6=Cb(01234567) - movq mm7, MMWORD [edx] ; mm7=Cr(01234567) - - pxor mm1,mm1 ; mm1=(all 0's) - pcmpeqw mm3,mm3 - psllw mm3,7 ; mm3={0xFF80 0xFF80 0xFF80 0xFF80} - - movq mm4,mm6 - punpckhbw mm6,mm1 ; mm6=Cb(4567)=CbH - punpcklbw mm4,mm1 ; mm4=Cb(0123)=CbL - movq mm0,mm7 - punpckhbw mm7,mm1 ; mm7=Cr(4567)=CrH - punpcklbw mm0,mm1 ; mm0=Cr(0123)=CrL - - paddw mm6,mm3 - paddw mm4,mm3 - paddw mm7,mm3 - paddw mm0,mm3 - - ; (Original) - ; R = Y + 1.40200 * Cr - ; G = Y - 0.34414 * Cb - 0.71414 * Cr - ; B = Y + 1.77200 * Cb - ; - ; (This implementation) - ; R = Y + 0.40200 * Cr + Cr - ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr - ; B = Y - 0.22800 * Cb + Cb + Cb - - movq mm5,mm6 ; mm5=CbH - movq mm2,mm4 ; mm2=CbL - paddw mm6,mm6 ; mm6=2*CbH - paddw mm4,mm4 ; mm4=2*CbL - movq mm1,mm7 ; mm1=CrH - movq mm3,mm0 ; mm3=CrL - paddw mm7,mm7 ; mm7=2*CrH - paddw mm0,mm0 ; mm0=2*CrL - - pmulhw mm6,[GOTOFF(eax,PW_MF0228)] ; mm6=(2*CbH * -FIX(0.22800)) - pmulhw mm4,[GOTOFF(eax,PW_MF0228)] ; mm4=(2*CbL * -FIX(0.22800)) - pmulhw mm7,[GOTOFF(eax,PW_F0402)] ; mm7=(2*CrH * FIX(0.40200)) - pmulhw mm0,[GOTOFF(eax,PW_F0402)] ; mm0=(2*CrL * FIX(0.40200)) - - paddw mm6,[GOTOFF(eax,PW_ONE)] - paddw mm4,[GOTOFF(eax,PW_ONE)] - psraw mm6,1 ; mm6=(CbH * -FIX(0.22800)) - psraw mm4,1 ; mm4=(CbL * -FIX(0.22800)) - paddw mm7,[GOTOFF(eax,PW_ONE)] - paddw mm0,[GOTOFF(eax,PW_ONE)] - psraw mm7,1 ; mm7=(CrH * FIX(0.40200)) - psraw mm0,1 ; mm0=(CrL * FIX(0.40200)) - - paddw mm6,mm5 - paddw mm4,mm2 - paddw mm6,mm5 ; mm6=(CbH * FIX(1.77200))=(B-Y)H - paddw mm4,mm2 ; mm4=(CbL * FIX(1.77200))=(B-Y)L - paddw mm7,mm1 ; mm7=(CrH * FIX(1.40200))=(R-Y)H - paddw mm0,mm3 ; mm0=(CrL * FIX(1.40200))=(R-Y)L - - movq MMWORD [wk(0)], mm6 ; wk(0)=(B-Y)H - movq MMWORD [wk(1)], mm7 ; wk(1)=(R-Y)H - - movq mm6,mm5 - movq mm7,mm2 - punpcklwd mm5,mm1 - 
punpckhwd mm6,mm1 - pmaddwd mm5,[GOTOFF(eax,PW_MF0344_F0285)] - pmaddwd mm6,[GOTOFF(eax,PW_MF0344_F0285)] - punpcklwd mm2,mm3 - punpckhwd mm7,mm3 - pmaddwd mm2,[GOTOFF(eax,PW_MF0344_F0285)] - pmaddwd mm7,[GOTOFF(eax,PW_MF0344_F0285)] - - paddd mm5,[GOTOFF(eax,PD_ONEHALF)] - paddd mm6,[GOTOFF(eax,PD_ONEHALF)] - psrad mm5,SCALEBITS - psrad mm6,SCALEBITS - paddd mm2,[GOTOFF(eax,PD_ONEHALF)] - paddd mm7,[GOTOFF(eax,PD_ONEHALF)] - psrad mm2,SCALEBITS - psrad mm7,SCALEBITS - - packssdw mm5,mm6 ; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285) - packssdw mm2,mm7 ; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285) - psubw mm5,mm1 ; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H - psubw mm2,mm3 ; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L - - movq MMWORD [wk(2)], mm5 ; wk(2)=(G-Y)H - - mov al,2 ; Yctr - jmp short .Yloop_1st - alignx 16,7 + movpic eax, POINTER [gotptr] ; load GOT address (eax) + + movq mm6, MMWORD [ebx] ; mm6=Cb(01234567) + movq mm7, MMWORD [edx] ; mm7=Cr(01234567) + + pxor mm1,mm1 ; mm1=(all 0's) + pcmpeqw mm3,mm3 + psllw mm3,7 ; mm3={0xFF80 0xFF80 0xFF80 0xFF80} + + movq mm4,mm6 + punpckhbw mm6,mm1 ; mm6=Cb(4567)=CbH + punpcklbw mm4,mm1 ; mm4=Cb(0123)=CbL + movq mm0,mm7 + punpckhbw mm7,mm1 ; mm7=Cr(4567)=CrH + punpcklbw mm0,mm1 ; mm0=Cr(0123)=CrL + + paddw mm6,mm3 + paddw mm4,mm3 + paddw mm7,mm3 + paddw mm0,mm3 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + movq mm5,mm6 ; mm5=CbH + movq mm2,mm4 ; mm2=CbL + paddw mm6,mm6 ; mm6=2*CbH + paddw mm4,mm4 ; mm4=2*CbL + movq mm1,mm7 ; mm1=CrH + movq mm3,mm0 ; mm3=CrL + paddw mm7,mm7 ; mm7=2*CrH + paddw mm0,mm0 ; mm0=2*CrL + + pmulhw mm6,[GOTOFF(eax,PW_MF0228)] ; mm6=(2*CbH * -FIX(0.22800)) + pmulhw mm4,[GOTOFF(eax,PW_MF0228)] ; mm4=(2*CbL * -FIX(0.22800)) + pmulhw mm7,[GOTOFF(eax,PW_F0402)] ; mm7=(2*CrH * FIX(0.40200)) + pmulhw 
mm0,[GOTOFF(eax,PW_F0402)] ; mm0=(2*CrL * FIX(0.40200)) + + paddw mm6,[GOTOFF(eax,PW_ONE)] + paddw mm4,[GOTOFF(eax,PW_ONE)] + psraw mm6,1 ; mm6=(CbH * -FIX(0.22800)) + psraw mm4,1 ; mm4=(CbL * -FIX(0.22800)) + paddw mm7,[GOTOFF(eax,PW_ONE)] + paddw mm0,[GOTOFF(eax,PW_ONE)] + psraw mm7,1 ; mm7=(CrH * FIX(0.40200)) + psraw mm0,1 ; mm0=(CrL * FIX(0.40200)) + + paddw mm6,mm5 + paddw mm4,mm2 + paddw mm6,mm5 ; mm6=(CbH * FIX(1.77200))=(B-Y)H + paddw mm4,mm2 ; mm4=(CbL * FIX(1.77200))=(B-Y)L + paddw mm7,mm1 ; mm7=(CrH * FIX(1.40200))=(R-Y)H + paddw mm0,mm3 ; mm0=(CrL * FIX(1.40200))=(R-Y)L + + movq MMWORD [wk(0)], mm6 ; wk(0)=(B-Y)H + movq MMWORD [wk(1)], mm7 ; wk(1)=(R-Y)H + + movq mm6,mm5 + movq mm7,mm2 + punpcklwd mm5,mm1 + punpckhwd mm6,mm1 + pmaddwd mm5,[GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd mm6,[GOTOFF(eax,PW_MF0344_F0285)] + punpcklwd mm2,mm3 + punpckhwd mm7,mm3 + pmaddwd mm2,[GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd mm7,[GOTOFF(eax,PW_MF0344_F0285)] + + paddd mm5,[GOTOFF(eax,PD_ONEHALF)] + paddd mm6,[GOTOFF(eax,PD_ONEHALF)] + psrad mm5,SCALEBITS + psrad mm6,SCALEBITS + paddd mm2,[GOTOFF(eax,PD_ONEHALF)] + paddd mm7,[GOTOFF(eax,PD_ONEHALF)] + psrad mm2,SCALEBITS + psrad mm7,SCALEBITS + + packssdw mm5,mm6 ; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285) + packssdw mm2,mm7 ; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285) + psubw mm5,mm1 ; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H + psubw mm2,mm3 ; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L + + movq MMWORD [wk(2)], mm5 ; wk(2)=(G-Y)H + + mov al,2 ; Yctr + jmp short .Yloop_1st + alignx 16,7 .Yloop_2nd: - movq mm0, MMWORD [wk(1)] ; mm0=(R-Y)H - movq mm2, MMWORD [wk(2)] ; mm2=(G-Y)H - movq mm4, MMWORD [wk(0)] ; mm4=(B-Y)H - alignx 16,7 + movq mm0, MMWORD [wk(1)] ; mm0=(R-Y)H + movq mm2, MMWORD [wk(2)] ; mm2=(G-Y)H + movq mm4, MMWORD [wk(0)] ; mm4=(B-Y)H + alignx 16,7 .Yloop_1st: - movq mm7, MMWORD [esi] ; mm7=Y(01234567) + movq mm7, MMWORD [esi] ; mm7=Y(01234567) - pcmpeqw mm6,mm6 - psrlw mm6,BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..} - pand 
mm6,mm7 ; mm6=Y(0246)=YE - psrlw mm7,BYTE_BIT ; mm7=Y(1357)=YO + pcmpeqw mm6,mm6 + psrlw mm6,BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..} + pand mm6,mm7 ; mm6=Y(0246)=YE + psrlw mm7,BYTE_BIT ; mm7=Y(1357)=YO - movq mm1,mm0 ; mm1=mm0=(R-Y)(L/H) - movq mm3,mm2 ; mm3=mm2=(G-Y)(L/H) - movq mm5,mm4 ; mm5=mm4=(B-Y)(L/H) + movq mm1,mm0 ; mm1=mm0=(R-Y)(L/H) + movq mm3,mm2 ; mm3=mm2=(G-Y)(L/H) + movq mm5,mm4 ; mm5=mm4=(B-Y)(L/H) - paddw mm0,mm6 ; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6) - paddw mm1,mm7 ; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7) - packuswb mm0,mm0 ; mm0=(R0 R2 R4 R6 ** ** ** **) - packuswb mm1,mm1 ; mm1=(R1 R3 R5 R7 ** ** ** **) + paddw mm0,mm6 ; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6) + paddw mm1,mm7 ; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7) + packuswb mm0,mm0 ; mm0=(R0 R2 R4 R6 ** ** ** **) + packuswb mm1,mm1 ; mm1=(R1 R3 R5 R7 ** ** ** **) - paddw mm2,mm6 ; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6) - paddw mm3,mm7 ; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7) - packuswb mm2,mm2 ; mm2=(G0 G2 G4 G6 ** ** ** **) - packuswb mm3,mm3 ; mm3=(G1 G3 G5 G7 ** ** ** **) + paddw mm2,mm6 ; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6) + paddw mm3,mm7 ; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7) + packuswb mm2,mm2 ; mm2=(G0 G2 G4 G6 ** ** ** **) + packuswb mm3,mm3 ; mm3=(G1 G3 G5 G7 ** ** ** **) - paddw mm4,mm6 ; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6) - paddw mm5,mm7 ; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7) - packuswb mm4,mm4 ; mm4=(B0 B2 B4 B6 ** ** ** **) - packuswb mm5,mm5 ; mm5=(B1 B3 B5 B7 ** ** ** **) + paddw mm4,mm6 ; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6) + paddw mm5,mm7 ; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7) + packuswb mm4,mm4 ; mm4=(B0 B2 B4 B6 ** ** ** **) + packuswb mm5,mm5 ; mm5=(B1 B3 B5 B7 ** ** ** **) %if RGB_PIXELSIZE == 3 ; --------------- - ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) - ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) - ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) - ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **) + ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) + 
; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) + ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) + ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **) - punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16) - punpcklbw mmE,mmB ; mmE=(20 01 22 03 24 05 26 07) - punpcklbw mmD,mmF ; mmD=(11 21 13 23 15 25 17 27) + punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16) + punpcklbw mmE,mmB ; mmE=(20 01 22 03 24 05 26 07) + punpcklbw mmD,mmF ; mmD=(11 21 13 23 15 25 17 27) - movq mmG,mmA - movq mmH,mmA - punpcklwd mmA,mmE ; mmA=(00 10 20 01 02 12 22 03) - punpckhwd mmG,mmE ; mmG=(04 14 24 05 06 16 26 07) + movq mmG,mmA + movq mmH,mmA + punpcklwd mmA,mmE ; mmA=(00 10 20 01 02 12 22 03) + punpckhwd mmG,mmE ; mmG=(04 14 24 05 06 16 26 07) - psrlq mmH,2*BYTE_BIT ; mmH=(02 12 04 14 06 16 -- --) - psrlq mmE,2*BYTE_BIT ; mmE=(22 03 24 05 26 07 -- --) + psrlq mmH,2*BYTE_BIT ; mmH=(02 12 04 14 06 16 -- --) + psrlq mmE,2*BYTE_BIT ; mmE=(22 03 24 05 26 07 -- --) - movq mmC,mmD - movq mmB,mmD - punpcklwd mmD,mmH ; mmD=(11 21 02 12 13 23 04 14) - punpckhwd mmC,mmH ; mmC=(15 25 06 16 17 27 -- --) + movq mmC,mmD + movq mmB,mmD + punpcklwd mmD,mmH ; mmD=(11 21 02 12 13 23 04 14) + punpckhwd mmC,mmH ; mmC=(15 25 06 16 17 27 -- --) - psrlq mmB,2*BYTE_BIT ; mmB=(13 23 15 25 17 27 -- --) + psrlq mmB,2*BYTE_BIT ; mmB=(13 23 15 25 17 27 -- --) - movq mmF,mmE - punpcklwd mmE,mmB ; mmE=(22 03 13 23 24 05 15 25) - punpckhwd mmF,mmB ; mmF=(26 07 17 27 -- -- -- --) + movq mmF,mmE + punpcklwd mmE,mmB ; mmE=(22 03 13 23 24 05 15 25) + punpckhwd mmF,mmB ; mmF=(26 07 17 27 -- -- -- --) - punpckldq mmA,mmD ; mmA=(00 10 20 01 11 21 02 12) - punpckldq mmE,mmG ; mmE=(22 03 13 23 04 14 24 05) - punpckldq mmC,mmF ; mmC=(15 25 06 16 26 07 17 27) + punpckldq mmA,mmD ; mmA=(00 10 20 01 11 21 02 12) + punpckldq mmE,mmG ; mmE=(22 03 13 23 04 14 24 05) + punpckldq mmC,mmF ; mmC=(15 25 06 16 26 07 17 27) - cmp ecx, byte SIZEOF_MMWORD - jb short .column_st16 + cmp ecx, byte 
SIZEOF_MMWORD + jb short .column_st16 - movq MMWORD [edi+0*SIZEOF_MMWORD], mmA - movq MMWORD [edi+1*SIZEOF_MMWORD], mmE - movq MMWORD [edi+2*SIZEOF_MMWORD], mmC + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq MMWORD [edi+1*SIZEOF_MMWORD], mmE + movq MMWORD [edi+2*SIZEOF_MMWORD], mmC - sub ecx, byte SIZEOF_MMWORD - jz near .endcolumn + sub ecx, byte SIZEOF_MMWORD + jz near .endcolumn - add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr - add esi, byte SIZEOF_MMWORD ; inptr0 - dec al ; Yctr - jnz near .Yloop_2nd + add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr + add esi, byte SIZEOF_MMWORD ; inptr0 + dec al ; Yctr + jnz near .Yloop_2nd - add ebx, byte SIZEOF_MMWORD ; inptr1 - add edx, byte SIZEOF_MMWORD ; inptr2 - jmp near .columnloop - alignx 16,7 + add ebx, byte SIZEOF_MMWORD ; inptr1 + add edx, byte SIZEOF_MMWORD ; inptr2 + jmp near .columnloop + alignx 16,7 .column_st16: - lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE - cmp ecx, byte 2*SIZEOF_MMWORD - jb short .column_st8 - movq MMWORD [edi+0*SIZEOF_MMWORD], mmA - movq MMWORD [edi+1*SIZEOF_MMWORD], mmE - movq mmA,mmC - sub ecx, byte 2*SIZEOF_MMWORD - add edi, byte 2*SIZEOF_MMWORD - jmp short .column_st4 + lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE + cmp ecx, byte 2*SIZEOF_MMWORD + jb short .column_st8 + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq MMWORD [edi+1*SIZEOF_MMWORD], mmE + movq mmA,mmC + sub ecx, byte 2*SIZEOF_MMWORD + add edi, byte 2*SIZEOF_MMWORD + jmp short .column_st4 .column_st8: - cmp ecx, byte SIZEOF_MMWORD - jb short .column_st4 - movq MMWORD [edi+0*SIZEOF_MMWORD], mmA - movq mmA,mmE - sub ecx, byte SIZEOF_MMWORD - add edi, byte SIZEOF_MMWORD + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st4 + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq mmA,mmE + sub ecx, byte SIZEOF_MMWORD + add edi, byte SIZEOF_MMWORD .column_st4: - movd eax,mmA - cmp ecx, byte SIZEOF_DWORD - jb short .column_st2 - mov DWORD [edi+0*SIZEOF_DWORD], eax - psrlq mmA,DWORD_BIT - movd eax,mmA - sub ecx, byte 
SIZEOF_DWORD - add edi, byte SIZEOF_DWORD + movd eax,mmA + cmp ecx, byte SIZEOF_DWORD + jb short .column_st2 + mov DWORD [edi+0*SIZEOF_DWORD], eax + psrlq mmA,DWORD_BIT + movd eax,mmA + sub ecx, byte SIZEOF_DWORD + add edi, byte SIZEOF_DWORD .column_st2: - cmp ecx, byte SIZEOF_WORD - jb short .column_st1 - mov WORD [edi+0*SIZEOF_WORD], ax - shr eax,WORD_BIT - sub ecx, byte SIZEOF_WORD - add edi, byte SIZEOF_WORD + cmp ecx, byte SIZEOF_WORD + jb short .column_st1 + mov WORD [edi+0*SIZEOF_WORD], ax + shr eax,WORD_BIT + sub ecx, byte SIZEOF_WORD + add edi, byte SIZEOF_WORD .column_st1: - cmp ecx, byte SIZEOF_BYTE - jb short .endcolumn - mov BYTE [edi+0*SIZEOF_BYTE], al + cmp ecx, byte SIZEOF_BYTE + jb short .endcolumn + mov BYTE [edi+0*SIZEOF_BYTE], al %else ; RGB_PIXELSIZE == 4 ; ----------- %ifdef RGBX_FILLER_0XFF - pcmpeqb mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **) - pcmpeqb mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **) + pcmpeqb mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **) + pcmpeqb mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **) %else - pxor mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **) - pxor mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **) + pxor mm6,mm6 ; mm6=(X0 X2 X4 X6 ** ** ** **) + pxor mm7,mm7 ; mm7=(X1 X3 X5 X7 ** ** ** **) %endif - ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) - ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) - ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) - ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **) - - punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16) - punpcklbw mmE,mmG ; mmE=(20 30 22 32 24 34 26 36) - punpcklbw mmB,mmD ; mmB=(01 11 03 13 05 15 07 17) - punpcklbw mmF,mmH ; mmF=(21 31 23 33 25 35 27 37) - - movq mmC,mmA - punpcklwd mmA,mmE ; mmA=(00 10 20 30 02 12 22 32) - punpckhwd mmC,mmE ; mmC=(04 14 24 34 06 16 26 36) - movq mmG,mmB - punpcklwd mmB,mmF ; mmB=(01 11 21 31 03 13 23 33) - punpckhwd mmG,mmF ; mmG=(05 15 25 35 07 17 27 37) - - movq mmD,mmA - punpckldq mmA,mmB ; mmA=(00 10 20 30 01 11 
21 31) - punpckhdq mmD,mmB ; mmD=(02 12 22 32 03 13 23 33) - movq mmH,mmC - punpckldq mmC,mmG ; mmC=(04 14 24 34 05 15 25 35) - punpckhdq mmH,mmG ; mmH=(06 16 26 36 07 17 27 37) - - cmp ecx, byte SIZEOF_MMWORD - jb short .column_st16 - - movq MMWORD [edi+0*SIZEOF_MMWORD], mmA - movq MMWORD [edi+1*SIZEOF_MMWORD], mmD - movq MMWORD [edi+2*SIZEOF_MMWORD], mmC - movq MMWORD [edi+3*SIZEOF_MMWORD], mmH - - sub ecx, byte SIZEOF_MMWORD - jz short .endcolumn - - add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr - add esi, byte SIZEOF_MMWORD ; inptr0 - dec al ; Yctr - jnz near .Yloop_2nd - - add ebx, byte SIZEOF_MMWORD ; inptr1 - add edx, byte SIZEOF_MMWORD ; inptr2 - jmp near .columnloop - alignx 16,7 + ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) + ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) + ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) + ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **) + + punpcklbw mmA,mmC ; mmA=(00 10 02 12 04 14 06 16) + punpcklbw mmE,mmG ; mmE=(20 30 22 32 24 34 26 36) + punpcklbw mmB,mmD ; mmB=(01 11 03 13 05 15 07 17) + punpcklbw mmF,mmH ; mmF=(21 31 23 33 25 35 27 37) + + movq mmC,mmA + punpcklwd mmA,mmE ; mmA=(00 10 20 30 02 12 22 32) + punpckhwd mmC,mmE ; mmC=(04 14 24 34 06 16 26 36) + movq mmG,mmB + punpcklwd mmB,mmF ; mmB=(01 11 21 31 03 13 23 33) + punpckhwd mmG,mmF ; mmG=(05 15 25 35 07 17 27 37) + + movq mmD,mmA + punpckldq mmA,mmB ; mmA=(00 10 20 30 01 11 21 31) + punpckhdq mmD,mmB ; mmD=(02 12 22 32 03 13 23 33) + movq mmH,mmC + punpckldq mmC,mmG ; mmC=(04 14 24 34 05 15 25 35) + punpckhdq mmH,mmG ; mmH=(06 16 26 36 07 17 27 37) + + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st16 + + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq MMWORD [edi+1*SIZEOF_MMWORD], mmD + movq MMWORD [edi+2*SIZEOF_MMWORD], mmC + movq MMWORD [edi+3*SIZEOF_MMWORD], mmH + + sub ecx, byte SIZEOF_MMWORD + jz short .endcolumn + + add edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD ; outptr + add esi, byte 
SIZEOF_MMWORD ; inptr0 + dec al ; Yctr + jnz near .Yloop_2nd + + add ebx, byte SIZEOF_MMWORD ; inptr1 + add edx, byte SIZEOF_MMWORD ; inptr2 + jmp near .columnloop + alignx 16,7 .column_st16: - cmp ecx, byte SIZEOF_MMWORD/2 - jb short .column_st8 - movq MMWORD [edi+0*SIZEOF_MMWORD], mmA - movq MMWORD [edi+1*SIZEOF_MMWORD], mmD - movq mmA,mmC - movq mmD,mmH - sub ecx, byte SIZEOF_MMWORD/2 - add edi, byte 2*SIZEOF_MMWORD + cmp ecx, byte SIZEOF_MMWORD/2 + jb short .column_st8 + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq MMWORD [edi+1*SIZEOF_MMWORD], mmD + movq mmA,mmC + movq mmD,mmH + sub ecx, byte SIZEOF_MMWORD/2 + add edi, byte 2*SIZEOF_MMWORD .column_st8: - cmp ecx, byte SIZEOF_MMWORD/4 - jb short .column_st4 - movq MMWORD [edi+0*SIZEOF_MMWORD], mmA - movq mmA,mmD - sub ecx, byte SIZEOF_MMWORD/4 - add edi, byte 1*SIZEOF_MMWORD + cmp ecx, byte SIZEOF_MMWORD/4 + jb short .column_st4 + movq MMWORD [edi+0*SIZEOF_MMWORD], mmA + movq mmA,mmD + sub ecx, byte SIZEOF_MMWORD/4 + add edi, byte 1*SIZEOF_MMWORD .column_st4: - cmp ecx, byte SIZEOF_MMWORD/8 - jb short .endcolumn - movd DWORD [edi+0*SIZEOF_DWORD], mmA + cmp ecx, byte SIZEOF_MMWORD/8 + jb short .endcolumn + movd DWORD [edi+0*SIZEOF_DWORD], mmA %endif ; RGB_PIXELSIZE ; --------------- .endcolumn: - emms ; empty MMX state + emms ; empty MMX state .return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret ; -------------------------------------------------------------------------- ; @@ -403,62 +403,62 @@ EXTN(jsimd_h2v1_merged_upsample_mmx): ; JSAMPARRAY output_buf); ; -%define output_width(b) (b)+8 ; JDIMENSION output_width -%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf -%define in_row_group_ctr(b) 
(b)+16 ; JDIMENSION in_row_group_ctr -%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf +%define output_width(b) (b)+8 ; JDIMENSION output_width +%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf +%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr +%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf - align 16 - global EXTN(jsimd_h2v2_merged_upsample_mmx) + align 16 + global EXTN(jsimd_h2v2_merged_upsample_mmx) EXTN(jsimd_h2v2_merged_upsample_mmx): - push ebp - mov ebp,esp - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - mov eax, JDIMENSION [output_width(ebp)] - - mov edi, JSAMPIMAGE [input_buf(ebp)] - mov ecx, JDIMENSION [in_row_group_ctr(ebp)] - mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] - mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] - mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] - mov edi, JSAMPARRAY [output_buf(ebp)] - lea esi, [esi+ecx*SIZEOF_JSAMPROW] - - push edx ; inptr2 - push ebx ; inptr1 - push esi ; inptr00 - mov ebx,esp - - push edi ; output_buf (outptr0) - push ecx ; in_row_group_ctr - push ebx ; input_buf - push eax ; output_width - - call near EXTN(jsimd_h2v1_merged_upsample_mmx) - - add esi, byte SIZEOF_JSAMPROW ; inptr01 - add edi, byte SIZEOF_JSAMPROW ; outptr1 - mov POINTER [ebx+0*SIZEOF_POINTER], esi - mov POINTER [ebx-1*SIZEOF_POINTER], edi - - call near EXTN(jsimd_h2v1_merged_upsample_mmx) - - add esp, byte 7*SIZEOF_DWORD - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - pop ebp - ret + push ebp + mov ebp,esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov eax, JDIMENSION [output_width(ebp)] + + mov edi, JSAMPIMAGE [input_buf(ebp)] + mov ecx, JDIMENSION [in_row_group_ctr(ebp)] + mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] + mov edi, JSAMPARRAY 
[output_buf(ebp)] + lea esi, [esi+ecx*SIZEOF_JSAMPROW] + + push edx ; inptr2 + push ebx ; inptr1 + push esi ; inptr00 + mov ebx,esp + + push edi ; output_buf (outptr0) + push ecx ; in_row_group_ctr + push ebx ; input_buf + push eax ; output_width + + call near EXTN(jsimd_h2v1_merged_upsample_mmx) + + add esi, byte SIZEOF_JSAMPROW ; inptr01 + add edi, byte SIZEOF_JSAMPROW ; outptr1 + mov POINTER [ebx+0*SIZEOF_POINTER], esi + mov POINTER [ebx-1*SIZEOF_POINTER], edi + + call near EXTN(jsimd_h2v1_merged_upsample_mmx) + + add esp, byte 7*SIZEOF_DWORD + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/jdmrgss2-64.asm b/simd/jdmrgss2-64.asm index ffbf6b25e..d0e1ea708 100644 --- a/simd/jdmrgss2-64.asm +++ b/simd/jdmrgss2-64.asm @@ -18,7 +18,7 @@ ; [TAB8] %include "jcolsamp.inc" - + ; -------------------------------------------------------------------------- ; ; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical. 
@@ -35,399 +35,399 @@ ; r12 = JDIMENSION in_row_group_ctr ; r13 = JSAMPARRAY output_buf -%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 3 +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 3 - align 16 - global EXTN(jsimd_h2v1_merged_upsample_sse2) + align 16 + global EXTN(jsimd_h2v1_merged_upsample_sse2) EXTN(jsimd_h2v1_merged_upsample_sse2): - push rbp - mov rax,rsp ; rax = original rbp - sub rsp, byte 4 - and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp],rax - mov rbp,rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - collect_args - push rbx - - mov rcx, r10 ; col - test rcx,rcx - jz near .return - - push rcx - - mov rdi, r11 - mov rcx, r12 - mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] - mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] - mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] - mov rdi, r13 - mov rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW] ; inptr0 - mov rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW] ; inptr1 - mov rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW] ; inptr2 - mov rdi, JSAMPROW [rdi] ; outptr - - pop rcx ; col + push rbp + mov rax,rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args + push rbx + + mov rcx, r10 ; col + test rcx,rcx + jz near .return + + push rcx + + mov rdi, r11 + mov rcx, r12 + mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] + mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] + mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] + mov rdi, r13 + mov rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW] ; inptr0 + mov rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW] ; inptr1 + mov rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW] ; inptr2 + mov rdi, JSAMPROW [rdi] ; outptr + + pop rcx ; col .columnloop: - movdqa xmm6, XMMWORD [rbx] ; xmm6=Cb(0123456789ABCDEF) - movdqa xmm7, XMMWORD [rdx] ; xmm7=Cr(0123456789ABCDEF) - - pxor xmm1,xmm1 ; xmm1=(all 0's) - pcmpeqw xmm3,xmm3 - psllw xmm3,7 ; 
xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..} - - movdqa xmm4,xmm6 - punpckhbw xmm6,xmm1 ; xmm6=Cb(89ABCDEF)=CbH - punpcklbw xmm4,xmm1 ; xmm4=Cb(01234567)=CbL - movdqa xmm0,xmm7 - punpckhbw xmm7,xmm1 ; xmm7=Cr(89ABCDEF)=CrH - punpcklbw xmm0,xmm1 ; xmm0=Cr(01234567)=CrL - - paddw xmm6,xmm3 - paddw xmm4,xmm3 - paddw xmm7,xmm3 - paddw xmm0,xmm3 - - ; (Original) - ; R = Y + 1.40200 * Cr - ; G = Y - 0.34414 * Cb - 0.71414 * Cr - ; B = Y + 1.77200 * Cb - ; - ; (This implementation) - ; R = Y + 0.40200 * Cr + Cr - ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr - ; B = Y - 0.22800 * Cb + Cb + Cb - - movdqa xmm5,xmm6 ; xmm5=CbH - movdqa xmm2,xmm4 ; xmm2=CbL - paddw xmm6,xmm6 ; xmm6=2*CbH - paddw xmm4,xmm4 ; xmm4=2*CbL - movdqa xmm1,xmm7 ; xmm1=CrH - movdqa xmm3,xmm0 ; xmm3=CrL - paddw xmm7,xmm7 ; xmm7=2*CrH - paddw xmm0,xmm0 ; xmm0=2*CrL - - pmulhw xmm6,[rel PW_MF0228] ; xmm6=(2*CbH * -FIX(0.22800)) - pmulhw xmm4,[rel PW_MF0228] ; xmm4=(2*CbL * -FIX(0.22800)) - pmulhw xmm7,[rel PW_F0402] ; xmm7=(2*CrH * FIX(0.40200)) - pmulhw xmm0,[rel PW_F0402] ; xmm0=(2*CrL * FIX(0.40200)) - - paddw xmm6,[rel PW_ONE] - paddw xmm4,[rel PW_ONE] - psraw xmm6,1 ; xmm6=(CbH * -FIX(0.22800)) - psraw xmm4,1 ; xmm4=(CbL * -FIX(0.22800)) - paddw xmm7,[rel PW_ONE] - paddw xmm0,[rel PW_ONE] - psraw xmm7,1 ; xmm7=(CrH * FIX(0.40200)) - psraw xmm0,1 ; xmm0=(CrL * FIX(0.40200)) - - paddw xmm6,xmm5 - paddw xmm4,xmm2 - paddw xmm6,xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H - paddw xmm4,xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L - paddw xmm7,xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H - paddw xmm0,xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L - - movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H - movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H - - movdqa xmm6,xmm5 - movdqa xmm7,xmm2 - punpcklwd xmm5,xmm1 - punpckhwd xmm6,xmm1 - pmaddwd xmm5,[rel PW_MF0344_F0285] - pmaddwd xmm6,[rel PW_MF0344_F0285] - punpcklwd xmm2,xmm3 - punpckhwd xmm7,xmm3 - pmaddwd xmm2,[rel PW_MF0344_F0285] - pmaddwd xmm7,[rel PW_MF0344_F0285] - - paddd xmm5,[rel 
PD_ONEHALF] - paddd xmm6,[rel PD_ONEHALF] - psrad xmm5,SCALEBITS - psrad xmm6,SCALEBITS - paddd xmm2,[rel PD_ONEHALF] - paddd xmm7,[rel PD_ONEHALF] - psrad xmm2,SCALEBITS - psrad xmm7,SCALEBITS - - packssdw xmm5,xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285) - packssdw xmm2,xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285) - psubw xmm5,xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H - psubw xmm2,xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L - - movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H - - mov al,2 ; Yctr - jmp short .Yloop_1st + movdqa xmm6, XMMWORD [rbx] ; xmm6=Cb(0123456789ABCDEF) + movdqa xmm7, XMMWORD [rdx] ; xmm7=Cr(0123456789ABCDEF) + + pxor xmm1,xmm1 ; xmm1=(all 0's) + pcmpeqw xmm3,xmm3 + psllw xmm3,7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..} + + movdqa xmm4,xmm6 + punpckhbw xmm6,xmm1 ; xmm6=Cb(89ABCDEF)=CbH + punpcklbw xmm4,xmm1 ; xmm4=Cb(01234567)=CbL + movdqa xmm0,xmm7 + punpckhbw xmm7,xmm1 ; xmm7=Cr(89ABCDEF)=CrH + punpcklbw xmm0,xmm1 ; xmm0=Cr(01234567)=CrL + + paddw xmm6,xmm3 + paddw xmm4,xmm3 + paddw xmm7,xmm3 + paddw xmm0,xmm3 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + movdqa xmm5,xmm6 ; xmm5=CbH + movdqa xmm2,xmm4 ; xmm2=CbL + paddw xmm6,xmm6 ; xmm6=2*CbH + paddw xmm4,xmm4 ; xmm4=2*CbL + movdqa xmm1,xmm7 ; xmm1=CrH + movdqa xmm3,xmm0 ; xmm3=CrL + paddw xmm7,xmm7 ; xmm7=2*CrH + paddw xmm0,xmm0 ; xmm0=2*CrL + + pmulhw xmm6,[rel PW_MF0228] ; xmm6=(2*CbH * -FIX(0.22800)) + pmulhw xmm4,[rel PW_MF0228] ; xmm4=(2*CbL * -FIX(0.22800)) + pmulhw xmm7,[rel PW_F0402] ; xmm7=(2*CrH * FIX(0.40200)) + pmulhw xmm0,[rel PW_F0402] ; xmm0=(2*CrL * FIX(0.40200)) + + paddw xmm6,[rel PW_ONE] + paddw xmm4,[rel PW_ONE] + psraw xmm6,1 ; xmm6=(CbH * -FIX(0.22800)) + psraw xmm4,1 ; xmm4=(CbL * -FIX(0.22800)) + paddw xmm7,[rel PW_ONE] + paddw xmm0,[rel PW_ONE] + psraw xmm7,1 
; xmm7=(CrH * FIX(0.40200)) + psraw xmm0,1 ; xmm0=(CrL * FIX(0.40200)) + + paddw xmm6,xmm5 + paddw xmm4,xmm2 + paddw xmm6,xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H + paddw xmm4,xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L + paddw xmm7,xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H + paddw xmm0,xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L + + movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H + movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H + + movdqa xmm6,xmm5 + movdqa xmm7,xmm2 + punpcklwd xmm5,xmm1 + punpckhwd xmm6,xmm1 + pmaddwd xmm5,[rel PW_MF0344_F0285] + pmaddwd xmm6,[rel PW_MF0344_F0285] + punpcklwd xmm2,xmm3 + punpckhwd xmm7,xmm3 + pmaddwd xmm2,[rel PW_MF0344_F0285] + pmaddwd xmm7,[rel PW_MF0344_F0285] + + paddd xmm5,[rel PD_ONEHALF] + paddd xmm6,[rel PD_ONEHALF] + psrad xmm5,SCALEBITS + psrad xmm6,SCALEBITS + paddd xmm2,[rel PD_ONEHALF] + paddd xmm7,[rel PD_ONEHALF] + psrad xmm2,SCALEBITS + psrad xmm7,SCALEBITS + + packssdw xmm5,xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285) + packssdw xmm2,xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285) + psubw xmm5,xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H + psubw xmm2,xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L + + movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H + + mov al,2 ; Yctr + jmp short .Yloop_1st .Yloop_2nd: - movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H - movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H - movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H + movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H + movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H + movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H .Yloop_1st: - movdqa xmm7, XMMWORD [rsi] ; xmm7=Y(0123456789ABCDEF) + movdqa xmm7, XMMWORD [rsi] ; xmm7=Y(0123456789ABCDEF) - pcmpeqw xmm6,xmm6 - psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} - pand xmm6,xmm7 ; xmm6=Y(02468ACE)=YE - psrlw xmm7,BYTE_BIT ; xmm7=Y(13579BDF)=YO + pcmpeqw xmm6,xmm6 + psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} + pand xmm6,xmm7 ; xmm6=Y(02468ACE)=YE + psrlw xmm7,BYTE_BIT ; xmm7=Y(13579BDF)=YO - movdqa xmm1,xmm0 ; 
xmm1=xmm0=(R-Y)(L/H) - movdqa xmm3,xmm2 ; xmm3=xmm2=(G-Y)(L/H) - movdqa xmm5,xmm4 ; xmm5=xmm4=(B-Y)(L/H) + movdqa xmm1,xmm0 ; xmm1=xmm0=(R-Y)(L/H) + movdqa xmm3,xmm2 ; xmm3=xmm2=(G-Y)(L/H) + movdqa xmm5,xmm4 ; xmm5=xmm4=(B-Y)(L/H) - paddw xmm0,xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE) - paddw xmm1,xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF) - packuswb xmm0,xmm0 ; xmm0=R(02468ACE********) - packuswb xmm1,xmm1 ; xmm1=R(13579BDF********) + paddw xmm0,xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE) + paddw xmm1,xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF) + packuswb xmm0,xmm0 ; xmm0=R(02468ACE********) + packuswb xmm1,xmm1 ; xmm1=R(13579BDF********) - paddw xmm2,xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE) - paddw xmm3,xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF) - packuswb xmm2,xmm2 ; xmm2=G(02468ACE********) - packuswb xmm3,xmm3 ; xmm3=G(13579BDF********) + paddw xmm2,xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE) + paddw xmm3,xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF) + packuswb xmm2,xmm2 ; xmm2=G(02468ACE********) + packuswb xmm3,xmm3 ; xmm3=G(13579BDF********) - paddw xmm4,xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE) - paddw xmm5,xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF) - packuswb xmm4,xmm4 ; xmm4=B(02468ACE********) - packuswb xmm5,xmm5 ; xmm5=B(13579BDF********) + paddw xmm4,xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE) + paddw xmm5,xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF) + packuswb xmm4,xmm4 ; xmm4=B(02468ACE********) + packuswb xmm5,xmm5 ; xmm5=B(13579BDF********) %if RGB_PIXELSIZE == 3 ; --------------- - ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) - ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) - ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) - ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) - - punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) - punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) - punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) - - 
movdqa xmmG,xmmA - movdqa xmmH,xmmA - punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) - punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) - - psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) - psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) - - movdqa xmmC,xmmD - movdqa xmmB,xmmD - punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) - punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) - - psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) - - movdqa xmmF,xmmE - punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) - punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) - - pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) - movdqa xmmB,xmmE - punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) - punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) - punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) - - pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) - movdqa xmmB,xmmF - punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) - punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) - punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) - - punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) - punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) - punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) - - cmp rcx, byte SIZEOF_XMMWORD - jb short .column_st32 - - test rdi, SIZEOF_XMMWORD-1 - jnz short .out1 - ; --(aligned)------------------- - movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA - movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD - movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF - jmp 
short .out0 -.out1: ; --(unaligned)----------------- - movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD - movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) + + punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) + punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) + + movdqa xmmG,xmmA + movdqa xmmH,xmmA + punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) + punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) + + psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) + psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) + + movdqa xmmC,xmmD + movdqa xmmB,xmmD + punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) + punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) + + psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) + + movdqa xmmF,xmmE + punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) + punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) + + pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) + movdqa xmmB,xmmE + punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) + punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) + punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) + + pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) + movdqa xmmB,xmmF + punpckldq xmmG,xmmC ; xmmG=(08 18 28 
09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) + punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) + punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) + + punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + cmp rcx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test rdi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF + jmp short .out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF .out0: - add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr - sub rcx, byte SIZEOF_XMMWORD - jz near .endcolumn + add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub rcx, byte SIZEOF_XMMWORD + jz near .endcolumn - add rsi, byte SIZEOF_XMMWORD ; inptr0 - dec al ; Yctr - jnz near .Yloop_2nd + add rsi, byte SIZEOF_XMMWORD ; inptr0 + dec al ; Yctr + jnz near .Yloop_2nd - add rbx, byte SIZEOF_XMMWORD ; inptr1 - add rdx, byte SIZEOF_XMMWORD ; inptr2 - jmp near .columnloop + add rbx, byte SIZEOF_XMMWORD ; inptr1 + add rdx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop .column_st32: - lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE - cmp rcx, byte 2*SIZEOF_XMMWORD - jb short .column_st16 - movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD - add rdi, byte 2*SIZEOF_XMMWORD ; outptr - movdqa xmmA,xmmF - sub rcx, byte 2*SIZEOF_XMMWORD - jmp short .column_st15 + lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE + cmp rcx, byte 2*SIZEOF_XMMWORD + jb short .column_st16 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD 
[rdi+1*SIZEOF_XMMWORD], xmmD + add rdi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmF + sub rcx, byte 2*SIZEOF_XMMWORD + jmp short .column_st15 .column_st16: - cmp rcx, byte SIZEOF_XMMWORD - jb short .column_st15 - movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA - add rdi, byte SIZEOF_XMMWORD ; outptr - movdqa xmmA,xmmD - sub rcx, byte SIZEOF_XMMWORD + cmp rcx, byte SIZEOF_XMMWORD + jb short .column_st15 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + add rdi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmD + sub rcx, byte SIZEOF_XMMWORD .column_st15: - ; Store the lower 8 bytes of xmmA to the output when it has enough - ; space. - cmp rcx, byte SIZEOF_MMWORD - jb short .column_st7 - movq XMM_MMWORD [rdi], xmmA - add rdi, byte SIZEOF_MMWORD - sub rcx, byte SIZEOF_MMWORD - psrldq xmmA, SIZEOF_MMWORD + ; Store the lower 8 bytes of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_MMWORD + jb short .column_st7 + movq XMM_MMWORD [rdi], xmmA + add rdi, byte SIZEOF_MMWORD + sub rcx, byte SIZEOF_MMWORD + psrldq xmmA, SIZEOF_MMWORD .column_st7: - ; Store the lower 4 bytes of xmmA to the output when it has enough - ; space. - cmp rcx, byte SIZEOF_DWORD - jb short .column_st3 - movd XMM_DWORD [rdi], xmmA - add rdi, byte SIZEOF_DWORD - sub rcx, byte SIZEOF_DWORD - psrldq xmmA, SIZEOF_DWORD + ; Store the lower 4 bytes of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_DWORD + jb short .column_st3 + movd XMM_DWORD [rdi], xmmA + add rdi, byte SIZEOF_DWORD + sub rcx, byte SIZEOF_DWORD + psrldq xmmA, SIZEOF_DWORD .column_st3: - ; Store the lower 2 bytes of rax to the output when it has enough - ; space. - movd eax, xmmA - cmp rcx, byte SIZEOF_WORD - jb short .column_st1 - mov WORD [rdi], ax - add rdi, byte SIZEOF_WORD - sub rcx, byte SIZEOF_WORD - shr rax, 16 + ; Store the lower 2 bytes of rax to the output when it has enough + ; space. 
+ movd eax, xmmA + cmp rcx, byte SIZEOF_WORD + jb short .column_st1 + mov WORD [rdi], ax + add rdi, byte SIZEOF_WORD + sub rcx, byte SIZEOF_WORD + shr rax, 16 .column_st1: - ; Store the lower 1 byte of rax to the output when it has enough - ; space. - test rcx, rcx - jz short .endcolumn - mov BYTE [rdi], al + ; Store the lower 1 byte of rax to the output when it has enough + ; space. + test rcx, rcx + jz short .endcolumn + mov BYTE [rdi], al %else ; RGB_PIXELSIZE == 4 ; ----------- %ifdef RGBX_FILLER_0XFF - pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) - pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) + pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) + pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) %else - pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) - pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) + pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) + pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) %endif - ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) - ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) - ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) - ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) - - punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) - punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) - punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) - punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) - - movdqa xmmC,xmmA - punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) - punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) - movdqa xmmG,xmmB - punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) - punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) - - movdqa xmmD,xmmA - punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) - punpckhdq xmmD,xmmB 
; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) - movdqa xmmH,xmmC - punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) - punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) - - cmp rcx, byte SIZEOF_XMMWORD - jb short .column_st32 - - test rdi, SIZEOF_XMMWORD-1 - jnz short .out1 - ; --(aligned)------------------- - movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA - movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD - movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC - movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH - jmp short .out0 -.out1: ; --(unaligned)----------------- - movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD - movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC - movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) + + punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) + punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) + punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) + + movdqa xmmC,xmmA + punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) + punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) + movdqa xmmG,xmmB + punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) + punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) + + movdqa xmmD,xmmA + punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + movdqa xmmH,xmmC + punpckldq xmmC,xmmG ; 
xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + cmp rcx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test rdi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC + movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH + jmp short .out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC + movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH .out0: - add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr - sub rcx, byte SIZEOF_XMMWORD - jz near .endcolumn + add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub rcx, byte SIZEOF_XMMWORD + jz near .endcolumn - add rsi, byte SIZEOF_XMMWORD ; inptr0 - dec al ; Yctr - jnz near .Yloop_2nd + add rsi, byte SIZEOF_XMMWORD ; inptr0 + dec al ; Yctr + jnz near .Yloop_2nd - add rbx, byte SIZEOF_XMMWORD ; inptr1 - add rdx, byte SIZEOF_XMMWORD ; inptr2 - jmp near .columnloop + add rbx, byte SIZEOF_XMMWORD ; inptr1 + add rdx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop .column_st32: - cmp rcx, byte SIZEOF_XMMWORD/2 - jb short .column_st16 - movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD - add rdi, byte 2*SIZEOF_XMMWORD ; outptr - movdqa xmmA,xmmC - movdqa xmmD,xmmH - sub rcx, byte SIZEOF_XMMWORD/2 + cmp rcx, byte SIZEOF_XMMWORD/2 + jb short .column_st16 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD + add rdi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmC + movdqa xmmD,xmmH + sub rcx, byte SIZEOF_XMMWORD/2 .column_st16: - cmp rcx, byte SIZEOF_XMMWORD/4 - jb short .column_st15 - movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA - add rdi, byte SIZEOF_XMMWORD ; outptr - movdqa 
xmmA,xmmD - sub rcx, byte SIZEOF_XMMWORD/4 + cmp rcx, byte SIZEOF_XMMWORD/4 + jb short .column_st15 + movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA + add rdi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmD + sub rcx, byte SIZEOF_XMMWORD/4 .column_st15: - ; Store two pixels (8 bytes) of xmmA to the output when it has enough - ; space. - cmp rcx, byte SIZEOF_XMMWORD/8 - jb short .column_st7 - movq XMM_MMWORD [rdi], xmmA - add rdi, byte SIZEOF_XMMWORD/8*4 - sub rcx, byte SIZEOF_XMMWORD/8 - psrldq xmmA, SIZEOF_XMMWORD/8*4 + ; Store two pixels (8 bytes) of xmmA to the output when it has enough + ; space. + cmp rcx, byte SIZEOF_XMMWORD/8 + jb short .column_st7 + movq XMM_MMWORD [rdi], xmmA + add rdi, byte SIZEOF_XMMWORD/8*4 + sub rcx, byte SIZEOF_XMMWORD/8 + psrldq xmmA, SIZEOF_XMMWORD/8*4 .column_st7: - ; Store one pixel (4 bytes) of xmmA to the output when it has enough - ; space. - test rcx, rcx - jz short .endcolumn - movd XMM_DWORD [rdi], xmmA + ; Store one pixel (4 bytes) of xmmA to the output when it has enough + ; space. 
+ test rcx, rcx + jz short .endcolumn + movd XMM_DWORD [rdi], xmmA %endif ; RGB_PIXELSIZE ; --------------- .endcolumn: - sfence ; flush the write buffer + sfence ; flush the write buffer .return: - pop rbx - uncollect_args - mov rsp,rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp - pop rbp - ret + pop rbx + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret ; -------------------------------------------------------------------------- ; @@ -445,94 +445,94 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): ; r12 = JDIMENSION in_row_group_ctr ; r13 = JSAMPARRAY output_buf - align 16 - global EXTN(jsimd_h2v2_merged_upsample_sse2) + align 16 + global EXTN(jsimd_h2v2_merged_upsample_sse2) EXTN(jsimd_h2v2_merged_upsample_sse2): - push rbp - mov rax,rsp - mov rbp,rsp - collect_args - push rbx - - mov rax, r10 - - mov rdi, r11 - mov rcx, r12 - mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] - mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] - mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] - mov rdi, r13 - lea rsi, [rsi+rcx*SIZEOF_JSAMPROW] - - push rdx ; inptr2 - push rbx ; inptr1 - push rsi ; inptr00 - mov rbx,rsp - - push rdi - push rcx - push rax - - %ifdef WIN64 - mov r8, rcx - mov r9, rdi - mov rcx, rax - mov rdx, rbx - %else - mov rdx, rcx - mov rcx, rdi - mov rdi, rax - mov rsi, rbx - %endif - - call EXTN(jsimd_h2v1_merged_upsample_sse2) - - pop rax - pop rcx - pop rdi - pop rsi - pop rbx - pop rdx - - add rdi, byte SIZEOF_JSAMPROW ; outptr1 - add rsi, byte SIZEOF_JSAMPROW ; inptr01 - - push rdx ; inptr2 - push rbx ; inptr1 - push rsi ; inptr00 - mov rbx,rsp - - push rdi - push rcx - push rax - - %ifdef WIN64 - mov r8, rcx - mov r9, rdi - mov rcx, rax - mov rdx, rbx - %else - mov rdx, rcx - mov rcx, rdi - mov rdi, rax - mov rsi, rbx - %endif - - call EXTN(jsimd_h2v1_merged_upsample_sse2) - - pop rax - pop rcx - pop rdi - pop rsi - pop rbx - pop rdx - - pop rbx - uncollect_args - pop rbp - ret + push rbp + mov rax,rsp + mov 
rbp,rsp + collect_args + push rbx + + mov rax, r10 + + mov rdi, r11 + mov rcx, r12 + mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] + mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] + mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] + mov rdi, r13 + lea rsi, [rsi+rcx*SIZEOF_JSAMPROW] + + push rdx ; inptr2 + push rbx ; inptr1 + push rsi ; inptr00 + mov rbx,rsp + + push rdi + push rcx + push rax + + %ifdef WIN64 + mov r8, rcx + mov r9, rdi + mov rcx, rax + mov rdx, rbx + %else + mov rdx, rcx + mov rcx, rdi + mov rdi, rax + mov rsi, rbx + %endif + + call EXTN(jsimd_h2v1_merged_upsample_sse2) + + pop rax + pop rcx + pop rdi + pop rsi + pop rbx + pop rdx + + add rdi, byte SIZEOF_JSAMPROW ; outptr1 + add rsi, byte SIZEOF_JSAMPROW ; inptr01 + + push rdx ; inptr2 + push rbx ; inptr1 + push rsi ; inptr00 + mov rbx,rsp + + push rdi + push rcx + push rax + + %ifdef WIN64 + mov r8, rcx + mov r9, rdi + mov rcx, rax + mov rdx, rbx + %else + mov rdx, rcx + mov rcx, rdi + mov rdi, rax + mov rsi, rbx + %endif + + call EXTN(jsimd_h2v1_merged_upsample_sse2) + + pop rax + pop rcx + pop rdi + pop rsi + pop rbx + pop rdx + + pop rbx + uncollect_args + pop rbp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/jdmrgss2.asm b/simd/jdmrgss2.asm index 6494340f2..0206f62b4 100644 --- a/simd/jdmrgss2.asm +++ b/simd/jdmrgss2.asm @@ -18,7 +18,7 @@ ; [TAB8] %include "jcolsamp.inc" - + ; -------------------------------------------------------------------------- ; ; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical. 
@@ -30,422 +30,422 @@ ; JSAMPARRAY output_buf); ; -%define output_width(b) (b)+8 ; JDIMENSION output_width -%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf -%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr -%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf +%define output_width(b) (b)+8 ; JDIMENSION output_width +%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf +%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr +%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 3 -%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 3 +%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr - align 16 - global EXTN(jsimd_h2v1_merged_upsample_sse2) + align 16 + global EXTN(jsimd_h2v1_merged_upsample_sse2) EXTN(jsimd_h2v1_merged_upsample_sse2): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic eax ; make a room for GOT address - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address - - mov ecx, JDIMENSION [output_width(eax)] ; col - test ecx,ecx - jz near .return - - push ecx - - mov edi, JSAMPIMAGE [input_buf(eax)] - mov ecx, JDIMENSION [in_row_group_ctr(eax)] - mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] - mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] - mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] - mov edi, JSAMPARRAY [output_buf(eax)] - mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0 - mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1 - mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2 - mov edi, JSAMPROW [edi] ; outptr - - pop ecx ; 
col - - alignx 16,7 + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov ecx, JDIMENSION [output_width(eax)] ; col + test ecx,ecx + jz near .return + + push ecx + + mov edi, JSAMPIMAGE [input_buf(eax)] + mov ecx, JDIMENSION [in_row_group_ctr(eax)] + mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] + mov edi, JSAMPARRAY [output_buf(eax)] + mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0 + mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1 + mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2 + mov edi, JSAMPROW [edi] ; outptr + + pop ecx ; col + + alignx 16,7 .columnloop: - movpic eax, POINTER [gotptr] ; load GOT address (eax) - - movdqa xmm6, XMMWORD [ebx] ; xmm6=Cb(0123456789ABCDEF) - movdqa xmm7, XMMWORD [edx] ; xmm7=Cr(0123456789ABCDEF) - - pxor xmm1,xmm1 ; xmm1=(all 0's) - pcmpeqw xmm3,xmm3 - psllw xmm3,7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..} - - movdqa xmm4,xmm6 - punpckhbw xmm6,xmm1 ; xmm6=Cb(89ABCDEF)=CbH - punpcklbw xmm4,xmm1 ; xmm4=Cb(01234567)=CbL - movdqa xmm0,xmm7 - punpckhbw xmm7,xmm1 ; xmm7=Cr(89ABCDEF)=CrH - punpcklbw xmm0,xmm1 ; xmm0=Cr(01234567)=CrL - - paddw xmm6,xmm3 - paddw xmm4,xmm3 - paddw xmm7,xmm3 - paddw xmm0,xmm3 - - ; (Original) - ; R = Y + 1.40200 * Cr - ; G = Y - 0.34414 * Cb - 0.71414 * Cr - ; B = Y + 1.77200 * Cb - ; - ; (This implementation) - ; R = Y + 0.40200 * Cr + Cr - ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr - ; B = Y - 0.22800 * Cb + Cb + Cb - - movdqa xmm5,xmm6 ; xmm5=CbH - movdqa xmm2,xmm4 ; xmm2=CbL - paddw xmm6,xmm6 ; xmm6=2*CbH - paddw xmm4,xmm4 ; 
xmm4=2*CbL - movdqa xmm1,xmm7 ; xmm1=CrH - movdqa xmm3,xmm0 ; xmm3=CrL - paddw xmm7,xmm7 ; xmm7=2*CrH - paddw xmm0,xmm0 ; xmm0=2*CrL - - pmulhw xmm6,[GOTOFF(eax,PW_MF0228)] ; xmm6=(2*CbH * -FIX(0.22800)) - pmulhw xmm4,[GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbL * -FIX(0.22800)) - pmulhw xmm7,[GOTOFF(eax,PW_F0402)] ; xmm7=(2*CrH * FIX(0.40200)) - pmulhw xmm0,[GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrL * FIX(0.40200)) - - paddw xmm6,[GOTOFF(eax,PW_ONE)] - paddw xmm4,[GOTOFF(eax,PW_ONE)] - psraw xmm6,1 ; xmm6=(CbH * -FIX(0.22800)) - psraw xmm4,1 ; xmm4=(CbL * -FIX(0.22800)) - paddw xmm7,[GOTOFF(eax,PW_ONE)] - paddw xmm0,[GOTOFF(eax,PW_ONE)] - psraw xmm7,1 ; xmm7=(CrH * FIX(0.40200)) - psraw xmm0,1 ; xmm0=(CrL * FIX(0.40200)) - - paddw xmm6,xmm5 - paddw xmm4,xmm2 - paddw xmm6,xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H - paddw xmm4,xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L - paddw xmm7,xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H - paddw xmm0,xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L - - movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H - movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H - - movdqa xmm6,xmm5 - movdqa xmm7,xmm2 - punpcklwd xmm5,xmm1 - punpckhwd xmm6,xmm1 - pmaddwd xmm5,[GOTOFF(eax,PW_MF0344_F0285)] - pmaddwd xmm6,[GOTOFF(eax,PW_MF0344_F0285)] - punpcklwd xmm2,xmm3 - punpckhwd xmm7,xmm3 - pmaddwd xmm2,[GOTOFF(eax,PW_MF0344_F0285)] - pmaddwd xmm7,[GOTOFF(eax,PW_MF0344_F0285)] - - paddd xmm5,[GOTOFF(eax,PD_ONEHALF)] - paddd xmm6,[GOTOFF(eax,PD_ONEHALF)] - psrad xmm5,SCALEBITS - psrad xmm6,SCALEBITS - paddd xmm2,[GOTOFF(eax,PD_ONEHALF)] - paddd xmm7,[GOTOFF(eax,PD_ONEHALF)] - psrad xmm2,SCALEBITS - psrad xmm7,SCALEBITS - - packssdw xmm5,xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285) - packssdw xmm2,xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285) - psubw xmm5,xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H - psubw xmm2,xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L - - movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H - - mov al,2 ; Yctr - jmp short .Yloop_1st - alignx 16,7 + movpic eax, POINTER 
[gotptr] ; load GOT address (eax) + + movdqa xmm6, XMMWORD [ebx] ; xmm6=Cb(0123456789ABCDEF) + movdqa xmm7, XMMWORD [edx] ; xmm7=Cr(0123456789ABCDEF) + + pxor xmm1,xmm1 ; xmm1=(all 0's) + pcmpeqw xmm3,xmm3 + psllw xmm3,7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..} + + movdqa xmm4,xmm6 + punpckhbw xmm6,xmm1 ; xmm6=Cb(89ABCDEF)=CbH + punpcklbw xmm4,xmm1 ; xmm4=Cb(01234567)=CbL + movdqa xmm0,xmm7 + punpckhbw xmm7,xmm1 ; xmm7=Cr(89ABCDEF)=CrH + punpcklbw xmm0,xmm1 ; xmm0=Cr(01234567)=CrL + + paddw xmm6,xmm3 + paddw xmm4,xmm3 + paddw xmm7,xmm3 + paddw xmm0,xmm3 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + movdqa xmm5,xmm6 ; xmm5=CbH + movdqa xmm2,xmm4 ; xmm2=CbL + paddw xmm6,xmm6 ; xmm6=2*CbH + paddw xmm4,xmm4 ; xmm4=2*CbL + movdqa xmm1,xmm7 ; xmm1=CrH + movdqa xmm3,xmm0 ; xmm3=CrL + paddw xmm7,xmm7 ; xmm7=2*CrH + paddw xmm0,xmm0 ; xmm0=2*CrL + + pmulhw xmm6,[GOTOFF(eax,PW_MF0228)] ; xmm6=(2*CbH * -FIX(0.22800)) + pmulhw xmm4,[GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbL * -FIX(0.22800)) + pmulhw xmm7,[GOTOFF(eax,PW_F0402)] ; xmm7=(2*CrH * FIX(0.40200)) + pmulhw xmm0,[GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrL * FIX(0.40200)) + + paddw xmm6,[GOTOFF(eax,PW_ONE)] + paddw xmm4,[GOTOFF(eax,PW_ONE)] + psraw xmm6,1 ; xmm6=(CbH * -FIX(0.22800)) + psraw xmm4,1 ; xmm4=(CbL * -FIX(0.22800)) + paddw xmm7,[GOTOFF(eax,PW_ONE)] + paddw xmm0,[GOTOFF(eax,PW_ONE)] + psraw xmm7,1 ; xmm7=(CrH * FIX(0.40200)) + psraw xmm0,1 ; xmm0=(CrL * FIX(0.40200)) + + paddw xmm6,xmm5 + paddw xmm4,xmm2 + paddw xmm6,xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H + paddw xmm4,xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L + paddw xmm7,xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H + paddw xmm0,xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L + + movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H + movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H + + movdqa 
xmm6,xmm5 + movdqa xmm7,xmm2 + punpcklwd xmm5,xmm1 + punpckhwd xmm6,xmm1 + pmaddwd xmm5,[GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd xmm6,[GOTOFF(eax,PW_MF0344_F0285)] + punpcklwd xmm2,xmm3 + punpckhwd xmm7,xmm3 + pmaddwd xmm2,[GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd xmm7,[GOTOFF(eax,PW_MF0344_F0285)] + + paddd xmm5,[GOTOFF(eax,PD_ONEHALF)] + paddd xmm6,[GOTOFF(eax,PD_ONEHALF)] + psrad xmm5,SCALEBITS + psrad xmm6,SCALEBITS + paddd xmm2,[GOTOFF(eax,PD_ONEHALF)] + paddd xmm7,[GOTOFF(eax,PD_ONEHALF)] + psrad xmm2,SCALEBITS + psrad xmm7,SCALEBITS + + packssdw xmm5,xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285) + packssdw xmm2,xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285) + psubw xmm5,xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H + psubw xmm2,xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L + + movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H + + mov al,2 ; Yctr + jmp short .Yloop_1st + alignx 16,7 .Yloop_2nd: - movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H - movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H - movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H - alignx 16,7 + movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H + movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H + movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H + alignx 16,7 .Yloop_1st: - movdqa xmm7, XMMWORD [esi] ; xmm7=Y(0123456789ABCDEF) + movdqa xmm7, XMMWORD [esi] ; xmm7=Y(0123456789ABCDEF) - pcmpeqw xmm6,xmm6 - psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} - pand xmm6,xmm7 ; xmm6=Y(02468ACE)=YE - psrlw xmm7,BYTE_BIT ; xmm7=Y(13579BDF)=YO + pcmpeqw xmm6,xmm6 + psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} + pand xmm6,xmm7 ; xmm6=Y(02468ACE)=YE + psrlw xmm7,BYTE_BIT ; xmm7=Y(13579BDF)=YO - movdqa xmm1,xmm0 ; xmm1=xmm0=(R-Y)(L/H) - movdqa xmm3,xmm2 ; xmm3=xmm2=(G-Y)(L/H) - movdqa xmm5,xmm4 ; xmm5=xmm4=(B-Y)(L/H) + movdqa xmm1,xmm0 ; xmm1=xmm0=(R-Y)(L/H) + movdqa xmm3,xmm2 ; xmm3=xmm2=(G-Y)(L/H) + movdqa xmm5,xmm4 ; xmm5=xmm4=(B-Y)(L/H) - paddw xmm0,xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE) - paddw xmm1,xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF) - 
packuswb xmm0,xmm0 ; xmm0=R(02468ACE********) - packuswb xmm1,xmm1 ; xmm1=R(13579BDF********) + paddw xmm0,xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE) + paddw xmm1,xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF) + packuswb xmm0,xmm0 ; xmm0=R(02468ACE********) + packuswb xmm1,xmm1 ; xmm1=R(13579BDF********) - paddw xmm2,xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE) - paddw xmm3,xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF) - packuswb xmm2,xmm2 ; xmm2=G(02468ACE********) - packuswb xmm3,xmm3 ; xmm3=G(13579BDF********) + paddw xmm2,xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE) + paddw xmm3,xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF) + packuswb xmm2,xmm2 ; xmm2=G(02468ACE********) + packuswb xmm3,xmm3 ; xmm3=G(13579BDF********) - paddw xmm4,xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE) - paddw xmm5,xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF) - packuswb xmm4,xmm4 ; xmm4=B(02468ACE********) - packuswb xmm5,xmm5 ; xmm5=B(13579BDF********) + paddw xmm4,xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE) + paddw xmm5,xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF) + packuswb xmm4,xmm4 ; xmm4=B(02468ACE********) + packuswb xmm5,xmm5 ; xmm5=B(13579BDF********) %if RGB_PIXELSIZE == 3 ; --------------- - ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) - ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) - ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) - ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) - - punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) - punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) - punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) - - movdqa xmmG,xmmA - movdqa xmmH,xmmA - punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) - punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) - - psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) - psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- 
--) - - movdqa xmmC,xmmD - movdqa xmmB,xmmD - punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) - punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) - - psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) - - movdqa xmmF,xmmE - punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) - punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) - - pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) - movdqa xmmB,xmmE - punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) - punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) - punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) - - pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) - movdqa xmmB,xmmF - punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) - punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) - punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) - - punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) - punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) - punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) - - cmp ecx, byte SIZEOF_XMMWORD - jb short .column_st32 - - test edi, SIZEOF_XMMWORD-1 - jnz short .out1 - ; --(aligned)------------------- - movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD - movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF - jmp short .out0 -.out1: ; --(unaligned)----------------- - movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD - movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) 
+ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) + + punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) + punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) + + movdqa xmmG,xmmA + movdqa xmmH,xmmA + punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) + punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) + + psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) + psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) + + movdqa xmmC,xmmD + movdqa xmmB,xmmD + punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) + punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) + + psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) + + movdqa xmmF,xmmE + punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) + punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) + + pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) + movdqa xmmB,xmmE + punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) + punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) + punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) + + pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) + movdqa xmmB,xmmF + punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) + punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) + punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) + + punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 
18 28 09 19 29 0A 1A) + punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test edi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF + jmp short .out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF .out0: - add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr - sub ecx, byte SIZEOF_XMMWORD - jz near .endcolumn + add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub ecx, byte SIZEOF_XMMWORD + jz near .endcolumn - add esi, byte SIZEOF_XMMWORD ; inptr0 - dec al ; Yctr - jnz near .Yloop_2nd + add esi, byte SIZEOF_XMMWORD ; inptr0 + dec al ; Yctr + jnz near .Yloop_2nd - add ebx, byte SIZEOF_XMMWORD ; inptr1 - add edx, byte SIZEOF_XMMWORD ; inptr2 - jmp near .columnloop - alignx 16,7 + add ebx, byte SIZEOF_XMMWORD ; inptr1 + add edx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + alignx 16,7 .column_st32: - lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE - cmp ecx, byte 2*SIZEOF_XMMWORD - jb short .column_st16 - movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD - add edi, byte 2*SIZEOF_XMMWORD ; outptr - movdqa xmmA,xmmF - sub ecx, byte 2*SIZEOF_XMMWORD - jmp short .column_st15 + lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE + cmp ecx, byte 2*SIZEOF_XMMWORD + jb short .column_st16 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + add edi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmF + sub ecx, byte 2*SIZEOF_XMMWORD + jmp short .column_st15 .column_st16: - cmp ecx, byte SIZEOF_XMMWORD - jb short .column_st15 - movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - add edi, byte SIZEOF_XMMWORD ; outptr - 
movdqa xmmA,xmmD - sub ecx, byte SIZEOF_XMMWORD + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st15 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + add edi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmD + sub ecx, byte SIZEOF_XMMWORD .column_st15: - ; Store the lower 8 bytes of xmmA to the output when it has enough - ; space. - cmp ecx, byte SIZEOF_MMWORD - jb short .column_st7 - movq XMM_MMWORD [edi], xmmA - add edi, byte SIZEOF_MMWORD - sub ecx, byte SIZEOF_MMWORD - psrldq xmmA, SIZEOF_MMWORD + ; Store the lower 8 bytes of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_MMWORD + jb short .column_st7 + movq XMM_MMWORD [edi], xmmA + add edi, byte SIZEOF_MMWORD + sub ecx, byte SIZEOF_MMWORD + psrldq xmmA, SIZEOF_MMWORD .column_st7: - ; Store the lower 4 bytes of xmmA to the output when it has enough - ; space. - cmp ecx, byte SIZEOF_DWORD - jb short .column_st3 - movd XMM_DWORD [edi], xmmA - add edi, byte SIZEOF_DWORD - sub ecx, byte SIZEOF_DWORD - psrldq xmmA, SIZEOF_DWORD + ; Store the lower 4 bytes of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_DWORD + jb short .column_st3 + movd XMM_DWORD [edi], xmmA + add edi, byte SIZEOF_DWORD + sub ecx, byte SIZEOF_DWORD + psrldq xmmA, SIZEOF_DWORD .column_st3: - ; Store the lower 2 bytes of eax to the output when it has enough - ; space. - movd eax, xmmA - cmp ecx, byte SIZEOF_WORD - jb short .column_st1 - mov WORD [edi], ax - add edi, byte SIZEOF_WORD - sub ecx, byte SIZEOF_WORD - shr eax, 16 + ; Store the lower 2 bytes of eax to the output when it has enough + ; space. + movd eax, xmmA + cmp ecx, byte SIZEOF_WORD + jb short .column_st1 + mov WORD [edi], ax + add edi, byte SIZEOF_WORD + sub ecx, byte SIZEOF_WORD + shr eax, 16 .column_st1: - ; Store the lower 1 byte of eax to the output when it has enough - ; space. - test ecx, ecx - jz short .endcolumn - mov BYTE [edi], al + ; Store the lower 1 byte of eax to the output when it has enough + ; space. 
+ test ecx, ecx + jz short .endcolumn + mov BYTE [edi], al %else ; RGB_PIXELSIZE == 4 ; ----------- %ifdef RGBX_FILLER_0XFF - pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) - pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) + pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) + pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) %else - pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) - pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) + pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) + pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) %endif - ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) - ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) - ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) - ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) - - punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) - punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) - punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) - punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) - - movdqa xmmC,xmmA - punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) - punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) - movdqa xmmG,xmmB - punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) - punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) - - movdqa xmmD,xmmA - punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) - punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) - movdqa xmmH,xmmC - punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) - punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) - - cmp ecx, byte SIZEOF_XMMWORD - jb short .column_st32 - - test edi, SIZEOF_XMMWORD-1 - jnz short .out1 - ; --(aligned)------------------- - movntdq 
XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD - movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC - movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH - jmp short .out0 -.out1: ; --(unaligned)----------------- - movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD - movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC - movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) + + punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) + punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) + punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) + + movdqa xmmC,xmmA + punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) + punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) + movdqa xmmG,xmmB + punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) + punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) + + movdqa xmmD,xmmA + punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + movdqa xmmH,xmmC + punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test edi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD 
[edi+2*SIZEOF_XMMWORD], xmmC + movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH + jmp short .out0 +.out1: ; --(unaligned)----------------- + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC + movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH .out0: - add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr - sub ecx, byte SIZEOF_XMMWORD - jz near .endcolumn + add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + sub ecx, byte SIZEOF_XMMWORD + jz near .endcolumn - add esi, byte SIZEOF_XMMWORD ; inptr0 - dec al ; Yctr - jnz near .Yloop_2nd + add esi, byte SIZEOF_XMMWORD ; inptr0 + dec al ; Yctr + jnz near .Yloop_2nd - add ebx, byte SIZEOF_XMMWORD ; inptr1 - add edx, byte SIZEOF_XMMWORD ; inptr2 - jmp near .columnloop - alignx 16,7 + add ebx, byte SIZEOF_XMMWORD ; inptr1 + add edx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + alignx 16,7 .column_st32: - cmp ecx, byte SIZEOF_XMMWORD/2 - jb short .column_st16 - movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD - add edi, byte 2*SIZEOF_XMMWORD ; outptr - movdqa xmmA,xmmC - movdqa xmmD,xmmH - sub ecx, byte SIZEOF_XMMWORD/2 + cmp ecx, byte SIZEOF_XMMWORD/2 + jb short .column_st16 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + add edi, byte 2*SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmC + movdqa xmmD,xmmH + sub ecx, byte SIZEOF_XMMWORD/2 .column_st16: - cmp ecx, byte SIZEOF_XMMWORD/4 - jb short .column_st15 - movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA - add edi, byte SIZEOF_XMMWORD ; outptr - movdqa xmmA,xmmD - sub ecx, byte SIZEOF_XMMWORD/4 + cmp ecx, byte SIZEOF_XMMWORD/4 + jb short .column_st15 + movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + add edi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmD + sub ecx, byte SIZEOF_XMMWORD/4 .column_st15: - ; Store two pixels (8 bytes) of xmmA to the output when it has enough - ; space. 
- cmp ecx, byte SIZEOF_XMMWORD/8 - jb short .column_st7 - movq XMM_MMWORD [edi], xmmA - add edi, byte SIZEOF_XMMWORD/8*4 - sub ecx, byte SIZEOF_XMMWORD/8 - psrldq xmmA, SIZEOF_XMMWORD/8*4 + ; Store two pixels (8 bytes) of xmmA to the output when it has enough + ; space. + cmp ecx, byte SIZEOF_XMMWORD/8 + jb short .column_st7 + movq XMM_MMWORD [edi], xmmA + add edi, byte SIZEOF_XMMWORD/8*4 + sub ecx, byte SIZEOF_XMMWORD/8 + psrldq xmmA, SIZEOF_XMMWORD/8*4 .column_st7: - ; Store one pixel (4 bytes) of xmmA to the output when it has enough - ; space. - test ecx, ecx - jz short .endcolumn - movd XMM_DWORD [edi], xmmA + ; Store one pixel (4 bytes) of xmmA to the output when it has enough + ; space. + test ecx, ecx + jz short .endcolumn + movd XMM_DWORD [edi], xmmA %endif ; RGB_PIXELSIZE ; --------------- .endcolumn: - sfence ; flush the write buffer + sfence ; flush the write buffer .return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret ; -------------------------------------------------------------------------- ; @@ -458,62 +458,62 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): ; JSAMPARRAY output_buf); ; -%define output_width(b) (b)+8 ; JDIMENSION output_width -%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf -%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr -%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf +%define output_width(b) (b)+8 ; JDIMENSION output_width +%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf +%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr +%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf - align 16 - global EXTN(jsimd_h2v2_merged_upsample_sse2) + align 16 + global 
EXTN(jsimd_h2v2_merged_upsample_sse2) EXTN(jsimd_h2v2_merged_upsample_sse2): - push ebp - mov ebp,esp - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - mov eax, POINTER [output_width(ebp)] - - mov edi, JSAMPIMAGE [input_buf(ebp)] - mov ecx, JDIMENSION [in_row_group_ctr(ebp)] - mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] - mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] - mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] - mov edi, JSAMPARRAY [output_buf(ebp)] - lea esi, [esi+ecx*SIZEOF_JSAMPROW] - - push edx ; inptr2 - push ebx ; inptr1 - push esi ; inptr00 - mov ebx,esp - - push edi ; output_buf (outptr0) - push ecx ; in_row_group_ctr - push ebx ; input_buf - push eax ; output_width - - call near EXTN(jsimd_h2v1_merged_upsample_sse2) - - add esi, byte SIZEOF_JSAMPROW ; inptr01 - add edi, byte SIZEOF_JSAMPROW ; outptr1 - mov POINTER [ebx+0*SIZEOF_POINTER], esi - mov POINTER [ebx-1*SIZEOF_POINTER], edi - - call near EXTN(jsimd_h2v1_merged_upsample_sse2) - - add esp, byte 7*SIZEOF_DWORD - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - pop ebp - ret + push ebp + mov ebp,esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov eax, POINTER [output_width(ebp)] + + mov edi, JSAMPIMAGE [input_buf(ebp)] + mov ecx, JDIMENSION [in_row_group_ctr(ebp)] + mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] + mov edi, JSAMPARRAY [output_buf(ebp)] + lea esi, [esi+ecx*SIZEOF_JSAMPROW] + + push edx ; inptr2 + push ebx ; inptr1 + push esi ; inptr00 + mov ebx,esp + + push edi ; output_buf (outptr0) + push ecx ; in_row_group_ctr + push ebx ; input_buf + push eax ; output_width + + call near EXTN(jsimd_h2v1_merged_upsample_sse2) + + add esi, byte SIZEOF_JSAMPROW ; inptr01 + add edi, byte SIZEOF_JSAMPROW ; outptr1 + mov POINTER 
[ebx+0*SIZEOF_POINTER], esi + mov POINTER [ebx-1*SIZEOF_POINTER], edi + + call near EXTN(jsimd_h2v1_merged_upsample_sse2) + + add esp, byte 7*SIZEOF_DWORD + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/jdsammmx.asm b/simd/jdsammmx.asm index c09e5b96c..823fe191a 100644 --- a/simd/jdsammmx.asm +++ b/simd/jdsammmx.asm @@ -19,24 +19,24 @@ %include "jsimdext.inc" ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_fancy_upsample_mmx) + alignz 16 + global EXTN(jconst_fancy_upsample_mmx) EXTN(jconst_fancy_upsample_mmx): -PW_ONE times 4 dw 1 -PW_TWO times 4 dw 2 -PW_THREE times 4 dw 3 -PW_SEVEN times 4 dw 7 -PW_EIGHT times 4 dw 8 +PW_ONE times 4 dw 1 +PW_TWO times 4 dw 2 +PW_THREE times 4 dw 3 +PW_SEVEN times 4 dw 7 +PW_EIGHT times 4 dw 8 - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 ; ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical. 
; @@ -52,146 +52,146 @@ PW_EIGHT times 4 dw 8 ; JSAMPARRAY * output_data_ptr); ; -%define max_v_samp(b) (b)+8 ; int max_v_samp_factor -%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width -%define input_data(b) (b)+16 ; JSAMPARRAY input_data -%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr +%define max_v_samp(b) (b)+8 ; int max_v_samp_factor +%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width +%define input_data(b) (b)+16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr - align 16 - global EXTN(jsimd_h2v1_fancy_upsample_mmx) + align 16 + global EXTN(jsimd_h2v1_fancy_upsample_mmx) EXTN(jsimd_h2v1_fancy_upsample_mmx): - push ebp - mov ebp,esp - pushpic ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - - mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr - test eax,eax - jz near .return - - mov ecx, INT [max_v_samp(ebp)] ; rowctr - test ecx,ecx - jz near .return - - mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - mov edi, POINTER [output_data_ptr(ebp)] - mov edi, JSAMPARRAY [edi] ; output_data - alignx 16,7 + push ebp + mov ebp,esp + pushpic ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr + test eax,eax + jz near .return + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx,ecx + jz near .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16,7 .rowloop: - push eax ; colctr - push edi - push esi + push eax ; colctr + push edi + push esi - mov esi, JSAMPROW [esi] ; inptr - mov edi, JSAMPROW [edi] ; outptr + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr - test eax, SIZEOF_MMWORD-1 - jz short .skip - mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] 
- mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample + test eax, SIZEOF_MMWORD-1 + jz short .skip + mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample .skip: - pxor mm0,mm0 ; mm0=(all 0's) - pcmpeqb mm7,mm7 - psrlq mm7,(SIZEOF_MMWORD-1)*BYTE_BIT - pand mm7, MMWORD [esi+0*SIZEOF_MMWORD] + pxor mm0,mm0 ; mm0=(all 0's) + pcmpeqb mm7,mm7 + psrlq mm7,(SIZEOF_MMWORD-1)*BYTE_BIT + pand mm7, MMWORD [esi+0*SIZEOF_MMWORD] - add eax, byte SIZEOF_MMWORD-1 - and eax, byte -SIZEOF_MMWORD - cmp eax, byte SIZEOF_MMWORD - ja short .columnloop - alignx 16,7 + add eax, byte SIZEOF_MMWORD-1 + and eax, byte -SIZEOF_MMWORD + cmp eax, byte SIZEOF_MMWORD + ja short .columnloop + alignx 16,7 .columnloop_last: - pcmpeqb mm6,mm6 - psllq mm6,(SIZEOF_MMWORD-1)*BYTE_BIT - pand mm6, MMWORD [esi+0*SIZEOF_MMWORD] - jmp short .upsample - alignx 16,7 + pcmpeqb mm6,mm6 + psllq mm6,(SIZEOF_MMWORD-1)*BYTE_BIT + pand mm6, MMWORD [esi+0*SIZEOF_MMWORD] + jmp short .upsample + alignx 16,7 .columnloop: - movq mm6, MMWORD [esi+1*SIZEOF_MMWORD] - psllq mm6,(SIZEOF_MMWORD-1)*BYTE_BIT + movq mm6, MMWORD [esi+1*SIZEOF_MMWORD] + psllq mm6,(SIZEOF_MMWORD-1)*BYTE_BIT .upsample: - movq mm1, MMWORD [esi+0*SIZEOF_MMWORD] - movq mm2,mm1 - movq mm3,mm1 ; mm1=( 0 1 2 3 4 5 6 7) - psllq mm2,BYTE_BIT ; mm2=( - 0 1 2 3 4 5 6) - psrlq mm3,BYTE_BIT ; mm3=( 1 2 3 4 5 6 7 -) - - por mm2,mm7 ; mm2=(-1 0 1 2 3 4 5 6) - por mm3,mm6 ; mm3=( 1 2 3 4 5 6 7 8) - - movq mm7,mm1 - psrlq mm7,(SIZEOF_MMWORD-1)*BYTE_BIT ; mm7=( 7 - - - - - - -) - - movq mm4,mm1 - punpcklbw mm1,mm0 ; mm1=( 0 1 2 3) - punpckhbw mm4,mm0 ; mm4=( 4 5 6 7) - movq mm5,mm2 - punpcklbw mm2,mm0 ; mm2=(-1 0 1 2) - punpckhbw mm5,mm0 ; mm5=( 3 4 5 6) - movq mm6,mm3 - punpcklbw mm3,mm0 ; mm3=( 1 2 3 4) - punpckhbw mm6,mm0 ; mm6=( 5 6 7 8) - - pmullw mm1,[GOTOFF(ebx,PW_THREE)] - pmullw mm4,[GOTOFF(ebx,PW_THREE)] - paddw mm2,[GOTOFF(ebx,PW_ONE)] - paddw mm5,[GOTOFF(ebx,PW_ONE)] - paddw 
mm3,[GOTOFF(ebx,PW_TWO)] - paddw mm6,[GOTOFF(ebx,PW_TWO)] - - paddw mm2,mm1 - paddw mm5,mm4 - psrlw mm2,2 ; mm2=OutLE=( 0 2 4 6) - psrlw mm5,2 ; mm5=OutHE=( 8 10 12 14) - paddw mm3,mm1 - paddw mm6,mm4 - psrlw mm3,2 ; mm3=OutLO=( 1 3 5 7) - psrlw mm6,2 ; mm6=OutHO=( 9 11 13 15) - - psllw mm3,BYTE_BIT - psllw mm6,BYTE_BIT - por mm2,mm3 ; mm2=OutL=( 0 1 2 3 4 5 6 7) - por mm5,mm6 ; mm5=OutH=( 8 9 10 11 12 13 14 15) - - movq MMWORD [edi+0*SIZEOF_MMWORD], mm2 - movq MMWORD [edi+1*SIZEOF_MMWORD], mm5 - - sub eax, byte SIZEOF_MMWORD - add esi, byte 1*SIZEOF_MMWORD ; inptr - add edi, byte 2*SIZEOF_MMWORD ; outptr - cmp eax, byte SIZEOF_MMWORD - ja near .columnloop - test eax,eax - jnz near .columnloop_last - - pop esi - pop edi - pop eax - - add esi, byte SIZEOF_JSAMPROW ; input_data - add edi, byte SIZEOF_JSAMPROW ; output_data - dec ecx ; rowctr - jg near .rowloop - - emms ; empty MMX state + movq mm1, MMWORD [esi+0*SIZEOF_MMWORD] + movq mm2,mm1 + movq mm3,mm1 ; mm1=( 0 1 2 3 4 5 6 7) + psllq mm2,BYTE_BIT ; mm2=( - 0 1 2 3 4 5 6) + psrlq mm3,BYTE_BIT ; mm3=( 1 2 3 4 5 6 7 -) + + por mm2,mm7 ; mm2=(-1 0 1 2 3 4 5 6) + por mm3,mm6 ; mm3=( 1 2 3 4 5 6 7 8) + + movq mm7,mm1 + psrlq mm7,(SIZEOF_MMWORD-1)*BYTE_BIT ; mm7=( 7 - - - - - - -) + + movq mm4,mm1 + punpcklbw mm1,mm0 ; mm1=( 0 1 2 3) + punpckhbw mm4,mm0 ; mm4=( 4 5 6 7) + movq mm5,mm2 + punpcklbw mm2,mm0 ; mm2=(-1 0 1 2) + punpckhbw mm5,mm0 ; mm5=( 3 4 5 6) + movq mm6,mm3 + punpcklbw mm3,mm0 ; mm3=( 1 2 3 4) + punpckhbw mm6,mm0 ; mm6=( 5 6 7 8) + + pmullw mm1,[GOTOFF(ebx,PW_THREE)] + pmullw mm4,[GOTOFF(ebx,PW_THREE)] + paddw mm2,[GOTOFF(ebx,PW_ONE)] + paddw mm5,[GOTOFF(ebx,PW_ONE)] + paddw mm3,[GOTOFF(ebx,PW_TWO)] + paddw mm6,[GOTOFF(ebx,PW_TWO)] + + paddw mm2,mm1 + paddw mm5,mm4 + psrlw mm2,2 ; mm2=OutLE=( 0 2 4 6) + psrlw mm5,2 ; mm5=OutHE=( 8 10 12 14) + paddw mm3,mm1 + paddw mm6,mm4 + psrlw mm3,2 ; mm3=OutLO=( 1 3 5 7) + psrlw mm6,2 ; mm6=OutHO=( 9 11 13 15) + + psllw mm3,BYTE_BIT + psllw mm6,BYTE_BIT + por mm2,mm3 
; mm2=OutL=( 0 1 2 3 4 5 6 7) + por mm5,mm6 ; mm5=OutH=( 8 9 10 11 12 13 14 15) + + movq MMWORD [edi+0*SIZEOF_MMWORD], mm2 + movq MMWORD [edi+1*SIZEOF_MMWORD], mm5 + + sub eax, byte SIZEOF_MMWORD + add esi, byte 1*SIZEOF_MMWORD ; inptr + add edi, byte 2*SIZEOF_MMWORD ; outptr + cmp eax, byte SIZEOF_MMWORD + ja near .columnloop + test eax,eax + jnz near .columnloop_last + + pop esi + pop edi + pop eax + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec ecx ; rowctr + jg near .rowloop + + emms ; empty MMX state .return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - poppic ebx - pop ebp - ret + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + poppic ebx + pop ebp + ret ; -------------------------------------------------------------------------- ; @@ -205,324 +205,324 @@ EXTN(jsimd_h2v1_fancy_upsample_mmx): ; JSAMPARRAY * output_data_ptr); ; -%define max_v_samp(b) (b)+8 ; int max_v_samp_factor -%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width -%define input_data(b) (b)+16 ; JSAMPARRAY input_data -%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr +%define max_v_samp(b) (b)+8 ; int max_v_samp_factor +%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width +%define input_data(b) (b)+16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] -%define WK_NUM 4 -%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 4 +%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr - align 16 - global EXTN(jsimd_h2v2_fancy_upsample_mmx) + align 16 + global EXTN(jsimd_h2v2_fancy_upsample_mmx) EXTN(jsimd_h2v2_fancy_upsample_mmx): - push ebp - mov eax,esp ; eax = original ebp - sub esp, 
byte 4 - and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic eax ; make a room for GOT address - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address - - mov edx,eax ; edx = original ebp - mov eax, JDIMENSION [downsamp_width(edx)] ; colctr - test eax,eax - jz near .return - - mov ecx, INT [max_v_samp(edx)] ; rowctr - test ecx,ecx - jz near .return - - mov esi, JSAMPARRAY [input_data(edx)] ; input_data - mov edi, POINTER [output_data_ptr(edx)] - mov edi, JSAMPARRAY [edi] ; output_data - alignx 16,7 + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov edx,eax ; edx = original ebp + mov eax, JDIMENSION [downsamp_width(edx)] ; colctr + test eax,eax + jz near .return + + mov ecx, INT [max_v_samp(edx)] ; rowctr + test ecx,ecx + jz near .return + + mov esi, JSAMPARRAY [input_data(edx)] ; input_data + mov edi, POINTER [output_data_ptr(edx)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16,7 .rowloop: - push eax ; colctr - push ecx - push edi - push esi - - mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above) - mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 - mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below) - mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 - mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 - - test eax, SIZEOF_MMWORD-1 - jz short .skip - push edx - mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE] - mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl - mov dl, JSAMPLE 
[ebx+(eax-1)*SIZEOF_JSAMPLE] - mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl - mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] - mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample - pop edx + push eax ; colctr + push ecx + push edi + push esi + + mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above) + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 + mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below) + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 + mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 + + test eax, SIZEOF_MMWORD-1 + jz short .skip + push edx + mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample + pop edx .skip: - ; -- process the first column block + ; -- process the first column block - movq mm0, MMWORD [ebx+0*SIZEOF_MMWORD] ; mm0=row[ 0][0] - movq mm1, MMWORD [ecx+0*SIZEOF_MMWORD] ; mm1=row[-1][0] - movq mm2, MMWORD [esi+0*SIZEOF_MMWORD] ; mm2=row[+1][0] + movq mm0, MMWORD [ebx+0*SIZEOF_MMWORD] ; mm0=row[ 0][0] + movq mm1, MMWORD [ecx+0*SIZEOF_MMWORD] ; mm1=row[-1][0] + movq mm2, MMWORD [esi+0*SIZEOF_MMWORD] ; mm2=row[+1][0] - pushpic ebx - movpic ebx, POINTER [gotptr] ; load GOT address + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address - pxor mm3,mm3 ; mm3=(all 0's) - movq mm4,mm0 - punpcklbw mm0,mm3 ; mm0=row[ 0][0]( 0 1 2 3) - punpckhbw mm4,mm3 ; mm4=row[ 0][0]( 4 5 6 7) - movq mm5,mm1 - punpcklbw mm1,mm3 ; mm1=row[-1][0]( 0 1 2 3) - punpckhbw mm5,mm3 ; mm5=row[-1][0]( 4 5 6 7) - movq mm6,mm2 - punpcklbw mm2,mm3 ; mm2=row[+1][0]( 0 1 2 3) - punpckhbw mm6,mm3 ; mm6=row[+1][0]( 4 5 6 7) + pxor mm3,mm3 ; mm3=(all 0's) + movq mm4,mm0 + punpcklbw mm0,mm3 ; mm0=row[ 0][0]( 0 1 2 3) + punpckhbw mm4,mm3 ; mm4=row[ 0][0]( 4 5 6 7) + movq mm5,mm1 + punpcklbw mm1,mm3 ; 
mm1=row[-1][0]( 0 1 2 3) + punpckhbw mm5,mm3 ; mm5=row[-1][0]( 4 5 6 7) + movq mm6,mm2 + punpcklbw mm2,mm3 ; mm2=row[+1][0]( 0 1 2 3) + punpckhbw mm6,mm3 ; mm6=row[+1][0]( 4 5 6 7) - pmullw mm0,[GOTOFF(ebx,PW_THREE)] - pmullw mm4,[GOTOFF(ebx,PW_THREE)] + pmullw mm0,[GOTOFF(ebx,PW_THREE)] + pmullw mm4,[GOTOFF(ebx,PW_THREE)] - pcmpeqb mm7,mm7 - psrlq mm7,(SIZEOF_MMWORD-2)*BYTE_BIT + pcmpeqb mm7,mm7 + psrlq mm7,(SIZEOF_MMWORD-2)*BYTE_BIT - paddw mm1,mm0 ; mm1=Int0L=( 0 1 2 3) - paddw mm5,mm4 ; mm5=Int0H=( 4 5 6 7) - paddw mm2,mm0 ; mm2=Int1L=( 0 1 2 3) - paddw mm6,mm4 ; mm6=Int1H=( 4 5 6 7) + paddw mm1,mm0 ; mm1=Int0L=( 0 1 2 3) + paddw mm5,mm4 ; mm5=Int0H=( 4 5 6 7) + paddw mm2,mm0 ; mm2=Int1L=( 0 1 2 3) + paddw mm6,mm4 ; mm6=Int1H=( 4 5 6 7) - movq MMWORD [edx+0*SIZEOF_MMWORD], mm1 ; temporarily save - movq MMWORD [edx+1*SIZEOF_MMWORD], mm5 ; the intermediate data - movq MMWORD [edi+0*SIZEOF_MMWORD], mm2 - movq MMWORD [edi+1*SIZEOF_MMWORD], mm6 + movq MMWORD [edx+0*SIZEOF_MMWORD], mm1 ; temporarily save + movq MMWORD [edx+1*SIZEOF_MMWORD], mm5 ; the intermediate data + movq MMWORD [edi+0*SIZEOF_MMWORD], mm2 + movq MMWORD [edi+1*SIZEOF_MMWORD], mm6 - pand mm1,mm7 ; mm1=( 0 - - -) - pand mm2,mm7 ; mm2=( 0 - - -) + pand mm1,mm7 ; mm1=( 0 - - -) + pand mm2,mm7 ; mm2=( 0 - - -) - movq MMWORD [wk(0)], mm1 - movq MMWORD [wk(1)], mm2 + movq MMWORD [wk(0)], mm1 + movq MMWORD [wk(1)], mm2 - poppic ebx + poppic ebx - add eax, byte SIZEOF_MMWORD-1 - and eax, byte -SIZEOF_MMWORD - cmp eax, byte SIZEOF_MMWORD - ja short .columnloop - alignx 16,7 + add eax, byte SIZEOF_MMWORD-1 + and eax, byte -SIZEOF_MMWORD + cmp eax, byte SIZEOF_MMWORD + ja short .columnloop + alignx 16,7 .columnloop_last: - ; -- process the last column block + ; -- process the last column block - pushpic ebx - movpic ebx, POINTER [gotptr] ; load GOT address + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address - pcmpeqb mm1,mm1 - psllq mm1,(SIZEOF_MMWORD-2)*BYTE_BIT - movq mm2,mm1 + pcmpeqb mm1,mm1 + 
psllq mm1,(SIZEOF_MMWORD-2)*BYTE_BIT + movq mm2,mm1 - pand mm1, MMWORD [edx+1*SIZEOF_MMWORD] ; mm1=( - - - 7) - pand mm2, MMWORD [edi+1*SIZEOF_MMWORD] ; mm2=( - - - 7) + pand mm1, MMWORD [edx+1*SIZEOF_MMWORD] ; mm1=( - - - 7) + pand mm2, MMWORD [edi+1*SIZEOF_MMWORD] ; mm2=( - - - 7) - movq MMWORD [wk(2)], mm1 - movq MMWORD [wk(3)], mm2 + movq MMWORD [wk(2)], mm1 + movq MMWORD [wk(3)], mm2 - jmp short .upsample - alignx 16,7 + jmp short .upsample + alignx 16,7 .columnloop: - ; -- process the next column block + ; -- process the next column block - movq mm0, MMWORD [ebx+1*SIZEOF_MMWORD] ; mm0=row[ 0][1] - movq mm1, MMWORD [ecx+1*SIZEOF_MMWORD] ; mm1=row[-1][1] - movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] ; mm2=row[+1][1] + movq mm0, MMWORD [ebx+1*SIZEOF_MMWORD] ; mm0=row[ 0][1] + movq mm1, MMWORD [ecx+1*SIZEOF_MMWORD] ; mm1=row[-1][1] + movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] ; mm2=row[+1][1] - pushpic ebx - movpic ebx, POINTER [gotptr] ; load GOT address + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address - pxor mm3,mm3 ; mm3=(all 0's) - movq mm4,mm0 - punpcklbw mm0,mm3 ; mm0=row[ 0][1]( 0 1 2 3) - punpckhbw mm4,mm3 ; mm4=row[ 0][1]( 4 5 6 7) - movq mm5,mm1 - punpcklbw mm1,mm3 ; mm1=row[-1][1]( 0 1 2 3) - punpckhbw mm5,mm3 ; mm5=row[-1][1]( 4 5 6 7) - movq mm6,mm2 - punpcklbw mm2,mm3 ; mm2=row[+1][1]( 0 1 2 3) - punpckhbw mm6,mm3 ; mm6=row[+1][1]( 4 5 6 7) + pxor mm3,mm3 ; mm3=(all 0's) + movq mm4,mm0 + punpcklbw mm0,mm3 ; mm0=row[ 0][1]( 0 1 2 3) + punpckhbw mm4,mm3 ; mm4=row[ 0][1]( 4 5 6 7) + movq mm5,mm1 + punpcklbw mm1,mm3 ; mm1=row[-1][1]( 0 1 2 3) + punpckhbw mm5,mm3 ; mm5=row[-1][1]( 4 5 6 7) + movq mm6,mm2 + punpcklbw mm2,mm3 ; mm2=row[+1][1]( 0 1 2 3) + punpckhbw mm6,mm3 ; mm6=row[+1][1]( 4 5 6 7) - pmullw mm0,[GOTOFF(ebx,PW_THREE)] - pmullw mm4,[GOTOFF(ebx,PW_THREE)] + pmullw mm0,[GOTOFF(ebx,PW_THREE)] + pmullw mm4,[GOTOFF(ebx,PW_THREE)] - paddw mm1,mm0 ; mm1=Int0L=( 0 1 2 3) - paddw mm5,mm4 ; mm5=Int0H=( 4 5 6 7) - paddw mm2,mm0 ; mm2=Int1L=( 0 1 
2 3) - paddw mm6,mm4 ; mm6=Int1H=( 4 5 6 7) + paddw mm1,mm0 ; mm1=Int0L=( 0 1 2 3) + paddw mm5,mm4 ; mm5=Int0H=( 4 5 6 7) + paddw mm2,mm0 ; mm2=Int1L=( 0 1 2 3) + paddw mm6,mm4 ; mm6=Int1H=( 4 5 6 7) - movq MMWORD [edx+2*SIZEOF_MMWORD], mm1 ; temporarily save - movq MMWORD [edx+3*SIZEOF_MMWORD], mm5 ; the intermediate data - movq MMWORD [edi+2*SIZEOF_MMWORD], mm2 - movq MMWORD [edi+3*SIZEOF_MMWORD], mm6 + movq MMWORD [edx+2*SIZEOF_MMWORD], mm1 ; temporarily save + movq MMWORD [edx+3*SIZEOF_MMWORD], mm5 ; the intermediate data + movq MMWORD [edi+2*SIZEOF_MMWORD], mm2 + movq MMWORD [edi+3*SIZEOF_MMWORD], mm6 - psllq mm1,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm1=( - - - 0) - psllq mm2,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm2=( - - - 0) + psllq mm1,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm1=( - - - 0) + psllq mm2,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm2=( - - - 0) - movq MMWORD [wk(2)], mm1 - movq MMWORD [wk(3)], mm2 + movq MMWORD [wk(2)], mm1 + movq MMWORD [wk(3)], mm2 .upsample: - ; -- process the upper row - - movq mm7, MMWORD [edx+0*SIZEOF_MMWORD] ; mm7=Int0L=( 0 1 2 3) - movq mm3, MMWORD [edx+1*SIZEOF_MMWORD] ; mm3=Int0H=( 4 5 6 7) - - movq mm0,mm7 - movq mm4,mm3 - psrlq mm0,2*BYTE_BIT ; mm0=( 1 2 3 -) - psllq mm4,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( - - - 4) - movq mm5,mm7 - movq mm6,mm3 - psrlq mm5,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm5=( 3 - - -) - psllq mm6,2*BYTE_BIT ; mm6=( - 4 5 6) - - por mm0,mm4 ; mm0=( 1 2 3 4) - por mm5,mm6 ; mm5=( 3 4 5 6) - - movq mm1,mm7 - movq mm2,mm3 - psllq mm1,2*BYTE_BIT ; mm1=( - 0 1 2) - psrlq mm2,2*BYTE_BIT ; mm2=( 5 6 7 -) - movq mm4,mm3 - psrlq mm4,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( 7 - - -) - - por mm1, MMWORD [wk(0)] ; mm1=(-1 0 1 2) - por mm2, MMWORD [wk(2)] ; mm2=( 5 6 7 8) - - movq MMWORD [wk(0)], mm4 - - pmullw mm7,[GOTOFF(ebx,PW_THREE)] - pmullw mm3,[GOTOFF(ebx,PW_THREE)] - paddw mm1,[GOTOFF(ebx,PW_EIGHT)] - paddw mm5,[GOTOFF(ebx,PW_EIGHT)] - paddw mm0,[GOTOFF(ebx,PW_SEVEN)] - paddw mm2,[GOTOFF(ebx,PW_SEVEN)] - - paddw mm1,mm7 - paddw mm5,mm3 - psrlw mm1,4 ; 
mm1=Out0LE=( 0 2 4 6) - psrlw mm5,4 ; mm5=Out0HE=( 8 10 12 14) - paddw mm0,mm7 - paddw mm2,mm3 - psrlw mm0,4 ; mm0=Out0LO=( 1 3 5 7) - psrlw mm2,4 ; mm2=Out0HO=( 9 11 13 15) - - psllw mm0,BYTE_BIT - psllw mm2,BYTE_BIT - por mm1,mm0 ; mm1=Out0L=( 0 1 2 3 4 5 6 7) - por mm5,mm2 ; mm5=Out0H=( 8 9 10 11 12 13 14 15) - - movq MMWORD [edx+0*SIZEOF_MMWORD], mm1 - movq MMWORD [edx+1*SIZEOF_MMWORD], mm5 - - ; -- process the lower row - - movq mm6, MMWORD [edi+0*SIZEOF_MMWORD] ; mm6=Int1L=( 0 1 2 3) - movq mm4, MMWORD [edi+1*SIZEOF_MMWORD] ; mm4=Int1H=( 4 5 6 7) - - movq mm7,mm6 - movq mm3,mm4 - psrlq mm7,2*BYTE_BIT ; mm7=( 1 2 3 -) - psllq mm3,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( - - - 4) - movq mm0,mm6 - movq mm2,mm4 - psrlq mm0,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm0=( 3 - - -) - psllq mm2,2*BYTE_BIT ; mm2=( - 4 5 6) - - por mm7,mm3 ; mm7=( 1 2 3 4) - por mm0,mm2 ; mm0=( 3 4 5 6) - - movq mm1,mm6 - movq mm5,mm4 - psllq mm1,2*BYTE_BIT ; mm1=( - 0 1 2) - psrlq mm5,2*BYTE_BIT ; mm5=( 5 6 7 -) - movq mm3,mm4 - psrlq mm3,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( 7 - - -) - - por mm1, MMWORD [wk(1)] ; mm1=(-1 0 1 2) - por mm5, MMWORD [wk(3)] ; mm5=( 5 6 7 8) - - movq MMWORD [wk(1)], mm3 - - pmullw mm6,[GOTOFF(ebx,PW_THREE)] - pmullw mm4,[GOTOFF(ebx,PW_THREE)] - paddw mm1,[GOTOFF(ebx,PW_EIGHT)] - paddw mm0,[GOTOFF(ebx,PW_EIGHT)] - paddw mm7,[GOTOFF(ebx,PW_SEVEN)] - paddw mm5,[GOTOFF(ebx,PW_SEVEN)] - - paddw mm1,mm6 - paddw mm0,mm4 - psrlw mm1,4 ; mm1=Out1LE=( 0 2 4 6) - psrlw mm0,4 ; mm0=Out1HE=( 8 10 12 14) - paddw mm7,mm6 - paddw mm5,mm4 - psrlw mm7,4 ; mm7=Out1LO=( 1 3 5 7) - psrlw mm5,4 ; mm5=Out1HO=( 9 11 13 15) - - psllw mm7,BYTE_BIT - psllw mm5,BYTE_BIT - por mm1,mm7 ; mm1=Out1L=( 0 1 2 3 4 5 6 7) - por mm0,mm5 ; mm0=Out1H=( 8 9 10 11 12 13 14 15) - - movq MMWORD [edi+0*SIZEOF_MMWORD], mm1 - movq MMWORD [edi+1*SIZEOF_MMWORD], mm0 - - poppic ebx - - sub eax, byte SIZEOF_MMWORD - add ecx, byte 1*SIZEOF_MMWORD ; inptr1(above) - add ebx, byte 1*SIZEOF_MMWORD ; inptr0 - add esi, byte 
1*SIZEOF_MMWORD ; inptr1(below) - add edx, byte 2*SIZEOF_MMWORD ; outptr0 - add edi, byte 2*SIZEOF_MMWORD ; outptr1 - cmp eax, byte SIZEOF_MMWORD - ja near .columnloop - test eax,eax - jnz near .columnloop_last - - pop esi - pop edi - pop ecx - pop eax - - add esi, byte 1*SIZEOF_JSAMPROW ; input_data - add edi, byte 2*SIZEOF_JSAMPROW ; output_data - sub ecx, byte 2 ; rowctr - jg near .rowloop - - emms ; empty MMX state + ; -- process the upper row + + movq mm7, MMWORD [edx+0*SIZEOF_MMWORD] ; mm7=Int0L=( 0 1 2 3) + movq mm3, MMWORD [edx+1*SIZEOF_MMWORD] ; mm3=Int0H=( 4 5 6 7) + + movq mm0,mm7 + movq mm4,mm3 + psrlq mm0,2*BYTE_BIT ; mm0=( 1 2 3 -) + psllq mm4,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( - - - 4) + movq mm5,mm7 + movq mm6,mm3 + psrlq mm5,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm5=( 3 - - -) + psllq mm6,2*BYTE_BIT ; mm6=( - 4 5 6) + + por mm0,mm4 ; mm0=( 1 2 3 4) + por mm5,mm6 ; mm5=( 3 4 5 6) + + movq mm1,mm7 + movq mm2,mm3 + psllq mm1,2*BYTE_BIT ; mm1=( - 0 1 2) + psrlq mm2,2*BYTE_BIT ; mm2=( 5 6 7 -) + movq mm4,mm3 + psrlq mm4,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( 7 - - -) + + por mm1, MMWORD [wk(0)] ; mm1=(-1 0 1 2) + por mm2, MMWORD [wk(2)] ; mm2=( 5 6 7 8) + + movq MMWORD [wk(0)], mm4 + + pmullw mm7,[GOTOFF(ebx,PW_THREE)] + pmullw mm3,[GOTOFF(ebx,PW_THREE)] + paddw mm1,[GOTOFF(ebx,PW_EIGHT)] + paddw mm5,[GOTOFF(ebx,PW_EIGHT)] + paddw mm0,[GOTOFF(ebx,PW_SEVEN)] + paddw mm2,[GOTOFF(ebx,PW_SEVEN)] + + paddw mm1,mm7 + paddw mm5,mm3 + psrlw mm1,4 ; mm1=Out0LE=( 0 2 4 6) + psrlw mm5,4 ; mm5=Out0HE=( 8 10 12 14) + paddw mm0,mm7 + paddw mm2,mm3 + psrlw mm0,4 ; mm0=Out0LO=( 1 3 5 7) + psrlw mm2,4 ; mm2=Out0HO=( 9 11 13 15) + + psllw mm0,BYTE_BIT + psllw mm2,BYTE_BIT + por mm1,mm0 ; mm1=Out0L=( 0 1 2 3 4 5 6 7) + por mm5,mm2 ; mm5=Out0H=( 8 9 10 11 12 13 14 15) + + movq MMWORD [edx+0*SIZEOF_MMWORD], mm1 + movq MMWORD [edx+1*SIZEOF_MMWORD], mm5 + + ; -- process the lower row + + movq mm6, MMWORD [edi+0*SIZEOF_MMWORD] ; mm6=Int1L=( 0 1 2 3) + movq mm4, MMWORD [edi+1*SIZEOF_MMWORD] 
; mm4=Int1H=( 4 5 6 7) + + movq mm7,mm6 + movq mm3,mm4 + psrlq mm7,2*BYTE_BIT ; mm7=( 1 2 3 -) + psllq mm3,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( - - - 4) + movq mm0,mm6 + movq mm2,mm4 + psrlq mm0,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm0=( 3 - - -) + psllq mm2,2*BYTE_BIT ; mm2=( - 4 5 6) + + por mm7,mm3 ; mm7=( 1 2 3 4) + por mm0,mm2 ; mm0=( 3 4 5 6) + + movq mm1,mm6 + movq mm5,mm4 + psllq mm1,2*BYTE_BIT ; mm1=( - 0 1 2) + psrlq mm5,2*BYTE_BIT ; mm5=( 5 6 7 -) + movq mm3,mm4 + psrlq mm3,(SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( 7 - - -) + + por mm1, MMWORD [wk(1)] ; mm1=(-1 0 1 2) + por mm5, MMWORD [wk(3)] ; mm5=( 5 6 7 8) + + movq MMWORD [wk(1)], mm3 + + pmullw mm6,[GOTOFF(ebx,PW_THREE)] + pmullw mm4,[GOTOFF(ebx,PW_THREE)] + paddw mm1,[GOTOFF(ebx,PW_EIGHT)] + paddw mm0,[GOTOFF(ebx,PW_EIGHT)] + paddw mm7,[GOTOFF(ebx,PW_SEVEN)] + paddw mm5,[GOTOFF(ebx,PW_SEVEN)] + + paddw mm1,mm6 + paddw mm0,mm4 + psrlw mm1,4 ; mm1=Out1LE=( 0 2 4 6) + psrlw mm0,4 ; mm0=Out1HE=( 8 10 12 14) + paddw mm7,mm6 + paddw mm5,mm4 + psrlw mm7,4 ; mm7=Out1LO=( 1 3 5 7) + psrlw mm5,4 ; mm5=Out1HO=( 9 11 13 15) + + psllw mm7,BYTE_BIT + psllw mm5,BYTE_BIT + por mm1,mm7 ; mm1=Out1L=( 0 1 2 3 4 5 6 7) + por mm0,mm5 ; mm0=Out1H=( 8 9 10 11 12 13 14 15) + + movq MMWORD [edi+0*SIZEOF_MMWORD], mm1 + movq MMWORD [edi+1*SIZEOF_MMWORD], mm0 + + poppic ebx + + sub eax, byte SIZEOF_MMWORD + add ecx, byte 1*SIZEOF_MMWORD ; inptr1(above) + add ebx, byte 1*SIZEOF_MMWORD ; inptr0 + add esi, byte 1*SIZEOF_MMWORD ; inptr1(below) + add edx, byte 2*SIZEOF_MMWORD ; outptr0 + add edi, byte 2*SIZEOF_MMWORD ; outptr1 + cmp eax, byte SIZEOF_MMWORD + ja near .columnloop + test eax,eax + jnz near .columnloop_last + + pop esi + pop edi + pop ecx + pop eax + + add esi, byte 1*SIZEOF_JSAMPROW ; input_data + add edi, byte 2*SIZEOF_JSAMPROW ; output_data + sub ecx, byte 2 ; rowctr + jg near .rowloop + + emms ; empty MMX state .return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - mov 
esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret ; -------------------------------------------------------------------------- ; @@ -536,94 +536,94 @@ EXTN(jsimd_h2v2_fancy_upsample_mmx): ; JSAMPARRAY * output_data_ptr); ; -%define max_v_samp(b) (b)+8 ; int max_v_samp_factor -%define output_width(b) (b)+12 ; JDIMENSION output_width -%define input_data(b) (b)+16 ; JSAMPARRAY input_data -%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr +%define max_v_samp(b) (b)+8 ; int max_v_samp_factor +%define output_width(b) (b)+12 ; JDIMENSION output_width +%define input_data(b) (b)+16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr - align 16 - global EXTN(jsimd_h2v1_upsample_mmx) + align 16 + global EXTN(jsimd_h2v1_upsample_mmx) EXTN(jsimd_h2v1_upsample_mmx): - push ebp - mov ebp,esp -; push ebx ; unused -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - mov edx, JDIMENSION [output_width(ebp)] - add edx, byte (2*SIZEOF_MMWORD)-1 - and edx, byte -(2*SIZEOF_MMWORD) - jz short .return - - mov ecx, INT [max_v_samp(ebp)] ; rowctr - test ecx,ecx - jz short .return - - mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - mov edi, POINTER [output_data_ptr(ebp)] - mov edi, JSAMPARRAY [edi] ; output_data - alignx 16,7 + push ebp + mov ebp,esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov edx, JDIMENSION [output_width(ebp)] + add edx, byte (2*SIZEOF_MMWORD)-1 + and edx, byte -(2*SIZEOF_MMWORD) + jz short .return + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx,ecx + jz short .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, 
JSAMPARRAY [edi] ; output_data + alignx 16,7 .rowloop: - push edi - push esi + push edi + push esi - mov esi, JSAMPROW [esi] ; inptr - mov edi, JSAMPROW [edi] ; outptr - mov eax,edx ; colctr - alignx 16,7 + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr + mov eax,edx ; colctr + alignx 16,7 .columnloop: - movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] + movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] - movq mm1,mm0 - punpcklbw mm0,mm0 - punpckhbw mm1,mm1 + movq mm1,mm0 + punpcklbw mm0,mm0 + punpckhbw mm1,mm1 - movq MMWORD [edi+0*SIZEOF_MMWORD], mm0 - movq MMWORD [edi+1*SIZEOF_MMWORD], mm1 + movq MMWORD [edi+0*SIZEOF_MMWORD], mm0 + movq MMWORD [edi+1*SIZEOF_MMWORD], mm1 - sub eax, byte 2*SIZEOF_MMWORD - jz short .nextrow + sub eax, byte 2*SIZEOF_MMWORD + jz short .nextrow - movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] + movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] - movq mm3,mm2 - punpcklbw mm2,mm2 - punpckhbw mm3,mm3 + movq mm3,mm2 + punpcklbw mm2,mm2 + punpckhbw mm3,mm3 - movq MMWORD [edi+2*SIZEOF_MMWORD], mm2 - movq MMWORD [edi+3*SIZEOF_MMWORD], mm3 + movq MMWORD [edi+2*SIZEOF_MMWORD], mm2 + movq MMWORD [edi+3*SIZEOF_MMWORD], mm3 - sub eax, byte 2*SIZEOF_MMWORD - jz short .nextrow + sub eax, byte 2*SIZEOF_MMWORD + jz short .nextrow - add esi, byte 2*SIZEOF_MMWORD ; inptr - add edi, byte 4*SIZEOF_MMWORD ; outptr - jmp short .columnloop - alignx 16,7 + add esi, byte 2*SIZEOF_MMWORD ; inptr + add edi, byte 4*SIZEOF_MMWORD ; outptr + jmp short .columnloop + alignx 16,7 .nextrow: - pop esi - pop edi + pop esi + pop edi - add esi, byte SIZEOF_JSAMPROW ; input_data - add edi, byte SIZEOF_JSAMPROW ; output_data - dec ecx ; rowctr - jg short .rowloop + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec ecx ; rowctr + jg short .rowloop - emms ; empty MMX state + emms ; empty MMX state .return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved -; pop ebx ; unused - pop ebp - ret + pop edi + pop esi +; 
pop edx ; need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret ; -------------------------------------------------------------------------- ; @@ -637,101 +637,101 @@ EXTN(jsimd_h2v1_upsample_mmx): ; JSAMPARRAY * output_data_ptr); ; -%define max_v_samp(b) (b)+8 ; int max_v_samp_factor -%define output_width(b) (b)+12 ; JDIMENSION output_width -%define input_data(b) (b)+16 ; JSAMPARRAY input_data -%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr +%define max_v_samp(b) (b)+8 ; int max_v_samp_factor +%define output_width(b) (b)+12 ; JDIMENSION output_width +%define input_data(b) (b)+16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr - align 16 - global EXTN(jsimd_h2v2_upsample_mmx) + align 16 + global EXTN(jsimd_h2v2_upsample_mmx) EXTN(jsimd_h2v2_upsample_mmx): - push ebp - mov ebp,esp - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - mov edx, JDIMENSION [output_width(ebp)] - add edx, byte (2*SIZEOF_MMWORD)-1 - and edx, byte -(2*SIZEOF_MMWORD) - jz near .return - - mov ecx, INT [max_v_samp(ebp)] ; rowctr - test ecx,ecx - jz short .return - - mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - mov edi, POINTER [output_data_ptr(ebp)] - mov edi, JSAMPARRAY [edi] ; output_data - alignx 16,7 + push ebp + mov ebp,esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov edx, JDIMENSION [output_width(ebp)] + add edx, byte (2*SIZEOF_MMWORD)-1 + and edx, byte -(2*SIZEOF_MMWORD) + jz near .return + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx,ecx + jz short .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16,7 .rowloop: - push edi - push esi - - mov esi, JSAMPROW [esi] ; inptr - mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 - mov edi, JSAMPROW 
[edi+1*SIZEOF_JSAMPROW] ; outptr1 - mov eax,edx ; colctr - alignx 16,7 + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 + mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 + mov eax,edx ; colctr + alignx 16,7 .columnloop: - movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] + movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] - movq mm1,mm0 - punpcklbw mm0,mm0 - punpckhbw mm1,mm1 + movq mm1,mm0 + punpcklbw mm0,mm0 + punpckhbw mm1,mm1 - movq MMWORD [ebx+0*SIZEOF_MMWORD], mm0 - movq MMWORD [ebx+1*SIZEOF_MMWORD], mm1 - movq MMWORD [edi+0*SIZEOF_MMWORD], mm0 - movq MMWORD [edi+1*SIZEOF_MMWORD], mm1 + movq MMWORD [ebx+0*SIZEOF_MMWORD], mm0 + movq MMWORD [ebx+1*SIZEOF_MMWORD], mm1 + movq MMWORD [edi+0*SIZEOF_MMWORD], mm0 + movq MMWORD [edi+1*SIZEOF_MMWORD], mm1 - sub eax, byte 2*SIZEOF_MMWORD - jz short .nextrow + sub eax, byte 2*SIZEOF_MMWORD + jz short .nextrow - movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] + movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] - movq mm3,mm2 - punpcklbw mm2,mm2 - punpckhbw mm3,mm3 + movq mm3,mm2 + punpcklbw mm2,mm2 + punpckhbw mm3,mm3 - movq MMWORD [ebx+2*SIZEOF_MMWORD], mm2 - movq MMWORD [ebx+3*SIZEOF_MMWORD], mm3 - movq MMWORD [edi+2*SIZEOF_MMWORD], mm2 - movq MMWORD [edi+3*SIZEOF_MMWORD], mm3 + movq MMWORD [ebx+2*SIZEOF_MMWORD], mm2 + movq MMWORD [ebx+3*SIZEOF_MMWORD], mm3 + movq MMWORD [edi+2*SIZEOF_MMWORD], mm2 + movq MMWORD [edi+3*SIZEOF_MMWORD], mm3 - sub eax, byte 2*SIZEOF_MMWORD - jz short .nextrow + sub eax, byte 2*SIZEOF_MMWORD + jz short .nextrow - add esi, byte 2*SIZEOF_MMWORD ; inptr - add ebx, byte 4*SIZEOF_MMWORD ; outptr0 - add edi, byte 4*SIZEOF_MMWORD ; outptr1 - jmp short .columnloop - alignx 16,7 + add esi, byte 2*SIZEOF_MMWORD ; inptr + add ebx, byte 4*SIZEOF_MMWORD ; outptr0 + add edi, byte 4*SIZEOF_MMWORD ; outptr1 + jmp short .columnloop + alignx 16,7 .nextrow: - pop esi - pop edi + pop esi + pop edi - add esi, byte 1*SIZEOF_JSAMPROW ; input_data - add edi, byte 2*SIZEOF_JSAMPROW ; 
output_data - sub ecx, byte 2 ; rowctr - jg short .rowloop + add esi, byte 1*SIZEOF_JSAMPROW ; input_data + add edi, byte 2*SIZEOF_JSAMPROW ; output_data + sub ecx, byte 2 ; rowctr + jg short .rowloop - emms ; empty MMX state + emms ; empty MMX state .return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - pop ebp - ret + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/jdsamss2-64.asm b/simd/jdsamss2-64.asm index f36c15622..a41d05934 100644 --- a/simd/jdsamss2-64.asm +++ b/simd/jdsamss2-64.asm @@ -20,24 +20,24 @@ %include "jsimdext.inc" ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_fancy_upsample_sse2) + alignz 16 + global EXTN(jconst_fancy_upsample_sse2) EXTN(jconst_fancy_upsample_sse2): -PW_ONE times 8 dw 1 -PW_TWO times 8 dw 2 -PW_THREE times 8 dw 3 -PW_SEVEN times 8 dw 7 -PW_EIGHT times 8 dw 8 +PW_ONE times 8 dw 1 +PW_TWO times 8 dw 2 +PW_THREE times 8 dw 3 +PW_SEVEN times 8 dw 7 +PW_EIGHT times 8 dw 8 - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 64 + SECTION SEG_TEXT + BITS 64 ; ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical. 
; @@ -58,127 +58,127 @@ PW_EIGHT times 8 dw 8 ; r12 = JSAMPARRAY input_data ; r13 = JSAMPARRAY * output_data_ptr - align 16 - global EXTN(jsimd_h2v1_fancy_upsample_sse2) + align 16 + global EXTN(jsimd_h2v1_fancy_upsample_sse2) EXTN(jsimd_h2v1_fancy_upsample_sse2): - push rbp - mov rax,rsp - mov rbp,rsp - collect_args - - mov rax, r11 ; colctr - test rax,rax - jz near .return - - mov rcx, r10 ; rowctr - test rcx,rcx - jz near .return - - mov rsi, r12 ; input_data - mov rdi, r13 - mov rdi, JSAMPARRAY [rdi] ; output_data + push rbp + mov rax,rsp + mov rbp,rsp + collect_args + + mov rax, r11 ; colctr + test rax,rax + jz near .return + + mov rcx, r10 ; rowctr + test rcx,rcx + jz near .return + + mov rsi, r12 ; input_data + mov rdi, r13 + mov rdi, JSAMPARRAY [rdi] ; output_data .rowloop: - push rax ; colctr - push rdi - push rsi + push rax ; colctr + push rdi + push rsi - mov rsi, JSAMPROW [rsi] ; inptr - mov rdi, JSAMPROW [rdi] ; outptr + mov rsi, JSAMPROW [rsi] ; inptr + mov rdi, JSAMPROW [rdi] ; outptr - test rax, SIZEOF_XMMWORD-1 - jz short .skip - mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE] - mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample + test rax, SIZEOF_XMMWORD-1 + jz short .skip + mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample .skip: - pxor xmm0,xmm0 ; xmm0=(all 0's) - pcmpeqb xmm7,xmm7 - psrldq xmm7,(SIZEOF_XMMWORD-1) - pand xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD] + pxor xmm0,xmm0 ; xmm0=(all 0's) + pcmpeqb xmm7,xmm7 + psrldq xmm7,(SIZEOF_XMMWORD-1) + pand xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD] - add rax, byte SIZEOF_XMMWORD-1 - and rax, byte -SIZEOF_XMMWORD - cmp rax, byte SIZEOF_XMMWORD - ja short .columnloop + add rax, byte SIZEOF_XMMWORD-1 + and rax, byte -SIZEOF_XMMWORD + cmp rax, byte SIZEOF_XMMWORD + ja short .columnloop .columnloop_last: - pcmpeqb xmm6,xmm6 - pslldq xmm6,(SIZEOF_XMMWORD-1) - pand xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD] - jmp short .upsample + pcmpeqb xmm6,xmm6 
+ pslldq xmm6,(SIZEOF_XMMWORD-1) + pand xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD] + jmp short .upsample .columnloop: - movdqa xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD] - pslldq xmm6,(SIZEOF_XMMWORD-1) + movdqa xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD] + pslldq xmm6,(SIZEOF_XMMWORD-1) .upsample: - movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD] - movdqa xmm2,xmm1 - movdqa xmm3,xmm1 ; xmm1=( 0 1 2 ... 13 14 15) - pslldq xmm2,1 ; xmm2=(-- 0 1 ... 12 13 14) - psrldq xmm3,1 ; xmm3=( 1 2 3 ... 14 15 --) - - por xmm2,xmm7 ; xmm2=(-1 0 1 ... 12 13 14) - por xmm3,xmm6 ; xmm3=( 1 2 3 ... 14 15 16) - - movdqa xmm7,xmm1 - psrldq xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --) - - movdqa xmm4,xmm1 - punpcklbw xmm1,xmm0 ; xmm1=( 0 1 2 3 4 5 6 7) - punpckhbw xmm4,xmm0 ; xmm4=( 8 9 10 11 12 13 14 15) - movdqa xmm5,xmm2 - punpcklbw xmm2,xmm0 ; xmm2=(-1 0 1 2 3 4 5 6) - punpckhbw xmm5,xmm0 ; xmm5=( 7 8 9 10 11 12 13 14) - movdqa xmm6,xmm3 - punpcklbw xmm3,xmm0 ; xmm3=( 1 2 3 4 5 6 7 8) - punpckhbw xmm6,xmm0 ; xmm6=( 9 10 11 12 13 14 15 16) - - pmullw xmm1,[rel PW_THREE] - pmullw xmm4,[rel PW_THREE] - paddw xmm2,[rel PW_ONE] - paddw xmm5,[rel PW_ONE] - paddw xmm3,[rel PW_TWO] - paddw xmm6,[rel PW_TWO] - - paddw xmm2,xmm1 - paddw xmm5,xmm4 - psrlw xmm2,2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14) - psrlw xmm5,2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30) - paddw xmm3,xmm1 - paddw xmm6,xmm4 - psrlw xmm3,2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15) - psrlw xmm6,2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31) - - psllw xmm3,BYTE_BIT - psllw xmm6,BYTE_BIT - por xmm2,xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15) - por xmm5,xmm6 ; xmm5=OutH=(16 17 18 ... 
29 30 31) - - movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2 - movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5 - - sub rax, byte SIZEOF_XMMWORD - add rsi, byte 1*SIZEOF_XMMWORD ; inptr - add rdi, byte 2*SIZEOF_XMMWORD ; outptr - cmp rax, byte SIZEOF_XMMWORD - ja near .columnloop - test eax,eax - jnz near .columnloop_last - - pop rsi - pop rdi - pop rax - - add rsi, byte SIZEOF_JSAMPROW ; input_data - add rdi, byte SIZEOF_JSAMPROW ; output_data - dec rcx ; rowctr - jg near .rowloop + movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqa xmm2,xmm1 + movdqa xmm3,xmm1 ; xmm1=( 0 1 2 ... 13 14 15) + pslldq xmm2,1 ; xmm2=(-- 0 1 ... 12 13 14) + psrldq xmm3,1 ; xmm3=( 1 2 3 ... 14 15 --) + + por xmm2,xmm7 ; xmm2=(-1 0 1 ... 12 13 14) + por xmm3,xmm6 ; xmm3=( 1 2 3 ... 14 15 16) + + movdqa xmm7,xmm1 + psrldq xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --) + + movdqa xmm4,xmm1 + punpcklbw xmm1,xmm0 ; xmm1=( 0 1 2 3 4 5 6 7) + punpckhbw xmm4,xmm0 ; xmm4=( 8 9 10 11 12 13 14 15) + movdqa xmm5,xmm2 + punpcklbw xmm2,xmm0 ; xmm2=(-1 0 1 2 3 4 5 6) + punpckhbw xmm5,xmm0 ; xmm5=( 7 8 9 10 11 12 13 14) + movdqa xmm6,xmm3 + punpcklbw xmm3,xmm0 ; xmm3=( 1 2 3 4 5 6 7 8) + punpckhbw xmm6,xmm0 ; xmm6=( 9 10 11 12 13 14 15 16) + + pmullw xmm1,[rel PW_THREE] + pmullw xmm4,[rel PW_THREE] + paddw xmm2,[rel PW_ONE] + paddw xmm5,[rel PW_ONE] + paddw xmm3,[rel PW_TWO] + paddw xmm6,[rel PW_TWO] + + paddw xmm2,xmm1 + paddw xmm5,xmm4 + psrlw xmm2,2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14) + psrlw xmm5,2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30) + paddw xmm3,xmm1 + paddw xmm6,xmm4 + psrlw xmm3,2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15) + psrlw xmm6,2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31) + + psllw xmm3,BYTE_BIT + psllw xmm6,BYTE_BIT + por xmm2,xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15) + por xmm5,xmm6 ; xmm5=OutH=(16 17 18 ... 
29 30 31) + + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5 + + sub rax, byte SIZEOF_XMMWORD + add rsi, byte 1*SIZEOF_XMMWORD ; inptr + add rdi, byte 2*SIZEOF_XMMWORD ; outptr + cmp rax, byte SIZEOF_XMMWORD + ja near .columnloop + test eax,eax + jnz near .columnloop_last + + pop rsi + pop rdi + pop rax + + add rsi, byte SIZEOF_JSAMPROW ; input_data + add rdi, byte SIZEOF_JSAMPROW ; output_data + dec rcx ; rowctr + jg near .rowloop .return: - uncollect_args - pop rbp - ret + uncollect_args + pop rbp + ret ; -------------------------------------------------------------------------- ; @@ -197,288 +197,288 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2): ; r12 = JSAMPARRAY input_data ; r13 = JSAMPARRAY * output_data_ptr -%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 4 +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 4 - align 16 - global EXTN(jsimd_h2v2_fancy_upsample_sse2) + align 16 + global EXTN(jsimd_h2v2_fancy_upsample_sse2) EXTN(jsimd_h2v2_fancy_upsample_sse2): - push rbp - mov rax,rsp ; rax = original rbp - sub rsp, byte 4 - and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp],rax - mov rbp,rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - collect_args - push rbx - - mov rax, r11 ; colctr - test rax,rax - jz near .return - - mov rcx, r10 ; rowctr - test rcx,rcx - jz near .return - - mov rsi, r12 ; input_data - mov rdi, r13 - mov rdi, JSAMPARRAY [rdi] ; output_data + push rbp + mov rax,rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args + push rbx + + mov rax, r11 ; colctr + test rax,rax + jz near .return + + mov rcx, r10 ; rowctr + test rcx,rcx + jz near .return + + mov rsi, r12 ; input_data + mov rdi, r13 + mov rdi, JSAMPARRAY [rdi] ; output_data .rowloop: - push rax ; colctr - push rcx - push rdi - push rsi - - mov rcx, 
JSAMPROW [rsi-1*SIZEOF_JSAMPROW] ; inptr1(above) - mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0 - mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1(below) - mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0 - mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1 - - test rax, SIZEOF_XMMWORD-1 - jz short .skip - push rdx - mov dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE] - mov JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl - mov dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE] - mov JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl - mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE] - mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample - pop rdx + push rax ; colctr + push rcx + push rdi + push rsi + + mov rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW] ; inptr1(above) + mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0 + mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1(below) + mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0 + mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1 + + test rax, SIZEOF_XMMWORD-1 + jz short .skip + push rdx + mov dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample + pop rdx .skip: - ; -- process the first column block - - movdqa xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0] - movdqa xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0] - movdqa xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0] - - pxor xmm3,xmm3 ; xmm3=(all 0's) - movdqa xmm4,xmm0 - punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) - punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) - movdqa xmm5,xmm1 - punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) - punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) - movdqa xmm6,xmm2 - punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) - punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 
12 13 14 15) - - pmullw xmm0,[rel PW_THREE] - pmullw xmm4,[rel PW_THREE] - - pcmpeqb xmm7,xmm7 - psrldq xmm7,(SIZEOF_XMMWORD-2) - - paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) - paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) - paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) - paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) - - movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save - movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data - movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2 - movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6 - - pand xmm1,xmm7 ; xmm1=( 0 -- -- -- -- -- -- --) - pand xmm2,xmm7 ; xmm2=( 0 -- -- -- -- -- -- --) - - movdqa XMMWORD [wk(0)], xmm1 - movdqa XMMWORD [wk(1)], xmm2 - - add rax, byte SIZEOF_XMMWORD-1 - and rax, byte -SIZEOF_XMMWORD - cmp rax, byte SIZEOF_XMMWORD - ja short .columnloop + ; -- process the first column block + + movdqa xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0] + movdqa xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0] + movdqa xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0] + + pxor xmm3,xmm3 ; xmm3=(all 0's) + movdqa xmm4,xmm0 + punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) + punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) + movdqa xmm5,xmm1 + punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) + movdqa xmm6,xmm2 + punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) + + pmullw xmm0,[rel PW_THREE] + pmullw xmm4,[rel PW_THREE] + + pcmpeqb xmm7,xmm7 + psrldq xmm7,(SIZEOF_XMMWORD-2) + + paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) + paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) + paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) + paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) + + movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save + movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data + 
movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6 + + pand xmm1,xmm7 ; xmm1=( 0 -- -- -- -- -- -- --) + pand xmm2,xmm7 ; xmm2=( 0 -- -- -- -- -- -- --) + + movdqa XMMWORD [wk(0)], xmm1 + movdqa XMMWORD [wk(1)], xmm2 + + add rax, byte SIZEOF_XMMWORD-1 + and rax, byte -SIZEOF_XMMWORD + cmp rax, byte SIZEOF_XMMWORD + ja short .columnloop .columnloop_last: - ; -- process the last column block + ; -- process the last column block - pcmpeqb xmm1,xmm1 - pslldq xmm1,(SIZEOF_XMMWORD-2) - movdqa xmm2,xmm1 + pcmpeqb xmm1,xmm1 + pslldq xmm1,(SIZEOF_XMMWORD-2) + movdqa xmm2,xmm1 - pand xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD] - pand xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD] + pand xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD] + pand xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD] - movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15) - movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15) + movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15) + movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15) - jmp near .upsample + jmp near .upsample .columnloop: - ; -- process the next column block - - movdqa xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1] - movdqa xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1] - movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1] - - pxor xmm3,xmm3 ; xmm3=(all 0's) - movdqa xmm4,xmm0 - punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) - punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) - movdqa xmm5,xmm1 - punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) - punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) - movdqa xmm6,xmm2 - punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) - punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) - - pmullw xmm0,[rel PW_THREE] - pmullw xmm4,[rel PW_THREE] - - paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) - paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) - paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) - paddw 
xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) - - movdqa XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save - movdqa XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data - movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2 - movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6 - - pslldq xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0) - pslldq xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0) - - movdqa XMMWORD [wk(2)], xmm1 - movdqa XMMWORD [wk(3)], xmm2 + ; -- process the next column block + + movdqa xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1] + movdqa xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1] + movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1] + + pxor xmm3,xmm3 ; xmm3=(all 0's) + movdqa xmm4,xmm0 + punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) + punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) + movdqa xmm5,xmm1 + punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) + movdqa xmm6,xmm2 + punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) + + pmullw xmm0,[rel PW_THREE] + pmullw xmm4,[rel PW_THREE] + + paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) + paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) + paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) + paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) + + movdqa XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save + movdqa XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data + movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6 + + pslldq xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0) + pslldq xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0) + + movdqa XMMWORD [wk(2)], xmm1 + movdqa XMMWORD [wk(3)], xmm2 .upsample: - ; -- process the upper row - - movdqa xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD] - movdqa xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD] - - movdqa xmm0,xmm7 
; xmm7=Int0L=( 0 1 2 3 4 5 6 7) - movdqa xmm4,xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15) - psrldq xmm0,2 ; xmm0=( 1 2 3 4 5 6 7 --) - pslldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8) - movdqa xmm5,xmm7 - movdqa xmm6,xmm3 - psrldq xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --) - pslldq xmm6,2 ; xmm6=(-- 8 9 10 11 12 13 14) - - por xmm0,xmm4 ; xmm0=( 1 2 3 4 5 6 7 8) - por xmm5,xmm6 ; xmm5=( 7 8 9 10 11 12 13 14) - - movdqa xmm1,xmm7 - movdqa xmm2,xmm3 - pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6) - psrldq xmm2,2 ; xmm2=( 9 10 11 12 13 14 15 --) - movdqa xmm4,xmm3 - psrldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --) - - por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6) - por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16) - - movdqa XMMWORD [wk(0)], xmm4 - - pmullw xmm7,[rel PW_THREE] - pmullw xmm3,[rel PW_THREE] - paddw xmm1,[rel PW_EIGHT] - paddw xmm5,[rel PW_EIGHT] - paddw xmm0,[rel PW_SEVEN] - paddw xmm2,[rel PW_SEVEN] - - paddw xmm1,xmm7 - paddw xmm5,xmm3 - psrlw xmm1,4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14) - psrlw xmm5,4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30) - paddw xmm0,xmm7 - paddw xmm2,xmm3 - psrlw xmm0,4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15) - psrlw xmm2,4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31) - - psllw xmm0,BYTE_BIT - psllw xmm2,BYTE_BIT - por xmm1,xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15) - por xmm5,xmm2 ; xmm5=Out0H=(16 17 18 ... 
29 30 31) - - movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1 - movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5 - - ; -- process the lower row - - movdqa xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD] - movdqa xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD] - - movdqa xmm7,xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7) - movdqa xmm3,xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15) - psrldq xmm7,2 ; xmm7=( 1 2 3 4 5 6 7 --) - pslldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8) - movdqa xmm0,xmm6 - movdqa xmm2,xmm4 - psrldq xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --) - pslldq xmm2,2 ; xmm2=(-- 8 9 10 11 12 13 14) - - por xmm7,xmm3 ; xmm7=( 1 2 3 4 5 6 7 8) - por xmm0,xmm2 ; xmm0=( 7 8 9 10 11 12 13 14) - - movdqa xmm1,xmm6 - movdqa xmm5,xmm4 - pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6) - psrldq xmm5,2 ; xmm5=( 9 10 11 12 13 14 15 --) - movdqa xmm3,xmm4 - psrldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --) - - por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6) - por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16) - - movdqa XMMWORD [wk(1)], xmm3 - - pmullw xmm6,[rel PW_THREE] - pmullw xmm4,[rel PW_THREE] - paddw xmm1,[rel PW_EIGHT] - paddw xmm0,[rel PW_EIGHT] - paddw xmm7,[rel PW_SEVEN] - paddw xmm5,[rel PW_SEVEN] - - paddw xmm1,xmm6 - paddw xmm0,xmm4 - psrlw xmm1,4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14) - psrlw xmm0,4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30) - paddw xmm7,xmm6 - paddw xmm5,xmm4 - psrlw xmm7,4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15) - psrlw xmm5,4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31) - - psllw xmm7,BYTE_BIT - psllw xmm5,BYTE_BIT - por xmm1,xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15) - por xmm0,xmm5 ; xmm0=Out1H=(16 17 18 ... 
29 30 31) - - movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1 - movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0 - - sub rax, byte SIZEOF_XMMWORD - add rcx, byte 1*SIZEOF_XMMWORD ; inptr1(above) - add rbx, byte 1*SIZEOF_XMMWORD ; inptr0 - add rsi, byte 1*SIZEOF_XMMWORD ; inptr1(below) - add rdx, byte 2*SIZEOF_XMMWORD ; outptr0 - add rdi, byte 2*SIZEOF_XMMWORD ; outptr1 - cmp rax, byte SIZEOF_XMMWORD - ja near .columnloop - test rax,rax - jnz near .columnloop_last - - pop rsi - pop rdi - pop rcx - pop rax - - add rsi, byte 1*SIZEOF_JSAMPROW ; input_data - add rdi, byte 2*SIZEOF_JSAMPROW ; output_data - sub rcx, byte 2 ; rowctr - jg near .rowloop + ; -- process the upper row + + movdqa xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD] + movdqa xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD] + + movdqa xmm0,xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7) + movdqa xmm4,xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15) + psrldq xmm0,2 ; xmm0=( 1 2 3 4 5 6 7 --) + pslldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8) + movdqa xmm5,xmm7 + movdqa xmm6,xmm3 + psrldq xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --) + pslldq xmm6,2 ; xmm6=(-- 8 9 10 11 12 13 14) + + por xmm0,xmm4 ; xmm0=( 1 2 3 4 5 6 7 8) + por xmm5,xmm6 ; xmm5=( 7 8 9 10 11 12 13 14) + + movdqa xmm1,xmm7 + movdqa xmm2,xmm3 + pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6) + psrldq xmm2,2 ; xmm2=( 9 10 11 12 13 14 15 --) + movdqa xmm4,xmm3 + psrldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --) + + por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6) + por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16) + + movdqa XMMWORD [wk(0)], xmm4 + + pmullw xmm7,[rel PW_THREE] + pmullw xmm3,[rel PW_THREE] + paddw xmm1,[rel PW_EIGHT] + paddw xmm5,[rel PW_EIGHT] + paddw xmm0,[rel PW_SEVEN] + paddw xmm2,[rel PW_SEVEN] + + paddw xmm1,xmm7 + paddw xmm5,xmm3 + psrlw xmm1,4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14) + psrlw xmm5,4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30) + paddw xmm0,xmm7 + paddw xmm2,xmm3 + psrlw xmm0,4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15) + 
psrlw xmm2,4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31) + + psllw xmm0,BYTE_BIT + psllw xmm2,BYTE_BIT + por xmm1,xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15) + por xmm5,xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31) + + movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5 + + ; -- process the lower row + + movdqa xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD] + movdqa xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD] + + movdqa xmm7,xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7) + movdqa xmm3,xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15) + psrldq xmm7,2 ; xmm7=( 1 2 3 4 5 6 7 --) + pslldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8) + movdqa xmm0,xmm6 + movdqa xmm2,xmm4 + psrldq xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --) + pslldq xmm2,2 ; xmm2=(-- 8 9 10 11 12 13 14) + + por xmm7,xmm3 ; xmm7=( 1 2 3 4 5 6 7 8) + por xmm0,xmm2 ; xmm0=( 7 8 9 10 11 12 13 14) + + movdqa xmm1,xmm6 + movdqa xmm5,xmm4 + pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6) + psrldq xmm5,2 ; xmm5=( 9 10 11 12 13 14 15 --) + movdqa xmm3,xmm4 + psrldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --) + + por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6) + por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16) + + movdqa XMMWORD [wk(1)], xmm3 + + pmullw xmm6,[rel PW_THREE] + pmullw xmm4,[rel PW_THREE] + paddw xmm1,[rel PW_EIGHT] + paddw xmm0,[rel PW_EIGHT] + paddw xmm7,[rel PW_SEVEN] + paddw xmm5,[rel PW_SEVEN] + + paddw xmm1,xmm6 + paddw xmm0,xmm4 + psrlw xmm1,4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14) + psrlw xmm0,4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30) + paddw xmm7,xmm6 + paddw xmm5,xmm4 + psrlw xmm7,4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15) + psrlw xmm5,4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31) + + psllw xmm7,BYTE_BIT + psllw xmm5,BYTE_BIT + por xmm1,xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15) + por xmm0,xmm5 ; xmm0=Out1H=(16 17 18 ... 
29 30 31) + + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0 + + sub rax, byte SIZEOF_XMMWORD + add rcx, byte 1*SIZEOF_XMMWORD ; inptr1(above) + add rbx, byte 1*SIZEOF_XMMWORD ; inptr0 + add rsi, byte 1*SIZEOF_XMMWORD ; inptr1(below) + add rdx, byte 2*SIZEOF_XMMWORD ; outptr0 + add rdi, byte 2*SIZEOF_XMMWORD ; outptr1 + cmp rax, byte SIZEOF_XMMWORD + ja near .columnloop + test rax,rax + jnz near .columnloop_last + + pop rsi + pop rdi + pop rcx + pop rax + + add rsi, byte 1*SIZEOF_JSAMPROW ; input_data + add rdi, byte 2*SIZEOF_JSAMPROW ; output_data + sub rcx, byte 2 ; rowctr + jg near .rowloop .return: - pop rbx - uncollect_args - mov rsp,rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp - pop rbp - ret + pop rbx + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret ; -------------------------------------------------------------------------- ; @@ -497,77 +497,77 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2): ; r12 = JSAMPARRAY input_data ; r13 = JSAMPARRAY * output_data_ptr - align 16 - global EXTN(jsimd_h2v1_upsample_sse2) + align 16 + global EXTN(jsimd_h2v1_upsample_sse2) EXTN(jsimd_h2v1_upsample_sse2): - push rbp - mov rax,rsp - mov rbp,rsp - collect_args - - mov rdx, r11 - add rdx, byte (2*SIZEOF_XMMWORD)-1 - and rdx, byte -(2*SIZEOF_XMMWORD) - jz near .return - - mov rcx, r10 ; rowctr - test rcx,rcx - jz short .return - - mov rsi, r12 ; input_data - mov rdi, r13 - mov rdi, JSAMPARRAY [rdi] ; output_data + push rbp + mov rax,rsp + mov rbp,rsp + collect_args + + mov rdx, r11 + add rdx, byte (2*SIZEOF_XMMWORD)-1 + and rdx, byte -(2*SIZEOF_XMMWORD) + jz near .return + + mov rcx, r10 ; rowctr + test rcx,rcx + jz short .return + + mov rsi, r12 ; input_data + mov rdi, r13 + mov rdi, JSAMPARRAY [rdi] ; output_data .rowloop: - push rdi - push rsi + push rdi + push rsi - mov rsi, JSAMPROW [rsi] ; inptr - mov rdi, JSAMPROW [rdi] ; outptr - mov rax,rdx ; colctr + mov rsi, JSAMPROW 
[rsi] ; inptr + mov rdi, JSAMPROW [rdi] ; outptr + mov rax,rdx ; colctr .columnloop: - movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] - movdqa xmm1,xmm0 - punpcklbw xmm0,xmm0 - punpckhbw xmm1,xmm1 + movdqa xmm1,xmm0 + punpcklbw xmm0,xmm0 + punpckhbw xmm1,xmm1 - movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 - movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 + movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1 - sub rax, byte 2*SIZEOF_XMMWORD - jz short .nextrow + sub rax, byte 2*SIZEOF_XMMWORD + jz short .nextrow - movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] + movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] - movdqa xmm3,xmm2 - punpcklbw xmm2,xmm2 - punpckhbw xmm3,xmm3 + movdqa xmm3,xmm2 + punpcklbw xmm2,xmm2 + punpckhbw xmm3,xmm3 - movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2 - movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3 + movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3 - sub rax, byte 2*SIZEOF_XMMWORD - jz short .nextrow + sub rax, byte 2*SIZEOF_XMMWORD + jz short .nextrow - add rsi, byte 2*SIZEOF_XMMWORD ; inptr - add rdi, byte 4*SIZEOF_XMMWORD ; outptr - jmp short .columnloop + add rsi, byte 2*SIZEOF_XMMWORD ; inptr + add rdi, byte 4*SIZEOF_XMMWORD ; outptr + jmp short .columnloop .nextrow: - pop rsi - pop rdi + pop rsi + pop rdi - add rsi, byte SIZEOF_JSAMPROW ; input_data - add rdi, byte SIZEOF_JSAMPROW ; output_data - dec rcx ; rowctr - jg short .rowloop + add rsi, byte SIZEOF_JSAMPROW ; input_data + add rdi, byte SIZEOF_JSAMPROW ; output_data + dec rcx ; rowctr + jg short .rowloop .return: - uncollect_args - pop rbp - ret + uncollect_args + pop rbp + ret ; -------------------------------------------------------------------------- ; @@ -586,86 +586,86 @@ EXTN(jsimd_h2v1_upsample_sse2): ; r12 = JSAMPARRAY input_data ; r13 = JSAMPARRAY * output_data_ptr - align 16 - global EXTN(jsimd_h2v2_upsample_sse2) + align 16 + global 
EXTN(jsimd_h2v2_upsample_sse2) EXTN(jsimd_h2v2_upsample_sse2): - push rbp - mov rax,rsp - mov rbp,rsp - collect_args - push rbx - - mov rdx, r11 - add rdx, byte (2*SIZEOF_XMMWORD)-1 - and rdx, byte -(2*SIZEOF_XMMWORD) - jz near .return - - mov rcx, r10 ; rowctr - test rcx,rcx - jz near .return - - mov rsi, r12 ; input_data - mov rdi, r13 - mov rdi, JSAMPARRAY [rdi] ; output_data + push rbp + mov rax,rsp + mov rbp,rsp + collect_args + push rbx + + mov rdx, r11 + add rdx, byte (2*SIZEOF_XMMWORD)-1 + and rdx, byte -(2*SIZEOF_XMMWORD) + jz near .return + + mov rcx, r10 ; rowctr + test rcx,rcx + jz near .return + + mov rsi, r12 ; input_data + mov rdi, r13 + mov rdi, JSAMPARRAY [rdi] ; output_data .rowloop: - push rdi - push rsi + push rdi + push rsi - mov rsi, JSAMPROW [rsi] ; inptr - mov rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0 - mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1 - mov rax,rdx ; colctr + mov rsi, JSAMPROW [rsi] ; inptr + mov rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0 + mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1 + mov rax,rdx ; colctr .columnloop: - movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] + movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] - movdqa xmm1,xmm0 - punpcklbw xmm0,xmm0 - punpckhbw xmm1,xmm1 + movdqa xmm1,xmm0 + punpcklbw xmm0,xmm0 + punpckhbw xmm1,xmm1 - movdqa XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0 - movdqa XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1 - movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 - movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0 + movdqa XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 + movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1 - sub rax, byte 2*SIZEOF_XMMWORD - jz short .nextrow + sub rax, byte 2*SIZEOF_XMMWORD + jz short .nextrow - movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] + movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD] - movdqa xmm3,xmm2 - punpcklbw xmm2,xmm2 - punpckhbw xmm3,xmm3 + movdqa xmm3,xmm2 + punpcklbw xmm2,xmm2 + punpckhbw 
xmm3,xmm3 - movdqa XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2 - movdqa XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3 - movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2 - movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3 + movdqa XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3 + movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3 - sub rax, byte 2*SIZEOF_XMMWORD - jz short .nextrow + sub rax, byte 2*SIZEOF_XMMWORD + jz short .nextrow - add rsi, byte 2*SIZEOF_XMMWORD ; inptr - add rbx, byte 4*SIZEOF_XMMWORD ; outptr0 - add rdi, byte 4*SIZEOF_XMMWORD ; outptr1 - jmp short .columnloop + add rsi, byte 2*SIZEOF_XMMWORD ; inptr + add rbx, byte 4*SIZEOF_XMMWORD ; outptr0 + add rdi, byte 4*SIZEOF_XMMWORD ; outptr1 + jmp short .columnloop .nextrow: - pop rsi - pop rdi + pop rsi + pop rdi - add rsi, byte 1*SIZEOF_JSAMPROW ; input_data - add rdi, byte 2*SIZEOF_JSAMPROW ; output_data - sub rcx, byte 2 ; rowctr - jg near .rowloop + add rsi, byte 1*SIZEOF_JSAMPROW ; input_data + add rdi, byte 2*SIZEOF_JSAMPROW ; output_data + sub rcx, byte 2 ; rowctr + jg near .rowloop .return: - pop rbx - uncollect_args - pop rbp - ret + pop rbx + uncollect_args + pop rbp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jdsamss2.asm b/simd/jdsamss2.asm index b5c863b46..2259026b0 100644 --- a/simd/jdsamss2.asm +++ b/simd/jdsamss2.asm @@ -19,24 +19,24 @@ %include "jsimdext.inc" ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_fancy_upsample_sse2) + alignz 16 + global EXTN(jconst_fancy_upsample_sse2) EXTN(jconst_fancy_upsample_sse2): -PW_ONE times 8 dw 1 -PW_TWO times 8 dw 2 -PW_THREE times 8 dw 3 -PW_SEVEN times 8 dw 7 -PW_EIGHT times 8 dw 8 +PW_ONE times 8 dw 1 +PW_TWO times 8 dw 2 +PW_THREE times 8 dw 3 +PW_SEVEN times 8 dw 7 +PW_EIGHT times 8 dw 8 - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 ; ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical. ; @@ -52,144 +52,144 @@ PW_EIGHT times 8 dw 8 ; JSAMPARRAY * output_data_ptr); ; -%define max_v_samp(b) (b)+8 ; int max_v_samp_factor -%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width -%define input_data(b) (b)+16 ; JSAMPARRAY input_data -%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr +%define max_v_samp(b) (b)+8 ; int max_v_samp_factor +%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width +%define input_data(b) (b)+16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr - align 16 - global EXTN(jsimd_h2v1_fancy_upsample_sse2) + align 16 + global EXTN(jsimd_h2v1_fancy_upsample_sse2) EXTN(jsimd_h2v1_fancy_upsample_sse2): - push ebp - mov ebp,esp - pushpic ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - - mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr - test eax,eax - jz near .return - - mov ecx, INT [max_v_samp(ebp)] ; rowctr - test ecx,ecx - jz near .return - - mov esi, JSAMPARRAY [input_data(ebp)] 
; input_data - mov edi, POINTER [output_data_ptr(ebp)] - mov edi, JSAMPARRAY [edi] ; output_data - alignx 16,7 + push ebp + mov ebp,esp + pushpic ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr + test eax,eax + jz near .return + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx,ecx + jz near .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16,7 .rowloop: - push eax ; colctr - push edi - push esi + push eax ; colctr + push edi + push esi - mov esi, JSAMPROW [esi] ; inptr - mov edi, JSAMPROW [edi] ; outptr + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr - test eax, SIZEOF_XMMWORD-1 - jz short .skip - mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] - mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample + test eax, SIZEOF_XMMWORD-1 + jz short .skip + mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample .skip: - pxor xmm0,xmm0 ; xmm0=(all 0's) - pcmpeqb xmm7,xmm7 - psrldq xmm7,(SIZEOF_XMMWORD-1) - pand xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD] + pxor xmm0,xmm0 ; xmm0=(all 0's) + pcmpeqb xmm7,xmm7 + psrldq xmm7,(SIZEOF_XMMWORD-1) + pand xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD] - add eax, byte SIZEOF_XMMWORD-1 - and eax, byte -SIZEOF_XMMWORD - cmp eax, byte SIZEOF_XMMWORD - ja short .columnloop - alignx 16,7 + add eax, byte SIZEOF_XMMWORD-1 + and eax, byte -SIZEOF_XMMWORD + cmp eax, byte SIZEOF_XMMWORD + ja short .columnloop + alignx 16,7 .columnloop_last: - pcmpeqb xmm6,xmm6 - pslldq xmm6,(SIZEOF_XMMWORD-1) - pand xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD] - jmp short .upsample - alignx 16,7 + pcmpeqb xmm6,xmm6 + pslldq xmm6,(SIZEOF_XMMWORD-1) + pand xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD] + jmp short .upsample + alignx 16,7 .columnloop: - movdqa xmm6, 
XMMWORD [esi+1*SIZEOF_XMMWORD] - pslldq xmm6,(SIZEOF_XMMWORD-1) + movdqa xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD] + pslldq xmm6,(SIZEOF_XMMWORD-1) .upsample: - movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] - movdqa xmm2,xmm1 - movdqa xmm3,xmm1 ; xmm1=( 0 1 2 ... 13 14 15) - pslldq xmm2,1 ; xmm2=(-- 0 1 ... 12 13 14) - psrldq xmm3,1 ; xmm3=( 1 2 3 ... 14 15 --) - - por xmm2,xmm7 ; xmm2=(-1 0 1 ... 12 13 14) - por xmm3,xmm6 ; xmm3=( 1 2 3 ... 14 15 16) - - movdqa xmm7,xmm1 - psrldq xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --) - - movdqa xmm4,xmm1 - punpcklbw xmm1,xmm0 ; xmm1=( 0 1 2 3 4 5 6 7) - punpckhbw xmm4,xmm0 ; xmm4=( 8 9 10 11 12 13 14 15) - movdqa xmm5,xmm2 - punpcklbw xmm2,xmm0 ; xmm2=(-1 0 1 2 3 4 5 6) - punpckhbw xmm5,xmm0 ; xmm5=( 7 8 9 10 11 12 13 14) - movdqa xmm6,xmm3 - punpcklbw xmm3,xmm0 ; xmm3=( 1 2 3 4 5 6 7 8) - punpckhbw xmm6,xmm0 ; xmm6=( 9 10 11 12 13 14 15 16) - - pmullw xmm1,[GOTOFF(ebx,PW_THREE)] - pmullw xmm4,[GOTOFF(ebx,PW_THREE)] - paddw xmm2,[GOTOFF(ebx,PW_ONE)] - paddw xmm5,[GOTOFF(ebx,PW_ONE)] - paddw xmm3,[GOTOFF(ebx,PW_TWO)] - paddw xmm6,[GOTOFF(ebx,PW_TWO)] - - paddw xmm2,xmm1 - paddw xmm5,xmm4 - psrlw xmm2,2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14) - psrlw xmm5,2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30) - paddw xmm3,xmm1 - paddw xmm6,xmm4 - psrlw xmm3,2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15) - psrlw xmm6,2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31) - - psllw xmm3,BYTE_BIT - psllw xmm6,BYTE_BIT - por xmm2,xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15) - por xmm5,xmm6 ; xmm5=OutH=(16 17 18 ... 
29 30 31) - - movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2 - movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5 - - sub eax, byte SIZEOF_XMMWORD - add esi, byte 1*SIZEOF_XMMWORD ; inptr - add edi, byte 2*SIZEOF_XMMWORD ; outptr - cmp eax, byte SIZEOF_XMMWORD - ja near .columnloop - test eax,eax - jnz near .columnloop_last - - pop esi - pop edi - pop eax - - add esi, byte SIZEOF_JSAMPROW ; input_data - add edi, byte SIZEOF_JSAMPROW ; output_data - dec ecx ; rowctr - jg near .rowloop + movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqa xmm2,xmm1 + movdqa xmm3,xmm1 ; xmm1=( 0 1 2 ... 13 14 15) + pslldq xmm2,1 ; xmm2=(-- 0 1 ... 12 13 14) + psrldq xmm3,1 ; xmm3=( 1 2 3 ... 14 15 --) + + por xmm2,xmm7 ; xmm2=(-1 0 1 ... 12 13 14) + por xmm3,xmm6 ; xmm3=( 1 2 3 ... 14 15 16) + + movdqa xmm7,xmm1 + psrldq xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --) + + movdqa xmm4,xmm1 + punpcklbw xmm1,xmm0 ; xmm1=( 0 1 2 3 4 5 6 7) + punpckhbw xmm4,xmm0 ; xmm4=( 8 9 10 11 12 13 14 15) + movdqa xmm5,xmm2 + punpcklbw xmm2,xmm0 ; xmm2=(-1 0 1 2 3 4 5 6) + punpckhbw xmm5,xmm0 ; xmm5=( 7 8 9 10 11 12 13 14) + movdqa xmm6,xmm3 + punpcklbw xmm3,xmm0 ; xmm3=( 1 2 3 4 5 6 7 8) + punpckhbw xmm6,xmm0 ; xmm6=( 9 10 11 12 13 14 15 16) + + pmullw xmm1,[GOTOFF(ebx,PW_THREE)] + pmullw xmm4,[GOTOFF(ebx,PW_THREE)] + paddw xmm2,[GOTOFF(ebx,PW_ONE)] + paddw xmm5,[GOTOFF(ebx,PW_ONE)] + paddw xmm3,[GOTOFF(ebx,PW_TWO)] + paddw xmm6,[GOTOFF(ebx,PW_TWO)] + + paddw xmm2,xmm1 + paddw xmm5,xmm4 + psrlw xmm2,2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14) + psrlw xmm5,2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30) + paddw xmm3,xmm1 + paddw xmm6,xmm4 + psrlw xmm3,2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15) + psrlw xmm6,2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31) + + psllw xmm3,BYTE_BIT + psllw xmm6,BYTE_BIT + por xmm2,xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15) + por xmm5,xmm6 ; xmm5=OutH=(16 17 18 ... 
29 30 31) + + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5 + + sub eax, byte SIZEOF_XMMWORD + add esi, byte 1*SIZEOF_XMMWORD ; inptr + add edi, byte 2*SIZEOF_XMMWORD ; outptr + cmp eax, byte SIZEOF_XMMWORD + ja near .columnloop + test eax,eax + jnz near .columnloop_last + + pop esi + pop edi + pop eax + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec ecx ; rowctr + jg near .rowloop .return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - poppic ebx - pop ebp - ret + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + poppic ebx + pop ebp + ret ; -------------------------------------------------------------------------- ; @@ -203,322 +203,322 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2): ; JSAMPARRAY * output_data_ptr); ; -%define max_v_samp(b) (b)+8 ; int max_v_samp_factor -%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width -%define input_data(b) (b)+16 ; JSAMPARRAY input_data -%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr +%define max_v_samp(b) (b)+8 ; int max_v_samp_factor +%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width +%define input_data(b) (b)+16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 4 -%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 4 +%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr - align 16 - global EXTN(jsimd_h2v2_fancy_upsample_sse2) + align 16 + global EXTN(jsimd_h2v2_fancy_upsample_sse2) EXTN(jsimd_h2v2_fancy_upsample_sse2): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - 
mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic eax ; make a room for GOT address - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address - - mov edx,eax ; edx = original ebp - mov eax, JDIMENSION [downsamp_width(edx)] ; colctr - test eax,eax - jz near .return - - mov ecx, INT [max_v_samp(edx)] ; rowctr - test ecx,ecx - jz near .return - - mov esi, JSAMPARRAY [input_data(edx)] ; input_data - mov edi, POINTER [output_data_ptr(edx)] - mov edi, JSAMPARRAY [edi] ; output_data - alignx 16,7 + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov edx,eax ; edx = original ebp + mov eax, JDIMENSION [downsamp_width(edx)] ; colctr + test eax,eax + jz near .return + + mov ecx, INT [max_v_samp(edx)] ; rowctr + test ecx,ecx + jz near .return + + mov esi, JSAMPARRAY [input_data(edx)] ; input_data + mov edi, POINTER [output_data_ptr(edx)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16,7 .rowloop: - push eax ; colctr - push ecx - push edi - push esi - - mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above) - mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 - mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below) - mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 - mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 - - test eax, SIZEOF_XMMWORD-1 - jz short .skip - push edx - mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE] - mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl - mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE] - mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl - 
mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] - mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample - pop edx + push eax ; colctr + push ecx + push edi + push esi + + mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above) + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 + mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below) + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 + mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 + + test eax, SIZEOF_XMMWORD-1 + jz short .skip + push edx + mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample + pop edx .skip: - ; -- process the first column block + ; -- process the first column block - movdqa xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0] - movdqa xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0] - movdqa xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0] + movdqa xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0] + movdqa xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0] + movdqa xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0] - pushpic ebx - movpic ebx, POINTER [gotptr] ; load GOT address + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address - pxor xmm3,xmm3 ; xmm3=(all 0's) - movdqa xmm4,xmm0 - punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) - punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) - movdqa xmm5,xmm1 - punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) - punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) - movdqa xmm6,xmm2 - punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) - punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) + pxor xmm3,xmm3 ; xmm3=(all 0's) + movdqa xmm4,xmm0 + punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) + punpckhbw xmm4,xmm3 ; xmm4=row[ 
0]( 8 9 10 11 12 13 14 15) + movdqa xmm5,xmm1 + punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) + movdqa xmm6,xmm2 + punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) - pmullw xmm0,[GOTOFF(ebx,PW_THREE)] - pmullw xmm4,[GOTOFF(ebx,PW_THREE)] + pmullw xmm0,[GOTOFF(ebx,PW_THREE)] + pmullw xmm4,[GOTOFF(ebx,PW_THREE)] - pcmpeqb xmm7,xmm7 - psrldq xmm7,(SIZEOF_XMMWORD-2) + pcmpeqb xmm7,xmm7 + psrldq xmm7,(SIZEOF_XMMWORD-2) - paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) - paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) - paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) - paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) + paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) + paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) + paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) + paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) - movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save - movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data - movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2 - movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6 + movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save + movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6 - pand xmm1,xmm7 ; xmm1=( 0 -- -- -- -- -- -- --) - pand xmm2,xmm7 ; xmm2=( 0 -- -- -- -- -- -- --) + pand xmm1,xmm7 ; xmm1=( 0 -- -- -- -- -- -- --) + pand xmm2,xmm7 ; xmm2=( 0 -- -- -- -- -- -- --) - movdqa XMMWORD [wk(0)], xmm1 - movdqa XMMWORD [wk(1)], xmm2 + movdqa XMMWORD [wk(0)], xmm1 + movdqa XMMWORD [wk(1)], xmm2 - poppic ebx + poppic ebx - add eax, byte SIZEOF_XMMWORD-1 - and eax, byte -SIZEOF_XMMWORD - cmp eax, byte SIZEOF_XMMWORD - ja short .columnloop - alignx 16,7 + add eax, byte SIZEOF_XMMWORD-1 + and eax, byte -SIZEOF_XMMWORD + cmp eax, byte 
SIZEOF_XMMWORD + ja short .columnloop + alignx 16,7 .columnloop_last: - ; -- process the last column block + ; -- process the last column block - pushpic ebx - movpic ebx, POINTER [gotptr] ; load GOT address + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address - pcmpeqb xmm1,xmm1 - pslldq xmm1,(SIZEOF_XMMWORD-2) - movdqa xmm2,xmm1 + pcmpeqb xmm1,xmm1 + pslldq xmm1,(SIZEOF_XMMWORD-2) + movdqa xmm2,xmm1 - pand xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD] - pand xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD] + pand xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD] + pand xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD] - movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15) - movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15) + movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15) + movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15) - jmp near .upsample - alignx 16,7 + jmp near .upsample + alignx 16,7 .columnloop: - ; -- process the next column block + ; -- process the next column block - movdqa xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1] - movdqa xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1] - movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1] + movdqa xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1] + movdqa xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1] + movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1] - pushpic ebx - movpic ebx, POINTER [gotptr] ; load GOT address + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address - pxor xmm3,xmm3 ; xmm3=(all 0's) - movdqa xmm4,xmm0 - punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) - punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) - movdqa xmm5,xmm1 - punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) - punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) - movdqa xmm6,xmm2 - punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) - punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) + pxor xmm3,xmm3 ; xmm3=(all 0's) + movdqa 
xmm4,xmm0 + punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) + punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) + movdqa xmm5,xmm1 + punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) + movdqa xmm6,xmm2 + punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) - pmullw xmm0,[GOTOFF(ebx,PW_THREE)] - pmullw xmm4,[GOTOFF(ebx,PW_THREE)] + pmullw xmm0,[GOTOFF(ebx,PW_THREE)] + pmullw xmm4,[GOTOFF(ebx,PW_THREE)] - paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) - paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) - paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) - paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) + paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) + paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) + paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) + paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) - movdqa XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save - movdqa XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data - movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2 - movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6 + movdqa XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save + movdqa XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data + movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6 - pslldq xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0) - pslldq xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0) + pslldq xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0) + pslldq xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0) - movdqa XMMWORD [wk(2)], xmm1 - movdqa XMMWORD [wk(3)], xmm2 + movdqa XMMWORD [wk(2)], xmm1 + movdqa XMMWORD [wk(3)], xmm2 .upsample: - ; -- process the upper row - - movdqa xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD] - movdqa xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD] - - movdqa xmm0,xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7) - 
movdqa xmm4,xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15) - psrldq xmm0,2 ; xmm0=( 1 2 3 4 5 6 7 --) - pslldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8) - movdqa xmm5,xmm7 - movdqa xmm6,xmm3 - psrldq xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --) - pslldq xmm6,2 ; xmm6=(-- 8 9 10 11 12 13 14) - - por xmm0,xmm4 ; xmm0=( 1 2 3 4 5 6 7 8) - por xmm5,xmm6 ; xmm5=( 7 8 9 10 11 12 13 14) - - movdqa xmm1,xmm7 - movdqa xmm2,xmm3 - pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6) - psrldq xmm2,2 ; xmm2=( 9 10 11 12 13 14 15 --) - movdqa xmm4,xmm3 - psrldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --) - - por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6) - por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16) - - movdqa XMMWORD [wk(0)], xmm4 - - pmullw xmm7,[GOTOFF(ebx,PW_THREE)] - pmullw xmm3,[GOTOFF(ebx,PW_THREE)] - paddw xmm1,[GOTOFF(ebx,PW_EIGHT)] - paddw xmm5,[GOTOFF(ebx,PW_EIGHT)] - paddw xmm0,[GOTOFF(ebx,PW_SEVEN)] - paddw xmm2,[GOTOFF(ebx,PW_SEVEN)] - - paddw xmm1,xmm7 - paddw xmm5,xmm3 - psrlw xmm1,4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14) - psrlw xmm5,4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30) - paddw xmm0,xmm7 - paddw xmm2,xmm3 - psrlw xmm0,4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15) - psrlw xmm2,4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31) - - psllw xmm0,BYTE_BIT - psllw xmm2,BYTE_BIT - por xmm1,xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15) - por xmm5,xmm2 ; xmm5=Out0H=(16 17 18 ... 
29 30 31) - - movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 - movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 - - ; -- process the lower row - - movdqa xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD] - movdqa xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD] - - movdqa xmm7,xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7) - movdqa xmm3,xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15) - psrldq xmm7,2 ; xmm7=( 1 2 3 4 5 6 7 --) - pslldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8) - movdqa xmm0,xmm6 - movdqa xmm2,xmm4 - psrldq xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --) - pslldq xmm2,2 ; xmm2=(-- 8 9 10 11 12 13 14) - - por xmm7,xmm3 ; xmm7=( 1 2 3 4 5 6 7 8) - por xmm0,xmm2 ; xmm0=( 7 8 9 10 11 12 13 14) - - movdqa xmm1,xmm6 - movdqa xmm5,xmm4 - pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6) - psrldq xmm5,2 ; xmm5=( 9 10 11 12 13 14 15 --) - movdqa xmm3,xmm4 - psrldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --) - - por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6) - por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16) - - movdqa XMMWORD [wk(1)], xmm3 - - pmullw xmm6,[GOTOFF(ebx,PW_THREE)] - pmullw xmm4,[GOTOFF(ebx,PW_THREE)] - paddw xmm1,[GOTOFF(ebx,PW_EIGHT)] - paddw xmm0,[GOTOFF(ebx,PW_EIGHT)] - paddw xmm7,[GOTOFF(ebx,PW_SEVEN)] - paddw xmm5,[GOTOFF(ebx,PW_SEVEN)] - - paddw xmm1,xmm6 - paddw xmm0,xmm4 - psrlw xmm1,4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14) - psrlw xmm0,4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30) - paddw xmm7,xmm6 - paddw xmm5,xmm4 - psrlw xmm7,4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15) - psrlw xmm5,4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31) - - psllw xmm7,BYTE_BIT - psllw xmm5,BYTE_BIT - por xmm1,xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15) - por xmm0,xmm5 ; xmm0=Out1H=(16 17 18 ... 
29 30 31) - - movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1 - movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0 - - poppic ebx - - sub eax, byte SIZEOF_XMMWORD - add ecx, byte 1*SIZEOF_XMMWORD ; inptr1(above) - add ebx, byte 1*SIZEOF_XMMWORD ; inptr0 - add esi, byte 1*SIZEOF_XMMWORD ; inptr1(below) - add edx, byte 2*SIZEOF_XMMWORD ; outptr0 - add edi, byte 2*SIZEOF_XMMWORD ; outptr1 - cmp eax, byte SIZEOF_XMMWORD - ja near .columnloop - test eax,eax - jnz near .columnloop_last - - pop esi - pop edi - pop ecx - pop eax - - add esi, byte 1*SIZEOF_JSAMPROW ; input_data - add edi, byte 2*SIZEOF_JSAMPROW ; output_data - sub ecx, byte 2 ; rowctr - jg near .rowloop + ; -- process the upper row + + movdqa xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD] + movdqa xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD] + + movdqa xmm0,xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7) + movdqa xmm4,xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15) + psrldq xmm0,2 ; xmm0=( 1 2 3 4 5 6 7 --) + pslldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8) + movdqa xmm5,xmm7 + movdqa xmm6,xmm3 + psrldq xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --) + pslldq xmm6,2 ; xmm6=(-- 8 9 10 11 12 13 14) + + por xmm0,xmm4 ; xmm0=( 1 2 3 4 5 6 7 8) + por xmm5,xmm6 ; xmm5=( 7 8 9 10 11 12 13 14) + + movdqa xmm1,xmm7 + movdqa xmm2,xmm3 + pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6) + psrldq xmm2,2 ; xmm2=( 9 10 11 12 13 14 15 --) + movdqa xmm4,xmm3 + psrldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --) + + por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6) + por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16) + + movdqa XMMWORD [wk(0)], xmm4 + + pmullw xmm7,[GOTOFF(ebx,PW_THREE)] + pmullw xmm3,[GOTOFF(ebx,PW_THREE)] + paddw xmm1,[GOTOFF(ebx,PW_EIGHT)] + paddw xmm5,[GOTOFF(ebx,PW_EIGHT)] + paddw xmm0,[GOTOFF(ebx,PW_SEVEN)] + paddw xmm2,[GOTOFF(ebx,PW_SEVEN)] + + paddw xmm1,xmm7 + paddw xmm5,xmm3 + psrlw xmm1,4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14) + psrlw xmm5,4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30) + paddw xmm0,xmm7 + paddw 
xmm2,xmm3 + psrlw xmm0,4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15) + psrlw xmm2,4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31) + + psllw xmm0,BYTE_BIT + psllw xmm2,BYTE_BIT + por xmm1,xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15) + por xmm5,xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31) + + movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 + + ; -- process the lower row + + movdqa xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD] + movdqa xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD] + + movdqa xmm7,xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7) + movdqa xmm3,xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15) + psrldq xmm7,2 ; xmm7=( 1 2 3 4 5 6 7 --) + pslldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8) + movdqa xmm0,xmm6 + movdqa xmm2,xmm4 + psrldq xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --) + pslldq xmm2,2 ; xmm2=(-- 8 9 10 11 12 13 14) + + por xmm7,xmm3 ; xmm7=( 1 2 3 4 5 6 7 8) + por xmm0,xmm2 ; xmm0=( 7 8 9 10 11 12 13 14) + + movdqa xmm1,xmm6 + movdqa xmm5,xmm4 + pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6) + psrldq xmm5,2 ; xmm5=( 9 10 11 12 13 14 15 --) + movdqa xmm3,xmm4 + psrldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --) + + por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6) + por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16) + + movdqa XMMWORD [wk(1)], xmm3 + + pmullw xmm6,[GOTOFF(ebx,PW_THREE)] + pmullw xmm4,[GOTOFF(ebx,PW_THREE)] + paddw xmm1,[GOTOFF(ebx,PW_EIGHT)] + paddw xmm0,[GOTOFF(ebx,PW_EIGHT)] + paddw xmm7,[GOTOFF(ebx,PW_SEVEN)] + paddw xmm5,[GOTOFF(ebx,PW_SEVEN)] + + paddw xmm1,xmm6 + paddw xmm0,xmm4 + psrlw xmm1,4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14) + psrlw xmm0,4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30) + paddw xmm7,xmm6 + paddw xmm5,xmm4 + psrlw xmm7,4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15) + psrlw xmm5,4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31) + + psllw xmm7,BYTE_BIT + psllw xmm5,BYTE_BIT + por xmm1,xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15) + por xmm0,xmm5 ; xmm0=Out1H=(16 17 18 ... 
29 30 31) + + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0 + + poppic ebx + + sub eax, byte SIZEOF_XMMWORD + add ecx, byte 1*SIZEOF_XMMWORD ; inptr1(above) + add ebx, byte 1*SIZEOF_XMMWORD ; inptr0 + add esi, byte 1*SIZEOF_XMMWORD ; inptr1(below) + add edx, byte 2*SIZEOF_XMMWORD ; outptr0 + add edi, byte 2*SIZEOF_XMMWORD ; outptr1 + cmp eax, byte SIZEOF_XMMWORD + ja near .columnloop + test eax,eax + jnz near .columnloop_last + + pop esi + pop edi + pop ecx + pop eax + + add esi, byte 1*SIZEOF_JSAMPROW ; input_data + add edi, byte 2*SIZEOF_JSAMPROW ; output_data + sub ecx, byte 2 ; rowctr + jg near .rowloop .return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret ; -------------------------------------------------------------------------- ; @@ -532,92 +532,92 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2): ; JSAMPARRAY * output_data_ptr); ; -%define max_v_samp(b) (b)+8 ; int max_v_samp_factor -%define output_width(b) (b)+12 ; JDIMENSION output_width -%define input_data(b) (b)+16 ; JSAMPARRAY input_data -%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr +%define max_v_samp(b) (b)+8 ; int max_v_samp_factor +%define output_width(b) (b)+12 ; JDIMENSION output_width +%define input_data(b) (b)+16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr - align 16 - global EXTN(jsimd_h2v1_upsample_sse2) + align 16 + global EXTN(jsimd_h2v1_upsample_sse2) EXTN(jsimd_h2v1_upsample_sse2): - push ebp - mov ebp,esp -; push ebx ; unused -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - mov edx, JDIMENSION [output_width(ebp)] - add edx, 
byte (2*SIZEOF_XMMWORD)-1 - and edx, byte -(2*SIZEOF_XMMWORD) - jz short .return - - mov ecx, INT [max_v_samp(ebp)] ; rowctr - test ecx,ecx - jz short .return - - mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - mov edi, POINTER [output_data_ptr(ebp)] - mov edi, JSAMPARRAY [edi] ; output_data - alignx 16,7 + push ebp + mov ebp,esp +; push ebx ; unused +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov edx, JDIMENSION [output_width(ebp)] + add edx, byte (2*SIZEOF_XMMWORD)-1 + and edx, byte -(2*SIZEOF_XMMWORD) + jz short .return + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx,ecx + jz short .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16,7 .rowloop: - push edi - push esi + push edi + push esi - mov esi, JSAMPROW [esi] ; inptr - mov edi, JSAMPROW [edi] ; outptr - mov eax,edx ; colctr - alignx 16,7 + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr + mov eax,edx ; colctr + alignx 16,7 .columnloop: - movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] - movdqa xmm1,xmm0 - punpcklbw xmm0,xmm0 - punpckhbw xmm1,xmm1 + movdqa xmm1,xmm0 + punpcklbw xmm0,xmm0 + punpckhbw xmm1,xmm1 - movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 - movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 + movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1 - sub eax, byte 2*SIZEOF_XMMWORD - jz short .nextrow + sub eax, byte 2*SIZEOF_XMMWORD + jz short .nextrow - movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] - movdqa xmm3,xmm2 - punpcklbw xmm2,xmm2 - punpckhbw xmm3,xmm3 + movdqa xmm3,xmm2 + punpcklbw xmm2,xmm2 + punpckhbw xmm3,xmm3 - movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2 - movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3 + movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD 
[edi+3*SIZEOF_XMMWORD], xmm3 - sub eax, byte 2*SIZEOF_XMMWORD - jz short .nextrow + sub eax, byte 2*SIZEOF_XMMWORD + jz short .nextrow - add esi, byte 2*SIZEOF_XMMWORD ; inptr - add edi, byte 4*SIZEOF_XMMWORD ; outptr - jmp short .columnloop - alignx 16,7 + add esi, byte 2*SIZEOF_XMMWORD ; inptr + add edi, byte 4*SIZEOF_XMMWORD ; outptr + jmp short .columnloop + alignx 16,7 .nextrow: - pop esi - pop edi + pop esi + pop edi - add esi, byte SIZEOF_JSAMPROW ; input_data - add edi, byte SIZEOF_JSAMPROW ; output_data - dec ecx ; rowctr - jg short .rowloop + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec ecx ; rowctr + jg short .rowloop .return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved -; pop ebx ; unused - pop ebp - ret + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved +; pop ebx ; unused + pop ebp + ret ; -------------------------------------------------------------------------- ; @@ -631,99 +631,99 @@ EXTN(jsimd_h2v1_upsample_sse2): ; JSAMPARRAY * output_data_ptr); ; -%define max_v_samp(b) (b)+8 ; int max_v_samp_factor -%define output_width(b) (b)+12 ; JDIMENSION output_width -%define input_data(b) (b)+16 ; JSAMPARRAY input_data -%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr +%define max_v_samp(b) (b)+8 ; int max_v_samp_factor +%define output_width(b) (b)+12 ; JDIMENSION output_width +%define input_data(b) (b)+16 ; JSAMPARRAY input_data +%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr - align 16 - global EXTN(jsimd_h2v2_upsample_sse2) + align 16 + global EXTN(jsimd_h2v2_upsample_sse2) EXTN(jsimd_h2v2_upsample_sse2): - push ebp - mov ebp,esp - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - mov edx, JDIMENSION [output_width(ebp)] - add edx, byte (2*SIZEOF_XMMWORD)-1 - and edx, byte -(2*SIZEOF_XMMWORD) - jz near .return - - mov ecx, INT 
[max_v_samp(ebp)] ; rowctr - test ecx,ecx - jz near .return - - mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - mov edi, POINTER [output_data_ptr(ebp)] - mov edi, JSAMPARRAY [edi] ; output_data - alignx 16,7 + push ebp + mov ebp,esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + mov edx, JDIMENSION [output_width(ebp)] + add edx, byte (2*SIZEOF_XMMWORD)-1 + and edx, byte -(2*SIZEOF_XMMWORD) + jz near .return + + mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx,ecx + jz near .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16,7 .rowloop: - push edi - push esi - - mov esi, JSAMPROW [esi] ; inptr - mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 - mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 - mov eax,edx ; colctr - alignx 16,7 + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 + mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 + mov eax,edx ; colctr + alignx 16,7 .columnloop: - movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] - movdqa xmm1,xmm0 - punpcklbw xmm0,xmm0 - punpckhbw xmm1,xmm1 + movdqa xmm1,xmm0 + punpcklbw xmm0,xmm0 + punpckhbw xmm1,xmm1 - movdqa XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0 - movdqa XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1 - movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 - movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0 + movdqa XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 + movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1 - sub eax, byte 2*SIZEOF_XMMWORD - jz short .nextrow + sub eax, byte 2*SIZEOF_XMMWORD + jz short .nextrow - movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] - movdqa xmm3,xmm2 - punpcklbw xmm2,xmm2 - punpckhbw xmm3,xmm3 + movdqa 
xmm3,xmm2 + punpcklbw xmm2,xmm2 + punpckhbw xmm3,xmm3 - movdqa XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2 - movdqa XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3 - movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2 - movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3 + movdqa XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3 + movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3 - sub eax, byte 2*SIZEOF_XMMWORD - jz short .nextrow + sub eax, byte 2*SIZEOF_XMMWORD + jz short .nextrow - add esi, byte 2*SIZEOF_XMMWORD ; inptr - add ebx, byte 4*SIZEOF_XMMWORD ; outptr0 - add edi, byte 4*SIZEOF_XMMWORD ; outptr1 - jmp short .columnloop - alignx 16,7 + add esi, byte 2*SIZEOF_XMMWORD ; inptr + add ebx, byte 4*SIZEOF_XMMWORD ; outptr0 + add edi, byte 4*SIZEOF_XMMWORD ; outptr1 + jmp short .columnloop + alignx 16,7 .nextrow: - pop esi - pop edi + pop esi + pop edi - add esi, byte 1*SIZEOF_JSAMPROW ; input_data - add edi, byte 2*SIZEOF_JSAMPROW ; output_data - sub ecx, byte 2 ; rowctr - jg short .rowloop + add esi, byte 1*SIZEOF_JSAMPROW ; input_data + add edi, byte 2*SIZEOF_JSAMPROW ; output_data + sub ecx, byte 2 ; rowctr + jg short .rowloop .return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - pop ebp - ret + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jf3dnflt.asm b/simd/jf3dnflt.asm index 542672dc5..06712e8e8 100644 --- a/simd/jf3dnflt.asm +++ b/simd/jf3dnflt.asm @@ -24,23 +24,23 @@ %include "jdct.inc" ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_fdct_float_3dnow) + alignz 16 + global EXTN(jconst_fdct_float_3dnow) EXTN(jconst_fdct_float_3dnow): -PD_0_382 times 2 dd 0.382683432365089771728460 -PD_0_707 times 2 dd 0.707106781186547524400844 -PD_0_541 times 2 dd 0.541196100146196984399723 -PD_1_306 times 2 dd 1.306562964876376527856643 +PD_0_382 times 2 dd 0.382683432365089771728460 +PD_0_707 times 2 dd 0.707106781186547524400844 +PD_0_541 times 2 dd 0.541196100146196984399723 +PD_1_306 times 2 dd 1.306562964876376527856643 - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 ; ; Perform the forward DCT on one block of samples. ; @@ -48,273 +48,273 @@ PD_1_306 times 2 dd 1.306562964876376527856643 ; jsimd_fdct_float_3dnow (FAST_FLOAT * data) ; -%define data(b) (b)+8 ; FAST_FLOAT * data +%define data(b) (b)+8 ; FAST_FLOAT * data -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] -%define WK_NUM 2 +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 2 - align 16 - global EXTN(jsimd_fdct_float_3dnow) + align 16 + global EXTN(jsimd_fdct_float_3dnow) EXTN(jsimd_fdct_float_3dnow): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved -; push esi ; unused -; push edi ; unused - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process rows. 
- - mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) - mov ecx, DCTSIZE/2 - alignx 16,7 + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved +; push esi ; unused +; push edi ; unused + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process rows. + + mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) + mov ecx, DCTSIZE/2 + alignx 16,7 .rowloop: - movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] - movq mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] - movq mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)] - movq mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)] - - ; mm0=(00 01), mm1=(10 11), mm2=(06 07), mm3=(16 17) - - movq mm4,mm0 ; transpose coefficients - punpckldq mm0,mm1 ; mm0=(00 10)=data0 - punpckhdq mm4,mm1 ; mm4=(01 11)=data1 - movq mm5,mm2 ; transpose coefficients - punpckldq mm2,mm3 ; mm2=(06 16)=data6 - punpckhdq mm5,mm3 ; mm5=(07 17)=data7 - - movq mm6,mm4 - movq mm7,mm0 - pfsub mm4,mm2 ; mm4=data1-data6=tmp6 - pfsub mm0,mm5 ; mm0=data0-data7=tmp7 - pfadd mm6,mm2 ; mm6=data1+data6=tmp1 - pfadd mm7,mm5 ; mm7=data0+data7=tmp0 - - movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] - movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] - movq mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)] - movq mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)] - - ; mm1=(02 03), mm3=(12 13), mm2=(04 05), mm5=(14 15) - - movq MMWORD [wk(0)], mm4 ; wk(0)=tmp6 - movq MMWORD [wk(1)], mm0 ; wk(1)=tmp7 - - movq mm4,mm1 ; transpose coefficients - punpckldq mm1,mm3 ; mm1=(02 12)=data2 - punpckhdq mm4,mm3 ; mm4=(03 13)=data3 - movq mm0,mm2 ; transpose coefficients - punpckldq mm2,mm5 ; mm2=(04 14)=data4 - punpckhdq mm0,mm5 ; mm0=(05 15)=data5 - - movq mm3,mm4 - movq mm5,mm1 - pfadd mm4,mm2 ; mm4=data3+data4=tmp3 - pfadd mm1,mm0 ; mm1=data2+data5=tmp2 - pfsub 
mm3,mm2 ; mm3=data3-data4=tmp4 - pfsub mm5,mm0 ; mm5=data2-data5=tmp5 - - ; -- Even part - - movq mm2,mm7 - movq mm0,mm6 - pfsub mm7,mm4 ; mm7=tmp13 - pfsub mm6,mm1 ; mm6=tmp12 - pfadd mm2,mm4 ; mm2=tmp10 - pfadd mm0,mm1 ; mm0=tmp11 - - pfadd mm6,mm7 - pfmul mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1 - - movq mm4,mm2 - movq mm1,mm7 - pfsub mm2,mm0 ; mm2=data4 - pfsub mm7,mm6 ; mm7=data6 - pfadd mm4,mm0 ; mm4=data0 - pfadd mm1,mm6 ; mm1=data2 - - movq MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2 - movq MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7 - movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4 - movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1 - - ; -- Odd part - - movq mm0, MMWORD [wk(0)] ; mm0=tmp6 - movq mm6, MMWORD [wk(1)] ; mm6=tmp7 - - pfadd mm3,mm5 ; mm3=tmp10 - pfadd mm5,mm0 ; mm5=tmp11 - pfadd mm0,mm6 ; mm0=tmp12, mm6=tmp7 - - pfmul mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3 - - movq mm2,mm3 ; mm2=tmp10 - pfsub mm3,mm0 - pfmul mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5 - pfmul mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610) - pfmul mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296) - pfadd mm2,mm3 ; mm2=z2 - pfadd mm0,mm3 ; mm0=z4 - - movq mm7,mm6 - pfsub mm6,mm5 ; mm6=z13 - pfadd mm7,mm5 ; mm7=z11 - - movq mm4,mm6 - movq mm1,mm7 - pfsub mm6,mm2 ; mm6=data3 - pfsub mm7,mm0 ; mm7=data7 - pfadd mm4,mm2 ; mm4=data5 - pfadd mm1,mm0 ; mm1=data1 - - movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6 - movq MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7 - movq MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4 - movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1 - - add edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT - dec ecx - jnz near .rowloop - - ; ---- Pass 2: process columns. 
- - mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) - mov ecx, DCTSIZE/2 - alignx 16,7 + movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] + movq mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] + movq mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)] + + ; mm0=(00 01), mm1=(10 11), mm2=(06 07), mm3=(16 17) + + movq mm4,mm0 ; transpose coefficients + punpckldq mm0,mm1 ; mm0=(00 10)=data0 + punpckhdq mm4,mm1 ; mm4=(01 11)=data1 + movq mm5,mm2 ; transpose coefficients + punpckldq mm2,mm3 ; mm2=(06 16)=data6 + punpckhdq mm5,mm3 ; mm5=(07 17)=data7 + + movq mm6,mm4 + movq mm7,mm0 + pfsub mm4,mm2 ; mm4=data1-data6=tmp6 + pfsub mm0,mm5 ; mm0=data0-data7=tmp7 + pfadd mm6,mm2 ; mm6=data1+data6=tmp1 + pfadd mm7,mm5 ; mm7=data0+data7=tmp0 + + movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] + movq mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)] + movq mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)] + + ; mm1=(02 03), mm3=(12 13), mm2=(04 05), mm5=(14 15) + + movq MMWORD [wk(0)], mm4 ; wk(0)=tmp6 + movq MMWORD [wk(1)], mm0 ; wk(1)=tmp7 + + movq mm4,mm1 ; transpose coefficients + punpckldq mm1,mm3 ; mm1=(02 12)=data2 + punpckhdq mm4,mm3 ; mm4=(03 13)=data3 + movq mm0,mm2 ; transpose coefficients + punpckldq mm2,mm5 ; mm2=(04 14)=data4 + punpckhdq mm0,mm5 ; mm0=(05 15)=data5 + + movq mm3,mm4 + movq mm5,mm1 + pfadd mm4,mm2 ; mm4=data3+data4=tmp3 + pfadd mm1,mm0 ; mm1=data2+data5=tmp2 + pfsub mm3,mm2 ; mm3=data3-data4=tmp4 + pfsub mm5,mm0 ; mm5=data2-data5=tmp5 + + ; -- Even part + + movq mm2,mm7 + movq mm0,mm6 + pfsub mm7,mm4 ; mm7=tmp13 + pfsub mm6,mm1 ; mm6=tmp12 + pfadd mm2,mm4 ; mm2=tmp10 + pfadd mm0,mm1 ; mm0=tmp11 + + pfadd mm6,mm7 + pfmul mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1 + + movq mm4,mm2 + movq mm1,mm7 + pfsub mm2,mm0 ; mm2=data4 + pfsub mm7,mm6 ; mm7=data6 + pfadd mm4,mm0 ; mm4=data0 + pfadd mm1,mm6 ; mm1=data2 + + movq MMWORD 
[MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2 + movq MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7 + movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4 + movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1 + + ; -- Odd part + + movq mm0, MMWORD [wk(0)] ; mm0=tmp6 + movq mm6, MMWORD [wk(1)] ; mm6=tmp7 + + pfadd mm3,mm5 ; mm3=tmp10 + pfadd mm5,mm0 ; mm5=tmp11 + pfadd mm0,mm6 ; mm0=tmp12, mm6=tmp7 + + pfmul mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3 + + movq mm2,mm3 ; mm2=tmp10 + pfsub mm3,mm0 + pfmul mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5 + pfmul mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610) + pfmul mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296) + pfadd mm2,mm3 ; mm2=z2 + pfadd mm0,mm3 ; mm0=z4 + + movq mm7,mm6 + pfsub mm6,mm5 ; mm6=z13 + pfadd mm7,mm5 ; mm7=z11 + + movq mm4,mm6 + movq mm1,mm7 + pfsub mm6,mm2 ; mm6=data3 + pfsub mm7,mm0 ; mm7=data7 + pfadd mm4,mm2 ; mm4=data5 + pfadd mm1,mm0 ; mm1=data1 + + movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6 + movq MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7 + movq MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4 + movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1 + + add edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT + dec ecx + jnz near .rowloop + + ; ---- Pass 2: process columns. 
+ + mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) + mov ecx, DCTSIZE/2 + alignx 16,7 .columnloop: - movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] - movq mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] - movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)] - movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)] - - ; mm0=(00 10), mm1=(01 11), mm2=(60 70), mm3=(61 71) - - movq mm4,mm0 ; transpose coefficients - punpckldq mm0,mm1 ; mm0=(00 01)=data0 - punpckhdq mm4,mm1 ; mm4=(10 11)=data1 - movq mm5,mm2 ; transpose coefficients - punpckldq mm2,mm3 ; mm2=(60 61)=data6 - punpckhdq mm5,mm3 ; mm5=(70 71)=data7 - - movq mm6,mm4 - movq mm7,mm0 - pfsub mm4,mm2 ; mm4=data1-data6=tmp6 - pfsub mm0,mm5 ; mm0=data0-data7=tmp7 - pfadd mm6,mm2 ; mm6=data1+data6=tmp1 - pfadd mm7,mm5 ; mm7=data0+data7=tmp0 - - movq mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)] - movq mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)] - movq mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)] - movq mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)] - - ; mm1=(20 30), mm3=(21 31), mm2=(40 50), mm5=(41 51) - - movq MMWORD [wk(0)], mm4 ; wk(0)=tmp6 - movq MMWORD [wk(1)], mm0 ; wk(1)=tmp7 - - movq mm4,mm1 ; transpose coefficients - punpckldq mm1,mm3 ; mm1=(20 21)=data2 - punpckhdq mm4,mm3 ; mm4=(30 31)=data3 - movq mm0,mm2 ; transpose coefficients - punpckldq mm2,mm5 ; mm2=(40 41)=data4 - punpckhdq mm0,mm5 ; mm0=(50 51)=data5 - - movq mm3,mm4 - movq mm5,mm1 - pfadd mm4,mm2 ; mm4=data3+data4=tmp3 - pfadd mm1,mm0 ; mm1=data2+data5=tmp2 - pfsub mm3,mm2 ; mm3=data3-data4=tmp4 - pfsub mm5,mm0 ; mm5=data2-data5=tmp5 - - ; -- Even part - - movq mm2,mm7 - movq mm0,mm6 - pfsub mm7,mm4 ; mm7=tmp13 - pfsub mm6,mm1 ; mm6=tmp12 - pfadd mm2,mm4 ; mm2=tmp10 - pfadd mm0,mm1 ; mm0=tmp11 - - pfadd mm6,mm7 - pfmul mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1 - - movq mm4,mm2 - movq mm1,mm7 - pfsub mm2,mm0 ; mm2=data4 - pfsub mm7,mm6 ; mm7=data6 - pfadd mm4,mm0 ; mm4=data0 - pfadd mm1,mm6 ; mm1=data2 - - movq MMWORD 
[MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2 - movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7 - movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4 - movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1 - - ; -- Odd part - - movq mm0, MMWORD [wk(0)] ; mm0=tmp6 - movq mm6, MMWORD [wk(1)] ; mm6=tmp7 - - pfadd mm3,mm5 ; mm3=tmp10 - pfadd mm5,mm0 ; mm5=tmp11 - pfadd mm0,mm6 ; mm0=tmp12, mm6=tmp7 - - pfmul mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3 - - movq mm2,mm3 ; mm2=tmp10 - pfsub mm3,mm0 - pfmul mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5 - pfmul mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610) - pfmul mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296) - pfadd mm2,mm3 ; mm2=z2 - pfadd mm0,mm3 ; mm0=z4 - - movq mm7,mm6 - pfsub mm6,mm5 ; mm6=z13 - pfadd mm7,mm5 ; mm7=z11 - - movq mm4,mm6 - movq mm1,mm7 - pfsub mm6,mm2 ; mm6=data3 - pfsub mm7,mm0 ; mm7=data7 - pfadd mm4,mm2 ; mm4=data5 - pfadd mm1,mm0 ; mm1=data1 - - movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6 - movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7 - movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4 - movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1 - - add edx, byte 2*SIZEOF_FAST_FLOAT - dec ecx - jnz near .columnloop - - femms ; empty MMX/3DNow! 
state - -; pop edi ; unused -; pop esi ; unused -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - poppic ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] + movq mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] + movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)] + + ; mm0=(00 10), mm1=(01 11), mm2=(60 70), mm3=(61 71) + + movq mm4,mm0 ; transpose coefficients + punpckldq mm0,mm1 ; mm0=(00 01)=data0 + punpckhdq mm4,mm1 ; mm4=(10 11)=data1 + movq mm5,mm2 ; transpose coefficients + punpckldq mm2,mm3 ; mm2=(60 61)=data6 + punpckhdq mm5,mm3 ; mm5=(70 71)=data7 + + movq mm6,mm4 + movq mm7,mm0 + pfsub mm4,mm2 ; mm4=data1-data6=tmp6 + pfsub mm0,mm5 ; mm0=data0-data7=tmp7 + pfadd mm6,mm2 ; mm6=data1+data6=tmp1 + pfadd mm7,mm5 ; mm7=data0+data7=tmp0 + + movq mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)] + movq mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)] + movq mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)] + + ; mm1=(20 30), mm3=(21 31), mm2=(40 50), mm5=(41 51) + + movq MMWORD [wk(0)], mm4 ; wk(0)=tmp6 + movq MMWORD [wk(1)], mm0 ; wk(1)=tmp7 + + movq mm4,mm1 ; transpose coefficients + punpckldq mm1,mm3 ; mm1=(20 21)=data2 + punpckhdq mm4,mm3 ; mm4=(30 31)=data3 + movq mm0,mm2 ; transpose coefficients + punpckldq mm2,mm5 ; mm2=(40 41)=data4 + punpckhdq mm0,mm5 ; mm0=(50 51)=data5 + + movq mm3,mm4 + movq mm5,mm1 + pfadd mm4,mm2 ; mm4=data3+data4=tmp3 + pfadd mm1,mm0 ; mm1=data2+data5=tmp2 + pfsub mm3,mm2 ; mm3=data3-data4=tmp4 + pfsub mm5,mm0 ; mm5=data2-data5=tmp5 + + ; -- Even part + + movq mm2,mm7 + movq mm0,mm6 + pfsub mm7,mm4 ; mm7=tmp13 + pfsub mm6,mm1 ; mm6=tmp12 + pfadd mm2,mm4 ; mm2=tmp10 + pfadd mm0,mm1 ; mm0=tmp11 + + pfadd mm6,mm7 + pfmul mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1 + + movq mm4,mm2 + movq mm1,mm7 + pfsub mm2,mm0 ; 
mm2=data4 + pfsub mm7,mm6 ; mm7=data6 + pfadd mm4,mm0 ; mm4=data0 + pfadd mm1,mm6 ; mm1=data2 + + movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2 + movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7 + movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4 + movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1 + + ; -- Odd part + + movq mm0, MMWORD [wk(0)] ; mm0=tmp6 + movq mm6, MMWORD [wk(1)] ; mm6=tmp7 + + pfadd mm3,mm5 ; mm3=tmp10 + pfadd mm5,mm0 ; mm5=tmp11 + pfadd mm0,mm6 ; mm0=tmp12, mm6=tmp7 + + pfmul mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3 + + movq mm2,mm3 ; mm2=tmp10 + pfsub mm3,mm0 + pfmul mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5 + pfmul mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610) + pfmul mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296) + pfadd mm2,mm3 ; mm2=z2 + pfadd mm0,mm3 ; mm0=z4 + + movq mm7,mm6 + pfsub mm6,mm5 ; mm6=z13 + pfadd mm7,mm5 ; mm7=z11 + + movq mm4,mm6 + movq mm1,mm7 + pfsub mm6,mm2 ; mm6=data3 + pfsub mm7,mm0 ; mm7=data7 + pfadd mm4,mm2 ; mm4=data5 + pfadd mm1,mm0 ; mm1=data1 + + movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6 + movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7 + movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4 + movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1 + + add edx, byte 2*SIZEOF_FAST_FLOAT + dec ecx + jnz near .columnloop + + femms ; empty MMX/3DNow! state + +; pop edi ; unused +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + poppic ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/jfmmxfst.asm b/simd/jfmmxfst.asm index 0647242a9..d8cd4b9e9 100644 --- a/simd/jfmmxfst.asm +++ b/simd/jfmmxfst.asm @@ -26,24 +26,24 @@ ; -------------------------------------------------------------------------- -%define CONST_BITS 8 ; 14 is also OK. 
+%define CONST_BITS 8 ; 14 is also OK. %if CONST_BITS == 8 -F_0_382 equ 98 ; FIX(0.382683433) -F_0_541 equ 139 ; FIX(0.541196100) -F_0_707 equ 181 ; FIX(0.707106781) -F_1_306 equ 334 ; FIX(1.306562965) +F_0_382 equ 98 ; FIX(0.382683433) +F_0_541 equ 139 ; FIX(0.541196100) +F_0_707 equ 181 ; FIX(0.707106781) +F_1_306 equ 334 ; FIX(1.306562965) %else ; NASM cannot do compile-time arithmetic on floating-point constants. %define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) -F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433) -F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) -F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781) -F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965) +F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433) +F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) +F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781) +F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965) %endif ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST ; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) ; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) @@ -51,21 +51,21 @@ F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965) %define PRE_MULTIPLY_SCALE_BITS 2 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) - alignz 16 - global EXTN(jconst_fdct_ifast_mmx) + alignz 16 + global EXTN(jconst_fdct_ifast_mmx) EXTN(jconst_fdct_ifast_mmx): -PW_F0707 times 4 dw F_0_707 << CONST_SHIFT -PW_F0382 times 4 dw F_0_382 << CONST_SHIFT -PW_F0541 times 4 dw F_0_541 << CONST_SHIFT -PW_F1306 times 4 dw F_1_306 << CONST_SHIFT +PW_F0707 times 4 dw F_0_707 << CONST_SHIFT +PW_F0382 times 4 dw F_0_382 << CONST_SHIFT +PW_F0541 times 4 dw F_0_541 << CONST_SHIFT +PW_F1306 times 4 dw F_1_306 << CONST_SHIFT - alignz 16 + alignz 16 ; 
-------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 ; ; Perform the forward DCT on one block of samples. ; @@ -73,325 +73,325 @@ PW_F1306 times 4 dw F_1_306 << CONST_SHIFT ; jsimd_fdct_ifast_mmx (DCTELEM * data) ; -%define data(b) (b)+8 ; DCTELEM * data +%define data(b) (b)+8 ; DCTELEM * data -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] -%define WK_NUM 2 +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 2 - align 16 - global EXTN(jsimd_fdct_ifast_mmx) + align 16 + global EXTN(jsimd_fdct_ifast_mmx) EXTN(jsimd_fdct_ifast_mmx): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved -; push esi ; unused -; push edi ; unused - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process rows. - - mov edx, POINTER [data(eax)] ; (DCTELEM *) - mov ecx, DCTSIZE/4 - alignx 16,7 + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved +; push esi ; unused +; push edi ; unused + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process rows. 
+ + mov edx, POINTER [data(eax)] ; (DCTELEM *) + mov ecx, DCTSIZE/4 + alignx 16,7 .rowloop: - movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)] - movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)] - movq mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)] - movq mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)] - - ; mm0=(20 21 22 23), mm2=(24 25 26 27) - ; mm1=(30 31 32 33), mm3=(34 35 36 37) - - movq mm4,mm0 ; transpose coefficients(phase 1) - punpcklwd mm0,mm1 ; mm0=(20 30 21 31) - punpckhwd mm4,mm1 ; mm4=(22 32 23 33) - movq mm5,mm2 ; transpose coefficients(phase 1) - punpcklwd mm2,mm3 ; mm2=(24 34 25 35) - punpckhwd mm5,mm3 ; mm5=(26 36 27 37) - - movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)] - movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)] - movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)] - movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)] - - ; mm6=(00 01 02 03), mm1=(04 05 06 07) - ; mm7=(10 11 12 13), mm3=(14 15 16 17) - - movq MMWORD [wk(0)], mm4 ; wk(0)=(22 32 23 33) - movq MMWORD [wk(1)], mm2 ; wk(1)=(24 34 25 35) - - movq mm4,mm6 ; transpose coefficients(phase 1) - punpcklwd mm6,mm7 ; mm6=(00 10 01 11) - punpckhwd mm4,mm7 ; mm4=(02 12 03 13) - movq mm2,mm1 ; transpose coefficients(phase 1) - punpcklwd mm1,mm3 ; mm1=(04 14 05 15) - punpckhwd mm2,mm3 ; mm2=(06 16 07 17) - - movq mm7,mm6 ; transpose coefficients(phase 2) - punpckldq mm6,mm0 ; mm6=(00 10 20 30)=data0 - punpckhdq mm7,mm0 ; mm7=(01 11 21 31)=data1 - movq mm3,mm2 ; transpose coefficients(phase 2) - punpckldq mm2,mm5 ; mm2=(06 16 26 36)=data6 - punpckhdq mm3,mm5 ; mm3=(07 17 27 37)=data7 - - movq mm0,mm7 - movq mm5,mm6 - psubw mm7,mm2 ; mm7=data1-data6=tmp6 - psubw mm6,mm3 ; mm6=data0-data7=tmp7 - paddw mm0,mm2 ; mm0=data1+data6=tmp1 - paddw mm5,mm3 ; mm5=data0+data7=tmp0 - - movq mm2, MMWORD [wk(0)] ; mm2=(22 32 23 33) - movq mm3, MMWORD [wk(1)] ; mm3=(24 34 25 35) - movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6 - movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7 - - movq mm7,mm4 ; transpose 
coefficients(phase 2) - punpckldq mm4,mm2 ; mm4=(02 12 22 32)=data2 - punpckhdq mm7,mm2 ; mm7=(03 13 23 33)=data3 - movq mm6,mm1 ; transpose coefficients(phase 2) - punpckldq mm1,mm3 ; mm1=(04 14 24 34)=data4 - punpckhdq mm6,mm3 ; mm6=(05 15 25 35)=data5 - - movq mm2,mm7 - movq mm3,mm4 - paddw mm7,mm1 ; mm7=data3+data4=tmp3 - paddw mm4,mm6 ; mm4=data2+data5=tmp2 - psubw mm2,mm1 ; mm2=data3-data4=tmp4 - psubw mm3,mm6 ; mm3=data2-data5=tmp5 - - ; -- Even part - - movq mm1,mm5 - movq mm6,mm0 - psubw mm5,mm7 ; mm5=tmp13 - psubw mm0,mm4 ; mm0=tmp12 - paddw mm1,mm7 ; mm1=tmp10 - paddw mm6,mm4 ; mm6=tmp11 - - paddw mm0,mm5 - psllw mm0,PRE_MULTIPLY_SCALE_BITS - pmulhw mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1 - - movq mm7,mm1 - movq mm4,mm5 - psubw mm1,mm6 ; mm1=data4 - psubw mm5,mm0 ; mm5=data6 - paddw mm7,mm6 ; mm7=data0 - paddw mm4,mm0 ; mm4=data2 - - movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm1 - movq MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm5 - movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7 - movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4 - - ; -- Odd part - - movq mm6, MMWORD [wk(0)] ; mm6=tmp6 - movq mm0, MMWORD [wk(1)] ; mm0=tmp7 - - paddw mm2,mm3 ; mm2=tmp10 - paddw mm3,mm6 ; mm3=tmp11 - paddw mm6,mm0 ; mm6=tmp12, mm0=tmp7 - - psllw mm2,PRE_MULTIPLY_SCALE_BITS - psllw mm6,PRE_MULTIPLY_SCALE_BITS - - psllw mm3,PRE_MULTIPLY_SCALE_BITS - pmulhw mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3 - - movq mm1,mm2 ; mm1=tmp10 - psubw mm2,mm6 - pmulhw mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5 - pmulhw mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610) - pmulhw mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296) - paddw mm1,mm2 ; mm1=z2 - paddw mm6,mm2 ; mm6=z4 - - movq mm5,mm0 - psubw mm0,mm3 ; mm0=z13 - paddw mm5,mm3 ; mm5=z11 - - movq mm7,mm0 - movq mm4,mm5 - psubw mm0,mm1 ; mm0=data3 - psubw mm5,mm6 ; mm5=data7 - paddw mm7,mm1 ; mm7=data5 - paddw mm4,mm6 ; mm4=data1 - - movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0 - movq MMWORD 
[MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm5 - movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm7 - movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4 - - add edx, byte 4*DCTSIZE*SIZEOF_DCTELEM - dec ecx - jnz near .rowloop - - ; ---- Pass 2: process columns. - - mov edx, POINTER [data(eax)] ; (DCTELEM *) - mov ecx, DCTSIZE/4 - alignx 16,7 + movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)] + movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)] + movq mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)] + + ; mm0=(20 21 22 23), mm2=(24 25 26 27) + ; mm1=(30 31 32 33), mm3=(34 35 36 37) + + movq mm4,mm0 ; transpose coefficients(phase 1) + punpcklwd mm0,mm1 ; mm0=(20 30 21 31) + punpckhwd mm4,mm1 ; mm4=(22 32 23 33) + movq mm5,mm2 ; transpose coefficients(phase 1) + punpcklwd mm2,mm3 ; mm2=(24 34 25 35) + punpckhwd mm5,mm3 ; mm5=(26 36 27 37) + + movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)] + movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)] + movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)] + + ; mm6=(00 01 02 03), mm1=(04 05 06 07) + ; mm7=(10 11 12 13), mm3=(14 15 16 17) + + movq MMWORD [wk(0)], mm4 ; wk(0)=(22 32 23 33) + movq MMWORD [wk(1)], mm2 ; wk(1)=(24 34 25 35) + + movq mm4,mm6 ; transpose coefficients(phase 1) + punpcklwd mm6,mm7 ; mm6=(00 10 01 11) + punpckhwd mm4,mm7 ; mm4=(02 12 03 13) + movq mm2,mm1 ; transpose coefficients(phase 1) + punpcklwd mm1,mm3 ; mm1=(04 14 05 15) + punpckhwd mm2,mm3 ; mm2=(06 16 07 17) + + movq mm7,mm6 ; transpose coefficients(phase 2) + punpckldq mm6,mm0 ; mm6=(00 10 20 30)=data0 + punpckhdq mm7,mm0 ; mm7=(01 11 21 31)=data1 + movq mm3,mm2 ; transpose coefficients(phase 2) + punpckldq mm2,mm5 ; mm2=(06 16 26 36)=data6 + punpckhdq mm3,mm5 ; mm3=(07 17 27 37)=data7 + + movq mm0,mm7 + movq mm5,mm6 + psubw mm7,mm2 ; mm7=data1-data6=tmp6 + psubw mm6,mm3 ; mm6=data0-data7=tmp7 + paddw mm0,mm2 ; mm0=data1+data6=tmp1 + paddw mm5,mm3 
; mm5=data0+data7=tmp0 + + movq mm2, MMWORD [wk(0)] ; mm2=(22 32 23 33) + movq mm3, MMWORD [wk(1)] ; mm3=(24 34 25 35) + movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6 + movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7 + + movq mm7,mm4 ; transpose coefficients(phase 2) + punpckldq mm4,mm2 ; mm4=(02 12 22 32)=data2 + punpckhdq mm7,mm2 ; mm7=(03 13 23 33)=data3 + movq mm6,mm1 ; transpose coefficients(phase 2) + punpckldq mm1,mm3 ; mm1=(04 14 24 34)=data4 + punpckhdq mm6,mm3 ; mm6=(05 15 25 35)=data5 + + movq mm2,mm7 + movq mm3,mm4 + paddw mm7,mm1 ; mm7=data3+data4=tmp3 + paddw mm4,mm6 ; mm4=data2+data5=tmp2 + psubw mm2,mm1 ; mm2=data3-data4=tmp4 + psubw mm3,mm6 ; mm3=data2-data5=tmp5 + + ; -- Even part + + movq mm1,mm5 + movq mm6,mm0 + psubw mm5,mm7 ; mm5=tmp13 + psubw mm0,mm4 ; mm0=tmp12 + paddw mm1,mm7 ; mm1=tmp10 + paddw mm6,mm4 ; mm6=tmp11 + + paddw mm0,mm5 + psllw mm0,PRE_MULTIPLY_SCALE_BITS + pmulhw mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1 + + movq mm7,mm1 + movq mm4,mm5 + psubw mm1,mm6 ; mm1=data4 + psubw mm5,mm0 ; mm5=data6 + paddw mm7,mm6 ; mm7=data0 + paddw mm4,mm0 ; mm4=data2 + + movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm1 + movq MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm5 + movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7 + movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4 + + ; -- Odd part + + movq mm6, MMWORD [wk(0)] ; mm6=tmp6 + movq mm0, MMWORD [wk(1)] ; mm0=tmp7 + + paddw mm2,mm3 ; mm2=tmp10 + paddw mm3,mm6 ; mm3=tmp11 + paddw mm6,mm0 ; mm6=tmp12, mm0=tmp7 + + psllw mm2,PRE_MULTIPLY_SCALE_BITS + psllw mm6,PRE_MULTIPLY_SCALE_BITS + + psllw mm3,PRE_MULTIPLY_SCALE_BITS + pmulhw mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3 + + movq mm1,mm2 ; mm1=tmp10 + psubw mm2,mm6 + pmulhw mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5 + pmulhw mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610) + pmulhw mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296) + paddw mm1,mm2 ; mm1=z2 + paddw mm6,mm2 ; mm6=z4 + + movq mm5,mm0 + psubw mm0,mm3 ; mm0=z13 + paddw mm5,mm3 ; mm5=z11 + + 
movq mm7,mm0 + movq mm4,mm5 + psubw mm0,mm1 ; mm0=data3 + psubw mm5,mm6 ; mm5=data7 + paddw mm7,mm1 ; mm7=data5 + paddw mm4,mm6 ; mm4=data1 + + movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0 + movq MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm5 + movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm7 + movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4 + + add edx, byte 4*DCTSIZE*SIZEOF_DCTELEM + dec ecx + jnz near .rowloop + + ; ---- Pass 2: process columns. + + mov edx, POINTER [data(eax)] ; (DCTELEM *) + mov ecx, DCTSIZE/4 + alignx 16,7 .columnloop: - movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)] - movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)] - movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)] - movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)] - - ; mm0=(02 12 22 32), mm2=(42 52 62 72) - ; mm1=(03 13 23 33), mm3=(43 53 63 73) - - movq mm4,mm0 ; transpose coefficients(phase 1) - punpcklwd mm0,mm1 ; mm0=(02 03 12 13) - punpckhwd mm4,mm1 ; mm4=(22 23 32 33) - movq mm5,mm2 ; transpose coefficients(phase 1) - punpcklwd mm2,mm3 ; mm2=(42 43 52 53) - punpckhwd mm5,mm3 ; mm5=(62 63 72 73) - - movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)] - movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)] - movq mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)] - movq mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)] - - ; mm6=(00 10 20 30), mm1=(40 50 60 70) - ; mm7=(01 11 21 31), mm3=(41 51 61 71) - - movq MMWORD [wk(0)], mm4 ; wk(0)=(22 23 32 33) - movq MMWORD [wk(1)], mm2 ; wk(1)=(42 43 52 53) - - movq mm4,mm6 ; transpose coefficients(phase 1) - punpcklwd mm6,mm7 ; mm6=(00 01 10 11) - punpckhwd mm4,mm7 ; mm4=(20 21 30 31) - movq mm2,mm1 ; transpose coefficients(phase 1) - punpcklwd mm1,mm3 ; mm1=(40 41 50 51) - punpckhwd mm2,mm3 ; mm2=(60 61 70 71) - - movq mm7,mm6 ; transpose coefficients(phase 2) - punpckldq mm6,mm0 ; mm6=(00 01 02 03)=data0 - punpckhdq mm7,mm0 ; mm7=(10 11 12 13)=data1 - movq mm3,mm2 ; transpose coefficients(phase 2) - punpckldq mm2,mm5 ; mm2=(60 61 62 
63)=data6 - punpckhdq mm3,mm5 ; mm3=(70 71 72 73)=data7 - - movq mm0,mm7 - movq mm5,mm6 - psubw mm7,mm2 ; mm7=data1-data6=tmp6 - psubw mm6,mm3 ; mm6=data0-data7=tmp7 - paddw mm0,mm2 ; mm0=data1+data6=tmp1 - paddw mm5,mm3 ; mm5=data0+data7=tmp0 - - movq mm2, MMWORD [wk(0)] ; mm2=(22 23 32 33) - movq mm3, MMWORD [wk(1)] ; mm3=(42 43 52 53) - movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6 - movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7 - - movq mm7,mm4 ; transpose coefficients(phase 2) - punpckldq mm4,mm2 ; mm4=(20 21 22 23)=data2 - punpckhdq mm7,mm2 ; mm7=(30 31 32 33)=data3 - movq mm6,mm1 ; transpose coefficients(phase 2) - punpckldq mm1,mm3 ; mm1=(40 41 42 43)=data4 - punpckhdq mm6,mm3 ; mm6=(50 51 52 53)=data5 - - movq mm2,mm7 - movq mm3,mm4 - paddw mm7,mm1 ; mm7=data3+data4=tmp3 - paddw mm4,mm6 ; mm4=data2+data5=tmp2 - psubw mm2,mm1 ; mm2=data3-data4=tmp4 - psubw mm3,mm6 ; mm3=data2-data5=tmp5 - - ; -- Even part - - movq mm1,mm5 - movq mm6,mm0 - psubw mm5,mm7 ; mm5=tmp13 - psubw mm0,mm4 ; mm0=tmp12 - paddw mm1,mm7 ; mm1=tmp10 - paddw mm6,mm4 ; mm6=tmp11 - - paddw mm0,mm5 - psllw mm0,PRE_MULTIPLY_SCALE_BITS - pmulhw mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1 - - movq mm7,mm1 - movq mm4,mm5 - psubw mm1,mm6 ; mm1=data4 - psubw mm5,mm0 ; mm5=data6 - paddw mm7,mm6 ; mm7=data0 - paddw mm4,mm0 ; mm4=data2 - - movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm1 - movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm5 - movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7 - movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4 - - ; -- Odd part - - movq mm6, MMWORD [wk(0)] ; mm6=tmp6 - movq mm0, MMWORD [wk(1)] ; mm0=tmp7 - - paddw mm2,mm3 ; mm2=tmp10 - paddw mm3,mm6 ; mm3=tmp11 - paddw mm6,mm0 ; mm6=tmp12, mm0=tmp7 - - psllw mm2,PRE_MULTIPLY_SCALE_BITS - psllw mm6,PRE_MULTIPLY_SCALE_BITS - - psllw mm3,PRE_MULTIPLY_SCALE_BITS - pmulhw mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3 - - movq mm1,mm2 ; mm1=tmp10 - psubw mm2,mm6 - pmulhw mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5 - pmulhw mm1,[GOTOFF(ebx,PW_F0541)] ; 
mm1=MULTIPLY(tmp10,FIX_0_54119610) - pmulhw mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296) - paddw mm1,mm2 ; mm1=z2 - paddw mm6,mm2 ; mm6=z4 - - movq mm5,mm0 - psubw mm0,mm3 ; mm0=z13 - paddw mm5,mm3 ; mm5=z11 - - movq mm7,mm0 - movq mm4,mm5 - psubw mm0,mm1 ; mm0=data3 - psubw mm5,mm6 ; mm5=data7 - paddw mm7,mm1 ; mm7=data5 - paddw mm4,mm6 ; mm4=data1 - - movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0 - movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm5 - movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm7 - movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4 - - add edx, byte 4*SIZEOF_DCTELEM - dec ecx - jnz near .columnloop - - emms ; empty MMX state - -; pop edi ; unused -; pop esi ; unused -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - poppic ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)] + movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)] + movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)] + + ; mm0=(02 12 22 32), mm2=(42 52 62 72) + ; mm1=(03 13 23 33), mm3=(43 53 63 73) + + movq mm4,mm0 ; transpose coefficients(phase 1) + punpcklwd mm0,mm1 ; mm0=(02 03 12 13) + punpckhwd mm4,mm1 ; mm4=(22 23 32 33) + movq mm5,mm2 ; transpose coefficients(phase 1) + punpcklwd mm2,mm3 ; mm2=(42 43 52 53) + punpckhwd mm5,mm3 ; mm5=(62 63 72 73) + + movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)] + movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)] + movq mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)] + + ; mm6=(00 10 20 30), mm1=(40 50 60 70) + ; mm7=(01 11 21 31), mm3=(41 51 61 71) + + movq MMWORD [wk(0)], mm4 ; wk(0)=(22 23 32 33) + movq MMWORD [wk(1)], mm2 ; wk(1)=(42 43 52 53) + + movq mm4,mm6 ; transpose coefficients(phase 1) + punpcklwd mm6,mm7 ; mm6=(00 01 10 11) + punpckhwd mm4,mm7 ; mm4=(20 21 30 31) + movq mm2,mm1 ; transpose 
coefficients(phase 1) + punpcklwd mm1,mm3 ; mm1=(40 41 50 51) + punpckhwd mm2,mm3 ; mm2=(60 61 70 71) + + movq mm7,mm6 ; transpose coefficients(phase 2) + punpckldq mm6,mm0 ; mm6=(00 01 02 03)=data0 + punpckhdq mm7,mm0 ; mm7=(10 11 12 13)=data1 + movq mm3,mm2 ; transpose coefficients(phase 2) + punpckldq mm2,mm5 ; mm2=(60 61 62 63)=data6 + punpckhdq mm3,mm5 ; mm3=(70 71 72 73)=data7 + + movq mm0,mm7 + movq mm5,mm6 + psubw mm7,mm2 ; mm7=data1-data6=tmp6 + psubw mm6,mm3 ; mm6=data0-data7=tmp7 + paddw mm0,mm2 ; mm0=data1+data6=tmp1 + paddw mm5,mm3 ; mm5=data0+data7=tmp0 + + movq mm2, MMWORD [wk(0)] ; mm2=(22 23 32 33) + movq mm3, MMWORD [wk(1)] ; mm3=(42 43 52 53) + movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6 + movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7 + + movq mm7,mm4 ; transpose coefficients(phase 2) + punpckldq mm4,mm2 ; mm4=(20 21 22 23)=data2 + punpckhdq mm7,mm2 ; mm7=(30 31 32 33)=data3 + movq mm6,mm1 ; transpose coefficients(phase 2) + punpckldq mm1,mm3 ; mm1=(40 41 42 43)=data4 + punpckhdq mm6,mm3 ; mm6=(50 51 52 53)=data5 + + movq mm2,mm7 + movq mm3,mm4 + paddw mm7,mm1 ; mm7=data3+data4=tmp3 + paddw mm4,mm6 ; mm4=data2+data5=tmp2 + psubw mm2,mm1 ; mm2=data3-data4=tmp4 + psubw mm3,mm6 ; mm3=data2-data5=tmp5 + + ; -- Even part + + movq mm1,mm5 + movq mm6,mm0 + psubw mm5,mm7 ; mm5=tmp13 + psubw mm0,mm4 ; mm0=tmp12 + paddw mm1,mm7 ; mm1=tmp10 + paddw mm6,mm4 ; mm6=tmp11 + + paddw mm0,mm5 + psllw mm0,PRE_MULTIPLY_SCALE_BITS + pmulhw mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1 + + movq mm7,mm1 + movq mm4,mm5 + psubw mm1,mm6 ; mm1=data4 + psubw mm5,mm0 ; mm5=data6 + paddw mm7,mm6 ; mm7=data0 + paddw mm4,mm0 ; mm4=data2 + + movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm1 + movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm5 + movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7 + movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4 + + ; -- Odd part + + movq mm6, MMWORD [wk(0)] ; mm6=tmp6 + movq mm0, MMWORD [wk(1)] ; mm0=tmp7 + + paddw mm2,mm3 ; mm2=tmp10 + paddw mm3,mm6 ; mm3=tmp11 + 
paddw mm6,mm0 ; mm6=tmp12, mm0=tmp7 + + psllw mm2,PRE_MULTIPLY_SCALE_BITS + psllw mm6,PRE_MULTIPLY_SCALE_BITS + + psllw mm3,PRE_MULTIPLY_SCALE_BITS + pmulhw mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3 + + movq mm1,mm2 ; mm1=tmp10 + psubw mm2,mm6 + pmulhw mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5 + pmulhw mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610) + pmulhw mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296) + paddw mm1,mm2 ; mm1=z2 + paddw mm6,mm2 ; mm6=z4 + + movq mm5,mm0 + psubw mm0,mm3 ; mm0=z13 + paddw mm5,mm3 ; mm5=z11 + + movq mm7,mm0 + movq mm4,mm5 + psubw mm0,mm1 ; mm0=data3 + psubw mm5,mm6 ; mm5=data7 + paddw mm7,mm1 ; mm7=data5 + paddw mm4,mm6 ; mm4=data1 + + movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0 + movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm5 + movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm7 + movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4 + + add edx, byte 4*SIZEOF_DCTELEM + dec ecx + jnz near .columnloop + + emms ; empty MMX state + +; pop edi ; unused +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + poppic ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jfmmxint.asm b/simd/jfmmxint.asm index a7e73f73a..c16f26218 100644 --- a/simd/jfmmxint.asm +++ b/simd/jfmmxint.asm @@ -26,67 +26,67 @@ ; -------------------------------------------------------------------------- -%define CONST_BITS 13 -%define PASS1_BITS 2 +%define CONST_BITS 13 +%define PASS1_BITS 2 -%define DESCALE_P1 (CONST_BITS-PASS1_BITS) -%define DESCALE_P2 (CONST_BITS+PASS1_BITS) +%define DESCALE_P1 (CONST_BITS-PASS1_BITS) +%define DESCALE_P2 (CONST_BITS+PASS1_BITS) %if CONST_BITS == 13 -F_0_298 equ 2446 ; FIX(0.298631336) -F_0_390 equ 3196 ; FIX(0.390180644) -F_0_541 equ 4433 ; FIX(0.541196100) -F_0_765 equ 6270 ; FIX(0.765366865) -F_0_899 equ 7373 ; FIX(0.899976223) -F_1_175 equ 9633 ; FIX(1.175875602) -F_1_501 equ 12299 ; FIX(1.501321110) -F_1_847 equ 15137 ; FIX(1.847759065) -F_1_961 equ 16069 ; FIX(1.961570560) -F_2_053 equ 16819 ; FIX(2.053119869) -F_2_562 equ 20995 ; FIX(2.562915447) -F_3_072 equ 25172 ; FIX(3.072711026) +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) %else ; NASM cannot do compile-time arithmetic on floating-point constants. 
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) -F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) -F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) -F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) -F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) -F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) -F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) -F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) -F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) -F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) -F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) -F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) -F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) +F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) %endif ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_fdct_islow_mmx) + alignz 16 + global EXTN(jconst_fdct_islow_mmx) EXTN(jconst_fdct_islow_mmx): -PW_F130_F054 times 2 dw (F_0_541+F_0_765), F_0_541 -PW_F054_MF130 times 2 dw F_0_541, (F_0_541-F_1_847) 
-PW_MF078_F117 times 2 dw (F_1_175-F_1_961), F_1_175 -PW_F117_F078 times 2 dw F_1_175, (F_1_175-F_0_390) -PW_MF060_MF089 times 2 dw (F_0_298-F_0_899),-F_0_899 -PW_MF089_F060 times 2 dw -F_0_899, (F_1_501-F_0_899) -PW_MF050_MF256 times 2 dw (F_2_053-F_2_562),-F_2_562 -PW_MF256_F050 times 2 dw -F_2_562, (F_3_072-F_2_562) -PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1-1) -PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2-1) -PW_DESCALE_P2X times 4 dw 1 << (PASS1_BITS-1) +PW_F130_F054 times 2 dw (F_0_541+F_0_765), F_0_541 +PW_F054_MF130 times 2 dw F_0_541, (F_0_541-F_1_847) +PW_MF078_F117 times 2 dw (F_1_175-F_1_961), F_1_175 +PW_F117_F078 times 2 dw F_1_175, (F_1_175-F_0_390) +PW_MF060_MF089 times 2 dw (F_0_298-F_0_899),-F_0_899 +PW_MF089_F060 times 2 dw -F_0_899, (F_1_501-F_0_899) +PW_MF050_MF256 times 2 dw (F_2_053-F_2_562),-F_2_562 +PW_MF256_F050 times 2 dw -F_2_562, (F_3_072-F_2_562) +PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1-1) +PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2-1) +PW_DESCALE_P2X times 4 dw 1 << (PASS1_BITS-1) - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 ; ; Perform the forward DCT on one block of samples. 
; @@ -94,529 +94,529 @@ PW_DESCALE_P2X times 4 dw 1 << (PASS1_BITS-1) ; jsimd_fdct_islow_mmx (DCTELEM * data) ; -%define data(b) (b)+8 ; DCTELEM * data +%define data(b) (b)+8 ; DCTELEM * data -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] -%define WK_NUM 2 +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 2 - align 16 - global EXTN(jsimd_fdct_islow_mmx) + align 16 + global EXTN(jsimd_fdct_islow_mmx) EXTN(jsimd_fdct_islow_mmx): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved -; push esi ; unused -; push edi ; unused - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process rows. - - mov edx, POINTER [data(eax)] ; (DCTELEM *) - mov ecx, DCTSIZE/4 - alignx 16,7 + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved +; push esi ; unused +; push edi ; unused + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process rows. 
+ + mov edx, POINTER [data(eax)] ; (DCTELEM *) + mov ecx, DCTSIZE/4 + alignx 16,7 .rowloop: - movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)] - movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)] - movq mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)] - movq mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)] - - ; mm0=(20 21 22 23), mm2=(24 25 26 27) - ; mm1=(30 31 32 33), mm3=(34 35 36 37) - - movq mm4,mm0 ; transpose coefficients(phase 1) - punpcklwd mm0,mm1 ; mm0=(20 30 21 31) - punpckhwd mm4,mm1 ; mm4=(22 32 23 33) - movq mm5,mm2 ; transpose coefficients(phase 1) - punpcklwd mm2,mm3 ; mm2=(24 34 25 35) - punpckhwd mm5,mm3 ; mm5=(26 36 27 37) - - movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)] - movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)] - movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)] - movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)] - - ; mm6=(00 01 02 03), mm1=(04 05 06 07) - ; mm7=(10 11 12 13), mm3=(14 15 16 17) - - movq MMWORD [wk(0)], mm4 ; wk(0)=(22 32 23 33) - movq MMWORD [wk(1)], mm2 ; wk(1)=(24 34 25 35) - - movq mm4,mm6 ; transpose coefficients(phase 1) - punpcklwd mm6,mm7 ; mm6=(00 10 01 11) - punpckhwd mm4,mm7 ; mm4=(02 12 03 13) - movq mm2,mm1 ; transpose coefficients(phase 1) - punpcklwd mm1,mm3 ; mm1=(04 14 05 15) - punpckhwd mm2,mm3 ; mm2=(06 16 07 17) - - movq mm7,mm6 ; transpose coefficients(phase 2) - punpckldq mm6,mm0 ; mm6=(00 10 20 30)=data0 - punpckhdq mm7,mm0 ; mm7=(01 11 21 31)=data1 - movq mm3,mm2 ; transpose coefficients(phase 2) - punpckldq mm2,mm5 ; mm2=(06 16 26 36)=data6 - punpckhdq mm3,mm5 ; mm3=(07 17 27 37)=data7 - - movq mm0,mm7 - movq mm5,mm6 - psubw mm7,mm2 ; mm7=data1-data6=tmp6 - psubw mm6,mm3 ; mm6=data0-data7=tmp7 - paddw mm0,mm2 ; mm0=data1+data6=tmp1 - paddw mm5,mm3 ; mm5=data0+data7=tmp0 - - movq mm2, MMWORD [wk(0)] ; mm2=(22 32 23 33) - movq mm3, MMWORD [wk(1)] ; mm3=(24 34 25 35) - movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6 - movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7 - - movq mm7,mm4 ; transpose 
coefficients(phase 2) - punpckldq mm4,mm2 ; mm4=(02 12 22 32)=data2 - punpckhdq mm7,mm2 ; mm7=(03 13 23 33)=data3 - movq mm6,mm1 ; transpose coefficients(phase 2) - punpckldq mm1,mm3 ; mm1=(04 14 24 34)=data4 - punpckhdq mm6,mm3 ; mm6=(05 15 25 35)=data5 - - movq mm2,mm7 - movq mm3,mm4 - paddw mm7,mm1 ; mm7=data3+data4=tmp3 - paddw mm4,mm6 ; mm4=data2+data5=tmp2 - psubw mm2,mm1 ; mm2=data3-data4=tmp4 - psubw mm3,mm6 ; mm3=data2-data5=tmp5 - - ; -- Even part - - movq mm1,mm5 - movq mm6,mm0 - paddw mm5,mm7 ; mm5=tmp10 - paddw mm0,mm4 ; mm0=tmp11 - psubw mm1,mm7 ; mm1=tmp13 - psubw mm6,mm4 ; mm6=tmp12 - - movq mm7,mm5 - paddw mm5,mm0 ; mm5=tmp10+tmp11 - psubw mm7,mm0 ; mm7=tmp10-tmp11 - - psllw mm5,PASS1_BITS ; mm5=data0 - psllw mm7,PASS1_BITS ; mm7=data4 - - movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5 - movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7 - - ; (Original) - ; z1 = (tmp12 + tmp13) * 0.541196100; - ; data2 = z1 + tmp13 * 0.765366865; - ; data6 = z1 + tmp12 * -1.847759065; - ; - ; (This implementation) - ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; - ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); - - movq mm4,mm1 ; mm1=tmp13 - movq mm0,mm1 - punpcklwd mm4,mm6 ; mm6=tmp12 - punpckhwd mm0,mm6 - movq mm1,mm4 - movq mm6,mm0 - pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=data2L - pmaddwd mm0,[GOTOFF(ebx,PW_F130_F054)] ; mm0=data2H - pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=data6L - pmaddwd mm6,[GOTOFF(ebx,PW_F054_MF130)] ; mm6=data6H - - paddd mm4,[GOTOFF(ebx,PD_DESCALE_P1)] - paddd mm0,[GOTOFF(ebx,PD_DESCALE_P1)] - psrad mm4,DESCALE_P1 - psrad mm0,DESCALE_P1 - paddd mm1,[GOTOFF(ebx,PD_DESCALE_P1)] - paddd mm6,[GOTOFF(ebx,PD_DESCALE_P1)] - psrad mm1,DESCALE_P1 - psrad mm6,DESCALE_P1 - - packssdw mm4,mm0 ; mm4=data2 - packssdw mm1,mm6 ; mm1=data6 - - movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4 - movq MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1 - - ; -- Odd part - - movq mm5, MMWORD [wk(0)] ; 
mm5=tmp6 - movq mm7, MMWORD [wk(1)] ; mm7=tmp7 - - movq mm0,mm2 ; mm2=tmp4 - movq mm6,mm3 ; mm3=tmp5 - paddw mm0,mm5 ; mm0=z3 - paddw mm6,mm7 ; mm6=z4 - - ; (Original) - ; z5 = (z3 + z4) * 1.175875602; - ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; - ; z3 += z5; z4 += z5; - ; - ; (This implementation) - ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; - ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); - - movq mm4,mm0 - movq mm1,mm0 - punpcklwd mm4,mm6 - punpckhwd mm1,mm6 - movq mm0,mm4 - movq mm6,mm1 - pmaddwd mm4,[GOTOFF(ebx,PW_MF078_F117)] ; mm4=z3L - pmaddwd mm1,[GOTOFF(ebx,PW_MF078_F117)] ; mm1=z3H - pmaddwd mm0,[GOTOFF(ebx,PW_F117_F078)] ; mm0=z4L - pmaddwd mm6,[GOTOFF(ebx,PW_F117_F078)] ; mm6=z4H - - movq MMWORD [wk(0)], mm4 ; wk(0)=z3L - movq MMWORD [wk(1)], mm1 ; wk(1)=z3H - - ; (Original) - ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; - ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; - ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; - ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; - ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; - ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; - ; - ; (This implementation) - ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; - ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; - ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); - ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); - ; data7 = tmp4 + z3; data5 = tmp5 + z4; - ; data3 = tmp6 + z3; data1 = tmp7 + z4; - - movq mm4,mm2 - movq mm1,mm2 - punpcklwd mm4,mm7 - punpckhwd mm1,mm7 - movq mm2,mm4 - movq mm7,mm1 - pmaddwd mm4,[GOTOFF(ebx,PW_MF060_MF089)] ; mm4=tmp4L - pmaddwd mm1,[GOTOFF(ebx,PW_MF060_MF089)] ; mm1=tmp4H - pmaddwd mm2,[GOTOFF(ebx,PW_MF089_F060)] ; mm2=tmp7L - pmaddwd mm7,[GOTOFF(ebx,PW_MF089_F060)] ; mm7=tmp7H - - paddd mm4, MMWORD [wk(0)] ; mm4=data7L - paddd mm1, MMWORD [wk(1)] ; mm1=data7H - paddd mm2,mm0 ; mm2=data1L - paddd mm7,mm6 ; 
mm7=data1H - - paddd mm4,[GOTOFF(ebx,PD_DESCALE_P1)] - paddd mm1,[GOTOFF(ebx,PD_DESCALE_P1)] - psrad mm4,DESCALE_P1 - psrad mm1,DESCALE_P1 - paddd mm2,[GOTOFF(ebx,PD_DESCALE_P1)] - paddd mm7,[GOTOFF(ebx,PD_DESCALE_P1)] - psrad mm2,DESCALE_P1 - psrad mm7,DESCALE_P1 - - packssdw mm4,mm1 ; mm4=data7 - packssdw mm2,mm7 ; mm2=data1 - - movq MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4 - movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2 - - movq mm1,mm3 - movq mm7,mm3 - punpcklwd mm1,mm5 - punpckhwd mm7,mm5 - movq mm3,mm1 - movq mm5,mm7 - pmaddwd mm1,[GOTOFF(ebx,PW_MF050_MF256)] ; mm1=tmp5L - pmaddwd mm7,[GOTOFF(ebx,PW_MF050_MF256)] ; mm7=tmp5H - pmaddwd mm3,[GOTOFF(ebx,PW_MF256_F050)] ; mm3=tmp6L - pmaddwd mm5,[GOTOFF(ebx,PW_MF256_F050)] ; mm5=tmp6H - - paddd mm1,mm0 ; mm1=data5L - paddd mm7,mm6 ; mm7=data5H - paddd mm3, MMWORD [wk(0)] ; mm3=data3L - paddd mm5, MMWORD [wk(1)] ; mm5=data3H - - paddd mm1,[GOTOFF(ebx,PD_DESCALE_P1)] - paddd mm7,[GOTOFF(ebx,PD_DESCALE_P1)] - psrad mm1,DESCALE_P1 - psrad mm7,DESCALE_P1 - paddd mm3,[GOTOFF(ebx,PD_DESCALE_P1)] - paddd mm5,[GOTOFF(ebx,PD_DESCALE_P1)] - psrad mm3,DESCALE_P1 - psrad mm5,DESCALE_P1 - - packssdw mm1,mm7 ; mm1=data5 - packssdw mm3,mm5 ; mm3=data3 - - movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1 - movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3 - - add edx, byte 4*DCTSIZE*SIZEOF_DCTELEM - dec ecx - jnz near .rowloop - - ; ---- Pass 2: process columns. 
- - mov edx, POINTER [data(eax)] ; (DCTELEM *) - mov ecx, DCTSIZE/4 - alignx 16,7 + movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)] + movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)] + movq mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)] + + ; mm0=(20 21 22 23), mm2=(24 25 26 27) + ; mm1=(30 31 32 33), mm3=(34 35 36 37) + + movq mm4,mm0 ; transpose coefficients(phase 1) + punpcklwd mm0,mm1 ; mm0=(20 30 21 31) + punpckhwd mm4,mm1 ; mm4=(22 32 23 33) + movq mm5,mm2 ; transpose coefficients(phase 1) + punpcklwd mm2,mm3 ; mm2=(24 34 25 35) + punpckhwd mm5,mm3 ; mm5=(26 36 27 37) + + movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)] + movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)] + movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)] + + ; mm6=(00 01 02 03), mm1=(04 05 06 07) + ; mm7=(10 11 12 13), mm3=(14 15 16 17) + + movq MMWORD [wk(0)], mm4 ; wk(0)=(22 32 23 33) + movq MMWORD [wk(1)], mm2 ; wk(1)=(24 34 25 35) + + movq mm4,mm6 ; transpose coefficients(phase 1) + punpcklwd mm6,mm7 ; mm6=(00 10 01 11) + punpckhwd mm4,mm7 ; mm4=(02 12 03 13) + movq mm2,mm1 ; transpose coefficients(phase 1) + punpcklwd mm1,mm3 ; mm1=(04 14 05 15) + punpckhwd mm2,mm3 ; mm2=(06 16 07 17) + + movq mm7,mm6 ; transpose coefficients(phase 2) + punpckldq mm6,mm0 ; mm6=(00 10 20 30)=data0 + punpckhdq mm7,mm0 ; mm7=(01 11 21 31)=data1 + movq mm3,mm2 ; transpose coefficients(phase 2) + punpckldq mm2,mm5 ; mm2=(06 16 26 36)=data6 + punpckhdq mm3,mm5 ; mm3=(07 17 27 37)=data7 + + movq mm0,mm7 + movq mm5,mm6 + psubw mm7,mm2 ; mm7=data1-data6=tmp6 + psubw mm6,mm3 ; mm6=data0-data7=tmp7 + paddw mm0,mm2 ; mm0=data1+data6=tmp1 + paddw mm5,mm3 ; mm5=data0+data7=tmp0 + + movq mm2, MMWORD [wk(0)] ; mm2=(22 32 23 33) + movq mm3, MMWORD [wk(1)] ; mm3=(24 34 25 35) + movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6 + movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7 + + movq mm7,mm4 ; transpose coefficients(phase 2) + 
punpckldq mm4,mm2 ; mm4=(02 12 22 32)=data2 + punpckhdq mm7,mm2 ; mm7=(03 13 23 33)=data3 + movq mm6,mm1 ; transpose coefficients(phase 2) + punpckldq mm1,mm3 ; mm1=(04 14 24 34)=data4 + punpckhdq mm6,mm3 ; mm6=(05 15 25 35)=data5 + + movq mm2,mm7 + movq mm3,mm4 + paddw mm7,mm1 ; mm7=data3+data4=tmp3 + paddw mm4,mm6 ; mm4=data2+data5=tmp2 + psubw mm2,mm1 ; mm2=data3-data4=tmp4 + psubw mm3,mm6 ; mm3=data2-data5=tmp5 + + ; -- Even part + + movq mm1,mm5 + movq mm6,mm0 + paddw mm5,mm7 ; mm5=tmp10 + paddw mm0,mm4 ; mm0=tmp11 + psubw mm1,mm7 ; mm1=tmp13 + psubw mm6,mm4 ; mm6=tmp12 + + movq mm7,mm5 + paddw mm5,mm0 ; mm5=tmp10+tmp11 + psubw mm7,mm0 ; mm7=tmp10-tmp11 + + psllw mm5,PASS1_BITS ; mm5=data0 + psllw mm7,PASS1_BITS ; mm7=data4 + + movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5 + movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7 + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + movq mm4,mm1 ; mm1=tmp13 + movq mm0,mm1 + punpcklwd mm4,mm6 ; mm6=tmp12 + punpckhwd mm0,mm6 + movq mm1,mm4 + movq mm6,mm0 + pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=data2L + pmaddwd mm0,[GOTOFF(ebx,PW_F130_F054)] ; mm0=data2H + pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=data6L + pmaddwd mm6,[GOTOFF(ebx,PW_F054_MF130)] ; mm6=data6H + + paddd mm4,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd mm0,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad mm4,DESCALE_P1 + psrad mm0,DESCALE_P1 + paddd mm1,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd mm6,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad mm1,DESCALE_P1 + psrad mm6,DESCALE_P1 + + packssdw mm4,mm0 ; mm4=data2 + packssdw mm1,mm6 ; mm1=data6 + + movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4 + movq MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1 + + ; -- Odd part + + movq mm5, MMWORD [wk(0)] ; mm5=tmp6 + movq mm7, MMWORD 
[wk(1)] ; mm7=tmp7 + + movq mm0,mm2 ; mm2=tmp4 + movq mm6,mm3 ; mm3=tmp5 + paddw mm0,mm5 ; mm0=z3 + paddw mm6,mm7 ; mm6=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movq mm4,mm0 + movq mm1,mm0 + punpcklwd mm4,mm6 + punpckhwd mm1,mm6 + movq mm0,mm4 + movq mm6,mm1 + pmaddwd mm4,[GOTOFF(ebx,PW_MF078_F117)] ; mm4=z3L + pmaddwd mm1,[GOTOFF(ebx,PW_MF078_F117)] ; mm1=z3H + pmaddwd mm0,[GOTOFF(ebx,PW_F117_F078)] ; mm0=z4L + pmaddwd mm6,[GOTOFF(ebx,PW_F117_F078)] ; mm6=z4H + + movq MMWORD [wk(0)], mm4 ; wk(0)=z3L + movq MMWORD [wk(1)], mm1 ; wk(1)=z3H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + movq mm4,mm2 + movq mm1,mm2 + punpcklwd mm4,mm7 + punpckhwd mm1,mm7 + movq mm2,mm4 + movq mm7,mm1 + pmaddwd mm4,[GOTOFF(ebx,PW_MF060_MF089)] ; mm4=tmp4L + pmaddwd mm1,[GOTOFF(ebx,PW_MF060_MF089)] ; mm1=tmp4H + pmaddwd mm2,[GOTOFF(ebx,PW_MF089_F060)] ; mm2=tmp7L + pmaddwd mm7,[GOTOFF(ebx,PW_MF089_F060)] ; mm7=tmp7H + + paddd mm4, MMWORD [wk(0)] ; mm4=data7L + paddd mm1, MMWORD [wk(1)] ; mm1=data7H + paddd mm2,mm0 ; mm2=data1L + paddd mm7,mm6 ; mm7=data1H + + paddd 
mm4,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd mm1,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad mm4,DESCALE_P1 + psrad mm1,DESCALE_P1 + paddd mm2,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd mm7,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad mm2,DESCALE_P1 + psrad mm7,DESCALE_P1 + + packssdw mm4,mm1 ; mm4=data7 + packssdw mm2,mm7 ; mm2=data1 + + movq MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4 + movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2 + + movq mm1,mm3 + movq mm7,mm3 + punpcklwd mm1,mm5 + punpckhwd mm7,mm5 + movq mm3,mm1 + movq mm5,mm7 + pmaddwd mm1,[GOTOFF(ebx,PW_MF050_MF256)] ; mm1=tmp5L + pmaddwd mm7,[GOTOFF(ebx,PW_MF050_MF256)] ; mm7=tmp5H + pmaddwd mm3,[GOTOFF(ebx,PW_MF256_F050)] ; mm3=tmp6L + pmaddwd mm5,[GOTOFF(ebx,PW_MF256_F050)] ; mm5=tmp6H + + paddd mm1,mm0 ; mm1=data5L + paddd mm7,mm6 ; mm7=data5H + paddd mm3, MMWORD [wk(0)] ; mm3=data3L + paddd mm5, MMWORD [wk(1)] ; mm5=data3H + + paddd mm1,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd mm7,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad mm1,DESCALE_P1 + psrad mm7,DESCALE_P1 + paddd mm3,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd mm5,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad mm3,DESCALE_P1 + psrad mm5,DESCALE_P1 + + packssdw mm1,mm7 ; mm1=data5 + packssdw mm3,mm5 ; mm3=data3 + + movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1 + movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3 + + add edx, byte 4*DCTSIZE*SIZEOF_DCTELEM + dec ecx + jnz near .rowloop + + ; ---- Pass 2: process columns. 
+ + mov edx, POINTER [data(eax)] ; (DCTELEM *) + mov ecx, DCTSIZE/4 + alignx 16,7 .columnloop: - movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)] - movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)] - movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)] - movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)] - - ; mm0=(02 12 22 32), mm2=(42 52 62 72) - ; mm1=(03 13 23 33), mm3=(43 53 63 73) - - movq mm4,mm0 ; transpose coefficients(phase 1) - punpcklwd mm0,mm1 ; mm0=(02 03 12 13) - punpckhwd mm4,mm1 ; mm4=(22 23 32 33) - movq mm5,mm2 ; transpose coefficients(phase 1) - punpcklwd mm2,mm3 ; mm2=(42 43 52 53) - punpckhwd mm5,mm3 ; mm5=(62 63 72 73) - - movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)] - movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)] - movq mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)] - movq mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)] - - ; mm6=(00 10 20 30), mm1=(40 50 60 70) - ; mm7=(01 11 21 31), mm3=(41 51 61 71) - - movq MMWORD [wk(0)], mm4 ; wk(0)=(22 23 32 33) - movq MMWORD [wk(1)], mm2 ; wk(1)=(42 43 52 53) - - movq mm4,mm6 ; transpose coefficients(phase 1) - punpcklwd mm6,mm7 ; mm6=(00 01 10 11) - punpckhwd mm4,mm7 ; mm4=(20 21 30 31) - movq mm2,mm1 ; transpose coefficients(phase 1) - punpcklwd mm1,mm3 ; mm1=(40 41 50 51) - punpckhwd mm2,mm3 ; mm2=(60 61 70 71) - - movq mm7,mm6 ; transpose coefficients(phase 2) - punpckldq mm6,mm0 ; mm6=(00 01 02 03)=data0 - punpckhdq mm7,mm0 ; mm7=(10 11 12 13)=data1 - movq mm3,mm2 ; transpose coefficients(phase 2) - punpckldq mm2,mm5 ; mm2=(60 61 62 63)=data6 - punpckhdq mm3,mm5 ; mm3=(70 71 72 73)=data7 - - movq mm0,mm7 - movq mm5,mm6 - psubw mm7,mm2 ; mm7=data1-data6=tmp6 - psubw mm6,mm3 ; mm6=data0-data7=tmp7 - paddw mm0,mm2 ; mm0=data1+data6=tmp1 - paddw mm5,mm3 ; mm5=data0+data7=tmp0 - - movq mm2, MMWORD [wk(0)] ; mm2=(22 23 32 33) - movq mm3, MMWORD [wk(1)] ; mm3=(42 43 52 53) - movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6 - movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7 - - movq mm7,mm4 ; transpose 
coefficients(phase 2) - punpckldq mm4,mm2 ; mm4=(20 21 22 23)=data2 - punpckhdq mm7,mm2 ; mm7=(30 31 32 33)=data3 - movq mm6,mm1 ; transpose coefficients(phase 2) - punpckldq mm1,mm3 ; mm1=(40 41 42 43)=data4 - punpckhdq mm6,mm3 ; mm6=(50 51 52 53)=data5 - - movq mm2,mm7 - movq mm3,mm4 - paddw mm7,mm1 ; mm7=data3+data4=tmp3 - paddw mm4,mm6 ; mm4=data2+data5=tmp2 - psubw mm2,mm1 ; mm2=data3-data4=tmp4 - psubw mm3,mm6 ; mm3=data2-data5=tmp5 - - ; -- Even part - - movq mm1,mm5 - movq mm6,mm0 - paddw mm5,mm7 ; mm5=tmp10 - paddw mm0,mm4 ; mm0=tmp11 - psubw mm1,mm7 ; mm1=tmp13 - psubw mm6,mm4 ; mm6=tmp12 - - movq mm7,mm5 - paddw mm5,mm0 ; mm5=tmp10+tmp11 - psubw mm7,mm0 ; mm7=tmp10-tmp11 - - paddw mm5,[GOTOFF(ebx,PW_DESCALE_P2X)] - paddw mm7,[GOTOFF(ebx,PW_DESCALE_P2X)] - psraw mm5,PASS1_BITS ; mm5=data0 - psraw mm7,PASS1_BITS ; mm7=data4 - - movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5 - movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7 - - ; (Original) - ; z1 = (tmp12 + tmp13) * 0.541196100; - ; data2 = z1 + tmp13 * 0.765366865; - ; data6 = z1 + tmp12 * -1.847759065; - ; - ; (This implementation) - ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; - ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); - - movq mm4,mm1 ; mm1=tmp13 - movq mm0,mm1 - punpcklwd mm4,mm6 ; mm6=tmp12 - punpckhwd mm0,mm6 - movq mm1,mm4 - movq mm6,mm0 - pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=data2L - pmaddwd mm0,[GOTOFF(ebx,PW_F130_F054)] ; mm0=data2H - pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=data6L - pmaddwd mm6,[GOTOFF(ebx,PW_F054_MF130)] ; mm6=data6H - - paddd mm4,[GOTOFF(ebx,PD_DESCALE_P2)] - paddd mm0,[GOTOFF(ebx,PD_DESCALE_P2)] - psrad mm4,DESCALE_P2 - psrad mm0,DESCALE_P2 - paddd mm1,[GOTOFF(ebx,PD_DESCALE_P2)] - paddd mm6,[GOTOFF(ebx,PD_DESCALE_P2)] - psrad mm1,DESCALE_P2 - psrad mm6,DESCALE_P2 - - packssdw mm4,mm0 ; mm4=data2 - packssdw mm1,mm6 ; mm1=data6 - - movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4 - movq MMWORD 
[MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1 - - ; -- Odd part - - movq mm5, MMWORD [wk(0)] ; mm5=tmp6 - movq mm7, MMWORD [wk(1)] ; mm7=tmp7 - - movq mm0,mm2 ; mm2=tmp4 - movq mm6,mm3 ; mm3=tmp5 - paddw mm0,mm5 ; mm0=z3 - paddw mm6,mm7 ; mm6=z4 - - ; (Original) - ; z5 = (z3 + z4) * 1.175875602; - ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; - ; z3 += z5; z4 += z5; - ; - ; (This implementation) - ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; - ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); - - movq mm4,mm0 - movq mm1,mm0 - punpcklwd mm4,mm6 - punpckhwd mm1,mm6 - movq mm0,mm4 - movq mm6,mm1 - pmaddwd mm4,[GOTOFF(ebx,PW_MF078_F117)] ; mm4=z3L - pmaddwd mm1,[GOTOFF(ebx,PW_MF078_F117)] ; mm1=z3H - pmaddwd mm0,[GOTOFF(ebx,PW_F117_F078)] ; mm0=z4L - pmaddwd mm6,[GOTOFF(ebx,PW_F117_F078)] ; mm6=z4H - - movq MMWORD [wk(0)], mm4 ; wk(0)=z3L - movq MMWORD [wk(1)], mm1 ; wk(1)=z3H - - ; (Original) - ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; - ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; - ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; - ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; - ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; - ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; - ; - ; (This implementation) - ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; - ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; - ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); - ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); - ; data7 = tmp4 + z3; data5 = tmp5 + z4; - ; data3 = tmp6 + z3; data1 = tmp7 + z4; - - movq mm4,mm2 - movq mm1,mm2 - punpcklwd mm4,mm7 - punpckhwd mm1,mm7 - movq mm2,mm4 - movq mm7,mm1 - pmaddwd mm4,[GOTOFF(ebx,PW_MF060_MF089)] ; mm4=tmp4L - pmaddwd mm1,[GOTOFF(ebx,PW_MF060_MF089)] ; mm1=tmp4H - pmaddwd mm2,[GOTOFF(ebx,PW_MF089_F060)] ; mm2=tmp7L - pmaddwd mm7,[GOTOFF(ebx,PW_MF089_F060)] ; mm7=tmp7H - - paddd mm4, MMWORD [wk(0)] ; mm4=data7L - paddd 
mm1, MMWORD [wk(1)] ; mm1=data7H - paddd mm2,mm0 ; mm2=data1L - paddd mm7,mm6 ; mm7=data1H - - paddd mm4,[GOTOFF(ebx,PD_DESCALE_P2)] - paddd mm1,[GOTOFF(ebx,PD_DESCALE_P2)] - psrad mm4,DESCALE_P2 - psrad mm1,DESCALE_P2 - paddd mm2,[GOTOFF(ebx,PD_DESCALE_P2)] - paddd mm7,[GOTOFF(ebx,PD_DESCALE_P2)] - psrad mm2,DESCALE_P2 - psrad mm7,DESCALE_P2 - - packssdw mm4,mm1 ; mm4=data7 - packssdw mm2,mm7 ; mm2=data1 - - movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4 - movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2 - - movq mm1,mm3 - movq mm7,mm3 - punpcklwd mm1,mm5 - punpckhwd mm7,mm5 - movq mm3,mm1 - movq mm5,mm7 - pmaddwd mm1,[GOTOFF(ebx,PW_MF050_MF256)] ; mm1=tmp5L - pmaddwd mm7,[GOTOFF(ebx,PW_MF050_MF256)] ; mm7=tmp5H - pmaddwd mm3,[GOTOFF(ebx,PW_MF256_F050)] ; mm3=tmp6L - pmaddwd mm5,[GOTOFF(ebx,PW_MF256_F050)] ; mm5=tmp6H - - paddd mm1,mm0 ; mm1=data5L - paddd mm7,mm6 ; mm7=data5H - paddd mm3, MMWORD [wk(0)] ; mm3=data3L - paddd mm5, MMWORD [wk(1)] ; mm5=data3H - - paddd mm1,[GOTOFF(ebx,PD_DESCALE_P2)] - paddd mm7,[GOTOFF(ebx,PD_DESCALE_P2)] - psrad mm1,DESCALE_P2 - psrad mm7,DESCALE_P2 - paddd mm3,[GOTOFF(ebx,PD_DESCALE_P2)] - paddd mm5,[GOTOFF(ebx,PD_DESCALE_P2)] - psrad mm3,DESCALE_P2 - psrad mm5,DESCALE_P2 - - packssdw mm1,mm7 ; mm1=data5 - packssdw mm3,mm5 ; mm3=data3 - - movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1 - movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3 - - add edx, byte 4*SIZEOF_DCTELEM - dec ecx - jnz near .columnloop - - emms ; empty MMX state - -; pop edi ; unused -; pop esi ; unused -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - poppic ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + movq mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)] + movq mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)] + movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)] + + ; mm0=(02 12 22 32), mm2=(42 52 62 72) + ; mm1=(03 13 23 33), mm3=(43 53 63 
73) + + movq mm4,mm0 ; transpose coefficients(phase 1) + punpcklwd mm0,mm1 ; mm0=(02 03 12 13) + punpckhwd mm4,mm1 ; mm4=(22 23 32 33) + movq mm5,mm2 ; transpose coefficients(phase 1) + punpcklwd mm2,mm3 ; mm2=(42 43 52 53) + punpckhwd mm5,mm3 ; mm5=(62 63 72 73) + + movq mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)] + movq mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)] + movq mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)] + movq mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)] + + ; mm6=(00 10 20 30), mm1=(40 50 60 70) + ; mm7=(01 11 21 31), mm3=(41 51 61 71) + + movq MMWORD [wk(0)], mm4 ; wk(0)=(22 23 32 33) + movq MMWORD [wk(1)], mm2 ; wk(1)=(42 43 52 53) + + movq mm4,mm6 ; transpose coefficients(phase 1) + punpcklwd mm6,mm7 ; mm6=(00 01 10 11) + punpckhwd mm4,mm7 ; mm4=(20 21 30 31) + movq mm2,mm1 ; transpose coefficients(phase 1) + punpcklwd mm1,mm3 ; mm1=(40 41 50 51) + punpckhwd mm2,mm3 ; mm2=(60 61 70 71) + + movq mm7,mm6 ; transpose coefficients(phase 2) + punpckldq mm6,mm0 ; mm6=(00 01 02 03)=data0 + punpckhdq mm7,mm0 ; mm7=(10 11 12 13)=data1 + movq mm3,mm2 ; transpose coefficients(phase 2) + punpckldq mm2,mm5 ; mm2=(60 61 62 63)=data6 + punpckhdq mm3,mm5 ; mm3=(70 71 72 73)=data7 + + movq mm0,mm7 + movq mm5,mm6 + psubw mm7,mm2 ; mm7=data1-data6=tmp6 + psubw mm6,mm3 ; mm6=data0-data7=tmp7 + paddw mm0,mm2 ; mm0=data1+data6=tmp1 + paddw mm5,mm3 ; mm5=data0+data7=tmp0 + + movq mm2, MMWORD [wk(0)] ; mm2=(22 23 32 33) + movq mm3, MMWORD [wk(1)] ; mm3=(42 43 52 53) + movq MMWORD [wk(0)], mm7 ; wk(0)=tmp6 + movq MMWORD [wk(1)], mm6 ; wk(1)=tmp7 + + movq mm7,mm4 ; transpose coefficients(phase 2) + punpckldq mm4,mm2 ; mm4=(20 21 22 23)=data2 + punpckhdq mm7,mm2 ; mm7=(30 31 32 33)=data3 + movq mm6,mm1 ; transpose coefficients(phase 2) + punpckldq mm1,mm3 ; mm1=(40 41 42 43)=data4 + punpckhdq mm6,mm3 ; mm6=(50 51 52 53)=data5 + + movq mm2,mm7 + movq mm3,mm4 + paddw mm7,mm1 ; mm7=data3+data4=tmp3 + paddw mm4,mm6 ; mm4=data2+data5=tmp2 + psubw mm2,mm1 ; 
mm2=data3-data4=tmp4 + psubw mm3,mm6 ; mm3=data2-data5=tmp5 + + ; -- Even part + + movq mm1,mm5 + movq mm6,mm0 + paddw mm5,mm7 ; mm5=tmp10 + paddw mm0,mm4 ; mm0=tmp11 + psubw mm1,mm7 ; mm1=tmp13 + psubw mm6,mm4 ; mm6=tmp12 + + movq mm7,mm5 + paddw mm5,mm0 ; mm5=tmp10+tmp11 + psubw mm7,mm0 ; mm7=tmp10-tmp11 + + paddw mm5,[GOTOFF(ebx,PW_DESCALE_P2X)] + paddw mm7,[GOTOFF(ebx,PW_DESCALE_P2X)] + psraw mm5,PASS1_BITS ; mm5=data0 + psraw mm7,PASS1_BITS ; mm7=data4 + + movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5 + movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7 + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + movq mm4,mm1 ; mm1=tmp13 + movq mm0,mm1 + punpcklwd mm4,mm6 ; mm6=tmp12 + punpckhwd mm0,mm6 + movq mm1,mm4 + movq mm6,mm0 + pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=data2L + pmaddwd mm0,[GOTOFF(ebx,PW_F130_F054)] ; mm0=data2H + pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=data6L + pmaddwd mm6,[GOTOFF(ebx,PW_F054_MF130)] ; mm6=data6H + + paddd mm4,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd mm0,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad mm4,DESCALE_P2 + psrad mm0,DESCALE_P2 + paddd mm1,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd mm6,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad mm1,DESCALE_P2 + psrad mm6,DESCALE_P2 + + packssdw mm4,mm0 ; mm4=data2 + packssdw mm1,mm6 ; mm1=data6 + + movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4 + movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1 + + ; -- Odd part + + movq mm5, MMWORD [wk(0)] ; mm5=tmp6 + movq mm7, MMWORD [wk(1)] ; mm7=tmp7 + + movq mm0,mm2 ; mm2=tmp4 + movq mm6,mm3 ; mm3=tmp5 + paddw mm0,mm5 ; mm0=z3 + paddw mm6,mm7 ; mm6=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This 
implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movq mm4,mm0 + movq mm1,mm0 + punpcklwd mm4,mm6 + punpckhwd mm1,mm6 + movq mm0,mm4 + movq mm6,mm1 + pmaddwd mm4,[GOTOFF(ebx,PW_MF078_F117)] ; mm4=z3L + pmaddwd mm1,[GOTOFF(ebx,PW_MF078_F117)] ; mm1=z3H + pmaddwd mm0,[GOTOFF(ebx,PW_F117_F078)] ; mm0=z4L + pmaddwd mm6,[GOTOFF(ebx,PW_F117_F078)] ; mm6=z4H + + movq MMWORD [wk(0)], mm4 ; wk(0)=z3L + movq MMWORD [wk(1)], mm1 ; wk(1)=z3H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + movq mm4,mm2 + movq mm1,mm2 + punpcklwd mm4,mm7 + punpckhwd mm1,mm7 + movq mm2,mm4 + movq mm7,mm1 + pmaddwd mm4,[GOTOFF(ebx,PW_MF060_MF089)] ; mm4=tmp4L + pmaddwd mm1,[GOTOFF(ebx,PW_MF060_MF089)] ; mm1=tmp4H + pmaddwd mm2,[GOTOFF(ebx,PW_MF089_F060)] ; mm2=tmp7L + pmaddwd mm7,[GOTOFF(ebx,PW_MF089_F060)] ; mm7=tmp7H + + paddd mm4, MMWORD [wk(0)] ; mm4=data7L + paddd mm1, MMWORD [wk(1)] ; mm1=data7H + paddd mm2,mm0 ; mm2=data1L + paddd mm7,mm6 ; mm7=data1H + + paddd mm4,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd mm1,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad mm4,DESCALE_P2 + psrad mm1,DESCALE_P2 + paddd mm2,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd mm7,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad mm2,DESCALE_P2 + psrad mm7,DESCALE_P2 + + packssdw mm4,mm1 ; mm4=data7 + 
packssdw mm2,mm7 ; mm2=data1 + + movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4 + movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2 + + movq mm1,mm3 + movq mm7,mm3 + punpcklwd mm1,mm5 + punpckhwd mm7,mm5 + movq mm3,mm1 + movq mm5,mm7 + pmaddwd mm1,[GOTOFF(ebx,PW_MF050_MF256)] ; mm1=tmp5L + pmaddwd mm7,[GOTOFF(ebx,PW_MF050_MF256)] ; mm7=tmp5H + pmaddwd mm3,[GOTOFF(ebx,PW_MF256_F050)] ; mm3=tmp6L + pmaddwd mm5,[GOTOFF(ebx,PW_MF256_F050)] ; mm5=tmp6H + + paddd mm1,mm0 ; mm1=data5L + paddd mm7,mm6 ; mm7=data5H + paddd mm3, MMWORD [wk(0)] ; mm3=data3L + paddd mm5, MMWORD [wk(1)] ; mm5=data3H + + paddd mm1,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd mm7,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad mm1,DESCALE_P2 + psrad mm7,DESCALE_P2 + paddd mm3,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd mm5,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad mm3,DESCALE_P2 + psrad mm5,DESCALE_P2 + + packssdw mm1,mm7 ; mm1=data5 + packssdw mm3,mm5 ; mm3=data3 + + movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1 + movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3 + + add edx, byte 4*SIZEOF_DCTELEM + dec ecx + jnz near .columnloop + + emms ; empty MMX state + +; pop edi ; unused +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + poppic ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/jfss2fst-64.asm b/simd/jfss2fst-64.asm index 6953caf3b..41483bf6a 100644 --- a/simd/jfss2fst-64.asm +++ b/simd/jfss2fst-64.asm @@ -27,24 +27,24 @@ ; -------------------------------------------------------------------------- -%define CONST_BITS 8 ; 14 is also OK. +%define CONST_BITS 8 ; 14 is also OK. 
%if CONST_BITS == 8 -F_0_382 equ 98 ; FIX(0.382683433) -F_0_541 equ 139 ; FIX(0.541196100) -F_0_707 equ 181 ; FIX(0.707106781) -F_1_306 equ 334 ; FIX(1.306562965) +F_0_382 equ 98 ; FIX(0.382683433) +F_0_541 equ 139 ; FIX(0.541196100) +F_0_707 equ 181 ; FIX(0.707106781) +F_1_306 equ 334 ; FIX(1.306562965) %else ; NASM cannot do compile-time arithmetic on floating-point constants. %define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) -F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433) -F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) -F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781) -F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965) +F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433) +F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) +F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781) +F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965) %endif ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST ; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) ; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) @@ -52,21 +52,21 @@ F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965) %define PRE_MULTIPLY_SCALE_BITS 2 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) - alignz 16 - global EXTN(jconst_fdct_ifast_sse2) + alignz 16 + global EXTN(jconst_fdct_ifast_sse2) EXTN(jconst_fdct_ifast_sse2): -PW_F0707 times 8 dw F_0_707 << CONST_SHIFT -PW_F0382 times 8 dw F_0_382 << CONST_SHIFT -PW_F0541 times 8 dw F_0_541 << CONST_SHIFT -PW_F1306 times 8 dw F_1_306 << CONST_SHIFT +PW_F0707 times 8 dw F_0_707 << CONST_SHIFT +PW_F0382 times 8 dw F_0_382 << CONST_SHIFT +PW_F0541 times 8 dw F_0_541 << CONST_SHIFT +PW_F1306 times 8 dw F_1_306 << CONST_SHIFT - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 64 
+ SECTION SEG_TEXT + BITS 64 ; ; Perform the forward DCT on one block of samples. ; @@ -76,317 +76,317 @@ PW_F1306 times 8 dw F_1_306 << CONST_SHIFT ; r10 = DCTELEM * data -%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 2 +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 - align 16 - global EXTN(jsimd_fdct_ifast_sse2) + align 16 + global EXTN(jsimd_fdct_ifast_sse2) EXTN(jsimd_fdct_ifast_sse2): - push rbp - mov rax,rsp ; rax = original rbp - sub rsp, byte 4 - and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp],rax - mov rbp,rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - collect_args - - ; ---- Pass 1: process rows. - - mov rdx, r10 ; (DCTELEM *) - - movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)] - movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)] - movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)] - movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)] - - ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) - ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) - - movdqa xmm4,xmm0 ; transpose coefficients(phase 1) - punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13) - punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17) - movdqa xmm5,xmm2 ; transpose coefficients(phase 1) - punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33) - punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37) - - movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)] - movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)] - movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)] - movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)] - - ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62) - ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63) - - movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) - movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) - - movdqa xmm2,xmm6 ; transpose coefficients(phase 1) - 
punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53) - punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57) - movdqa xmm5,xmm1 ; transpose coefficients(phase 1) - punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73) - punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77) - - movdqa xmm7,xmm6 ; transpose coefficients(phase 2) - punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71) - punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73) - movdqa xmm3,xmm2 ; transpose coefficients(phase 2) - punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75) - punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77) - - movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) - movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37) - movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73) - movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75) - - movdqa xmm7,xmm0 ; transpose coefficients(phase 2) - punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31) - punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33) - movdqa xmm2,xmm4 ; transpose coefficients(phase 2) - punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35) - punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37) - - movdqa xmm1,xmm0 ; transpose coefficients(phase 3) - punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 - punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 - movdqa xmm5,xmm2 ; transpose coefficients(phase 3) - punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 - punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 - - movdqa xmm6,xmm1 - movdqa xmm3,xmm0 - psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6 - psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7 - paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1 - paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0 - - movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73) - movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75) - movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 - movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 - - movdqa xmm1,xmm7 ; 
transpose coefficients(phase 3) - punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 - punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 - movdqa xmm0,xmm4 ; transpose coefficients(phase 3) - punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 - punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 - - movdqa xmm2,xmm1 - movdqa xmm5,xmm7 - paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3 - paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2 - psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4 - psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5 - - ; -- Even part - - movdqa xmm4,xmm3 - movdqa xmm0,xmm6 - psubw xmm3,xmm1 ; xmm3=tmp13 - psubw xmm6,xmm7 ; xmm6=tmp12 - paddw xmm4,xmm1 ; xmm4=tmp10 - paddw xmm0,xmm7 ; xmm0=tmp11 - - paddw xmm6,xmm3 - psllw xmm6,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm6,[rel PW_F0707] ; xmm6=z1 - - movdqa xmm1,xmm4 - movdqa xmm7,xmm3 - psubw xmm4,xmm0 ; xmm4=data4 - psubw xmm3,xmm6 ; xmm3=data6 - paddw xmm1,xmm0 ; xmm1=data0 - paddw xmm7,xmm6 ; xmm7=data2 - - movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6 - movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7 - movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4 - movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6 - - ; -- Odd part - - paddw xmm2,xmm5 ; xmm2=tmp10 - paddw xmm5,xmm0 ; xmm5=tmp11 - paddw xmm0,xmm6 ; xmm0=tmp12, xmm6=tmp7 - - psllw xmm2,PRE_MULTIPLY_SCALE_BITS - psllw xmm0,PRE_MULTIPLY_SCALE_BITS - - psllw xmm5,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm5,[rel PW_F0707] ; xmm5=z3 - - movdqa xmm4,xmm2 ; xmm4=tmp10 - psubw xmm2,xmm0 - pmulhw xmm2,[rel PW_F0382] ; xmm2=z5 - pmulhw xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) - pmulhw xmm0,[rel PW_F1306] ; xmm0=MULTIPLY(tmp12,FIX_1_306562) - paddw xmm4,xmm2 ; xmm4=z2 - paddw xmm0,xmm2 ; xmm0=z4 - - movdqa xmm3,xmm6 - psubw xmm6,xmm5 ; xmm6=z13 - paddw xmm3,xmm5 ; xmm3=z11 - - movdqa xmm2,xmm6 - movdqa xmm5,xmm3 - psubw xmm6,xmm4 ; xmm6=data3 - psubw xmm3,xmm0 ; xmm3=data7 - paddw xmm2,xmm4 ; xmm2=data5 - paddw xmm5,xmm0 ; xmm5=data1 - - ; ---- Pass 2: process columns. 
- - ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72) - ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73) - - movdqa xmm4,xmm1 ; transpose coefficients(phase 1) - punpcklwd xmm1,xmm5 ; xmm1=(00 01 10 11 20 21 30 31) - punpckhwd xmm4,xmm5 ; xmm4=(40 41 50 51 60 61 70 71) - movdqa xmm0,xmm7 ; transpose coefficients(phase 1) - punpcklwd xmm7,xmm6 ; xmm7=(02 03 12 13 22 23 32 33) - punpckhwd xmm0,xmm6 ; xmm0=(42 43 52 53 62 63 72 73) - - movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4 - movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6 - - ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76) - ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77) - - movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33) - movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73) - - movdqa xmm7,xmm5 ; transpose coefficients(phase 1) - punpcklwd xmm5,xmm2 ; xmm5=(04 05 14 15 24 25 34 35) - punpckhwd xmm7,xmm2 ; xmm7=(44 45 54 55 64 65 74 75) - movdqa xmm0,xmm6 ; transpose coefficients(phase 1) - punpcklwd xmm6,xmm3 ; xmm6=(06 07 16 17 26 27 36 37) - punpckhwd xmm0,xmm3 ; xmm0=(46 47 56 57 66 67 76 77) - - movdqa xmm2,xmm5 ; transpose coefficients(phase 2) - punpckldq xmm5,xmm6 ; xmm5=(04 05 06 07 14 15 16 17) - punpckhdq xmm2,xmm6 ; xmm2=(24 25 26 27 34 35 36 37) - movdqa xmm3,xmm7 ; transpose coefficients(phase 2) - punpckldq xmm7,xmm0 ; xmm7=(44 45 46 47 54 55 56 57) - punpckhdq xmm3,xmm0 ; xmm3=(64 65 66 67 74 75 76 77) - - movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33) - movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73) - movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37) - movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57) - - movdqa xmm2,xmm1 ; transpose coefficients(phase 2) - punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 10 11 12 13) - punpckhdq xmm2,xmm6 ; xmm2=(20 21 22 23 30 31 32 33) - movdqa xmm7,xmm4 ; transpose coefficients(phase 2) - punpckldq xmm4,xmm0 ; xmm4=(40 41 42 43 50 
51 52 53) - punpckhdq xmm7,xmm0 ; xmm7=(60 61 62 63 70 71 72 73) - - movdqa xmm6,xmm1 ; transpose coefficients(phase 3) - punpcklqdq xmm1,xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0 - punpckhqdq xmm6,xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1 - movdqa xmm0,xmm7 ; transpose coefficients(phase 3) - punpcklqdq xmm7,xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6 - punpckhqdq xmm0,xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7 - - movdqa xmm5,xmm6 - movdqa xmm3,xmm1 - psubw xmm6,xmm7 ; xmm6=data1-data6=tmp6 - psubw xmm1,xmm0 ; xmm1=data0-data7=tmp7 - paddw xmm5,xmm7 ; xmm5=data1+data6=tmp1 - paddw xmm3,xmm0 ; xmm3=data0+data7=tmp0 - - movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37) - movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57) - movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6 - movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7 - - movdqa xmm6,xmm2 ; transpose coefficients(phase 3) - punpcklqdq xmm2,xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2 - punpckhqdq xmm6,xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3 - movdqa xmm1,xmm4 ; transpose coefficients(phase 3) - punpcklqdq xmm4,xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4 - punpckhqdq xmm1,xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5 - - movdqa xmm7,xmm6 - movdqa xmm0,xmm2 - paddw xmm6,xmm4 ; xmm6=data3+data4=tmp3 - paddw xmm2,xmm1 ; xmm2=data2+data5=tmp2 - psubw xmm7,xmm4 ; xmm7=data3-data4=tmp4 - psubw xmm0,xmm1 ; xmm0=data2-data5=tmp5 - - ; -- Even part - - movdqa xmm4,xmm3 - movdqa xmm1,xmm5 - psubw xmm3,xmm6 ; xmm3=tmp13 - psubw xmm5,xmm2 ; xmm5=tmp12 - paddw xmm4,xmm6 ; xmm4=tmp10 - paddw xmm1,xmm2 ; xmm1=tmp11 - - paddw xmm5,xmm3 - psllw xmm5,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm5,[rel PW_F0707] ; xmm5=z1 - - movdqa xmm6,xmm4 - movdqa xmm2,xmm3 - psubw xmm4,xmm1 ; xmm4=data4 - psubw xmm3,xmm5 ; xmm3=data6 - paddw xmm6,xmm1 ; xmm6=data0 - paddw xmm2,xmm5 ; xmm2=data2 - - movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4 - movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3 - movdqa XMMWORD 
[XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6 - movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2 - - ; -- Odd part - - movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6 - movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 - - paddw xmm7,xmm0 ; xmm7=tmp10 - paddw xmm0,xmm1 ; xmm0=tmp11 - paddw xmm1,xmm5 ; xmm1=tmp12, xmm5=tmp7 - - psllw xmm7,PRE_MULTIPLY_SCALE_BITS - psllw xmm1,PRE_MULTIPLY_SCALE_BITS - - psllw xmm0,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm0,[rel PW_F0707] ; xmm0=z3 - - movdqa xmm4,xmm7 ; xmm4=tmp10 - psubw xmm7,xmm1 - pmulhw xmm7,[rel PW_F0382] ; xmm7=z5 - pmulhw xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) - pmulhw xmm1,[rel PW_F1306] ; xmm1=MULTIPLY(tmp12,FIX_1_306562) - paddw xmm4,xmm7 ; xmm4=z2 - paddw xmm1,xmm7 ; xmm1=z4 - - movdqa xmm3,xmm5 - psubw xmm5,xmm0 ; xmm5=z13 - paddw xmm3,xmm0 ; xmm3=z11 - - movdqa xmm6,xmm5 - movdqa xmm2,xmm3 - psubw xmm5,xmm4 ; xmm5=data3 - psubw xmm3,xmm1 ; xmm3=data7 - paddw xmm6,xmm4 ; xmm6=data5 - paddw xmm2,xmm1 ; xmm2=data1 - - movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5 - movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3 - movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6 - movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2 - - uncollect_args - mov rsp,rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp - pop rbp - ret + push rbp + mov rax,rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args + + ; ---- Pass 1: process rows. 
+ + mov rdx, r10 ; (DCTELEM *) + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)] + + ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) + ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) + + movdqa xmm4,xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13) + punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17) + movdqa xmm5,xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37) + + movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)] + + ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62) + ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) + + movdqa xmm2,xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53) + punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57) + movdqa xmm5,xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73) + punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77) + + movdqa xmm7,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71) + punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73) + movdqa xmm3,xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 
25 35 26 36 27 37) + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73) + movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75) + + movdqa xmm7,xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31) + punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33) + movdqa xmm2,xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35) + punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa xmm1,xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 + punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 + movdqa xmm5,xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 + punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 + + movdqa xmm6,xmm1 + movdqa xmm3,xmm0 + psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6 + psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7 + paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1 + paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0 + + movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75) + movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 + + movdqa xmm1,xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 + punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 + movdqa xmm0,xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 + punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 + + movdqa xmm2,xmm1 + movdqa xmm5,xmm7 + paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3 + paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2 + psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4 + psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4,xmm3 + movdqa xmm0,xmm6 + psubw xmm3,xmm1 ; xmm3=tmp13 + psubw xmm6,xmm7 ; xmm6=tmp12 + paddw xmm4,xmm1 ; xmm4=tmp10 + paddw xmm0,xmm7 ; 
xmm0=tmp11 + + paddw xmm6,xmm3 + psllw xmm6,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm6,[rel PW_F0707] ; xmm6=z1 + + movdqa xmm1,xmm4 + movdqa xmm7,xmm3 + psubw xmm4,xmm0 ; xmm4=data4 + psubw xmm3,xmm6 ; xmm3=data6 + paddw xmm1,xmm0 ; xmm1=data0 + paddw xmm7,xmm6 ; xmm7=data2 + + movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6 + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7 + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4 + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6 + + ; -- Odd part + + paddw xmm2,xmm5 ; xmm2=tmp10 + paddw xmm5,xmm0 ; xmm5=tmp11 + paddw xmm0,xmm6 ; xmm0=tmp12, xmm6=tmp7 + + psllw xmm2,PRE_MULTIPLY_SCALE_BITS + psllw xmm0,PRE_MULTIPLY_SCALE_BITS + + psllw xmm5,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5,[rel PW_F0707] ; xmm5=z3 + + movdqa xmm4,xmm2 ; xmm4=tmp10 + psubw xmm2,xmm0 + pmulhw xmm2,[rel PW_F0382] ; xmm2=z5 + pmulhw xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) + pmulhw xmm0,[rel PW_F1306] ; xmm0=MULTIPLY(tmp12,FIX_1_306562) + paddw xmm4,xmm2 ; xmm4=z2 + paddw xmm0,xmm2 ; xmm0=z4 + + movdqa xmm3,xmm6 + psubw xmm6,xmm5 ; xmm6=z13 + paddw xmm3,xmm5 ; xmm3=z11 + + movdqa xmm2,xmm6 + movdqa xmm5,xmm3 + psubw xmm6,xmm4 ; xmm6=data3 + psubw xmm3,xmm0 ; xmm3=data7 + paddw xmm2,xmm4 ; xmm2=data5 + paddw xmm5,xmm0 ; xmm5=data1 + + ; ---- Pass 2: process columns. 
+ + ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72) + ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73) + + movdqa xmm4,xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1,xmm5 ; xmm1=(00 01 10 11 20 21 30 31) + punpckhwd xmm4,xmm5 ; xmm4=(40 41 50 51 60 61 70 71) + movdqa xmm0,xmm7 ; transpose coefficients(phase 1) + punpcklwd xmm7,xmm6 ; xmm7=(02 03 12 13 22 23 32 33) + punpckhwd xmm0,xmm6 ; xmm0=(42 43 52 53 62 63 72 73) + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4 + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6 + + ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76) + ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33) + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73) + + movdqa xmm7,xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5,xmm2 ; xmm5=(04 05 14 15 24 25 34 35) + punpckhwd xmm7,xmm2 ; xmm7=(44 45 54 55 64 65 74 75) + movdqa xmm0,xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6,xmm3 ; xmm6=(06 07 16 17 26 27 36 37) + punpckhwd xmm0,xmm3 ; xmm0=(46 47 56 57 66 67 76 77) + + movdqa xmm2,xmm5 ; transpose coefficients(phase 2) + punpckldq xmm5,xmm6 ; xmm5=(04 05 06 07 14 15 16 17) + punpckhdq xmm2,xmm6 ; xmm2=(24 25 26 27 34 35 36 37) + movdqa xmm3,xmm7 ; transpose coefficients(phase 2) + punpckldq xmm7,xmm0 ; xmm7=(44 45 46 47 54 55 56 57) + punpckhdq xmm3,xmm0 ; xmm3=(64 65 66 67 74 75 76 77) + + movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33) + movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73) + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37) + movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57) + + movdqa xmm2,xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 10 11 12 13) + punpckhdq xmm2,xmm6 ; xmm2=(20 21 22 23 30 31 32 33) + movdqa xmm7,xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4,xmm0 ; xmm4=(40 41 42 43 50 
51 52 53) + punpckhdq xmm7,xmm0 ; xmm7=(60 61 62 63 70 71 72 73) + + movdqa xmm6,xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1,xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0 + punpckhqdq xmm6,xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1 + movdqa xmm0,xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7,xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6 + punpckhqdq xmm0,xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7 + + movdqa xmm5,xmm6 + movdqa xmm3,xmm1 + psubw xmm6,xmm7 ; xmm6=data1-data6=tmp6 + psubw xmm1,xmm0 ; xmm1=data0-data7=tmp7 + paddw xmm5,xmm7 ; xmm5=data1+data6=tmp1 + paddw xmm3,xmm0 ; xmm3=data0+data7=tmp0 + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37) + movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57) + movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7 + + movdqa xmm6,xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2,xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2 + punpckhqdq xmm6,xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3 + movdqa xmm1,xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4,xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4 + punpckhqdq xmm1,xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5 + + movdqa xmm7,xmm6 + movdqa xmm0,xmm2 + paddw xmm6,xmm4 ; xmm6=data3+data4=tmp3 + paddw xmm2,xmm1 ; xmm2=data2+data5=tmp2 + psubw xmm7,xmm4 ; xmm7=data3-data4=tmp4 + psubw xmm0,xmm1 ; xmm0=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4,xmm3 + movdqa xmm1,xmm5 + psubw xmm3,xmm6 ; xmm3=tmp13 + psubw xmm5,xmm2 ; xmm5=tmp12 + paddw xmm4,xmm6 ; xmm4=tmp10 + paddw xmm1,xmm2 ; xmm1=tmp11 + + paddw xmm5,xmm3 + psllw xmm5,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5,[rel PW_F0707] ; xmm5=z1 + + movdqa xmm6,xmm4 + movdqa xmm2,xmm3 + psubw xmm4,xmm1 ; xmm4=data4 + psubw xmm3,xmm5 ; xmm3=data6 + paddw xmm6,xmm1 ; xmm6=data0 + paddw xmm2,xmm5 ; xmm2=data2 + + movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3 + movdqa XMMWORD 
[XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6 + movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2 + + ; -- Odd part + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 + + paddw xmm7,xmm0 ; xmm7=tmp10 + paddw xmm0,xmm1 ; xmm0=tmp11 + paddw xmm1,xmm5 ; xmm1=tmp12, xmm5=tmp7 + + psllw xmm7,PRE_MULTIPLY_SCALE_BITS + psllw xmm1,PRE_MULTIPLY_SCALE_BITS + + psllw xmm0,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm0,[rel PW_F0707] ; xmm0=z3 + + movdqa xmm4,xmm7 ; xmm4=tmp10 + psubw xmm7,xmm1 + pmulhw xmm7,[rel PW_F0382] ; xmm7=z5 + pmulhw xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) + pmulhw xmm1,[rel PW_F1306] ; xmm1=MULTIPLY(tmp12,FIX_1_306562) + paddw xmm4,xmm7 ; xmm4=z2 + paddw xmm1,xmm7 ; xmm1=z4 + + movdqa xmm3,xmm5 + psubw xmm5,xmm0 ; xmm5=z13 + paddw xmm3,xmm0 ; xmm3=z11 + + movdqa xmm6,xmm5 + movdqa xmm2,xmm3 + psubw xmm5,xmm4 ; xmm5=data3 + psubw xmm3,xmm1 ; xmm3=data7 + paddw xmm6,xmm4 ; xmm6=data5 + paddw xmm2,xmm1 ; xmm2=data1 + + movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5 + movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3 + movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6 + movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2 + + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/jfss2fst.asm b/simd/jfss2fst.asm index 73fc9e51a..8bc05f4fc 100644 --- a/simd/jfss2fst.asm +++ b/simd/jfss2fst.asm @@ -26,24 +26,24 @@ ; -------------------------------------------------------------------------- -%define CONST_BITS 8 ; 14 is also OK. +%define CONST_BITS 8 ; 14 is also OK. 
%if CONST_BITS == 8 -F_0_382 equ 98 ; FIX(0.382683433) -F_0_541 equ 139 ; FIX(0.541196100) -F_0_707 equ 181 ; FIX(0.707106781) -F_1_306 equ 334 ; FIX(1.306562965) +F_0_382 equ 98 ; FIX(0.382683433) +F_0_541 equ 139 ; FIX(0.541196100) +F_0_707 equ 181 ; FIX(0.707106781) +F_1_306 equ 334 ; FIX(1.306562965) %else ; NASM cannot do compile-time arithmetic on floating-point constants. %define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) -F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433) -F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) -F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781) -F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965) +F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433) +F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) +F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781) +F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965) %endif ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST ; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) ; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) @@ -51,21 +51,21 @@ F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965) %define PRE_MULTIPLY_SCALE_BITS 2 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) - alignz 16 - global EXTN(jconst_fdct_ifast_sse2) + alignz 16 + global EXTN(jconst_fdct_ifast_sse2) EXTN(jconst_fdct_ifast_sse2): -PW_F0707 times 8 dw F_0_707 << CONST_SHIFT -PW_F0382 times 8 dw F_0_382 << CONST_SHIFT -PW_F0541 times 8 dw F_0_541 << CONST_SHIFT -PW_F1306 times 8 dw F_1_306 << CONST_SHIFT +PW_F0707 times 8 dw F_0_707 << CONST_SHIFT +PW_F0382 times 8 dw F_0_382 << CONST_SHIFT +PW_F0541 times 8 dw F_0_541 << CONST_SHIFT +PW_F1306 times 8 dw F_1_306 << CONST_SHIFT - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 
+ SECTION SEG_TEXT + BITS 32 ; ; Perform the forward DCT on one block of samples. ; @@ -73,332 +73,332 @@ PW_F1306 times 8 dw F_1_306 << CONST_SHIFT ; jsimd_fdct_ifast_sse2 (DCTELEM * data) ; -%define data(b) (b)+8 ; DCTELEM * data +%define data(b) (b)+8 ; DCTELEM * data -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 2 +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 - align 16 - global EXTN(jsimd_fdct_ifast_sse2) + align 16 + global EXTN(jsimd_fdct_ifast_sse2) EXTN(jsimd_fdct_ifast_sse2): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic ebx -; push ecx ; unused -; push edx ; need not be preserved -; push esi ; unused -; push edi ; unused - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process rows. - - mov edx, POINTER [data(eax)] ; (DCTELEM *) - - movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)] - movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)] - movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)] - movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)] - - ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) - ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) - - movdqa xmm4,xmm0 ; transpose coefficients(phase 1) - punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13) - punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17) - movdqa xmm5,xmm2 ; transpose coefficients(phase 1) - punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33) - punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37) - - movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)] - movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)] - movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)] - movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)] - - ; xmm6=( 4 12 20 28 36 
44 52 60), xmm1=( 6 14 22 30 38 46 54 62) - ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63) - - movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) - movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) - - movdqa xmm2,xmm6 ; transpose coefficients(phase 1) - punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53) - punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57) - movdqa xmm5,xmm1 ; transpose coefficients(phase 1) - punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73) - punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77) - - movdqa xmm7,xmm6 ; transpose coefficients(phase 2) - punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71) - punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73) - movdqa xmm3,xmm2 ; transpose coefficients(phase 2) - punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75) - punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77) - - movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) - movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37) - movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73) - movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75) - - movdqa xmm7,xmm0 ; transpose coefficients(phase 2) - punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31) - punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33) - movdqa xmm2,xmm4 ; transpose coefficients(phase 2) - punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35) - punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37) - - movdqa xmm1,xmm0 ; transpose coefficients(phase 3) - punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 - punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 - movdqa xmm5,xmm2 ; transpose coefficients(phase 3) - punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 - punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 - - movdqa xmm6,xmm1 - movdqa xmm3,xmm0 - psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6 - psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7 - paddw xmm6,xmm2 ; 
xmm6=data1+data6=tmp1 - paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0 - - movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73) - movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75) - movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 - movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 - - movdqa xmm1,xmm7 ; transpose coefficients(phase 3) - punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 - punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 - movdqa xmm0,xmm4 ; transpose coefficients(phase 3) - punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 - punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 - - movdqa xmm2,xmm1 - movdqa xmm5,xmm7 - paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3 - paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2 - psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4 - psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5 - - ; -- Even part - - movdqa xmm4,xmm3 - movdqa xmm0,xmm6 - psubw xmm3,xmm1 ; xmm3=tmp13 - psubw xmm6,xmm7 ; xmm6=tmp12 - paddw xmm4,xmm1 ; xmm4=tmp10 - paddw xmm0,xmm7 ; xmm0=tmp11 - - paddw xmm6,xmm3 - psllw xmm6,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm6,[GOTOFF(ebx,PW_F0707)] ; xmm6=z1 - - movdqa xmm1,xmm4 - movdqa xmm7,xmm3 - psubw xmm4,xmm0 ; xmm4=data4 - psubw xmm3,xmm6 ; xmm3=data6 - paddw xmm1,xmm0 ; xmm1=data0 - paddw xmm7,xmm6 ; xmm7=data2 - - movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6 - movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7 - movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4 - movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6 - - ; -- Odd part - - paddw xmm2,xmm5 ; xmm2=tmp10 - paddw xmm5,xmm0 ; xmm5=tmp11 - paddw xmm0,xmm6 ; xmm0=tmp12, xmm6=tmp7 - - psllw xmm2,PRE_MULTIPLY_SCALE_BITS - psllw xmm0,PRE_MULTIPLY_SCALE_BITS - - psllw xmm5,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z3 - - movdqa xmm4,xmm2 ; xmm4=tmp10 - psubw xmm2,xmm0 - pmulhw xmm2,[GOTOFF(ebx,PW_F0382)] ; xmm2=z5 - pmulhw xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) - pmulhw xmm0,[GOTOFF(ebx,PW_F1306)] ; 
xmm0=MULTIPLY(tmp12,FIX_1_306562) - paddw xmm4,xmm2 ; xmm4=z2 - paddw xmm0,xmm2 ; xmm0=z4 - - movdqa xmm3,xmm6 - psubw xmm6,xmm5 ; xmm6=z13 - paddw xmm3,xmm5 ; xmm3=z11 - - movdqa xmm2,xmm6 - movdqa xmm5,xmm3 - psubw xmm6,xmm4 ; xmm6=data3 - psubw xmm3,xmm0 ; xmm3=data7 - paddw xmm2,xmm4 ; xmm2=data5 - paddw xmm5,xmm0 ; xmm5=data1 - - ; ---- Pass 2: process columns. - -; mov edx, POINTER [data(eax)] ; (DCTELEM *) - - ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72) - ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73) - - movdqa xmm4,xmm1 ; transpose coefficients(phase 1) - punpcklwd xmm1,xmm5 ; xmm1=(00 01 10 11 20 21 30 31) - punpckhwd xmm4,xmm5 ; xmm4=(40 41 50 51 60 61 70 71) - movdqa xmm0,xmm7 ; transpose coefficients(phase 1) - punpcklwd xmm7,xmm6 ; xmm7=(02 03 12 13 22 23 32 33) - punpckhwd xmm0,xmm6 ; xmm0=(42 43 52 53 62 63 72 73) - - movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4 - movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6 - - ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76) - ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77) - - movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33) - movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73) - - movdqa xmm7,xmm5 ; transpose coefficients(phase 1) - punpcklwd xmm5,xmm2 ; xmm5=(04 05 14 15 24 25 34 35) - punpckhwd xmm7,xmm2 ; xmm7=(44 45 54 55 64 65 74 75) - movdqa xmm0,xmm6 ; transpose coefficients(phase 1) - punpcklwd xmm6,xmm3 ; xmm6=(06 07 16 17 26 27 36 37) - punpckhwd xmm0,xmm3 ; xmm0=(46 47 56 57 66 67 76 77) - - movdqa xmm2,xmm5 ; transpose coefficients(phase 2) - punpckldq xmm5,xmm6 ; xmm5=(04 05 06 07 14 15 16 17) - punpckhdq xmm2,xmm6 ; xmm2=(24 25 26 27 34 35 36 37) - movdqa xmm3,xmm7 ; transpose coefficients(phase 2) - punpckldq xmm7,xmm0 ; xmm7=(44 45 46 47 54 55 56 57) - punpckhdq xmm3,xmm0 ; xmm3=(64 65 66 67 74 75 76 77) - - movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33) - movdqa xmm0, XMMWORD [wk(1)] ; 
xmm0=(42 43 52 53 62 63 72 73) - movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37) - movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57) - - movdqa xmm2,xmm1 ; transpose coefficients(phase 2) - punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 10 11 12 13) - punpckhdq xmm2,xmm6 ; xmm2=(20 21 22 23 30 31 32 33) - movdqa xmm7,xmm4 ; transpose coefficients(phase 2) - punpckldq xmm4,xmm0 ; xmm4=(40 41 42 43 50 51 52 53) - punpckhdq xmm7,xmm0 ; xmm7=(60 61 62 63 70 71 72 73) - - movdqa xmm6,xmm1 ; transpose coefficients(phase 3) - punpcklqdq xmm1,xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0 - punpckhqdq xmm6,xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1 - movdqa xmm0,xmm7 ; transpose coefficients(phase 3) - punpcklqdq xmm7,xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6 - punpckhqdq xmm0,xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7 - - movdqa xmm5,xmm6 - movdqa xmm3,xmm1 - psubw xmm6,xmm7 ; xmm6=data1-data6=tmp6 - psubw xmm1,xmm0 ; xmm1=data0-data7=tmp7 - paddw xmm5,xmm7 ; xmm5=data1+data6=tmp1 - paddw xmm3,xmm0 ; xmm3=data0+data7=tmp0 - - movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37) - movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57) - movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6 - movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7 - - movdqa xmm6,xmm2 ; transpose coefficients(phase 3) - punpcklqdq xmm2,xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2 - punpckhqdq xmm6,xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3 - movdqa xmm1,xmm4 ; transpose coefficients(phase 3) - punpcklqdq xmm4,xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4 - punpckhqdq xmm1,xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5 - - movdqa xmm7,xmm6 - movdqa xmm0,xmm2 - paddw xmm6,xmm4 ; xmm6=data3+data4=tmp3 - paddw xmm2,xmm1 ; xmm2=data2+data5=tmp2 - psubw xmm7,xmm4 ; xmm7=data3-data4=tmp4 - psubw xmm0,xmm1 ; xmm0=data2-data5=tmp5 - - ; -- Even part - - movdqa xmm4,xmm3 - movdqa xmm1,xmm5 - psubw xmm3,xmm6 ; xmm3=tmp13 - psubw xmm5,xmm2 ; xmm5=tmp12 - paddw xmm4,xmm6 ; xmm4=tmp10 - paddw 
xmm1,xmm2 ; xmm1=tmp11 - - paddw xmm5,xmm3 - psllw xmm5,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z1 - - movdqa xmm6,xmm4 - movdqa xmm2,xmm3 - psubw xmm4,xmm1 ; xmm4=data4 - psubw xmm3,xmm5 ; xmm3=data6 - paddw xmm6,xmm1 ; xmm6=data0 - paddw xmm2,xmm5 ; xmm2=data2 - - movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4 - movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3 - movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6 - movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2 - - ; -- Odd part - - movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6 - movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 - - paddw xmm7,xmm0 ; xmm7=tmp10 - paddw xmm0,xmm1 ; xmm0=tmp11 - paddw xmm1,xmm5 ; xmm1=tmp12, xmm5=tmp7 - - psllw xmm7,PRE_MULTIPLY_SCALE_BITS - psllw xmm1,PRE_MULTIPLY_SCALE_BITS - - psllw xmm0,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm0,[GOTOFF(ebx,PW_F0707)] ; xmm0=z3 - - movdqa xmm4,xmm7 ; xmm4=tmp10 - psubw xmm7,xmm1 - pmulhw xmm7,[GOTOFF(ebx,PW_F0382)] ; xmm7=z5 - pmulhw xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) - pmulhw xmm1,[GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562) - paddw xmm4,xmm7 ; xmm4=z2 - paddw xmm1,xmm7 ; xmm1=z4 - - movdqa xmm3,xmm5 - psubw xmm5,xmm0 ; xmm5=z13 - paddw xmm3,xmm0 ; xmm3=z11 - - movdqa xmm6,xmm5 - movdqa xmm2,xmm3 - psubw xmm5,xmm4 ; xmm5=data3 - psubw xmm3,xmm1 ; xmm3=data7 - paddw xmm6,xmm4 ; xmm6=data5 - paddw xmm2,xmm1 ; xmm2=data1 - - movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5 - movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3 - movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6 - movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2 - -; pop edi ; unused -; pop esi ; unused -; pop edx ; need not be preserved -; pop ecx ; unused - poppic ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov 
[esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; unused +; push edx ; need not be preserved +; push esi ; unused +; push edi ; unused + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process rows. + + mov edx, POINTER [data(eax)] ; (DCTELEM *) + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)] + movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)] + + ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) + ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) + + movdqa xmm4,xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13) + punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17) + movdqa xmm5,xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37) + + movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)] + + ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62) + ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) + + movdqa xmm2,xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53) + punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57) + movdqa xmm5,xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73) + punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77) + + movdqa xmm7,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71) + punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73) + movdqa 
xmm3,xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37) + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73) + movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75) + + movdqa xmm7,xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31) + punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33) + movdqa xmm2,xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35) + punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa xmm1,xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 + punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 + movdqa xmm5,xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 + punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 + + movdqa xmm6,xmm1 + movdqa xmm3,xmm0 + psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6 + psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7 + paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1 + paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0 + + movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75) + movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 + + movdqa xmm1,xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 + punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 + movdqa xmm0,xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 + punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 + + movdqa xmm2,xmm1 + movdqa xmm5,xmm7 + paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3 + paddw xmm7,xmm0 ; 
xmm7=data2+data5=tmp2 + psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4 + psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4,xmm3 + movdqa xmm0,xmm6 + psubw xmm3,xmm1 ; xmm3=tmp13 + psubw xmm6,xmm7 ; xmm6=tmp12 + paddw xmm4,xmm1 ; xmm4=tmp10 + paddw xmm0,xmm7 ; xmm0=tmp11 + + paddw xmm6,xmm3 + psllw xmm6,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm6,[GOTOFF(ebx,PW_F0707)] ; xmm6=z1 + + movdqa xmm1,xmm4 + movdqa xmm7,xmm3 + psubw xmm4,xmm0 ; xmm4=data4 + psubw xmm3,xmm6 ; xmm3=data6 + paddw xmm1,xmm0 ; xmm1=data0 + paddw xmm7,xmm6 ; xmm7=data2 + + movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6 + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7 + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4 + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6 + + ; -- Odd part + + paddw xmm2,xmm5 ; xmm2=tmp10 + paddw xmm5,xmm0 ; xmm5=tmp11 + paddw xmm0,xmm6 ; xmm0=tmp12, xmm6=tmp7 + + psllw xmm2,PRE_MULTIPLY_SCALE_BITS + psllw xmm0,PRE_MULTIPLY_SCALE_BITS + + psllw xmm5,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z3 + + movdqa xmm4,xmm2 ; xmm4=tmp10 + psubw xmm2,xmm0 + pmulhw xmm2,[GOTOFF(ebx,PW_F0382)] ; xmm2=z5 + pmulhw xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) + pmulhw xmm0,[GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562) + paddw xmm4,xmm2 ; xmm4=z2 + paddw xmm0,xmm2 ; xmm0=z4 + + movdqa xmm3,xmm6 + psubw xmm6,xmm5 ; xmm6=z13 + paddw xmm3,xmm5 ; xmm3=z11 + + movdqa xmm2,xmm6 + movdqa xmm5,xmm3 + psubw xmm6,xmm4 ; xmm6=data3 + psubw xmm3,xmm0 ; xmm3=data7 + paddw xmm2,xmm4 ; xmm2=data5 + paddw xmm5,xmm0 ; xmm5=data1 + + ; ---- Pass 2: process columns. 
+ +; mov edx, POINTER [data(eax)] ; (DCTELEM *) + + ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72) + ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73) + + movdqa xmm4,xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1,xmm5 ; xmm1=(00 01 10 11 20 21 30 31) + punpckhwd xmm4,xmm5 ; xmm4=(40 41 50 51 60 61 70 71) + movdqa xmm0,xmm7 ; transpose coefficients(phase 1) + punpcklwd xmm7,xmm6 ; xmm7=(02 03 12 13 22 23 32 33) + punpckhwd xmm0,xmm6 ; xmm0=(42 43 52 53 62 63 72 73) + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4 + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6 + + ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76) + ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33) + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73) + + movdqa xmm7,xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5,xmm2 ; xmm5=(04 05 14 15 24 25 34 35) + punpckhwd xmm7,xmm2 ; xmm7=(44 45 54 55 64 65 74 75) + movdqa xmm0,xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6,xmm3 ; xmm6=(06 07 16 17 26 27 36 37) + punpckhwd xmm0,xmm3 ; xmm0=(46 47 56 57 66 67 76 77) + + movdqa xmm2,xmm5 ; transpose coefficients(phase 2) + punpckldq xmm5,xmm6 ; xmm5=(04 05 06 07 14 15 16 17) + punpckhdq xmm2,xmm6 ; xmm2=(24 25 26 27 34 35 36 37) + movdqa xmm3,xmm7 ; transpose coefficients(phase 2) + punpckldq xmm7,xmm0 ; xmm7=(44 45 46 47 54 55 56 57) + punpckhdq xmm3,xmm0 ; xmm3=(64 65 66 67 74 75 76 77) + + movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33) + movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73) + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37) + movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 55 56 57) + + movdqa xmm2,xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 10 11 12 13) + punpckhdq xmm2,xmm6 ; xmm2=(20 21 22 23 30 31 32 33) + movdqa xmm7,xmm4 ; transpose coefficients(phase 
2) + punpckldq xmm4,xmm0 ; xmm4=(40 41 42 43 50 51 52 53) + punpckhdq xmm7,xmm0 ; xmm7=(60 61 62 63 70 71 72 73) + + movdqa xmm6,xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1,xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0 + punpckhqdq xmm6,xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1 + movdqa xmm0,xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7,xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6 + punpckhqdq xmm0,xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7 + + movdqa xmm5,xmm6 + movdqa xmm3,xmm1 + psubw xmm6,xmm7 ; xmm6=data1-data6=tmp6 + psubw xmm1,xmm0 ; xmm1=data0-data7=tmp7 + paddw xmm5,xmm7 ; xmm5=data1+data6=tmp1 + paddw xmm3,xmm0 ; xmm3=data0+data7=tmp0 + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37) + movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57) + movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7 + + movdqa xmm6,xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2,xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2 + punpckhqdq xmm6,xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3 + movdqa xmm1,xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4,xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4 + punpckhqdq xmm1,xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5 + + movdqa xmm7,xmm6 + movdqa xmm0,xmm2 + paddw xmm6,xmm4 ; xmm6=data3+data4=tmp3 + paddw xmm2,xmm1 ; xmm2=data2+data5=tmp2 + psubw xmm7,xmm4 ; xmm7=data3-data4=tmp4 + psubw xmm0,xmm1 ; xmm0=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4,xmm3 + movdqa xmm1,xmm5 + psubw xmm3,xmm6 ; xmm3=tmp13 + psubw xmm5,xmm2 ; xmm5=tmp12 + paddw xmm4,xmm6 ; xmm4=tmp10 + paddw xmm1,xmm2 ; xmm1=tmp11 + + paddw xmm5,xmm3 + psllw xmm5,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z1 + + movdqa xmm6,xmm4 + movdqa xmm2,xmm3 + psubw xmm4,xmm1 ; xmm4=data4 + psubw xmm3,xmm5 ; xmm3=data6 + paddw xmm6,xmm1 ; xmm6=data0 + paddw xmm2,xmm5 ; xmm2=data2 + + movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD 
[XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3 + movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6 + movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2 + + ; -- Odd part + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 + + paddw xmm7,xmm0 ; xmm7=tmp10 + paddw xmm0,xmm1 ; xmm0=tmp11 + paddw xmm1,xmm5 ; xmm1=tmp12, xmm5=tmp7 + + psllw xmm7,PRE_MULTIPLY_SCALE_BITS + psllw xmm1,PRE_MULTIPLY_SCALE_BITS + + psllw xmm0,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm0,[GOTOFF(ebx,PW_F0707)] ; xmm0=z3 + + movdqa xmm4,xmm7 ; xmm4=tmp10 + psubw xmm7,xmm1 + pmulhw xmm7,[GOTOFF(ebx,PW_F0382)] ; xmm7=z5 + pmulhw xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) + pmulhw xmm1,[GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562) + paddw xmm4,xmm7 ; xmm4=z2 + paddw xmm1,xmm7 ; xmm1=z4 + + movdqa xmm3,xmm5 + psubw xmm5,xmm0 ; xmm5=z13 + paddw xmm3,xmm0 ; xmm3=z11 + + movdqa xmm6,xmm5 + movdqa xmm2,xmm3 + psubw xmm5,xmm4 ; xmm5=data3 + psubw xmm3,xmm1 ; xmm3=data7 + paddw xmm6,xmm4 ; xmm6=data5 + paddw xmm2,xmm1 ; xmm2=data1 + + movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5 + movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3 + movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6 + movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2 + +; pop edi ; unused +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; unused + poppic ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jfss2int-64.asm b/simd/jfss2int-64.asm index bd1bd45ab..f5ec3465f 100644 --- a/simd/jfss2int-64.asm +++ b/simd/jfss2int-64.asm @@ -27,67 +27,67 @@ ; -------------------------------------------------------------------------- -%define CONST_BITS 13 -%define PASS1_BITS 2 +%define CONST_BITS 13 +%define PASS1_BITS 2 -%define DESCALE_P1 (CONST_BITS-PASS1_BITS) -%define DESCALE_P2 (CONST_BITS+PASS1_BITS) +%define DESCALE_P1 (CONST_BITS-PASS1_BITS) +%define DESCALE_P2 (CONST_BITS+PASS1_BITS) %if CONST_BITS == 13 -F_0_298 equ 2446 ; FIX(0.298631336) -F_0_390 equ 3196 ; FIX(0.390180644) -F_0_541 equ 4433 ; FIX(0.541196100) -F_0_765 equ 6270 ; FIX(0.765366865) -F_0_899 equ 7373 ; FIX(0.899976223) -F_1_175 equ 9633 ; FIX(1.175875602) -F_1_501 equ 12299 ; FIX(1.501321110) -F_1_847 equ 15137 ; FIX(1.847759065) -F_1_961 equ 16069 ; FIX(1.961570560) -F_2_053 equ 16819 ; FIX(2.053119869) -F_2_562 equ 20995 ; FIX(2.562915447) -F_3_072 equ 25172 ; FIX(3.072711026) +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) %else ; NASM cannot do compile-time arithmetic on floating-point constants. 
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) -F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) -F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) -F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) -F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) -F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) -F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) -F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) -F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) -F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) -F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) -F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) -F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) +F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) %endif ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_fdct_islow_sse2) + alignz 16 + global EXTN(jconst_fdct_islow_sse2) EXTN(jconst_fdct_islow_sse2): -PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541 -PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847) 
-PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175 -PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390) -PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899 -PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899) -PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562 -PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562) -PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) -PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) -PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1) +PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541 +PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847) +PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175 +PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390) +PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899 +PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899) +PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562 +PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562) +PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) +PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) +PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1) - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 64 + SECTION SEG_TEXT + BITS 64 ; ; Perform the forward DCT on one block of samples. ; @@ -97,526 +97,526 @@ PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1) ; r10 = DCTELEM * data -%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 6 +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 6 - align 16 - global EXTN(jsimd_fdct_islow_sse2) + align 16 + global EXTN(jsimd_fdct_islow_sse2) EXTN(jsimd_fdct_islow_sse2): - push rbp - mov rax,rsp ; rax = original rbp - sub rsp, byte 4 - and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp],rax - mov rbp,rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - collect_args - - ; ---- Pass 1: process rows. 
- - mov rdx, r10 ; (DCTELEM *) - - movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)] - movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)] - movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)] - movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)] - - ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) - ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) - - movdqa xmm4,xmm0 ; transpose coefficients(phase 1) - punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13) - punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17) - movdqa xmm5,xmm2 ; transpose coefficients(phase 1) - punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33) - punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37) - - movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)] - movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)] - movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)] - movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)] - - ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62) - ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63) - - movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) - movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) - - movdqa xmm2,xmm6 ; transpose coefficients(phase 1) - punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53) - punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57) - movdqa xmm5,xmm1 ; transpose coefficients(phase 1) - punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73) - punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77) - - movdqa xmm7,xmm6 ; transpose coefficients(phase 2) - punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71) - punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73) - movdqa xmm3,xmm2 ; transpose coefficients(phase 2) - punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75) - punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77) - - movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) - movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 
25 35 26 36 27 37) - movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73) - movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75) - - movdqa xmm7,xmm0 ; transpose coefficients(phase 2) - punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31) - punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33) - movdqa xmm2,xmm4 ; transpose coefficients(phase 2) - punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35) - punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37) - - movdqa xmm1,xmm0 ; transpose coefficients(phase 3) - punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 - punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 - movdqa xmm5,xmm2 ; transpose coefficients(phase 3) - punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 - punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 - - movdqa xmm6,xmm1 - movdqa xmm3,xmm0 - psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6 - psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7 - paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1 - paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0 - - movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73) - movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75) - movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 - movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 - - movdqa xmm1,xmm7 ; transpose coefficients(phase 3) - punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 - punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 - movdqa xmm0,xmm4 ; transpose coefficients(phase 3) - punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 - punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 - - movdqa xmm2,xmm1 - movdqa xmm5,xmm7 - paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3 - paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2 - psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4 - psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5 - - ; -- Even part - - movdqa xmm4,xmm3 - movdqa xmm0,xmm6 - paddw xmm3,xmm1 ; xmm3=tmp10 - paddw xmm6,xmm7 ; xmm6=tmp11 - psubw xmm4,xmm1 ; xmm4=tmp13 - psubw xmm0,xmm7 ; 
xmm0=tmp12 - - movdqa xmm1,xmm3 - paddw xmm3,xmm6 ; xmm3=tmp10+tmp11 - psubw xmm1,xmm6 ; xmm1=tmp10-tmp11 - - psllw xmm3,PASS1_BITS ; xmm3=data0 - psllw xmm1,PASS1_BITS ; xmm1=data4 - - movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0 - movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4 - - ; (Original) - ; z1 = (tmp12 + tmp13) * 0.541196100; - ; data2 = z1 + tmp13 * 0.765366865; - ; data6 = z1 + tmp12 * -1.847759065; - ; - ; (This implementation) - ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; - ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); - - movdqa xmm7,xmm4 ; xmm4=tmp13 - movdqa xmm6,xmm4 - punpcklwd xmm7,xmm0 ; xmm0=tmp12 - punpckhwd xmm6,xmm0 - movdqa xmm4,xmm7 - movdqa xmm0,xmm6 - pmaddwd xmm7,[rel PW_F130_F054] ; xmm7=data2L - pmaddwd xmm6,[rel PW_F130_F054] ; xmm6=data2H - pmaddwd xmm4,[rel PW_F054_MF130] ; xmm4=data6L - pmaddwd xmm0,[rel PW_F054_MF130] ; xmm0=data6H - - paddd xmm7,[rel PD_DESCALE_P1] - paddd xmm6,[rel PD_DESCALE_P1] - psrad xmm7,DESCALE_P1 - psrad xmm6,DESCALE_P1 - paddd xmm4,[rel PD_DESCALE_P1] - paddd xmm0,[rel PD_DESCALE_P1] - psrad xmm4,DESCALE_P1 - psrad xmm0,DESCALE_P1 - - packssdw xmm7,xmm6 ; xmm7=data2 - packssdw xmm4,xmm0 ; xmm4=data6 - - movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2 - movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6 - - ; -- Odd part - - movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6 - movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7 - - movdqa xmm6,xmm2 ; xmm2=tmp4 - movdqa xmm0,xmm5 ; xmm5=tmp5 - paddw xmm6,xmm3 ; xmm6=z3 - paddw xmm0,xmm1 ; xmm0=z4 - - ; (Original) - ; z5 = (z3 + z4) * 1.175875602; - ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; - ; z3 += z5; z4 += z5; - ; - ; (This implementation) - ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; - ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); - - movdqa xmm7,xmm6 - movdqa xmm4,xmm6 - punpcklwd xmm7,xmm0 - punpckhwd xmm4,xmm0 - movdqa xmm6,xmm7 - movdqa xmm0,xmm4 - pmaddwd xmm7,[rel PW_MF078_F117] ; xmm7=z3L - pmaddwd 
xmm4,[rel PW_MF078_F117] ; xmm4=z3H - pmaddwd xmm6,[rel PW_F117_F078] ; xmm6=z4L - pmaddwd xmm0,[rel PW_F117_F078] ; xmm0=z4H - - movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L - movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H - - ; (Original) - ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; - ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; - ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; - ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; - ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; - ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; - ; - ; (This implementation) - ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; - ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; - ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); - ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); - ; data7 = tmp4 + z3; data5 = tmp5 + z4; - ; data3 = tmp6 + z3; data1 = tmp7 + z4; - - movdqa xmm7,xmm2 - movdqa xmm4,xmm2 - punpcklwd xmm7,xmm1 - punpckhwd xmm4,xmm1 - movdqa xmm2,xmm7 - movdqa xmm1,xmm4 - pmaddwd xmm7,[rel PW_MF060_MF089] ; xmm7=tmp4L - pmaddwd xmm4,[rel PW_MF060_MF089] ; xmm4=tmp4H - pmaddwd xmm2,[rel PW_MF089_F060] ; xmm2=tmp7L - pmaddwd xmm1,[rel PW_MF089_F060] ; xmm1=tmp7H - - paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L - paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H - paddd xmm2,xmm6 ; xmm2=data1L - paddd xmm1,xmm0 ; xmm1=data1H - - paddd xmm7,[rel PD_DESCALE_P1] - paddd xmm4,[rel PD_DESCALE_P1] - psrad xmm7,DESCALE_P1 - psrad xmm4,DESCALE_P1 - paddd xmm2,[rel PD_DESCALE_P1] - paddd xmm1,[rel PD_DESCALE_P1] - psrad xmm2,DESCALE_P1 - psrad xmm1,DESCALE_P1 - - packssdw xmm7,xmm4 ; xmm7=data7 - packssdw xmm2,xmm1 ; xmm2=data1 - - movdqa xmm4,xmm5 - movdqa xmm1,xmm5 - punpcklwd xmm4,xmm3 - punpckhwd xmm1,xmm3 - movdqa xmm5,xmm4 - movdqa xmm3,xmm1 - pmaddwd xmm4,[rel PW_MF050_MF256] ; xmm4=tmp5L - pmaddwd xmm1,[rel PW_MF050_MF256] ; xmm1=tmp5H - pmaddwd xmm5,[rel PW_MF256_F050] ; xmm5=tmp6L - pmaddwd xmm3,[rel 
PW_MF256_F050] ; xmm3=tmp6H - - paddd xmm4,xmm6 ; xmm4=data5L - paddd xmm1,xmm0 ; xmm1=data5H - paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L - paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H - - paddd xmm4,[rel PD_DESCALE_P1] - paddd xmm1,[rel PD_DESCALE_P1] - psrad xmm4,DESCALE_P1 - psrad xmm1,DESCALE_P1 - paddd xmm5,[rel PD_DESCALE_P1] - paddd xmm3,[rel PD_DESCALE_P1] - psrad xmm5,DESCALE_P1 - psrad xmm3,DESCALE_P1 - - packssdw xmm4,xmm1 ; xmm4=data5 - packssdw xmm5,xmm3 ; xmm5=data3 - - ; ---- Pass 2: process columns. - - movdqa xmm6, XMMWORD [wk(2)] ; xmm6=col0 - movdqa xmm0, XMMWORD [wk(4)] ; xmm0=col2 - - ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72) - ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73) - - movdqa xmm1,xmm6 ; transpose coefficients(phase 1) - punpcklwd xmm6,xmm2 ; xmm6=(00 01 10 11 20 21 30 31) - punpckhwd xmm1,xmm2 ; xmm1=(40 41 50 51 60 61 70 71) - movdqa xmm3,xmm0 ; transpose coefficients(phase 1) - punpcklwd xmm0,xmm5 ; xmm0=(02 03 12 13 22 23 32 33) - punpckhwd xmm3,xmm5 ; xmm3=(42 43 52 53 62 63 72 73) - - movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4 - movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6 - - ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76) - ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77) - - movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33) - movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73) - - movdqa xmm0,xmm2 ; transpose coefficients(phase 1) - punpcklwd xmm2,xmm4 ; xmm2=(04 05 14 15 24 25 34 35) - punpckhwd xmm0,xmm4 ; xmm0=(44 45 54 55 64 65 74 75) - movdqa xmm3,xmm5 ; transpose coefficients(phase 1) - punpcklwd xmm5,xmm7 ; xmm5=(06 07 16 17 26 27 36 37) - punpckhwd xmm3,xmm7 ; xmm3=(46 47 56 57 66 67 76 77) - - movdqa xmm4,xmm2 ; transpose coefficients(phase 2) - punpckldq xmm2,xmm5 ; xmm2=(04 05 06 07 14 15 16 17) - punpckhdq xmm4,xmm5 ; xmm4=(24 25 26 27 34 35 36 37) - movdqa xmm7,xmm0 ; transpose coefficients(phase 2) - punpckldq xmm0,xmm3 ; 
xmm0=(44 45 46 47 54 55 56 57) - punpckhdq xmm7,xmm3 ; xmm7=(64 65 66 67 74 75 76 77) - - movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33) - movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73) - movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37) - movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57) - - movdqa xmm4,xmm6 ; transpose coefficients(phase 2) - punpckldq xmm6,xmm5 ; xmm6=(00 01 02 03 10 11 12 13) - punpckhdq xmm4,xmm5 ; xmm4=(20 21 22 23 30 31 32 33) - movdqa xmm0,xmm1 ; transpose coefficients(phase 2) - punpckldq xmm1,xmm3 ; xmm1=(40 41 42 43 50 51 52 53) - punpckhdq xmm0,xmm3 ; xmm0=(60 61 62 63 70 71 72 73) - - movdqa xmm5,xmm6 ; transpose coefficients(phase 3) - punpcklqdq xmm6,xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0 - punpckhqdq xmm5,xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1 - movdqa xmm3,xmm0 ; transpose coefficients(phase 3) - punpcklqdq xmm0,xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6 - punpckhqdq xmm3,xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7 - - movdqa xmm2,xmm5 - movdqa xmm7,xmm6 - psubw xmm5,xmm0 ; xmm5=data1-data6=tmp6 - psubw xmm6,xmm3 ; xmm6=data0-data7=tmp7 - paddw xmm2,xmm0 ; xmm2=data1+data6=tmp1 - paddw xmm7,xmm3 ; xmm7=data0+data7=tmp0 - - movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37) - movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57) - movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6 - movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 - - movdqa xmm5,xmm4 ; transpose coefficients(phase 3) - punpcklqdq xmm4,xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2 - punpckhqdq xmm5,xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3 - movdqa xmm6,xmm1 ; transpose coefficients(phase 3) - punpcklqdq xmm1,xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4 - punpckhqdq xmm6,xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5 - - movdqa xmm0,xmm5 - movdqa xmm3,xmm4 - paddw xmm5,xmm1 ; xmm5=data3+data4=tmp3 - paddw xmm4,xmm6 ; xmm4=data2+data5=tmp2 - psubw xmm0,xmm1 ; xmm0=data3-data4=tmp4 - psubw xmm3,xmm6 ; 
xmm3=data2-data5=tmp5 - - ; -- Even part - - movdqa xmm1,xmm7 - movdqa xmm6,xmm2 - paddw xmm7,xmm5 ; xmm7=tmp10 - paddw xmm2,xmm4 ; xmm2=tmp11 - psubw xmm1,xmm5 ; xmm1=tmp13 - psubw xmm6,xmm4 ; xmm6=tmp12 - - movdqa xmm5,xmm7 - paddw xmm7,xmm2 ; xmm7=tmp10+tmp11 - psubw xmm5,xmm2 ; xmm5=tmp10-tmp11 - - paddw xmm7,[rel PW_DESCALE_P2X] - paddw xmm5,[rel PW_DESCALE_P2X] - psraw xmm7,PASS1_BITS ; xmm7=data0 - psraw xmm5,PASS1_BITS ; xmm5=data4 - - movdqa XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm7 - movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm5 - - ; (Original) - ; z1 = (tmp12 + tmp13) * 0.541196100; - ; data2 = z1 + tmp13 * 0.765366865; - ; data6 = z1 + tmp12 * -1.847759065; - ; - ; (This implementation) - ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; - ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); - - movdqa xmm4,xmm1 ; xmm1=tmp13 - movdqa xmm2,xmm1 - punpcklwd xmm4,xmm6 ; xmm6=tmp12 - punpckhwd xmm2,xmm6 - movdqa xmm1,xmm4 - movdqa xmm6,xmm2 - pmaddwd xmm4,[rel PW_F130_F054] ; xmm4=data2L - pmaddwd xmm2,[rel PW_F130_F054] ; xmm2=data2H - pmaddwd xmm1,[rel PW_F054_MF130] ; xmm1=data6L - pmaddwd xmm6,[rel PW_F054_MF130] ; xmm6=data6H - - paddd xmm4,[rel PD_DESCALE_P2] - paddd xmm2,[rel PD_DESCALE_P2] - psrad xmm4,DESCALE_P2 - psrad xmm2,DESCALE_P2 - paddd xmm1,[rel PD_DESCALE_P2] - paddd xmm6,[rel PD_DESCALE_P2] - psrad xmm1,DESCALE_P2 - psrad xmm6,DESCALE_P2 - - packssdw xmm4,xmm2 ; xmm4=data2 - packssdw xmm1,xmm6 ; xmm1=data6 - - movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm4 - movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm1 - - ; -- Odd part - - movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6 - movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 - - movdqa xmm2,xmm0 ; xmm0=tmp4 - movdqa xmm6,xmm3 ; xmm3=tmp5 - paddw xmm2,xmm7 ; xmm2=z3 - paddw xmm6,xmm5 ; xmm6=z4 - - ; (Original) - ; z5 = (z3 + z4) * 1.175875602; - ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; - ; z3 += z5; z4 += z5; - ; - ; (This 
implementation) - ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; - ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); - - movdqa xmm4,xmm2 - movdqa xmm1,xmm2 - punpcklwd xmm4,xmm6 - punpckhwd xmm1,xmm6 - movdqa xmm2,xmm4 - movdqa xmm6,xmm1 - pmaddwd xmm4,[rel PW_MF078_F117] ; xmm4=z3L - pmaddwd xmm1,[rel PW_MF078_F117] ; xmm1=z3H - pmaddwd xmm2,[rel PW_F117_F078] ; xmm2=z4L - pmaddwd xmm6,[rel PW_F117_F078] ; xmm6=z4H - - movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L - movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H - - ; (Original) - ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; - ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; - ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; - ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; - ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; - ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; - ; - ; (This implementation) - ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; - ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; - ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); - ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); - ; data7 = tmp4 + z3; data5 = tmp5 + z4; - ; data3 = tmp6 + z3; data1 = tmp7 + z4; - - movdqa xmm4,xmm0 - movdqa xmm1,xmm0 - punpcklwd xmm4,xmm5 - punpckhwd xmm1,xmm5 - movdqa xmm0,xmm4 - movdqa xmm5,xmm1 - pmaddwd xmm4,[rel PW_MF060_MF089] ; xmm4=tmp4L - pmaddwd xmm1,[rel PW_MF060_MF089] ; xmm1=tmp4H - pmaddwd xmm0,[rel PW_MF089_F060] ; xmm0=tmp7L - pmaddwd xmm5,[rel PW_MF089_F060] ; xmm5=tmp7H - - paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L - paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H - paddd xmm0,xmm2 ; xmm0=data1L - paddd xmm5,xmm6 ; xmm5=data1H - - paddd xmm4,[rel PD_DESCALE_P2] - paddd xmm1,[rel PD_DESCALE_P2] - psrad xmm4,DESCALE_P2 - psrad xmm1,DESCALE_P2 - paddd xmm0,[rel PD_DESCALE_P2] - paddd xmm5,[rel PD_DESCALE_P2] - psrad xmm0,DESCALE_P2 - psrad xmm5,DESCALE_P2 - - packssdw xmm4,xmm1 ; xmm4=data7 - packssdw 
xmm0,xmm5 ; xmm0=data1 - - movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm4 - movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm0 - - movdqa xmm1,xmm3 - movdqa xmm5,xmm3 - punpcklwd xmm1,xmm7 - punpckhwd xmm5,xmm7 - movdqa xmm3,xmm1 - movdqa xmm7,xmm5 - pmaddwd xmm1,[rel PW_MF050_MF256] ; xmm1=tmp5L - pmaddwd xmm5,[rel PW_MF050_MF256] ; xmm5=tmp5H - pmaddwd xmm3,[rel PW_MF256_F050] ; xmm3=tmp6L - pmaddwd xmm7,[rel PW_MF256_F050] ; xmm7=tmp6H - - paddd xmm1,xmm2 ; xmm1=data5L - paddd xmm5,xmm6 ; xmm5=data5H - paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L - paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H - - paddd xmm1,[rel PD_DESCALE_P2] - paddd xmm5,[rel PD_DESCALE_P2] - psrad xmm1,DESCALE_P2 - psrad xmm5,DESCALE_P2 - paddd xmm3,[rel PD_DESCALE_P2] - paddd xmm7,[rel PD_DESCALE_P2] - psrad xmm3,DESCALE_P2 - psrad xmm7,DESCALE_P2 - - packssdw xmm1,xmm5 ; xmm1=data5 - packssdw xmm3,xmm7 ; xmm3=data3 - - movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1 - movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3 - - uncollect_args - mov rsp,rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp - pop rbp - ret + push rbp + mov rax,rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args + + ; ---- Pass 1: process rows. 
+ + mov rdx, r10 ; (DCTELEM *) + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)] + + ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) + ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) + + movdqa xmm4,xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13) + punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17) + movdqa xmm5,xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37) + + movdqa xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)] + + ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62) + ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) + + movdqa xmm2,xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53) + punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57) + movdqa xmm5,xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73) + punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77) + + movdqa xmm7,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71) + punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73) + movdqa xmm3,xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 
25 35 26 36 27 37) + movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73) + movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75) + + movdqa xmm7,xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31) + punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33) + movdqa xmm2,xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35) + punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa xmm1,xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 + punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 + movdqa xmm5,xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 + punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 + + movdqa xmm6,xmm1 + movdqa xmm3,xmm0 + psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6 + psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7 + paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1 + paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0 + + movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75) + movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 + + movdqa xmm1,xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 + punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 + movdqa xmm0,xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 + punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 + + movdqa xmm2,xmm1 + movdqa xmm5,xmm7 + paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3 + paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2 + psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4 + psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4,xmm3 + movdqa xmm0,xmm6 + paddw xmm3,xmm1 ; xmm3=tmp10 + paddw xmm6,xmm7 ; xmm6=tmp11 + psubw xmm4,xmm1 ; xmm4=tmp13 + psubw xmm0,xmm7 ; 
xmm0=tmp12 + + movdqa xmm1,xmm3 + paddw xmm3,xmm6 ; xmm3=tmp10+tmp11 + psubw xmm1,xmm6 ; xmm1=tmp10-tmp11 + + psllw xmm3,PASS1_BITS ; xmm3=data0 + psllw xmm1,PASS1_BITS ; xmm1=data4 + + movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0 + movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4 + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + movdqa xmm7,xmm4 ; xmm4=tmp13 + movdqa xmm6,xmm4 + punpcklwd xmm7,xmm0 ; xmm0=tmp12 + punpckhwd xmm6,xmm0 + movdqa xmm4,xmm7 + movdqa xmm0,xmm6 + pmaddwd xmm7,[rel PW_F130_F054] ; xmm7=data2L + pmaddwd xmm6,[rel PW_F130_F054] ; xmm6=data2H + pmaddwd xmm4,[rel PW_F054_MF130] ; xmm4=data6L + pmaddwd xmm0,[rel PW_F054_MF130] ; xmm0=data6H + + paddd xmm7,[rel PD_DESCALE_P1] + paddd xmm6,[rel PD_DESCALE_P1] + psrad xmm7,DESCALE_P1 + psrad xmm6,DESCALE_P1 + paddd xmm4,[rel PD_DESCALE_P1] + paddd xmm0,[rel PD_DESCALE_P1] + psrad xmm4,DESCALE_P1 + psrad xmm0,DESCALE_P1 + + packssdw xmm7,xmm6 ; xmm7=data2 + packssdw xmm4,xmm0 ; xmm4=data6 + + movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2 + movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6 + + ; -- Odd part + + movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6 + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7 + + movdqa xmm6,xmm2 ; xmm2=tmp4 + movdqa xmm0,xmm5 ; xmm5=tmp5 + paddw xmm6,xmm3 ; xmm6=z3 + paddw xmm0,xmm1 ; xmm0=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm7,xmm6 + movdqa xmm4,xmm6 + punpcklwd xmm7,xmm0 + punpckhwd xmm4,xmm0 + movdqa xmm6,xmm7 + movdqa xmm0,xmm4 + pmaddwd xmm7,[rel PW_MF078_F117] ; xmm7=z3L + pmaddwd 
xmm4,[rel PW_MF078_F117] ; xmm4=z3H + pmaddwd xmm6,[rel PW_F117_F078] ; xmm6=z4L + pmaddwd xmm0,[rel PW_F117_F078] ; xmm0=z4H + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L + movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + movdqa xmm7,xmm2 + movdqa xmm4,xmm2 + punpcklwd xmm7,xmm1 + punpckhwd xmm4,xmm1 + movdqa xmm2,xmm7 + movdqa xmm1,xmm4 + pmaddwd xmm7,[rel PW_MF060_MF089] ; xmm7=tmp4L + pmaddwd xmm4,[rel PW_MF060_MF089] ; xmm4=tmp4H + pmaddwd xmm2,[rel PW_MF089_F060] ; xmm2=tmp7L + pmaddwd xmm1,[rel PW_MF089_F060] ; xmm1=tmp7H + + paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L + paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H + paddd xmm2,xmm6 ; xmm2=data1L + paddd xmm1,xmm0 ; xmm1=data1H + + paddd xmm7,[rel PD_DESCALE_P1] + paddd xmm4,[rel PD_DESCALE_P1] + psrad xmm7,DESCALE_P1 + psrad xmm4,DESCALE_P1 + paddd xmm2,[rel PD_DESCALE_P1] + paddd xmm1,[rel PD_DESCALE_P1] + psrad xmm2,DESCALE_P1 + psrad xmm1,DESCALE_P1 + + packssdw xmm7,xmm4 ; xmm7=data7 + packssdw xmm2,xmm1 ; xmm2=data1 + + movdqa xmm4,xmm5 + movdqa xmm1,xmm5 + punpcklwd xmm4,xmm3 + punpckhwd xmm1,xmm3 + movdqa xmm5,xmm4 + movdqa xmm3,xmm1 + pmaddwd xmm4,[rel PW_MF050_MF256] ; xmm4=tmp5L + pmaddwd xmm1,[rel PW_MF050_MF256] ; xmm1=tmp5H + pmaddwd xmm5,[rel PW_MF256_F050] ; xmm5=tmp6L + pmaddwd xmm3,[rel 
PW_MF256_F050] ; xmm3=tmp6H + + paddd xmm4,xmm6 ; xmm4=data5L + paddd xmm1,xmm0 ; xmm1=data5H + paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L + paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H + + paddd xmm4,[rel PD_DESCALE_P1] + paddd xmm1,[rel PD_DESCALE_P1] + psrad xmm4,DESCALE_P1 + psrad xmm1,DESCALE_P1 + paddd xmm5,[rel PD_DESCALE_P1] + paddd xmm3,[rel PD_DESCALE_P1] + psrad xmm5,DESCALE_P1 + psrad xmm3,DESCALE_P1 + + packssdw xmm4,xmm1 ; xmm4=data5 + packssdw xmm5,xmm3 ; xmm5=data3 + + ; ---- Pass 2: process columns. + + movdqa xmm6, XMMWORD [wk(2)] ; xmm6=col0 + movdqa xmm0, XMMWORD [wk(4)] ; xmm0=col2 + + ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72) + ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73) + + movdqa xmm1,xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6,xmm2 ; xmm6=(00 01 10 11 20 21 30 31) + punpckhwd xmm1,xmm2 ; xmm1=(40 41 50 51 60 61 70 71) + movdqa xmm3,xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0,xmm5 ; xmm0=(02 03 12 13 22 23 32 33) + punpckhwd xmm3,xmm5 ; xmm3=(42 43 52 53 62 63 72 73) + + movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4 + movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6 + + ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76) + ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33) + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73) + + movdqa xmm0,xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2,xmm4 ; xmm2=(04 05 14 15 24 25 34 35) + punpckhwd xmm0,xmm4 ; xmm0=(44 45 54 55 64 65 74 75) + movdqa xmm3,xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5,xmm7 ; xmm5=(06 07 16 17 26 27 36 37) + punpckhwd xmm3,xmm7 ; xmm3=(46 47 56 57 66 67 76 77) + + movdqa xmm4,xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2,xmm5 ; xmm2=(04 05 06 07 14 15 16 17) + punpckhdq xmm4,xmm5 ; xmm4=(24 25 26 27 34 35 36 37) + movdqa xmm7,xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0,xmm3 ; 
xmm0=(44 45 46 47 54 55 56 57) + punpckhdq xmm7,xmm3 ; xmm7=(64 65 66 67 74 75 76 77) + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33) + movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73) + movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37) + movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57) + + movdqa xmm4,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm5 ; xmm6=(00 01 02 03 10 11 12 13) + punpckhdq xmm4,xmm5 ; xmm4=(20 21 22 23 30 31 32 33) + movdqa xmm0,xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1,xmm3 ; xmm1=(40 41 42 43 50 51 52 53) + punpckhdq xmm0,xmm3 ; xmm0=(60 61 62 63 70 71 72 73) + + movdqa xmm5,xmm6 ; transpose coefficients(phase 3) + punpcklqdq xmm6,xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0 + punpckhqdq xmm5,xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1 + movdqa xmm3,xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0,xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6 + punpckhqdq xmm3,xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7 + + movdqa xmm2,xmm5 + movdqa xmm7,xmm6 + psubw xmm5,xmm0 ; xmm5=data1-data6=tmp6 + psubw xmm6,xmm3 ; xmm6=data0-data7=tmp7 + paddw xmm2,xmm0 ; xmm2=data1+data6=tmp1 + paddw xmm7,xmm3 ; xmm7=data0+data7=tmp0 + + movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37) + movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57) + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 + + movdqa xmm5,xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4,xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2 + punpckhqdq xmm5,xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3 + movdqa xmm6,xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1,xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4 + punpckhqdq xmm6,xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5 + + movdqa xmm0,xmm5 + movdqa xmm3,xmm4 + paddw xmm5,xmm1 ; xmm5=data3+data4=tmp3 + paddw xmm4,xmm6 ; xmm4=data2+data5=tmp2 + psubw xmm0,xmm1 ; xmm0=data3-data4=tmp4 + psubw xmm3,xmm6 ; 
xmm3=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm1,xmm7 + movdqa xmm6,xmm2 + paddw xmm7,xmm5 ; xmm7=tmp10 + paddw xmm2,xmm4 ; xmm2=tmp11 + psubw xmm1,xmm5 ; xmm1=tmp13 + psubw xmm6,xmm4 ; xmm6=tmp12 + + movdqa xmm5,xmm7 + paddw xmm7,xmm2 ; xmm7=tmp10+tmp11 + psubw xmm5,xmm2 ; xmm5=tmp10-tmp11 + + paddw xmm7,[rel PW_DESCALE_P2X] + paddw xmm5,[rel PW_DESCALE_P2X] + psraw xmm7,PASS1_BITS ; xmm7=data0 + psraw xmm5,PASS1_BITS ; xmm5=data4 + + movdqa XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm7 + movdqa XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm5 + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + movdqa xmm4,xmm1 ; xmm1=tmp13 + movdqa xmm2,xmm1 + punpcklwd xmm4,xmm6 ; xmm6=tmp12 + punpckhwd xmm2,xmm6 + movdqa xmm1,xmm4 + movdqa xmm6,xmm2 + pmaddwd xmm4,[rel PW_F130_F054] ; xmm4=data2L + pmaddwd xmm2,[rel PW_F130_F054] ; xmm2=data2H + pmaddwd xmm1,[rel PW_F054_MF130] ; xmm1=data6L + pmaddwd xmm6,[rel PW_F054_MF130] ; xmm6=data6H + + paddd xmm4,[rel PD_DESCALE_P2] + paddd xmm2,[rel PD_DESCALE_P2] + psrad xmm4,DESCALE_P2 + psrad xmm2,DESCALE_P2 + paddd xmm1,[rel PD_DESCALE_P2] + paddd xmm6,[rel PD_DESCALE_P2] + psrad xmm1,DESCALE_P2 + psrad xmm6,DESCALE_P2 + + packssdw xmm4,xmm2 ; xmm4=data2 + packssdw xmm1,xmm6 ; xmm1=data6 + + movdqa XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm1 + + ; -- Odd part + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 + + movdqa xmm2,xmm0 ; xmm0=tmp4 + movdqa xmm6,xmm3 ; xmm3=tmp5 + paddw xmm2,xmm7 ; xmm2=z3 + paddw xmm6,xmm5 ; xmm6=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This 
implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm4,xmm2 + movdqa xmm1,xmm2 + punpcklwd xmm4,xmm6 + punpckhwd xmm1,xmm6 + movdqa xmm2,xmm4 + movdqa xmm6,xmm1 + pmaddwd xmm4,[rel PW_MF078_F117] ; xmm4=z3L + pmaddwd xmm1,[rel PW_MF078_F117] ; xmm1=z3H + pmaddwd xmm2,[rel PW_F117_F078] ; xmm2=z4L + pmaddwd xmm6,[rel PW_F117_F078] ; xmm6=z4H + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L + movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + movdqa xmm4,xmm0 + movdqa xmm1,xmm0 + punpcklwd xmm4,xmm5 + punpckhwd xmm1,xmm5 + movdqa xmm0,xmm4 + movdqa xmm5,xmm1 + pmaddwd xmm4,[rel PW_MF060_MF089] ; xmm4=tmp4L + pmaddwd xmm1,[rel PW_MF060_MF089] ; xmm1=tmp4H + pmaddwd xmm0,[rel PW_MF089_F060] ; xmm0=tmp7L + pmaddwd xmm5,[rel PW_MF089_F060] ; xmm5=tmp7H + + paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L + paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H + paddd xmm0,xmm2 ; xmm0=data1L + paddd xmm5,xmm6 ; xmm5=data1H + + paddd xmm4,[rel PD_DESCALE_P2] + paddd xmm1,[rel PD_DESCALE_P2] + psrad xmm4,DESCALE_P2 + psrad xmm1,DESCALE_P2 + paddd xmm0,[rel PD_DESCALE_P2] + paddd xmm5,[rel PD_DESCALE_P2] + psrad xmm0,DESCALE_P2 + psrad xmm5,DESCALE_P2 + + packssdw xmm4,xmm1 ; xmm4=data7 + packssdw 
xmm0,xmm5 ; xmm0=data1 + + movdqa XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm0 + + movdqa xmm1,xmm3 + movdqa xmm5,xmm3 + punpcklwd xmm1,xmm7 + punpckhwd xmm5,xmm7 + movdqa xmm3,xmm1 + movdqa xmm7,xmm5 + pmaddwd xmm1,[rel PW_MF050_MF256] ; xmm1=tmp5L + pmaddwd xmm5,[rel PW_MF050_MF256] ; xmm5=tmp5H + pmaddwd xmm3,[rel PW_MF256_F050] ; xmm3=tmp6L + pmaddwd xmm7,[rel PW_MF256_F050] ; xmm7=tmp6H + + paddd xmm1,xmm2 ; xmm1=data5L + paddd xmm5,xmm6 ; xmm5=data5H + paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L + paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H + + paddd xmm1,[rel PD_DESCALE_P2] + paddd xmm5,[rel PD_DESCALE_P2] + psrad xmm1,DESCALE_P2 + psrad xmm5,DESCALE_P2 + paddd xmm3,[rel PD_DESCALE_P2] + paddd xmm7,[rel PD_DESCALE_P2] + psrad xmm3,DESCALE_P2 + psrad xmm7,DESCALE_P2 + + packssdw xmm1,xmm5 ; xmm1=data5 + packssdw xmm3,xmm7 ; xmm3=data3 + + movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3 + + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jfss2int.asm b/simd/jfss2int.asm index 5e3f2aaa9..d1bcb2ed2 100644 --- a/simd/jfss2int.asm +++ b/simd/jfss2int.asm @@ -26,67 +26,67 @@ ; -------------------------------------------------------------------------- -%define CONST_BITS 13 -%define PASS1_BITS 2 +%define CONST_BITS 13 +%define PASS1_BITS 2 -%define DESCALE_P1 (CONST_BITS-PASS1_BITS) -%define DESCALE_P2 (CONST_BITS+PASS1_BITS) +%define DESCALE_P1 (CONST_BITS-PASS1_BITS) +%define DESCALE_P2 (CONST_BITS+PASS1_BITS) %if CONST_BITS == 13 -F_0_298 equ 2446 ; FIX(0.298631336) -F_0_390 equ 3196 ; FIX(0.390180644) -F_0_541 equ 4433 ; FIX(0.541196100) -F_0_765 equ 6270 ; FIX(0.765366865) -F_0_899 equ 7373 ; FIX(0.899976223) -F_1_175 equ 9633 ; FIX(1.175875602) -F_1_501 equ 12299 ; FIX(1.501321110) -F_1_847 equ 15137 ; FIX(1.847759065) -F_1_961 equ 16069 ; FIX(1.961570560) -F_2_053 equ 16819 ; FIX(2.053119869) -F_2_562 equ 20995 ; FIX(2.562915447) -F_3_072 equ 25172 ; FIX(3.072711026) +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) %else ; NASM cannot do compile-time arithmetic on floating-point constants. 
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) -F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) -F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) -F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) -F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) -F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) -F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) -F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) -F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) -F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) -F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) -F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) -F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) +F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) %endif ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_fdct_islow_sse2) + alignz 16 + global EXTN(jconst_fdct_islow_sse2) EXTN(jconst_fdct_islow_sse2): -PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541 -PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847) 
-PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175 -PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390) -PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899 -PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899) -PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562 -PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562) -PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) -PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) -PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1) +PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541 +PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847) +PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175 +PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390) +PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899 +PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899) +PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562 +PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562) +PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) +PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) +PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1) - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 ; ; Perform the forward DCT on one block of samples. 
; @@ -94,541 +94,541 @@ PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1) ; jsimd_fdct_islow_sse2 (DCTELEM * data) ; -%define data(b) (b)+8 ; DCTELEM * data +%define data(b) (b)+8 ; DCTELEM * data -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 6 +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 6 - align 16 - global EXTN(jsimd_fdct_islow_sse2) + align 16 + global EXTN(jsimd_fdct_islow_sse2) EXTN(jsimd_fdct_islow_sse2): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic ebx -; push ecx ; unused -; push edx ; need not be preserved -; push esi ; unused -; push edi ; unused - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process rows. - - mov edx, POINTER [data(eax)] ; (DCTELEM *) - - movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)] - movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)] - movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)] - movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)] - - ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) - ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) - - movdqa xmm4,xmm0 ; transpose coefficients(phase 1) - punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13) - punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17) - movdqa xmm5,xmm2 ; transpose coefficients(phase 1) - punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33) - punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37) - - movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)] - movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)] - movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)] - movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)] - - ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62) - ; xmm7=( 5 13 21 29 37 45 53 61), 
xmm3=( 7 15 23 31 39 47 55 63) - - movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) - movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) - - movdqa xmm2,xmm6 ; transpose coefficients(phase 1) - punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53) - punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57) - movdqa xmm5,xmm1 ; transpose coefficients(phase 1) - punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73) - punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77) - - movdqa xmm7,xmm6 ; transpose coefficients(phase 2) - punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71) - punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73) - movdqa xmm3,xmm2 ; transpose coefficients(phase 2) - punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75) - punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77) - - movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) - movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37) - movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73) - movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75) - - movdqa xmm7,xmm0 ; transpose coefficients(phase 2) - punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31) - punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33) - movdqa xmm2,xmm4 ; transpose coefficients(phase 2) - punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35) - punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37) - - movdqa xmm1,xmm0 ; transpose coefficients(phase 3) - punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 - punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 - movdqa xmm5,xmm2 ; transpose coefficients(phase 3) - punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 - punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 - - movdqa xmm6,xmm1 - movdqa xmm3,xmm0 - psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6 - psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7 - paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1 - paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0 - - movdqa xmm2, XMMWORD 
[wk(2)] ; xmm2=(42 52 62 72 43 53 63 73) - movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75) - movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 - movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 - - movdqa xmm1,xmm7 ; transpose coefficients(phase 3) - punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 - punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 - movdqa xmm0,xmm4 ; transpose coefficients(phase 3) - punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 - punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 - - movdqa xmm2,xmm1 - movdqa xmm5,xmm7 - paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3 - paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2 - psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4 - psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5 - - ; -- Even part - - movdqa xmm4,xmm3 - movdqa xmm0,xmm6 - paddw xmm3,xmm1 ; xmm3=tmp10 - paddw xmm6,xmm7 ; xmm6=tmp11 - psubw xmm4,xmm1 ; xmm4=tmp13 - psubw xmm0,xmm7 ; xmm0=tmp12 - - movdqa xmm1,xmm3 - paddw xmm3,xmm6 ; xmm3=tmp10+tmp11 - psubw xmm1,xmm6 ; xmm1=tmp10-tmp11 - - psllw xmm3,PASS1_BITS ; xmm3=data0 - psllw xmm1,PASS1_BITS ; xmm1=data4 - - movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0 - movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4 - - ; (Original) - ; z1 = (tmp12 + tmp13) * 0.541196100; - ; data2 = z1 + tmp13 * 0.765366865; - ; data6 = z1 + tmp12 * -1.847759065; - ; - ; (This implementation) - ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; - ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); - - movdqa xmm7,xmm4 ; xmm4=tmp13 - movdqa xmm6,xmm4 - punpcklwd xmm7,xmm0 ; xmm0=tmp12 - punpckhwd xmm6,xmm0 - movdqa xmm4,xmm7 - movdqa xmm0,xmm6 - pmaddwd xmm7,[GOTOFF(ebx,PW_F130_F054)] ; xmm7=data2L - pmaddwd xmm6,[GOTOFF(ebx,PW_F130_F054)] ; xmm6=data2H - pmaddwd xmm4,[GOTOFF(ebx,PW_F054_MF130)] ; xmm4=data6L - pmaddwd xmm0,[GOTOFF(ebx,PW_F054_MF130)] ; xmm0=data6H - - paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P1)] - paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P1)] - psrad xmm7,DESCALE_P1 - psrad 
xmm6,DESCALE_P1 - paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)] - paddd xmm0,[GOTOFF(ebx,PD_DESCALE_P1)] - psrad xmm4,DESCALE_P1 - psrad xmm0,DESCALE_P1 - - packssdw xmm7,xmm6 ; xmm7=data2 - packssdw xmm4,xmm0 ; xmm4=data6 - - movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2 - movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6 - - ; -- Odd part - - movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6 - movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7 - - movdqa xmm6,xmm2 ; xmm2=tmp4 - movdqa xmm0,xmm5 ; xmm5=tmp5 - paddw xmm6,xmm3 ; xmm6=z3 - paddw xmm0,xmm1 ; xmm0=z4 - - ; (Original) - ; z5 = (z3 + z4) * 1.175875602; - ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; - ; z3 += z5; z4 += z5; - ; - ; (This implementation) - ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; - ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); - - movdqa xmm7,xmm6 - movdqa xmm4,xmm6 - punpcklwd xmm7,xmm0 - punpckhwd xmm4,xmm0 - movdqa xmm6,xmm7 - movdqa xmm0,xmm4 - pmaddwd xmm7,[GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3L - pmaddwd xmm4,[GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3H - pmaddwd xmm6,[GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4L - pmaddwd xmm0,[GOTOFF(ebx,PW_F117_F078)] ; xmm0=z4H - - movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L - movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H - - ; (Original) - ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; - ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; - ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; - ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; - ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; - ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; - ; - ; (This implementation) - ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; - ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; - ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); - ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); - ; data7 = tmp4 + z3; data5 = tmp5 + z4; - ; data3 = tmp6 + z3; data1 = tmp7 + z4; - - movdqa xmm7,xmm2 - movdqa xmm4,xmm2 - 
punpcklwd xmm7,xmm1 - punpckhwd xmm4,xmm1 - movdqa xmm2,xmm7 - movdqa xmm1,xmm4 - pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp4L - pmaddwd xmm4,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4H - pmaddwd xmm2,[GOTOFF(ebx,PW_MF089_F060)] ; xmm2=tmp7L - pmaddwd xmm1,[GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp7H - - paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L - paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H - paddd xmm2,xmm6 ; xmm2=data1L - paddd xmm1,xmm0 ; xmm1=data1H - - paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P1)] - paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)] - psrad xmm7,DESCALE_P1 - psrad xmm4,DESCALE_P1 - paddd xmm2,[GOTOFF(ebx,PD_DESCALE_P1)] - paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P1)] - psrad xmm2,DESCALE_P1 - psrad xmm1,DESCALE_P1 - - packssdw xmm7,xmm4 ; xmm7=data7 - packssdw xmm2,xmm1 ; xmm2=data1 - - movdqa xmm4,xmm5 - movdqa xmm1,xmm5 - punpcklwd xmm4,xmm3 - punpckhwd xmm1,xmm3 - movdqa xmm5,xmm4 - movdqa xmm3,xmm1 - pmaddwd xmm4,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm4=tmp5L - pmaddwd xmm1,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5H - pmaddwd xmm5,[GOTOFF(ebx,PW_MF256_F050)] ; xmm5=tmp6L - pmaddwd xmm3,[GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6H - - paddd xmm4,xmm6 ; xmm4=data5L - paddd xmm1,xmm0 ; xmm1=data5H - paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L - paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H - - paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)] - paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P1)] - psrad xmm4,DESCALE_P1 - psrad xmm1,DESCALE_P1 - paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P1)] - paddd xmm3,[GOTOFF(ebx,PD_DESCALE_P1)] - psrad xmm5,DESCALE_P1 - psrad xmm3,DESCALE_P1 - - packssdw xmm4,xmm1 ; xmm4=data5 - packssdw xmm5,xmm3 ; xmm5=data3 - - ; ---- Pass 2: process columns. 
- -; mov edx, POINTER [data(eax)] ; (DCTELEM *) - - movdqa xmm6, XMMWORD [wk(2)] ; xmm6=col0 - movdqa xmm0, XMMWORD [wk(4)] ; xmm0=col2 - - ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72) - ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73) - - movdqa xmm1,xmm6 ; transpose coefficients(phase 1) - punpcklwd xmm6,xmm2 ; xmm6=(00 01 10 11 20 21 30 31) - punpckhwd xmm1,xmm2 ; xmm1=(40 41 50 51 60 61 70 71) - movdqa xmm3,xmm0 ; transpose coefficients(phase 1) - punpcklwd xmm0,xmm5 ; xmm0=(02 03 12 13 22 23 32 33) - punpckhwd xmm3,xmm5 ; xmm3=(42 43 52 53 62 63 72 73) - - movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4 - movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6 - - ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76) - ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77) - - movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33) - movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73) - - movdqa xmm0,xmm2 ; transpose coefficients(phase 1) - punpcklwd xmm2,xmm4 ; xmm2=(04 05 14 15 24 25 34 35) - punpckhwd xmm0,xmm4 ; xmm0=(44 45 54 55 64 65 74 75) - movdqa xmm3,xmm5 ; transpose coefficients(phase 1) - punpcklwd xmm5,xmm7 ; xmm5=(06 07 16 17 26 27 36 37) - punpckhwd xmm3,xmm7 ; xmm3=(46 47 56 57 66 67 76 77) - - movdqa xmm4,xmm2 ; transpose coefficients(phase 2) - punpckldq xmm2,xmm5 ; xmm2=(04 05 06 07 14 15 16 17) - punpckhdq xmm4,xmm5 ; xmm4=(24 25 26 27 34 35 36 37) - movdqa xmm7,xmm0 ; transpose coefficients(phase 2) - punpckldq xmm0,xmm3 ; xmm0=(44 45 46 47 54 55 56 57) - punpckhdq xmm7,xmm3 ; xmm7=(64 65 66 67 74 75 76 77) - - movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33) - movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73) - movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37) - movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57) - - movdqa xmm4,xmm6 ; transpose coefficients(phase 2) - punpckldq xmm6,xmm5 ; xmm6=(00 01 02 03 10 11 12 13) - punpckhdq 
xmm4,xmm5 ; xmm4=(20 21 22 23 30 31 32 33) - movdqa xmm0,xmm1 ; transpose coefficients(phase 2) - punpckldq xmm1,xmm3 ; xmm1=(40 41 42 43 50 51 52 53) - punpckhdq xmm0,xmm3 ; xmm0=(60 61 62 63 70 71 72 73) - - movdqa xmm5,xmm6 ; transpose coefficients(phase 3) - punpcklqdq xmm6,xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0 - punpckhqdq xmm5,xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1 - movdqa xmm3,xmm0 ; transpose coefficients(phase 3) - punpcklqdq xmm0,xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6 - punpckhqdq xmm3,xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7 - - movdqa xmm2,xmm5 - movdqa xmm7,xmm6 - psubw xmm5,xmm0 ; xmm5=data1-data6=tmp6 - psubw xmm6,xmm3 ; xmm6=data0-data7=tmp7 - paddw xmm2,xmm0 ; xmm2=data1+data6=tmp1 - paddw xmm7,xmm3 ; xmm7=data0+data7=tmp0 - - movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37) - movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57) - movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6 - movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 - - movdqa xmm5,xmm4 ; transpose coefficients(phase 3) - punpcklqdq xmm4,xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2 - punpckhqdq xmm5,xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3 - movdqa xmm6,xmm1 ; transpose coefficients(phase 3) - punpcklqdq xmm1,xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4 - punpckhqdq xmm6,xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5 - - movdqa xmm0,xmm5 - movdqa xmm3,xmm4 - paddw xmm5,xmm1 ; xmm5=data3+data4=tmp3 - paddw xmm4,xmm6 ; xmm4=data2+data5=tmp2 - psubw xmm0,xmm1 ; xmm0=data3-data4=tmp4 - psubw xmm3,xmm6 ; xmm3=data2-data5=tmp5 - - ; -- Even part - - movdqa xmm1,xmm7 - movdqa xmm6,xmm2 - paddw xmm7,xmm5 ; xmm7=tmp10 - paddw xmm2,xmm4 ; xmm2=tmp11 - psubw xmm1,xmm5 ; xmm1=tmp13 - psubw xmm6,xmm4 ; xmm6=tmp12 - - movdqa xmm5,xmm7 - paddw xmm7,xmm2 ; xmm7=tmp10+tmp11 - psubw xmm5,xmm2 ; xmm5=tmp10-tmp11 - - paddw xmm7,[GOTOFF(ebx,PW_DESCALE_P2X)] - paddw xmm5,[GOTOFF(ebx,PW_DESCALE_P2X)] - psraw xmm7,PASS1_BITS ; xmm7=data0 - psraw xmm5,PASS1_BITS ; xmm5=data4 - - 
movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm7 - movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm5 - - ; (Original) - ; z1 = (tmp12 + tmp13) * 0.541196100; - ; data2 = z1 + tmp13 * 0.765366865; - ; data6 = z1 + tmp12 * -1.847759065; - ; - ; (This implementation) - ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; - ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); - - movdqa xmm4,xmm1 ; xmm1=tmp13 - movdqa xmm2,xmm1 - punpcklwd xmm4,xmm6 ; xmm6=tmp12 - punpckhwd xmm2,xmm6 - movdqa xmm1,xmm4 - movdqa xmm6,xmm2 - pmaddwd xmm4,[GOTOFF(ebx,PW_F130_F054)] ; xmm4=data2L - pmaddwd xmm2,[GOTOFF(ebx,PW_F130_F054)] ; xmm2=data2H - pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=data6L - pmaddwd xmm6,[GOTOFF(ebx,PW_F054_MF130)] ; xmm6=data6H - - paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P2)] - paddd xmm2,[GOTOFF(ebx,PD_DESCALE_P2)] - psrad xmm4,DESCALE_P2 - psrad xmm2,DESCALE_P2 - paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] - paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P2)] - psrad xmm1,DESCALE_P2 - psrad xmm6,DESCALE_P2 - - packssdw xmm4,xmm2 ; xmm4=data2 - packssdw xmm1,xmm6 ; xmm1=data6 - - movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm4 - movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm1 - - ; -- Odd part - - movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6 - movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 - - movdqa xmm2,xmm0 ; xmm0=tmp4 - movdqa xmm6,xmm3 ; xmm3=tmp5 - paddw xmm2,xmm7 ; xmm2=z3 - paddw xmm6,xmm5 ; xmm6=z4 - - ; (Original) - ; z5 = (z3 + z4) * 1.175875602; - ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; - ; z3 += z5; z4 += z5; - ; - ; (This implementation) - ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; - ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); - - movdqa xmm4,xmm2 - movdqa xmm1,xmm2 - punpcklwd xmm4,xmm6 - punpckhwd xmm1,xmm6 - movdqa xmm2,xmm4 - movdqa xmm6,xmm1 - pmaddwd xmm4,[GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3L - pmaddwd xmm1,[GOTOFF(ebx,PW_MF078_F117)] ; xmm1=z3H - pmaddwd 
xmm2,[GOTOFF(ebx,PW_F117_F078)] ; xmm2=z4L - pmaddwd xmm6,[GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4H - - movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L - movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H - - ; (Original) - ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; - ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; - ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; - ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; - ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; - ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; - ; - ; (This implementation) - ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; - ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; - ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); - ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); - ; data7 = tmp4 + z3; data5 = tmp5 + z4; - ; data3 = tmp6 + z3; data1 = tmp7 + z4; - - movdqa xmm4,xmm0 - movdqa xmm1,xmm0 - punpcklwd xmm4,xmm5 - punpckhwd xmm1,xmm5 - movdqa xmm0,xmm4 - movdqa xmm5,xmm1 - pmaddwd xmm4,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4L - pmaddwd xmm1,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm1=tmp4H - pmaddwd xmm0,[GOTOFF(ebx,PW_MF089_F060)] ; xmm0=tmp7L - pmaddwd xmm5,[GOTOFF(ebx,PW_MF089_F060)] ; xmm5=tmp7H - - paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L - paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H - paddd xmm0,xmm2 ; xmm0=data1L - paddd xmm5,xmm6 ; xmm5=data1H - - paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P2)] - paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] - psrad xmm4,DESCALE_P2 - psrad xmm1,DESCALE_P2 - paddd xmm0,[GOTOFF(ebx,PD_DESCALE_P2)] - paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P2)] - psrad xmm0,DESCALE_P2 - psrad xmm5,DESCALE_P2 - - packssdw xmm4,xmm1 ; xmm4=data7 - packssdw xmm0,xmm5 ; xmm0=data1 - - movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm4 - movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm0 - - movdqa xmm1,xmm3 - movdqa xmm5,xmm3 - punpcklwd xmm1,xmm7 - punpckhwd xmm5,xmm7 - movdqa xmm3,xmm1 - movdqa xmm7,xmm5 - pmaddwd 
xmm1,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5L - pmaddwd xmm5,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm5=tmp5H - pmaddwd xmm3,[GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6L - pmaddwd xmm7,[GOTOFF(ebx,PW_MF256_F050)] ; xmm7=tmp6H - - paddd xmm1,xmm2 ; xmm1=data5L - paddd xmm5,xmm6 ; xmm5=data5H - paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L - paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H - - paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] - paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P2)] - psrad xmm1,DESCALE_P2 - psrad xmm5,DESCALE_P2 - paddd xmm3,[GOTOFF(ebx,PD_DESCALE_P2)] - paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P2)] - psrad xmm3,DESCALE_P2 - psrad xmm7,DESCALE_P2 - - packssdw xmm1,xmm5 ; xmm1=data5 - packssdw xmm3,xmm7 ; xmm3=data3 - - movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm1 - movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm3 - -; pop edi ; unused -; pop esi ; unused -; pop edx ; need not be preserved -; pop ecx ; unused - poppic ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; unused +; push edx ; need not be preserved +; push esi ; unused +; push edi ; unused + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process rows. 
+ + mov edx, POINTER [data(eax)] ; (DCTELEM *) + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)] + movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)] + + ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) + ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) + + movdqa xmm4,xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13) + punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17) + movdqa xmm5,xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37) + + movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)] + + ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62) + ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) + + movdqa xmm2,xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53) + punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57) + movdqa xmm5,xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73) + punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77) + + movdqa xmm7,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71) + punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73) + movdqa xmm3,xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) + movdqa xmm5, XMMWORD [wk(1)] 
; xmm5=(24 34 25 35 26 36 27 37) + movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73) + movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75) + + movdqa xmm7,xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31) + punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33) + movdqa xmm2,xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35) + punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa xmm1,xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 + punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 + movdqa xmm5,xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 + punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 + + movdqa xmm6,xmm1 + movdqa xmm3,xmm0 + psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6 + psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7 + paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1 + paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0 + + movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75) + movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 + + movdqa xmm1,xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 + punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 + movdqa xmm0,xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 64 74)=data4 + punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 + + movdqa xmm2,xmm1 + movdqa xmm5,xmm7 + paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3 + paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2 + psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4 + psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4,xmm3 + movdqa xmm0,xmm6 + paddw xmm3,xmm1 ; xmm3=tmp10 + paddw xmm6,xmm7 ; xmm6=tmp11 + psubw xmm4,xmm1 ; xmm4=tmp13 + psubw 
xmm0,xmm7 ; xmm0=tmp12 + + movdqa xmm1,xmm3 + paddw xmm3,xmm6 ; xmm3=tmp10+tmp11 + psubw xmm1,xmm6 ; xmm1=tmp10-tmp11 + + psllw xmm3,PASS1_BITS ; xmm3=data0 + psllw xmm1,PASS1_BITS ; xmm1=data4 + + movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0 + movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4 + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + movdqa xmm7,xmm4 ; xmm4=tmp13 + movdqa xmm6,xmm4 + punpcklwd xmm7,xmm0 ; xmm0=tmp12 + punpckhwd xmm6,xmm0 + movdqa xmm4,xmm7 + movdqa xmm0,xmm6 + pmaddwd xmm7,[GOTOFF(ebx,PW_F130_F054)] ; xmm7=data2L + pmaddwd xmm6,[GOTOFF(ebx,PW_F130_F054)] ; xmm6=data2H + pmaddwd xmm4,[GOTOFF(ebx,PW_F054_MF130)] ; xmm4=data6L + pmaddwd xmm0,[GOTOFF(ebx,PW_F054_MF130)] ; xmm0=data6H + + paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm7,DESCALE_P1 + psrad xmm6,DESCALE_P1 + paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm0,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm4,DESCALE_P1 + psrad xmm0,DESCALE_P1 + + packssdw xmm7,xmm6 ; xmm7=data2 + packssdw xmm4,xmm0 ; xmm4=data6 + + movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2 + movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6 + + ; -- Odd part + + movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6 + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7 + + movdqa xmm6,xmm2 ; xmm2=tmp4 + movdqa xmm0,xmm5 ; xmm5=tmp5 + paddw xmm6,xmm3 ; xmm6=z3 + paddw xmm0,xmm1 ; xmm0=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm7,xmm6 + movdqa xmm4,xmm6 + punpcklwd xmm7,xmm0 + punpckhwd xmm4,xmm0 + movdqa xmm6,xmm7 
+ movdqa xmm0,xmm4 + pmaddwd xmm7,[GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3L + pmaddwd xmm4,[GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3H + pmaddwd xmm6,[GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4L + pmaddwd xmm0,[GOTOFF(ebx,PW_F117_F078)] ; xmm0=z4H + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L + movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + movdqa xmm7,xmm2 + movdqa xmm4,xmm2 + punpcklwd xmm7,xmm1 + punpckhwd xmm4,xmm1 + movdqa xmm2,xmm7 + movdqa xmm1,xmm4 + pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp4L + pmaddwd xmm4,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4H + pmaddwd xmm2,[GOTOFF(ebx,PW_MF089_F060)] ; xmm2=tmp7L + pmaddwd xmm1,[GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp7H + + paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L + paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H + paddd xmm2,xmm6 ; xmm2=data1L + paddd xmm1,xmm0 ; xmm1=data1H + + paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm7,DESCALE_P1 + psrad xmm4,DESCALE_P1 + paddd xmm2,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm2,DESCALE_P1 + psrad xmm1,DESCALE_P1 + + packssdw xmm7,xmm4 ; xmm7=data7 + packssdw xmm2,xmm1 ; xmm2=data1 + + movdqa xmm4,xmm5 + movdqa xmm1,xmm5 + punpcklwd xmm4,xmm3 + punpckhwd xmm1,xmm3 + movdqa xmm5,xmm4 + movdqa xmm3,xmm1 + 
pmaddwd xmm4,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm4=tmp5L + pmaddwd xmm1,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5H + pmaddwd xmm5,[GOTOFF(ebx,PW_MF256_F050)] ; xmm5=tmp6L + pmaddwd xmm3,[GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6H + + paddd xmm4,xmm6 ; xmm4=data5L + paddd xmm1,xmm0 ; xmm1=data5H + paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L + paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H + + paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm4,DESCALE_P1 + psrad xmm1,DESCALE_P1 + paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm3,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm5,DESCALE_P1 + psrad xmm3,DESCALE_P1 + + packssdw xmm4,xmm1 ; xmm4=data5 + packssdw xmm5,xmm3 ; xmm5=data3 + + ; ---- Pass 2: process columns. + +; mov edx, POINTER [data(eax)] ; (DCTELEM *) + + movdqa xmm6, XMMWORD [wk(2)] ; xmm6=col0 + movdqa xmm0, XMMWORD [wk(4)] ; xmm0=col2 + + ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72) + ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73) + + movdqa xmm1,xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6,xmm2 ; xmm6=(00 01 10 11 20 21 30 31) + punpckhwd xmm1,xmm2 ; xmm1=(40 41 50 51 60 61 70 71) + movdqa xmm3,xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0,xmm5 ; xmm0=(02 03 12 13 22 23 32 33) + punpckhwd xmm3,xmm5 ; xmm3=(42 43 52 53 62 63 72 73) + + movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4 + movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6 + + ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76) + ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33) + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73) + + movdqa xmm0,xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2,xmm4 ; xmm2=(04 05 14 15 24 25 34 35) + punpckhwd xmm0,xmm4 ; xmm0=(44 45 54 55 64 65 74 75) + movdqa xmm3,xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5,xmm7 ; xmm5=(06 07 16 17 26 27 36 37) + punpckhwd xmm3,xmm7 ; 
xmm3=(46 47 56 57 66 67 76 77) + + movdqa xmm4,xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2,xmm5 ; xmm2=(04 05 06 07 14 15 16 17) + punpckhdq xmm4,xmm5 ; xmm4=(24 25 26 27 34 35 36 37) + movdqa xmm7,xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0,xmm3 ; xmm0=(44 45 46 47 54 55 56 57) + punpckhdq xmm7,xmm3 ; xmm7=(64 65 66 67 74 75 76 77) + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33) + movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73) + movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37) + movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57) + + movdqa xmm4,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm5 ; xmm6=(00 01 02 03 10 11 12 13) + punpckhdq xmm4,xmm5 ; xmm4=(20 21 22 23 30 31 32 33) + movdqa xmm0,xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1,xmm3 ; xmm1=(40 41 42 43 50 51 52 53) + punpckhdq xmm0,xmm3 ; xmm0=(60 61 62 63 70 71 72 73) + + movdqa xmm5,xmm6 ; transpose coefficients(phase 3) + punpcklqdq xmm6,xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0 + punpckhqdq xmm5,xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1 + movdqa xmm3,xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0,xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6 + punpckhqdq xmm3,xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7 + + movdqa xmm2,xmm5 + movdqa xmm7,xmm6 + psubw xmm5,xmm0 ; xmm5=data1-data6=tmp6 + psubw xmm6,xmm3 ; xmm6=data0-data7=tmp7 + paddw xmm2,xmm0 ; xmm2=data1+data6=tmp1 + paddw xmm7,xmm3 ; xmm7=data0+data7=tmp0 + + movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37) + movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57) + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 + + movdqa xmm5,xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4,xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2 + punpckhqdq xmm5,xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3 + movdqa xmm6,xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1,xmm3 ; xmm1=(40 41 
42 43 44 45 46 47)=data4 + punpckhqdq xmm6,xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5 + + movdqa xmm0,xmm5 + movdqa xmm3,xmm4 + paddw xmm5,xmm1 ; xmm5=data3+data4=tmp3 + paddw xmm4,xmm6 ; xmm4=data2+data5=tmp2 + psubw xmm0,xmm1 ; xmm0=data3-data4=tmp4 + psubw xmm3,xmm6 ; xmm3=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm1,xmm7 + movdqa xmm6,xmm2 + paddw xmm7,xmm5 ; xmm7=tmp10 + paddw xmm2,xmm4 ; xmm2=tmp11 + psubw xmm1,xmm5 ; xmm1=tmp13 + psubw xmm6,xmm4 ; xmm6=tmp12 + + movdqa xmm5,xmm7 + paddw xmm7,xmm2 ; xmm7=tmp10+tmp11 + psubw xmm5,xmm2 ; xmm5=tmp10-tmp11 + + paddw xmm7,[GOTOFF(ebx,PW_DESCALE_P2X)] + paddw xmm5,[GOTOFF(ebx,PW_DESCALE_P2X)] + psraw xmm7,PASS1_BITS ; xmm7=data0 + psraw xmm5,PASS1_BITS ; xmm5=data4 + + movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm7 + movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm5 + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + movdqa xmm4,xmm1 ; xmm1=tmp13 + movdqa xmm2,xmm1 + punpcklwd xmm4,xmm6 ; xmm6=tmp12 + punpckhwd xmm2,xmm6 + movdqa xmm1,xmm4 + movdqa xmm6,xmm2 + pmaddwd xmm4,[GOTOFF(ebx,PW_F130_F054)] ; xmm4=data2L + pmaddwd xmm2,[GOTOFF(ebx,PW_F130_F054)] ; xmm2=data2H + pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=data6L + pmaddwd xmm6,[GOTOFF(ebx,PW_F054_MF130)] ; xmm6=data6H + + paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm2,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm4,DESCALE_P2 + psrad xmm2,DESCALE_P2 + paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm1,DESCALE_P2 + psrad xmm6,DESCALE_P2 + + packssdw xmm4,xmm2 ; xmm4=data2 + packssdw xmm1,xmm6 ; xmm1=data6 + + movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm1 + + ; -- Odd part 
+ + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 + + movdqa xmm2,xmm0 ; xmm0=tmp4 + movdqa xmm6,xmm3 ; xmm3=tmp5 + paddw xmm2,xmm7 ; xmm2=z3 + paddw xmm6,xmm5 ; xmm6=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm4,xmm2 + movdqa xmm1,xmm2 + punpcklwd xmm4,xmm6 + punpckhwd xmm1,xmm6 + movdqa xmm2,xmm4 + movdqa xmm6,xmm1 + pmaddwd xmm4,[GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3L + pmaddwd xmm1,[GOTOFF(ebx,PW_MF078_F117)] ; xmm1=z3H + pmaddwd xmm2,[GOTOFF(ebx,PW_F117_F078)] ; xmm2=z4L + pmaddwd xmm6,[GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4H + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L + movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + movdqa xmm4,xmm0 + movdqa xmm1,xmm0 + punpcklwd xmm4,xmm5 + punpckhwd xmm1,xmm5 + movdqa xmm0,xmm4 + movdqa xmm5,xmm1 + pmaddwd xmm4,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4L + pmaddwd xmm1,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm1=tmp4H + pmaddwd xmm0,[GOTOFF(ebx,PW_MF089_F060)] ; xmm0=tmp7L + pmaddwd xmm5,[GOTOFF(ebx,PW_MF089_F060)] ; xmm5=tmp7H + + paddd xmm4, 
XMMWORD [wk(0)] ; xmm4=data7L + paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H + paddd xmm0,xmm2 ; xmm0=data1L + paddd xmm5,xmm6 ; xmm5=data1H + + paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm4,DESCALE_P2 + psrad xmm1,DESCALE_P2 + paddd xmm0,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm0,DESCALE_P2 + psrad xmm5,DESCALE_P2 + + packssdw xmm4,xmm1 ; xmm4=data7 + packssdw xmm0,xmm5 ; xmm0=data1 + + movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm0 + + movdqa xmm1,xmm3 + movdqa xmm5,xmm3 + punpcklwd xmm1,xmm7 + punpckhwd xmm5,xmm7 + movdqa xmm3,xmm1 + movdqa xmm7,xmm5 + pmaddwd xmm1,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5L + pmaddwd xmm5,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm5=tmp5H + pmaddwd xmm3,[GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6L + pmaddwd xmm7,[GOTOFF(ebx,PW_MF256_F050)] ; xmm7=tmp6H + + paddd xmm1,xmm2 ; xmm1=data5L + paddd xmm5,xmm6 ; xmm5=data5H + paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L + paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H + + paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm1,DESCALE_P2 + psrad xmm5,DESCALE_P2 + paddd xmm3,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm3,DESCALE_P2 + psrad xmm7,DESCALE_P2 + + packssdw xmm1,xmm5 ; xmm1=data5 + packssdw xmm3,xmm7 ; xmm3=data3 + + movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm3 + +; pop edi ; unused +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; unused + poppic ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jfsseflt-64.asm b/simd/jfsseflt-64.asm index 07245d2dd..be2f5777b 100644 --- a/simd/jfsseflt-64.asm +++ b/simd/jfsseflt-64.asm @@ -26,32 +26,32 @@ ; -------------------------------------------------------------------------- -%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) - shufps %1,%2,0x44 +%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) + shufps %1,%2,0x44 %endmacro -%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) - shufps %1,%2,0xEE +%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) + shufps %1,%2,0xEE %endmacro ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_fdct_float_sse) + alignz 16 + global EXTN(jconst_fdct_float_sse) EXTN(jconst_fdct_float_sse): -PD_0_382 times 4 dd 0.382683432365089771728460 -PD_0_707 times 4 dd 0.707106781186547524400844 -PD_0_541 times 4 dd 0.541196100146196984399723 -PD_1_306 times 4 dd 1.306562964876376527856643 +PD_0_382 times 4 dd 0.382683432365089771728460 +PD_0_707 times 4 dd 0.707106781186547524400844 +PD_0_541 times 4 dd 0.541196100146196984399723 +PD_1_306 times 4 dd 1.306562964876376527856643 - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 64 + SECTION SEG_TEXT + BITS 64 ; ; Perform the forward DCT on one block of samples. 
; @@ -61,298 +61,298 @@ PD_1_306 times 4 dd 1.306562964876376527856643 ; r10 = FAST_FLOAT * data -%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 2 +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 - align 16 - global EXTN(jsimd_fdct_float_sse) + align 16 + global EXTN(jsimd_fdct_float_sse) EXTN(jsimd_fdct_float_sse): - push rbp - mov rax,rsp ; rax = original rbp - sub rsp, byte 4 - and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp],rax - mov rbp,rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - collect_args - - ; ---- Pass 1: process rows. - - mov rdx, r10 ; (FAST_FLOAT *) - mov rcx, DCTSIZE/4 + push rbp + mov rax,rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args + + ; ---- Pass 1: process rows. + + mov rdx, r10 ; (FAST_FLOAT *) + mov rcx, DCTSIZE/4 .rowloop: - movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)] - movaps xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)] - - ; xmm0=(20 21 22 23), xmm2=(24 25 26 27) - ; xmm1=(30 31 32 33), xmm3=(34 35 36 37) - - movaps xmm4,xmm0 ; transpose coefficients(phase 1) - unpcklps xmm0,xmm1 ; xmm0=(20 30 21 31) - unpckhps xmm4,xmm1 ; xmm4=(22 32 23 33) - movaps xmm5,xmm2 ; transpose coefficients(phase 1) - unpcklps xmm2,xmm3 ; xmm2=(24 34 25 35) - unpckhps xmm5,xmm3 ; xmm5=(26 36 27 37) - - movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)] - movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)] - - ; xmm6=(00 01 02 03), xmm1=(04 05 06 07) - ; xmm7=(10 11 12 13), xmm3=(14 15 16 17) - - movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33) - movaps XMMWORD 
[wk(1)], xmm2 ; wk(1)=(24 34 25 35) - - movaps xmm4,xmm6 ; transpose coefficients(phase 1) - unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11) - unpckhps xmm4,xmm7 ; xmm4=(02 12 03 13) - movaps xmm2,xmm1 ; transpose coefficients(phase 1) - unpcklps xmm1,xmm3 ; xmm1=(04 14 05 15) - unpckhps xmm2,xmm3 ; xmm2=(06 16 07 17) - - movaps xmm7,xmm6 ; transpose coefficients(phase 2) - unpcklps2 xmm6,xmm0 ; xmm6=(00 10 20 30)=data0 - unpckhps2 xmm7,xmm0 ; xmm7=(01 11 21 31)=data1 - movaps xmm3,xmm2 ; transpose coefficients(phase 2) - unpcklps2 xmm2,xmm5 ; xmm2=(06 16 26 36)=data6 - unpckhps2 xmm3,xmm5 ; xmm3=(07 17 27 37)=data7 - - movaps xmm0,xmm7 - movaps xmm5,xmm6 - subps xmm7,xmm2 ; xmm7=data1-data6=tmp6 - subps xmm6,xmm3 ; xmm6=data0-data7=tmp7 - addps xmm0,xmm2 ; xmm0=data1+data6=tmp1 - addps xmm5,xmm3 ; xmm5=data0+data7=tmp0 - - movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33) - movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35) - movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 - movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 - - movaps xmm7,xmm4 ; transpose coefficients(phase 2) - unpcklps2 xmm4,xmm2 ; xmm4=(02 12 22 32)=data2 - unpckhps2 xmm7,xmm2 ; xmm7=(03 13 23 33)=data3 - movaps xmm6,xmm1 ; transpose coefficients(phase 2) - unpcklps2 xmm1,xmm3 ; xmm1=(04 14 24 34)=data4 - unpckhps2 xmm6,xmm3 ; xmm6=(05 15 25 35)=data5 - - movaps xmm2,xmm7 - movaps xmm3,xmm4 - addps xmm7,xmm1 ; xmm7=data3+data4=tmp3 - addps xmm4,xmm6 ; xmm4=data2+data5=tmp2 - subps xmm2,xmm1 ; xmm2=data3-data4=tmp4 - subps xmm3,xmm6 ; xmm3=data2-data5=tmp5 - - ; -- Even part - - movaps xmm1,xmm5 - movaps xmm6,xmm0 - subps xmm5,xmm7 ; xmm5=tmp13 - subps xmm0,xmm4 ; xmm0=tmp12 - addps xmm1,xmm7 ; xmm1=tmp10 - addps xmm6,xmm4 ; xmm6=tmp11 - - addps xmm0,xmm5 - mulps xmm0,[rel PD_0_707] ; xmm0=z1 - - movaps xmm7,xmm1 - movaps xmm4,xmm5 - subps xmm1,xmm6 ; xmm1=data4 - subps xmm5,xmm0 ; xmm5=data6 - addps xmm7,xmm6 ; xmm7=data0 - addps xmm4,xmm0 ; xmm4=data2 - - movaps XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1 - 
movaps XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5 - movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7 - movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4 - - ; -- Odd part - - movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 - movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 - - addps xmm2,xmm3 ; xmm2=tmp10 - addps xmm3,xmm6 ; xmm3=tmp11 - addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7 - - mulps xmm3,[rel PD_0_707] ; xmm3=z3 - - movaps xmm1,xmm2 ; xmm1=tmp10 - subps xmm2,xmm6 - mulps xmm2,[rel PD_0_382] ; xmm2=z5 - mulps xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) - mulps xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) - addps xmm1,xmm2 ; xmm1=z2 - addps xmm6,xmm2 ; xmm6=z4 - - movaps xmm5,xmm0 - subps xmm0,xmm3 ; xmm0=z13 - addps xmm5,xmm3 ; xmm5=z11 - - movaps xmm7,xmm0 - movaps xmm4,xmm5 - subps xmm0,xmm1 ; xmm0=data3 - subps xmm5,xmm6 ; xmm5=data7 - addps xmm7,xmm1 ; xmm7=data5 - addps xmm4,xmm6 ; xmm4=data1 - - movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0 - movaps XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5 - movaps XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7 - movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4 - - add rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT - dec rcx - jnz near .rowloop - - ; ---- Pass 2: process columns. 
- - mov rdx, r10 ; (FAST_FLOAT *) - mov rcx, DCTSIZE/4 + movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)] + + ; xmm0=(20 21 22 23), xmm2=(24 25 26 27) + ; xmm1=(30 31 32 33), xmm3=(34 35 36 37) + + movaps xmm4,xmm0 ; transpose coefficients(phase 1) + unpcklps xmm0,xmm1 ; xmm0=(20 30 21 31) + unpckhps xmm4,xmm1 ; xmm4=(22 32 23 33) + movaps xmm5,xmm2 ; transpose coefficients(phase 1) + unpcklps xmm2,xmm3 ; xmm2=(24 34 25 35) + unpckhps xmm5,xmm3 ; xmm5=(26 36 27 37) + + movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)] + + ; xmm6=(00 01 02 03), xmm1=(04 05 06 07) + ; xmm7=(10 11 12 13), xmm3=(14 15 16 17) + + movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33) + movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35) + + movaps xmm4,xmm6 ; transpose coefficients(phase 1) + unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11) + unpckhps xmm4,xmm7 ; xmm4=(02 12 03 13) + movaps xmm2,xmm1 ; transpose coefficients(phase 1) + unpcklps xmm1,xmm3 ; xmm1=(04 14 05 15) + unpckhps xmm2,xmm3 ; xmm2=(06 16 07 17) + + movaps xmm7,xmm6 ; transpose coefficients(phase 2) + unpcklps2 xmm6,xmm0 ; xmm6=(00 10 20 30)=data0 + unpckhps2 xmm7,xmm0 ; xmm7=(01 11 21 31)=data1 + movaps xmm3,xmm2 ; transpose coefficients(phase 2) + unpcklps2 xmm2,xmm5 ; xmm2=(06 16 26 36)=data6 + unpckhps2 xmm3,xmm5 ; xmm3=(07 17 27 37)=data7 + + movaps xmm0,xmm7 + movaps xmm5,xmm6 + subps xmm7,xmm2 ; xmm7=data1-data6=tmp6 + subps xmm6,xmm3 ; xmm6=data0-data7=tmp7 + addps xmm0,xmm2 ; xmm0=data1+data6=tmp1 + addps xmm5,xmm3 ; xmm5=data0+data7=tmp0 + + movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33) + movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35) 
+ movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 + movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 + + movaps xmm7,xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4,xmm2 ; xmm4=(02 12 22 32)=data2 + unpckhps2 xmm7,xmm2 ; xmm7=(03 13 23 33)=data3 + movaps xmm6,xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1,xmm3 ; xmm1=(04 14 24 34)=data4 + unpckhps2 xmm6,xmm3 ; xmm6=(05 15 25 35)=data5 + + movaps xmm2,xmm7 + movaps xmm3,xmm4 + addps xmm7,xmm1 ; xmm7=data3+data4=tmp3 + addps xmm4,xmm6 ; xmm4=data2+data5=tmp2 + subps xmm2,xmm1 ; xmm2=data3-data4=tmp4 + subps xmm3,xmm6 ; xmm3=data2-data5=tmp5 + + ; -- Even part + + movaps xmm1,xmm5 + movaps xmm6,xmm0 + subps xmm5,xmm7 ; xmm5=tmp13 + subps xmm0,xmm4 ; xmm0=tmp12 + addps xmm1,xmm7 ; xmm1=tmp10 + addps xmm6,xmm4 ; xmm6=tmp11 + + addps xmm0,xmm5 + mulps xmm0,[rel PD_0_707] ; xmm0=z1 + + movaps xmm7,xmm1 + movaps xmm4,xmm5 + subps xmm1,xmm6 ; xmm1=data4 + subps xmm5,xmm0 ; xmm5=data6 + addps xmm7,xmm6 ; xmm7=data0 + addps xmm4,xmm0 ; xmm4=data2 + + movaps XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4 + + ; -- Odd part + + movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 + movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 + + addps xmm2,xmm3 ; xmm2=tmp10 + addps xmm3,xmm6 ; xmm3=tmp11 + addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7 + + mulps xmm3,[rel PD_0_707] ; xmm3=z3 + + movaps xmm1,xmm2 ; xmm1=tmp10 + subps xmm2,xmm6 + mulps xmm2,[rel PD_0_382] ; xmm2=z5 + mulps xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) + mulps xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) + addps xmm1,xmm2 ; xmm1=z2 + addps xmm6,xmm2 ; xmm6=z4 + + movaps xmm5,xmm0 + subps xmm0,xmm3 ; xmm0=z13 + addps xmm5,xmm3 ; xmm5=z11 + + movaps xmm7,xmm0 + movaps xmm4,xmm5 + subps xmm0,xmm1 ; xmm0=data3 + subps xmm5,xmm6 ; xmm5=data7 + addps xmm7,xmm1 ; xmm7=data5 + addps 
xmm4,xmm6 ; xmm4=data1 + + movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4 + + add rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT + dec rcx + jnz near .rowloop + + ; ---- Pass 2: process columns. + + mov rdx, r10 ; (FAST_FLOAT *) + mov rcx, DCTSIZE/4 .columnloop: - movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)] - movaps xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)] - - ; xmm0=(02 12 22 32), xmm2=(42 52 62 72) - ; xmm1=(03 13 23 33), xmm3=(43 53 63 73) - - movaps xmm4,xmm0 ; transpose coefficients(phase 1) - unpcklps xmm0,xmm1 ; xmm0=(02 03 12 13) - unpckhps xmm4,xmm1 ; xmm4=(22 23 32 33) - movaps xmm5,xmm2 ; transpose coefficients(phase 1) - unpcklps xmm2,xmm3 ; xmm2=(42 43 52 53) - unpckhps xmm5,xmm3 ; xmm5=(62 63 72 73) - - movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)] - movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)] - - ; xmm6=(00 10 20 30), xmm1=(40 50 60 70) - ; xmm7=(01 11 21 31), xmm3=(41 51 61 71) - - movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33) - movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53) - - movaps xmm4,xmm6 ; transpose coefficients(phase 1) - unpcklps xmm6,xmm7 ; xmm6=(00 01 10 11) - unpckhps xmm4,xmm7 ; xmm4=(20 21 30 31) - movaps xmm2,xmm1 ; transpose coefficients(phase 1) - unpcklps xmm1,xmm3 ; xmm1=(40 41 50 51) - unpckhps xmm2,xmm3 ; xmm2=(60 61 70 71) - - movaps xmm7,xmm6 ; transpose coefficients(phase 2) - unpcklps2 xmm6,xmm0 ; xmm6=(00 01 02 03)=data0 - unpckhps2 xmm7,xmm0 ; xmm7=(10 11 12 13)=data1 - movaps xmm3,xmm2 ; transpose coefficients(phase 2) - unpcklps2 xmm2,xmm5 ; 
xmm2=(60 61 62 63)=data6 - unpckhps2 xmm3,xmm5 ; xmm3=(70 71 72 73)=data7 - - movaps xmm0,xmm7 - movaps xmm5,xmm6 - subps xmm7,xmm2 ; xmm7=data1-data6=tmp6 - subps xmm6,xmm3 ; xmm6=data0-data7=tmp7 - addps xmm0,xmm2 ; xmm0=data1+data6=tmp1 - addps xmm5,xmm3 ; xmm5=data0+data7=tmp0 - - movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33) - movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53) - movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 - movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 - - movaps xmm7,xmm4 ; transpose coefficients(phase 2) - unpcklps2 xmm4,xmm2 ; xmm4=(20 21 22 23)=data2 - unpckhps2 xmm7,xmm2 ; xmm7=(30 31 32 33)=data3 - movaps xmm6,xmm1 ; transpose coefficients(phase 2) - unpcklps2 xmm1,xmm3 ; xmm1=(40 41 42 43)=data4 - unpckhps2 xmm6,xmm3 ; xmm6=(50 51 52 53)=data5 - - movaps xmm2,xmm7 - movaps xmm3,xmm4 - addps xmm7,xmm1 ; xmm7=data3+data4=tmp3 - addps xmm4,xmm6 ; xmm4=data2+data5=tmp2 - subps xmm2,xmm1 ; xmm2=data3-data4=tmp4 - subps xmm3,xmm6 ; xmm3=data2-data5=tmp5 - - ; -- Even part - - movaps xmm1,xmm5 - movaps xmm6,xmm0 - subps xmm5,xmm7 ; xmm5=tmp13 - subps xmm0,xmm4 ; xmm0=tmp12 - addps xmm1,xmm7 ; xmm1=tmp10 - addps xmm6,xmm4 ; xmm6=tmp11 - - addps xmm0,xmm5 - mulps xmm0,[rel PD_0_707] ; xmm0=z1 - - movaps xmm7,xmm1 - movaps xmm4,xmm5 - subps xmm1,xmm6 ; xmm1=data4 - subps xmm5,xmm0 ; xmm5=data6 - addps xmm7,xmm6 ; xmm7=data0 - addps xmm4,xmm0 ; xmm4=data2 - - movaps XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1 - movaps XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5 - movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7 - movaps XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4 - - ; -- Odd part - - movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 - movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 - - addps xmm2,xmm3 ; xmm2=tmp10 - addps xmm3,xmm6 ; xmm3=tmp11 - addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7 - - mulps xmm3,[rel PD_0_707] ; xmm3=z3 - - movaps xmm1,xmm2 ; xmm1=tmp10 - subps xmm2,xmm6 - mulps xmm2,[rel PD_0_382] ; xmm2=z5 - mulps xmm1,[rel 
PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) - mulps xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) - addps xmm1,xmm2 ; xmm1=z2 - addps xmm6,xmm2 ; xmm6=z4 - - movaps xmm5,xmm0 - subps xmm0,xmm3 ; xmm0=z13 - addps xmm5,xmm3 ; xmm5=z11 - - movaps xmm7,xmm0 - movaps xmm4,xmm5 - subps xmm0,xmm1 ; xmm0=data3 - subps xmm5,xmm6 ; xmm5=data7 - addps xmm7,xmm1 ; xmm7=data5 - addps xmm4,xmm6 ; xmm4=data1 - - movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0 - movaps XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5 - movaps XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7 - movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4 - - add rdx, byte 4*SIZEOF_FAST_FLOAT - dec rcx - jnz near .columnloop - - uncollect_args - mov rsp,rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp - pop rbp - ret + movaps xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)] + + ; xmm0=(02 12 22 32), xmm2=(42 52 62 72) + ; xmm1=(03 13 23 33), xmm3=(43 53 63 73) + + movaps xmm4,xmm0 ; transpose coefficients(phase 1) + unpcklps xmm0,xmm1 ; xmm0=(02 03 12 13) + unpckhps xmm4,xmm1 ; xmm4=(22 23 32 33) + movaps xmm5,xmm2 ; transpose coefficients(phase 1) + unpcklps xmm2,xmm3 ; xmm2=(42 43 52 53) + unpckhps xmm5,xmm3 ; xmm5=(62 63 72 73) + + movaps xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)] + + ; xmm6=(00 10 20 30), xmm1=(40 50 60 70) + ; xmm7=(01 11 21 31), xmm3=(41 51 61 71) + + movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33) + movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53) + + movaps xmm4,xmm6 ; transpose coefficients(phase 1) + unpcklps xmm6,xmm7 ; xmm6=(00 01 10 11) + unpckhps xmm4,xmm7 ; 
xmm4=(20 21 30 31) + movaps xmm2,xmm1 ; transpose coefficients(phase 1) + unpcklps xmm1,xmm3 ; xmm1=(40 41 50 51) + unpckhps xmm2,xmm3 ; xmm2=(60 61 70 71) + + movaps xmm7,xmm6 ; transpose coefficients(phase 2) + unpcklps2 xmm6,xmm0 ; xmm6=(00 01 02 03)=data0 + unpckhps2 xmm7,xmm0 ; xmm7=(10 11 12 13)=data1 + movaps xmm3,xmm2 ; transpose coefficients(phase 2) + unpcklps2 xmm2,xmm5 ; xmm2=(60 61 62 63)=data6 + unpckhps2 xmm3,xmm5 ; xmm3=(70 71 72 73)=data7 + + movaps xmm0,xmm7 + movaps xmm5,xmm6 + subps xmm7,xmm2 ; xmm7=data1-data6=tmp6 + subps xmm6,xmm3 ; xmm6=data0-data7=tmp7 + addps xmm0,xmm2 ; xmm0=data1+data6=tmp1 + addps xmm5,xmm3 ; xmm5=data0+data7=tmp0 + + movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33) + movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53) + movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 + movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 + + movaps xmm7,xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4,xmm2 ; xmm4=(20 21 22 23)=data2 + unpckhps2 xmm7,xmm2 ; xmm7=(30 31 32 33)=data3 + movaps xmm6,xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1,xmm3 ; xmm1=(40 41 42 43)=data4 + unpckhps2 xmm6,xmm3 ; xmm6=(50 51 52 53)=data5 + + movaps xmm2,xmm7 + movaps xmm3,xmm4 + addps xmm7,xmm1 ; xmm7=data3+data4=tmp3 + addps xmm4,xmm6 ; xmm4=data2+data5=tmp2 + subps xmm2,xmm1 ; xmm2=data3-data4=tmp4 + subps xmm3,xmm6 ; xmm3=data2-data5=tmp5 + + ; -- Even part + + movaps xmm1,xmm5 + movaps xmm6,xmm0 + subps xmm5,xmm7 ; xmm5=tmp13 + subps xmm0,xmm4 ; xmm0=tmp12 + addps xmm1,xmm7 ; xmm1=tmp10 + addps xmm6,xmm4 ; xmm6=tmp11 + + addps xmm0,xmm5 + mulps xmm0,[rel PD_0_707] ; xmm0=z1 + + movaps xmm7,xmm1 + movaps xmm4,xmm5 + subps xmm1,xmm6 ; xmm1=data4 + subps xmm5,xmm0 ; xmm5=data6 + addps xmm7,xmm6 ; xmm7=data0 + addps xmm4,xmm0 ; xmm4=data2 + + movaps XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD 
[XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4 + + ; -- Odd part + + movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 + movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 + + addps xmm2,xmm3 ; xmm2=tmp10 + addps xmm3,xmm6 ; xmm3=tmp11 + addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7 + + mulps xmm3,[rel PD_0_707] ; xmm3=z3 + + movaps xmm1,xmm2 ; xmm1=tmp10 + subps xmm2,xmm6 + mulps xmm2,[rel PD_0_382] ; xmm2=z5 + mulps xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) + mulps xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) + addps xmm1,xmm2 ; xmm1=z2 + addps xmm6,xmm2 ; xmm6=z4 + + movaps xmm5,xmm0 + subps xmm0,xmm3 ; xmm0=z13 + addps xmm5,xmm3 ; xmm5=z11 + + movaps xmm7,xmm0 + movaps xmm4,xmm5 + subps xmm0,xmm1 ; xmm0=data3 + subps xmm5,xmm6 ; xmm5=data7 + addps xmm7,xmm1 ; xmm7=data5 + addps xmm4,xmm6 ; xmm4=data1 + + movaps XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4 + + add rdx, byte 4*SIZEOF_FAST_FLOAT + dec rcx + jnz near .columnloop + + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jfsseflt.asm b/simd/jfsseflt.asm index bc54cccde..5ece3f46d 100644 --- a/simd/jfsseflt.asm +++ b/simd/jfsseflt.asm @@ -25,32 +25,32 @@ ; -------------------------------------------------------------------------- -%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) - shufps %1,%2,0x44 +%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) + shufps %1,%2,0x44 %endmacro -%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) - shufps %1,%2,0xEE +%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) + shufps %1,%2,0xEE %endmacro ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_fdct_float_sse) + alignz 16 + global EXTN(jconst_fdct_float_sse) EXTN(jconst_fdct_float_sse): -PD_0_382 times 4 dd 0.382683432365089771728460 -PD_0_707 times 4 dd 0.707106781186547524400844 -PD_0_541 times 4 dd 0.541196100146196984399723 -PD_1_306 times 4 dd 1.306562964876376527856643 +PD_0_382 times 4 dd 0.382683432365089771728460 +PD_0_707 times 4 dd 0.707106781186547524400844 +PD_0_541 times 4 dd 0.541196100146196984399723 +PD_1_306 times 4 dd 1.306562964876376527856643 - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 ; ; Perform the forward DCT on one block of samples. 
; @@ -58,313 +58,313 @@ PD_1_306 times 4 dd 1.306562964876376527856643 ; jsimd_fdct_float_sse (FAST_FLOAT * data) ; -%define data(b) (b)+8 ; FAST_FLOAT * data +%define data(b) (b)+8 ; FAST_FLOAT * data -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 2 +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 - align 16 - global EXTN(jsimd_fdct_float_sse) + align 16 + global EXTN(jsimd_fdct_float_sse) EXTN(jsimd_fdct_float_sse): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved -; push esi ; unused -; push edi ; unused - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process rows. - - mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) - mov ecx, DCTSIZE/4 - alignx 16,7 + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved +; push esi ; unused +; push edi ; unused + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process rows. 
+ + mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) + mov ecx, DCTSIZE/4 + alignx 16,7 .rowloop: - movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)] - movaps xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)] - - ; xmm0=(20 21 22 23), xmm2=(24 25 26 27) - ; xmm1=(30 31 32 33), xmm3=(34 35 36 37) - - movaps xmm4,xmm0 ; transpose coefficients(phase 1) - unpcklps xmm0,xmm1 ; xmm0=(20 30 21 31) - unpckhps xmm4,xmm1 ; xmm4=(22 32 23 33) - movaps xmm5,xmm2 ; transpose coefficients(phase 1) - unpcklps xmm2,xmm3 ; xmm2=(24 34 25 35) - unpckhps xmm5,xmm3 ; xmm5=(26 36 27 37) - - movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] - movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] - - ; xmm6=(00 01 02 03), xmm1=(04 05 06 07) - ; xmm7=(10 11 12 13), xmm3=(14 15 16 17) - - movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33) - movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35) - - movaps xmm4,xmm6 ; transpose coefficients(phase 1) - unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11) - unpckhps xmm4,xmm7 ; xmm4=(02 12 03 13) - movaps xmm2,xmm1 ; transpose coefficients(phase 1) - unpcklps xmm1,xmm3 ; xmm1=(04 14 05 15) - unpckhps xmm2,xmm3 ; xmm2=(06 16 07 17) - - movaps xmm7,xmm6 ; transpose coefficients(phase 2) - unpcklps2 xmm6,xmm0 ; xmm6=(00 10 20 30)=data0 - unpckhps2 xmm7,xmm0 ; xmm7=(01 11 21 31)=data1 - movaps xmm3,xmm2 ; transpose coefficients(phase 2) - unpcklps2 xmm2,xmm5 ; xmm2=(06 16 26 36)=data6 - unpckhps2 xmm3,xmm5 ; xmm3=(07 17 27 37)=data7 - - movaps xmm0,xmm7 - movaps xmm5,xmm6 - subps xmm7,xmm2 ; xmm7=data1-data6=tmp6 - subps xmm6,xmm3 ; xmm6=data0-data7=tmp7 - addps xmm0,xmm2 ; xmm0=data1+data6=tmp1 - addps xmm5,xmm3 ; xmm5=data0+data7=tmp0 - - movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33) - movaps 
xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35) - movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 - movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 - - movaps xmm7,xmm4 ; transpose coefficients(phase 2) - unpcklps2 xmm4,xmm2 ; xmm4=(02 12 22 32)=data2 - unpckhps2 xmm7,xmm2 ; xmm7=(03 13 23 33)=data3 - movaps xmm6,xmm1 ; transpose coefficients(phase 2) - unpcklps2 xmm1,xmm3 ; xmm1=(04 14 24 34)=data4 - unpckhps2 xmm6,xmm3 ; xmm6=(05 15 25 35)=data5 - - movaps xmm2,xmm7 - movaps xmm3,xmm4 - addps xmm7,xmm1 ; xmm7=data3+data4=tmp3 - addps xmm4,xmm6 ; xmm4=data2+data5=tmp2 - subps xmm2,xmm1 ; xmm2=data3-data4=tmp4 - subps xmm3,xmm6 ; xmm3=data2-data5=tmp5 - - ; -- Even part - - movaps xmm1,xmm5 - movaps xmm6,xmm0 - subps xmm5,xmm7 ; xmm5=tmp13 - subps xmm0,xmm4 ; xmm0=tmp12 - addps xmm1,xmm7 ; xmm1=tmp10 - addps xmm6,xmm4 ; xmm6=tmp11 - - addps xmm0,xmm5 - mulps xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1 - - movaps xmm7,xmm1 - movaps xmm4,xmm5 - subps xmm1,xmm6 ; xmm1=data4 - subps xmm5,xmm0 ; xmm5=data6 - addps xmm7,xmm6 ; xmm7=data0 - addps xmm4,xmm0 ; xmm4=data2 - - movaps XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1 - movaps XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5 - movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7 - movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4 - - ; -- Odd part - - movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 - movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 - - addps xmm2,xmm3 ; xmm2=tmp10 - addps xmm3,xmm6 ; xmm3=tmp11 - addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7 - - mulps xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3 - - movaps xmm1,xmm2 ; xmm1=tmp10 - subps xmm2,xmm6 - mulps xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5 - mulps xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) - mulps xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) - addps xmm1,xmm2 ; xmm1=z2 - addps xmm6,xmm2 ; xmm6=z4 - - movaps xmm5,xmm0 - subps xmm0,xmm3 ; xmm0=z13 - addps xmm5,xmm3 ; xmm5=z11 - - movaps xmm7,xmm0 - movaps xmm4,xmm5 - subps xmm0,xmm1 ; 
xmm0=data3 - subps xmm5,xmm6 ; xmm5=data7 - addps xmm7,xmm1 ; xmm7=data5 - addps xmm4,xmm6 ; xmm4=data1 - - movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0 - movaps XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5 - movaps XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7 - movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4 - - add edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT - dec ecx - jnz near .rowloop - - ; ---- Pass 2: process columns. - - mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) - mov ecx, DCTSIZE/4 - alignx 16,7 + movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)] + + ; xmm0=(20 21 22 23), xmm2=(24 25 26 27) + ; xmm1=(30 31 32 33), xmm3=(34 35 36 37) + + movaps xmm4,xmm0 ; transpose coefficients(phase 1) + unpcklps xmm0,xmm1 ; xmm0=(20 30 21 31) + unpckhps xmm4,xmm1 ; xmm4=(22 32 23 33) + movaps xmm5,xmm2 ; transpose coefficients(phase 1) + unpcklps xmm2,xmm3 ; xmm2=(24 34 25 35) + unpckhps xmm5,xmm3 ; xmm5=(26 36 27 37) + + movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] + + ; xmm6=(00 01 02 03), xmm1=(04 05 06 07) + ; xmm7=(10 11 12 13), xmm3=(14 15 16 17) + + movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 32 23 33) + movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(24 34 25 35) + + movaps xmm4,xmm6 ; transpose coefficients(phase 1) + unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11) + unpckhps xmm4,xmm7 ; xmm4=(02 12 03 13) + movaps xmm2,xmm1 ; transpose coefficients(phase 1) + unpcklps xmm1,xmm3 ; xmm1=(04 14 05 15) + unpckhps xmm2,xmm3 ; xmm2=(06 16 07 17) + + movaps xmm7,xmm6 ; transpose coefficients(phase 2) + unpcklps2 xmm6,xmm0 ; xmm6=(00 10 20 30)=data0 + unpckhps2 xmm7,xmm0 ; 
xmm7=(01 11 21 31)=data1 + movaps xmm3,xmm2 ; transpose coefficients(phase 2) + unpcklps2 xmm2,xmm5 ; xmm2=(06 16 26 36)=data6 + unpckhps2 xmm3,xmm5 ; xmm3=(07 17 27 37)=data7 + + movaps xmm0,xmm7 + movaps xmm5,xmm6 + subps xmm7,xmm2 ; xmm7=data1-data6=tmp6 + subps xmm6,xmm3 ; xmm6=data0-data7=tmp7 + addps xmm0,xmm2 ; xmm0=data1+data6=tmp1 + addps xmm5,xmm3 ; xmm5=data0+data7=tmp0 + + movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 32 23 33) + movaps xmm3, XMMWORD [wk(1)] ; xmm3=(24 34 25 35) + movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 + movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 + + movaps xmm7,xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4,xmm2 ; xmm4=(02 12 22 32)=data2 + unpckhps2 xmm7,xmm2 ; xmm7=(03 13 23 33)=data3 + movaps xmm6,xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1,xmm3 ; xmm1=(04 14 24 34)=data4 + unpckhps2 xmm6,xmm3 ; xmm6=(05 15 25 35)=data5 + + movaps xmm2,xmm7 + movaps xmm3,xmm4 + addps xmm7,xmm1 ; xmm7=data3+data4=tmp3 + addps xmm4,xmm6 ; xmm4=data2+data5=tmp2 + subps xmm2,xmm1 ; xmm2=data3-data4=tmp4 + subps xmm3,xmm6 ; xmm3=data2-data5=tmp5 + + ; -- Even part + + movaps xmm1,xmm5 + movaps xmm6,xmm0 + subps xmm5,xmm7 ; xmm5=tmp13 + subps xmm0,xmm4 ; xmm0=tmp12 + addps xmm1,xmm7 ; xmm1=tmp10 + addps xmm6,xmm4 ; xmm6=tmp11 + + addps xmm0,xmm5 + mulps xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1 + + movaps xmm7,xmm1 + movaps xmm4,xmm5 + subps xmm1,xmm6 ; xmm1=data4 + subps xmm5,xmm0 ; xmm5=data6 + addps xmm7,xmm6 ; xmm7=data0 + addps xmm4,xmm0 ; xmm4=data2 + + movaps XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4 + + ; -- Odd part + + movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 + movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 + + addps xmm2,xmm3 ; xmm2=tmp10 + addps xmm3,xmm6 ; xmm3=tmp11 + addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7 + + mulps xmm3,[GOTOFF(ebx,PD_0_707)] 
; xmm3=z3 + + movaps xmm1,xmm2 ; xmm1=tmp10 + subps xmm2,xmm6 + mulps xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5 + mulps xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) + mulps xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) + addps xmm1,xmm2 ; xmm1=z2 + addps xmm6,xmm2 ; xmm6=z4 + + movaps xmm5,xmm0 + subps xmm0,xmm3 ; xmm0=z13 + addps xmm5,xmm3 ; xmm5=z11 + + movaps xmm7,xmm0 + movaps xmm4,xmm5 + subps xmm0,xmm1 ; xmm0=data3 + subps xmm5,xmm6 ; xmm5=data7 + addps xmm7,xmm1 ; xmm7=data5 + addps xmm4,xmm6 ; xmm4=data1 + + movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4 + + add edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT + dec ecx + jnz near .rowloop + + ; ---- Pass 2: process columns. + + mov edx, POINTER [data(eax)] ; (FAST_FLOAT *) + mov ecx, DCTSIZE/4 + alignx 16,7 .columnloop: - movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)] - movaps xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)] - - ; xmm0=(02 12 22 32), xmm2=(42 52 62 72) - ; xmm1=(03 13 23 33), xmm3=(43 53 63 73) - - movaps xmm4,xmm0 ; transpose coefficients(phase 1) - unpcklps xmm0,xmm1 ; xmm0=(02 03 12 13) - unpckhps xmm4,xmm1 ; xmm4=(22 23 32 33) - movaps xmm5,xmm2 ; transpose coefficients(phase 1) - unpcklps xmm2,xmm3 ; xmm2=(42 43 52 53) - unpckhps xmm5,xmm3 ; xmm5=(62 63 72 73) - - movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] - movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)] - - ; xmm6=(00 10 20 30), xmm1=(40 50 60 70) - ; xmm7=(01 11 21 31), xmm3=(41 51 61 71) - - movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33) 
- movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53) - - movaps xmm4,xmm6 ; transpose coefficients(phase 1) - unpcklps xmm6,xmm7 ; xmm6=(00 01 10 11) - unpckhps xmm4,xmm7 ; xmm4=(20 21 30 31) - movaps xmm2,xmm1 ; transpose coefficients(phase 1) - unpcklps xmm1,xmm3 ; xmm1=(40 41 50 51) - unpckhps xmm2,xmm3 ; xmm2=(60 61 70 71) - - movaps xmm7,xmm6 ; transpose coefficients(phase 2) - unpcklps2 xmm6,xmm0 ; xmm6=(00 01 02 03)=data0 - unpckhps2 xmm7,xmm0 ; xmm7=(10 11 12 13)=data1 - movaps xmm3,xmm2 ; transpose coefficients(phase 2) - unpcklps2 xmm2,xmm5 ; xmm2=(60 61 62 63)=data6 - unpckhps2 xmm3,xmm5 ; xmm3=(70 71 72 73)=data7 - - movaps xmm0,xmm7 - movaps xmm5,xmm6 - subps xmm7,xmm2 ; xmm7=data1-data6=tmp6 - subps xmm6,xmm3 ; xmm6=data0-data7=tmp7 - addps xmm0,xmm2 ; xmm0=data1+data6=tmp1 - addps xmm5,xmm3 ; xmm5=data0+data7=tmp0 - - movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33) - movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53) - movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 - movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 - - movaps xmm7,xmm4 ; transpose coefficients(phase 2) - unpcklps2 xmm4,xmm2 ; xmm4=(20 21 22 23)=data2 - unpckhps2 xmm7,xmm2 ; xmm7=(30 31 32 33)=data3 - movaps xmm6,xmm1 ; transpose coefficients(phase 2) - unpcklps2 xmm1,xmm3 ; xmm1=(40 41 42 43)=data4 - unpckhps2 xmm6,xmm3 ; xmm6=(50 51 52 53)=data5 - - movaps xmm2,xmm7 - movaps xmm3,xmm4 - addps xmm7,xmm1 ; xmm7=data3+data4=tmp3 - addps xmm4,xmm6 ; xmm4=data2+data5=tmp2 - subps xmm2,xmm1 ; xmm2=data3-data4=tmp4 - subps xmm3,xmm6 ; xmm3=data2-data5=tmp5 - - ; -- Even part - - movaps xmm1,xmm5 - movaps xmm6,xmm0 - subps xmm5,xmm7 ; xmm5=tmp13 - subps xmm0,xmm4 ; xmm0=tmp12 - addps xmm1,xmm7 ; xmm1=tmp10 - addps xmm6,xmm4 ; xmm6=tmp11 - - addps xmm0,xmm5 - mulps xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1 - - movaps xmm7,xmm1 - movaps xmm4,xmm5 - subps xmm1,xmm6 ; xmm1=data4 - subps xmm5,xmm0 ; xmm5=data6 - addps xmm7,xmm6 ; xmm7=data0 - addps xmm4,xmm0 ; xmm4=data2 - - movaps XMMWORD 
[XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1 - movaps XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5 - movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7 - movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4 - - ; -- Odd part - - movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 - movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 - - addps xmm2,xmm3 ; xmm2=tmp10 - addps xmm3,xmm6 ; xmm3=tmp11 - addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7 - - mulps xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3 - - movaps xmm1,xmm2 ; xmm1=tmp10 - subps xmm2,xmm6 - mulps xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5 - mulps xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) - mulps xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) - addps xmm1,xmm2 ; xmm1=z2 - addps xmm6,xmm2 ; xmm6=z4 - - movaps xmm5,xmm0 - subps xmm0,xmm3 ; xmm0=z13 - addps xmm5,xmm3 ; xmm5=z11 - - movaps xmm7,xmm0 - movaps xmm4,xmm5 - subps xmm0,xmm1 ; xmm0=data3 - subps xmm5,xmm6 ; xmm5=data7 - addps xmm7,xmm1 ; xmm7=data5 - addps xmm4,xmm6 ; xmm4=data1 - - movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0 - movaps XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5 - movaps XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7 - movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4 - - add edx, byte 4*SIZEOF_FAST_FLOAT - dec ecx - jnz near .columnloop - -; pop edi ; unused -; pop esi ; unused -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - poppic ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + movaps xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)] + + ; xmm0=(02 12 22 32), xmm2=(42 52 62 72) + ; xmm1=(03 13 23 33), xmm3=(43 53 63 73) + + movaps xmm4,xmm0 ; transpose coefficients(phase 1) + unpcklps xmm0,xmm1 ; xmm0=(02 03 12 13) + unpckhps 
xmm4,xmm1 ; xmm4=(22 23 32 33) + movaps xmm5,xmm2 ; transpose coefficients(phase 1) + unpcklps xmm2,xmm3 ; xmm2=(42 43 52 53) + unpckhps xmm5,xmm3 ; xmm5=(62 63 72 73) + + movaps xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)] + + ; xmm6=(00 10 20 30), xmm1=(40 50 60 70) + ; xmm7=(01 11 21 31), xmm3=(41 51 61 71) + + movaps XMMWORD [wk(0)], xmm4 ; wk(0)=(22 23 32 33) + movaps XMMWORD [wk(1)], xmm2 ; wk(1)=(42 43 52 53) + + movaps xmm4,xmm6 ; transpose coefficients(phase 1) + unpcklps xmm6,xmm7 ; xmm6=(00 01 10 11) + unpckhps xmm4,xmm7 ; xmm4=(20 21 30 31) + movaps xmm2,xmm1 ; transpose coefficients(phase 1) + unpcklps xmm1,xmm3 ; xmm1=(40 41 50 51) + unpckhps xmm2,xmm3 ; xmm2=(60 61 70 71) + + movaps xmm7,xmm6 ; transpose coefficients(phase 2) + unpcklps2 xmm6,xmm0 ; xmm6=(00 01 02 03)=data0 + unpckhps2 xmm7,xmm0 ; xmm7=(10 11 12 13)=data1 + movaps xmm3,xmm2 ; transpose coefficients(phase 2) + unpcklps2 xmm2,xmm5 ; xmm2=(60 61 62 63)=data6 + unpckhps2 xmm3,xmm5 ; xmm3=(70 71 72 73)=data7 + + movaps xmm0,xmm7 + movaps xmm5,xmm6 + subps xmm7,xmm2 ; xmm7=data1-data6=tmp6 + subps xmm6,xmm3 ; xmm6=data0-data7=tmp7 + addps xmm0,xmm2 ; xmm0=data1+data6=tmp1 + addps xmm5,xmm3 ; xmm5=data0+data7=tmp0 + + movaps xmm2, XMMWORD [wk(0)] ; xmm2=(22 23 32 33) + movaps xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53) + movaps XMMWORD [wk(0)], xmm7 ; wk(0)=tmp6 + movaps XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 + + movaps xmm7,xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4,xmm2 ; xmm4=(20 21 22 23)=data2 + unpckhps2 xmm7,xmm2 ; xmm7=(30 31 32 33)=data3 + movaps xmm6,xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1,xmm3 ; xmm1=(40 41 42 43)=data4 + unpckhps2 xmm6,xmm3 ; xmm6=(50 51 52 53)=data5 + + movaps xmm2,xmm7 + movaps xmm3,xmm4 + addps xmm7,xmm1 ; xmm7=data3+data4=tmp3 + addps xmm4,xmm6 ; 
xmm4=data2+data5=tmp2 + subps xmm2,xmm1 ; xmm2=data3-data4=tmp4 + subps xmm3,xmm6 ; xmm3=data2-data5=tmp5 + + ; -- Even part + + movaps xmm1,xmm5 + movaps xmm6,xmm0 + subps xmm5,xmm7 ; xmm5=tmp13 + subps xmm0,xmm4 ; xmm0=tmp12 + addps xmm1,xmm7 ; xmm1=tmp10 + addps xmm6,xmm4 ; xmm6=tmp11 + + addps xmm0,xmm5 + mulps xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1 + + movaps xmm7,xmm1 + movaps xmm4,xmm5 + subps xmm1,xmm6 ; xmm1=data4 + subps xmm5,xmm0 ; xmm5=data6 + addps xmm7,xmm6 ; xmm7=data0 + addps xmm4,xmm0 ; xmm4=data2 + + movaps XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4 + + ; -- Odd part + + movaps xmm6, XMMWORD [wk(0)] ; xmm6=tmp6 + movaps xmm0, XMMWORD [wk(1)] ; xmm0=tmp7 + + addps xmm2,xmm3 ; xmm2=tmp10 + addps xmm3,xmm6 ; xmm3=tmp11 + addps xmm6,xmm0 ; xmm6=tmp12, xmm0=tmp7 + + mulps xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3 + + movaps xmm1,xmm2 ; xmm1=tmp10 + subps xmm2,xmm6 + mulps xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5 + mulps xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196) + mulps xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562) + addps xmm1,xmm2 ; xmm1=z2 + addps xmm6,xmm2 ; xmm6=z4 + + movaps xmm5,xmm0 + subps xmm0,xmm3 ; xmm0=z13 + addps xmm5,xmm3 ; xmm5=z11 + + movaps xmm7,xmm0 + movaps xmm4,xmm5 + subps xmm0,xmm1 ; xmm0=data3 + subps xmm5,xmm6 ; xmm5=data7 + addps xmm7,xmm1 ; xmm7=data5 + addps xmm4,xmm6 ; xmm4=data1 + + movaps XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7 + movaps XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4 + + add edx, byte 4*SIZEOF_FAST_FLOAT + dec ecx + jnz near .columnloop + +; pop edi ; unused +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + 
poppic ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/ji3dnflt.asm b/simd/ji3dnflt.asm index dc2076f41..76303fa4c 100644 --- a/simd/ji3dnflt.asm +++ b/simd/ji3dnflt.asm @@ -24,25 +24,25 @@ %include "jdct.inc" ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_idct_float_3dnow) + alignz 16 + global EXTN(jconst_idct_float_3dnow) EXTN(jconst_idct_float_3dnow): -PD_1_414 times 2 dd 1.414213562373095048801689 -PD_1_847 times 2 dd 1.847759065022573512256366 -PD_1_082 times 2 dd 1.082392200292393968799446 -PD_2_613 times 2 dd 2.613125929752753055713286 -PD_RNDINT_MAGIC times 2 dd 100663296.0 ; (float)(0x00C00000 << 3) -PB_CENTERJSAMP times 8 db CENTERJSAMPLE +PD_1_414 times 2 dd 1.414213562373095048801689 +PD_1_847 times 2 dd 1.847759065022573512256366 +PD_1_082 times 2 dd 1.082392200292393968799446 +PD_2_613 times 2 dd 2.613125929752753055713286 +PD_RNDINT_MAGIC times 2 dd 100663296.0 ; (float)(0x00C00000 << 3) +PB_CENTERJSAMP times 8 db CENTERJSAMPLE - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 ; ; Perform dequantization and inverse DCT on one block of coefficients. 
; @@ -51,402 +51,402 @@ PB_CENTERJSAMP times 8 db CENTERJSAMPLE ; JSAMPARRAY output_buf, JDIMENSION output_col) ; -%define dct_table(b) (b)+8 ; void * dct_table -%define coef_block(b) (b)+12 ; JCOEFPTR coef_block -%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf -%define output_col(b) (b)+20 ; JDIMENSION output_col +%define dct_table(b) (b)+8 ; void * dct_table +%define coef_block(b) (b)+12 ; JCOEFPTR coef_block +%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf +%define output_col(b) (b)+20 ; JDIMENSION output_col -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] -%define WK_NUM 2 -%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT - ; FAST_FLOAT workspace[DCTSIZE2] +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 2 +%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT + ; FAST_FLOAT workspace[DCTSIZE2] - align 16 - global EXTN(jsimd_idct_float_3dnow) + align 16 + global EXTN(jsimd_idct_float_3dnow) EXTN(jsimd_idct_float_3dnow): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [workspace] - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process columns from input, store into work array. 
- -; mov eax, [original_ebp] - mov edx, POINTER [dct_table(eax)] ; quantptr - mov esi, JCOEFPTR [coef_block(eax)] ; inptr - lea edi, [workspace] ; FAST_FLOAT * wsptr - mov ecx, DCTSIZE/2 ; ctr - alignx 16,7 + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [workspace] + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input, store into work array. + +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + lea edi, [workspace] ; FAST_FLOAT * wsptr + mov ecx, DCTSIZE/2 ; ctr + alignx 16,7 .columnloop: %ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW - mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] - jnz short .columnDCT - - pushpic ebx ; save GOT address - mov ebx, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)] - mov eax, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)] - or ebx, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)] - or ebx, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)] - or eax,ebx - poppic ebx ; restore GOT address - jnz short .columnDCT - - ; -- AC terms all zero - - movd mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)] - - punpcklwd mm0,mm0 - psrad mm0,(DWORD_BIT-WORD_BIT) - pi2fd mm0,mm0 - - pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - - movq mm1,mm0 - punpckldq mm0,mm0 - punpckhdq mm1,mm1 - - movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm0 - movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm0 - movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm0 - movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0 - movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1 - movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm1 - movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1 - 
movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1 - jmp near .nextcolumn - alignx 16,7 + mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz short .columnDCT + + pushpic ebx ; save GOT address + mov ebx, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)] + mov eax, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)] + or ebx, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)] + or ebx, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)] + or eax,ebx + poppic ebx ; restore GOT address + jnz short .columnDCT + + ; -- AC terms all zero + + movd mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)] + + punpcklwd mm0,mm0 + psrad mm0,(DWORD_BIT-WORD_BIT) + pi2fd mm0,mm0 + + pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movq mm1,mm0 + punpckldq mm0,mm0 + punpckhdq mm1,mm1 + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm0 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm0 + movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm0 + movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm1 + movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1 + movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1 + jmp near .nextcolumn + alignx 16,7 %endif .columnDCT: - ; -- Even part - - movd mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)] - movd mm1, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] - movd mm2, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)] - movd mm3, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)] - - punpcklwd mm0,mm0 - punpcklwd mm1,mm1 - psrad mm0,(DWORD_BIT-WORD_BIT) - psrad mm1,(DWORD_BIT-WORD_BIT) - pi2fd mm0,mm0 - pi2fd mm1,mm1 - - pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - pfmul mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - - punpcklwd mm2,mm2 - punpcklwd mm3,mm3 - psrad mm2,(DWORD_BIT-WORD_BIT) - psrad mm3,(DWORD_BIT-WORD_BIT) - pi2fd mm2,mm2 - pi2fd mm3,mm3 - - pfmul mm2, MMWORD 
[MMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - pfmul mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - - movq mm4,mm0 - movq mm5,mm1 - pfsub mm0,mm2 ; mm0=tmp11 - pfsub mm1,mm3 - pfadd mm4,mm2 ; mm4=tmp10 - pfadd mm5,mm3 ; mm5=tmp13 - - pfmul mm1,[GOTOFF(ebx,PD_1_414)] - pfsub mm1,mm5 ; mm1=tmp12 - - movq mm6,mm4 - movq mm7,mm0 - pfsub mm4,mm5 ; mm4=tmp3 - pfsub mm0,mm1 ; mm0=tmp2 - pfadd mm6,mm5 ; mm6=tmp0 - pfadd mm7,mm1 ; mm7=tmp1 - - movq MMWORD [wk(1)], mm4 ; tmp3 - movq MMWORD [wk(0)], mm0 ; tmp2 - - ; -- Odd part - - movd mm2, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] - movd mm3, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)] - movd mm5, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)] - movd mm1, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)] - - punpcklwd mm2,mm2 - punpcklwd mm3,mm3 - psrad mm2,(DWORD_BIT-WORD_BIT) - psrad mm3,(DWORD_BIT-WORD_BIT) - pi2fd mm2,mm2 - pi2fd mm3,mm3 - - pfmul mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - pfmul mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - - punpcklwd mm5,mm5 - punpcklwd mm1,mm1 - psrad mm5,(DWORD_BIT-WORD_BIT) - psrad mm1,(DWORD_BIT-WORD_BIT) - pi2fd mm5,mm5 - pi2fd mm1,mm1 - - pfmul mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - pfmul mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - - movq mm4,mm2 - movq mm0,mm5 - pfadd mm2,mm1 ; mm2=z11 - pfadd mm5,mm3 ; mm5=z13 - pfsub mm4,mm1 ; mm4=z12 - pfsub mm0,mm3 ; mm0=z10 - - movq mm1,mm2 - pfsub mm2,mm5 - pfadd mm1,mm5 ; mm1=tmp7 - - pfmul mm2,[GOTOFF(ebx,PD_1_414)] ; mm2=tmp11 - - movq mm3,mm0 - pfadd mm0,mm4 - pfmul mm0,[GOTOFF(ebx,PD_1_847)] ; mm0=z5 - pfmul mm3,[GOTOFF(ebx,PD_2_613)] ; mm3=(z10 * 2.613125930) - pfmul mm4,[GOTOFF(ebx,PD_1_082)] ; mm4=(z12 * 1.082392200) - pfsubr mm3,mm0 ; mm3=tmp12 - pfsub mm4,mm0 ; mm4=tmp10 - - ; -- Final output stage - - pfsub mm3,mm1 ; mm3=tmp6 - movq mm5,mm6 - movq mm0,mm7 - pfadd mm6,mm1 ; mm6=data0=(00 01) - pfadd mm7,mm3 ; mm7=data1=(10 11) - pfsub mm5,mm1 ; mm5=data7=(70 71) - pfsub mm0,mm3 ; mm0=data6=(60 61) - 
pfsub mm2,mm3 ; mm2=tmp5 - - movq mm1,mm6 ; transpose coefficients - punpckldq mm6,mm7 ; mm6=(00 10) - punpckhdq mm1,mm7 ; mm1=(01 11) - movq mm3,mm0 ; transpose coefficients - punpckldq mm0,mm5 ; mm0=(60 70) - punpckhdq mm3,mm5 ; mm3=(61 71) - - movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm6 - movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1 - movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0 - movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm3 - - movq mm7, MMWORD [wk(0)] ; mm7=tmp2 - movq mm5, MMWORD [wk(1)] ; mm5=tmp3 - - pfadd mm4,mm2 ; mm4=tmp4 - movq mm6,mm7 - movq mm1,mm5 - pfadd mm7,mm2 ; mm7=data2=(20 21) - pfadd mm5,mm4 ; mm5=data4=(40 41) - pfsub mm6,mm2 ; mm6=data5=(50 51) - pfsub mm1,mm4 ; mm1=data3=(30 31) - - movq mm0,mm7 ; transpose coefficients - punpckldq mm7,mm1 ; mm7=(20 30) - punpckhdq mm0,mm1 ; mm0=(21 31) - movq mm3,mm5 ; transpose coefficients - punpckldq mm5,mm6 ; mm5=(40 50) - punpckhdq mm3,mm6 ; mm3=(41 51) - - movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm7 - movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm0 - movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5 - movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm3 + ; -- Even part + + movd mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)] + movd mm1, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + movd mm2, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)] + movd mm3, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)] + + punpcklwd mm0,mm0 + punpcklwd mm1,mm1 + psrad mm0,(DWORD_BIT-WORD_BIT) + psrad mm1,(DWORD_BIT-WORD_BIT) + pi2fd mm0,mm0 + pi2fd mm1,mm1 + + pfmul mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + pfmul mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + punpcklwd mm2,mm2 + punpcklwd mm3,mm3 + psrad mm2,(DWORD_BIT-WORD_BIT) + psrad mm3,(DWORD_BIT-WORD_BIT) + pi2fd mm2,mm2 + pi2fd mm3,mm3 + + pfmul mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + pfmul mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movq mm4,mm0 + movq mm5,mm1 + pfsub mm0,mm2 ; 
mm0=tmp11 + pfsub mm1,mm3 + pfadd mm4,mm2 ; mm4=tmp10 + pfadd mm5,mm3 ; mm5=tmp13 + + pfmul mm1,[GOTOFF(ebx,PD_1_414)] + pfsub mm1,mm5 ; mm1=tmp12 + + movq mm6,mm4 + movq mm7,mm0 + pfsub mm4,mm5 ; mm4=tmp3 + pfsub mm0,mm1 ; mm0=tmp2 + pfadd mm6,mm5 ; mm6=tmp0 + pfadd mm7,mm1 ; mm7=tmp1 + + movq MMWORD [wk(1)], mm4 ; tmp3 + movq MMWORD [wk(0)], mm0 ; tmp2 + + ; -- Odd part + + movd mm2, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + movd mm3, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)] + movd mm5, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)] + movd mm1, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)] + + punpcklwd mm2,mm2 + punpcklwd mm3,mm3 + psrad mm2,(DWORD_BIT-WORD_BIT) + psrad mm3,(DWORD_BIT-WORD_BIT) + pi2fd mm2,mm2 + pi2fd mm3,mm3 + + pfmul mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + pfmul mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + punpcklwd mm5,mm5 + punpcklwd mm1,mm1 + psrad mm5,(DWORD_BIT-WORD_BIT) + psrad mm1,(DWORD_BIT-WORD_BIT) + pi2fd mm5,mm5 + pi2fd mm1,mm1 + + pfmul mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + pfmul mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movq mm4,mm2 + movq mm0,mm5 + pfadd mm2,mm1 ; mm2=z11 + pfadd mm5,mm3 ; mm5=z13 + pfsub mm4,mm1 ; mm4=z12 + pfsub mm0,mm3 ; mm0=z10 + + movq mm1,mm2 + pfsub mm2,mm5 + pfadd mm1,mm5 ; mm1=tmp7 + + pfmul mm2,[GOTOFF(ebx,PD_1_414)] ; mm2=tmp11 + + movq mm3,mm0 + pfadd mm0,mm4 + pfmul mm0,[GOTOFF(ebx,PD_1_847)] ; mm0=z5 + pfmul mm3,[GOTOFF(ebx,PD_2_613)] ; mm3=(z10 * 2.613125930) + pfmul mm4,[GOTOFF(ebx,PD_1_082)] ; mm4=(z12 * 1.082392200) + pfsubr mm3,mm0 ; mm3=tmp12 + pfsub mm4,mm0 ; mm4=tmp10 + + ; -- Final output stage + + pfsub mm3,mm1 ; mm3=tmp6 + movq mm5,mm6 + movq mm0,mm7 + pfadd mm6,mm1 ; mm6=data0=(00 01) + pfadd mm7,mm3 ; mm7=data1=(10 11) + pfsub mm5,mm1 ; mm5=data7=(70 71) + pfsub mm0,mm3 ; mm0=data6=(60 61) + pfsub mm2,mm3 ; mm2=tmp5 + + movq mm1,mm6 ; transpose coefficients + punpckldq mm6,mm7 ; mm6=(00 10) + punpckhdq mm1,mm7 ; mm1=(01 11) + movq mm3,mm0 ; 
transpose coefficients + punpckldq mm0,mm5 ; mm0=(60 70) + punpckhdq mm3,mm5 ; mm3=(61 71) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm6 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1 + movq MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0 + movq MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm3 + + movq mm7, MMWORD [wk(0)] ; mm7=tmp2 + movq mm5, MMWORD [wk(1)] ; mm5=tmp3 + + pfadd mm4,mm2 ; mm4=tmp4 + movq mm6,mm7 + movq mm1,mm5 + pfadd mm7,mm2 ; mm7=data2=(20 21) + pfadd mm5,mm4 ; mm5=data4=(40 41) + pfsub mm6,mm2 ; mm6=data5=(50 51) + pfsub mm1,mm4 ; mm1=data3=(30 31) + + movq mm0,mm7 ; transpose coefficients + punpckldq mm7,mm1 ; mm7=(20 30) + punpckhdq mm0,mm1 ; mm0=(21 31) + movq mm3,mm5 ; transpose coefficients + punpckldq mm5,mm6 ; mm5=(40 50) + punpckhdq mm3,mm6 ; mm3=(41 51) + + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm7 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm0 + movq MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5 + movq MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm3 .nextcolumn: - add esi, byte 2*SIZEOF_JCOEF ; coef_block - add edx, byte 2*SIZEOF_FLOAT_MULT_TYPE ; quantptr - add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr - dec ecx ; ctr - jnz near .columnloop - - ; -- Prefetch the next coefficient block - - prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] - prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] - prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] - prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] - - ; ---- Pass 2: process rows from work array, store into output array. 
- - mov eax, [original_ebp] - lea esi, [workspace] ; FAST_FLOAT * wsptr - mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) - mov eax, JDIMENSION [output_col(eax)] - mov ecx, DCTSIZE/2 ; ctr - alignx 16,7 + add esi, byte 2*SIZEOF_JCOEF ; coef_block + add edx, byte 2*SIZEOF_FLOAT_MULT_TYPE ; quantptr + add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr + dec ecx ; ctr + jnz near .columnloop + + ; -- Prefetch the next coefficient block + + prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] + prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] + prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] + prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. + + mov eax, [original_ebp] + lea esi, [workspace] ; FAST_FLOAT * wsptr + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + mov ecx, DCTSIZE/2 ; ctr + alignx 16,7 .rowloop: - ; -- Even part - - movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] - movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)] - movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)] - movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)] - - movq mm4,mm0 - movq mm5,mm1 - pfsub mm0,mm2 ; mm0=tmp11 - pfsub mm1,mm3 - pfadd mm4,mm2 ; mm4=tmp10 - pfadd mm5,mm3 ; mm5=tmp13 - - pfmul mm1,[GOTOFF(ebx,PD_1_414)] - pfsub mm1,mm5 ; mm1=tmp12 - - movq mm6,mm4 - movq mm7,mm0 - pfsub mm4,mm5 ; mm4=tmp3 - pfsub mm0,mm1 ; mm0=tmp2 - pfadd mm6,mm5 ; mm6=tmp0 - pfadd mm7,mm1 ; mm7=tmp1 - - movq MMWORD [wk(1)], mm4 ; tmp3 - movq MMWORD [wk(0)], mm0 ; tmp2 - - ; -- Odd part - - movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] - movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)] - movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)] - movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)] - - movq mm4,mm2 - movq mm0,mm5 - pfadd mm2,mm1 ; mm2=z11 - pfadd mm5,mm3 ; mm5=z13 - pfsub mm4,mm1 ; mm4=z12 - pfsub mm0,mm3 ; mm0=z10 - - movq mm1,mm2 - pfsub 
mm2,mm5 - pfadd mm1,mm5 ; mm1=tmp7 - - pfmul mm2,[GOTOFF(ebx,PD_1_414)] ; mm2=tmp11 - - movq mm3,mm0 - pfadd mm0,mm4 - pfmul mm0,[GOTOFF(ebx,PD_1_847)] ; mm0=z5 - pfmul mm3,[GOTOFF(ebx,PD_2_613)] ; mm3=(z10 * 2.613125930) - pfmul mm4,[GOTOFF(ebx,PD_1_082)] ; mm4=(z12 * 1.082392200) - pfsubr mm3,mm0 ; mm3=tmp12 - pfsub mm4,mm0 ; mm4=tmp10 - - ; -- Final output stage - - pfsub mm3,mm1 ; mm3=tmp6 - movq mm5,mm6 - movq mm0,mm7 - pfadd mm6,mm1 ; mm6=data0=(00 10) - pfadd mm7,mm3 ; mm7=data1=(01 11) - pfsub mm5,mm1 ; mm5=data7=(07 17) - pfsub mm0,mm3 ; mm0=data6=(06 16) - pfsub mm2,mm3 ; mm2=tmp5 - - movq mm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; mm1=[PD_RNDINT_MAGIC] - pcmpeqd mm3,mm3 - psrld mm3,WORD_BIT ; mm3={0xFFFF 0x0000 0xFFFF 0x0000} - - pfadd mm6,mm1 ; mm6=roundint(data0/8)=(00 ** 10 **) - pfadd mm7,mm1 ; mm7=roundint(data1/8)=(01 ** 11 **) - pfadd mm0,mm1 ; mm0=roundint(data6/8)=(06 ** 16 **) - pfadd mm5,mm1 ; mm5=roundint(data7/8)=(07 ** 17 **) - - pand mm6,mm3 ; mm6=(00 -- 10 --) - pslld mm7,WORD_BIT ; mm7=(-- 01 -- 11) - pand mm0,mm3 ; mm0=(06 -- 16 --) - pslld mm5,WORD_BIT ; mm5=(-- 07 -- 17) - por mm6,mm7 ; mm6=(00 01 10 11) - por mm0,mm5 ; mm0=(06 07 16 17) - - movq mm1, MMWORD [wk(0)] ; mm1=tmp2 - movq mm3, MMWORD [wk(1)] ; mm3=tmp3 - - pfadd mm4,mm2 ; mm4=tmp4 - movq mm7,mm1 - movq mm5,mm3 - pfadd mm1,mm2 ; mm1=data2=(02 12) - pfadd mm3,mm4 ; mm3=data4=(04 14) - pfsub mm7,mm2 ; mm7=data5=(05 15) - pfsub mm5,mm4 ; mm5=data3=(03 13) - - movq mm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; mm2=[PD_RNDINT_MAGIC] - pcmpeqd mm4,mm4 - psrld mm4,WORD_BIT ; mm4={0xFFFF 0x0000 0xFFFF 0x0000} - - pfadd mm3,mm2 ; mm3=roundint(data4/8)=(04 ** 14 **) - pfadd mm7,mm2 ; mm7=roundint(data5/8)=(05 ** 15 **) - pfadd mm1,mm2 ; mm1=roundint(data2/8)=(02 ** 12 **) - pfadd mm5,mm2 ; mm5=roundint(data3/8)=(03 ** 13 **) - - pand mm3,mm4 ; mm3=(04 -- 14 --) - pslld mm7,WORD_BIT ; mm7=(-- 05 -- 15) - pand mm1,mm4 ; mm1=(02 -- 12 --) - pslld mm5,WORD_BIT ; mm5=(-- 03 -- 13) - por mm3,mm7 ; mm3=(04 
05 14 15) - por mm1,mm5 ; mm1=(02 03 12 13) - - movq mm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm2=[PB_CENTERJSAMP] - - packsswb mm6,mm3 ; mm6=(00 01 10 11 04 05 14 15) - packsswb mm1,mm0 ; mm1=(02 03 12 13 06 07 16 17) - paddb mm6,mm2 - paddb mm1,mm2 - - movq mm4,mm6 ; transpose coefficients(phase 2) - punpcklwd mm6,mm1 ; mm6=(00 01 02 03 10 11 12 13) - punpckhwd mm4,mm1 ; mm4=(04 05 06 07 14 15 16 17) - - movq mm7,mm6 ; transpose coefficients(phase 3) - punpckldq mm6,mm4 ; mm6=(00 01 02 03 04 05 06 07) - punpckhdq mm7,mm4 ; mm7=(10 11 12 13 14 15 16 17) - - pushpic ebx ; save GOT address - - mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] - mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] - movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6 - movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7 - - poppic ebx ; restore GOT address - - add esi, byte 2*SIZEOF_FAST_FLOAT ; wsptr - add edi, byte 2*SIZEOF_JSAMPROW - dec ecx ; ctr - jnz near .rowloop - - femms ; empty MMX/3DNow! state - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + ; -- Even part + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)] + movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)] + + movq mm4,mm0 + movq mm5,mm1 + pfsub mm0,mm2 ; mm0=tmp11 + pfsub mm1,mm3 + pfadd mm4,mm2 ; mm4=tmp10 + pfadd mm5,mm3 ; mm5=tmp13 + + pfmul mm1,[GOTOFF(ebx,PD_1_414)] + pfsub mm1,mm5 ; mm1=tmp12 + + movq mm6,mm4 + movq mm7,mm0 + pfsub mm4,mm5 ; mm4=tmp3 + pfsub mm0,mm1 ; mm0=tmp2 + pfadd mm6,mm5 ; mm6=tmp0 + pfadd mm7,mm1 ; mm7=tmp1 + + movq MMWORD [wk(1)], mm4 ; tmp3 + movq MMWORD [wk(0)], mm0 ; tmp2 + + ; -- Odd part + + movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] + movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)] + movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)] + movq mm1, MMWORD 
[MMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)] + + movq mm4,mm2 + movq mm0,mm5 + pfadd mm2,mm1 ; mm2=z11 + pfadd mm5,mm3 ; mm5=z13 + pfsub mm4,mm1 ; mm4=z12 + pfsub mm0,mm3 ; mm0=z10 + + movq mm1,mm2 + pfsub mm2,mm5 + pfadd mm1,mm5 ; mm1=tmp7 + + pfmul mm2,[GOTOFF(ebx,PD_1_414)] ; mm2=tmp11 + + movq mm3,mm0 + pfadd mm0,mm4 + pfmul mm0,[GOTOFF(ebx,PD_1_847)] ; mm0=z5 + pfmul mm3,[GOTOFF(ebx,PD_2_613)] ; mm3=(z10 * 2.613125930) + pfmul mm4,[GOTOFF(ebx,PD_1_082)] ; mm4=(z12 * 1.082392200) + pfsubr mm3,mm0 ; mm3=tmp12 + pfsub mm4,mm0 ; mm4=tmp10 + + ; -- Final output stage + + pfsub mm3,mm1 ; mm3=tmp6 + movq mm5,mm6 + movq mm0,mm7 + pfadd mm6,mm1 ; mm6=data0=(00 10) + pfadd mm7,mm3 ; mm7=data1=(01 11) + pfsub mm5,mm1 ; mm5=data7=(07 17) + pfsub mm0,mm3 ; mm0=data6=(06 16) + pfsub mm2,mm3 ; mm2=tmp5 + + movq mm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; mm1=[PD_RNDINT_MAGIC] + pcmpeqd mm3,mm3 + psrld mm3,WORD_BIT ; mm3={0xFFFF 0x0000 0xFFFF 0x0000} + + pfadd mm6,mm1 ; mm6=roundint(data0/8)=(00 ** 10 **) + pfadd mm7,mm1 ; mm7=roundint(data1/8)=(01 ** 11 **) + pfadd mm0,mm1 ; mm0=roundint(data6/8)=(06 ** 16 **) + pfadd mm5,mm1 ; mm5=roundint(data7/8)=(07 ** 17 **) + + pand mm6,mm3 ; mm6=(00 -- 10 --) + pslld mm7,WORD_BIT ; mm7=(-- 01 -- 11) + pand mm0,mm3 ; mm0=(06 -- 16 --) + pslld mm5,WORD_BIT ; mm5=(-- 07 -- 17) + por mm6,mm7 ; mm6=(00 01 10 11) + por mm0,mm5 ; mm0=(06 07 16 17) + + movq mm1, MMWORD [wk(0)] ; mm1=tmp2 + movq mm3, MMWORD [wk(1)] ; mm3=tmp3 + + pfadd mm4,mm2 ; mm4=tmp4 + movq mm7,mm1 + movq mm5,mm3 + pfadd mm1,mm2 ; mm1=data2=(02 12) + pfadd mm3,mm4 ; mm3=data4=(04 14) + pfsub mm7,mm2 ; mm7=data5=(05 15) + pfsub mm5,mm4 ; mm5=data3=(03 13) + + movq mm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; mm2=[PD_RNDINT_MAGIC] + pcmpeqd mm4,mm4 + psrld mm4,WORD_BIT ; mm4={0xFFFF 0x0000 0xFFFF 0x0000} + + pfadd mm3,mm2 ; mm3=roundint(data4/8)=(04 ** 14 **) + pfadd mm7,mm2 ; mm7=roundint(data5/8)=(05 ** 15 **) + pfadd mm1,mm2 ; mm1=roundint(data2/8)=(02 ** 12 **) + pfadd mm5,mm2 ; 
mm5=roundint(data3/8)=(03 ** 13 **) + + pand mm3,mm4 ; mm3=(04 -- 14 --) + pslld mm7,WORD_BIT ; mm7=(-- 05 -- 15) + pand mm1,mm4 ; mm1=(02 -- 12 --) + pslld mm5,WORD_BIT ; mm5=(-- 03 -- 13) + por mm3,mm7 ; mm3=(04 05 14 15) + por mm1,mm5 ; mm1=(02 03 12 13) + + movq mm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm2=[PB_CENTERJSAMP] + + packsswb mm6,mm3 ; mm6=(00 01 10 11 04 05 14 15) + packsswb mm1,mm0 ; mm1=(02 03 12 13 06 07 16 17) + paddb mm6,mm2 + paddb mm1,mm2 + + movq mm4,mm6 ; transpose coefficients(phase 2) + punpcklwd mm6,mm1 ; mm6=(00 01 02 03 10 11 12 13) + punpckhwd mm4,mm1 ; mm4=(04 05 06 07 14 15 16 17) + + movq mm7,mm6 ; transpose coefficients(phase 3) + punpckldq mm6,mm4 ; mm6=(00 01 02 03 04 05 06 07) + punpckhdq mm7,mm4 ; mm7=(10 11 12 13 14 15 16 17) + + pushpic ebx ; save GOT address + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6 + movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7 + + poppic ebx ; restore GOT address + + add esi, byte 2*SIZEOF_FAST_FLOAT ; wsptr + add edi, byte 2*SIZEOF_JSAMPROW + dec ecx ; ctr + jnz near .rowloop + + femms ; empty MMX/3DNow! state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/jimmxfst.asm b/simd/jimmxfst.asm index 3b055727d..a9eaa02e6 100644 --- a/simd/jimmxfst.asm +++ b/simd/jimmxfst.asm @@ -26,31 +26,31 @@ ; -------------------------------------------------------------------------- -%define CONST_BITS 8 ; 14 is also OK. -%define PASS1_BITS 2 +%define CONST_BITS 8 ; 14 is also OK. +%define PASS1_BITS 2 %if IFAST_SCALE_BITS != PASS1_BITS %error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'." 
%endif %if CONST_BITS == 8 -F_1_082 equ 277 ; FIX(1.082392200) -F_1_414 equ 362 ; FIX(1.414213562) -F_1_847 equ 473 ; FIX(1.847759065) -F_2_613 equ 669 ; FIX(2.613125930) -F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1) +F_1_082 equ 277 ; FIX(1.082392200) +F_1_414 equ 362 ; FIX(1.414213562) +F_1_847 equ 473 ; FIX(1.847759065) +F_2_613 equ 669 ; FIX(2.613125930) +F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1) %else ; NASM cannot do compile-time arithmetic on floating-point constants. -%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) -F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200) -F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562) -F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) -F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930) -F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) +%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) +F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200) +F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562) +F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) +F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930) +F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) %endif ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST ; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) ; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) @@ -58,22 +58,22 @@ F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) %define PRE_MULTIPLY_SCALE_BITS 2 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) - alignz 16 - global EXTN(jconst_idct_ifast_mmx) + alignz 16 + global EXTN(jconst_idct_ifast_mmx) EXTN(jconst_idct_ifast_mmx): -PW_F1414 times 4 dw F_1_414 << CONST_SHIFT -PW_F1847 times 4 dw F_1_847 << CONST_SHIFT -PW_MF1613 times 4 dw -F_1_613 << CONST_SHIFT -PW_F1082 times 4 dw F_1_082 << 
CONST_SHIFT -PB_CENTERJSAMP times 8 db CENTERJSAMPLE +PW_F1414 times 4 dw F_1_414 << CONST_SHIFT +PW_F1847 times 4 dw F_1_847 << CONST_SHIFT +PW_MF1613 times 4 dw -F_1_613 << CONST_SHIFT +PW_F1082 times 4 dw F_1_082 << CONST_SHIFT +PB_CENTERJSAMP times 8 db CENTERJSAMPLE - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 ; ; Perform dequantization and inverse DCT on one block of coefficients. ; @@ -82,419 +82,419 @@ PB_CENTERJSAMP times 8 db CENTERJSAMPLE ; JSAMPARRAY output_buf, JDIMENSION output_col) ; -%define dct_table(b) (b)+8 ; jpeg_component_info * compptr -%define coef_block(b) (b)+12 ; JCOEFPTR coef_block -%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf -%define output_col(b) (b)+20 ; JDIMENSION output_col +%define dct_table(b) (b)+8 ; jpeg_component_info * compptr +%define coef_block(b) (b)+12 ; JCOEFPTR coef_block +%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf +%define output_col(b) (b)+20 ; JDIMENSION output_col -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] -%define WK_NUM 2 -%define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF - ; JCOEF workspace[DCTSIZE2] +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 2 +%define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF + ; JCOEF workspace[DCTSIZE2] - align 16 - global EXTN(jsimd_idct_ifast_mmx) + align 16 + global EXTN(jsimd_idct_ifast_mmx) EXTN(jsimd_idct_ifast_mmx): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [workspace] - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process columns from input, store into work array. 
- -; mov eax, [original_ebp] - mov edx, POINTER [dct_table(eax)] ; quantptr - mov esi, JCOEFPTR [coef_block(eax)] ; inptr - lea edi, [workspace] ; JCOEF * wsptr - mov ecx, DCTSIZE/4 ; ctr - alignx 16,7 + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [workspace] + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input, store into work array. + +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + lea edi, [workspace] ; JCOEF * wsptr + mov ecx, DCTSIZE/4 ; ctr + alignx 16,7 .columnloop: %ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX - mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] - jnz short .columnDCT - - movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] - por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] - por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] - por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] - por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] - por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] - por mm1,mm0 - packsswb mm1,mm1 - movd eax,mm1 - test eax,eax - jnz short .columnDCT - - ; -- AC terms all zero - - movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] - pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)] - - movq mm2,mm0 ; mm0=in0=(00 01 02 03) - punpcklwd mm0,mm0 ; mm0=(00 00 01 01) - punpckhwd mm2,mm2 ; mm2=(02 02 03 03) - - movq mm1,mm0 - punpckldq mm0,mm0 ; mm0=(00 00 00 00) - punpckhdq mm1,mm1 ; mm1=(01 01 01 01) - movq mm3,mm2 - punpckldq mm2,mm2 ; mm2=(02 02 02 02) - punpckhdq mm3,mm3 ; mm3=(03 03 03 03) - - movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 - movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0 - movq MMWORD 
[MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1 - movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1 - movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2 - movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2 - movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3 - movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3 - jmp near .nextcolumn - alignx 16,7 + mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz short .columnDCT + + movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por mm1,mm0 + packsswb mm1,mm1 + movd eax,mm1 + test eax,eax + jnz short .columnDCT + + ; -- AC terms all zero + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)] + + movq mm2,mm0 ; mm0=in0=(00 01 02 03) + punpcklwd mm0,mm0 ; mm0=(00 00 01 01) + punpckhwd mm2,mm2 ; mm2=(02 02 03 03) + + movq mm1,mm0 + punpckldq mm0,mm0 ; mm0=(00 00 00 00) + punpckhdq mm1,mm1 ; mm1=(01 01 01 01) + movq mm3,mm2 + punpckldq mm2,mm2 ; mm2=(02 02 02 02) + punpckhdq mm3,mm3 ; mm3=(03 03 03 03) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3 + movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3 + jmp near .nextcolumn + alignx 16,7 %endif .columnDCT: - ; -- Even part - - movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] - movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] - pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)] - 
pmullw mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)] - movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] - movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] - pmullw mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)] - pmullw mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)] - - movq mm4,mm0 - movq mm5,mm1 - psubw mm0,mm2 ; mm0=tmp11 - psubw mm1,mm3 - paddw mm4,mm2 ; mm4=tmp10 - paddw mm5,mm3 ; mm5=tmp13 - - psllw mm1,PRE_MULTIPLY_SCALE_BITS - pmulhw mm1,[GOTOFF(ebx,PW_F1414)] - psubw mm1,mm5 ; mm1=tmp12 - - movq mm6,mm4 - movq mm7,mm0 - psubw mm4,mm5 ; mm4=tmp3 - psubw mm0,mm1 ; mm0=tmp2 - paddw mm6,mm5 ; mm6=tmp0 - paddw mm7,mm1 ; mm7=tmp1 - - movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3 - movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2 - - ; -- Odd part - - movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] - pmullw mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)] - pmullw mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)] - movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] - movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] - pmullw mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)] - pmullw mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)] - - movq mm4,mm2 - movq mm0,mm5 - psubw mm2,mm1 ; mm2=z12 - psubw mm5,mm3 ; mm5=z10 - paddw mm4,mm1 ; mm4=z11 - paddw mm0,mm3 ; mm0=z13 - - movq mm1,mm5 ; mm1=z10(unscaled) - psllw mm2,PRE_MULTIPLY_SCALE_BITS - psllw mm5,PRE_MULTIPLY_SCALE_BITS - - movq mm3,mm4 - psubw mm4,mm0 - paddw mm3,mm0 ; mm3=tmp7 - - psllw mm4,PRE_MULTIPLY_SCALE_BITS - pmulhw mm4,[GOTOFF(ebx,PW_F1414)] ; mm4=tmp11 - - ; To avoid overflow... 
- ; - ; (Original) - ; tmp12 = -2.613125930 * z10 + z5; - ; - ; (This implementation) - ; tmp12 = (-1.613125930 - 1) * z10 + z5; - ; = -1.613125930 * z10 - z10 + z5; - - movq mm0,mm5 - paddw mm5,mm2 - pmulhw mm5,[GOTOFF(ebx,PW_F1847)] ; mm5=z5 - pmulhw mm0,[GOTOFF(ebx,PW_MF1613)] - pmulhw mm2,[GOTOFF(ebx,PW_F1082)] - psubw mm0,mm1 - psubw mm2,mm5 ; mm2=tmp10 - paddw mm0,mm5 ; mm0=tmp12 - - ; -- Final output stage - - psubw mm0,mm3 ; mm0=tmp6 - movq mm1,mm6 - movq mm5,mm7 - paddw mm6,mm3 ; mm6=data0=(00 01 02 03) - paddw mm7,mm0 ; mm7=data1=(10 11 12 13) - psubw mm1,mm3 ; mm1=data7=(70 71 72 73) - psubw mm5,mm0 ; mm5=data6=(60 61 62 63) - psubw mm4,mm0 ; mm4=tmp5 - - movq mm3,mm6 ; transpose coefficients(phase 1) - punpcklwd mm6,mm7 ; mm6=(00 10 01 11) - punpckhwd mm3,mm7 ; mm3=(02 12 03 13) - movq mm0,mm5 ; transpose coefficients(phase 1) - punpcklwd mm5,mm1 ; mm5=(60 70 61 71) - punpckhwd mm0,mm1 ; mm0=(62 72 63 73) - - movq mm7, MMWORD [wk(0)] ; mm7=tmp2 - movq mm1, MMWORD [wk(1)] ; mm1=tmp3 - - movq MMWORD [wk(0)], mm5 ; wk(0)=(60 70 61 71) - movq MMWORD [wk(1)], mm0 ; wk(1)=(62 72 63 73) - - paddw mm2,mm4 ; mm2=tmp4 - movq mm5,mm7 - movq mm0,mm1 - paddw mm7,mm4 ; mm7=data2=(20 21 22 23) - paddw mm1,mm2 ; mm1=data4=(40 41 42 43) - psubw mm5,mm4 ; mm5=data5=(50 51 52 53) - psubw mm0,mm2 ; mm0=data3=(30 31 32 33) - - movq mm4,mm7 ; transpose coefficients(phase 1) - punpcklwd mm7,mm0 ; mm7=(20 30 21 31) - punpckhwd mm4,mm0 ; mm4=(22 32 23 33) - movq mm2,mm1 ; transpose coefficients(phase 1) - punpcklwd mm1,mm5 ; mm1=(40 50 41 51) - punpckhwd mm2,mm5 ; mm2=(42 52 43 53) - - movq mm0,mm6 ; transpose coefficients(phase 2) - punpckldq mm6,mm7 ; mm6=(00 10 20 30) - punpckhdq mm0,mm7 ; mm0=(01 11 21 31) - movq mm5,mm3 ; transpose coefficients(phase 2) - punpckldq mm3,mm4 ; mm3=(02 12 22 32) - punpckhdq mm5,mm4 ; mm5=(03 13 23 33) - - movq mm7, MMWORD [wk(0)] ; mm7=(60 70 61 71) - movq mm4, MMWORD [wk(1)] ; mm4=(62 72 63 73) - - movq MMWORD 
[MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6 - movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0 - movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3 - movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5 - - movq mm6,mm1 ; transpose coefficients(phase 2) - punpckldq mm1,mm7 ; mm1=(40 50 60 70) - punpckhdq mm6,mm7 ; mm6=(41 51 61 71) - movq mm0,mm2 ; transpose coefficients(phase 2) - punpckldq mm2,mm4 ; mm2=(42 52 62 72) - punpckhdq mm0,mm4 ; mm0=(43 53 63 73) - - movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1 - movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6 - movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2 - movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0 + ; -- Even part + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)] + movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + pmullw mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)] + + movq mm4,mm0 + movq mm5,mm1 + psubw mm0,mm2 ; mm0=tmp11 + psubw mm1,mm3 + paddw mm4,mm2 ; mm4=tmp10 + paddw mm5,mm3 ; mm5=tmp13 + + psllw mm1,PRE_MULTIPLY_SCALE_BITS + pmulhw mm1,[GOTOFF(ebx,PW_F1414)] + psubw mm1,mm5 ; mm1=tmp12 + + movq mm6,mm4 + movq mm7,mm0 + psubw mm4,mm5 ; mm4=tmp3 + psubw mm0,mm1 ; mm0=tmp2 + paddw mm6,mm5 ; mm6=tmp0 + paddw mm7,mm1 ; mm7=tmp1 + + movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3 + movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2 + + ; -- Odd part + + movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)] + movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)] 
+ pmullw mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)] + + movq mm4,mm2 + movq mm0,mm5 + psubw mm2,mm1 ; mm2=z12 + psubw mm5,mm3 ; mm5=z10 + paddw mm4,mm1 ; mm4=z11 + paddw mm0,mm3 ; mm0=z13 + + movq mm1,mm5 ; mm1=z10(unscaled) + psllw mm2,PRE_MULTIPLY_SCALE_BITS + psllw mm5,PRE_MULTIPLY_SCALE_BITS + + movq mm3,mm4 + psubw mm4,mm0 + paddw mm3,mm0 ; mm3=tmp7 + + psllw mm4,PRE_MULTIPLY_SCALE_BITS + pmulhw mm4,[GOTOFF(ebx,PW_F1414)] ; mm4=tmp11 + + ; To avoid overflow... + ; + ; (Original) + ; tmp12 = -2.613125930 * z10 + z5; + ; + ; (This implementation) + ; tmp12 = (-1.613125930 - 1) * z10 + z5; + ; = -1.613125930 * z10 - z10 + z5; + + movq mm0,mm5 + paddw mm5,mm2 + pmulhw mm5,[GOTOFF(ebx,PW_F1847)] ; mm5=z5 + pmulhw mm0,[GOTOFF(ebx,PW_MF1613)] + pmulhw mm2,[GOTOFF(ebx,PW_F1082)] + psubw mm0,mm1 + psubw mm2,mm5 ; mm2=tmp10 + paddw mm0,mm5 ; mm0=tmp12 + + ; -- Final output stage + + psubw mm0,mm3 ; mm0=tmp6 + movq mm1,mm6 + movq mm5,mm7 + paddw mm6,mm3 ; mm6=data0=(00 01 02 03) + paddw mm7,mm0 ; mm7=data1=(10 11 12 13) + psubw mm1,mm3 ; mm1=data7=(70 71 72 73) + psubw mm5,mm0 ; mm5=data6=(60 61 62 63) + psubw mm4,mm0 ; mm4=tmp5 + + movq mm3,mm6 ; transpose coefficients(phase 1) + punpcklwd mm6,mm7 ; mm6=(00 10 01 11) + punpckhwd mm3,mm7 ; mm3=(02 12 03 13) + movq mm0,mm5 ; transpose coefficients(phase 1) + punpcklwd mm5,mm1 ; mm5=(60 70 61 71) + punpckhwd mm0,mm1 ; mm0=(62 72 63 73) + + movq mm7, MMWORD [wk(0)] ; mm7=tmp2 + movq mm1, MMWORD [wk(1)] ; mm1=tmp3 + + movq MMWORD [wk(0)], mm5 ; wk(0)=(60 70 61 71) + movq MMWORD [wk(1)], mm0 ; wk(1)=(62 72 63 73) + + paddw mm2,mm4 ; mm2=tmp4 + movq mm5,mm7 + movq mm0,mm1 + paddw mm7,mm4 ; mm7=data2=(20 21 22 23) + paddw mm1,mm2 ; mm1=data4=(40 41 42 43) + psubw mm5,mm4 ; mm5=data5=(50 51 52 53) + psubw mm0,mm2 ; mm0=data3=(30 31 32 33) + + movq mm4,mm7 ; transpose coefficients(phase 1) + punpcklwd mm7,mm0 ; mm7=(20 30 21 31) + punpckhwd mm4,mm0 ; mm4=(22 32 23 33) + movq mm2,mm1 ; transpose coefficients(phase 1) + 
punpcklwd mm1,mm5 ; mm1=(40 50 41 51) + punpckhwd mm2,mm5 ; mm2=(42 52 43 53) + + movq mm0,mm6 ; transpose coefficients(phase 2) + punpckldq mm6,mm7 ; mm6=(00 10 20 30) + punpckhdq mm0,mm7 ; mm0=(01 11 21 31) + movq mm5,mm3 ; transpose coefficients(phase 2) + punpckldq mm3,mm4 ; mm3=(02 12 22 32) + punpckhdq mm5,mm4 ; mm5=(03 13 23 33) + + movq mm7, MMWORD [wk(0)] ; mm7=(60 70 61 71) + movq mm4, MMWORD [wk(1)] ; mm4=(62 72 63 73) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3 + movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5 + + movq mm6,mm1 ; transpose coefficients(phase 2) + punpckldq mm1,mm7 ; mm1=(40 50 60 70) + punpckhdq mm6,mm7 ; mm6=(41 51 61 71) + movq mm0,mm2 ; transpose coefficients(phase 2) + punpckldq mm2,mm4 ; mm2=(42 52 62 72) + punpckhdq mm0,mm4 ; mm0=(43 53 63 73) + + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6 + movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0 .nextcolumn: - add esi, byte 4*SIZEOF_JCOEF ; coef_block - add edx, byte 4*SIZEOF_IFAST_MULT_TYPE ; quantptr - add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr - dec ecx ; ctr - jnz near .columnloop - - ; ---- Pass 2: process rows from work array, store into output array. - - mov eax, [original_ebp] - lea esi, [workspace] ; JCOEF * wsptr - mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) - mov eax, JDIMENSION [output_col(eax)] - mov ecx, DCTSIZE/4 ; ctr - alignx 16,7 + add esi, byte 4*SIZEOF_JCOEF ; coef_block + add edx, byte 4*SIZEOF_IFAST_MULT_TYPE ; quantptr + add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr + dec ecx ; ctr + jnz near .columnloop + + ; ---- Pass 2: process rows from work array, store into output array. 
+ + mov eax, [original_ebp] + lea esi, [workspace] ; JCOEF * wsptr + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + mov ecx, DCTSIZE/4 ; ctr + alignx 16,7 .rowloop: - ; -- Even part - - movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] - movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] - movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] - movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] - - movq mm4,mm0 - movq mm5,mm1 - psubw mm0,mm2 ; mm0=tmp11 - psubw mm1,mm3 - paddw mm4,mm2 ; mm4=tmp10 - paddw mm5,mm3 ; mm5=tmp13 - - psllw mm1,PRE_MULTIPLY_SCALE_BITS - pmulhw mm1,[GOTOFF(ebx,PW_F1414)] - psubw mm1,mm5 ; mm1=tmp12 - - movq mm6,mm4 - movq mm7,mm0 - psubw mm4,mm5 ; mm4=tmp3 - psubw mm0,mm1 ; mm0=tmp2 - paddw mm6,mm5 ; mm6=tmp0 - paddw mm7,mm1 ; mm7=tmp1 - - movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3 - movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2 - - ; -- Odd part - - movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] - movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] - movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] - - movq mm4,mm2 - movq mm0,mm5 - psubw mm2,mm1 ; mm2=z12 - psubw mm5,mm3 ; mm5=z10 - paddw mm4,mm1 ; mm4=z11 - paddw mm0,mm3 ; mm0=z13 - - movq mm1,mm5 ; mm1=z10(unscaled) - psllw mm2,PRE_MULTIPLY_SCALE_BITS - psllw mm5,PRE_MULTIPLY_SCALE_BITS - - movq mm3,mm4 - psubw mm4,mm0 - paddw mm3,mm0 ; mm3=tmp7 - - psllw mm4,PRE_MULTIPLY_SCALE_BITS - pmulhw mm4,[GOTOFF(ebx,PW_F1414)] ; mm4=tmp11 - - ; To avoid overflow... 
- ; - ; (Original) - ; tmp12 = -2.613125930 * z10 + z5; - ; - ; (This implementation) - ; tmp12 = (-1.613125930 - 1) * z10 + z5; - ; = -1.613125930 * z10 - z10 + z5; - - movq mm0,mm5 - paddw mm5,mm2 - pmulhw mm5,[GOTOFF(ebx,PW_F1847)] ; mm5=z5 - pmulhw mm0,[GOTOFF(ebx,PW_MF1613)] - pmulhw mm2,[GOTOFF(ebx,PW_F1082)] - psubw mm0,mm1 - psubw mm2,mm5 ; mm2=tmp10 - paddw mm0,mm5 ; mm0=tmp12 - - ; -- Final output stage - - psubw mm0,mm3 ; mm0=tmp6 - movq mm1,mm6 - movq mm5,mm7 - paddw mm6,mm3 ; mm6=data0=(00 10 20 30) - paddw mm7,mm0 ; mm7=data1=(01 11 21 31) - psraw mm6,(PASS1_BITS+3) ; descale - psraw mm7,(PASS1_BITS+3) ; descale - psubw mm1,mm3 ; mm1=data7=(07 17 27 37) - psubw mm5,mm0 ; mm5=data6=(06 16 26 36) - psraw mm1,(PASS1_BITS+3) ; descale - psraw mm5,(PASS1_BITS+3) ; descale - psubw mm4,mm0 ; mm4=tmp5 - - packsswb mm6,mm5 ; mm6=(00 10 20 30 06 16 26 36) - packsswb mm7,mm1 ; mm7=(01 11 21 31 07 17 27 37) - - movq mm3, MMWORD [wk(0)] ; mm3=tmp2 - movq mm0, MMWORD [wk(1)] ; mm0=tmp3 - - paddw mm2,mm4 ; mm2=tmp4 - movq mm5,mm3 - movq mm1,mm0 - paddw mm3,mm4 ; mm3=data2=(02 12 22 32) - paddw mm0,mm2 ; mm0=data4=(04 14 24 34) - psraw mm3,(PASS1_BITS+3) ; descale - psraw mm0,(PASS1_BITS+3) ; descale - psubw mm5,mm4 ; mm5=data5=(05 15 25 35) - psubw mm1,mm2 ; mm1=data3=(03 13 23 33) - psraw mm5,(PASS1_BITS+3) ; descale - psraw mm1,(PASS1_BITS+3) ; descale - - movq mm4,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm4=[PB_CENTERJSAMP] - - packsswb mm3,mm0 ; mm3=(02 12 22 32 04 14 24 34) - packsswb mm1,mm5 ; mm1=(03 13 23 33 05 15 25 35) - - paddb mm6,mm4 - paddb mm7,mm4 - paddb mm3,mm4 - paddb mm1,mm4 - - movq mm2,mm6 ; transpose coefficients(phase 1) - punpcklbw mm6,mm7 ; mm6=(00 01 10 11 20 21 30 31) - punpckhbw mm2,mm7 ; mm2=(06 07 16 17 26 27 36 37) - movq mm0,mm3 ; transpose coefficients(phase 1) - punpcklbw mm3,mm1 ; mm3=(02 03 12 13 22 23 32 33) - punpckhbw mm0,mm1 ; mm0=(04 05 14 15 24 25 34 35) - - movq mm5,mm6 ; transpose coefficients(phase 2) - punpcklwd mm6,mm3 ; mm6=(00 
01 02 03 10 11 12 13) - punpckhwd mm5,mm3 ; mm5=(20 21 22 23 30 31 32 33) - movq mm4,mm0 ; transpose coefficients(phase 2) - punpcklwd mm0,mm2 ; mm0=(04 05 06 07 14 15 16 17) - punpckhwd mm4,mm2 ; mm4=(24 25 26 27 34 35 36 37) - - movq mm7,mm6 ; transpose coefficients(phase 3) - punpckldq mm6,mm0 ; mm6=(00 01 02 03 04 05 06 07) - punpckhdq mm7,mm0 ; mm7=(10 11 12 13 14 15 16 17) - movq mm1,mm5 ; transpose coefficients(phase 3) - punpckldq mm5,mm4 ; mm5=(20 21 22 23 24 25 26 27) - punpckhdq mm1,mm4 ; mm1=(30 31 32 33 34 35 36 37) - - pushpic ebx ; save GOT address - - mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] - mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] - movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6 - movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7 - mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] - mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] - movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5 - movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1 - - poppic ebx ; restore GOT address - - add esi, byte 4*SIZEOF_JCOEF ; wsptr - add edi, byte 4*SIZEOF_JSAMPROW - dec ecx ; ctr - jnz near .rowloop - - emms ; empty MMX state - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + ; -- Even part + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + + movq mm4,mm0 + movq mm5,mm1 + psubw mm0,mm2 ; mm0=tmp11 + psubw mm1,mm3 + paddw mm4,mm2 ; mm4=tmp10 + paddw mm5,mm3 ; mm5=tmp13 + + psllw mm1,PRE_MULTIPLY_SCALE_BITS + pmulhw mm1,[GOTOFF(ebx,PW_F1414)] + psubw mm1,mm5 ; mm1=tmp12 + + movq mm6,mm4 + movq mm7,mm0 + psubw mm4,mm5 ; mm4=tmp3 + psubw mm0,mm1 ; mm0=tmp2 + paddw mm6,mm5 ; mm6=tmp0 + paddw mm7,mm1 ; mm7=tmp1 + + movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3 + movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2 + + ; -- Odd part + + movq mm2, 
MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + + movq mm4,mm2 + movq mm0,mm5 + psubw mm2,mm1 ; mm2=z12 + psubw mm5,mm3 ; mm5=z10 + paddw mm4,mm1 ; mm4=z11 + paddw mm0,mm3 ; mm0=z13 + + movq mm1,mm5 ; mm1=z10(unscaled) + psllw mm2,PRE_MULTIPLY_SCALE_BITS + psllw mm5,PRE_MULTIPLY_SCALE_BITS + + movq mm3,mm4 + psubw mm4,mm0 + paddw mm3,mm0 ; mm3=tmp7 + + psllw mm4,PRE_MULTIPLY_SCALE_BITS + pmulhw mm4,[GOTOFF(ebx,PW_F1414)] ; mm4=tmp11 + + ; To avoid overflow... + ; + ; (Original) + ; tmp12 = -2.613125930 * z10 + z5; + ; + ; (This implementation) + ; tmp12 = (-1.613125930 - 1) * z10 + z5; + ; = -1.613125930 * z10 - z10 + z5; + + movq mm0,mm5 + paddw mm5,mm2 + pmulhw mm5,[GOTOFF(ebx,PW_F1847)] ; mm5=z5 + pmulhw mm0,[GOTOFF(ebx,PW_MF1613)] + pmulhw mm2,[GOTOFF(ebx,PW_F1082)] + psubw mm0,mm1 + psubw mm2,mm5 ; mm2=tmp10 + paddw mm0,mm5 ; mm0=tmp12 + + ; -- Final output stage + + psubw mm0,mm3 ; mm0=tmp6 + movq mm1,mm6 + movq mm5,mm7 + paddw mm6,mm3 ; mm6=data0=(00 10 20 30) + paddw mm7,mm0 ; mm7=data1=(01 11 21 31) + psraw mm6,(PASS1_BITS+3) ; descale + psraw mm7,(PASS1_BITS+3) ; descale + psubw mm1,mm3 ; mm1=data7=(07 17 27 37) + psubw mm5,mm0 ; mm5=data6=(06 16 26 36) + psraw mm1,(PASS1_BITS+3) ; descale + psraw mm5,(PASS1_BITS+3) ; descale + psubw mm4,mm0 ; mm4=tmp5 + + packsswb mm6,mm5 ; mm6=(00 10 20 30 06 16 26 36) + packsswb mm7,mm1 ; mm7=(01 11 21 31 07 17 27 37) + + movq mm3, MMWORD [wk(0)] ; mm3=tmp2 + movq mm0, MMWORD [wk(1)] ; mm0=tmp3 + + paddw mm2,mm4 ; mm2=tmp4 + movq mm5,mm3 + movq mm1,mm0 + paddw mm3,mm4 ; mm3=data2=(02 12 22 32) + paddw mm0,mm2 ; mm0=data4=(04 14 24 34) + psraw mm3,(PASS1_BITS+3) ; descale + psraw mm0,(PASS1_BITS+3) ; descale + psubw mm5,mm4 ; mm5=data5=(05 15 25 35) + psubw mm1,mm2 ; mm1=data3=(03 13 23 33) + psraw mm5,(PASS1_BITS+3) ; descale + psraw mm1,(PASS1_BITS+3) ; descale + + movq 
mm4,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm4=[PB_CENTERJSAMP] + + packsswb mm3,mm0 ; mm3=(02 12 22 32 04 14 24 34) + packsswb mm1,mm5 ; mm1=(03 13 23 33 05 15 25 35) + + paddb mm6,mm4 + paddb mm7,mm4 + paddb mm3,mm4 + paddb mm1,mm4 + + movq mm2,mm6 ; transpose coefficients(phase 1) + punpcklbw mm6,mm7 ; mm6=(00 01 10 11 20 21 30 31) + punpckhbw mm2,mm7 ; mm2=(06 07 16 17 26 27 36 37) + movq mm0,mm3 ; transpose coefficients(phase 1) + punpcklbw mm3,mm1 ; mm3=(02 03 12 13 22 23 32 33) + punpckhbw mm0,mm1 ; mm0=(04 05 14 15 24 25 34 35) + + movq mm5,mm6 ; transpose coefficients(phase 2) + punpcklwd mm6,mm3 ; mm6=(00 01 02 03 10 11 12 13) + punpckhwd mm5,mm3 ; mm5=(20 21 22 23 30 31 32 33) + movq mm4,mm0 ; transpose coefficients(phase 2) + punpcklwd mm0,mm2 ; mm0=(04 05 06 07 14 15 16 17) + punpckhwd mm4,mm2 ; mm4=(24 25 26 27 34 35 36 37) + + movq mm7,mm6 ; transpose coefficients(phase 3) + punpckldq mm6,mm0 ; mm6=(00 01 02 03 04 05 06 07) + punpckhdq mm7,mm0 ; mm7=(10 11 12 13 14 15 16 17) + movq mm1,mm5 ; transpose coefficients(phase 3) + punpckldq mm5,mm4 ; mm5=(20 21 22 23 24 25 26 27) + punpckhdq mm1,mm4 ; mm1=(30 31 32 33 34 35 36 37) + + pushpic ebx ; save GOT address + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6 + movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7 + mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5 + movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1 + + poppic ebx ; restore GOT address + + add esi, byte 4*SIZEOF_JCOEF ; wsptr + add edi, byte 4*SIZEOF_JSAMPROW + dec ecx ; ctr + jnz near .rowloop + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do 
this. - align 16 + align 16 diff --git a/simd/jimmxint.asm b/simd/jimmxint.asm index 7b52fae34..75b9ea88b 100644 --- a/simd/jimmxint.asm +++ b/simd/jimmxint.asm @@ -26,67 +26,67 @@ ; -------------------------------------------------------------------------- -%define CONST_BITS 13 -%define PASS1_BITS 2 +%define CONST_BITS 13 +%define PASS1_BITS 2 -%define DESCALE_P1 (CONST_BITS-PASS1_BITS) -%define DESCALE_P2 (CONST_BITS+PASS1_BITS+3) +%define DESCALE_P1 (CONST_BITS-PASS1_BITS) +%define DESCALE_P2 (CONST_BITS+PASS1_BITS+3) %if CONST_BITS == 13 -F_0_298 equ 2446 ; FIX(0.298631336) -F_0_390 equ 3196 ; FIX(0.390180644) -F_0_541 equ 4433 ; FIX(0.541196100) -F_0_765 equ 6270 ; FIX(0.765366865) -F_0_899 equ 7373 ; FIX(0.899976223) -F_1_175 equ 9633 ; FIX(1.175875602) -F_1_501 equ 12299 ; FIX(1.501321110) -F_1_847 equ 15137 ; FIX(1.847759065) -F_1_961 equ 16069 ; FIX(1.961570560) -F_2_053 equ 16819 ; FIX(2.053119869) -F_2_562 equ 20995 ; FIX(2.562915447) -F_3_072 equ 25172 ; FIX(3.072711026) +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) %else ; NASM cannot do compile-time arithmetic on floating-point constants. 
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) -F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) -F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) -F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) -F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) -F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) -F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) -F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) -F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) -F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) -F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) -F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) -F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) +F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) %endif ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_idct_islow_mmx) + alignz 16 + global EXTN(jconst_idct_islow_mmx) EXTN(jconst_idct_islow_mmx): -PW_F130_F054 times 2 dw (F_0_541+F_0_765), F_0_541 -PW_F054_MF130 times 2 dw F_0_541, (F_0_541-F_1_847) 
-PW_MF078_F117 times 2 dw (F_1_175-F_1_961), F_1_175 -PW_F117_F078 times 2 dw F_1_175, (F_1_175-F_0_390) -PW_MF060_MF089 times 2 dw (F_0_298-F_0_899),-F_0_899 -PW_MF089_F060 times 2 dw -F_0_899, (F_1_501-F_0_899) -PW_MF050_MF256 times 2 dw (F_2_053-F_2_562),-F_2_562 -PW_MF256_F050 times 2 dw -F_2_562, (F_3_072-F_2_562) -PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1-1) -PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2-1) -PB_CENTERJSAMP times 8 db CENTERJSAMPLE +PW_F130_F054 times 2 dw (F_0_541+F_0_765), F_0_541 +PW_F054_MF130 times 2 dw F_0_541, (F_0_541-F_1_847) +PW_MF078_F117 times 2 dw (F_1_175-F_1_961), F_1_175 +PW_F117_F078 times 2 dw F_1_175, (F_1_175-F_0_390) +PW_MF060_MF089 times 2 dw (F_0_298-F_0_899),-F_0_899 +PW_MF089_F060 times 2 dw -F_0_899, (F_1_501-F_0_899) +PW_MF050_MF256 times 2 dw (F_2_053-F_2_562),-F_2_562 +PW_MF256_F050 times 2 dw -F_2_562, (F_3_072-F_2_562) +PD_DESCALE_P1 times 2 dd 1 << (DESCALE_P1-1) +PD_DESCALE_P2 times 2 dd 1 << (DESCALE_P2-1) +PB_CENTERJSAMP times 8 db CENTERJSAMPLE - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 ; ; Perform dequantization and inverse DCT on one block of coefficients. 
; @@ -95,758 +95,758 @@ PB_CENTERJSAMP times 8 db CENTERJSAMPLE ; JSAMPARRAY output_buf, JDIMENSION output_col) ; -%define dct_table(b) (b)+8 ; jpeg_component_info * compptr -%define coef_block(b) (b)+12 ; JCOEFPTR coef_block -%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf -%define output_col(b) (b)+20 ; JDIMENSION output_col +%define dct_table(b) (b)+8 ; jpeg_component_info * compptr +%define coef_block(b) (b)+12 ; JCOEFPTR coef_block +%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf +%define output_col(b) (b)+20 ; JDIMENSION output_col -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] -%define WK_NUM 12 -%define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF - ; JCOEF workspace[DCTSIZE2] +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 12 +%define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF + ; JCOEF workspace[DCTSIZE2] - align 16 - global EXTN(jsimd_idct_islow_mmx) + align 16 + global EXTN(jsimd_idct_islow_mmx) EXTN(jsimd_idct_islow_mmx): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [workspace] - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process columns from input, store into work array. 
- -; mov eax, [original_ebp] - mov edx, POINTER [dct_table(eax)] ; quantptr - mov esi, JCOEFPTR [coef_block(eax)] ; inptr - lea edi, [workspace] ; JCOEF * wsptr - mov ecx, DCTSIZE/4 ; ctr - alignx 16,7 + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [workspace] + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input, store into work array. + +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + lea edi, [workspace] ; JCOEF * wsptr + mov ecx, DCTSIZE/4 ; ctr + alignx 16,7 .columnloop: %ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX - mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] - jnz short .columnDCT - - movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] - por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] - por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] - por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] - por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] - por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] - por mm1,mm0 - packsswb mm1,mm1 - movd eax,mm1 - test eax,eax - jnz short .columnDCT - - ; -- AC terms all zero - - movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] - pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - - psllw mm0,PASS1_BITS - - movq mm2,mm0 ; mm0=in0=(00 01 02 03) - punpcklwd mm0,mm0 ; mm0=(00 00 01 01) - punpckhwd mm2,mm2 ; mm2=(02 02 03 03) - - movq mm1,mm0 - punpckldq mm0,mm0 ; mm0=(00 00 00 00) - punpckhdq mm1,mm1 ; mm1=(01 01 01 01) - movq mm3,mm2 - punpckldq mm2,mm2 ; mm2=(02 02 02 02) - punpckhdq mm3,mm3 ; mm3=(03 03 03 03) - - movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 - movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0 - 
movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1 - movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1 - movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2 - movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2 - movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3 - movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3 - jmp near .nextcolumn - alignx 16,7 + mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz short .columnDCT + + movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por mm1,mm0 + packsswb mm1,mm1 + movd eax,mm1 + test eax,eax + jnz short .columnDCT + + ; -- AC terms all zero + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + psllw mm0,PASS1_BITS + + movq mm2,mm0 ; mm0=in0=(00 01 02 03) + punpcklwd mm0,mm0 ; mm0=(00 00 01 01) + punpckhwd mm2,mm2 ; mm2=(02 02 03 03) + + movq mm1,mm0 + punpckldq mm0,mm0 ; mm0=(00 00 00 00) + punpckhdq mm1,mm1 ; mm1=(01 01 01 01) + movq mm3,mm2 + punpckldq mm2,mm2 ; mm2=(02 02 02 02) + punpckhdq mm3,mm3 ; mm3=(03 03 03 03) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3 + movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3 + jmp near .nextcolumn + alignx 16,7 %endif .columnDCT: - ; -- Even part - - movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] - movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] - pmullw mm0, MMWORD 
[MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] - movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] - pmullw mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - - ; (Original) - ; z1 = (z2 + z3) * 0.541196100; - ; tmp2 = z1 + z3 * -1.847759065; - ; tmp3 = z1 + z2 * 0.765366865; - ; - ; (This implementation) - ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); - ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; - - movq mm4,mm1 ; mm1=in2=z2 - movq mm5,mm1 - punpcklwd mm4,mm3 ; mm3=in6=z3 - punpckhwd mm5,mm3 - movq mm1,mm4 - movq mm3,mm5 - pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=tmp3L - pmaddwd mm5,[GOTOFF(ebx,PW_F130_F054)] ; mm5=tmp3H - pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=tmp2L - pmaddwd mm3,[GOTOFF(ebx,PW_F054_MF130)] ; mm3=tmp2H - - movq mm6,mm0 - paddw mm0,mm2 ; mm0=in0+in4 - psubw mm6,mm2 ; mm6=in0-in4 - - pxor mm7,mm7 - pxor mm2,mm2 - punpcklwd mm7,mm0 ; mm7=tmp0L - punpckhwd mm2,mm0 ; mm2=tmp0H - psrad mm7,(16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS - psrad mm2,(16-CONST_BITS) ; psrad mm2,16 & pslld mm2,CONST_BITS - - movq mm0,mm7 - paddd mm7,mm4 ; mm7=tmp10L - psubd mm0,mm4 ; mm0=tmp13L - movq mm4,mm2 - paddd mm2,mm5 ; mm2=tmp10H - psubd mm4,mm5 ; mm4=tmp13H - - movq MMWORD [wk(0)], mm7 ; wk(0)=tmp10L - movq MMWORD [wk(1)], mm2 ; wk(1)=tmp10H - movq MMWORD [wk(2)], mm0 ; wk(2)=tmp13L - movq MMWORD [wk(3)], mm4 ; wk(3)=tmp13H - - pxor mm5,mm5 - pxor mm7,mm7 - punpcklwd mm5,mm6 ; mm5=tmp1L - punpckhwd mm7,mm6 ; mm7=tmp1H - psrad mm5,(16-CONST_BITS) ; psrad mm5,16 & pslld mm5,CONST_BITS - psrad mm7,(16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS - - movq mm2,mm5 - paddd mm5,mm1 ; mm5=tmp11L - psubd mm2,mm1 ; mm2=tmp12L - movq mm0,mm7 - paddd mm7,mm3 ; mm7=tmp11H - psubd mm0,mm3 ; mm0=tmp12H - - movq MMWORD [wk(4)], mm5 ; wk(4)=tmp11L - 
movq MMWORD [wk(5)], mm7 ; wk(5)=tmp11H - movq MMWORD [wk(6)], mm2 ; wk(6)=tmp12L - movq MMWORD [wk(7)], mm0 ; wk(7)=tmp12H - - ; -- Odd part - - movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movq mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] - pmullw mm4, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw mm6, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - movq mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] - movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] - pmullw mm1, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - - movq mm5,mm6 - movq mm7,mm4 - paddw mm5,mm3 ; mm5=z3 - paddw mm7,mm1 ; mm7=z4 - - ; (Original) - ; z5 = (z3 + z4) * 1.175875602; - ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; - ; z3 += z5; z4 += z5; - ; - ; (This implementation) - ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; - ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); - - movq mm2,mm5 - movq mm0,mm5 - punpcklwd mm2,mm7 - punpckhwd mm0,mm7 - movq mm5,mm2 - movq mm7,mm0 - pmaddwd mm2,[GOTOFF(ebx,PW_MF078_F117)] ; mm2=z3L - pmaddwd mm0,[GOTOFF(ebx,PW_MF078_F117)] ; mm0=z3H - pmaddwd mm5,[GOTOFF(ebx,PW_F117_F078)] ; mm5=z4L - pmaddwd mm7,[GOTOFF(ebx,PW_F117_F078)] ; mm7=z4H - - movq MMWORD [wk(10)], mm2 ; wk(10)=z3L - movq MMWORD [wk(11)], mm0 ; wk(11)=z3H - - ; (Original) - ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; - ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; - ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; - ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; - ; tmp0 += z1 + z3; tmp1 += z2 + z4; - ; tmp2 += z2 + z3; tmp3 += z1 + z4; - ; - ; (This implementation) - ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; - ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; - ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); - ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); - ; tmp0 += z3; tmp1 += z4; - ; tmp2 
+= z3; tmp3 += z4; - - movq mm2,mm3 - movq mm0,mm3 - punpcklwd mm2,mm4 - punpckhwd mm0,mm4 - movq mm3,mm2 - movq mm4,mm0 - pmaddwd mm2,[GOTOFF(ebx,PW_MF060_MF089)] ; mm2=tmp0L - pmaddwd mm0,[GOTOFF(ebx,PW_MF060_MF089)] ; mm0=tmp0H - pmaddwd mm3,[GOTOFF(ebx,PW_MF089_F060)] ; mm3=tmp3L - pmaddwd mm4,[GOTOFF(ebx,PW_MF089_F060)] ; mm4=tmp3H - - paddd mm2, MMWORD [wk(10)] ; mm2=tmp0L - paddd mm0, MMWORD [wk(11)] ; mm0=tmp0H - paddd mm3,mm5 ; mm3=tmp3L - paddd mm4,mm7 ; mm4=tmp3H - - movq MMWORD [wk(8)], mm2 ; wk(8)=tmp0L - movq MMWORD [wk(9)], mm0 ; wk(9)=tmp0H - - movq mm2,mm1 - movq mm0,mm1 - punpcklwd mm2,mm6 - punpckhwd mm0,mm6 - movq mm1,mm2 - movq mm6,mm0 - pmaddwd mm2,[GOTOFF(ebx,PW_MF050_MF256)] ; mm2=tmp1L - pmaddwd mm0,[GOTOFF(ebx,PW_MF050_MF256)] ; mm0=tmp1H - pmaddwd mm1,[GOTOFF(ebx,PW_MF256_F050)] ; mm1=tmp2L - pmaddwd mm6,[GOTOFF(ebx,PW_MF256_F050)] ; mm6=tmp2H - - paddd mm2,mm5 ; mm2=tmp1L - paddd mm0,mm7 ; mm0=tmp1H - paddd mm1, MMWORD [wk(10)] ; mm1=tmp2L - paddd mm6, MMWORD [wk(11)] ; mm6=tmp2H - - movq MMWORD [wk(10)], mm2 ; wk(10)=tmp1L - movq MMWORD [wk(11)], mm0 ; wk(11)=tmp1H - - ; -- Final output stage - - movq mm5, MMWORD [wk(0)] ; mm5=tmp10L - movq mm7, MMWORD [wk(1)] ; mm7=tmp10H - - movq mm2,mm5 - movq mm0,mm7 - paddd mm5,mm3 ; mm5=data0L - paddd mm7,mm4 ; mm7=data0H - psubd mm2,mm3 ; mm2=data7L - psubd mm0,mm4 ; mm0=data7H - - movq mm3,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm3=[PD_DESCALE_P1] - - paddd mm5,mm3 - paddd mm7,mm3 - psrad mm5,DESCALE_P1 - psrad mm7,DESCALE_P1 - paddd mm2,mm3 - paddd mm0,mm3 - psrad mm2,DESCALE_P1 - psrad mm0,DESCALE_P1 - - packssdw mm5,mm7 ; mm5=data0=(00 01 02 03) - packssdw mm2,mm0 ; mm2=data7=(70 71 72 73) - - movq mm4, MMWORD [wk(4)] ; mm4=tmp11L - movq mm3, MMWORD [wk(5)] ; mm3=tmp11H - - movq mm7,mm4 - movq mm0,mm3 - paddd mm4,mm1 ; mm4=data1L - paddd mm3,mm6 ; mm3=data1H - psubd mm7,mm1 ; mm7=data6L - psubd mm0,mm6 ; mm0=data6H - - movq mm1,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm1=[PD_DESCALE_P1] - - paddd mm4,mm1 - 
paddd mm3,mm1 - psrad mm4,DESCALE_P1 - psrad mm3,DESCALE_P1 - paddd mm7,mm1 - paddd mm0,mm1 - psrad mm7,DESCALE_P1 - psrad mm0,DESCALE_P1 - - packssdw mm4,mm3 ; mm4=data1=(10 11 12 13) - packssdw mm7,mm0 ; mm7=data6=(60 61 62 63) - - movq mm6,mm5 ; transpose coefficients(phase 1) - punpcklwd mm5,mm4 ; mm5=(00 10 01 11) - punpckhwd mm6,mm4 ; mm6=(02 12 03 13) - movq mm1,mm7 ; transpose coefficients(phase 1) - punpcklwd mm7,mm2 ; mm7=(60 70 61 71) - punpckhwd mm1,mm2 ; mm1=(62 72 63 73) - - movq mm3, MMWORD [wk(6)] ; mm3=tmp12L - movq mm0, MMWORD [wk(7)] ; mm0=tmp12H - movq mm4, MMWORD [wk(10)] ; mm4=tmp1L - movq mm2, MMWORD [wk(11)] ; mm2=tmp1H - - movq MMWORD [wk(0)], mm5 ; wk(0)=(00 10 01 11) - movq MMWORD [wk(1)], mm6 ; wk(1)=(02 12 03 13) - movq MMWORD [wk(4)], mm7 ; wk(4)=(60 70 61 71) - movq MMWORD [wk(5)], mm1 ; wk(5)=(62 72 63 73) - - movq mm5,mm3 - movq mm6,mm0 - paddd mm3,mm4 ; mm3=data2L - paddd mm0,mm2 ; mm0=data2H - psubd mm5,mm4 ; mm5=data5L - psubd mm6,mm2 ; mm6=data5H - - movq mm7,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm7=[PD_DESCALE_P1] - - paddd mm3,mm7 - paddd mm0,mm7 - psrad mm3,DESCALE_P1 - psrad mm0,DESCALE_P1 - paddd mm5,mm7 - paddd mm6,mm7 - psrad mm5,DESCALE_P1 - psrad mm6,DESCALE_P1 - - packssdw mm3,mm0 ; mm3=data2=(20 21 22 23) - packssdw mm5,mm6 ; mm5=data5=(50 51 52 53) - - movq mm1, MMWORD [wk(2)] ; mm1=tmp13L - movq mm4, MMWORD [wk(3)] ; mm4=tmp13H - movq mm2, MMWORD [wk(8)] ; mm2=tmp0L - movq mm7, MMWORD [wk(9)] ; mm7=tmp0H - - movq mm0,mm1 - movq mm6,mm4 - paddd mm1,mm2 ; mm1=data3L - paddd mm4,mm7 ; mm4=data3H - psubd mm0,mm2 ; mm0=data4L - psubd mm6,mm7 ; mm6=data4H - - movq mm2,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm2=[PD_DESCALE_P1] - - paddd mm1,mm2 - paddd mm4,mm2 - psrad mm1,DESCALE_P1 - psrad mm4,DESCALE_P1 - paddd mm0,mm2 - paddd mm6,mm2 - psrad mm0,DESCALE_P1 - psrad mm6,DESCALE_P1 - - packssdw mm1,mm4 ; mm1=data3=(30 31 32 33) - packssdw mm0,mm6 ; mm0=data4=(40 41 42 43) - - movq mm7, MMWORD [wk(0)] ; mm7=(00 10 01 11) - movq mm2, MMWORD 
[wk(1)] ; mm2=(02 12 03 13) - - movq mm4,mm3 ; transpose coefficients(phase 1) - punpcklwd mm3,mm1 ; mm3=(20 30 21 31) - punpckhwd mm4,mm1 ; mm4=(22 32 23 33) - movq mm6,mm0 ; transpose coefficients(phase 1) - punpcklwd mm0,mm5 ; mm0=(40 50 41 51) - punpckhwd mm6,mm5 ; mm6=(42 52 43 53) - - movq mm1,mm7 ; transpose coefficients(phase 2) - punpckldq mm7,mm3 ; mm7=(00 10 20 30) - punpckhdq mm1,mm3 ; mm1=(01 11 21 31) - movq mm5,mm2 ; transpose coefficients(phase 2) - punpckldq mm2,mm4 ; mm2=(02 12 22 32) - punpckhdq mm5,mm4 ; mm5=(03 13 23 33) - - movq mm3, MMWORD [wk(4)] ; mm3=(60 70 61 71) - movq mm4, MMWORD [wk(5)] ; mm4=(62 72 63 73) - - movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm7 - movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1 - movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2 - movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5 - - movq mm7,mm0 ; transpose coefficients(phase 2) - punpckldq mm0,mm3 ; mm0=(40 50 60 70) - punpckhdq mm7,mm3 ; mm7=(41 51 61 71) - movq mm1,mm6 ; transpose coefficients(phase 2) - punpckldq mm6,mm4 ; mm6=(42 52 62 72) - punpckhdq mm1,mm4 ; mm1=(43 53 63 73) - - movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0 - movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm7 - movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm6 - movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm1 + ; -- Even part + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + pmullw mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 
1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + movq mm4,mm1 ; mm1=in2=z2 + movq mm5,mm1 + punpcklwd mm4,mm3 ; mm3=in6=z3 + punpckhwd mm5,mm3 + movq mm1,mm4 + movq mm3,mm5 + pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=tmp3L + pmaddwd mm5,[GOTOFF(ebx,PW_F130_F054)] ; mm5=tmp3H + pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=tmp2L + pmaddwd mm3,[GOTOFF(ebx,PW_F054_MF130)] ; mm3=tmp2H + + movq mm6,mm0 + paddw mm0,mm2 ; mm0=in0+in4 + psubw mm6,mm2 ; mm6=in0-in4 + + pxor mm7,mm7 + pxor mm2,mm2 + punpcklwd mm7,mm0 ; mm7=tmp0L + punpckhwd mm2,mm0 ; mm2=tmp0H + psrad mm7,(16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS + psrad mm2,(16-CONST_BITS) ; psrad mm2,16 & pslld mm2,CONST_BITS + + movq mm0,mm7 + paddd mm7,mm4 ; mm7=tmp10L + psubd mm0,mm4 ; mm0=tmp13L + movq mm4,mm2 + paddd mm2,mm5 ; mm2=tmp10H + psubd mm4,mm5 ; mm4=tmp13H + + movq MMWORD [wk(0)], mm7 ; wk(0)=tmp10L + movq MMWORD [wk(1)], mm2 ; wk(1)=tmp10H + movq MMWORD [wk(2)], mm0 ; wk(2)=tmp13L + movq MMWORD [wk(3)], mm4 ; wk(3)=tmp13H + + pxor mm5,mm5 + pxor mm7,mm7 + punpcklwd mm5,mm6 ; mm5=tmp1L + punpckhwd mm7,mm6 ; mm7=tmp1H + psrad mm5,(16-CONST_BITS) ; psrad mm5,16 & pslld mm5,CONST_BITS + psrad mm7,(16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS + + movq mm2,mm5 + paddd mm5,mm1 ; mm5=tmp11L + psubd mm2,mm1 ; mm2=tmp12L + movq mm0,mm7 + paddd mm7,mm3 ; mm7=tmp11H + psubd mm0,mm3 ; mm0=tmp12H + + movq MMWORD [wk(4)], mm5 ; wk(4)=tmp11L + movq MMWORD [wk(5)], mm7 ; wk(5)=tmp11H + movq MMWORD [wk(6)], mm2 ; wk(6)=tmp12L + movq MMWORD [wk(7)], mm0 ; wk(7)=tmp12H + + ; -- Odd part + + movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw mm4, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm6, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movq mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw mm1, MMWORD 
[MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + movq mm5,mm6 + movq mm7,mm4 + paddw mm5,mm3 ; mm5=z3 + paddw mm7,mm1 ; mm7=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movq mm2,mm5 + movq mm0,mm5 + punpcklwd mm2,mm7 + punpckhwd mm0,mm7 + movq mm5,mm2 + movq mm7,mm0 + pmaddwd mm2,[GOTOFF(ebx,PW_MF078_F117)] ; mm2=z3L + pmaddwd mm0,[GOTOFF(ebx,PW_MF078_F117)] ; mm0=z3H + pmaddwd mm5,[GOTOFF(ebx,PW_F117_F078)] ; mm5=z4L + pmaddwd mm7,[GOTOFF(ebx,PW_F117_F078)] ; mm7=z4H + + movq MMWORD [wk(10)], mm2 ; wk(10)=z3L + movq MMWORD [wk(11)], mm0 ; wk(11)=z3H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + movq mm2,mm3 + movq mm0,mm3 + punpcklwd mm2,mm4 + punpckhwd mm0,mm4 + movq mm3,mm2 + movq mm4,mm0 + pmaddwd mm2,[GOTOFF(ebx,PW_MF060_MF089)] ; mm2=tmp0L + pmaddwd mm0,[GOTOFF(ebx,PW_MF060_MF089)] ; mm0=tmp0H + pmaddwd mm3,[GOTOFF(ebx,PW_MF089_F060)] ; mm3=tmp3L + pmaddwd mm4,[GOTOFF(ebx,PW_MF089_F060)] ; mm4=tmp3H + + paddd mm2, MMWORD [wk(10)] ; mm2=tmp0L + paddd mm0, MMWORD [wk(11)] ; mm0=tmp0H + paddd mm3,mm5 ; mm3=tmp3L + paddd mm4,mm7 ; mm4=tmp3H + + movq MMWORD 
[wk(8)], mm2 ; wk(8)=tmp0L + movq MMWORD [wk(9)], mm0 ; wk(9)=tmp0H + + movq mm2,mm1 + movq mm0,mm1 + punpcklwd mm2,mm6 + punpckhwd mm0,mm6 + movq mm1,mm2 + movq mm6,mm0 + pmaddwd mm2,[GOTOFF(ebx,PW_MF050_MF256)] ; mm2=tmp1L + pmaddwd mm0,[GOTOFF(ebx,PW_MF050_MF256)] ; mm0=tmp1H + pmaddwd mm1,[GOTOFF(ebx,PW_MF256_F050)] ; mm1=tmp2L + pmaddwd mm6,[GOTOFF(ebx,PW_MF256_F050)] ; mm6=tmp2H + + paddd mm2,mm5 ; mm2=tmp1L + paddd mm0,mm7 ; mm0=tmp1H + paddd mm1, MMWORD [wk(10)] ; mm1=tmp2L + paddd mm6, MMWORD [wk(11)] ; mm6=tmp2H + + movq MMWORD [wk(10)], mm2 ; wk(10)=tmp1L + movq MMWORD [wk(11)], mm0 ; wk(11)=tmp1H + + ; -- Final output stage + + movq mm5, MMWORD [wk(0)] ; mm5=tmp10L + movq mm7, MMWORD [wk(1)] ; mm7=tmp10H + + movq mm2,mm5 + movq mm0,mm7 + paddd mm5,mm3 ; mm5=data0L + paddd mm7,mm4 ; mm7=data0H + psubd mm2,mm3 ; mm2=data7L + psubd mm0,mm4 ; mm0=data7H + + movq mm3,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm3=[PD_DESCALE_P1] + + paddd mm5,mm3 + paddd mm7,mm3 + psrad mm5,DESCALE_P1 + psrad mm7,DESCALE_P1 + paddd mm2,mm3 + paddd mm0,mm3 + psrad mm2,DESCALE_P1 + psrad mm0,DESCALE_P1 + + packssdw mm5,mm7 ; mm5=data0=(00 01 02 03) + packssdw mm2,mm0 ; mm2=data7=(70 71 72 73) + + movq mm4, MMWORD [wk(4)] ; mm4=tmp11L + movq mm3, MMWORD [wk(5)] ; mm3=tmp11H + + movq mm7,mm4 + movq mm0,mm3 + paddd mm4,mm1 ; mm4=data1L + paddd mm3,mm6 ; mm3=data1H + psubd mm7,mm1 ; mm7=data6L + psubd mm0,mm6 ; mm0=data6H + + movq mm1,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm1=[PD_DESCALE_P1] + + paddd mm4,mm1 + paddd mm3,mm1 + psrad mm4,DESCALE_P1 + psrad mm3,DESCALE_P1 + paddd mm7,mm1 + paddd mm0,mm1 + psrad mm7,DESCALE_P1 + psrad mm0,DESCALE_P1 + + packssdw mm4,mm3 ; mm4=data1=(10 11 12 13) + packssdw mm7,mm0 ; mm7=data6=(60 61 62 63) + + movq mm6,mm5 ; transpose coefficients(phase 1) + punpcklwd mm5,mm4 ; mm5=(00 10 01 11) + punpckhwd mm6,mm4 ; mm6=(02 12 03 13) + movq mm1,mm7 ; transpose coefficients(phase 1) + punpcklwd mm7,mm2 ; mm7=(60 70 61 71) + punpckhwd mm1,mm2 ; mm1=(62 72 63 73) + + movq 
mm3, MMWORD [wk(6)] ; mm3=tmp12L + movq mm0, MMWORD [wk(7)] ; mm0=tmp12H + movq mm4, MMWORD [wk(10)] ; mm4=tmp1L + movq mm2, MMWORD [wk(11)] ; mm2=tmp1H + + movq MMWORD [wk(0)], mm5 ; wk(0)=(00 10 01 11) + movq MMWORD [wk(1)], mm6 ; wk(1)=(02 12 03 13) + movq MMWORD [wk(4)], mm7 ; wk(4)=(60 70 61 71) + movq MMWORD [wk(5)], mm1 ; wk(5)=(62 72 63 73) + + movq mm5,mm3 + movq mm6,mm0 + paddd mm3,mm4 ; mm3=data2L + paddd mm0,mm2 ; mm0=data2H + psubd mm5,mm4 ; mm5=data5L + psubd mm6,mm2 ; mm6=data5H + + movq mm7,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm7=[PD_DESCALE_P1] + + paddd mm3,mm7 + paddd mm0,mm7 + psrad mm3,DESCALE_P1 + psrad mm0,DESCALE_P1 + paddd mm5,mm7 + paddd mm6,mm7 + psrad mm5,DESCALE_P1 + psrad mm6,DESCALE_P1 + + packssdw mm3,mm0 ; mm3=data2=(20 21 22 23) + packssdw mm5,mm6 ; mm5=data5=(50 51 52 53) + + movq mm1, MMWORD [wk(2)] ; mm1=tmp13L + movq mm4, MMWORD [wk(3)] ; mm4=tmp13H + movq mm2, MMWORD [wk(8)] ; mm2=tmp0L + movq mm7, MMWORD [wk(9)] ; mm7=tmp0H + + movq mm0,mm1 + movq mm6,mm4 + paddd mm1,mm2 ; mm1=data3L + paddd mm4,mm7 ; mm4=data3H + psubd mm0,mm2 ; mm0=data4L + psubd mm6,mm7 ; mm6=data4H + + movq mm2,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm2=[PD_DESCALE_P1] + + paddd mm1,mm2 + paddd mm4,mm2 + psrad mm1,DESCALE_P1 + psrad mm4,DESCALE_P1 + paddd mm0,mm2 + paddd mm6,mm2 + psrad mm0,DESCALE_P1 + psrad mm6,DESCALE_P1 + + packssdw mm1,mm4 ; mm1=data3=(30 31 32 33) + packssdw mm0,mm6 ; mm0=data4=(40 41 42 43) + + movq mm7, MMWORD [wk(0)] ; mm7=(00 10 01 11) + movq mm2, MMWORD [wk(1)] ; mm2=(02 12 03 13) + + movq mm4,mm3 ; transpose coefficients(phase 1) + punpcklwd mm3,mm1 ; mm3=(20 30 21 31) + punpckhwd mm4,mm1 ; mm4=(22 32 23 33) + movq mm6,mm0 ; transpose coefficients(phase 1) + punpcklwd mm0,mm5 ; mm0=(40 50 41 51) + punpckhwd mm6,mm5 ; mm6=(42 52 43 53) + + movq mm1,mm7 ; transpose coefficients(phase 2) + punpckldq mm7,mm3 ; mm7=(00 10 20 30) + punpckhdq mm1,mm3 ; mm1=(01 11 21 31) + movq mm5,mm2 ; transpose coefficients(phase 2) + punpckldq mm2,mm4 ; mm2=(02 
12 22 32) + punpckhdq mm5,mm4 ; mm5=(03 13 23 33) + + movq mm3, MMWORD [wk(4)] ; mm3=(60 70 61 71) + movq mm4, MMWORD [wk(5)] ; mm4=(62 72 63 73) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm7 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5 + + movq mm7,mm0 ; transpose coefficients(phase 2) + punpckldq mm0,mm3 ; mm0=(40 50 60 70) + punpckhdq mm7,mm3 ; mm7=(41 51 61 71) + movq mm1,mm6 ; transpose coefficients(phase 2) + punpckldq mm6,mm4 ; mm6=(42 52 62 72) + punpckhdq mm1,mm4 ; mm1=(43 53 63 73) + + movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm7 + movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm6 + movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm1 .nextcolumn: - add esi, byte 4*SIZEOF_JCOEF ; coef_block - add edx, byte 4*SIZEOF_ISLOW_MULT_TYPE ; quantptr - add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr - dec ecx ; ctr - jnz near .columnloop - - ; ---- Pass 2: process rows from work array, store into output array. - - mov eax, [original_ebp] - lea esi, [workspace] ; JCOEF * wsptr - mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) - mov eax, JDIMENSION [output_col(eax)] - mov ecx, DCTSIZE/4 ; ctr - alignx 16,7 + add esi, byte 4*SIZEOF_JCOEF ; coef_block + add edx, byte 4*SIZEOF_ISLOW_MULT_TYPE ; quantptr + add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr + dec ecx ; ctr + jnz near .columnloop + + ; ---- Pass 2: process rows from work array, store into output array. 
+ + mov eax, [original_ebp] + lea esi, [workspace] ; JCOEF * wsptr + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + mov ecx, DCTSIZE/4 ; ctr + alignx 16,7 .rowloop: - ; -- Even part - - movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] - movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] - movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] - movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] - - ; (Original) - ; z1 = (z2 + z3) * 0.541196100; - ; tmp2 = z1 + z3 * -1.847759065; - ; tmp3 = z1 + z2 * 0.765366865; - ; - ; (This implementation) - ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); - ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; - - movq mm4,mm1 ; mm1=in2=z2 - movq mm5,mm1 - punpcklwd mm4,mm3 ; mm3=in6=z3 - punpckhwd mm5,mm3 - movq mm1,mm4 - movq mm3,mm5 - pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=tmp3L - pmaddwd mm5,[GOTOFF(ebx,PW_F130_F054)] ; mm5=tmp3H - pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=tmp2L - pmaddwd mm3,[GOTOFF(ebx,PW_F054_MF130)] ; mm3=tmp2H - - movq mm6,mm0 - paddw mm0,mm2 ; mm0=in0+in4 - psubw mm6,mm2 ; mm6=in0-in4 - - pxor mm7,mm7 - pxor mm2,mm2 - punpcklwd mm7,mm0 ; mm7=tmp0L - punpckhwd mm2,mm0 ; mm2=tmp0H - psrad mm7,(16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS - psrad mm2,(16-CONST_BITS) ; psrad mm2,16 & pslld mm2,CONST_BITS - - movq mm0,mm7 - paddd mm7,mm4 ; mm7=tmp10L - psubd mm0,mm4 ; mm0=tmp13L - movq mm4,mm2 - paddd mm2,mm5 ; mm2=tmp10H - psubd mm4,mm5 ; mm4=tmp13H - - movq MMWORD [wk(0)], mm7 ; wk(0)=tmp10L - movq MMWORD [wk(1)], mm2 ; wk(1)=tmp10H - movq MMWORD [wk(2)], mm0 ; wk(2)=tmp13L - movq MMWORD [wk(3)], mm4 ; wk(3)=tmp13H - - pxor mm5,mm5 - pxor mm7,mm7 - punpcklwd mm5,mm6 ; mm5=tmp1L - punpckhwd mm7,mm6 ; mm7=tmp1H - psrad mm5,(16-CONST_BITS) ; psrad mm5,16 & pslld mm5,CONST_BITS - psrad mm7,(16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS - - movq mm2,mm5 - paddd mm5,mm1 ; mm5=tmp11L - psubd mm2,mm1 ; mm2=tmp12L - movq mm0,mm7 - 
paddd mm7,mm3 ; mm7=tmp11H - psubd mm0,mm3 ; mm0=tmp12H - - movq MMWORD [wk(4)], mm5 ; wk(4)=tmp11L - movq MMWORD [wk(5)], mm7 ; wk(5)=tmp11H - movq MMWORD [wk(6)], mm2 ; wk(6)=tmp12L - movq MMWORD [wk(7)], mm0 ; wk(7)=tmp12H - - ; -- Odd part - - movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movq mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] - movq mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] - movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] - - movq mm5,mm6 - movq mm7,mm4 - paddw mm5,mm3 ; mm5=z3 - paddw mm7,mm1 ; mm7=z4 - - ; (Original) - ; z5 = (z3 + z4) * 1.175875602; - ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; - ; z3 += z5; z4 += z5; - ; - ; (This implementation) - ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; - ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); - - movq mm2,mm5 - movq mm0,mm5 - punpcklwd mm2,mm7 - punpckhwd mm0,mm7 - movq mm5,mm2 - movq mm7,mm0 - pmaddwd mm2,[GOTOFF(ebx,PW_MF078_F117)] ; mm2=z3L - pmaddwd mm0,[GOTOFF(ebx,PW_MF078_F117)] ; mm0=z3H - pmaddwd mm5,[GOTOFF(ebx,PW_F117_F078)] ; mm5=z4L - pmaddwd mm7,[GOTOFF(ebx,PW_F117_F078)] ; mm7=z4H - - movq MMWORD [wk(10)], mm2 ; wk(10)=z3L - movq MMWORD [wk(11)], mm0 ; wk(11)=z3H - - ; (Original) - ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; - ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; - ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; - ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; - ; tmp0 += z1 + z3; tmp1 += z2 + z4; - ; tmp2 += z2 + z3; tmp3 += z1 + z4; - ; - ; (This implementation) - ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; - ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; - ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); - ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); - ; tmp0 += z3; tmp1 += z4; - ; tmp2 += z3; tmp3 += z4; - - movq mm2,mm3 - movq mm0,mm3 - punpcklwd mm2,mm4 - punpckhwd mm0,mm4 - movq mm3,mm2 - movq mm4,mm0 - pmaddwd 
mm2,[GOTOFF(ebx,PW_MF060_MF089)] ; mm2=tmp0L - pmaddwd mm0,[GOTOFF(ebx,PW_MF060_MF089)] ; mm0=tmp0H - pmaddwd mm3,[GOTOFF(ebx,PW_MF089_F060)] ; mm3=tmp3L - pmaddwd mm4,[GOTOFF(ebx,PW_MF089_F060)] ; mm4=tmp3H - - paddd mm2, MMWORD [wk(10)] ; mm2=tmp0L - paddd mm0, MMWORD [wk(11)] ; mm0=tmp0H - paddd mm3,mm5 ; mm3=tmp3L - paddd mm4,mm7 ; mm4=tmp3H - - movq MMWORD [wk(8)], mm2 ; wk(8)=tmp0L - movq MMWORD [wk(9)], mm0 ; wk(9)=tmp0H - - movq mm2,mm1 - movq mm0,mm1 - punpcklwd mm2,mm6 - punpckhwd mm0,mm6 - movq mm1,mm2 - movq mm6,mm0 - pmaddwd mm2,[GOTOFF(ebx,PW_MF050_MF256)] ; mm2=tmp1L - pmaddwd mm0,[GOTOFF(ebx,PW_MF050_MF256)] ; mm0=tmp1H - pmaddwd mm1,[GOTOFF(ebx,PW_MF256_F050)] ; mm1=tmp2L - pmaddwd mm6,[GOTOFF(ebx,PW_MF256_F050)] ; mm6=tmp2H - - paddd mm2,mm5 ; mm2=tmp1L - paddd mm0,mm7 ; mm0=tmp1H - paddd mm1, MMWORD [wk(10)] ; mm1=tmp2L - paddd mm6, MMWORD [wk(11)] ; mm6=tmp2H - - movq MMWORD [wk(10)], mm2 ; wk(10)=tmp1L - movq MMWORD [wk(11)], mm0 ; wk(11)=tmp1H - - ; -- Final output stage - - movq mm5, MMWORD [wk(0)] ; mm5=tmp10L - movq mm7, MMWORD [wk(1)] ; mm7=tmp10H - - movq mm2,mm5 - movq mm0,mm7 - paddd mm5,mm3 ; mm5=data0L - paddd mm7,mm4 ; mm7=data0H - psubd mm2,mm3 ; mm2=data7L - psubd mm0,mm4 ; mm0=data7H - - movq mm3,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm3=[PD_DESCALE_P2] - - paddd mm5,mm3 - paddd mm7,mm3 - psrad mm5,DESCALE_P2 - psrad mm7,DESCALE_P2 - paddd mm2,mm3 - paddd mm0,mm3 - psrad mm2,DESCALE_P2 - psrad mm0,DESCALE_P2 - - packssdw mm5,mm7 ; mm5=data0=(00 10 20 30) - packssdw mm2,mm0 ; mm2=data7=(07 17 27 37) - - movq mm4, MMWORD [wk(4)] ; mm4=tmp11L - movq mm3, MMWORD [wk(5)] ; mm3=tmp11H - - movq mm7,mm4 - movq mm0,mm3 - paddd mm4,mm1 ; mm4=data1L - paddd mm3,mm6 ; mm3=data1H - psubd mm7,mm1 ; mm7=data6L - psubd mm0,mm6 ; mm0=data6H - - movq mm1,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm1=[PD_DESCALE_P2] - - paddd mm4,mm1 - paddd mm3,mm1 - psrad mm4,DESCALE_P2 - psrad mm3,DESCALE_P2 - paddd mm7,mm1 - paddd mm0,mm1 - psrad mm7,DESCALE_P2 - psrad 
mm0,DESCALE_P2 - - packssdw mm4,mm3 ; mm4=data1=(01 11 21 31) - packssdw mm7,mm0 ; mm7=data6=(06 16 26 36) - - packsswb mm5,mm7 ; mm5=(00 10 20 30 06 16 26 36) - packsswb mm4,mm2 ; mm4=(01 11 21 31 07 17 27 37) - - movq mm6, MMWORD [wk(6)] ; mm6=tmp12L - movq mm1, MMWORD [wk(7)] ; mm1=tmp12H - movq mm3, MMWORD [wk(10)] ; mm3=tmp1L - movq mm0, MMWORD [wk(11)] ; mm0=tmp1H - - movq MMWORD [wk(0)], mm5 ; wk(0)=(00 10 20 30 06 16 26 36) - movq MMWORD [wk(1)], mm4 ; wk(1)=(01 11 21 31 07 17 27 37) - - movq mm7,mm6 - movq mm2,mm1 - paddd mm6,mm3 ; mm6=data2L - paddd mm1,mm0 ; mm1=data2H - psubd mm7,mm3 ; mm7=data5L - psubd mm2,mm0 ; mm2=data5H - - movq mm5,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm5=[PD_DESCALE_P2] - - paddd mm6,mm5 - paddd mm1,mm5 - psrad mm6,DESCALE_P2 - psrad mm1,DESCALE_P2 - paddd mm7,mm5 - paddd mm2,mm5 - psrad mm7,DESCALE_P2 - psrad mm2,DESCALE_P2 - - packssdw mm6,mm1 ; mm6=data2=(02 12 22 32) - packssdw mm7,mm2 ; mm7=data5=(05 15 25 35) - - movq mm4, MMWORD [wk(2)] ; mm4=tmp13L - movq mm3, MMWORD [wk(3)] ; mm3=tmp13H - movq mm0, MMWORD [wk(8)] ; mm0=tmp0L - movq mm5, MMWORD [wk(9)] ; mm5=tmp0H - - movq mm1,mm4 - movq mm2,mm3 - paddd mm4,mm0 ; mm4=data3L - paddd mm3,mm5 ; mm3=data3H - psubd mm1,mm0 ; mm1=data4L - psubd mm2,mm5 ; mm2=data4H - - movq mm0,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm0=[PD_DESCALE_P2] - - paddd mm4,mm0 - paddd mm3,mm0 - psrad mm4,DESCALE_P2 - psrad mm3,DESCALE_P2 - paddd mm1,mm0 - paddd mm2,mm0 - psrad mm1,DESCALE_P2 - psrad mm2,DESCALE_P2 - - movq mm5,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm5=[PB_CENTERJSAMP] - - packssdw mm4,mm3 ; mm4=data3=(03 13 23 33) - packssdw mm1,mm2 ; mm1=data4=(04 14 24 34) - - movq mm0, MMWORD [wk(0)] ; mm0=(00 10 20 30 06 16 26 36) - movq mm3, MMWORD [wk(1)] ; mm3=(01 11 21 31 07 17 27 37) - - packsswb mm6,mm1 ; mm6=(02 12 22 32 04 14 24 34) - packsswb mm4,mm7 ; mm4=(03 13 23 33 05 15 25 35) - - paddb mm0,mm5 - paddb mm3,mm5 - paddb mm6,mm5 - paddb mm4,mm5 - - movq mm2,mm0 ; transpose coefficients(phase 1) - punpcklbw 
mm0,mm3 ; mm0=(00 01 10 11 20 21 30 31) - punpckhbw mm2,mm3 ; mm2=(06 07 16 17 26 27 36 37) - movq mm1,mm6 ; transpose coefficients(phase 1) - punpcklbw mm6,mm4 ; mm6=(02 03 12 13 22 23 32 33) - punpckhbw mm1,mm4 ; mm1=(04 05 14 15 24 25 34 35) - - movq mm7,mm0 ; transpose coefficients(phase 2) - punpcklwd mm0,mm6 ; mm0=(00 01 02 03 10 11 12 13) - punpckhwd mm7,mm6 ; mm7=(20 21 22 23 30 31 32 33) - movq mm5,mm1 ; transpose coefficients(phase 2) - punpcklwd mm1,mm2 ; mm1=(04 05 06 07 14 15 16 17) - punpckhwd mm5,mm2 ; mm5=(24 25 26 27 34 35 36 37) - - movq mm3,mm0 ; transpose coefficients(phase 3) - punpckldq mm0,mm1 ; mm0=(00 01 02 03 04 05 06 07) - punpckhdq mm3,mm1 ; mm3=(10 11 12 13 14 15 16 17) - movq mm4,mm7 ; transpose coefficients(phase 3) - punpckldq mm7,mm5 ; mm7=(20 21 22 23 24 25 26 27) - punpckhdq mm4,mm5 ; mm4=(30 31 32 33 34 35 36 37) - - pushpic ebx ; save GOT address - - mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] - mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] - movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0 - movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm3 - mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] - mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] - movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm7 - movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4 - - poppic ebx ; restore GOT address - - add esi, byte 4*SIZEOF_JCOEF ; wsptr - add edi, byte 4*SIZEOF_JSAMPROW - dec ecx ; ctr - jnz near .rowloop - - emms ; empty MMX state - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + ; -- Even part + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 
0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + movq mm4,mm1 ; mm1=in2=z2 + movq mm5,mm1 + punpcklwd mm4,mm3 ; mm3=in6=z3 + punpckhwd mm5,mm3 + movq mm1,mm4 + movq mm3,mm5 + pmaddwd mm4,[GOTOFF(ebx,PW_F130_F054)] ; mm4=tmp3L + pmaddwd mm5,[GOTOFF(ebx,PW_F130_F054)] ; mm5=tmp3H + pmaddwd mm1,[GOTOFF(ebx,PW_F054_MF130)] ; mm1=tmp2L + pmaddwd mm3,[GOTOFF(ebx,PW_F054_MF130)] ; mm3=tmp2H + + movq mm6,mm0 + paddw mm0,mm2 ; mm0=in0+in4 + psubw mm6,mm2 ; mm6=in0-in4 + + pxor mm7,mm7 + pxor mm2,mm2 + punpcklwd mm7,mm0 ; mm7=tmp0L + punpckhwd mm2,mm0 ; mm2=tmp0H + psrad mm7,(16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS + psrad mm2,(16-CONST_BITS) ; psrad mm2,16 & pslld mm2,CONST_BITS + + movq mm0,mm7 + paddd mm7,mm4 ; mm7=tmp10L + psubd mm0,mm4 ; mm0=tmp13L + movq mm4,mm2 + paddd mm2,mm5 ; mm2=tmp10H + psubd mm4,mm5 ; mm4=tmp13H + + movq MMWORD [wk(0)], mm7 ; wk(0)=tmp10L + movq MMWORD [wk(1)], mm2 ; wk(1)=tmp10H + movq MMWORD [wk(2)], mm0 ; wk(2)=tmp13L + movq MMWORD [wk(3)], mm4 ; wk(3)=tmp13H + + pxor mm5,mm5 + pxor mm7,mm7 + punpcklwd mm5,mm6 ; mm5=tmp1L + punpckhwd mm7,mm6 ; mm7=tmp1H + psrad mm5,(16-CONST_BITS) ; psrad mm5,16 & pslld mm5,CONST_BITS + psrad mm7,(16-CONST_BITS) ; psrad mm7,16 & pslld mm7,CONST_BITS + + movq mm2,mm5 + paddd mm5,mm1 ; mm5=tmp11L + psubd mm2,mm1 ; mm2=tmp12L + movq mm0,mm7 + paddd mm7,mm3 ; mm7=tmp11H + psubd mm0,mm3 ; mm0=tmp12H + + movq MMWORD [wk(4)], mm5 ; wk(4)=tmp11L + movq MMWORD [wk(5)], mm7 ; wk(5)=tmp11H + movq MMWORD [wk(6)], mm2 ; wk(6)=tmp12L + movq MMWORD [wk(7)], mm0 ; wk(7)=tmp12H + + ; -- Odd part + + movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + + movq mm5,mm6 + movq mm7,mm4 + paddw mm5,mm3 ; mm5=z3 + paddw mm7,mm1 ; mm7=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * 
-1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movq mm2,mm5 + movq mm0,mm5 + punpcklwd mm2,mm7 + punpckhwd mm0,mm7 + movq mm5,mm2 + movq mm7,mm0 + pmaddwd mm2,[GOTOFF(ebx,PW_MF078_F117)] ; mm2=z3L + pmaddwd mm0,[GOTOFF(ebx,PW_MF078_F117)] ; mm0=z3H + pmaddwd mm5,[GOTOFF(ebx,PW_F117_F078)] ; mm5=z4L + pmaddwd mm7,[GOTOFF(ebx,PW_F117_F078)] ; mm7=z4H + + movq MMWORD [wk(10)], mm2 ; wk(10)=z3L + movq MMWORD [wk(11)], mm0 ; wk(11)=z3H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + movq mm2,mm3 + movq mm0,mm3 + punpcklwd mm2,mm4 + punpckhwd mm0,mm4 + movq mm3,mm2 + movq mm4,mm0 + pmaddwd mm2,[GOTOFF(ebx,PW_MF060_MF089)] ; mm2=tmp0L + pmaddwd mm0,[GOTOFF(ebx,PW_MF060_MF089)] ; mm0=tmp0H + pmaddwd mm3,[GOTOFF(ebx,PW_MF089_F060)] ; mm3=tmp3L + pmaddwd mm4,[GOTOFF(ebx,PW_MF089_F060)] ; mm4=tmp3H + + paddd mm2, MMWORD [wk(10)] ; mm2=tmp0L + paddd mm0, MMWORD [wk(11)] ; mm0=tmp0H + paddd mm3,mm5 ; mm3=tmp3L + paddd mm4,mm7 ; mm4=tmp3H + + movq MMWORD [wk(8)], mm2 ; wk(8)=tmp0L + movq MMWORD [wk(9)], mm0 ; wk(9)=tmp0H + + movq mm2,mm1 + movq mm0,mm1 + punpcklwd mm2,mm6 + punpckhwd mm0,mm6 + movq mm1,mm2 + movq mm6,mm0 + pmaddwd mm2,[GOTOFF(ebx,PW_MF050_MF256)] ; mm2=tmp1L + pmaddwd 
mm0,[GOTOFF(ebx,PW_MF050_MF256)] ; mm0=tmp1H + pmaddwd mm1,[GOTOFF(ebx,PW_MF256_F050)] ; mm1=tmp2L + pmaddwd mm6,[GOTOFF(ebx,PW_MF256_F050)] ; mm6=tmp2H + + paddd mm2,mm5 ; mm2=tmp1L + paddd mm0,mm7 ; mm0=tmp1H + paddd mm1, MMWORD [wk(10)] ; mm1=tmp2L + paddd mm6, MMWORD [wk(11)] ; mm6=tmp2H + + movq MMWORD [wk(10)], mm2 ; wk(10)=tmp1L + movq MMWORD [wk(11)], mm0 ; wk(11)=tmp1H + + ; -- Final output stage + + movq mm5, MMWORD [wk(0)] ; mm5=tmp10L + movq mm7, MMWORD [wk(1)] ; mm7=tmp10H + + movq mm2,mm5 + movq mm0,mm7 + paddd mm5,mm3 ; mm5=data0L + paddd mm7,mm4 ; mm7=data0H + psubd mm2,mm3 ; mm2=data7L + psubd mm0,mm4 ; mm0=data7H + + movq mm3,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm3=[PD_DESCALE_P2] + + paddd mm5,mm3 + paddd mm7,mm3 + psrad mm5,DESCALE_P2 + psrad mm7,DESCALE_P2 + paddd mm2,mm3 + paddd mm0,mm3 + psrad mm2,DESCALE_P2 + psrad mm0,DESCALE_P2 + + packssdw mm5,mm7 ; mm5=data0=(00 10 20 30) + packssdw mm2,mm0 ; mm2=data7=(07 17 27 37) + + movq mm4, MMWORD [wk(4)] ; mm4=tmp11L + movq mm3, MMWORD [wk(5)] ; mm3=tmp11H + + movq mm7,mm4 + movq mm0,mm3 + paddd mm4,mm1 ; mm4=data1L + paddd mm3,mm6 ; mm3=data1H + psubd mm7,mm1 ; mm7=data6L + psubd mm0,mm6 ; mm0=data6H + + movq mm1,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm1=[PD_DESCALE_P2] + + paddd mm4,mm1 + paddd mm3,mm1 + psrad mm4,DESCALE_P2 + psrad mm3,DESCALE_P2 + paddd mm7,mm1 + paddd mm0,mm1 + psrad mm7,DESCALE_P2 + psrad mm0,DESCALE_P2 + + packssdw mm4,mm3 ; mm4=data1=(01 11 21 31) + packssdw mm7,mm0 ; mm7=data6=(06 16 26 36) + + packsswb mm5,mm7 ; mm5=(00 10 20 30 06 16 26 36) + packsswb mm4,mm2 ; mm4=(01 11 21 31 07 17 27 37) + + movq mm6, MMWORD [wk(6)] ; mm6=tmp12L + movq mm1, MMWORD [wk(7)] ; mm1=tmp12H + movq mm3, MMWORD [wk(10)] ; mm3=tmp1L + movq mm0, MMWORD [wk(11)] ; mm0=tmp1H + + movq MMWORD [wk(0)], mm5 ; wk(0)=(00 10 20 30 06 16 26 36) + movq MMWORD [wk(1)], mm4 ; wk(1)=(01 11 21 31 07 17 27 37) + + movq mm7,mm6 + movq mm2,mm1 + paddd mm6,mm3 ; mm6=data2L + paddd mm1,mm0 ; mm1=data2H + psubd mm7,mm3 ; 
mm7=data5L + psubd mm2,mm0 ; mm2=data5H + + movq mm5,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm5=[PD_DESCALE_P2] + + paddd mm6,mm5 + paddd mm1,mm5 + psrad mm6,DESCALE_P2 + psrad mm1,DESCALE_P2 + paddd mm7,mm5 + paddd mm2,mm5 + psrad mm7,DESCALE_P2 + psrad mm2,DESCALE_P2 + + packssdw mm6,mm1 ; mm6=data2=(02 12 22 32) + packssdw mm7,mm2 ; mm7=data5=(05 15 25 35) + + movq mm4, MMWORD [wk(2)] ; mm4=tmp13L + movq mm3, MMWORD [wk(3)] ; mm3=tmp13H + movq mm0, MMWORD [wk(8)] ; mm0=tmp0L + movq mm5, MMWORD [wk(9)] ; mm5=tmp0H + + movq mm1,mm4 + movq mm2,mm3 + paddd mm4,mm0 ; mm4=data3L + paddd mm3,mm5 ; mm3=data3H + psubd mm1,mm0 ; mm1=data4L + psubd mm2,mm5 ; mm2=data4H + + movq mm0,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm0=[PD_DESCALE_P2] + + paddd mm4,mm0 + paddd mm3,mm0 + psrad mm4,DESCALE_P2 + psrad mm3,DESCALE_P2 + paddd mm1,mm0 + paddd mm2,mm0 + psrad mm1,DESCALE_P2 + psrad mm2,DESCALE_P2 + + movq mm5,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm5=[PB_CENTERJSAMP] + + packssdw mm4,mm3 ; mm4=data3=(03 13 23 33) + packssdw mm1,mm2 ; mm1=data4=(04 14 24 34) + + movq mm0, MMWORD [wk(0)] ; mm0=(00 10 20 30 06 16 26 36) + movq mm3, MMWORD [wk(1)] ; mm3=(01 11 21 31 07 17 27 37) + + packsswb mm6,mm1 ; mm6=(02 12 22 32 04 14 24 34) + packsswb mm4,mm7 ; mm4=(03 13 23 33 05 15 25 35) + + paddb mm0,mm5 + paddb mm3,mm5 + paddb mm6,mm5 + paddb mm4,mm5 + + movq mm2,mm0 ; transpose coefficients(phase 1) + punpcklbw mm0,mm3 ; mm0=(00 01 10 11 20 21 30 31) + punpckhbw mm2,mm3 ; mm2=(06 07 16 17 26 27 36 37) + movq mm1,mm6 ; transpose coefficients(phase 1) + punpcklbw mm6,mm4 ; mm6=(02 03 12 13 22 23 32 33) + punpckhbw mm1,mm4 ; mm1=(04 05 14 15 24 25 34 35) + + movq mm7,mm0 ; transpose coefficients(phase 2) + punpcklwd mm0,mm6 ; mm0=(00 01 02 03 10 11 12 13) + punpckhwd mm7,mm6 ; mm7=(20 21 22 23 30 31 32 33) + movq mm5,mm1 ; transpose coefficients(phase 2) + punpcklwd mm1,mm2 ; mm1=(04 05 06 07 14 15 16 17) + punpckhwd mm5,mm2 ; mm5=(24 25 26 27 34 35 36 37) + + movq mm3,mm0 ; transpose coefficients(phase 3) + 
punpckldq mm0,mm1 ; mm0=(00 01 02 03 04 05 06 07) + punpckhdq mm3,mm1 ; mm3=(10 11 12 13 14 15 16 17) + movq mm4,mm7 ; transpose coefficients(phase 3) + punpckldq mm7,mm5 ; mm7=(20 21 22 23 24 25 26 27) + punpckhdq mm4,mm5 ; mm4=(30 31 32 33 34 35 36 37) + + pushpic ebx ; save GOT address + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0 + movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm3 + mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm7 + movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4 + + poppic ebx ; restore GOT address + + add esi, byte 4*SIZEOF_JCOEF ; wsptr + add edi, byte 4*SIZEOF_JSAMPROW + dec ecx ; ctr + jnz near .rowloop + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jimmxred.asm b/simd/jimmxred.asm index a2b7103df..5e2483693 100644 --- a/simd/jimmxred.asm +++ b/simd/jimmxred.asm @@ -26,74 +26,74 @@ ; -------------------------------------------------------------------------- -%define CONST_BITS 13 -%define PASS1_BITS 2 +%define CONST_BITS 13 +%define PASS1_BITS 2 -%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1) -%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1) -%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2) -%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2) +%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1) +%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1) +%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2) +%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2) %if CONST_BITS == 13 -F_0_211 equ 1730 ; FIX(0.211164243) -F_0_509 equ 4176 ; FIX(0.509795579) -F_0_601 equ 4926 ; FIX(0.601344887) -F_0_720 equ 5906 ; FIX(0.720959822) -F_0_765 equ 6270 ; FIX(0.765366865) -F_0_850 equ 6967 ; FIX(0.850430095) -F_0_899 equ 7373 ; FIX(0.899976223) -F_1_061 equ 8697 ; FIX(1.061594337) -F_1_272 equ 10426 ; FIX(1.272758580) -F_1_451 equ 11893 ; FIX(1.451774981) -F_1_847 equ 15137 ; FIX(1.847759065) -F_2_172 equ 17799 ; FIX(2.172734803) -F_2_562 equ 20995 ; FIX(2.562915447) -F_3_624 equ 29692 ; FIX(3.624509785) +F_0_211 equ 1730 ; FIX(0.211164243) +F_0_509 equ 4176 ; FIX(0.509795579) +F_0_601 equ 4926 ; FIX(0.601344887) +F_0_720 equ 5906 ; FIX(0.720959822) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_850 equ 6967 ; FIX(0.850430095) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_061 equ 8697 ; FIX(1.061594337) +F_1_272 equ 10426 ; FIX(1.272758580) +F_1_451 equ 11893 ; FIX(1.451774981) +F_1_847 equ 15137 ; FIX(1.847759065) +F_2_172 equ 17799 ; FIX(2.172734803) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_624 equ 29692 ; FIX(3.624509785) %else ; NASM cannot do compile-time arithmetic on floating-point constants. 
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) -F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243) -F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579) -F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887) -F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822) -F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) -F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095) -F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) -F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337) -F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580) -F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981) -F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) -F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803) -F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) -F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785) +F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243) +F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579) +F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887) +F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822) +F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) +F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095) +F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) +F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337) +F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580) +F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981) +F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) +F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803) +F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) +F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785) %endif ; -------------------------------------------------------------------------- - SECTION SEG_CONST 
+ SECTION SEG_CONST - alignz 16 - global EXTN(jconst_idct_red_mmx) + alignz 16 + global EXTN(jconst_idct_red_mmx) EXTN(jconst_idct_red_mmx): -PW_F184_MF076 times 2 dw F_1_847,-F_0_765 -PW_F256_F089 times 2 dw F_2_562, F_0_899 -PW_F106_MF217 times 2 dw F_1_061,-F_2_172 -PW_MF060_MF050 times 2 dw -F_0_601,-F_0_509 -PW_F145_MF021 times 2 dw F_1_451,-F_0_211 -PW_F362_MF127 times 2 dw F_3_624,-F_1_272 -PW_F085_MF072 times 2 dw F_0_850,-F_0_720 -PD_DESCALE_P1_4 times 2 dd 1 << (DESCALE_P1_4-1) -PD_DESCALE_P2_4 times 2 dd 1 << (DESCALE_P2_4-1) -PD_DESCALE_P1_2 times 2 dd 1 << (DESCALE_P1_2-1) -PD_DESCALE_P2_2 times 2 dd 1 << (DESCALE_P2_2-1) -PB_CENTERJSAMP times 8 db CENTERJSAMPLE - - alignz 16 +PW_F184_MF076 times 2 dw F_1_847,-F_0_765 +PW_F256_F089 times 2 dw F_2_562, F_0_899 +PW_F106_MF217 times 2 dw F_1_061,-F_2_172 +PW_MF060_MF050 times 2 dw -F_0_601,-F_0_509 +PW_F145_MF021 times 2 dw F_1_451,-F_0_211 +PW_F362_MF127 times 2 dw F_3_624,-F_1_272 +PW_F085_MF072 times 2 dw F_0_850,-F_0_720 +PD_DESCALE_P1_4 times 2 dd 1 << (DESCALE_P1_4-1) +PD_DESCALE_P2_4 times 2 dd 1 << (DESCALE_P2_4-1) +PD_DESCALE_P1_2 times 2 dd 1 << (DESCALE_P1_2-1) +PD_DESCALE_P2_2 times 2 dd 1 << (DESCALE_P2_2-1) +PB_CENTERJSAMP times 8 db CENTERJSAMPLE + + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 ; ; Perform dequantization and inverse DCT on one block of coefficients, ; producing a reduced-size 4x4 output block. 
@@ -103,388 +103,388 @@ PB_CENTERJSAMP times 8 db CENTERJSAMPLE ; JSAMPARRAY output_buf, JDIMENSION output_col) ; -%define dct_table(b) (b)+8 ; void * dct_table -%define coef_block(b) (b)+12 ; JCOEFPTR coef_block -%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf -%define output_col(b) (b)+20 ; JDIMENSION output_col +%define dct_table(b) (b)+8 ; void * dct_table +%define coef_block(b) (b)+12 ; JCOEFPTR coef_block +%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf +%define output_col(b) (b)+20 ; JDIMENSION output_col -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] -%define WK_NUM 2 -%define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF - ; JCOEF workspace[DCTSIZE2] +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM] +%define WK_NUM 2 +%define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF + ; JCOEF workspace[DCTSIZE2] - align 16 - global EXTN(jsimd_idct_4x4_mmx) + align 16 + global EXTN(jsimd_idct_4x4_mmx) EXTN(jsimd_idct_4x4_mmx): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [workspace] - pushpic ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process columns from input, store into work array. 
- -; mov eax, [original_ebp] - mov edx, POINTER [dct_table(eax)] ; quantptr - mov esi, JCOEFPTR [coef_block(eax)] ; inptr - lea edi, [workspace] ; JCOEF * wsptr - mov ecx, DCTSIZE/4 ; ctr - alignx 16,7 + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [workspace] + pushpic ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input, store into work array. + +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + lea edi, [workspace] ; JCOEF * wsptr + mov ecx, DCTSIZE/4 ; ctr + alignx 16,7 .columnloop: %ifndef NO_ZERO_COLUMN_TEST_4X4_MMX - mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] - jnz short .columnDCT - - movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] - por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] - por mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] - por mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] - por mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] - por mm0,mm1 - packsswb mm0,mm0 - movd eax,mm0 - test eax,eax - jnz short .columnDCT - - ; -- AC terms all zero - - movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] - pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - - psllw mm0,PASS1_BITS - - movq mm2,mm0 ; mm0=in0=(00 01 02 03) - punpcklwd mm0,mm0 ; mm0=(00 00 01 01) - punpckhwd mm2,mm2 ; mm2=(02 02 03 03) - - movq mm1,mm0 - punpckldq mm0,mm0 ; mm0=(00 00 00 00) - punpckhdq mm1,mm1 ; mm1=(01 01 01 01) - movq mm3,mm2 - punpckldq mm2,mm2 ; mm2=(02 02 02 02) - punpckhdq mm3,mm3 ; mm3=(03 03 03 03) - - movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 - movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1 - movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2 
- movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3 - jmp near .nextcolumn - alignx 16,7 + mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz short .columnDCT + + movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por mm0,mm1 + packsswb mm0,mm0 + movd eax,mm0 + test eax,eax + jnz short .columnDCT + + ; -- AC terms all zero + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + psllw mm0,PASS1_BITS + + movq mm2,mm0 ; mm0=in0=(00 01 02 03) + punpcklwd mm0,mm0 ; mm0=(00 00 01 01) + punpckhwd mm2,mm2 ; mm2=(02 02 03 03) + + movq mm1,mm0 + punpckldq mm0,mm0 ; mm0=(00 00 00 00) + punpckhdq mm1,mm1 ; mm1=(01 01 01 01) + movq mm3,mm2 + punpckldq mm2,mm2 ; mm2=(02 02 02 02) + punpckhdq mm3,mm3 ; mm3=(03 03 03 03) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2 + movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3 + jmp near .nextcolumn + alignx 16,7 %endif .columnDCT: - ; -- Odd part - - movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] - pmullw mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] - movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] - pmullw mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - - movq mm4,mm0 - movq mm5,mm0 - punpcklwd mm4,mm1 - punpckhwd mm5,mm1 - movq mm0,mm4 - movq mm1,mm5 - pmaddwd mm4,[GOTOFF(ebx,PW_F256_F089)] ; mm4=(tmp2L) - pmaddwd 
mm5,[GOTOFF(ebx,PW_F256_F089)] ; mm5=(tmp2H) - pmaddwd mm0,[GOTOFF(ebx,PW_F106_MF217)] ; mm0=(tmp0L) - pmaddwd mm1,[GOTOFF(ebx,PW_F106_MF217)] ; mm1=(tmp0H) - - movq mm6,mm2 - movq mm7,mm2 - punpcklwd mm6,mm3 - punpckhwd mm7,mm3 - movq mm2,mm6 - movq mm3,mm7 - pmaddwd mm6,[GOTOFF(ebx,PW_MF060_MF050)] ; mm6=(tmp2L) - pmaddwd mm7,[GOTOFF(ebx,PW_MF060_MF050)] ; mm7=(tmp2H) - pmaddwd mm2,[GOTOFF(ebx,PW_F145_MF021)] ; mm2=(tmp0L) - pmaddwd mm3,[GOTOFF(ebx,PW_F145_MF021)] ; mm3=(tmp0H) - - paddd mm6,mm4 ; mm6=tmp2L - paddd mm7,mm5 ; mm7=tmp2H - paddd mm2,mm0 ; mm2=tmp0L - paddd mm3,mm1 ; mm3=tmp0H - - movq MMWORD [wk(0)], mm2 ; wk(0)=tmp0L - movq MMWORD [wk(1)], mm3 ; wk(1)=tmp0H - - ; -- Even part - - movq mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] - movq mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] - movq mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] - pmullw mm4, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw mm5, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw mm0, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - - pxor mm1,mm1 - pxor mm2,mm2 - punpcklwd mm1,mm4 ; mm1=tmp0L - punpckhwd mm2,mm4 ; mm2=tmp0H - psrad mm1,(16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1 - psrad mm2,(16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1 - - movq mm3,mm5 ; mm5=in2=z2 - punpcklwd mm5,mm0 ; mm0=in6=z3 - punpckhwd mm3,mm0 - pmaddwd mm5,[GOTOFF(ebx,PW_F184_MF076)] ; mm5=tmp2L - pmaddwd mm3,[GOTOFF(ebx,PW_F184_MF076)] ; mm3=tmp2H - - movq mm4,mm1 - movq mm0,mm2 - paddd mm1,mm5 ; mm1=tmp10L - paddd mm2,mm3 ; mm2=tmp10H - psubd mm4,mm5 ; mm4=tmp12L - psubd mm0,mm3 ; mm0=tmp12H - - ; -- Final output stage - - movq mm5,mm1 - movq mm3,mm2 - paddd mm1,mm6 ; mm1=data0L - paddd mm2,mm7 ; mm2=data0H - psubd mm5,mm6 ; mm5=data3L - psubd mm3,mm7 ; mm3=data3H - - movq mm6,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; mm6=[PD_DESCALE_P1_4] - - paddd mm1,mm6 - paddd mm2,mm6 - psrad mm1,DESCALE_P1_4 - psrad mm2,DESCALE_P1_4 - paddd mm5,mm6 - paddd mm3,mm6 - psrad 
mm5,DESCALE_P1_4 - psrad mm3,DESCALE_P1_4 - - packssdw mm1,mm2 ; mm1=data0=(00 01 02 03) - packssdw mm5,mm3 ; mm5=data3=(30 31 32 33) - - movq mm7, MMWORD [wk(0)] ; mm7=tmp0L - movq mm6, MMWORD [wk(1)] ; mm6=tmp0H - - movq mm2,mm4 - movq mm3,mm0 - paddd mm4,mm7 ; mm4=data1L - paddd mm0,mm6 ; mm0=data1H - psubd mm2,mm7 ; mm2=data2L - psubd mm3,mm6 ; mm3=data2H - - movq mm7,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; mm7=[PD_DESCALE_P1_4] - - paddd mm4,mm7 - paddd mm0,mm7 - psrad mm4,DESCALE_P1_4 - psrad mm0,DESCALE_P1_4 - paddd mm2,mm7 - paddd mm3,mm7 - psrad mm2,DESCALE_P1_4 - psrad mm3,DESCALE_P1_4 - - packssdw mm4,mm0 ; mm4=data1=(10 11 12 13) - packssdw mm2,mm3 ; mm2=data2=(20 21 22 23) - - movq mm6,mm1 ; transpose coefficients(phase 1) - punpcklwd mm1,mm4 ; mm1=(00 10 01 11) - punpckhwd mm6,mm4 ; mm6=(02 12 03 13) - movq mm7,mm2 ; transpose coefficients(phase 1) - punpcklwd mm2,mm5 ; mm2=(20 30 21 31) - punpckhwd mm7,mm5 ; mm7=(22 32 23 33) - - movq mm0,mm1 ; transpose coefficients(phase 2) - punpckldq mm1,mm2 ; mm1=(00 10 20 30) - punpckhdq mm0,mm2 ; mm0=(01 11 21 31) - movq mm3,mm6 ; transpose coefficients(phase 2) - punpckldq mm6,mm7 ; mm6=(02 12 22 32) - punpckhdq mm3,mm7 ; mm3=(03 13 23 33) - - movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm1 - movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0 - movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm6 - movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3 + ; -- Odd part + + movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + movq mm4,mm0 + movq mm5,mm0 + punpcklwd mm4,mm1 + punpckhwd mm5,mm1 + movq mm0,mm4 + movq mm1,mm5 + pmaddwd 
mm4,[GOTOFF(ebx,PW_F256_F089)] ; mm4=(tmp2L) + pmaddwd mm5,[GOTOFF(ebx,PW_F256_F089)] ; mm5=(tmp2H) + pmaddwd mm0,[GOTOFF(ebx,PW_F106_MF217)] ; mm0=(tmp0L) + pmaddwd mm1,[GOTOFF(ebx,PW_F106_MF217)] ; mm1=(tmp0H) + + movq mm6,mm2 + movq mm7,mm2 + punpcklwd mm6,mm3 + punpckhwd mm7,mm3 + movq mm2,mm6 + movq mm3,mm7 + pmaddwd mm6,[GOTOFF(ebx,PW_MF060_MF050)] ; mm6=(tmp2L) + pmaddwd mm7,[GOTOFF(ebx,PW_MF060_MF050)] ; mm7=(tmp2H) + pmaddwd mm2,[GOTOFF(ebx,PW_F145_MF021)] ; mm2=(tmp0L) + pmaddwd mm3,[GOTOFF(ebx,PW_F145_MF021)] ; mm3=(tmp0H) + + paddd mm6,mm4 ; mm6=tmp2L + paddd mm7,mm5 ; mm7=tmp2H + paddd mm2,mm0 ; mm2=tmp0L + paddd mm3,mm1 ; mm3=tmp0H + + movq MMWORD [wk(0)], mm2 ; wk(0)=tmp0L + movq MMWORD [wk(1)], mm3 ; wk(1)=tmp0H + + ; -- Even part + + movq mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movq mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + pmullw mm4, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm5, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm0, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + pxor mm1,mm1 + pxor mm2,mm2 + punpcklwd mm1,mm4 ; mm1=tmp0L + punpckhwd mm2,mm4 ; mm2=tmp0H + psrad mm1,(16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1 + psrad mm2,(16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1 + + movq mm3,mm5 ; mm5=in2=z2 + punpcklwd mm5,mm0 ; mm0=in6=z3 + punpckhwd mm3,mm0 + pmaddwd mm5,[GOTOFF(ebx,PW_F184_MF076)] ; mm5=tmp2L + pmaddwd mm3,[GOTOFF(ebx,PW_F184_MF076)] ; mm3=tmp2H + + movq mm4,mm1 + movq mm0,mm2 + paddd mm1,mm5 ; mm1=tmp10L + paddd mm2,mm3 ; mm2=tmp10H + psubd mm4,mm5 ; mm4=tmp12L + psubd mm0,mm3 ; mm0=tmp12H + + ; -- Final output stage + + movq mm5,mm1 + movq mm3,mm2 + paddd mm1,mm6 ; mm1=data0L + paddd mm2,mm7 ; mm2=data0H + psubd mm5,mm6 ; mm5=data3L + psubd mm3,mm7 ; mm3=data3H + + movq mm6,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; mm6=[PD_DESCALE_P1_4] + + paddd mm1,mm6 + paddd mm2,mm6 + psrad mm1,DESCALE_P1_4 + psrad 
mm2,DESCALE_P1_4 + paddd mm5,mm6 + paddd mm3,mm6 + psrad mm5,DESCALE_P1_4 + psrad mm3,DESCALE_P1_4 + + packssdw mm1,mm2 ; mm1=data0=(00 01 02 03) + packssdw mm5,mm3 ; mm5=data3=(30 31 32 33) + + movq mm7, MMWORD [wk(0)] ; mm7=tmp0L + movq mm6, MMWORD [wk(1)] ; mm6=tmp0H + + movq mm2,mm4 + movq mm3,mm0 + paddd mm4,mm7 ; mm4=data1L + paddd mm0,mm6 ; mm0=data1H + psubd mm2,mm7 ; mm2=data2L + psubd mm3,mm6 ; mm3=data2H + + movq mm7,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; mm7=[PD_DESCALE_P1_4] + + paddd mm4,mm7 + paddd mm0,mm7 + psrad mm4,DESCALE_P1_4 + psrad mm0,DESCALE_P1_4 + paddd mm2,mm7 + paddd mm3,mm7 + psrad mm2,DESCALE_P1_4 + psrad mm3,DESCALE_P1_4 + + packssdw mm4,mm0 ; mm4=data1=(10 11 12 13) + packssdw mm2,mm3 ; mm2=data2=(20 21 22 23) + + movq mm6,mm1 ; transpose coefficients(phase 1) + punpcklwd mm1,mm4 ; mm1=(00 10 01 11) + punpckhwd mm6,mm4 ; mm6=(02 12 03 13) + movq mm7,mm2 ; transpose coefficients(phase 1) + punpcklwd mm2,mm5 ; mm2=(20 30 21 31) + punpckhwd mm7,mm5 ; mm7=(22 32 23 33) + + movq mm0,mm1 ; transpose coefficients(phase 2) + punpckldq mm1,mm2 ; mm1=(00 10 20 30) + punpckhdq mm0,mm2 ; mm0=(01 11 21 31) + movq mm3,mm6 ; transpose coefficients(phase 2) + punpckldq mm6,mm7 ; mm6=(02 12 22 32) + punpckhdq mm3,mm7 ; mm3=(03 13 23 33) + + movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm1 + movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0 + movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm6 + movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3 .nextcolumn: - add esi, byte 4*SIZEOF_JCOEF ; coef_block - add edx, byte 4*SIZEOF_ISLOW_MULT_TYPE ; quantptr - add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr - dec ecx ; ctr - jnz near .columnloop - - ; ---- Pass 2: process rows from work array, store into output array. 
- - mov eax, [original_ebp] - lea esi, [workspace] ; JCOEF * wsptr - mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) - mov eax, JDIMENSION [output_col(eax)] - - ; -- Odd part - - movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] - movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] - movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] - - movq mm4,mm0 - movq mm5,mm0 - punpcklwd mm4,mm1 - punpckhwd mm5,mm1 - movq mm0,mm4 - movq mm1,mm5 - pmaddwd mm4,[GOTOFF(ebx,PW_F256_F089)] ; mm4=(tmp2L) - pmaddwd mm5,[GOTOFF(ebx,PW_F256_F089)] ; mm5=(tmp2H) - pmaddwd mm0,[GOTOFF(ebx,PW_F106_MF217)] ; mm0=(tmp0L) - pmaddwd mm1,[GOTOFF(ebx,PW_F106_MF217)] ; mm1=(tmp0H) - - movq mm6,mm2 - movq mm7,mm2 - punpcklwd mm6,mm3 - punpckhwd mm7,mm3 - movq mm2,mm6 - movq mm3,mm7 - pmaddwd mm6,[GOTOFF(ebx,PW_MF060_MF050)] ; mm6=(tmp2L) - pmaddwd mm7,[GOTOFF(ebx,PW_MF060_MF050)] ; mm7=(tmp2H) - pmaddwd mm2,[GOTOFF(ebx,PW_F145_MF021)] ; mm2=(tmp0L) - pmaddwd mm3,[GOTOFF(ebx,PW_F145_MF021)] ; mm3=(tmp0H) - - paddd mm6,mm4 ; mm6=tmp2L - paddd mm7,mm5 ; mm7=tmp2H - paddd mm2,mm0 ; mm2=tmp0L - paddd mm3,mm1 ; mm3=tmp0H - - movq MMWORD [wk(0)], mm2 ; wk(0)=tmp0L - movq MMWORD [wk(1)], mm3 ; wk(1)=tmp0H - - ; -- Even part - - movq mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] - movq mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] - movq mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] - - pxor mm1,mm1 - pxor mm2,mm2 - punpcklwd mm1,mm4 ; mm1=tmp0L - punpckhwd mm2,mm4 ; mm2=tmp0H - psrad mm1,(16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1 - psrad mm2,(16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1 - - movq mm3,mm5 ; mm5=in2=z2 - punpcklwd mm5,mm0 ; mm0=in6=z3 - punpckhwd mm3,mm0 - pmaddwd mm5,[GOTOFF(ebx,PW_F184_MF076)] ; mm5=tmp2L - pmaddwd mm3,[GOTOFF(ebx,PW_F184_MF076)] ; mm3=tmp2H - - movq mm4,mm1 - movq mm0,mm2 - paddd mm1,mm5 ; mm1=tmp10L - paddd mm2,mm3 ; mm2=tmp10H - psubd mm4,mm5 ; mm4=tmp12L - psubd mm0,mm3 ; mm0=tmp12H - - ; -- Final 
output stage - - movq mm5,mm1 - movq mm3,mm2 - paddd mm1,mm6 ; mm1=data0L - paddd mm2,mm7 ; mm2=data0H - psubd mm5,mm6 ; mm5=data3L - psubd mm3,mm7 ; mm3=data3H - - movq mm6,[GOTOFF(ebx,PD_DESCALE_P2_4)] ; mm6=[PD_DESCALE_P2_4] - - paddd mm1,mm6 - paddd mm2,mm6 - psrad mm1,DESCALE_P2_4 - psrad mm2,DESCALE_P2_4 - paddd mm5,mm6 - paddd mm3,mm6 - psrad mm5,DESCALE_P2_4 - psrad mm3,DESCALE_P2_4 - - packssdw mm1,mm2 ; mm1=data0=(00 10 20 30) - packssdw mm5,mm3 ; mm5=data3=(03 13 23 33) - - movq mm7, MMWORD [wk(0)] ; mm7=tmp0L - movq mm6, MMWORD [wk(1)] ; mm6=tmp0H - - movq mm2,mm4 - movq mm3,mm0 - paddd mm4,mm7 ; mm4=data1L - paddd mm0,mm6 ; mm0=data1H - psubd mm2,mm7 ; mm2=data2L - psubd mm3,mm6 ; mm3=data2H - - movq mm7,[GOTOFF(ebx,PD_DESCALE_P2_4)] ; mm7=[PD_DESCALE_P2_4] - - paddd mm4,mm7 - paddd mm0,mm7 - psrad mm4,DESCALE_P2_4 - psrad mm0,DESCALE_P2_4 - paddd mm2,mm7 - paddd mm3,mm7 - psrad mm2,DESCALE_P2_4 - psrad mm3,DESCALE_P2_4 - - packssdw mm4,mm0 ; mm4=data1=(01 11 21 31) - packssdw mm2,mm3 ; mm2=data2=(02 12 22 32) - - movq mm6,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm6=[PB_CENTERJSAMP] - - packsswb mm1,mm2 ; mm1=(00 10 20 30 02 12 22 32) - packsswb mm4,mm5 ; mm4=(01 11 21 31 03 13 23 33) - paddb mm1,mm6 - paddb mm4,mm6 - - movq mm7,mm1 ; transpose coefficients(phase 1) - punpcklbw mm1,mm4 ; mm1=(00 01 10 11 20 21 30 31) - punpckhbw mm7,mm4 ; mm7=(02 03 12 13 22 23 32 33) - - movq mm0,mm1 ; transpose coefficients(phase 2) - punpcklwd mm1,mm7 ; mm1=(00 01 02 03 10 11 12 13) - punpckhwd mm0,mm7 ; mm0=(20 21 22 23 30 31 32 33) - - mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW] - movd DWORD [edx+eax*SIZEOF_JSAMPLE], mm1 - movd DWORD [esi+eax*SIZEOF_JSAMPLE], mm0 - - psrlq mm1,4*BYTE_BIT - psrlq mm0,4*BYTE_BIT - - mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] - movd DWORD [edx+eax*SIZEOF_JSAMPLE], mm1 - movd DWORD [esi+eax*SIZEOF_JSAMPLE], mm0 - - emms ; empty MMX state - - pop edi - pop esi 
-; pop edx ; need not be preserved -; pop ecx ; need not be preserved - poppic ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + add esi, byte 4*SIZEOF_JCOEF ; coef_block + add edx, byte 4*SIZEOF_ISLOW_MULT_TYPE ; quantptr + add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr + dec ecx ; ctr + jnz near .columnloop + + ; ---- Pass 2: process rows from work array, store into output array. + + mov eax, [original_ebp] + lea esi, [workspace] ; JCOEF * wsptr + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + + ; -- Odd part + + movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + + movq mm4,mm0 + movq mm5,mm0 + punpcklwd mm4,mm1 + punpckhwd mm5,mm1 + movq mm0,mm4 + movq mm1,mm5 + pmaddwd mm4,[GOTOFF(ebx,PW_F256_F089)] ; mm4=(tmp2L) + pmaddwd mm5,[GOTOFF(ebx,PW_F256_F089)] ; mm5=(tmp2H) + pmaddwd mm0,[GOTOFF(ebx,PW_F106_MF217)] ; mm0=(tmp0L) + pmaddwd mm1,[GOTOFF(ebx,PW_F106_MF217)] ; mm1=(tmp0H) + + movq mm6,mm2 + movq mm7,mm2 + punpcklwd mm6,mm3 + punpckhwd mm7,mm3 + movq mm2,mm6 + movq mm3,mm7 + pmaddwd mm6,[GOTOFF(ebx,PW_MF060_MF050)] ; mm6=(tmp2L) + pmaddwd mm7,[GOTOFF(ebx,PW_MF060_MF050)] ; mm7=(tmp2H) + pmaddwd mm2,[GOTOFF(ebx,PW_F145_MF021)] ; mm2=(tmp0L) + pmaddwd mm3,[GOTOFF(ebx,PW_F145_MF021)] ; mm3=(tmp0H) + + paddd mm6,mm4 ; mm6=tmp2L + paddd mm7,mm5 ; mm7=tmp2H + paddd mm2,mm0 ; mm2=tmp0L + paddd mm3,mm1 ; mm3=tmp0H + + movq MMWORD [wk(0)], mm2 ; wk(0)=tmp0L + movq MMWORD [wk(1)], mm3 ; wk(1)=tmp0H + + ; -- Even part + + movq mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movq mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + + pxor mm1,mm1 + pxor mm2,mm2 + punpcklwd mm1,mm4 ; mm1=tmp0L + punpckhwd mm2,mm4 ; mm2=tmp0H + psrad mm1,(16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1 + 
psrad mm2,(16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1 + + movq mm3,mm5 ; mm5=in2=z2 + punpcklwd mm5,mm0 ; mm0=in6=z3 + punpckhwd mm3,mm0 + pmaddwd mm5,[GOTOFF(ebx,PW_F184_MF076)] ; mm5=tmp2L + pmaddwd mm3,[GOTOFF(ebx,PW_F184_MF076)] ; mm3=tmp2H + + movq mm4,mm1 + movq mm0,mm2 + paddd mm1,mm5 ; mm1=tmp10L + paddd mm2,mm3 ; mm2=tmp10H + psubd mm4,mm5 ; mm4=tmp12L + psubd mm0,mm3 ; mm0=tmp12H + + ; -- Final output stage + + movq mm5,mm1 + movq mm3,mm2 + paddd mm1,mm6 ; mm1=data0L + paddd mm2,mm7 ; mm2=data0H + psubd mm5,mm6 ; mm5=data3L + psubd mm3,mm7 ; mm3=data3H + + movq mm6,[GOTOFF(ebx,PD_DESCALE_P2_4)] ; mm6=[PD_DESCALE_P2_4] + + paddd mm1,mm6 + paddd mm2,mm6 + psrad mm1,DESCALE_P2_4 + psrad mm2,DESCALE_P2_4 + paddd mm5,mm6 + paddd mm3,mm6 + psrad mm5,DESCALE_P2_4 + psrad mm3,DESCALE_P2_4 + + packssdw mm1,mm2 ; mm1=data0=(00 10 20 30) + packssdw mm5,mm3 ; mm5=data3=(03 13 23 33) + + movq mm7, MMWORD [wk(0)] ; mm7=tmp0L + movq mm6, MMWORD [wk(1)] ; mm6=tmp0H + + movq mm2,mm4 + movq mm3,mm0 + paddd mm4,mm7 ; mm4=data1L + paddd mm0,mm6 ; mm0=data1H + psubd mm2,mm7 ; mm2=data2L + psubd mm3,mm6 ; mm3=data2H + + movq mm7,[GOTOFF(ebx,PD_DESCALE_P2_4)] ; mm7=[PD_DESCALE_P2_4] + + paddd mm4,mm7 + paddd mm0,mm7 + psrad mm4,DESCALE_P2_4 + psrad mm0,DESCALE_P2_4 + paddd mm2,mm7 + paddd mm3,mm7 + psrad mm2,DESCALE_P2_4 + psrad mm3,DESCALE_P2_4 + + packssdw mm4,mm0 ; mm4=data1=(01 11 21 31) + packssdw mm2,mm3 ; mm2=data2=(02 12 22 32) + + movq mm6,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm6=[PB_CENTERJSAMP] + + packsswb mm1,mm2 ; mm1=(00 10 20 30 02 12 22 32) + packsswb mm4,mm5 ; mm4=(01 11 21 31 03 13 23 33) + paddb mm1,mm6 + paddb mm4,mm6 + + movq mm7,mm1 ; transpose coefficients(phase 1) + punpcklbw mm1,mm4 ; mm1=(00 01 10 11 20 21 30 31) + punpckhbw mm7,mm4 ; mm7=(02 03 12 13 22 23 32 33) + + movq mm0,mm1 ; transpose coefficients(phase 2) + punpcklwd mm1,mm7 ; mm1=(00 01 02 03 10 11 12 13) + punpckhwd mm0,mm7 ; mm0=(20 21 22 23 30 31 32 33) + + mov edx, JSAMPROW 
[edi+0*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + movd DWORD [edx+eax*SIZEOF_JSAMPLE], mm1 + movd DWORD [esi+eax*SIZEOF_JSAMPLE], mm0 + + psrlq mm1,4*BYTE_BIT + psrlq mm0,4*BYTE_BIT + + mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movd DWORD [edx+eax*SIZEOF_JSAMPLE], mm1 + movd DWORD [esi+eax*SIZEOF_JSAMPLE], mm0 + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + poppic ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret ; -------------------------------------------------------------------------- @@ -497,210 +497,210 @@ EXTN(jsimd_idct_4x4_mmx): ; JSAMPARRAY output_buf, JDIMENSION output_col) ; -%define dct_table(b) (b)+8 ; void * dct_table -%define coef_block(b) (b)+12 ; JCOEFPTR coef_block -%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf -%define output_col(b) (b)+20 ; JDIMENSION output_col +%define dct_table(b) (b)+8 ; void * dct_table +%define coef_block(b) (b)+12 ; JCOEFPTR coef_block +%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf +%define output_col(b) (b)+20 ; JDIMENSION output_col - align 16 - global EXTN(jsimd_idct_2x2_mmx) + align 16 + global EXTN(jsimd_idct_2x2_mmx) EXTN(jsimd_idct_2x2_mmx): - push ebp - mov ebp,esp - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process columns from input. 
- - mov edx, POINTER [dct_table(ebp)] ; quantptr - mov esi, JCOEFPTR [coef_block(ebp)] ; inptr - - ; | input: | result: | - ; | 00 01 ** 03 ** 05 ** 07 | | - ; | 10 11 ** 13 ** 15 ** 17 | | - ; | ** ** ** ** ** ** ** ** | | - ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 | - ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 | - ; | 50 51 ** 53 ** 55 ** 57 | | - ; | ** ** ** ** ** ** ** ** | | - ; | 70 71 ** 73 ** 75 ** 77 | | - - ; -- Odd part - - movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] - pmullw mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] - movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] - pmullw mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - - ; mm0=(10 11 ** 13), mm1=(30 31 ** 33) - ; mm2=(50 51 ** 53), mm3=(70 71 ** 73) - - pcmpeqd mm7,mm7 - pslld mm7,WORD_BIT ; mm7={0x0000 0xFFFF 0x0000 0xFFFF} - - movq mm4,mm0 ; mm4=(10 11 ** 13) - movq mm5,mm2 ; mm5=(50 51 ** 53) - punpcklwd mm4,mm1 ; mm4=(10 30 11 31) - punpcklwd mm5,mm3 ; mm5=(50 70 51 71) - pmaddwd mm4,[GOTOFF(ebx,PW_F362_MF127)] - pmaddwd mm5,[GOTOFF(ebx,PW_F085_MF072)] - - psrld mm0,WORD_BIT ; mm0=(11 -- 13 --) - pand mm1,mm7 ; mm1=(-- 31 -- 33) - psrld mm2,WORD_BIT ; mm2=(51 -- 53 --) - pand mm3,mm7 ; mm3=(-- 71 -- 73) - por mm0,mm1 ; mm0=(11 31 13 33) - por mm2,mm3 ; mm2=(51 71 53 73) - pmaddwd mm0,[GOTOFF(ebx,PW_F362_MF127)] - pmaddwd mm2,[GOTOFF(ebx,PW_F085_MF072)] - - paddd mm4,mm5 ; mm4=tmp0[col0 col1] - - movq mm6, MMWORD [MMBLOCK(1,1,esi,SIZEOF_JCOEF)] - movq mm1, MMWORD [MMBLOCK(3,1,esi,SIZEOF_JCOEF)] - pmullw mm6, MMWORD [MMBLOCK(1,1,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw mm1, MMWORD [MMBLOCK(3,1,edx,SIZEOF_ISLOW_MULT_TYPE)] - movq mm3, MMWORD [MMBLOCK(5,1,esi,SIZEOF_JCOEF)] - movq mm5, MMWORD [MMBLOCK(7,1,esi,SIZEOF_JCOEF)] - pmullw mm3, MMWORD 
[MMBLOCK(5,1,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw mm5, MMWORD [MMBLOCK(7,1,edx,SIZEOF_ISLOW_MULT_TYPE)] - - ; mm6=(** 15 ** 17), mm1=(** 35 ** 37) - ; mm3=(** 55 ** 57), mm5=(** 75 ** 77) - - psrld mm6,WORD_BIT ; mm6=(15 -- 17 --) - pand mm1,mm7 ; mm1=(-- 35 -- 37) - psrld mm3,WORD_BIT ; mm3=(55 -- 57 --) - pand mm5,mm7 ; mm5=(-- 75 -- 77) - por mm6,mm1 ; mm6=(15 35 17 37) - por mm3,mm5 ; mm3=(55 75 57 77) - pmaddwd mm6,[GOTOFF(ebx,PW_F362_MF127)] - pmaddwd mm3,[GOTOFF(ebx,PW_F085_MF072)] - - paddd mm0,mm2 ; mm0=tmp0[col1 col3] - paddd mm6,mm3 ; mm6=tmp0[col5 col7] - - ; -- Even part - - movq mm1, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] - movq mm5, MMWORD [MMBLOCK(0,1,esi,SIZEOF_JCOEF)] - pmullw mm1, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw mm5, MMWORD [MMBLOCK(0,1,edx,SIZEOF_ISLOW_MULT_TYPE)] - - ; mm1=(00 01 ** 03), mm5=(** 05 ** 07) - - movq mm2,mm1 ; mm2=(00 01 ** 03) - pslld mm1,WORD_BIT ; mm1=(-- 00 -- **) - psrad mm1,(WORD_BIT-CONST_BITS-2) ; mm1=tmp10[col0 ****] - - pand mm2,mm7 ; mm2=(-- 01 -- 03) - pand mm5,mm7 ; mm5=(-- 05 -- 07) - psrad mm2,(WORD_BIT-CONST_BITS-2) ; mm2=tmp10[col1 col3] - psrad mm5,(WORD_BIT-CONST_BITS-2) ; mm5=tmp10[col5 col7] - - ; -- Final output stage - - movq mm3,mm1 - paddd mm1,mm4 ; mm1=data0[col0 ****]=(A0 **) - psubd mm3,mm4 ; mm3=data1[col0 ****]=(B0 **) - punpckldq mm1,mm3 ; mm1=(A0 B0) - - movq mm7,[GOTOFF(ebx,PD_DESCALE_P1_2)] ; mm7=[PD_DESCALE_P1_2] - - movq mm4,mm2 - movq mm3,mm5 - paddd mm2,mm0 ; mm2=data0[col1 col3]=(A1 A3) - paddd mm5,mm6 ; mm5=data0[col5 col7]=(A5 A7) - psubd mm4,mm0 ; mm4=data1[col1 col3]=(B1 B3) - psubd mm3,mm6 ; mm3=data1[col5 col7]=(B5 B7) - - paddd mm1,mm7 - psrad mm1,DESCALE_P1_2 - - paddd mm2,mm7 - paddd mm5,mm7 - psrad mm2,DESCALE_P1_2 - psrad mm5,DESCALE_P1_2 - paddd mm4,mm7 - paddd mm3,mm7 - psrad mm4,DESCALE_P1_2 - psrad mm3,DESCALE_P1_2 - - ; ---- Pass 2: process rows, store into output array. 
- - mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *) - mov eax, JDIMENSION [output_col(ebp)] - - ; | input:| result:| - ; | A0 B0 | | - ; | A1 B1 | C0 C1 | - ; | A3 B3 | D0 D1 | - ; | A5 B5 | | - ; | A7 B7 | | - - ; -- Odd part - - packssdw mm2,mm4 ; mm2=(A1 A3 B1 B3) - packssdw mm5,mm3 ; mm5=(A5 A7 B5 B7) - pmaddwd mm2,[GOTOFF(ebx,PW_F362_MF127)] - pmaddwd mm5,[GOTOFF(ebx,PW_F085_MF072)] + push ebp + mov ebp,esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input. + + mov edx, POINTER [dct_table(ebp)] ; quantptr + mov esi, JCOEFPTR [coef_block(ebp)] ; inptr + + ; | input: | result: | + ; | 00 01 ** 03 ** 05 ** 07 | | + ; | 10 11 ** 13 ** 15 ** 17 | | + ; | ** ** ** ** ** ** ** ** | | + ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 | + ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 | + ; | 50 51 ** 53 ** 55 ** 57 | | + ; | ** ** ** ** ** ** ** ** | | + ; | 70 71 ** 73 ** 75 ** 77 | | + + ; -- Odd part + + movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + ; mm0=(10 11 ** 13), mm1=(30 31 ** 33) + ; mm2=(50 51 ** 53), mm3=(70 71 ** 73) + + pcmpeqd mm7,mm7 + pslld mm7,WORD_BIT ; mm7={0x0000 0xFFFF 0x0000 0xFFFF} + + movq mm4,mm0 ; mm4=(10 11 ** 13) + movq mm5,mm2 ; mm5=(50 51 ** 53) + punpcklwd mm4,mm1 ; mm4=(10 30 11 31) + punpcklwd mm5,mm3 ; mm5=(50 70 51 71) + pmaddwd mm4,[GOTOFF(ebx,PW_F362_MF127)] + pmaddwd mm5,[GOTOFF(ebx,PW_F085_MF072)] + + psrld mm0,WORD_BIT ; mm0=(11 -- 13 --) + pand mm1,mm7 ; mm1=(-- 31 -- 33) + psrld 
mm2,WORD_BIT ; mm2=(51 -- 53 --) + pand mm3,mm7 ; mm3=(-- 71 -- 73) + por mm0,mm1 ; mm0=(11 31 13 33) + por mm2,mm3 ; mm2=(51 71 53 73) + pmaddwd mm0,[GOTOFF(ebx,PW_F362_MF127)] + pmaddwd mm2,[GOTOFF(ebx,PW_F085_MF072)] + + paddd mm4,mm5 ; mm4=tmp0[col0 col1] + + movq mm6, MMWORD [MMBLOCK(1,1,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(3,1,esi,SIZEOF_JCOEF)] + pmullw mm6, MMWORD [MMBLOCK(1,1,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm1, MMWORD [MMBLOCK(3,1,edx,SIZEOF_ISLOW_MULT_TYPE)] + movq mm3, MMWORD [MMBLOCK(5,1,esi,SIZEOF_JCOEF)] + movq mm5, MMWORD [MMBLOCK(7,1,esi,SIZEOF_JCOEF)] + pmullw mm3, MMWORD [MMBLOCK(5,1,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm5, MMWORD [MMBLOCK(7,1,edx,SIZEOF_ISLOW_MULT_TYPE)] + + ; mm6=(** 15 ** 17), mm1=(** 35 ** 37) + ; mm3=(** 55 ** 57), mm5=(** 75 ** 77) + + psrld mm6,WORD_BIT ; mm6=(15 -- 17 --) + pand mm1,mm7 ; mm1=(-- 35 -- 37) + psrld mm3,WORD_BIT ; mm3=(55 -- 57 --) + pand mm5,mm7 ; mm5=(-- 75 -- 77) + por mm6,mm1 ; mm6=(15 35 17 37) + por mm3,mm5 ; mm3=(55 75 57 77) + pmaddwd mm6,[GOTOFF(ebx,PW_F362_MF127)] + pmaddwd mm3,[GOTOFF(ebx,PW_F085_MF072)] + + paddd mm0,mm2 ; mm0=tmp0[col1 col3] + paddd mm6,mm3 ; mm6=tmp0[col5 col7] + + ; -- Even part + + movq mm1, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm5, MMWORD [MMBLOCK(0,1,esi,SIZEOF_JCOEF)] + pmullw mm1, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw mm5, MMWORD [MMBLOCK(0,1,edx,SIZEOF_ISLOW_MULT_TYPE)] + + ; mm1=(00 01 ** 03), mm5=(** 05 ** 07) + + movq mm2,mm1 ; mm2=(00 01 ** 03) + pslld mm1,WORD_BIT ; mm1=(-- 00 -- **) + psrad mm1,(WORD_BIT-CONST_BITS-2) ; mm1=tmp10[col0 ****] + + pand mm2,mm7 ; mm2=(-- 01 -- 03) + pand mm5,mm7 ; mm5=(-- 05 -- 07) + psrad mm2,(WORD_BIT-CONST_BITS-2) ; mm2=tmp10[col1 col3] + psrad mm5,(WORD_BIT-CONST_BITS-2) ; mm5=tmp10[col5 col7] + + ; -- Final output stage + + movq mm3,mm1 + paddd mm1,mm4 ; mm1=data0[col0 ****]=(A0 **) + psubd mm3,mm4 ; mm3=data1[col0 ****]=(B0 **) + punpckldq mm1,mm3 ; mm1=(A0 B0) + + movq 
mm7,[GOTOFF(ebx,PD_DESCALE_P1_2)] ; mm7=[PD_DESCALE_P1_2] + + movq mm4,mm2 + movq mm3,mm5 + paddd mm2,mm0 ; mm2=data0[col1 col3]=(A1 A3) + paddd mm5,mm6 ; mm5=data0[col5 col7]=(A5 A7) + psubd mm4,mm0 ; mm4=data1[col1 col3]=(B1 B3) + psubd mm3,mm6 ; mm3=data1[col5 col7]=(B5 B7) + + paddd mm1,mm7 + psrad mm1,DESCALE_P1_2 + + paddd mm2,mm7 + paddd mm5,mm7 + psrad mm2,DESCALE_P1_2 + psrad mm5,DESCALE_P1_2 + paddd mm4,mm7 + paddd mm3,mm7 + psrad mm4,DESCALE_P1_2 + psrad mm3,DESCALE_P1_2 + + ; ---- Pass 2: process rows, store into output array. + + mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(ebp)] + + ; | input:| result:| + ; | A0 B0 | | + ; | A1 B1 | C0 C1 | + ; | A3 B3 | D0 D1 | + ; | A5 B5 | | + ; | A7 B7 | | + + ; -- Odd part + + packssdw mm2,mm4 ; mm2=(A1 A3 B1 B3) + packssdw mm5,mm3 ; mm5=(A5 A7 B5 B7) + pmaddwd mm2,[GOTOFF(ebx,PW_F362_MF127)] + pmaddwd mm5,[GOTOFF(ebx,PW_F085_MF072)] - paddd mm2,mm5 ; mm2=tmp0[row0 row1] - - ; -- Even part + paddd mm2,mm5 ; mm2=tmp0[row0 row1] + + ; -- Even part - pslld mm1,(CONST_BITS+2) ; mm1=tmp10[row0 row1] + pslld mm1,(CONST_BITS+2) ; mm1=tmp10[row0 row1] - ; -- Final output stage + ; -- Final output stage - movq mm0,[GOTOFF(ebx,PD_DESCALE_P2_2)] ; mm0=[PD_DESCALE_P2_2] + movq mm0,[GOTOFF(ebx,PD_DESCALE_P2_2)] ; mm0=[PD_DESCALE_P2_2] - movq mm6,mm1 - paddd mm1,mm2 ; mm1=data0[row0 row1]=(C0 C1) - psubd mm6,mm2 ; mm6=data1[row0 row1]=(D0 D1) + movq mm6,mm1 + paddd mm1,mm2 ; mm1=data0[row0 row1]=(C0 C1) + psubd mm6,mm2 ; mm6=data1[row0 row1]=(D0 D1) - paddd mm1,mm0 - paddd mm6,mm0 - psrad mm1,DESCALE_P2_2 - psrad mm6,DESCALE_P2_2 + paddd mm1,mm0 + paddd mm6,mm0 + psrad mm1,DESCALE_P2_2 + psrad mm6,DESCALE_P2_2 - movq mm7,mm1 ; transpose coefficients - punpckldq mm1,mm6 ; mm1=(C0 D0) - punpckhdq mm7,mm6 ; mm7=(C1 D1) + movq mm7,mm1 ; transpose coefficients + punpckldq mm1,mm6 ; mm1=(C0 D0) + punpckhdq mm7,mm6 ; mm7=(C1 D1) - packssdw mm1,mm7 ; mm1=(C0 D0 C1 D1) - packsswb mm1,mm1 ; 
mm1=(C0 D0 C1 D1 C0 D0 C1 D1) - paddb mm1,[GOTOFF(ebx,PB_CENTERJSAMP)] + packssdw mm1,mm7 ; mm1=(C0 D0 C1 D1) + packsswb mm1,mm1 ; mm1=(C0 D0 C1 D1 C0 D0 C1 D1) + paddb mm1,[GOTOFF(ebx,PB_CENTERJSAMP)] - movd ecx,mm1 - movd ebx,mm1 ; ebx=(C0 D0 C1 D1) - shr ecx,2*BYTE_BIT ; ecx=(C1 D1 -- --) + movd ecx,mm1 + movd ebx,mm1 ; ebx=(C0 D0 C1 D1) + shr ecx,2*BYTE_BIT ; ecx=(C1 D1 -- --) - mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] - mov WORD [edx+eax*SIZEOF_JSAMPLE], bx - mov WORD [esi+eax*SIZEOF_JSAMPLE], cx + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + mov WORD [edx+eax*SIZEOF_JSAMPLE], bx + mov WORD [esi+eax*SIZEOF_JSAMPLE], cx - emms ; empty MMX state + emms ; empty MMX state - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - pop ebp - ret + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jiss2flt-64.asm b/simd/jiss2flt-64.asm index 6e7e6d425..6d57a0184 100644 --- a/simd/jiss2flt-64.asm +++ b/simd/jiss2flt-64.asm @@ -26,34 +26,34 @@ ; -------------------------------------------------------------------------- -%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) - shufps %1,%2,0x44 +%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) + shufps %1,%2,0x44 %endmacro -%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) - shufps %1,%2,0xEE +%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) + shufps %1,%2,0xEE %endmacro ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_idct_float_sse2) + alignz 16 + global EXTN(jconst_idct_float_sse2) EXTN(jconst_idct_float_sse2): -PD_1_414 times 4 dd 1.414213562373095048801689 -PD_1_847 times 4 dd 1.847759065022573512256366 -PD_1_082 times 4 dd 1.082392200292393968799446 -PD_M2_613 times 4 dd -2.613125929752753055713286 -PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3) -PB_CENTERJSAMP times 16 db CENTERJSAMPLE +PD_1_414 times 4 dd 1.414213562373095048801689 +PD_1_847 times 4 dd 1.847759065022573512256366 +PD_1_082 times 4 dd 1.082392200292393968799446 +PD_M2_613 times 4 dd -2.613125929752753055713286 +PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3) +PB_CENTERJSAMP times 16 db CENTERJSAMPLE - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 64 + SECTION SEG_TEXT + BITS 64 ; ; Perform dequantization and inverse DCT on one block of coefficients. 
; @@ -67,417 +67,417 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; r12 = JSAMPARRAY output_buf ; r13 = JDIMENSION output_col -%define original_rbp rbp+0 -%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 2 -%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT - ; FAST_FLOAT workspace[DCTSIZE2] +%define original_rbp rbp+0 +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 +%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT + ; FAST_FLOAT workspace[DCTSIZE2] - align 16 - global EXTN(jsimd_idct_float_sse2) + align 16 + global EXTN(jsimd_idct_float_sse2) EXTN(jsimd_idct_float_sse2): - push rbp - mov rax,rsp ; rax = original rbp - sub rsp, byte 4 - and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp],rax - mov rbp,rsp ; rbp = aligned rbp - lea rsp, [workspace] - collect_args - push rbx - - ; ---- Pass 1: process columns from input, store into work array. - - mov rdx, r10 ; quantptr - mov rsi, r11 ; inptr - lea rdi, [workspace] ; FAST_FLOAT * wsptr - mov rcx, DCTSIZE/4 ; ctr + push rbp + mov rax,rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [workspace] + collect_args + push rbx + + ; ---- Pass 1: process columns from input, store into work array. 
+ + mov rdx, r10 ; quantptr + mov rsi, r11 ; inptr + lea rdi, [workspace] ; FAST_FLOAT * wsptr + mov rcx, DCTSIZE/4 ; ctr .columnloop: %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE - mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] - jnz near .columnDCT - - movq xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)] - movq xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)] - movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)] - movq xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)] - movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)] - movq xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)] - movq xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)] - por xmm1,xmm2 - por xmm3,xmm4 - por xmm5,xmm6 - por xmm1,xmm3 - por xmm5,xmm7 - por xmm1,xmm5 - packsswb xmm1,xmm1 - movd eax,xmm1 - test rax,rax - jnz short .columnDCT - - ; -- AC terms all zero - - movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)] - - punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) - psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) - cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03) - - mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] - - movaps xmm1,xmm0 - movaps xmm2,xmm0 - movaps xmm3,xmm0 - - shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00) - shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01) - shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02) - shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03) - - movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0 - movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0 - movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1 - movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1 - movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2 - movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2 - movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3 - movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3 - jmp near .nextcolumn + mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] + or eax, DWORD 
[DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movq xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movq xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + movq xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)] + movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + movq xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + movq xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + por xmm1,xmm2 + por xmm3,xmm4 + por xmm5,xmm6 + por xmm1,xmm3 + por xmm5,xmm7 + por xmm1,xmm5 + packsswb xmm1,xmm1 + movd eax,xmm1 + test rax,rax + jnz short .columnDCT + + ; -- AC terms all zero + + movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + + punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) + cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03) + + mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm1,xmm0 + movaps xmm2,xmm0 + movaps xmm3,xmm0 + + shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00) + shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01) + shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02) + shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03) + + movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3 + jmp near .nextcolumn %endif .columnDCT: - ; -- Even part - - movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)] - movq xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)] - movq xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)] - movq xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)] - - punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 
03 03) - punpcklwd xmm1,xmm1 ; xmm1=(20 20 21 21 22 22 23 23) - psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) - psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23) - cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03) - cvtdq2ps xmm1,xmm1 ; xmm1=in2=(20 21 22 23) - - punpcklwd xmm2,xmm2 ; xmm2=(40 40 41 41 42 42 43 43) - punpcklwd xmm3,xmm3 ; xmm3=(60 60 61 61 62 62 63 63) - psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43) - psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63) - cvtdq2ps xmm2,xmm2 ; xmm2=in4=(40 41 42 43) - cvtdq2ps xmm3,xmm3 ; xmm3=in6=(60 61 62 63) - - mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] - - movaps xmm4,xmm0 - movaps xmm5,xmm1 - subps xmm0,xmm2 ; xmm0=tmp11 - subps xmm1,xmm3 - addps xmm4,xmm2 ; xmm4=tmp10 - addps xmm5,xmm3 ; xmm5=tmp13 - - mulps xmm1,[rel PD_1_414] - subps xmm1,xmm5 ; xmm1=tmp12 - - movaps xmm6,xmm4 - movaps xmm7,xmm0 - subps xmm4,xmm5 ; xmm4=tmp3 - subps xmm0,xmm1 ; xmm0=tmp2 - addps xmm6,xmm5 ; xmm6=tmp0 - addps xmm7,xmm1 ; xmm7=tmp1 - - movaps XMMWORD [wk(1)], xmm4 ; tmp3 - movaps XMMWORD [wk(0)], xmm0 ; tmp2 - - ; -- Odd part - - movq xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)] - movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)] - movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)] - movq xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)] - - punpcklwd xmm2,xmm2 ; xmm2=(10 10 11 11 12 12 13 13) - punpcklwd xmm3,xmm3 ; xmm3=(30 30 31 31 32 32 33 33) - psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13) - psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33) - cvtdq2ps xmm2,xmm2 ; xmm2=in1=(10 11 12 13) - cvtdq2ps xmm3,xmm3 ; xmm3=in3=(30 31 32 33) - - punpcklwd xmm5,xmm5 ; xmm5=(50 50 51 51 52 52 53 53) - punpcklwd xmm1,xmm1 ; xmm1=(70 70 71 71 72 72 73 73) - psrad 
xmm5,(DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53) - psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73) - cvtdq2ps xmm5,xmm5 ; xmm5=in5=(50 51 52 53) - cvtdq2ps xmm1,xmm1 ; xmm1=in7=(70 71 72 73) - - mulps xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] - - movaps xmm4,xmm2 - movaps xmm0,xmm5 - addps xmm2,xmm1 ; xmm2=z11 - addps xmm5,xmm3 ; xmm5=z13 - subps xmm4,xmm1 ; xmm4=z12 - subps xmm0,xmm3 ; xmm0=z10 - - movaps xmm1,xmm2 - subps xmm2,xmm5 - addps xmm1,xmm5 ; xmm1=tmp7 - - mulps xmm2,[rel PD_1_414] ; xmm2=tmp11 - - movaps xmm3,xmm0 - addps xmm0,xmm4 - mulps xmm0,[rel PD_1_847] ; xmm0=z5 - mulps xmm3,[rel PD_M2_613] ; xmm3=(z10 * -2.613125930) - mulps xmm4,[rel PD_1_082] ; xmm4=(z12 * 1.082392200) - addps xmm3,xmm0 ; xmm3=tmp12 - subps xmm4,xmm0 ; xmm4=tmp10 - - ; -- Final output stage - - subps xmm3,xmm1 ; xmm3=tmp6 - movaps xmm5,xmm6 - movaps xmm0,xmm7 - addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03) - addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13) - subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73) - subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63) - subps xmm2,xmm3 ; xmm2=tmp5 - - movaps xmm1,xmm6 ; transpose coefficients(phase 1) - unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11) - unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13) - movaps xmm3,xmm0 ; transpose coefficients(phase 1) - unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71) - unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73) - - movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 - movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3 - - movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71) - movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73) - - addps xmm4,xmm2 ; xmm4=tmp4 - movaps xmm0,xmm7 - movaps xmm3,xmm5 - addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23) - addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43) - subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53) - subps xmm3,xmm4 ; 
xmm3=data3=(30 31 32 33) - - movaps xmm2,xmm7 ; transpose coefficients(phase 1) - unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31) - unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33) - movaps xmm4,xmm5 ; transpose coefficients(phase 1) - unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51) - unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53) - - movaps xmm3,xmm6 ; transpose coefficients(phase 2) - unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30) - unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31) - movaps xmm0,xmm1 ; transpose coefficients(phase 2) - unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32) - unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33) - - movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) - movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) - - movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6 - movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3 - movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1 - movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0 - - movaps xmm6,xmm5 ; transpose coefficients(phase 2) - unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70) - unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71) - movaps xmm3,xmm4 ; transpose coefficients(phase 2) - unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72) - unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73) - - movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5 - movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6 - movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4 - movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3 + ; -- Even part + + movq xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + movq xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + movq xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)] + movq xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + + punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + punpcklwd xmm1,xmm1 ; xmm1=(20 20 21 21 22 22 23 23) + psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) + psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23) + cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03) + cvtdq2ps xmm1,xmm1 
; xmm1=in2=(20 21 22 23) + + punpcklwd xmm2,xmm2 ; xmm2=(40 40 41 41 42 42 43 43) + punpcklwd xmm3,xmm3 ; xmm3=(60 60 61 61 62 62 63 63) + psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43) + psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63) + cvtdq2ps xmm2,xmm2 ; xmm2=in4=(40 41 42 43) + cvtdq2ps xmm3,xmm3 ; xmm3=in6=(60 61 62 63) + + mulps xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm4,xmm0 + movaps xmm5,xmm1 + subps xmm0,xmm2 ; xmm0=tmp11 + subps xmm1,xmm3 + addps xmm4,xmm2 ; xmm4=tmp10 + addps xmm5,xmm3 ; xmm5=tmp13 + + mulps xmm1,[rel PD_1_414] + subps xmm1,xmm5 ; xmm1=tmp12 + + movaps xmm6,xmm4 + movaps xmm7,xmm0 + subps xmm4,xmm5 ; xmm4=tmp3 + subps xmm0,xmm1 ; xmm0=tmp2 + addps xmm6,xmm5 ; xmm6=tmp0 + addps xmm7,xmm1 ; xmm7=tmp1 + + movaps XMMWORD [wk(1)], xmm4 ; tmp3 + movaps XMMWORD [wk(0)], xmm0 ; tmp2 + + ; -- Odd part + + movq xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movq xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + movq xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + movq xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + + punpcklwd xmm2,xmm2 ; xmm2=(10 10 11 11 12 12 13 13) + punpcklwd xmm3,xmm3 ; xmm3=(30 30 31 31 32 32 33 33) + psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13) + psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33) + cvtdq2ps xmm2,xmm2 ; xmm2=in1=(10 11 12 13) + cvtdq2ps xmm3,xmm3 ; xmm3=in3=(30 31 32 33) + + punpcklwd xmm5,xmm5 ; xmm5=(50 50 51 51 52 52 53 53) + punpcklwd xmm1,xmm1 ; xmm1=(70 70 71 71 72 72 73 73) + psrad xmm5,(DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53) + psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73) + cvtdq2ps xmm5,xmm5 ; xmm5=in5=(50 51 52 53) + cvtdq2ps xmm1,xmm1 ; xmm1=in7=(70 71 72 73) + + mulps xmm2, XMMWORD 
[XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm4,xmm2 + movaps xmm0,xmm5 + addps xmm2,xmm1 ; xmm2=z11 + addps xmm5,xmm3 ; xmm5=z13 + subps xmm4,xmm1 ; xmm4=z12 + subps xmm0,xmm3 ; xmm0=z10 + + movaps xmm1,xmm2 + subps xmm2,xmm5 + addps xmm1,xmm5 ; xmm1=tmp7 + + mulps xmm2,[rel PD_1_414] ; xmm2=tmp11 + + movaps xmm3,xmm0 + addps xmm0,xmm4 + mulps xmm0,[rel PD_1_847] ; xmm0=z5 + mulps xmm3,[rel PD_M2_613] ; xmm3=(z10 * -2.613125930) + mulps xmm4,[rel PD_1_082] ; xmm4=(z12 * 1.082392200) + addps xmm3,xmm0 ; xmm3=tmp12 + subps xmm4,xmm0 ; xmm4=tmp10 + + ; -- Final output stage + + subps xmm3,xmm1 ; xmm3=tmp6 + movaps xmm5,xmm6 + movaps xmm0,xmm7 + addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03) + addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13) + subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73) + subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63) + subps xmm2,xmm3 ; xmm2=tmp5 + + movaps xmm1,xmm6 ; transpose coefficients(phase 1) + unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11) + unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13) + movaps xmm3,xmm0 ; transpose coefficients(phase 1) + unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71) + unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73) + + movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 + movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3 + + movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71) + movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73) + + addps xmm4,xmm2 ; xmm4=tmp4 + movaps xmm0,xmm7 + movaps xmm3,xmm5 + addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23) + addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43) + subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53) + subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33) + + movaps xmm2,xmm7 ; transpose coefficients(phase 1) + unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31) + unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33) + movaps xmm4,xmm5 ; transpose coefficients(phase 1) + unpcklps xmm5,xmm0 
; xmm5=(40 50 41 51) + unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53) + + movaps xmm3,xmm6 ; transpose coefficients(phase 2) + unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30) + unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31) + movaps xmm0,xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32) + unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33) + + movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) + movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) + + movaps XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6 + movaps XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0 + + movaps xmm6,xmm5 ; transpose coefficients(phase 2) + unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70) + unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71) + movaps xmm3,xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72) + unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73) + + movaps XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6 + movaps XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4 + movaps XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3 .nextcolumn: - add rsi, byte 4*SIZEOF_JCOEF ; coef_block - add rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr - add rdi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr - dec rcx ; ctr - jnz near .columnloop - - ; -- Prefetch the next coefficient block - - prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] - prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] - prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] - prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] - - ; ---- Pass 2: process rows from work array, store into output array. 
- - mov rax, [original_rbp] - lea rsi, [workspace] ; FAST_FLOAT * wsptr - mov rdi, r12 ; (JSAMPROW *) - mov rax, r13 - mov rcx, DCTSIZE/4 ; ctr + add rsi, byte 4*SIZEOF_JCOEF ; coef_block + add rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr + add rdi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr + dec rcx ; ctr + jnz near .columnloop + + ; -- Prefetch the next coefficient block + + prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] + prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] + prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] + prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. + + mov rax, [original_rbp] + lea rsi, [workspace] ; FAST_FLOAT * wsptr + mov rdi, r12 ; (JSAMPROW *) + mov rax, r13 + mov rcx, DCTSIZE/4 ; ctr .rowloop: - ; -- Even part - - movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)] - movaps xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)] - - movaps xmm4,xmm0 - movaps xmm5,xmm1 - subps xmm0,xmm2 ; xmm0=tmp11 - subps xmm1,xmm3 - addps xmm4,xmm2 ; xmm4=tmp10 - addps xmm5,xmm3 ; xmm5=tmp13 - - mulps xmm1,[rel PD_1_414] - subps xmm1,xmm5 ; xmm1=tmp12 - - movaps xmm6,xmm4 - movaps xmm7,xmm0 - subps xmm4,xmm5 ; xmm4=tmp3 - subps xmm0,xmm1 ; xmm0=tmp2 - addps xmm6,xmm5 ; xmm6=tmp0 - addps xmm7,xmm1 ; xmm7=tmp1 - - movaps XMMWORD [wk(1)], xmm4 ; tmp3 - movaps XMMWORD [wk(0)], xmm0 ; tmp2 - - ; -- Odd part - - movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)] - movaps xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)] - - movaps xmm4,xmm2 - movaps xmm0,xmm5 - addps xmm2,xmm1 ; xmm2=z11 - addps xmm5,xmm3 ; xmm5=z13 - subps xmm4,xmm1 ; xmm4=z12 - subps xmm0,xmm3 ; xmm0=z10 - - movaps xmm1,xmm2 - subps xmm2,xmm5 - 
addps xmm1,xmm5 ; xmm1=tmp7 - - mulps xmm2,[rel PD_1_414] ; xmm2=tmp11 - - movaps xmm3,xmm0 - addps xmm0,xmm4 - mulps xmm0,[rel PD_1_847] ; xmm0=z5 - mulps xmm3,[rel PD_M2_613] ; xmm3=(z10 * -2.613125930) - mulps xmm4,[rel PD_1_082] ; xmm4=(z12 * 1.082392200) - addps xmm3,xmm0 ; xmm3=tmp12 - subps xmm4,xmm0 ; xmm4=tmp10 - - ; -- Final output stage - - subps xmm3,xmm1 ; xmm3=tmp6 - movaps xmm5,xmm6 - movaps xmm0,xmm7 - addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30) - addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31) - subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37) - subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36) - subps xmm2,xmm3 ; xmm2=tmp5 - - movaps xmm1,[rel PD_RNDINT_MAGIC] ; xmm1=[rel PD_RNDINT_MAGIC] - pcmpeqd xmm3,xmm3 - psrld xmm3,WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..} - - addps xmm6,xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **) - addps xmm7,xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **) - addps xmm0,xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **) - addps xmm5,xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **) - - pand xmm6,xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --) - pslld xmm7,WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31) - pand xmm0,xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --) - pslld xmm5,WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37) - por xmm6,xmm7 ; xmm6=(00 01 10 11 20 21 30 31) - por xmm0,xmm5 ; xmm0=(06 07 16 17 26 27 36 37) - - movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2 - movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3 - - addps xmm4,xmm2 ; xmm4=tmp4 - movaps xmm7,xmm1 - movaps xmm5,xmm3 - addps xmm1,xmm2 ; xmm1=data2=(02 12 22 32) - addps xmm3,xmm4 ; xmm3=data4=(04 14 24 34) - subps xmm7,xmm2 ; xmm7=data5=(05 15 25 35) - subps xmm5,xmm4 ; xmm5=data3=(03 13 23 33) - - movaps xmm2,[rel PD_RNDINT_MAGIC] ; xmm2=[rel PD_RNDINT_MAGIC] - pcmpeqd xmm4,xmm4 - psrld xmm4,WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..} - - addps xmm3,xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **) - addps xmm7,xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **) - 
addps xmm1,xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **) - addps xmm5,xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **) - - pand xmm3,xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --) - pslld xmm7,WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35) - pand xmm1,xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --) - pslld xmm5,WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33) - por xmm3,xmm7 ; xmm3=(04 05 14 15 24 25 34 35) - por xmm1,xmm5 ; xmm1=(02 03 12 13 22 23 32 33) - - movdqa xmm2,[rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP] - - packsswb xmm6,xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35) - packsswb xmm1,xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37) - paddb xmm6,xmm2 - paddb xmm1,xmm2 - - movdqa xmm4,xmm6 ; transpose coefficients(phase 2) - punpcklwd xmm6,xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) - punpckhwd xmm4,xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) - - movdqa xmm7,xmm6 ; transpose coefficients(phase 3) - punpckldq xmm6,xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) - punpckhdq xmm7,xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) - - pshufd xmm5,xmm6,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) - pshufd xmm3,xmm7,0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) - - mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] - mov rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] - movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 - movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7 - mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] - mov rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] - movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5 - movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3 - - add rsi, byte 4*SIZEOF_FAST_FLOAT ; wsptr - add rdi, byte 4*SIZEOF_JSAMPROW - dec rcx ; ctr - jnz near .rowloop - - pop rbx - uncollect_args - mov rsp,rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp - pop rbp - ret + ; -- Even part + + movaps xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD 
[XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)] + + movaps xmm4,xmm0 + movaps xmm5,xmm1 + subps xmm0,xmm2 ; xmm0=tmp11 + subps xmm1,xmm3 + addps xmm4,xmm2 ; xmm4=tmp10 + addps xmm5,xmm3 ; xmm5=tmp13 + + mulps xmm1,[rel PD_1_414] + subps xmm1,xmm5 ; xmm1=tmp12 + + movaps xmm6,xmm4 + movaps xmm7,xmm0 + subps xmm4,xmm5 ; xmm4=tmp3 + subps xmm0,xmm1 ; xmm0=tmp2 + addps xmm6,xmm5 ; xmm6=tmp0 + addps xmm7,xmm1 ; xmm7=tmp1 + + movaps XMMWORD [wk(1)], xmm4 ; tmp3 + movaps XMMWORD [wk(0)], xmm0 ; tmp2 + + ; -- Odd part + + movaps xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)] + + movaps xmm4,xmm2 + movaps xmm0,xmm5 + addps xmm2,xmm1 ; xmm2=z11 + addps xmm5,xmm3 ; xmm5=z13 + subps xmm4,xmm1 ; xmm4=z12 + subps xmm0,xmm3 ; xmm0=z10 + + movaps xmm1,xmm2 + subps xmm2,xmm5 + addps xmm1,xmm5 ; xmm1=tmp7 + + mulps xmm2,[rel PD_1_414] ; xmm2=tmp11 + + movaps xmm3,xmm0 + addps xmm0,xmm4 + mulps xmm0,[rel PD_1_847] ; xmm0=z5 + mulps xmm3,[rel PD_M2_613] ; xmm3=(z10 * -2.613125930) + mulps xmm4,[rel PD_1_082] ; xmm4=(z12 * 1.082392200) + addps xmm3,xmm0 ; xmm3=tmp12 + subps xmm4,xmm0 ; xmm4=tmp10 + + ; -- Final output stage + + subps xmm3,xmm1 ; xmm3=tmp6 + movaps xmm5,xmm6 + movaps xmm0,xmm7 + addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30) + addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31) + subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37) + subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36) + subps xmm2,xmm3 ; xmm2=tmp5 + + movaps xmm1,[rel PD_RNDINT_MAGIC] ; xmm1=[rel PD_RNDINT_MAGIC] + pcmpeqd xmm3,xmm3 + psrld xmm3,WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..} + + addps xmm6,xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **) + addps xmm7,xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **) + addps 
xmm0,xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **) + addps xmm5,xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **) + + pand xmm6,xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --) + pslld xmm7,WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31) + pand xmm0,xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --) + pslld xmm5,WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37) + por xmm6,xmm7 ; xmm6=(00 01 10 11 20 21 30 31) + por xmm0,xmm5 ; xmm0=(06 07 16 17 26 27 36 37) + + movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2 + movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3 + + addps xmm4,xmm2 ; xmm4=tmp4 + movaps xmm7,xmm1 + movaps xmm5,xmm3 + addps xmm1,xmm2 ; xmm1=data2=(02 12 22 32) + addps xmm3,xmm4 ; xmm3=data4=(04 14 24 34) + subps xmm7,xmm2 ; xmm7=data5=(05 15 25 35) + subps xmm5,xmm4 ; xmm5=data3=(03 13 23 33) + + movaps xmm2,[rel PD_RNDINT_MAGIC] ; xmm2=[rel PD_RNDINT_MAGIC] + pcmpeqd xmm4,xmm4 + psrld xmm4,WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..} + + addps xmm3,xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **) + addps xmm7,xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **) + addps xmm1,xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **) + addps xmm5,xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **) + + pand xmm3,xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --) + pslld xmm7,WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35) + pand xmm1,xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --) + pslld xmm5,WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33) + por xmm3,xmm7 ; xmm3=(04 05 14 15 24 25 34 35) + por xmm1,xmm5 ; xmm1=(02 03 12 13 22 23 32 33) + + movdqa xmm2,[rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP] + + packsswb xmm6,xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35) + packsswb xmm1,xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37) + paddb xmm6,xmm2 + paddb xmm1,xmm2 + + movdqa xmm4,xmm6 ; transpose coefficients(phase 2) + punpcklwd xmm6,xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) + punpckhwd xmm4,xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + + movdqa 
xmm7,xmm6 ; transpose coefficients(phase 3) + punpckldq xmm6,xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm7,xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + + pshufd xmm5,xmm6,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + pshufd xmm3,xmm7,0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + + mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] + mov rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7 + mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] + mov rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5 + movq XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3 + + add rsi, byte 4*SIZEOF_FAST_FLOAT ; wsptr + add rdi, byte 4*SIZEOF_JSAMPROW + dec rcx ; ctr + jnz near .rowloop + + pop rbx + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jiss2flt.asm b/simd/jiss2flt.asm index 17bc3633e..86c905614 100644 --- a/simd/jiss2flt.asm +++ b/simd/jiss2flt.asm @@ -25,34 +25,34 @@ ; -------------------------------------------------------------------------- -%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) - shufps %1,%2,0x44 +%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) + shufps %1,%2,0x44 %endmacro -%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) - shufps %1,%2,0xEE +%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) + shufps %1,%2,0xEE %endmacro ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_idct_float_sse2) + alignz 16 + global EXTN(jconst_idct_float_sse2) EXTN(jconst_idct_float_sse2): -PD_1_414 times 4 dd 1.414213562373095048801689 -PD_1_847 times 4 dd 1.847759065022573512256366 -PD_1_082 times 4 dd 1.082392200292393968799446 -PD_M2_613 times 4 dd -2.613125929752753055713286 -PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3) -PB_CENTERJSAMP times 16 db CENTERJSAMPLE +PD_1_414 times 4 dd 1.414213562373095048801689 +PD_1_847 times 4 dd 1.847759065022573512256366 +PD_1_082 times 4 dd 1.082392200292393968799446 +PD_M2_613 times 4 dd -2.613125929752753055713286 +PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3) +PB_CENTERJSAMP times 16 db CENTERJSAMPLE - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 ; ; Perform dequantization and inverse DCT on one block of coefficients. 
; @@ -61,438 +61,438 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; JSAMPARRAY output_buf, JDIMENSION output_col) ; -%define dct_table(b) (b)+8 ; void * dct_table -%define coef_block(b) (b)+12 ; JCOEFPTR coef_block -%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf -%define output_col(b) (b)+20 ; JDIMENSION output_col +%define dct_table(b) (b)+8 ; void * dct_table +%define coef_block(b) (b)+12 ; JCOEFPTR coef_block +%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf +%define output_col(b) (b)+20 ; JDIMENSION output_col -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 2 -%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT - ; FAST_FLOAT workspace[DCTSIZE2] +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 +%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT + ; FAST_FLOAT workspace[DCTSIZE2] - align 16 - global EXTN(jsimd_idct_float_sse2) + align 16 + global EXTN(jsimd_idct_float_sse2) EXTN(jsimd_idct_float_sse2): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [workspace] - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process columns from input, store into work array. 
- -; mov eax, [original_ebp] - mov edx, POINTER [dct_table(eax)] ; quantptr - mov esi, JCOEFPTR [coef_block(eax)] ; inptr - lea edi, [workspace] ; FAST_FLOAT * wsptr - mov ecx, DCTSIZE/4 ; ctr - alignx 16,7 + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [workspace] + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input, store into work array. + +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + lea edi, [workspace] ; FAST_FLOAT * wsptr + mov ecx, DCTSIZE/4 ; ctr + alignx 16,7 .columnloop: %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE - mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] - jnz near .columnDCT - - movq xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movq xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] - movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] - movq xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] - movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] - movq xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] - movq xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] - por xmm1,xmm2 - por xmm3,xmm4 - por xmm5,xmm6 - por xmm1,xmm3 - por xmm5,xmm7 - por xmm1,xmm5 - packsswb xmm1,xmm1 - movd eax,xmm1 - test eax,eax - jnz short .columnDCT - - ; -- AC terms all zero - - movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] - - punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) - psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) - cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03) - - mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - - movaps xmm1,xmm0 - movaps xmm2,xmm0 - movaps xmm3,xmm0 - - shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00) - shufps xmm1,xmm1,0x55 ; 
xmm1=(01 01 01 01) - shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02) - shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03) - - movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0 - movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0 - movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1 - movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1 - movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2 - movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2 - movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3 - movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 - jmp near .nextcolumn - alignx 16,7 + mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movq xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + movq xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + movq xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por xmm1,xmm2 + por xmm3,xmm4 + por xmm5,xmm6 + por xmm1,xmm3 + por xmm5,xmm7 + por xmm1,xmm5 + packsswb xmm1,xmm1 + movd eax,xmm1 + test eax,eax + jnz short .columnDCT + + ; -- AC terms all zero + + movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + + punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) + cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03) + + mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm1,xmm0 + movaps xmm2,xmm0 + movaps xmm3,xmm0 + + shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00) + shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01) + shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02) + shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03) + + movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0 
+ movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 + jmp near .nextcolumn + alignx 16,7 %endif .columnDCT: - ; -- Even part - - movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] - movq xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] - movq xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] - movq xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] - - punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) - punpcklwd xmm1,xmm1 ; xmm1=(20 20 21 21 22 22 23 23) - psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) - psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23) - cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03) - cvtdq2ps xmm1,xmm1 ; xmm1=in2=(20 21 22 23) - - punpcklwd xmm2,xmm2 ; xmm2=(40 40 41 41 42 42 43 43) - punpcklwd xmm3,xmm3 ; xmm3=(60 60 61 61 62 62 63 63) - psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43) - psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63) - cvtdq2ps xmm2,xmm2 ; xmm2=in4=(40 41 42 43) - cvtdq2ps xmm3,xmm3 ; xmm3=in6=(60 61 62 63) - - mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - - movaps xmm4,xmm0 - movaps xmm5,xmm1 - subps xmm0,xmm2 ; xmm0=tmp11 - subps xmm1,xmm3 - addps xmm4,xmm2 ; xmm4=tmp10 - addps xmm5,xmm3 ; xmm5=tmp13 - - mulps xmm1,[GOTOFF(ebx,PD_1_414)] - subps xmm1,xmm5 ; xmm1=tmp12 - - movaps xmm6,xmm4 - movaps xmm7,xmm0 - subps xmm4,xmm5 ; xmm4=tmp3 - subps xmm0,xmm1 ; xmm0=tmp2 - addps xmm6,xmm5 ; xmm6=tmp0 - addps xmm7,xmm1 ; xmm7=tmp1 - - movaps XMMWORD [wk(1)], xmm4 ; tmp3 - movaps 
XMMWORD [wk(0)], xmm0 ; tmp2 - - ; -- Odd part - - movq xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] - movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] - movq xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] - - punpcklwd xmm2,xmm2 ; xmm2=(10 10 11 11 12 12 13 13) - punpcklwd xmm3,xmm3 ; xmm3=(30 30 31 31 32 32 33 33) - psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13) - psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33) - cvtdq2ps xmm2,xmm2 ; xmm2=in1=(10 11 12 13) - cvtdq2ps xmm3,xmm3 ; xmm3=in3=(30 31 32 33) - - punpcklwd xmm5,xmm5 ; xmm5=(50 50 51 51 52 52 53 53) - punpcklwd xmm1,xmm1 ; xmm1=(70 70 71 71 72 72 73 73) - psrad xmm5,(DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53) - psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73) - cvtdq2ps xmm5,xmm5 ; xmm5=in5=(50 51 52 53) - cvtdq2ps xmm1,xmm1 ; xmm1=in7=(70 71 72 73) - - mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - - movaps xmm4,xmm2 - movaps xmm0,xmm5 - addps xmm2,xmm1 ; xmm2=z11 - addps xmm5,xmm3 ; xmm5=z13 - subps xmm4,xmm1 ; xmm4=z12 - subps xmm0,xmm3 ; xmm0=z10 - - movaps xmm1,xmm2 - subps xmm2,xmm5 - addps xmm1,xmm5 ; xmm1=tmp7 - - mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 - - movaps xmm3,xmm0 - addps xmm0,xmm4 - mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5 - mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) - mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) - addps xmm3,xmm0 ; xmm3=tmp12 - subps xmm4,xmm0 ; xmm4=tmp10 - - ; -- Final output stage - - subps xmm3,xmm1 ; xmm3=tmp6 - movaps xmm5,xmm6 - movaps xmm0,xmm7 - addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03) - addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13) - subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73) - subps xmm0,xmm3 ; xmm0=data6=(60 
61 62 63) - subps xmm2,xmm3 ; xmm2=tmp5 - - movaps xmm1,xmm6 ; transpose coefficients(phase 1) - unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11) - unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13) - movaps xmm3,xmm0 ; transpose coefficients(phase 1) - unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71) - unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73) - - movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 - movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3 - - movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71) - movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73) - - addps xmm4,xmm2 ; xmm4=tmp4 - movaps xmm0,xmm7 - movaps xmm3,xmm5 - addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23) - addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43) - subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53) - subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33) - - movaps xmm2,xmm7 ; transpose coefficients(phase 1) - unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31) - unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33) - movaps xmm4,xmm5 ; transpose coefficients(phase 1) - unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51) - unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53) - - movaps xmm3,xmm6 ; transpose coefficients(phase 2) - unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30) - unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31) - movaps xmm0,xmm1 ; transpose coefficients(phase 2) - unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32) - unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33) - - movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) - movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) - - movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6 - movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3 - movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1 - movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0 - - movaps xmm6,xmm5 ; transpose coefficients(phase 2) - unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70) - unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71) - movaps xmm3,xmm4 ; transpose coefficients(phase 2) - unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72) - unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73) - - movaps XMMWORD 
[XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5 - movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 - movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4 - movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 + ; -- Even part + + movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movq xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movq xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + + punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + punpcklwd xmm1,xmm1 ; xmm1=(20 20 21 21 22 22 23 23) + psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) + psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23) + cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03) + cvtdq2ps xmm1,xmm1 ; xmm1=in2=(20 21 22 23) + + punpcklwd xmm2,xmm2 ; xmm2=(40 40 41 41 42 42 43 43) + punpcklwd xmm3,xmm3 ; xmm3=(60 60 61 61 62 62 63 63) + psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43) + psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63) + cvtdq2ps xmm2,xmm2 ; xmm2=in4=(40 41 42 43) + cvtdq2ps xmm3,xmm3 ; xmm3=in6=(60 61 62 63) + + mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm4,xmm0 + movaps xmm5,xmm1 + subps xmm0,xmm2 ; xmm0=tmp11 + subps xmm1,xmm3 + addps xmm4,xmm2 ; xmm4=tmp10 + addps xmm5,xmm3 ; xmm5=tmp13 + + mulps xmm1,[GOTOFF(ebx,PD_1_414)] + subps xmm1,xmm5 ; xmm1=tmp12 + + movaps xmm6,xmm4 + movaps xmm7,xmm0 + subps xmm4,xmm5 ; xmm4=tmp3 + subps xmm0,xmm1 ; xmm0=tmp2 + addps xmm6,xmm5 ; xmm6=tmp0 + addps xmm7,xmm1 ; xmm7=tmp1 + + movaps XMMWORD [wk(1)], xmm4 ; tmp3 + movaps XMMWORD [wk(0)], xmm0 ; tmp2 + + ; -- Odd part + + movq xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + movq xmm5, XMM_MMWORD 
[MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + + punpcklwd xmm2,xmm2 ; xmm2=(10 10 11 11 12 12 13 13) + punpcklwd xmm3,xmm3 ; xmm3=(30 30 31 31 32 32 33 33) + psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13) + psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33) + cvtdq2ps xmm2,xmm2 ; xmm2=in1=(10 11 12 13) + cvtdq2ps xmm3,xmm3 ; xmm3=in3=(30 31 32 33) + + punpcklwd xmm5,xmm5 ; xmm5=(50 50 51 51 52 52 53 53) + punpcklwd xmm1,xmm1 ; xmm1=(70 70 71 71 72 72 73 73) + psrad xmm5,(DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53) + psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73) + cvtdq2ps xmm5,xmm5 ; xmm5=in5=(50 51 52 53) + cvtdq2ps xmm1,xmm1 ; xmm1=in7=(70 71 72 73) + + mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm4,xmm2 + movaps xmm0,xmm5 + addps xmm2,xmm1 ; xmm2=z11 + addps xmm5,xmm3 ; xmm5=z13 + subps xmm4,xmm1 ; xmm4=z12 + subps xmm0,xmm3 ; xmm0=z10 + + movaps xmm1,xmm2 + subps xmm2,xmm5 + addps xmm1,xmm5 ; xmm1=tmp7 + + mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 + + movaps xmm3,xmm0 + addps xmm0,xmm4 + mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5 + mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) + mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) + addps xmm3,xmm0 ; xmm3=tmp12 + subps xmm4,xmm0 ; xmm4=tmp10 + + ; -- Final output stage + + subps xmm3,xmm1 ; xmm3=tmp6 + movaps xmm5,xmm6 + movaps xmm0,xmm7 + addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03) + addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13) + subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73) + subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63) + subps xmm2,xmm3 ; xmm2=tmp5 + + movaps xmm1,xmm6 ; transpose coefficients(phase 1) + unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11) + unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13) + 
movaps xmm3,xmm0 ; transpose coefficients(phase 1) + unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71) + unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73) + + movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 + movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3 + + movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71) + movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73) + + addps xmm4,xmm2 ; xmm4=tmp4 + movaps xmm0,xmm7 + movaps xmm3,xmm5 + addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23) + addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43) + subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53) + subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33) + + movaps xmm2,xmm7 ; transpose coefficients(phase 1) + unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31) + unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33) + movaps xmm4,xmm5 ; transpose coefficients(phase 1) + unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51) + unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53) + + movaps xmm3,xmm6 ; transpose coefficients(phase 2) + unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30) + unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31) + movaps xmm0,xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32) + unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33) + + movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) + movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) + + movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6 + movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0 + + movaps xmm6,xmm5 ; transpose coefficients(phase 2) + unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70) + unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71) + movaps xmm3,xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72) + unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73) + + movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 + movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4 + movaps XMMWORD 
[XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 .nextcolumn: - add esi, byte 4*SIZEOF_JCOEF ; coef_block - add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr - add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr - dec ecx ; ctr - jnz near .columnloop - - ; -- Prefetch the next coefficient block - - prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] - prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] - prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] - prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] - - ; ---- Pass 2: process rows from work array, store into output array. - - mov eax, [original_ebp] - lea esi, [workspace] ; FAST_FLOAT * wsptr - mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) - mov eax, JDIMENSION [output_col(eax)] - mov ecx, DCTSIZE/4 ; ctr - alignx 16,7 + add esi, byte 4*SIZEOF_JCOEF ; coef_block + add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr + add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr + dec ecx ; ctr + jnz near .columnloop + + ; -- Prefetch the next coefficient block + + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. 
+ + mov eax, [original_ebp] + lea esi, [workspace] ; FAST_FLOAT * wsptr + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + mov ecx, DCTSIZE/4 ; ctr + alignx 16,7 .rowloop: - ; -- Even part - - movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)] - - movaps xmm4,xmm0 - movaps xmm5,xmm1 - subps xmm0,xmm2 ; xmm0=tmp11 - subps xmm1,xmm3 - addps xmm4,xmm2 ; xmm4=tmp10 - addps xmm5,xmm3 ; xmm5=tmp13 - - mulps xmm1,[GOTOFF(ebx,PD_1_414)] - subps xmm1,xmm5 ; xmm1=tmp12 - - movaps xmm6,xmm4 - movaps xmm7,xmm0 - subps xmm4,xmm5 ; xmm4=tmp3 - subps xmm0,xmm1 ; xmm0=tmp2 - addps xmm6,xmm5 ; xmm6=tmp0 - addps xmm7,xmm1 ; xmm7=tmp1 - - movaps XMMWORD [wk(1)], xmm4 ; tmp3 - movaps XMMWORD [wk(0)], xmm0 ; tmp2 - - ; -- Odd part - - movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)] - - movaps xmm4,xmm2 - movaps xmm0,xmm5 - addps xmm2,xmm1 ; xmm2=z11 - addps xmm5,xmm3 ; xmm5=z13 - subps xmm4,xmm1 ; xmm4=z12 - subps xmm0,xmm3 ; xmm0=z10 - - movaps xmm1,xmm2 - subps xmm2,xmm5 - addps xmm1,xmm5 ; xmm1=tmp7 - - mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 - - movaps xmm3,xmm0 - addps xmm0,xmm4 - mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5 - mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) - mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) - addps xmm3,xmm0 ; xmm3=tmp12 - subps xmm4,xmm0 ; xmm4=tmp10 - - ; -- Final output stage - - subps xmm3,xmm1 ; xmm3=tmp6 - movaps xmm5,xmm6 - movaps xmm0,xmm7 - addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30) - addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31) - subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37) - subps 
xmm0,xmm3 ; xmm0=data6=(06 16 26 36) - subps xmm2,xmm3 ; xmm2=tmp5 - - movaps xmm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm1=[PD_RNDINT_MAGIC] - pcmpeqd xmm3,xmm3 - psrld xmm3,WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..} - - addps xmm6,xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **) - addps xmm7,xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **) - addps xmm0,xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **) - addps xmm5,xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **) - - pand xmm6,xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --) - pslld xmm7,WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31) - pand xmm0,xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --) - pslld xmm5,WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37) - por xmm6,xmm7 ; xmm6=(00 01 10 11 20 21 30 31) - por xmm0,xmm5 ; xmm0=(06 07 16 17 26 27 36 37) - - movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2 - movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3 - - addps xmm4,xmm2 ; xmm4=tmp4 - movaps xmm7,xmm1 - movaps xmm5,xmm3 - addps xmm1,xmm2 ; xmm1=data2=(02 12 22 32) - addps xmm3,xmm4 ; xmm3=data4=(04 14 24 34) - subps xmm7,xmm2 ; xmm7=data5=(05 15 25 35) - subps xmm5,xmm4 ; xmm5=data3=(03 13 23 33) - - movaps xmm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm2=[PD_RNDINT_MAGIC] - pcmpeqd xmm4,xmm4 - psrld xmm4,WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..} - - addps xmm3,xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **) - addps xmm7,xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **) - addps xmm1,xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **) - addps xmm5,xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **) - - pand xmm3,xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --) - pslld xmm7,WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35) - pand xmm1,xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --) - pslld xmm5,WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33) - por xmm3,xmm7 ; xmm3=(04 05 14 15 24 25 34 35) - por xmm1,xmm5 ; xmm1=(02 03 12 13 22 23 32 33) - - movdqa xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP] - - packsswb xmm6,xmm3 ; xmm6=(00 01 10 11 
20 21 30 31 04 05 14 15 24 25 34 35) - packsswb xmm1,xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37) - paddb xmm6,xmm2 - paddb xmm1,xmm2 - - movdqa xmm4,xmm6 ; transpose coefficients(phase 2) - punpcklwd xmm6,xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) - punpckhwd xmm4,xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) - - movdqa xmm7,xmm6 ; transpose coefficients(phase 3) - punpckldq xmm6,xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) - punpckhdq xmm7,xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) - - pshufd xmm5,xmm6,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) - pshufd xmm3,xmm7,0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) - - pushpic ebx ; save GOT address - - mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] - mov ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] - movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 - movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7 - mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] - mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] - movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5 - movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3 - - poppic ebx ; restore GOT address - - add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr - add edi, byte 4*SIZEOF_JSAMPROW - dec ecx ; ctr - jnz near .rowloop - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + ; -- Even part + + movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)] + + movaps xmm4,xmm0 + movaps xmm5,xmm1 + subps xmm0,xmm2 ; xmm0=tmp11 + subps xmm1,xmm3 + addps xmm4,xmm2 ; xmm4=tmp10 + addps xmm5,xmm3 ; xmm5=tmp13 + + mulps xmm1,[GOTOFF(ebx,PD_1_414)] + subps xmm1,xmm5 ; xmm1=tmp12 + + movaps xmm6,xmm4 + 
movaps xmm7,xmm0 + subps xmm4,xmm5 ; xmm4=tmp3 + subps xmm0,xmm1 ; xmm0=tmp2 + addps xmm6,xmm5 ; xmm6=tmp0 + addps xmm7,xmm1 ; xmm7=tmp1 + + movaps XMMWORD [wk(1)], xmm4 ; tmp3 + movaps XMMWORD [wk(0)], xmm0 ; tmp2 + + ; -- Odd part + + movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)] + + movaps xmm4,xmm2 + movaps xmm0,xmm5 + addps xmm2,xmm1 ; xmm2=z11 + addps xmm5,xmm3 ; xmm5=z13 + subps xmm4,xmm1 ; xmm4=z12 + subps xmm0,xmm3 ; xmm0=z10 + + movaps xmm1,xmm2 + subps xmm2,xmm5 + addps xmm1,xmm5 ; xmm1=tmp7 + + mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 + + movaps xmm3,xmm0 + addps xmm0,xmm4 + mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5 + mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) + mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) + addps xmm3,xmm0 ; xmm3=tmp12 + subps xmm4,xmm0 ; xmm4=tmp10 + + ; -- Final output stage + + subps xmm3,xmm1 ; xmm3=tmp6 + movaps xmm5,xmm6 + movaps xmm0,xmm7 + addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30) + addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31) + subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37) + subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36) + subps xmm2,xmm3 ; xmm2=tmp5 + + movaps xmm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm1=[PD_RNDINT_MAGIC] + pcmpeqd xmm3,xmm3 + psrld xmm3,WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..} + + addps xmm6,xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **) + addps xmm7,xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **) + addps xmm0,xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **) + addps xmm5,xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **) + + pand xmm6,xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --) + pslld xmm7,WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31) + pand xmm0,xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --) + pslld xmm5,WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37) + por xmm6,xmm7 ; 
xmm6=(00 01 10 11 20 21 30 31) + por xmm0,xmm5 ; xmm0=(06 07 16 17 26 27 36 37) + + movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2 + movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3 + + addps xmm4,xmm2 ; xmm4=tmp4 + movaps xmm7,xmm1 + movaps xmm5,xmm3 + addps xmm1,xmm2 ; xmm1=data2=(02 12 22 32) + addps xmm3,xmm4 ; xmm3=data4=(04 14 24 34) + subps xmm7,xmm2 ; xmm7=data5=(05 15 25 35) + subps xmm5,xmm4 ; xmm5=data3=(03 13 23 33) + + movaps xmm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm2=[PD_RNDINT_MAGIC] + pcmpeqd xmm4,xmm4 + psrld xmm4,WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..} + + addps xmm3,xmm2 ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **) + addps xmm7,xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **) + addps xmm1,xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **) + addps xmm5,xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **) + + pand xmm3,xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --) + pslld xmm7,WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35) + pand xmm1,xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --) + pslld xmm5,WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33) + por xmm3,xmm7 ; xmm3=(04 05 14 15 24 25 34 35) + por xmm1,xmm5 ; xmm1=(02 03 12 13 22 23 32 33) + + movdqa xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP] + + packsswb xmm6,xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35) + packsswb xmm1,xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37) + paddb xmm6,xmm2 + paddb xmm1,xmm2 + + movdqa xmm4,xmm6 ; transpose coefficients(phase 2) + punpcklwd xmm6,xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) + punpckhwd xmm4,xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + + movdqa xmm7,xmm6 ; transpose coefficients(phase 3) + punpckldq xmm6,xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm7,xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + + pshufd xmm5,xmm6,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + pshufd xmm3,xmm7,0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 
24 25 26 27) + + pushpic ebx ; save GOT address + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7 + mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5 + movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3 + + poppic ebx ; restore GOT address + + add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr + add edi, byte 4*SIZEOF_JSAMPROW + dec ecx ; ctr + jnz near .rowloop + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/jiss2fst-64.asm b/simd/jiss2fst-64.asm index 088750583..432e289d3 100644 --- a/simd/jiss2fst-64.asm +++ b/simd/jiss2fst-64.asm @@ -27,31 +27,31 @@ ; -------------------------------------------------------------------------- -%define CONST_BITS 8 ; 14 is also OK. -%define PASS1_BITS 2 +%define CONST_BITS 8 ; 14 is also OK. +%define PASS1_BITS 2 %if IFAST_SCALE_BITS != PASS1_BITS %error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'." %endif %if CONST_BITS == 8 -F_1_082 equ 277 ; FIX(1.082392200) -F_1_414 equ 362 ; FIX(1.414213562) -F_1_847 equ 473 ; FIX(1.847759065) -F_2_613 equ 669 ; FIX(2.613125930) -F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1) +F_1_082 equ 277 ; FIX(1.082392200) +F_1_414 equ 362 ; FIX(1.414213562) +F_1_847 equ 473 ; FIX(1.847759065) +F_2_613 equ 669 ; FIX(2.613125930) +F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1) %else ; NASM cannot do compile-time arithmetic on floating-point constants. 
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) -F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200) -F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562) -F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) -F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930) -F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) +%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) +F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200) +F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562) +F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) +F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930) +F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) %endif ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST ; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) ; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) @@ -59,22 +59,22 @@ F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) %define PRE_MULTIPLY_SCALE_BITS 2 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) - alignz 16 - global EXTN(jconst_idct_ifast_sse2) + alignz 16 + global EXTN(jconst_idct_ifast_sse2) EXTN(jconst_idct_ifast_sse2): -PW_F1414 times 8 dw F_1_414 << CONST_SHIFT -PW_F1847 times 8 dw F_1_847 << CONST_SHIFT -PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT -PW_F1082 times 8 dw F_1_082 << CONST_SHIFT -PB_CENTERJSAMP times 16 db CENTERJSAMPLE +PW_F1414 times 8 dw F_1_414 << CONST_SHIFT +PW_F1847 times 8 dw F_1_847 << CONST_SHIFT +PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT +PW_F1082 times 8 dw F_1_082 << CONST_SHIFT +PB_CENTERJSAMP times 16 db CENTERJSAMPLE - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 64 + SECTION SEG_TEXT + BITS 64 ; ; Perform dequantization and inverse DCT on one block of 
coefficients. ; @@ -88,405 +88,405 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; r12 = JSAMPARRAY output_buf ; r13 = JDIMENSION output_col -%define original_rbp rbp+0 -%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 2 +%define original_rbp rbp+0 +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 - align 16 - global EXTN(jsimd_idct_ifast_sse2) + align 16 + global EXTN(jsimd_idct_ifast_sse2) EXTN(jsimd_idct_ifast_sse2): - push rbp - mov rax,rsp ; rax = original rbp - sub rsp, byte 4 - and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp],rax - mov rbp,rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - collect_args + push rbp + mov rax,rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args - ; ---- Pass 1: process columns from input. + ; ---- Pass 1: process columns from input. - mov rdx, r10 ; quantptr - mov rsi, r11 ; inptr + mov rdx, r10 ; quantptr + mov rsi, r11 ; inptr %ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2 - mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] - jnz near .columnDCT - - movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] - por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] - por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] - por xmm1,xmm0 - packsswb xmm1,xmm1 - packsswb xmm1,xmm1 - movd eax,xmm1 - test rax,rax - jnz short .columnDCT - - ; -- AC terms all zero - - movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] - pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - - movdqa xmm7,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) - punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) - 
punpckhwd xmm7,xmm7 ; xmm7=(04 04 05 05 06 06 07 07) - - pshufd xmm6,xmm0,0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00) - pshufd xmm2,xmm0,0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01) - pshufd xmm5,xmm0,0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02) - pshufd xmm0,xmm0,0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03) - pshufd xmm1,xmm7,0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04) - pshufd xmm4,xmm7,0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05) - pshufd xmm3,xmm7,0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06) - pshufd xmm7,xmm7,0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07) - - movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1 - movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3 - jmp near .column_end + mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + por xmm1,xmm0 + packsswb xmm1,xmm1 + packsswb xmm1,xmm1 + movd eax,xmm1 + test rax,rax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + movdqa xmm7,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) + punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + punpckhwd xmm7,xmm7 ; xmm7=(04 04 05 05 06 06 07 07) + + pshufd xmm6,xmm0,0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00) + pshufd xmm2,xmm0,0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01) + pshufd xmm5,xmm0,0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02) + pshufd xmm0,xmm0,0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03) + pshufd xmm1,xmm7,0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04) + pshufd xmm4,xmm7,0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05) + pshufd xmm3,xmm7,0xAA ; 
xmm3=col6=(06 06 06 06 06 06 06 06) + pshufd xmm7,xmm7,0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3 + jmp near .column_end %endif .columnDCT: - ; -- Even part - - movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] - pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)] - pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_IFAST_MULT_TYPE)] - movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] - movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] - pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)] - pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)] - - movdqa xmm4,xmm0 - movdqa xmm5,xmm1 - psubw xmm0,xmm2 ; xmm0=tmp11 - psubw xmm1,xmm3 - paddw xmm4,xmm2 ; xmm4=tmp10 - paddw xmm5,xmm3 ; xmm5=tmp13 - - psllw xmm1,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm1,[rel PW_F1414] - psubw xmm1,xmm5 ; xmm1=tmp12 - - movdqa xmm6,xmm4 - movdqa xmm7,xmm0 - psubw xmm4,xmm5 ; xmm4=tmp3 - psubw xmm0,xmm1 ; xmm0=tmp2 - paddw xmm6,xmm5 ; xmm6=tmp0 - paddw xmm7,xmm1 ; xmm7=tmp1 - - movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3 - movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2 - - ; -- Odd part - - movdqa xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] - movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] - pmullw xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_IFAST_MULT_TYPE)] - pmullw xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_IFAST_MULT_TYPE)] - movdqa xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] - pmullw xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)] - pmullw xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)] - - movdqa xmm4,xmm2 - movdqa xmm0,xmm5 - psubw xmm2,xmm1 ; xmm2=z12 - psubw xmm5,xmm3 ; xmm5=z10 - paddw xmm4,xmm1 ; xmm4=z11 - paddw xmm0,xmm3 ; xmm0=z13 - - movdqa xmm1,xmm5 ; xmm1=z10(unscaled) - psllw xmm2,PRE_MULTIPLY_SCALE_BITS - psllw 
xmm5,PRE_MULTIPLY_SCALE_BITS - - movdqa xmm3,xmm4 - psubw xmm4,xmm0 - paddw xmm3,xmm0 ; xmm3=tmp7 - - psllw xmm4,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm4,[rel PW_F1414] ; xmm4=tmp11 - - ; To avoid overflow... - ; - ; (Original) - ; tmp12 = -2.613125930 * z10 + z5; - ; - ; (This implementation) - ; tmp12 = (-1.613125930 - 1) * z10 + z5; - ; = -1.613125930 * z10 - z10 + z5; - - movdqa xmm0,xmm5 - paddw xmm5,xmm2 - pmulhw xmm5,[rel PW_F1847] ; xmm5=z5 - pmulhw xmm0,[rel PW_MF1613] - pmulhw xmm2,[rel PW_F1082] - psubw xmm0,xmm1 - psubw xmm2,xmm5 ; xmm2=tmp10 - paddw xmm0,xmm5 ; xmm0=tmp12 - - ; -- Final output stage - - psubw xmm0,xmm3 ; xmm0=tmp6 - movdqa xmm1,xmm6 - movdqa xmm5,xmm7 - paddw xmm6,xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07) - paddw xmm7,xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17) - psubw xmm1,xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77) - psubw xmm5,xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67) - psubw xmm4,xmm0 ; xmm4=tmp5 - - movdqa xmm3,xmm6 ; transpose coefficients(phase 1) - punpcklwd xmm6,xmm7 ; xmm6=(00 10 01 11 02 12 03 13) - punpckhwd xmm3,xmm7 ; xmm3=(04 14 05 15 06 16 07 17) - movdqa xmm0,xmm5 ; transpose coefficients(phase 1) - punpcklwd xmm5,xmm1 ; xmm5=(60 70 61 71 62 72 63 73) - punpckhwd xmm0,xmm1 ; xmm0=(64 74 65 75 66 76 67 77) - - movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 - movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3 - - movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73) - movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77) - - paddw xmm2,xmm4 ; xmm2=tmp4 - movdqa xmm5,xmm7 - movdqa xmm0,xmm1 - paddw xmm7,xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27) - paddw xmm1,xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47) - psubw xmm5,xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57) - psubw xmm0,xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37) - - movdqa xmm4,xmm7 ; transpose coefficients(phase 1) - punpcklwd xmm7,xmm0 ; xmm7=(20 30 21 31 22 32 23 33) - punpckhwd xmm4,xmm0 ; xmm4=(24 34 25 35 26 36 27 37) - movdqa xmm2,xmm1 ; transpose 
coefficients(phase 1) - punpcklwd xmm1,xmm5 ; xmm1=(40 50 41 51 42 52 43 53) - punpckhwd xmm2,xmm5 ; xmm2=(44 54 45 55 46 56 47 57) - - movdqa xmm0,xmm3 ; transpose coefficients(phase 2) - punpckldq xmm3,xmm4 ; xmm3=(04 14 24 34 05 15 25 35) - punpckhdq xmm0,xmm4 ; xmm0=(06 16 26 36 07 17 27 37) - movdqa xmm5,xmm6 ; transpose coefficients(phase 2) - punpckldq xmm6,xmm7 ; xmm6=(00 10 20 30 01 11 21 31) - punpckhdq xmm5,xmm7 ; xmm5=(02 12 22 32 03 13 23 33) - - movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73) - movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77) - - movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35) - movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37) - - movdqa xmm3,xmm1 ; transpose coefficients(phase 2) - punpckldq xmm1,xmm4 ; xmm1=(40 50 60 70 41 51 61 71) - punpckhdq xmm3,xmm4 ; xmm3=(42 52 62 72 43 53 63 73) - movdqa xmm0,xmm2 ; transpose coefficients(phase 2) - punpckldq xmm2,xmm7 ; xmm2=(44 54 64 74 45 55 65 75) - punpckhdq xmm0,xmm7 ; xmm0=(46 56 66 76 47 57 67 77) - - movdqa xmm4,xmm6 ; transpose coefficients(phase 3) - punpcklqdq xmm6,xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70) - punpckhqdq xmm4,xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71) - movdqa xmm7,xmm5 ; transpose coefficients(phase 3) - punpcklqdq xmm5,xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72) - punpckhqdq xmm7,xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73) - - movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35) - movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37) - - movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1 - movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3 - - movdqa xmm4,xmm1 ; transpose coefficients(phase 3) - punpcklqdq xmm1,xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74) - punpckhqdq xmm4,xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75) - movdqa xmm7,xmm3 ; transpose coefficients(phase 3) - punpcklqdq xmm3,xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76) - punpckhqdq xmm7,xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77) + ; -- Even part + + 
movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + + movdqa xmm4,xmm0 + movdqa xmm5,xmm1 + psubw xmm0,xmm2 ; xmm0=tmp11 + psubw xmm1,xmm3 + paddw xmm4,xmm2 ; xmm4=tmp10 + paddw xmm5,xmm3 ; xmm5=tmp13 + + psllw xmm1,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm1,[rel PW_F1414] + psubw xmm1,xmm5 ; xmm1=tmp12 + + movdqa xmm6,xmm4 + movdqa xmm7,xmm0 + psubw xmm4,xmm5 ; xmm4=tmp3 + psubw xmm0,xmm1 ; xmm0=tmp2 + paddw xmm6,xmm5 ; xmm6=tmp0 + paddw xmm7,xmm1 ; xmm7=tmp1 + + movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3 + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2 + + ; -- Odd part + + movdqa xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + movdqa xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + pmullw xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)] + + movdqa xmm4,xmm2 + movdqa xmm0,xmm5 + psubw xmm2,xmm1 ; xmm2=z12 + psubw xmm5,xmm3 ; xmm5=z10 + paddw xmm4,xmm1 ; xmm4=z11 + paddw xmm0,xmm3 ; xmm0=z13 + + movdqa xmm1,xmm5 ; xmm1=z10(unscaled) + psllw xmm2,PRE_MULTIPLY_SCALE_BITS + psllw xmm5,PRE_MULTIPLY_SCALE_BITS + + movdqa xmm3,xmm4 + psubw xmm4,xmm0 + paddw xmm3,xmm0 ; xmm3=tmp7 + + psllw xmm4,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm4,[rel PW_F1414] ; xmm4=tmp11 + + ; To avoid overflow... 
+ ; + ; (Original) + ; tmp12 = -2.613125930 * z10 + z5; + ; + ; (This implementation) + ; tmp12 = (-1.613125930 - 1) * z10 + z5; + ; = -1.613125930 * z10 - z10 + z5; + + movdqa xmm0,xmm5 + paddw xmm5,xmm2 + pmulhw xmm5,[rel PW_F1847] ; xmm5=z5 + pmulhw xmm0,[rel PW_MF1613] + pmulhw xmm2,[rel PW_F1082] + psubw xmm0,xmm1 + psubw xmm2,xmm5 ; xmm2=tmp10 + paddw xmm0,xmm5 ; xmm0=tmp12 + + ; -- Final output stage + + psubw xmm0,xmm3 ; xmm0=tmp6 + movdqa xmm1,xmm6 + movdqa xmm5,xmm7 + paddw xmm6,xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07) + paddw xmm7,xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17) + psubw xmm1,xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77) + psubw xmm5,xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67) + psubw xmm4,xmm0 ; xmm4=tmp5 + + movdqa xmm3,xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6,xmm7 ; xmm6=(00 10 01 11 02 12 03 13) + punpckhwd xmm3,xmm7 ; xmm3=(04 14 05 15 06 16 07 17) + movdqa xmm0,xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5,xmm1 ; xmm5=(60 70 61 71 62 72 63 73) + punpckhwd xmm0,xmm1 ; xmm0=(64 74 65 75 66 76 67 77) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3 + + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73) + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77) + + paddw xmm2,xmm4 ; xmm2=tmp4 + movdqa xmm5,xmm7 + movdqa xmm0,xmm1 + paddw xmm7,xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27) + paddw xmm1,xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47) + psubw xmm5,xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57) + psubw xmm0,xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37) + + movdqa xmm4,xmm7 ; transpose coefficients(phase 1) + punpcklwd xmm7,xmm0 ; xmm7=(20 30 21 31 22 32 23 33) + punpckhwd xmm4,xmm0 ; xmm4=(24 34 25 35 26 36 27 37) + movdqa xmm2,xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1,xmm5 ; xmm1=(40 50 41 51 42 52 43 53) + punpckhwd xmm2,xmm5 ; xmm2=(44 54 45 55 46 56 47 57) + + movdqa xmm0,xmm3 ; transpose coefficients(phase 2) + punpckldq xmm3,xmm4 ; 
xmm3=(04 14 24 34 05 15 25 35) + punpckhdq xmm0,xmm4 ; xmm0=(06 16 26 36 07 17 27 37) + movdqa xmm5,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm7 ; xmm6=(00 10 20 30 01 11 21 31) + punpckhdq xmm5,xmm7 ; xmm5=(02 12 22 32 03 13 23 33) + + movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73) + movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77) + + movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35) + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37) + + movdqa xmm3,xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1,xmm4 ; xmm1=(40 50 60 70 41 51 61 71) + punpckhdq xmm3,xmm4 ; xmm3=(42 52 62 72 43 53 63 73) + movdqa xmm0,xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2,xmm7 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm0,xmm7 ; xmm0=(46 56 66 76 47 57 67 77) + + movdqa xmm4,xmm6 ; transpose coefficients(phase 3) + punpcklqdq xmm6,xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70) + punpckhqdq xmm4,xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71) + movdqa xmm7,xmm5 ; transpose coefficients(phase 3) + punpcklqdq xmm5,xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72) + punpckhqdq xmm7,xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35) + movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37) + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1 + movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3 + + movdqa xmm4,xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1,xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74) + punpckhqdq xmm4,xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75) + movdqa xmm7,xmm3 ; transpose coefficients(phase 3) + punpcklqdq xmm3,xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76) + punpckhqdq xmm7,xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77) .column_end: - ; -- Prefetch the next coefficient block - - prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] - prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] - prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] - prefetchnta [rsi + 
DCTSIZE2*SIZEOF_JCOEF + 3*32] - - ; ---- Pass 2: process rows from work array, store into output array. - - mov rax, [original_rbp] - mov rdi, r12 ; (JSAMPROW *) - mov rax, r13 - - ; -- Even part - - ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6 - - movdqa xmm2,xmm6 - movdqa xmm0,xmm5 - psubw xmm6,xmm1 ; xmm6=tmp11 - psubw xmm5,xmm3 - paddw xmm2,xmm1 ; xmm2=tmp10 - paddw xmm0,xmm3 ; xmm0=tmp13 - - psllw xmm5,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm5,[rel PW_F1414] - psubw xmm5,xmm0 ; xmm5=tmp12 - - movdqa xmm1,xmm2 - movdqa xmm3,xmm6 - psubw xmm2,xmm0 ; xmm2=tmp3 - psubw xmm6,xmm5 ; xmm6=tmp2 - paddw xmm1,xmm0 ; xmm1=tmp0 - paddw xmm3,xmm5 ; xmm3=tmp1 - - movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1 - movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3 - - movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3 - movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2 - - ; -- Odd part - - ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7 - - movdqa xmm2,xmm0 - movdqa xmm6,xmm4 - psubw xmm0,xmm7 ; xmm0=z12 - psubw xmm4,xmm5 ; xmm4=z10 - paddw xmm2,xmm7 ; xmm2=z11 - paddw xmm6,xmm5 ; xmm6=z13 - - movdqa xmm7,xmm4 ; xmm7=z10(unscaled) - psllw xmm0,PRE_MULTIPLY_SCALE_BITS - psllw xmm4,PRE_MULTIPLY_SCALE_BITS - - movdqa xmm5,xmm2 - psubw xmm2,xmm6 - paddw xmm5,xmm6 ; xmm5=tmp7 - - psllw xmm2,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm2,[rel PW_F1414] ; xmm2=tmp11 - - ; To avoid overflow... 
- ; - ; (Original) - ; tmp12 = -2.613125930 * z10 + z5; - ; - ; (This implementation) - ; tmp12 = (-1.613125930 - 1) * z10 + z5; - ; = -1.613125930 * z10 - z10 + z5; - - movdqa xmm6,xmm4 - paddw xmm4,xmm0 - pmulhw xmm4,[rel PW_F1847] ; xmm4=z5 - pmulhw xmm6,[rel PW_MF1613] - pmulhw xmm0,[rel PW_F1082] - psubw xmm6,xmm7 - psubw xmm0,xmm4 ; xmm0=tmp10 - paddw xmm6,xmm4 ; xmm6=tmp12 - - ; -- Final output stage - - psubw xmm6,xmm5 ; xmm6=tmp6 - movdqa xmm7,xmm1 - movdqa xmm4,xmm3 - paddw xmm1,xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70) - paddw xmm3,xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71) - psraw xmm1,(PASS1_BITS+3) ; descale - psraw xmm3,(PASS1_BITS+3) ; descale - psubw xmm7,xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77) - psubw xmm4,xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76) - psraw xmm7,(PASS1_BITS+3) ; descale - psraw xmm4,(PASS1_BITS+3) ; descale - psubw xmm2,xmm6 ; xmm2=tmp5 - - packsswb xmm1,xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) - packsswb xmm3,xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) - - movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2 - movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3 - - paddw xmm0,xmm2 ; xmm0=tmp4 - movdqa xmm4,xmm5 - movdqa xmm7,xmm6 - paddw xmm5,xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72) - paddw xmm6,xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74) - psraw xmm5,(PASS1_BITS+3) ; descale - psraw xmm6,(PASS1_BITS+3) ; descale - psubw xmm4,xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75) - psubw xmm7,xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73) - psraw xmm4,(PASS1_BITS+3) ; descale - psraw xmm7,(PASS1_BITS+3) ; descale - - movdqa xmm2,[rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP] - - packsswb xmm5,xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) - packsswb xmm7,xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) - - paddb xmm1,xmm2 - paddb xmm3,xmm2 - paddb xmm5,xmm2 - paddb xmm7,xmm2 - - movdqa xmm0,xmm1 ; transpose coefficients(phase 1) - punpcklbw xmm1,xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 
50 51 60 61 70 71) - punpckhbw xmm0,xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) - movdqa xmm6,xmm5 ; transpose coefficients(phase 1) - punpcklbw xmm5,xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) - punpckhbw xmm6,xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) - - movdqa xmm4,xmm1 ; transpose coefficients(phase 2) - punpcklwd xmm1,xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) - punpckhwd xmm4,xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) - movdqa xmm2,xmm6 ; transpose coefficients(phase 2) - punpcklwd xmm6,xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) - punpckhwd xmm2,xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) - - movdqa xmm3,xmm1 ; transpose coefficients(phase 3) - punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) - punpckhdq xmm3,xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) - movdqa xmm7,xmm4 ; transpose coefficients(phase 3) - punpckldq xmm4,xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) - punpckhdq xmm7,xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) - - pshufd xmm5,xmm1,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) - pshufd xmm0,xmm3,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) - pshufd xmm6,xmm4,0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) - pshufd xmm2,xmm7,0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) - - mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] - mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] - movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1 - movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 - mov rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW] - mov rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW] - movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4 - movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7 - - mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] - mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] - movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5 - 
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0 - mov rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW] - mov rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW] - movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 - movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2 - - uncollect_args - mov rsp,rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp - pop rbp - ret - ret + ; -- Prefetch the next coefficient block + + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. + + mov rax, [original_rbp] + mov rdi, r12 ; (JSAMPROW *) + mov rax, r13 + + ; -- Even part + + ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6 + + movdqa xmm2,xmm6 + movdqa xmm0,xmm5 + psubw xmm6,xmm1 ; xmm6=tmp11 + psubw xmm5,xmm3 + paddw xmm2,xmm1 ; xmm2=tmp10 + paddw xmm0,xmm3 ; xmm0=tmp13 + + psllw xmm5,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5,[rel PW_F1414] + psubw xmm5,xmm0 ; xmm5=tmp12 + + movdqa xmm1,xmm2 + movdqa xmm3,xmm6 + psubw xmm2,xmm0 ; xmm2=tmp3 + psubw xmm6,xmm5 ; xmm6=tmp2 + paddw xmm1,xmm0 ; xmm1=tmp0 + paddw xmm3,xmm5 ; xmm3=tmp1 + + movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3 + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3 + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2 + + ; -- Odd part + + ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7 + + movdqa xmm2,xmm0 + movdqa xmm6,xmm4 + psubw xmm0,xmm7 ; xmm0=z12 + psubw xmm4,xmm5 ; xmm4=z10 + paddw xmm2,xmm7 ; xmm2=z11 + paddw xmm6,xmm5 ; xmm6=z13 + + movdqa xmm7,xmm4 ; xmm7=z10(unscaled) + psllw xmm0,PRE_MULTIPLY_SCALE_BITS + psllw xmm4,PRE_MULTIPLY_SCALE_BITS + + movdqa xmm5,xmm2 + psubw xmm2,xmm6 + paddw xmm5,xmm6 ; xmm5=tmp7 + + psllw xmm2,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm2,[rel PW_F1414] ; xmm2=tmp11 + + ; To avoid overflow... 
+ ; + ; (Original) + ; tmp12 = -2.613125930 * z10 + z5; + ; + ; (This implementation) + ; tmp12 = (-1.613125930 - 1) * z10 + z5; + ; = -1.613125930 * z10 - z10 + z5; + + movdqa xmm6,xmm4 + paddw xmm4,xmm0 + pmulhw xmm4,[rel PW_F1847] ; xmm4=z5 + pmulhw xmm6,[rel PW_MF1613] + pmulhw xmm0,[rel PW_F1082] + psubw xmm6,xmm7 + psubw xmm0,xmm4 ; xmm0=tmp10 + paddw xmm6,xmm4 ; xmm6=tmp12 + + ; -- Final output stage + + psubw xmm6,xmm5 ; xmm6=tmp6 + movdqa xmm7,xmm1 + movdqa xmm4,xmm3 + paddw xmm1,xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70) + paddw xmm3,xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71) + psraw xmm1,(PASS1_BITS+3) ; descale + psraw xmm3,(PASS1_BITS+3) ; descale + psubw xmm7,xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77) + psubw xmm4,xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76) + psraw xmm7,(PASS1_BITS+3) ; descale + psraw xmm4,(PASS1_BITS+3) ; descale + psubw xmm2,xmm6 ; xmm2=tmp5 + + packsswb xmm1,xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + packsswb xmm3,xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2 + movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3 + + paddw xmm0,xmm2 ; xmm0=tmp4 + movdqa xmm4,xmm5 + movdqa xmm7,xmm6 + paddw xmm5,xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72) + paddw xmm6,xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74) + psraw xmm5,(PASS1_BITS+3) ; descale + psraw xmm6,(PASS1_BITS+3) ; descale + psubw xmm4,xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75) + psubw xmm7,xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73) + psraw xmm4,(PASS1_BITS+3) ; descale + psraw xmm7,(PASS1_BITS+3) ; descale + + movdqa xmm2,[rel PB_CENTERJSAMP] ; xmm2=[rel PB_CENTERJSAMP] + + packsswb xmm5,xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) + packsswb xmm7,xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) + + paddb xmm1,xmm2 + paddb xmm3,xmm2 + paddb xmm5,xmm2 + paddb xmm7,xmm2 + + movdqa xmm0,xmm1 ; transpose coefficients(phase 1) + punpcklbw xmm1,xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 
50 51 60 61 70 71) + punpckhbw xmm0,xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) + movdqa xmm6,xmm5 ; transpose coefficients(phase 1) + punpcklbw xmm5,xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) + punpckhbw xmm6,xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) + + movdqa xmm4,xmm1 ; transpose coefficients(phase 2) + punpcklwd xmm1,xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) + punpckhwd xmm4,xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) + movdqa xmm2,xmm6 ; transpose coefficients(phase 2) + punpcklwd xmm6,xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + punpckhwd xmm2,xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) + + movdqa xmm3,xmm1 ; transpose coefficients(phase 3) + punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm3,xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + movdqa xmm7,xmm4 ; transpose coefficients(phase 3) + punpckldq xmm4,xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) + punpckhdq xmm7,xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) + + pshufd xmm5,xmm1,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + pshufd xmm0,xmm3,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + pshufd xmm6,xmm4,0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) + pshufd xmm2,xmm7,0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) + + mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 + mov rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7 + + mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5 + 
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0 + mov rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2 + + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/jiss2fst.asm b/simd/jiss2fst.asm index b53664d7c..0312be206 100644 --- a/simd/jiss2fst.asm +++ b/simd/jiss2fst.asm @@ -26,31 +26,31 @@ ; -------------------------------------------------------------------------- -%define CONST_BITS 8 ; 14 is also OK. -%define PASS1_BITS 2 +%define CONST_BITS 8 ; 14 is also OK. +%define PASS1_BITS 2 %if IFAST_SCALE_BITS != PASS1_BITS %error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'." %endif %if CONST_BITS == 8 -F_1_082 equ 277 ; FIX(1.082392200) -F_1_414 equ 362 ; FIX(1.414213562) -F_1_847 equ 473 ; FIX(1.847759065) -F_2_613 equ 669 ; FIX(2.613125930) -F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1) +F_1_082 equ 277 ; FIX(1.082392200) +F_1_414 equ 362 ; FIX(1.414213562) +F_1_847 equ 473 ; FIX(1.847759065) +F_2_613 equ 669 ; FIX(2.613125930) +F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1) %else ; NASM cannot do compile-time arithmetic on floating-point constants. 
-%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) -F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200) -F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562) -F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) -F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930) -F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) +%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) +F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200) +F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562) +F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) +F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930) +F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) %endif ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST ; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) ; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) @@ -58,22 +58,22 @@ F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) %define PRE_MULTIPLY_SCALE_BITS 2 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) - alignz 16 - global EXTN(jconst_idct_ifast_sse2) + alignz 16 + global EXTN(jconst_idct_ifast_sse2) EXTN(jconst_idct_ifast_sse2): -PW_F1414 times 8 dw F_1_414 << CONST_SHIFT -PW_F1847 times 8 dw F_1_847 << CONST_SHIFT -PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT -PW_F1082 times 8 dw F_1_082 << CONST_SHIFT -PB_CENTERJSAMP times 16 db CENTERJSAMPLE +PW_F1414 times 8 dw F_1_414 << CONST_SHIFT +PW_F1847 times 8 dw F_1_847 << CONST_SHIFT +PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT +PW_F1082 times 8 dw F_1_082 << CONST_SHIFT +PB_CENTERJSAMP times 16 db CENTERJSAMPLE - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 ; ; Perform dequantization and inverse DCT on one block of 
coefficients. ; @@ -82,421 +82,421 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; JSAMPARRAY output_buf, JDIMENSION output_col) ; -%define dct_table(b) (b)+8 ; jpeg_component_info * compptr -%define coef_block(b) (b)+12 ; JCOEFPTR coef_block -%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf -%define output_col(b) (b)+20 ; JDIMENSION output_col +%define dct_table(b) (b)+8 ; jpeg_component_info * compptr +%define coef_block(b) (b)+12 ; JCOEFPTR coef_block +%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf +%define output_col(b) (b)+20 ; JDIMENSION output_col -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 2 +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 - align 16 - global EXTN(jsimd_idct_ifast_sse2) + align 16 + global EXTN(jsimd_idct_ifast_sse2) EXTN(jsimd_idct_ifast_sse2): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic ebx -; push ecx ; unused -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process columns from input. - -; mov eax, [original_ebp] - mov edx, POINTER [dct_table(eax)] ; quantptr - mov esi, JCOEFPTR [coef_block(eax)] ; inptr + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input. 
+ +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr %ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2 - mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] - jnz near .columnDCT - - movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] - por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] - por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] - por xmm1,xmm0 - packsswb xmm1,xmm1 - packsswb xmm1,xmm1 - movd eax,xmm1 - test eax,eax - jnz short .columnDCT - - ; -- AC terms all zero - - movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] - pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - - movdqa xmm7,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) - punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) - punpckhwd xmm7,xmm7 ; xmm7=(04 04 05 05 06 06 07 07) - - pshufd xmm6,xmm0,0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00) - pshufd xmm2,xmm0,0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01) - pshufd xmm5,xmm0,0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02) - pshufd xmm0,xmm0,0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03) - pshufd xmm1,xmm7,0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04) - pshufd xmm4,xmm7,0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05) - pshufd xmm3,xmm7,0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06) - pshufd xmm7,xmm7,0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07) - - movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1 - movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3 - jmp near .column_end - alignx 16,7 + mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD 
[XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por xmm1,xmm0 + packsswb xmm1,xmm1 + packsswb xmm1,xmm1 + movd eax,xmm1 + test eax,eax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + movdqa xmm7,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) + punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + punpckhwd xmm7,xmm7 ; xmm7=(04 04 05 05 06 06 07 07) + + pshufd xmm6,xmm0,0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00) + pshufd xmm2,xmm0,0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01) + pshufd xmm5,xmm0,0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02) + pshufd xmm0,xmm0,0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03) + pshufd xmm1,xmm7,0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04) + pshufd xmm4,xmm7,0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05) + pshufd xmm3,xmm7,0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06) + pshufd xmm7,xmm7,0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3 + jmp near .column_end + alignx 16,7 %endif .columnDCT: - ; -- Even part - - movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] - pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)] - pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)] - movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] - movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] - pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)] - pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)] - - movdqa xmm4,xmm0 - movdqa xmm5,xmm1 - psubw xmm0,xmm2 ; xmm0=tmp11 - psubw xmm1,xmm3 - paddw xmm4,xmm2 ; xmm4=tmp10 - paddw xmm5,xmm3 ; xmm5=tmp13 - - psllw 
xmm1,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm1,[GOTOFF(ebx,PW_F1414)] - psubw xmm1,xmm5 ; xmm1=tmp12 - - movdqa xmm6,xmm4 - movdqa xmm7,xmm0 - psubw xmm4,xmm5 ; xmm4=tmp3 - psubw xmm0,xmm1 ; xmm0=tmp2 - paddw xmm6,xmm5 ; xmm6=tmp0 - paddw xmm7,xmm1 ; xmm7=tmp1 - - movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3 - movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2 - - ; -- Odd part - - movdqa xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movdqa xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] - pmullw xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)] - pmullw xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)] - movdqa xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] - pmullw xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)] - pmullw xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)] - - movdqa xmm4,xmm2 - movdqa xmm0,xmm5 - psubw xmm2,xmm1 ; xmm2=z12 - psubw xmm5,xmm3 ; xmm5=z10 - paddw xmm4,xmm1 ; xmm4=z11 - paddw xmm0,xmm3 ; xmm0=z13 - - movdqa xmm1,xmm5 ; xmm1=z10(unscaled) - psllw xmm2,PRE_MULTIPLY_SCALE_BITS - psllw xmm5,PRE_MULTIPLY_SCALE_BITS - - movdqa xmm3,xmm4 - psubw xmm4,xmm0 - paddw xmm3,xmm0 ; xmm3=tmp7 - - psllw xmm4,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm4,[GOTOFF(ebx,PW_F1414)] ; xmm4=tmp11 - - ; To avoid overflow... 
- ; - ; (Original) - ; tmp12 = -2.613125930 * z10 + z5; - ; - ; (This implementation) - ; tmp12 = (-1.613125930 - 1) * z10 + z5; - ; = -1.613125930 * z10 - z10 + z5; - - movdqa xmm0,xmm5 - paddw xmm5,xmm2 - pmulhw xmm5,[GOTOFF(ebx,PW_F1847)] ; xmm5=z5 - pmulhw xmm0,[GOTOFF(ebx,PW_MF1613)] - pmulhw xmm2,[GOTOFF(ebx,PW_F1082)] - psubw xmm0,xmm1 - psubw xmm2,xmm5 ; xmm2=tmp10 - paddw xmm0,xmm5 ; xmm0=tmp12 - - ; -- Final output stage - - psubw xmm0,xmm3 ; xmm0=tmp6 - movdqa xmm1,xmm6 - movdqa xmm5,xmm7 - paddw xmm6,xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07) - paddw xmm7,xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17) - psubw xmm1,xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77) - psubw xmm5,xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67) - psubw xmm4,xmm0 ; xmm4=tmp5 - - movdqa xmm3,xmm6 ; transpose coefficients(phase 1) - punpcklwd xmm6,xmm7 ; xmm6=(00 10 01 11 02 12 03 13) - punpckhwd xmm3,xmm7 ; xmm3=(04 14 05 15 06 16 07 17) - movdqa xmm0,xmm5 ; transpose coefficients(phase 1) - punpcklwd xmm5,xmm1 ; xmm5=(60 70 61 71 62 72 63 73) - punpckhwd xmm0,xmm1 ; xmm0=(64 74 65 75 66 76 67 77) - - movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 - movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3 - - movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73) - movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77) - - paddw xmm2,xmm4 ; xmm2=tmp4 - movdqa xmm5,xmm7 - movdqa xmm0,xmm1 - paddw xmm7,xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27) - paddw xmm1,xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47) - psubw xmm5,xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57) - psubw xmm0,xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37) - - movdqa xmm4,xmm7 ; transpose coefficients(phase 1) - punpcklwd xmm7,xmm0 ; xmm7=(20 30 21 31 22 32 23 33) - punpckhwd xmm4,xmm0 ; xmm4=(24 34 25 35 26 36 27 37) - movdqa xmm2,xmm1 ; transpose coefficients(phase 1) - punpcklwd xmm1,xmm5 ; xmm1=(40 50 41 51 42 52 43 53) - punpckhwd xmm2,xmm5 ; xmm2=(44 54 45 55 46 56 47 57) - - movdqa xmm0,xmm3 ; transpose coefficients(phase 2) - 
punpckldq xmm3,xmm4 ; xmm3=(04 14 24 34 05 15 25 35) - punpckhdq xmm0,xmm4 ; xmm0=(06 16 26 36 07 17 27 37) - movdqa xmm5,xmm6 ; transpose coefficients(phase 2) - punpckldq xmm6,xmm7 ; xmm6=(00 10 20 30 01 11 21 31) - punpckhdq xmm5,xmm7 ; xmm5=(02 12 22 32 03 13 23 33) - - movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73) - movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77) - - movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35) - movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37) - - movdqa xmm3,xmm1 ; transpose coefficients(phase 2) - punpckldq xmm1,xmm4 ; xmm1=(40 50 60 70 41 51 61 71) - punpckhdq xmm3,xmm4 ; xmm3=(42 52 62 72 43 53 63 73) - movdqa xmm0,xmm2 ; transpose coefficients(phase 2) - punpckldq xmm2,xmm7 ; xmm2=(44 54 64 74 45 55 65 75) - punpckhdq xmm0,xmm7 ; xmm0=(46 56 66 76 47 57 67 77) - - movdqa xmm4,xmm6 ; transpose coefficients(phase 3) - punpcklqdq xmm6,xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70) - punpckhqdq xmm4,xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71) - movdqa xmm7,xmm5 ; transpose coefficients(phase 3) - punpcklqdq xmm5,xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72) - punpckhqdq xmm7,xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73) - - movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35) - movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37) - - movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1 - movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3 - - movdqa xmm4,xmm1 ; transpose coefficients(phase 3) - punpcklqdq xmm1,xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74) - punpckhqdq xmm4,xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75) - movdqa xmm7,xmm3 ; transpose coefficients(phase 3) - punpcklqdq xmm3,xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76) - punpckhqdq xmm7,xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77) + ; -- Even part + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm1, 
XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)] + + movdqa xmm4,xmm0 + movdqa xmm5,xmm1 + psubw xmm0,xmm2 ; xmm0=tmp11 + psubw xmm1,xmm3 + paddw xmm4,xmm2 ; xmm4=tmp10 + paddw xmm5,xmm3 ; xmm5=tmp13 + + psllw xmm1,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm1,[GOTOFF(ebx,PW_F1414)] + psubw xmm1,xmm5 ; xmm1=tmp12 + + movdqa xmm6,xmm4 + movdqa xmm7,xmm0 + psubw xmm4,xmm5 ; xmm4=tmp3 + psubw xmm0,xmm1 ; xmm0=tmp2 + paddw xmm6,xmm5 ; xmm6=tmp0 + paddw xmm7,xmm1 ; xmm7=tmp1 + + movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3 + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2 + + ; -- Odd part + + movdqa xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)] + movdqa xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)] + + movdqa xmm4,xmm2 + movdqa xmm0,xmm5 + psubw xmm2,xmm1 ; xmm2=z12 + psubw xmm5,xmm3 ; xmm5=z10 + paddw xmm4,xmm1 ; xmm4=z11 + paddw xmm0,xmm3 ; xmm0=z13 + + movdqa xmm1,xmm5 ; xmm1=z10(unscaled) + psllw xmm2,PRE_MULTIPLY_SCALE_BITS + psllw xmm5,PRE_MULTIPLY_SCALE_BITS + + movdqa xmm3,xmm4 + psubw xmm4,xmm0 + paddw xmm3,xmm0 ; xmm3=tmp7 + + psllw xmm4,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm4,[GOTOFF(ebx,PW_F1414)] ; xmm4=tmp11 + + ; To avoid overflow... 
+ ; + ; (Original) + ; tmp12 = -2.613125930 * z10 + z5; + ; + ; (This implementation) + ; tmp12 = (-1.613125930 - 1) * z10 + z5; + ; = -1.613125930 * z10 - z10 + z5; + + movdqa xmm0,xmm5 + paddw xmm5,xmm2 + pmulhw xmm5,[GOTOFF(ebx,PW_F1847)] ; xmm5=z5 + pmulhw xmm0,[GOTOFF(ebx,PW_MF1613)] + pmulhw xmm2,[GOTOFF(ebx,PW_F1082)] + psubw xmm0,xmm1 + psubw xmm2,xmm5 ; xmm2=tmp10 + paddw xmm0,xmm5 ; xmm0=tmp12 + + ; -- Final output stage + + psubw xmm0,xmm3 ; xmm0=tmp6 + movdqa xmm1,xmm6 + movdqa xmm5,xmm7 + paddw xmm6,xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07) + paddw xmm7,xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17) + psubw xmm1,xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77) + psubw xmm5,xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67) + psubw xmm4,xmm0 ; xmm4=tmp5 + + movdqa xmm3,xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6,xmm7 ; xmm6=(00 10 01 11 02 12 03 13) + punpckhwd xmm3,xmm7 ; xmm3=(04 14 05 15 06 16 07 17) + movdqa xmm0,xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5,xmm1 ; xmm5=(60 70 61 71 62 72 63 73) + punpckhwd xmm0,xmm1 ; xmm0=(64 74 65 75 66 76 67 77) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3 + + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73) + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77) + + paddw xmm2,xmm4 ; xmm2=tmp4 + movdqa xmm5,xmm7 + movdqa xmm0,xmm1 + paddw xmm7,xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27) + paddw xmm1,xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47) + psubw xmm5,xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57) + psubw xmm0,xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37) + + movdqa xmm4,xmm7 ; transpose coefficients(phase 1) + punpcklwd xmm7,xmm0 ; xmm7=(20 30 21 31 22 32 23 33) + punpckhwd xmm4,xmm0 ; xmm4=(24 34 25 35 26 36 27 37) + movdqa xmm2,xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1,xmm5 ; xmm1=(40 50 41 51 42 52 43 53) + punpckhwd xmm2,xmm5 ; xmm2=(44 54 45 55 46 56 47 57) + + movdqa xmm0,xmm3 ; transpose coefficients(phase 2) + 
punpckldq xmm3,xmm4 ; xmm3=(04 14 24 34 05 15 25 35) + punpckhdq xmm0,xmm4 ; xmm0=(06 16 26 36 07 17 27 37) + movdqa xmm5,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm7 ; xmm6=(00 10 20 30 01 11 21 31) + punpckhdq xmm5,xmm7 ; xmm5=(02 12 22 32 03 13 23 33) + + movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73) + movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77) + + movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35) + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37) + + movdqa xmm3,xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1,xmm4 ; xmm1=(40 50 60 70 41 51 61 71) + punpckhdq xmm3,xmm4 ; xmm3=(42 52 62 72 43 53 63 73) + movdqa xmm0,xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2,xmm7 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm0,xmm7 ; xmm0=(46 56 66 76 47 57 67 77) + + movdqa xmm4,xmm6 ; transpose coefficients(phase 3) + punpcklqdq xmm6,xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70) + punpckhqdq xmm4,xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71) + movdqa xmm7,xmm5 ; transpose coefficients(phase 3) + punpcklqdq xmm5,xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72) + punpckhqdq xmm7,xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35) + movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37) + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1 + movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3 + + movdqa xmm4,xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1,xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74) + punpckhqdq xmm4,xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75) + movdqa xmm7,xmm3 ; transpose coefficients(phase 3) + punpcklqdq xmm3,xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76) + punpckhqdq xmm7,xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77) .column_end: - ; -- Prefetch the next coefficient block - - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] - 
prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] - - ; ---- Pass 2: process rows from work array, store into output array. - - mov eax, [original_ebp] - mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) - mov eax, JDIMENSION [output_col(eax)] - - ; -- Even part - - ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6 - - movdqa xmm2,xmm6 - movdqa xmm0,xmm5 - psubw xmm6,xmm1 ; xmm6=tmp11 - psubw xmm5,xmm3 - paddw xmm2,xmm1 ; xmm2=tmp10 - paddw xmm0,xmm3 ; xmm0=tmp13 - - psllw xmm5,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm5,[GOTOFF(ebx,PW_F1414)] - psubw xmm5,xmm0 ; xmm5=tmp12 - - movdqa xmm1,xmm2 - movdqa xmm3,xmm6 - psubw xmm2,xmm0 ; xmm2=tmp3 - psubw xmm6,xmm5 ; xmm6=tmp2 - paddw xmm1,xmm0 ; xmm1=tmp0 - paddw xmm3,xmm5 ; xmm3=tmp1 - - movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1 - movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3 - - movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3 - movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2 - - ; -- Odd part - - ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7 - - movdqa xmm2,xmm0 - movdqa xmm6,xmm4 - psubw xmm0,xmm7 ; xmm0=z12 - psubw xmm4,xmm5 ; xmm4=z10 - paddw xmm2,xmm7 ; xmm2=z11 - paddw xmm6,xmm5 ; xmm6=z13 - - movdqa xmm7,xmm4 ; xmm7=z10(unscaled) - psllw xmm0,PRE_MULTIPLY_SCALE_BITS - psllw xmm4,PRE_MULTIPLY_SCALE_BITS - - movdqa xmm5,xmm2 - psubw xmm2,xmm6 - paddw xmm5,xmm6 ; xmm5=tmp7 - - psllw xmm2,PRE_MULTIPLY_SCALE_BITS - pmulhw xmm2,[GOTOFF(ebx,PW_F1414)] ; xmm2=tmp11 - - ; To avoid overflow... 
- ; - ; (Original) - ; tmp12 = -2.613125930 * z10 + z5; - ; - ; (This implementation) - ; tmp12 = (-1.613125930 - 1) * z10 + z5; - ; = -1.613125930 * z10 - z10 + z5; - - movdqa xmm6,xmm4 - paddw xmm4,xmm0 - pmulhw xmm4,[GOTOFF(ebx,PW_F1847)] ; xmm4=z5 - pmulhw xmm6,[GOTOFF(ebx,PW_MF1613)] - pmulhw xmm0,[GOTOFF(ebx,PW_F1082)] - psubw xmm6,xmm7 - psubw xmm0,xmm4 ; xmm0=tmp10 - paddw xmm6,xmm4 ; xmm6=tmp12 - - ; -- Final output stage - - psubw xmm6,xmm5 ; xmm6=tmp6 - movdqa xmm7,xmm1 - movdqa xmm4,xmm3 - paddw xmm1,xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70) - paddw xmm3,xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71) - psraw xmm1,(PASS1_BITS+3) ; descale - psraw xmm3,(PASS1_BITS+3) ; descale - psubw xmm7,xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77) - psubw xmm4,xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76) - psraw xmm7,(PASS1_BITS+3) ; descale - psraw xmm4,(PASS1_BITS+3) ; descale - psubw xmm2,xmm6 ; xmm2=tmp5 - - packsswb xmm1,xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) - packsswb xmm3,xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) - - movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2 - movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3 - - paddw xmm0,xmm2 ; xmm0=tmp4 - movdqa xmm4,xmm5 - movdqa xmm7,xmm6 - paddw xmm5,xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72) - paddw xmm6,xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74) - psraw xmm5,(PASS1_BITS+3) ; descale - psraw xmm6,(PASS1_BITS+3) ; descale - psubw xmm4,xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75) - psubw xmm7,xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73) - psraw xmm4,(PASS1_BITS+3) ; descale - psraw xmm7,(PASS1_BITS+3) ; descale - - movdqa xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP] - - packsswb xmm5,xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) - packsswb xmm7,xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) - - paddb xmm1,xmm2 - paddb xmm3,xmm2 - paddb xmm5,xmm2 - paddb xmm7,xmm2 - - movdqa xmm0,xmm1 ; transpose coefficients(phase 1) - punpcklbw xmm1,xmm3 ; xmm1=(00 
01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) - punpckhbw xmm0,xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) - movdqa xmm6,xmm5 ; transpose coefficients(phase 1) - punpcklbw xmm5,xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) - punpckhbw xmm6,xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) - - movdqa xmm4,xmm1 ; transpose coefficients(phase 2) - punpcklwd xmm1,xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) - punpckhwd xmm4,xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) - movdqa xmm2,xmm6 ; transpose coefficients(phase 2) - punpcklwd xmm6,xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) - punpckhwd xmm2,xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) - - movdqa xmm3,xmm1 ; transpose coefficients(phase 3) - punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) - punpckhdq xmm3,xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) - movdqa xmm7,xmm4 ; transpose coefficients(phase 3) - punpckldq xmm4,xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) - punpckhdq xmm7,xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) - - pshufd xmm5,xmm1,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) - pshufd xmm0,xmm3,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) - pshufd xmm6,xmm4,0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) - pshufd xmm2,xmm7,0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) - - mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW] - movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1 - movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 - mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW] - movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 - movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7 - - mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] - movq XMM_MMWORD 
[edx+eax*SIZEOF_JSAMPLE], xmm5 - movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0 - mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW] - movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 - movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2 - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; unused - poppic ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + ; -- Prefetch the next coefficient block + + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. + + mov eax, [original_ebp] + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + + ; -- Even part + + ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6 + + movdqa xmm2,xmm6 + movdqa xmm0,xmm5 + psubw xmm6,xmm1 ; xmm6=tmp11 + psubw xmm5,xmm3 + paddw xmm2,xmm1 ; xmm2=tmp10 + paddw xmm0,xmm3 ; xmm0=tmp13 + + psllw xmm5,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5,[GOTOFF(ebx,PW_F1414)] + psubw xmm5,xmm0 ; xmm5=tmp12 + + movdqa xmm1,xmm2 + movdqa xmm3,xmm6 + psubw xmm2,xmm0 ; xmm2=tmp3 + psubw xmm6,xmm5 ; xmm6=tmp2 + paddw xmm1,xmm0 ; xmm1=tmp0 + paddw xmm3,xmm5 ; xmm3=tmp1 + + movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3 + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3 + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2 + + ; -- Odd part + + ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7 + + movdqa xmm2,xmm0 + movdqa xmm6,xmm4 + psubw xmm0,xmm7 ; xmm0=z12 + psubw xmm4,xmm5 ; xmm4=z10 + paddw xmm2,xmm7 ; xmm2=z11 + paddw xmm6,xmm5 ; xmm6=z13 + + movdqa xmm7,xmm4 ; xmm7=z10(unscaled) + psllw xmm0,PRE_MULTIPLY_SCALE_BITS + psllw xmm4,PRE_MULTIPLY_SCALE_BITS + + movdqa xmm5,xmm2 + psubw xmm2,xmm6 + paddw xmm5,xmm6 ; xmm5=tmp7 + + psllw xmm2,PRE_MULTIPLY_SCALE_BITS + 
pmulhw xmm2,[GOTOFF(ebx,PW_F1414)] ; xmm2=tmp11 + + ; To avoid overflow... + ; + ; (Original) + ; tmp12 = -2.613125930 * z10 + z5; + ; + ; (This implementation) + ; tmp12 = (-1.613125930 - 1) * z10 + z5; + ; = -1.613125930 * z10 - z10 + z5; + + movdqa xmm6,xmm4 + paddw xmm4,xmm0 + pmulhw xmm4,[GOTOFF(ebx,PW_F1847)] ; xmm4=z5 + pmulhw xmm6,[GOTOFF(ebx,PW_MF1613)] + pmulhw xmm0,[GOTOFF(ebx,PW_F1082)] + psubw xmm6,xmm7 + psubw xmm0,xmm4 ; xmm0=tmp10 + paddw xmm6,xmm4 ; xmm6=tmp12 + + ; -- Final output stage + + psubw xmm6,xmm5 ; xmm6=tmp6 + movdqa xmm7,xmm1 + movdqa xmm4,xmm3 + paddw xmm1,xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70) + paddw xmm3,xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71) + psraw xmm1,(PASS1_BITS+3) ; descale + psraw xmm3,(PASS1_BITS+3) ; descale + psubw xmm7,xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77) + psubw xmm4,xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76) + psraw xmm7,(PASS1_BITS+3) ; descale + psraw xmm4,(PASS1_BITS+3) ; descale + psubw xmm2,xmm6 ; xmm2=tmp5 + + packsswb xmm1,xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + packsswb xmm3,xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2 + movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3 + + paddw xmm0,xmm2 ; xmm0=tmp4 + movdqa xmm4,xmm5 + movdqa xmm7,xmm6 + paddw xmm5,xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72) + paddw xmm6,xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74) + psraw xmm5,(PASS1_BITS+3) ; descale + psraw xmm6,(PASS1_BITS+3) ; descale + psubw xmm4,xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75) + psubw xmm7,xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73) + psraw xmm4,(PASS1_BITS+3) ; descale + psraw xmm7,(PASS1_BITS+3) ; descale + + movdqa xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP] + + packsswb xmm5,xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) + packsswb xmm7,xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) + + paddb xmm1,xmm2 + paddb xmm3,xmm2 + paddb xmm5,xmm2 + paddb xmm7,xmm2 + + movdqa 
xmm0,xmm1 ; transpose coefficients(phase 1) + punpcklbw xmm1,xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) + punpckhbw xmm0,xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) + movdqa xmm6,xmm5 ; transpose coefficients(phase 1) + punpcklbw xmm5,xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) + punpckhbw xmm6,xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) + + movdqa xmm4,xmm1 ; transpose coefficients(phase 2) + punpcklwd xmm1,xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) + punpckhwd xmm4,xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) + movdqa xmm2,xmm6 ; transpose coefficients(phase 2) + punpcklwd xmm6,xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + punpckhwd xmm2,xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) + + movdqa xmm3,xmm1 ; transpose coefficients(phase 3) + punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm3,xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + movdqa xmm7,xmm4 ; transpose coefficients(phase 3) + punpckldq xmm4,xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) + punpckhdq xmm7,xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) + + pshufd xmm5,xmm1,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + pshufd xmm0,xmm3,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + pshufd xmm6,xmm4,0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) + pshufd xmm2,xmm7,0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 + mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7 + + mov edx, JSAMPROW 
[edi+1*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0 + mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2 + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused + poppic ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/jiss2int-64.asm b/simd/jiss2int-64.asm index 13764d6ae..10e952136 100644 --- a/simd/jiss2int-64.asm +++ b/simd/jiss2int-64.asm @@ -27,67 +27,67 @@ ; -------------------------------------------------------------------------- -%define CONST_BITS 13 -%define PASS1_BITS 2 +%define CONST_BITS 13 +%define PASS1_BITS 2 -%define DESCALE_P1 (CONST_BITS-PASS1_BITS) -%define DESCALE_P2 (CONST_BITS+PASS1_BITS+3) +%define DESCALE_P1 (CONST_BITS-PASS1_BITS) +%define DESCALE_P2 (CONST_BITS+PASS1_BITS+3) %if CONST_BITS == 13 -F_0_298 equ 2446 ; FIX(0.298631336) -F_0_390 equ 3196 ; FIX(0.390180644) -F_0_541 equ 4433 ; FIX(0.541196100) -F_0_765 equ 6270 ; FIX(0.765366865) -F_0_899 equ 7373 ; FIX(0.899976223) -F_1_175 equ 9633 ; FIX(1.175875602) -F_1_501 equ 12299 ; FIX(1.501321110) -F_1_847 equ 15137 ; FIX(1.847759065) -F_1_961 equ 16069 ; FIX(1.961570560) -F_2_053 equ 16819 ; FIX(2.053119869) -F_2_562 equ 20995 ; FIX(2.562915447) -F_3_072 equ 25172 ; FIX(3.072711026) +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; 
FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) %else ; NASM cannot do compile-time arithmetic on floating-point constants. %define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) -F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) -F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) -F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) -F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) -F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) -F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) -F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) -F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) -F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) -F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) -F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) -F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) +F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) %endif ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_idct_islow_sse2) + alignz 16 + global 
EXTN(jconst_idct_islow_sse2) EXTN(jconst_idct_islow_sse2): -PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541 -PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847) -PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175 -PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390) -PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899 -PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899) -PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562 -PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562) -PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) -PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) -PB_CENTERJSAMP times 16 db CENTERJSAMPLE +PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541 +PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847) +PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175 +PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390) +PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899 +PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899) +PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562 +PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562) +PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) +PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) +PB_CENTERJSAMP times 16 db CENTERJSAMPLE - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 64 + SECTION SEG_TEXT + BITS 64 ; ; Perform dequantization and inverse DCT on one block of coefficients. 
; @@ -101,748 +101,748 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; r12 = JSAMPARRAY output_buf ; r13 = JDIMENSION output_col -%define original_rbp rbp+0 -%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 12 +%define original_rbp rbp+0 +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 12 - align 16 - global EXTN(jsimd_idct_islow_sse2) + align 16 + global EXTN(jsimd_idct_islow_sse2) EXTN(jsimd_idct_islow_sse2): - push rbp - mov rax,rsp ; rax = original rbp - sub rsp, byte 4 - and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp],rax - mov rbp,rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - collect_args + push rbp + mov rax,rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args - ; ---- Pass 1: process columns from input. + ; ---- Pass 1: process columns from input. - mov rdx, r10 ; quantptr - mov rsi, r11 ; inptr + mov rdx, r10 ; quantptr + mov rsi, r11 ; inptr %ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2 - mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] - jnz near .columnDCT - - movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] - por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] - por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] - por xmm1,xmm0 - packsswb xmm1,xmm1 - packsswb xmm1,xmm1 - movd eax,xmm1 - test rax,rax - jnz short .columnDCT - - ; -- AC terms all zero - - movdqa xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] - pmullw xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - - psllw xmm5,PASS1_BITS - - movdqa xmm4,xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07) - punpcklwd xmm5,xmm5 ; xmm5=(00 00 01 01 
02 02 03 03) - punpckhwd xmm4,xmm4 ; xmm4=(04 04 05 05 06 06 07 07) - - pshufd xmm7,xmm5,0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00) - pshufd xmm6,xmm5,0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01) - pshufd xmm1,xmm5,0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02) - pshufd xmm5,xmm5,0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03) - pshufd xmm0,xmm4,0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04) - pshufd xmm3,xmm4,0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05) - pshufd xmm2,xmm4,0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06) - pshufd xmm4,xmm4,0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07) - - movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1 - movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3 - movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 - movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 - jmp near .column_end + mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + por xmm1,xmm0 + packsswb xmm1,xmm1 + packsswb xmm1,xmm1 + movd eax,xmm1 + test rax,rax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + pmullw xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + psllw xmm5,PASS1_BITS + + movdqa xmm4,xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07) + punpcklwd xmm5,xmm5 ; xmm5=(00 00 01 01 02 02 03 03) + punpckhwd xmm4,xmm4 ; xmm4=(04 04 05 05 06 06 07 07) + + pshufd xmm7,xmm5,0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00) + pshufd xmm6,xmm5,0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01) + pshufd xmm1,xmm5,0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02) + pshufd xmm5,xmm5,0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03) + pshufd xmm0,xmm4,0x00 
; xmm0=col4=(04 04 04 04 04 04 04 04) + pshufd xmm3,xmm4,0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05) + pshufd xmm2,xmm4,0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06) + pshufd xmm4,xmm4,0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07) + + movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1 + movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3 + movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 + movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 + jmp near .column_end %endif .columnDCT: - ; -- Even part - - movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] - pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] - movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] - pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - - ; (Original) - ; z1 = (z2 + z3) * 0.541196100; - ; tmp2 = z1 + z3 * -1.847759065; - ; tmp3 = z1 + z2 * 0.765366865; - ; - ; (This implementation) - ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); - ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; - - movdqa xmm4,xmm1 ; xmm1=in2=z2 - movdqa xmm5,xmm1 - punpcklwd xmm4,xmm3 ; xmm3=in6=z3 - punpckhwd xmm5,xmm3 - movdqa xmm1,xmm4 - movdqa xmm3,xmm5 - pmaddwd xmm4,[rel PW_F130_F054] ; xmm4=tmp3L - pmaddwd xmm5,[rel PW_F130_F054] ; xmm5=tmp3H - pmaddwd xmm1,[rel PW_F054_MF130] ; xmm1=tmp2L - pmaddwd xmm3,[rel PW_F054_MF130] ; xmm3=tmp2H - - movdqa xmm6,xmm0 - paddw xmm0,xmm2 ; xmm0=in0+in4 - psubw xmm6,xmm2 ; xmm6=in0-in4 - - pxor xmm7,xmm7 - pxor xmm2,xmm2 - punpcklwd xmm7,xmm0 ; xmm7=tmp0L - punpckhwd xmm2,xmm0 ; xmm2=tmp0H - psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS - psrad xmm2,(16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS - - movdqa xmm0,xmm7 - paddd xmm7,xmm4 ; xmm7=tmp10L - psubd xmm0,xmm4 ; xmm0=tmp13L - movdqa 
xmm4,xmm2 - paddd xmm2,xmm5 ; xmm2=tmp10H - psubd xmm4,xmm5 ; xmm4=tmp13H - - movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L - movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H - movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L - movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H - - pxor xmm5,xmm5 - pxor xmm7,xmm7 - punpcklwd xmm5,xmm6 ; xmm5=tmp1L - punpckhwd xmm7,xmm6 ; xmm7=tmp1H - psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS - psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS - - movdqa xmm2,xmm5 - paddd xmm5,xmm1 ; xmm5=tmp11L - psubd xmm2,xmm1 ; xmm2=tmp12L - movdqa xmm0,xmm7 - paddd xmm7,xmm3 ; xmm7=tmp11H - psubd xmm0,xmm3 ; xmm0=tmp12H - - movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L - movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H - movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L - movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H - - ; -- Odd part - - movdqa xmm4, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] - movdqa xmm6, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] - pmullw xmm4, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm6, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - movdqa xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] - movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] - pmullw xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - - movdqa xmm5,xmm6 - movdqa xmm7,xmm4 - paddw xmm5,xmm3 ; xmm5=z3 - paddw xmm7,xmm1 ; xmm7=z4 - - ; (Original) - ; z5 = (z3 + z4) * 1.175875602; - ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; - ; z3 += z5; z4 += z5; - ; - ; (This implementation) - ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; - ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); - - movdqa xmm2,xmm5 - movdqa xmm0,xmm5 - punpcklwd xmm2,xmm7 - punpckhwd xmm0,xmm7 - movdqa xmm5,xmm2 - movdqa xmm7,xmm0 - pmaddwd xmm2,[rel PW_MF078_F117] ; xmm2=z3L - pmaddwd xmm0,[rel PW_MF078_F117] ; xmm0=z3H - pmaddwd xmm5,[rel PW_F117_F078] ; 
xmm5=z4L - pmaddwd xmm7,[rel PW_F117_F078] ; xmm7=z4H - - movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L - movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H - - ; (Original) - ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; - ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; - ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; - ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; - ; tmp0 += z1 + z3; tmp1 += z2 + z4; - ; tmp2 += z2 + z3; tmp3 += z1 + z4; - ; - ; (This implementation) - ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; - ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; - ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); - ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); - ; tmp0 += z3; tmp1 += z4; - ; tmp2 += z3; tmp3 += z4; - - movdqa xmm2,xmm3 - movdqa xmm0,xmm3 - punpcklwd xmm2,xmm4 - punpckhwd xmm0,xmm4 - movdqa xmm3,xmm2 - movdqa xmm4,xmm0 - pmaddwd xmm2,[rel PW_MF060_MF089] ; xmm2=tmp0L - pmaddwd xmm0,[rel PW_MF060_MF089] ; xmm0=tmp0H - pmaddwd xmm3,[rel PW_MF089_F060] ; xmm3=tmp3L - pmaddwd xmm4,[rel PW_MF089_F060] ; xmm4=tmp3H - - paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L - paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H - paddd xmm3,xmm5 ; xmm3=tmp3L - paddd xmm4,xmm7 ; xmm4=tmp3H - - movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L - movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H - - movdqa xmm2,xmm1 - movdqa xmm0,xmm1 - punpcklwd xmm2,xmm6 - punpckhwd xmm0,xmm6 - movdqa xmm1,xmm2 - movdqa xmm6,xmm0 - pmaddwd xmm2,[rel PW_MF050_MF256] ; xmm2=tmp1L - pmaddwd xmm0,[rel PW_MF050_MF256] ; xmm0=tmp1H - pmaddwd xmm1,[rel PW_MF256_F050] ; xmm1=tmp2L - pmaddwd xmm6,[rel PW_MF256_F050] ; xmm6=tmp2H - - paddd xmm2,xmm5 ; xmm2=tmp1L - paddd xmm0,xmm7 ; xmm0=tmp1H - paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L - paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H - - movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L - movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H - - ; -- Final output stage - - movdqa xmm5, XMMWORD [wk(0)] ; 
xmm5=tmp10L - movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H - - movdqa xmm2,xmm5 - movdqa xmm0,xmm7 - paddd xmm5,xmm3 ; xmm5=data0L - paddd xmm7,xmm4 ; xmm7=data0H - psubd xmm2,xmm3 ; xmm2=data7L - psubd xmm0,xmm4 ; xmm0=data7H - - movdqa xmm3,[rel PD_DESCALE_P1] ; xmm3=[rel PD_DESCALE_P1] - - paddd xmm5,xmm3 - paddd xmm7,xmm3 - psrad xmm5,DESCALE_P1 - psrad xmm7,DESCALE_P1 - paddd xmm2,xmm3 - paddd xmm0,xmm3 - psrad xmm2,DESCALE_P1 - psrad xmm0,DESCALE_P1 - - packssdw xmm5,xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07) - packssdw xmm2,xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77) - - movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L - movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H - - movdqa xmm7,xmm4 - movdqa xmm0,xmm3 - paddd xmm4,xmm1 ; xmm4=data1L - paddd xmm3,xmm6 ; xmm3=data1H - psubd xmm7,xmm1 ; xmm7=data6L - psubd xmm0,xmm6 ; xmm0=data6H - - movdqa xmm1,[rel PD_DESCALE_P1] ; xmm1=[rel PD_DESCALE_P1] - - paddd xmm4,xmm1 - paddd xmm3,xmm1 - psrad xmm4,DESCALE_P1 - psrad xmm3,DESCALE_P1 - paddd xmm7,xmm1 - paddd xmm0,xmm1 - psrad xmm7,DESCALE_P1 - psrad xmm0,DESCALE_P1 - - packssdw xmm4,xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17) - packssdw xmm7,xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67) - - movdqa xmm6,xmm5 ; transpose coefficients(phase 1) - punpcklwd xmm5,xmm4 ; xmm5=(00 10 01 11 02 12 03 13) - punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17) - movdqa xmm1,xmm7 ; transpose coefficients(phase 1) - punpcklwd xmm7,xmm2 ; xmm7=(60 70 61 71 62 72 63 73) - punpckhwd xmm1,xmm2 ; xmm1=(64 74 65 75 66 76 67 77) - - movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L - movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H - movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L - movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H - - movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13) - movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17) - movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73) - movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77) - - movdqa xmm5,xmm3 - movdqa 
xmm6,xmm0 - paddd xmm3,xmm4 ; xmm3=data2L - paddd xmm0,xmm2 ; xmm0=data2H - psubd xmm5,xmm4 ; xmm5=data5L - psubd xmm6,xmm2 ; xmm6=data5H - - movdqa xmm7,[rel PD_DESCALE_P1] ; xmm7=[rel PD_DESCALE_P1] - - paddd xmm3,xmm7 - paddd xmm0,xmm7 - psrad xmm3,DESCALE_P1 - psrad xmm0,DESCALE_P1 - paddd xmm5,xmm7 - paddd xmm6,xmm7 - psrad xmm5,DESCALE_P1 - psrad xmm6,DESCALE_P1 - - packssdw xmm3,xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27) - packssdw xmm5,xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57) - - movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L - movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H - movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L - movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H - - movdqa xmm0,xmm1 - movdqa xmm6,xmm4 - paddd xmm1,xmm2 ; xmm1=data3L - paddd xmm4,xmm7 ; xmm4=data3H - psubd xmm0,xmm2 ; xmm0=data4L - psubd xmm6,xmm7 ; xmm6=data4H - - movdqa xmm2,[rel PD_DESCALE_P1] ; xmm2=[rel PD_DESCALE_P1] - - paddd xmm1,xmm2 - paddd xmm4,xmm2 - psrad xmm1,DESCALE_P1 - psrad xmm4,DESCALE_P1 - paddd xmm0,xmm2 - paddd xmm6,xmm2 - psrad xmm0,DESCALE_P1 - psrad xmm6,DESCALE_P1 - - packssdw xmm1,xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37) - packssdw xmm0,xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47) - - movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13) - movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17) - - movdqa xmm4,xmm3 ; transpose coefficients(phase 1) - punpcklwd xmm3,xmm1 ; xmm3=(20 30 21 31 22 32 23 33) - punpckhwd xmm4,xmm1 ; xmm4=(24 34 25 35 26 36 27 37) - movdqa xmm6,xmm0 ; transpose coefficients(phase 1) - punpcklwd xmm0,xmm5 ; xmm0=(40 50 41 51 42 52 43 53) - punpckhwd xmm6,xmm5 ; xmm6=(44 54 45 55 46 56 47 57) - - movdqa xmm1,xmm7 ; transpose coefficients(phase 2) - punpckldq xmm7,xmm3 ; xmm7=(00 10 20 30 01 11 21 31) - punpckhdq xmm1,xmm3 ; xmm1=(02 12 22 32 03 13 23 33) - movdqa xmm5,xmm2 ; transpose coefficients(phase 2) - punpckldq xmm2,xmm4 ; xmm2=(04 14 24 34 05 15 25 35) - punpckhdq xmm5,xmm4 ; xmm5=(06 16 26 36 07 17 27 37) - - movdqa xmm3, 
XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73) - movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77) - - movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35) - movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37) - - movdqa xmm2,xmm0 ; transpose coefficients(phase 2) - punpckldq xmm0,xmm3 ; xmm0=(40 50 60 70 41 51 61 71) - punpckhdq xmm2,xmm3 ; xmm2=(42 52 62 72 43 53 63 73) - movdqa xmm5,xmm6 ; transpose coefficients(phase 2) - punpckldq xmm6,xmm4 ; xmm6=(44 54 64 74 45 55 65 75) - punpckhdq xmm5,xmm4 ; xmm5=(46 56 66 76 47 57 67 77) - - movdqa xmm3,xmm7 ; transpose coefficients(phase 3) - punpcklqdq xmm7,xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70) - punpckhqdq xmm3,xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71) - movdqa xmm4,xmm1 ; transpose coefficients(phase 3) - punpcklqdq xmm1,xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72) - punpckhqdq xmm4,xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73) - - movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35) - movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37) - - movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1 - movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3 - - movdqa xmm3,xmm0 ; transpose coefficients(phase 3) - punpcklqdq xmm0,xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74) - punpckhqdq xmm3,xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75) - movdqa xmm4,xmm2 ; transpose coefficients(phase 3) - punpcklqdq xmm2,xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76) - punpckhqdq xmm4,xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77) - - movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 - movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 + ; -- Even part + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD 
[XMMBLOCK(4,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + movdqa xmm4,xmm1 ; xmm1=in2=z2 + movdqa xmm5,xmm1 + punpcklwd xmm4,xmm3 ; xmm3=in6=z3 + punpckhwd xmm5,xmm3 + movdqa xmm1,xmm4 + movdqa xmm3,xmm5 + pmaddwd xmm4,[rel PW_F130_F054] ; xmm4=tmp3L + pmaddwd xmm5,[rel PW_F130_F054] ; xmm5=tmp3H + pmaddwd xmm1,[rel PW_F054_MF130] ; xmm1=tmp2L + pmaddwd xmm3,[rel PW_F054_MF130] ; xmm3=tmp2H + + movdqa xmm6,xmm0 + paddw xmm0,xmm2 ; xmm0=in0+in4 + psubw xmm6,xmm2 ; xmm6=in0-in4 + + pxor xmm7,xmm7 + pxor xmm2,xmm2 + punpcklwd xmm7,xmm0 ; xmm7=tmp0L + punpckhwd xmm2,xmm0 ; xmm2=tmp0H + psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS + psrad xmm2,(16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS + + movdqa xmm0,xmm7 + paddd xmm7,xmm4 ; xmm7=tmp10L + psubd xmm0,xmm4 ; xmm0=tmp13L + movdqa xmm4,xmm2 + paddd xmm2,xmm5 ; xmm2=tmp10H + psubd xmm4,xmm5 ; xmm4=tmp13H + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L + movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H + movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L + movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H + + pxor xmm5,xmm5 + pxor xmm7,xmm7 + punpcklwd xmm5,xmm6 ; xmm5=tmp1L + punpckhwd xmm7,xmm6 ; xmm7=tmp1H + psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS + psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS + + movdqa xmm2,xmm5 + paddd xmm5,xmm1 ; xmm5=tmp11L + psubd xmm2,xmm1 ; xmm2=tmp12L + movdqa xmm0,xmm7 + paddd xmm7,xmm3 ; xmm7=tmp11H + psubd xmm0,xmm3 ; xmm0=tmp12H + + movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L + movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H + movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L + movdqa XMMWORD [wk(7)], xmm0 ; 
wk(7)=tmp12H + + ; -- Odd part + + movdqa xmm4, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm6, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + pmullw xmm4, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm6, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + pmullw xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + movdqa xmm5,xmm6 + movdqa xmm7,xmm4 + paddw xmm5,xmm3 ; xmm5=z3 + paddw xmm7,xmm1 ; xmm7=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm2,xmm5 + movdqa xmm0,xmm5 + punpcklwd xmm2,xmm7 + punpckhwd xmm0,xmm7 + movdqa xmm5,xmm2 + movdqa xmm7,xmm0 + pmaddwd xmm2,[rel PW_MF078_F117] ; xmm2=z3L + pmaddwd xmm0,[rel PW_MF078_F117] ; xmm0=z3H + pmaddwd xmm5,[rel PW_F117_F078] ; xmm5=z4L + pmaddwd xmm7,[rel PW_F117_F078] ; xmm7=z4H + + movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L + movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + movdqa xmm2,xmm3 + movdqa xmm0,xmm3 + 
punpcklwd xmm2,xmm4 + punpckhwd xmm0,xmm4 + movdqa xmm3,xmm2 + movdqa xmm4,xmm0 + pmaddwd xmm2,[rel PW_MF060_MF089] ; xmm2=tmp0L + pmaddwd xmm0,[rel PW_MF060_MF089] ; xmm0=tmp0H + pmaddwd xmm3,[rel PW_MF089_F060] ; xmm3=tmp3L + pmaddwd xmm4,[rel PW_MF089_F060] ; xmm4=tmp3H + + paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L + paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H + paddd xmm3,xmm5 ; xmm3=tmp3L + paddd xmm4,xmm7 ; xmm4=tmp3H + + movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L + movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H + + movdqa xmm2,xmm1 + movdqa xmm0,xmm1 + punpcklwd xmm2,xmm6 + punpckhwd xmm0,xmm6 + movdqa xmm1,xmm2 + movdqa xmm6,xmm0 + pmaddwd xmm2,[rel PW_MF050_MF256] ; xmm2=tmp1L + pmaddwd xmm0,[rel PW_MF050_MF256] ; xmm0=tmp1H + pmaddwd xmm1,[rel PW_MF256_F050] ; xmm1=tmp2L + pmaddwd xmm6,[rel PW_MF256_F050] ; xmm6=tmp2H + + paddd xmm2,xmm5 ; xmm2=tmp1L + paddd xmm0,xmm7 ; xmm0=tmp1H + paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L + paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H + + movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L + movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H + + ; -- Final output stage + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L + movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H + + movdqa xmm2,xmm5 + movdqa xmm0,xmm7 + paddd xmm5,xmm3 ; xmm5=data0L + paddd xmm7,xmm4 ; xmm7=data0H + psubd xmm2,xmm3 ; xmm2=data7L + psubd xmm0,xmm4 ; xmm0=data7H + + movdqa xmm3,[rel PD_DESCALE_P1] ; xmm3=[rel PD_DESCALE_P1] + + paddd xmm5,xmm3 + paddd xmm7,xmm3 + psrad xmm5,DESCALE_P1 + psrad xmm7,DESCALE_P1 + paddd xmm2,xmm3 + paddd xmm0,xmm3 + psrad xmm2,DESCALE_P1 + psrad xmm0,DESCALE_P1 + + packssdw xmm5,xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07) + packssdw xmm2,xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77) + + movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L + movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H + + movdqa xmm7,xmm4 + movdqa xmm0,xmm3 + paddd xmm4,xmm1 ; xmm4=data1L + paddd xmm3,xmm6 ; xmm3=data1H + psubd xmm7,xmm1 ; xmm7=data6L + psubd xmm0,xmm6 ; xmm0=data6H + + movdqa 
xmm1,[rel PD_DESCALE_P1] ; xmm1=[rel PD_DESCALE_P1] + + paddd xmm4,xmm1 + paddd xmm3,xmm1 + psrad xmm4,DESCALE_P1 + psrad xmm3,DESCALE_P1 + paddd xmm7,xmm1 + paddd xmm0,xmm1 + psrad xmm7,DESCALE_P1 + psrad xmm0,DESCALE_P1 + + packssdw xmm4,xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17) + packssdw xmm7,xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67) + + movdqa xmm6,xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5,xmm4 ; xmm5=(00 10 01 11 02 12 03 13) + punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17) + movdqa xmm1,xmm7 ; transpose coefficients(phase 1) + punpcklwd xmm7,xmm2 ; xmm7=(60 70 61 71 62 72 63 73) + punpckhwd xmm1,xmm2 ; xmm1=(64 74 65 75 66 76 67 77) + + movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L + movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H + movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L + movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H + + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13) + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17) + movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73) + movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77) + + movdqa xmm5,xmm3 + movdqa xmm6,xmm0 + paddd xmm3,xmm4 ; xmm3=data2L + paddd xmm0,xmm2 ; xmm0=data2H + psubd xmm5,xmm4 ; xmm5=data5L + psubd xmm6,xmm2 ; xmm6=data5H + + movdqa xmm7,[rel PD_DESCALE_P1] ; xmm7=[rel PD_DESCALE_P1] + + paddd xmm3,xmm7 + paddd xmm0,xmm7 + psrad xmm3,DESCALE_P1 + psrad xmm0,DESCALE_P1 + paddd xmm5,xmm7 + paddd xmm6,xmm7 + psrad xmm5,DESCALE_P1 + psrad xmm6,DESCALE_P1 + + packssdw xmm3,xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27) + packssdw xmm5,xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57) + + movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L + movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H + movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L + movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H + + movdqa xmm0,xmm1 + movdqa xmm6,xmm4 + paddd xmm1,xmm2 ; xmm1=data3L + paddd xmm4,xmm7 ; xmm4=data3H + psubd xmm0,xmm2 ; xmm0=data4L + psubd xmm6,xmm7 ; xmm6=data4H + + movdqa 
xmm2,[rel PD_DESCALE_P1] ; xmm2=[rel PD_DESCALE_P1] + + paddd xmm1,xmm2 + paddd xmm4,xmm2 + psrad xmm1,DESCALE_P1 + psrad xmm4,DESCALE_P1 + paddd xmm0,xmm2 + paddd xmm6,xmm2 + psrad xmm0,DESCALE_P1 + psrad xmm6,DESCALE_P1 + + packssdw xmm1,xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37) + packssdw xmm0,xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13) + movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17) + + movdqa xmm4,xmm3 ; transpose coefficients(phase 1) + punpcklwd xmm3,xmm1 ; xmm3=(20 30 21 31 22 32 23 33) + punpckhwd xmm4,xmm1 ; xmm4=(24 34 25 35 26 36 27 37) + movdqa xmm6,xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0,xmm5 ; xmm0=(40 50 41 51 42 52 43 53) + punpckhwd xmm6,xmm5 ; xmm6=(44 54 45 55 46 56 47 57) + + movdqa xmm1,xmm7 ; transpose coefficients(phase 2) + punpckldq xmm7,xmm3 ; xmm7=(00 10 20 30 01 11 21 31) + punpckhdq xmm1,xmm3 ; xmm1=(02 12 22 32 03 13 23 33) + movdqa xmm5,xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2,xmm4 ; xmm2=(04 14 24 34 05 15 25 35) + punpckhdq xmm5,xmm4 ; xmm5=(06 16 26 36 07 17 27 37) + + movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73) + movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77) + + movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35) + movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37) + + movdqa xmm2,xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0,xmm3 ; xmm0=(40 50 60 70 41 51 61 71) + punpckhdq xmm2,xmm3 ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm4 ; xmm6=(44 54 64 74 45 55 65 75) + punpckhdq xmm5,xmm4 ; xmm5=(46 56 66 76 47 57 67 77) + + movdqa xmm3,xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7,xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70) + punpckhqdq xmm3,xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71) + movdqa xmm4,xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1,xmm2 ; xmm1=col2=(02 12 22 32 
42 52 62 72) + punpckhqdq xmm4,xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73) + + movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35) + movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1 + movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3 + + movdqa xmm3,xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0,xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74) + punpckhqdq xmm3,xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75) + movdqa xmm4,xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2,xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76) + punpckhqdq xmm4,xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 + movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 .column_end: - ; -- Prefetch the next coefficient block - - prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] - prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] - prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] - prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] - - ; ---- Pass 2: process rows from work array, store into output array. 
- - mov rax, [original_rbp] - mov rdi, r12 ; (JSAMPROW *) - mov rax, r13 - - ; -- Even part - - ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6 - - ; (Original) - ; z1 = (z2 + z3) * 0.541196100; - ; tmp2 = z1 + z3 * -1.847759065; - ; tmp3 = z1 + z2 * 0.765366865; - ; - ; (This implementation) - ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); - ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; - - movdqa xmm6,xmm1 ; xmm1=in2=z2 - movdqa xmm5,xmm1 - punpcklwd xmm6,xmm2 ; xmm2=in6=z3 - punpckhwd xmm5,xmm2 - movdqa xmm1,xmm6 - movdqa xmm2,xmm5 - pmaddwd xmm6,[rel PW_F130_F054] ; xmm6=tmp3L - pmaddwd xmm5,[rel PW_F130_F054] ; xmm5=tmp3H - pmaddwd xmm1,[rel PW_F054_MF130] ; xmm1=tmp2L - pmaddwd xmm2,[rel PW_F054_MF130] ; xmm2=tmp2H - - movdqa xmm3,xmm7 - paddw xmm7,xmm0 ; xmm7=in0+in4 - psubw xmm3,xmm0 ; xmm3=in0-in4 - - pxor xmm4,xmm4 - pxor xmm0,xmm0 - punpcklwd xmm4,xmm7 ; xmm4=tmp0L - punpckhwd xmm0,xmm7 ; xmm0=tmp0H - psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS - psrad xmm0,(16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS - - movdqa xmm7,xmm4 - paddd xmm4,xmm6 ; xmm4=tmp10L - psubd xmm7,xmm6 ; xmm7=tmp13L - movdqa xmm6,xmm0 - paddd xmm0,xmm5 ; xmm0=tmp10H - psubd xmm6,xmm5 ; xmm6=tmp13H - - movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L - movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H - movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L - movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H - - pxor xmm5,xmm5 - pxor xmm4,xmm4 - punpcklwd xmm5,xmm3 ; xmm5=tmp1L - punpckhwd xmm4,xmm3 ; xmm4=tmp1H - psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS - psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS - - movdqa xmm0,xmm5 - paddd xmm5,xmm1 ; xmm5=tmp11L - psubd xmm0,xmm1 ; xmm0=tmp12L - movdqa xmm7,xmm4 - paddd xmm4,xmm2 ; xmm4=tmp11H - psubd xmm7,xmm2 ; xmm7=tmp12H - - movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L - movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H - movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L - 
movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H - - ; -- Odd part - - movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3 - movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1 - movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7 - movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5 - - movdqa xmm5,xmm6 - movdqa xmm4,xmm3 - paddw xmm5,xmm1 ; xmm5=z3 - paddw xmm4,xmm2 ; xmm4=z4 - - ; (Original) - ; z5 = (z3 + z4) * 1.175875602; - ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; - ; z3 += z5; z4 += z5; - ; - ; (This implementation) - ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; - ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); - - movdqa xmm0,xmm5 - movdqa xmm7,xmm5 - punpcklwd xmm0,xmm4 - punpckhwd xmm7,xmm4 - movdqa xmm5,xmm0 - movdqa xmm4,xmm7 - pmaddwd xmm0,[rel PW_MF078_F117] ; xmm0=z3L - pmaddwd xmm7,[rel PW_MF078_F117] ; xmm7=z3H - pmaddwd xmm5,[rel PW_F117_F078] ; xmm5=z4L - pmaddwd xmm4,[rel PW_F117_F078] ; xmm4=z4H - - movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L - movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H - - ; (Original) - ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; - ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; - ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; - ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; - ; tmp0 += z1 + z3; tmp1 += z2 + z4; - ; tmp2 += z2 + z3; tmp3 += z1 + z4; - ; - ; (This implementation) - ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; - ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; - ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); - ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); - ; tmp0 += z3; tmp1 += z4; - ; tmp2 += z3; tmp3 += z4; - - movdqa xmm0,xmm1 - movdqa xmm7,xmm1 - punpcklwd xmm0,xmm3 - punpckhwd xmm7,xmm3 - movdqa xmm1,xmm0 - movdqa xmm3,xmm7 - pmaddwd xmm0,[rel PW_MF060_MF089] ; xmm0=tmp0L - pmaddwd xmm7,[rel PW_MF060_MF089] ; xmm7=tmp0H - pmaddwd xmm1,[rel PW_MF089_F060] ; xmm1=tmp3L - pmaddwd xmm3,[rel PW_MF089_F060] ; xmm3=tmp3H - - paddd 
xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L - paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H - paddd xmm1,xmm5 ; xmm1=tmp3L - paddd xmm3,xmm4 ; xmm3=tmp3H - - movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L - movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H - - movdqa xmm0,xmm2 - movdqa xmm7,xmm2 - punpcklwd xmm0,xmm6 - punpckhwd xmm7,xmm6 - movdqa xmm2,xmm0 - movdqa xmm6,xmm7 - pmaddwd xmm0,[rel PW_MF050_MF256] ; xmm0=tmp1L - pmaddwd xmm7,[rel PW_MF050_MF256] ; xmm7=tmp1H - pmaddwd xmm2,[rel PW_MF256_F050] ; xmm2=tmp2L - pmaddwd xmm6,[rel PW_MF256_F050] ; xmm6=tmp2H - - paddd xmm0,xmm5 ; xmm0=tmp1L - paddd xmm7,xmm4 ; xmm7=tmp1H - paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L - paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H - - movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L - movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H - - ; -- Final output stage - - movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L - movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H - - movdqa xmm0,xmm5 - movdqa xmm7,xmm4 - paddd xmm5,xmm1 ; xmm5=data0L - paddd xmm4,xmm3 ; xmm4=data0H - psubd xmm0,xmm1 ; xmm0=data7L - psubd xmm7,xmm3 ; xmm7=data7H - - movdqa xmm1,[rel PD_DESCALE_P2] ; xmm1=[rel PD_DESCALE_P2] - - paddd xmm5,xmm1 - paddd xmm4,xmm1 - psrad xmm5,DESCALE_P2 - psrad xmm4,DESCALE_P2 - paddd xmm0,xmm1 - paddd xmm7,xmm1 - psrad xmm0,DESCALE_P2 - psrad xmm7,DESCALE_P2 - - packssdw xmm5,xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70) - packssdw xmm0,xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77) - - movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L - movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H - - movdqa xmm4,xmm3 - movdqa xmm7,xmm1 - paddd xmm3,xmm2 ; xmm3=data1L - paddd xmm1,xmm6 ; xmm1=data1H - psubd xmm4,xmm2 ; xmm4=data6L - psubd xmm7,xmm6 ; xmm7=data6H - - movdqa xmm2,[rel PD_DESCALE_P2] ; xmm2=[rel PD_DESCALE_P2] - - paddd xmm3,xmm2 - paddd xmm1,xmm2 - psrad xmm3,DESCALE_P2 - psrad xmm1,DESCALE_P2 - paddd xmm4,xmm2 - paddd xmm7,xmm2 - psrad xmm4,DESCALE_P2 - psrad xmm7,DESCALE_P2 - - packssdw xmm3,xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71) - 
packssdw xmm4,xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76) - - packsswb xmm5,xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) - packsswb xmm3,xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) - - movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L - movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H - movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L - movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H - - movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) - movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) - - movdqa xmm4,xmm6 - movdqa xmm0,xmm2 - paddd xmm6,xmm1 ; xmm6=data2L - paddd xmm2,xmm7 ; xmm2=data2H - psubd xmm4,xmm1 ; xmm4=data5L - psubd xmm0,xmm7 ; xmm0=data5H - - movdqa xmm5,[rel PD_DESCALE_P2] ; xmm5=[rel PD_DESCALE_P2] - - paddd xmm6,xmm5 - paddd xmm2,xmm5 - psrad xmm6,DESCALE_P2 - psrad xmm2,DESCALE_P2 - paddd xmm4,xmm5 - paddd xmm0,xmm5 - psrad xmm4,DESCALE_P2 - psrad xmm0,DESCALE_P2 - - packssdw xmm6,xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72) - packssdw xmm4,xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75) - - movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L - movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H - movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L - movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H - - movdqa xmm2,xmm3 - movdqa xmm0,xmm1 - paddd xmm3,xmm7 ; xmm3=data3L - paddd xmm1,xmm5 ; xmm1=data3H - psubd xmm2,xmm7 ; xmm2=data4L - psubd xmm0,xmm5 ; xmm0=data4H - - movdqa xmm7,[rel PD_DESCALE_P2] ; xmm7=[rel PD_DESCALE_P2] - - paddd xmm3,xmm7 - paddd xmm1,xmm7 - psrad xmm3,DESCALE_P2 - psrad xmm1,DESCALE_P2 - paddd xmm2,xmm7 - paddd xmm0,xmm7 - psrad xmm2,DESCALE_P2 - psrad xmm0,DESCALE_P2 - - movdqa xmm5,[rel PB_CENTERJSAMP] ; xmm5=[rel PB_CENTERJSAMP] - - packssdw xmm3,xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73) - packssdw xmm2,xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74) - - movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) - movdqa xmm1, XMMWORD [wk(1)] ; 
xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) - - packsswb xmm6,xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) - packsswb xmm3,xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) - - paddb xmm7,xmm5 - paddb xmm1,xmm5 - paddb xmm6,xmm5 - paddb xmm3,xmm5 - - movdqa xmm0,xmm7 ; transpose coefficients(phase 1) - punpcklbw xmm7,xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) - punpckhbw xmm0,xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) - movdqa xmm2,xmm6 ; transpose coefficients(phase 1) - punpcklbw xmm6,xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) - punpckhbw xmm2,xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) - - movdqa xmm4,xmm7 ; transpose coefficients(phase 2) - punpcklwd xmm7,xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) - punpckhwd xmm4,xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) - movdqa xmm5,xmm2 ; transpose coefficients(phase 2) - punpcklwd xmm2,xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) - punpckhwd xmm5,xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) - - movdqa xmm1,xmm7 ; transpose coefficients(phase 3) - punpckldq xmm7,xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) - punpckhdq xmm1,xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) - movdqa xmm3,xmm4 ; transpose coefficients(phase 3) - punpckldq xmm4,xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) - punpckhdq xmm3,xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) - - pshufd xmm6,xmm7,0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) - pshufd xmm0,xmm1,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) - pshufd xmm2,xmm4,0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) - pshufd xmm5,xmm3,0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) - - mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] - mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] - movq XMM_MMWORD 
[rdx+rax*SIZEOF_JSAMPLE], xmm7 - movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1 - mov rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW] - mov rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW] - movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4 - movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 - - mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] - mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] - movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 - movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0 - mov rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW] - mov rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW] - movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2 - movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5 - - uncollect_args - mov rsp,rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp - pop rbp - ret + ; -- Prefetch the next coefficient block + + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. 
+ + mov rax, [original_rbp] + mov rdi, r12 ; (JSAMPROW *) + mov rax, r13 + + ; -- Even part + + ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6 + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + movdqa xmm6,xmm1 ; xmm1=in2=z2 + movdqa xmm5,xmm1 + punpcklwd xmm6,xmm2 ; xmm2=in6=z3 + punpckhwd xmm5,xmm2 + movdqa xmm1,xmm6 + movdqa xmm2,xmm5 + pmaddwd xmm6,[rel PW_F130_F054] ; xmm6=tmp3L + pmaddwd xmm5,[rel PW_F130_F054] ; xmm5=tmp3H + pmaddwd xmm1,[rel PW_F054_MF130] ; xmm1=tmp2L + pmaddwd xmm2,[rel PW_F054_MF130] ; xmm2=tmp2H + + movdqa xmm3,xmm7 + paddw xmm7,xmm0 ; xmm7=in0+in4 + psubw xmm3,xmm0 ; xmm3=in0-in4 + + pxor xmm4,xmm4 + pxor xmm0,xmm0 + punpcklwd xmm4,xmm7 ; xmm4=tmp0L + punpckhwd xmm0,xmm7 ; xmm0=tmp0H + psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS + psrad xmm0,(16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS + + movdqa xmm7,xmm4 + paddd xmm4,xmm6 ; xmm4=tmp10L + psubd xmm7,xmm6 ; xmm7=tmp13L + movdqa xmm6,xmm0 + paddd xmm0,xmm5 ; xmm0=tmp10H + psubd xmm6,xmm5 ; xmm6=tmp13H + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H + movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L + movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H + + pxor xmm5,xmm5 + pxor xmm4,xmm4 + punpcklwd xmm5,xmm3 ; xmm5=tmp1L + punpckhwd xmm4,xmm3 ; xmm4=tmp1H + psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS + psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS + + movdqa xmm0,xmm5 + paddd xmm5,xmm1 ; xmm5=tmp11L + psubd xmm0,xmm1 ; xmm0=tmp12L + movdqa xmm7,xmm4 + paddd xmm4,xmm2 ; xmm4=tmp11H + psubd xmm7,xmm2 ; xmm7=tmp12H + + movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L + movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H + movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L + 
movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H + + ; -- Odd part + + movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3 + movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1 + movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7 + movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5 + + movdqa xmm5,xmm6 + movdqa xmm4,xmm3 + paddw xmm5,xmm1 ; xmm5=z3 + paddw xmm4,xmm2 ; xmm4=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm0,xmm5 + movdqa xmm7,xmm5 + punpcklwd xmm0,xmm4 + punpckhwd xmm7,xmm4 + movdqa xmm5,xmm0 + movdqa xmm4,xmm7 + pmaddwd xmm0,[rel PW_MF078_F117] ; xmm0=z3L + pmaddwd xmm7,[rel PW_MF078_F117] ; xmm7=z3H + pmaddwd xmm5,[rel PW_F117_F078] ; xmm5=z4L + pmaddwd xmm4,[rel PW_F117_F078] ; xmm4=z4H + + movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L + movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + movdqa xmm0,xmm1 + movdqa xmm7,xmm1 + punpcklwd xmm0,xmm3 + punpckhwd xmm7,xmm3 + movdqa xmm1,xmm0 + movdqa xmm3,xmm7 + pmaddwd xmm0,[rel PW_MF060_MF089] ; xmm0=tmp0L + pmaddwd xmm7,[rel PW_MF060_MF089] ; xmm7=tmp0H + pmaddwd xmm1,[rel PW_MF089_F060] ; xmm1=tmp3L + pmaddwd xmm3,[rel PW_MF089_F060] ; xmm3=tmp3H + + paddd 
xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L + paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H + paddd xmm1,xmm5 ; xmm1=tmp3L + paddd xmm3,xmm4 ; xmm3=tmp3H + + movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L + movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H + + movdqa xmm0,xmm2 + movdqa xmm7,xmm2 + punpcklwd xmm0,xmm6 + punpckhwd xmm7,xmm6 + movdqa xmm2,xmm0 + movdqa xmm6,xmm7 + pmaddwd xmm0,[rel PW_MF050_MF256] ; xmm0=tmp1L + pmaddwd xmm7,[rel PW_MF050_MF256] ; xmm7=tmp1H + pmaddwd xmm2,[rel PW_MF256_F050] ; xmm2=tmp2L + pmaddwd xmm6,[rel PW_MF256_F050] ; xmm6=tmp2H + + paddd xmm0,xmm5 ; xmm0=tmp1L + paddd xmm7,xmm4 ; xmm7=tmp1H + paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L + paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H + + movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L + movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H + + ; -- Final output stage + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L + movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H + + movdqa xmm0,xmm5 + movdqa xmm7,xmm4 + paddd xmm5,xmm1 ; xmm5=data0L + paddd xmm4,xmm3 ; xmm4=data0H + psubd xmm0,xmm1 ; xmm0=data7L + psubd xmm7,xmm3 ; xmm7=data7H + + movdqa xmm1,[rel PD_DESCALE_P2] ; xmm1=[rel PD_DESCALE_P2] + + paddd xmm5,xmm1 + paddd xmm4,xmm1 + psrad xmm5,DESCALE_P2 + psrad xmm4,DESCALE_P2 + paddd xmm0,xmm1 + paddd xmm7,xmm1 + psrad xmm0,DESCALE_P2 + psrad xmm7,DESCALE_P2 + + packssdw xmm5,xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70) + packssdw xmm0,xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77) + + movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L + movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H + + movdqa xmm4,xmm3 + movdqa xmm7,xmm1 + paddd xmm3,xmm2 ; xmm3=data1L + paddd xmm1,xmm6 ; xmm1=data1H + psubd xmm4,xmm2 ; xmm4=data6L + psubd xmm7,xmm6 ; xmm7=data6H + + movdqa xmm2,[rel PD_DESCALE_P2] ; xmm2=[rel PD_DESCALE_P2] + + paddd xmm3,xmm2 + paddd xmm1,xmm2 + psrad xmm3,DESCALE_P2 + psrad xmm1,DESCALE_P2 + paddd xmm4,xmm2 + paddd xmm7,xmm2 + psrad xmm4,DESCALE_P2 + psrad xmm7,DESCALE_P2 + + packssdw xmm3,xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71) + 
packssdw xmm4,xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76) + + packsswb xmm5,xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + packsswb xmm3,xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L + movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H + movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L + movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H + + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm4,xmm6 + movdqa xmm0,xmm2 + paddd xmm6,xmm1 ; xmm6=data2L + paddd xmm2,xmm7 ; xmm2=data2H + psubd xmm4,xmm1 ; xmm4=data5L + psubd xmm0,xmm7 ; xmm0=data5H + + movdqa xmm5,[rel PD_DESCALE_P2] ; xmm5=[rel PD_DESCALE_P2] + + paddd xmm6,xmm5 + paddd xmm2,xmm5 + psrad xmm6,DESCALE_P2 + psrad xmm2,DESCALE_P2 + paddd xmm4,xmm5 + paddd xmm0,xmm5 + psrad xmm4,DESCALE_P2 + psrad xmm0,DESCALE_P2 + + packssdw xmm6,xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72) + packssdw xmm4,xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75) + + movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L + movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H + movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L + movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H + + movdqa xmm2,xmm3 + movdqa xmm0,xmm1 + paddd xmm3,xmm7 ; xmm3=data3L + paddd xmm1,xmm5 ; xmm1=data3H + psubd xmm2,xmm7 ; xmm2=data4L + psubd xmm0,xmm5 ; xmm0=data4H + + movdqa xmm7,[rel PD_DESCALE_P2] ; xmm7=[rel PD_DESCALE_P2] + + paddd xmm3,xmm7 + paddd xmm1,xmm7 + psrad xmm3,DESCALE_P2 + psrad xmm1,DESCALE_P2 + paddd xmm2,xmm7 + paddd xmm0,xmm7 + psrad xmm2,DESCALE_P2 + psrad xmm0,DESCALE_P2 + + movdqa xmm5,[rel PB_CENTERJSAMP] ; xmm5=[rel PB_CENTERJSAMP] + + packssdw xmm3,xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73) + packssdw xmm2,xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + movdqa xmm1, XMMWORD [wk(1)] ; 
xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + packsswb xmm6,xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) + packsswb xmm3,xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) + + paddb xmm7,xmm5 + paddb xmm1,xmm5 + paddb xmm6,xmm5 + paddb xmm3,xmm5 + + movdqa xmm0,xmm7 ; transpose coefficients(phase 1) + punpcklbw xmm7,xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) + punpckhbw xmm0,xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) + movdqa xmm2,xmm6 ; transpose coefficients(phase 1) + punpcklbw xmm6,xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) + punpckhbw xmm2,xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) + + movdqa xmm4,xmm7 ; transpose coefficients(phase 2) + punpcklwd xmm7,xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) + punpckhwd xmm4,xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) + movdqa xmm5,xmm2 ; transpose coefficients(phase 2) + punpcklwd xmm2,xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + punpckhwd xmm5,xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) + + movdqa xmm1,xmm7 ; transpose coefficients(phase 3) + punpckldq xmm7,xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm1,xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + movdqa xmm3,xmm4 ; transpose coefficients(phase 3) + punpckldq xmm4,xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) + punpckhdq xmm3,xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) + + pshufd xmm6,xmm7,0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + pshufd xmm0,xmm1,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + pshufd xmm2,xmm4,0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) + pshufd xmm5,xmm3,0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) + + mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] + movq XMM_MMWORD 
[rdx+rax*SIZEOF_JSAMPLE], xmm7 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1 + mov rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 + + mov rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0 + mov rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW] + movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2 + movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5 + + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/jiss2int.asm b/simd/jiss2int.asm index adf39fb3a..e78f5ffaa 100644 --- a/simd/jiss2int.asm +++ b/simd/jiss2int.asm @@ -26,67 +26,67 @@ ; -------------------------------------------------------------------------- -%define CONST_BITS 13 -%define PASS1_BITS 2 +%define CONST_BITS 13 +%define PASS1_BITS 2 -%define DESCALE_P1 (CONST_BITS-PASS1_BITS) -%define DESCALE_P2 (CONST_BITS+PASS1_BITS+3) +%define DESCALE_P1 (CONST_BITS-PASS1_BITS) +%define DESCALE_P2 (CONST_BITS+PASS1_BITS+3) %if CONST_BITS == 13 -F_0_298 equ 2446 ; FIX(0.298631336) -F_0_390 equ 3196 ; FIX(0.390180644) -F_0_541 equ 4433 ; FIX(0.541196100) -F_0_765 equ 6270 ; FIX(0.765366865) -F_0_899 equ 7373 ; FIX(0.899976223) -F_1_175 equ 9633 ; FIX(1.175875602) -F_1_501 equ 12299 ; FIX(1.501321110) -F_1_847 equ 15137 ; FIX(1.847759065) -F_1_961 equ 16069 ; FIX(1.961570560) -F_2_053 equ 16819 ; FIX(2.053119869) -F_2_562 equ 20995 ; FIX(2.562915447) -F_3_072 equ 25172 ; FIX(3.072711026) +F_0_298 equ 2446 ; FIX(0.298631336) +F_0_390 equ 3196 ; FIX(0.390180644) +F_0_541 equ 4433 ; FIX(0.541196100) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_899 equ 
7373 ; FIX(0.899976223) +F_1_175 equ 9633 ; FIX(1.175875602) +F_1_501 equ 12299 ; FIX(1.501321110) +F_1_847 equ 15137 ; FIX(1.847759065) +F_1_961 equ 16069 ; FIX(1.961570560) +F_2_053 equ 16819 ; FIX(2.053119869) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_072 equ 25172 ; FIX(3.072711026) %else ; NASM cannot do compile-time arithmetic on floating-point constants. %define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) -F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) -F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) -F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) -F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) -F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) -F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) -F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) -F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) -F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) -F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) -F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) -F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) +F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) +F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) +F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) +F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) +F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) +F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) +F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) +F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) +F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) +F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) +F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) +F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) 
%endif ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_idct_islow_sse2) + alignz 16 + global EXTN(jconst_idct_islow_sse2) EXTN(jconst_idct_islow_sse2): -PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541 -PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847) -PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175 -PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390) -PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899 -PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899) -PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562 -PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562) -PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) -PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) -PB_CENTERJSAMP times 16 db CENTERJSAMPLE +PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541 +PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847) +PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175 +PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390) +PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899 +PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899) +PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562 +PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562) +PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) +PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) +PB_CENTERJSAMP times 16 db CENTERJSAMPLE - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 ; ; Perform dequantization and inverse DCT on one block of coefficients. 
; @@ -95,765 +95,765 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; JSAMPARRAY output_buf, JDIMENSION output_col) ; -%define dct_table(b) (b)+8 ; jpeg_component_info * compptr -%define coef_block(b) (b)+12 ; JCOEFPTR coef_block -%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf -%define output_col(b) (b)+20 ; JDIMENSION output_col +%define dct_table(b) (b)+8 ; jpeg_component_info * compptr +%define coef_block(b) (b)+12 ; JCOEFPTR coef_block +%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf +%define output_col(b) (b)+20 ; JDIMENSION output_col -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 12 +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 12 - align 16 - global EXTN(jsimd_idct_islow_sse2) + align 16 + global EXTN(jsimd_idct_islow_sse2) EXTN(jsimd_idct_islow_sse2): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic ebx -; push ecx ; unused -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process columns from input. - -; mov eax, [original_ebp] - mov edx, POINTER [dct_table(eax)] ; quantptr - mov esi, JCOEFPTR [coef_block(eax)] ; inptr + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input. 
+ +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr %ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2 - mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] - jnz near .columnDCT - - movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] - por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] - por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] - por xmm1,xmm0 - packsswb xmm1,xmm1 - packsswb xmm1,xmm1 - movd eax,xmm1 - test eax,eax - jnz short .columnDCT - - ; -- AC terms all zero - - movdqa xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] - pmullw xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - - psllw xmm5,PASS1_BITS - - movdqa xmm4,xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07) - punpcklwd xmm5,xmm5 ; xmm5=(00 00 01 01 02 02 03 03) - punpckhwd xmm4,xmm4 ; xmm4=(04 04 05 05 06 06 07 07) - - pshufd xmm7,xmm5,0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00) - pshufd xmm6,xmm5,0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01) - pshufd xmm1,xmm5,0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02) - pshufd xmm5,xmm5,0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03) - pshufd xmm0,xmm4,0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04) - pshufd xmm3,xmm4,0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05) - pshufd xmm2,xmm4,0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06) - pshufd xmm4,xmm4,0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07) - - movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1 - movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3 - movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 - movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 - jmp near .column_end - alignx 16,7 + mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movdqa xmm0, XMMWORD 
[XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por xmm1,xmm0 + packsswb xmm1,xmm1 + packsswb xmm1,xmm1 + movd eax,xmm1 + test eax,eax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + psllw xmm5,PASS1_BITS + + movdqa xmm4,xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07) + punpcklwd xmm5,xmm5 ; xmm5=(00 00 01 01 02 02 03 03) + punpckhwd xmm4,xmm4 ; xmm4=(04 04 05 05 06 06 07 07) + + pshufd xmm7,xmm5,0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00) + pshufd xmm6,xmm5,0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01) + pshufd xmm1,xmm5,0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02) + pshufd xmm5,xmm5,0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03) + pshufd xmm0,xmm4,0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04) + pshufd xmm3,xmm4,0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05) + pshufd xmm2,xmm4,0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06) + pshufd xmm4,xmm4,0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07) + + movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1 + movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3 + movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 + movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 + jmp near .column_end + alignx 16,7 %endif .columnDCT: - ; -- Even part - - movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] - pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] - movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] - pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw 
xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - - ; (Original) - ; z1 = (z2 + z3) * 0.541196100; - ; tmp2 = z1 + z3 * -1.847759065; - ; tmp3 = z1 + z2 * 0.765366865; - ; - ; (This implementation) - ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); - ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; - - movdqa xmm4,xmm1 ; xmm1=in2=z2 - movdqa xmm5,xmm1 - punpcklwd xmm4,xmm3 ; xmm3=in6=z3 - punpckhwd xmm5,xmm3 - movdqa xmm1,xmm4 - movdqa xmm3,xmm5 - pmaddwd xmm4,[GOTOFF(ebx,PW_F130_F054)] ; xmm4=tmp3L - pmaddwd xmm5,[GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H - pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L - pmaddwd xmm3,[GOTOFF(ebx,PW_F054_MF130)] ; xmm3=tmp2H - - movdqa xmm6,xmm0 - paddw xmm0,xmm2 ; xmm0=in0+in4 - psubw xmm6,xmm2 ; xmm6=in0-in4 - - pxor xmm7,xmm7 - pxor xmm2,xmm2 - punpcklwd xmm7,xmm0 ; xmm7=tmp0L - punpckhwd xmm2,xmm0 ; xmm2=tmp0H - psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS - psrad xmm2,(16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS - - movdqa xmm0,xmm7 - paddd xmm7,xmm4 ; xmm7=tmp10L - psubd xmm0,xmm4 ; xmm0=tmp13L - movdqa xmm4,xmm2 - paddd xmm2,xmm5 ; xmm2=tmp10H - psubd xmm4,xmm5 ; xmm4=tmp13H - - movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L - movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H - movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L - movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H - - pxor xmm5,xmm5 - pxor xmm7,xmm7 - punpcklwd xmm5,xmm6 ; xmm5=tmp1L - punpckhwd xmm7,xmm6 ; xmm7=tmp1H - psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS - psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS - - movdqa xmm2,xmm5 - paddd xmm5,xmm1 ; xmm5=tmp11L - psubd xmm2,xmm1 ; xmm2=tmp12L - movdqa xmm0,xmm7 - paddd xmm7,xmm3 ; xmm7=tmp11H - psubd xmm0,xmm3 ; xmm0=tmp12H - - movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L - movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H - movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L - movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H - - ; -- 
Odd part - - movdqa xmm4, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movdqa xmm6, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] - pmullw xmm4, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm6, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - movdqa xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] - movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] - pmullw xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - - movdqa xmm5,xmm6 - movdqa xmm7,xmm4 - paddw xmm5,xmm3 ; xmm5=z3 - paddw xmm7,xmm1 ; xmm7=z4 - - ; (Original) - ; z5 = (z3 + z4) * 1.175875602; - ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; - ; z3 += z5; z4 += z5; - ; - ; (This implementation) - ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; - ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); - - movdqa xmm2,xmm5 - movdqa xmm0,xmm5 - punpcklwd xmm2,xmm7 - punpckhwd xmm0,xmm7 - movdqa xmm5,xmm2 - movdqa xmm7,xmm0 - pmaddwd xmm2,[GOTOFF(ebx,PW_MF078_F117)] ; xmm2=z3L - pmaddwd xmm0,[GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3H - pmaddwd xmm5,[GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L - pmaddwd xmm7,[GOTOFF(ebx,PW_F117_F078)] ; xmm7=z4H - - movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L - movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H - - ; (Original) - ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; - ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; - ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; - ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; - ; tmp0 += z1 + z3; tmp1 += z2 + z4; - ; tmp2 += z2 + z3; tmp3 += z1 + z4; - ; - ; (This implementation) - ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; - ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; - ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); - ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); - ; tmp0 += z3; tmp1 += z4; - ; tmp2 += z3; tmp3 += z4; - - movdqa xmm2,xmm3 - movdqa 
xmm0,xmm3 - punpcklwd xmm2,xmm4 - punpckhwd xmm0,xmm4 - movdqa xmm3,xmm2 - movdqa xmm4,xmm0 - pmaddwd xmm2,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm2=tmp0L - pmaddwd xmm0,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0H - pmaddwd xmm3,[GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3L - pmaddwd xmm4,[GOTOFF(ebx,PW_MF089_F060)] ; xmm4=tmp3H - - paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L - paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H - paddd xmm3,xmm5 ; xmm3=tmp3L - paddd xmm4,xmm7 ; xmm4=tmp3H - - movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L - movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H - - movdqa xmm2,xmm1 - movdqa xmm0,xmm1 - punpcklwd xmm2,xmm6 - punpckhwd xmm0,xmm6 - movdqa xmm1,xmm2 - movdqa xmm6,xmm0 - pmaddwd xmm2,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm2=tmp1L - pmaddwd xmm0,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1H - pmaddwd xmm1,[GOTOFF(ebx,PW_MF256_F050)] ; xmm1=tmp2L - pmaddwd xmm6,[GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H - - paddd xmm2,xmm5 ; xmm2=tmp1L - paddd xmm0,xmm7 ; xmm0=tmp1H - paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L - paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H - - movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L - movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H - - ; -- Final output stage - - movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L - movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H - - movdqa xmm2,xmm5 - movdqa xmm0,xmm7 - paddd xmm5,xmm3 ; xmm5=data0L - paddd xmm7,xmm4 ; xmm7=data0H - psubd xmm2,xmm3 ; xmm2=data7L - psubd xmm0,xmm4 ; xmm0=data7H - - movdqa xmm3,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm3=[PD_DESCALE_P1] - - paddd xmm5,xmm3 - paddd xmm7,xmm3 - psrad xmm5,DESCALE_P1 - psrad xmm7,DESCALE_P1 - paddd xmm2,xmm3 - paddd xmm0,xmm3 - psrad xmm2,DESCALE_P1 - psrad xmm0,DESCALE_P1 - - packssdw xmm5,xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07) - packssdw xmm2,xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77) - - movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L - movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H - - movdqa xmm7,xmm4 - movdqa xmm0,xmm3 - paddd xmm4,xmm1 ; xmm4=data1L - paddd xmm3,xmm6 ; 
xmm3=data1H - psubd xmm7,xmm1 ; xmm7=data6L - psubd xmm0,xmm6 ; xmm0=data6H - - movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm1=[PD_DESCALE_P1] - - paddd xmm4,xmm1 - paddd xmm3,xmm1 - psrad xmm4,DESCALE_P1 - psrad xmm3,DESCALE_P1 - paddd xmm7,xmm1 - paddd xmm0,xmm1 - psrad xmm7,DESCALE_P1 - psrad xmm0,DESCALE_P1 - - packssdw xmm4,xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17) - packssdw xmm7,xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67) - - movdqa xmm6,xmm5 ; transpose coefficients(phase 1) - punpcklwd xmm5,xmm4 ; xmm5=(00 10 01 11 02 12 03 13) - punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17) - movdqa xmm1,xmm7 ; transpose coefficients(phase 1) - punpcklwd xmm7,xmm2 ; xmm7=(60 70 61 71 62 72 63 73) - punpckhwd xmm1,xmm2 ; xmm1=(64 74 65 75 66 76 67 77) - - movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L - movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H - movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L - movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H - - movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13) - movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17) - movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73) - movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77) - - movdqa xmm5,xmm3 - movdqa xmm6,xmm0 - paddd xmm3,xmm4 ; xmm3=data2L - paddd xmm0,xmm2 ; xmm0=data2H - psubd xmm5,xmm4 ; xmm5=data5L - psubd xmm6,xmm2 ; xmm6=data5H - - movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm7=[PD_DESCALE_P1] - - paddd xmm3,xmm7 - paddd xmm0,xmm7 - psrad xmm3,DESCALE_P1 - psrad xmm0,DESCALE_P1 - paddd xmm5,xmm7 - paddd xmm6,xmm7 - psrad xmm5,DESCALE_P1 - psrad xmm6,DESCALE_P1 - - packssdw xmm3,xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27) - packssdw xmm5,xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57) - - movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L - movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H - movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L - movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H - - movdqa xmm0,xmm1 - movdqa xmm6,xmm4 - paddd xmm1,xmm2 ; xmm1=data3L - paddd 
xmm4,xmm7 ; xmm4=data3H - psubd xmm0,xmm2 ; xmm0=data4L - psubd xmm6,xmm7 ; xmm6=data4H - - movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm2=[PD_DESCALE_P1] - - paddd xmm1,xmm2 - paddd xmm4,xmm2 - psrad xmm1,DESCALE_P1 - psrad xmm4,DESCALE_P1 - paddd xmm0,xmm2 - paddd xmm6,xmm2 - psrad xmm0,DESCALE_P1 - psrad xmm6,DESCALE_P1 - - packssdw xmm1,xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37) - packssdw xmm0,xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47) - - movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13) - movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17) - - movdqa xmm4,xmm3 ; transpose coefficients(phase 1) - punpcklwd xmm3,xmm1 ; xmm3=(20 30 21 31 22 32 23 33) - punpckhwd xmm4,xmm1 ; xmm4=(24 34 25 35 26 36 27 37) - movdqa xmm6,xmm0 ; transpose coefficients(phase 1) - punpcklwd xmm0,xmm5 ; xmm0=(40 50 41 51 42 52 43 53) - punpckhwd xmm6,xmm5 ; xmm6=(44 54 45 55 46 56 47 57) - - movdqa xmm1,xmm7 ; transpose coefficients(phase 2) - punpckldq xmm7,xmm3 ; xmm7=(00 10 20 30 01 11 21 31) - punpckhdq xmm1,xmm3 ; xmm1=(02 12 22 32 03 13 23 33) - movdqa xmm5,xmm2 ; transpose coefficients(phase 2) - punpckldq xmm2,xmm4 ; xmm2=(04 14 24 34 05 15 25 35) - punpckhdq xmm5,xmm4 ; xmm5=(06 16 26 36 07 17 27 37) - - movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73) - movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77) - - movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35) - movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37) - - movdqa xmm2,xmm0 ; transpose coefficients(phase 2) - punpckldq xmm0,xmm3 ; xmm0=(40 50 60 70 41 51 61 71) - punpckhdq xmm2,xmm3 ; xmm2=(42 52 62 72 43 53 63 73) - movdqa xmm5,xmm6 ; transpose coefficients(phase 2) - punpckldq xmm6,xmm4 ; xmm6=(44 54 64 74 45 55 65 75) - punpckhdq xmm5,xmm4 ; xmm5=(46 56 66 76 47 57 67 77) - - movdqa xmm3,xmm7 ; transpose coefficients(phase 3) - punpcklqdq xmm7,xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70) - punpckhqdq xmm3,xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71) 
- movdqa xmm4,xmm1 ; transpose coefficients(phase 3) - punpcklqdq xmm1,xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72) - punpckhqdq xmm4,xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73) - - movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35) - movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37) - - movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1 - movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3 - - movdqa xmm3,xmm0 ; transpose coefficients(phase 3) - punpcklqdq xmm0,xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74) - punpckhqdq xmm3,xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75) - movdqa xmm4,xmm2 ; transpose coefficients(phase 3) - punpcklqdq xmm2,xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76) - punpckhqdq xmm4,xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77) - - movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 - movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 + ; -- Even part + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + movdqa xmm4,xmm1 ; xmm1=in2=z2 + movdqa xmm5,xmm1 + punpcklwd xmm4,xmm3 ; xmm3=in6=z3 + punpckhwd xmm5,xmm3 + movdqa xmm1,xmm4 + movdqa xmm3,xmm5 + pmaddwd xmm4,[GOTOFF(ebx,PW_F130_F054)] ; xmm4=tmp3L + pmaddwd xmm5,[GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H + pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L + pmaddwd xmm3,[GOTOFF(ebx,PW_F054_MF130)] ; xmm3=tmp2H + 
+ movdqa xmm6,xmm0 + paddw xmm0,xmm2 ; xmm0=in0+in4 + psubw xmm6,xmm2 ; xmm6=in0-in4 + + pxor xmm7,xmm7 + pxor xmm2,xmm2 + punpcklwd xmm7,xmm0 ; xmm7=tmp0L + punpckhwd xmm2,xmm0 ; xmm2=tmp0H + psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS + psrad xmm2,(16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS + + movdqa xmm0,xmm7 + paddd xmm7,xmm4 ; xmm7=tmp10L + psubd xmm0,xmm4 ; xmm0=tmp13L + movdqa xmm4,xmm2 + paddd xmm2,xmm5 ; xmm2=tmp10H + psubd xmm4,xmm5 ; xmm4=tmp13H + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L + movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H + movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L + movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H + + pxor xmm5,xmm5 + pxor xmm7,xmm7 + punpcklwd xmm5,xmm6 ; xmm5=tmp1L + punpckhwd xmm7,xmm6 ; xmm7=tmp1H + psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS + psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS + + movdqa xmm2,xmm5 + paddd xmm5,xmm1 ; xmm5=tmp11L + psubd xmm2,xmm1 ; xmm2=tmp12L + movdqa xmm0,xmm7 + paddd xmm7,xmm3 ; xmm7=tmp11H + psubd xmm0,xmm3 ; xmm0=tmp12H + + movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L + movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H + movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L + movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H + + ; -- Odd part + + movdqa xmm4, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm6, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw xmm4, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm6, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + movdqa xmm5,xmm6 + movdqa xmm7,xmm4 + paddw xmm5,xmm3 ; xmm5=z3 + paddw xmm7,xmm1 ; xmm7=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += 
z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm2,xmm5 + movdqa xmm0,xmm5 + punpcklwd xmm2,xmm7 + punpckhwd xmm0,xmm7 + movdqa xmm5,xmm2 + movdqa xmm7,xmm0 + pmaddwd xmm2,[GOTOFF(ebx,PW_MF078_F117)] ; xmm2=z3L + pmaddwd xmm0,[GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3H + pmaddwd xmm5,[GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L + pmaddwd xmm7,[GOTOFF(ebx,PW_F117_F078)] ; xmm7=z4H + + movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L + movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + movdqa xmm2,xmm3 + movdqa xmm0,xmm3 + punpcklwd xmm2,xmm4 + punpckhwd xmm0,xmm4 + movdqa xmm3,xmm2 + movdqa xmm4,xmm0 + pmaddwd xmm2,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm2=tmp0L + pmaddwd xmm0,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0H + pmaddwd xmm3,[GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3L + pmaddwd xmm4,[GOTOFF(ebx,PW_MF089_F060)] ; xmm4=tmp3H + + paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L + paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H + paddd xmm3,xmm5 ; xmm3=tmp3L + paddd xmm4,xmm7 ; xmm4=tmp3H + + movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L + movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H + + movdqa xmm2,xmm1 + movdqa xmm0,xmm1 + punpcklwd xmm2,xmm6 + punpckhwd xmm0,xmm6 + movdqa xmm1,xmm2 + movdqa xmm6,xmm0 + pmaddwd 
xmm2,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm2=tmp1L + pmaddwd xmm0,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1H + pmaddwd xmm1,[GOTOFF(ebx,PW_MF256_F050)] ; xmm1=tmp2L + pmaddwd xmm6,[GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H + + paddd xmm2,xmm5 ; xmm2=tmp1L + paddd xmm0,xmm7 ; xmm0=tmp1H + paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L + paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H + + movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L + movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H + + ; -- Final output stage + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L + movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H + + movdqa xmm2,xmm5 + movdqa xmm0,xmm7 + paddd xmm5,xmm3 ; xmm5=data0L + paddd xmm7,xmm4 ; xmm7=data0H + psubd xmm2,xmm3 ; xmm2=data7L + psubd xmm0,xmm4 ; xmm0=data7H + + movdqa xmm3,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm3=[PD_DESCALE_P1] + + paddd xmm5,xmm3 + paddd xmm7,xmm3 + psrad xmm5,DESCALE_P1 + psrad xmm7,DESCALE_P1 + paddd xmm2,xmm3 + paddd xmm0,xmm3 + psrad xmm2,DESCALE_P1 + psrad xmm0,DESCALE_P1 + + packssdw xmm5,xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07) + packssdw xmm2,xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77) + + movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L + movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H + + movdqa xmm7,xmm4 + movdqa xmm0,xmm3 + paddd xmm4,xmm1 ; xmm4=data1L + paddd xmm3,xmm6 ; xmm3=data1H + psubd xmm7,xmm1 ; xmm7=data6L + psubd xmm0,xmm6 ; xmm0=data6H + + movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm1=[PD_DESCALE_P1] + + paddd xmm4,xmm1 + paddd xmm3,xmm1 + psrad xmm4,DESCALE_P1 + psrad xmm3,DESCALE_P1 + paddd xmm7,xmm1 + paddd xmm0,xmm1 + psrad xmm7,DESCALE_P1 + psrad xmm0,DESCALE_P1 + + packssdw xmm4,xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17) + packssdw xmm7,xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67) + + movdqa xmm6,xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5,xmm4 ; xmm5=(00 10 01 11 02 12 03 13) + punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17) + movdqa xmm1,xmm7 ; transpose coefficients(phase 1) + punpcklwd xmm7,xmm2 ; xmm7=(60 70 61 71 62 72 63 
73) + punpckhwd xmm1,xmm2 ; xmm1=(64 74 65 75 66 76 67 77) + + movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L + movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H + movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L + movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H + + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13) + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17) + movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73) + movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77) + + movdqa xmm5,xmm3 + movdqa xmm6,xmm0 + paddd xmm3,xmm4 ; xmm3=data2L + paddd xmm0,xmm2 ; xmm0=data2H + psubd xmm5,xmm4 ; xmm5=data5L + psubd xmm6,xmm2 ; xmm6=data5H + + movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm7=[PD_DESCALE_P1] + + paddd xmm3,xmm7 + paddd xmm0,xmm7 + psrad xmm3,DESCALE_P1 + psrad xmm0,DESCALE_P1 + paddd xmm5,xmm7 + paddd xmm6,xmm7 + psrad xmm5,DESCALE_P1 + psrad xmm6,DESCALE_P1 + + packssdw xmm3,xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27) + packssdw xmm5,xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57) + + movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L + movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H + movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L + movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H + + movdqa xmm0,xmm1 + movdqa xmm6,xmm4 + paddd xmm1,xmm2 ; xmm1=data3L + paddd xmm4,xmm7 ; xmm4=data3H + psubd xmm0,xmm2 ; xmm0=data4L + psubd xmm6,xmm7 ; xmm6=data4H + + movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm2=[PD_DESCALE_P1] + + paddd xmm1,xmm2 + paddd xmm4,xmm2 + psrad xmm1,DESCALE_P1 + psrad xmm4,DESCALE_P1 + paddd xmm0,xmm2 + paddd xmm6,xmm2 + psrad xmm0,DESCALE_P1 + psrad xmm6,DESCALE_P1 + + packssdw xmm1,xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37) + packssdw xmm0,xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13) + movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17) + + movdqa xmm4,xmm3 ; transpose coefficients(phase 1) + punpcklwd xmm3,xmm1 ; xmm3=(20 30 21 31 22 32 23 33) + punpckhwd xmm4,xmm1 
; xmm4=(24 34 25 35 26 36 27 37) + movdqa xmm6,xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0,xmm5 ; xmm0=(40 50 41 51 42 52 43 53) + punpckhwd xmm6,xmm5 ; xmm6=(44 54 45 55 46 56 47 57) + + movdqa xmm1,xmm7 ; transpose coefficients(phase 2) + punpckldq xmm7,xmm3 ; xmm7=(00 10 20 30 01 11 21 31) + punpckhdq xmm1,xmm3 ; xmm1=(02 12 22 32 03 13 23 33) + movdqa xmm5,xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2,xmm4 ; xmm2=(04 14 24 34 05 15 25 35) + punpckhdq xmm5,xmm4 ; xmm5=(06 16 26 36 07 17 27 37) + + movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73) + movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77) + + movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35) + movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37) + + movdqa xmm2,xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0,xmm3 ; xmm0=(40 50 60 70 41 51 61 71) + punpckhdq xmm2,xmm3 ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm4 ; xmm6=(44 54 64 74 45 55 65 75) + punpckhdq xmm5,xmm4 ; xmm5=(46 56 66 76 47 57 67 77) + + movdqa xmm3,xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7,xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70) + punpckhqdq xmm3,xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71) + movdqa xmm4,xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1,xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72) + punpckhqdq xmm4,xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73) + + movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35) + movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1 + movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3 + + movdqa xmm3,xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0,xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74) + punpckhqdq xmm3,xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75) + movdqa xmm4,xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2,xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76) + punpckhqdq xmm4,xmm5 
; xmm4=col7=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 + movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 .column_end: - ; -- Prefetch the next coefficient block - - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] - - ; ---- Pass 2: process rows from work array, store into output array. - - mov eax, [original_ebp] - mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) - mov eax, JDIMENSION [output_col(eax)] - - ; -- Even part - - ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6 - - ; (Original) - ; z1 = (z2 + z3) * 0.541196100; - ; tmp2 = z1 + z3 * -1.847759065; - ; tmp3 = z1 + z2 * 0.765366865; - ; - ; (This implementation) - ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); - ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; - - movdqa xmm6,xmm1 ; xmm1=in2=z2 - movdqa xmm5,xmm1 - punpcklwd xmm6,xmm2 ; xmm2=in6=z3 - punpckhwd xmm5,xmm2 - movdqa xmm1,xmm6 - movdqa xmm2,xmm5 - pmaddwd xmm6,[GOTOFF(ebx,PW_F130_F054)] ; xmm6=tmp3L - pmaddwd xmm5,[GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H - pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L - pmaddwd xmm2,[GOTOFF(ebx,PW_F054_MF130)] ; xmm2=tmp2H - - movdqa xmm3,xmm7 - paddw xmm7,xmm0 ; xmm7=in0+in4 - psubw xmm3,xmm0 ; xmm3=in0-in4 - - pxor xmm4,xmm4 - pxor xmm0,xmm0 - punpcklwd xmm4,xmm7 ; xmm4=tmp0L - punpckhwd xmm0,xmm7 ; xmm0=tmp0H - psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS - psrad xmm0,(16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS - - movdqa xmm7,xmm4 - paddd xmm4,xmm6 ; xmm4=tmp10L - psubd xmm7,xmm6 ; xmm7=tmp13L - movdqa xmm6,xmm0 - paddd xmm0,xmm5 ; xmm0=tmp10H - psubd xmm6,xmm5 ; xmm6=tmp13H - - movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L - movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H - movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L - movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H - - 
pxor xmm5,xmm5 - pxor xmm4,xmm4 - punpcklwd xmm5,xmm3 ; xmm5=tmp1L - punpckhwd xmm4,xmm3 ; xmm4=tmp1H - psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS - psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS - - movdqa xmm0,xmm5 - paddd xmm5,xmm1 ; xmm5=tmp11L - psubd xmm0,xmm1 ; xmm0=tmp12L - movdqa xmm7,xmm4 - paddd xmm4,xmm2 ; xmm4=tmp11H - psubd xmm7,xmm2 ; xmm7=tmp12H - - movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L - movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H - movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L - movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H - - ; -- Odd part - - movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3 - movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1 - movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7 - movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5 - - movdqa xmm5,xmm6 - movdqa xmm4,xmm3 - paddw xmm5,xmm1 ; xmm5=z3 - paddw xmm4,xmm2 ; xmm4=z4 - - ; (Original) - ; z5 = (z3 + z4) * 1.175875602; - ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; - ; z3 += z5; z4 += z5; - ; - ; (This implementation) - ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; - ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); - - movdqa xmm0,xmm5 - movdqa xmm7,xmm5 - punpcklwd xmm0,xmm4 - punpckhwd xmm7,xmm4 - movdqa xmm5,xmm0 - movdqa xmm4,xmm7 - pmaddwd xmm0,[GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3L - pmaddwd xmm7,[GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3H - pmaddwd xmm5,[GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L - pmaddwd xmm4,[GOTOFF(ebx,PW_F117_F078)] ; xmm4=z4H - - movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L - movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H - - ; (Original) - ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; - ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; - ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; - ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; - ; tmp0 += z1 + z3; tmp1 += z2 + z4; - ; tmp2 += z2 + z3; tmp3 += z1 + z4; - ; - ; (This implementation) - ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; - ; 
tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; - ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); - ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); - ; tmp0 += z3; tmp1 += z4; - ; tmp2 += z3; tmp3 += z4; - - movdqa xmm0,xmm1 - movdqa xmm7,xmm1 - punpcklwd xmm0,xmm3 - punpckhwd xmm7,xmm3 - movdqa xmm1,xmm0 - movdqa xmm3,xmm7 - pmaddwd xmm0,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0L - pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp0H - pmaddwd xmm1,[GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp3L - pmaddwd xmm3,[GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3H - - paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L - paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H - paddd xmm1,xmm5 ; xmm1=tmp3L - paddd xmm3,xmm4 ; xmm3=tmp3H - - movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L - movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H - - movdqa xmm0,xmm2 - movdqa xmm7,xmm2 - punpcklwd xmm0,xmm6 - punpckhwd xmm7,xmm6 - movdqa xmm2,xmm0 - movdqa xmm6,xmm7 - pmaddwd xmm0,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1L - pmaddwd xmm7,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm7=tmp1H - pmaddwd xmm2,[GOTOFF(ebx,PW_MF256_F050)] ; xmm2=tmp2L - pmaddwd xmm6,[GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H - - paddd xmm0,xmm5 ; xmm0=tmp1L - paddd xmm7,xmm4 ; xmm7=tmp1H - paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L - paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H - - movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L - movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H - - ; -- Final output stage - - movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L - movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H - - movdqa xmm0,xmm5 - movdqa xmm7,xmm4 - paddd xmm5,xmm1 ; xmm5=data0L - paddd xmm4,xmm3 ; xmm4=data0H - psubd xmm0,xmm1 ; xmm0=data7L - psubd xmm7,xmm3 ; xmm7=data7H - - movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm1=[PD_DESCALE_P2] - - paddd xmm5,xmm1 - paddd xmm4,xmm1 - psrad xmm5,DESCALE_P2 - psrad xmm4,DESCALE_P2 - paddd xmm0,xmm1 - paddd xmm7,xmm1 - psrad xmm0,DESCALE_P2 - psrad xmm7,DESCALE_P2 - - packssdw xmm5,xmm4 ; 
xmm5=data0=(00 10 20 30 40 50 60 70) - packssdw xmm0,xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77) - - movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L - movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H - - movdqa xmm4,xmm3 - movdqa xmm7,xmm1 - paddd xmm3,xmm2 ; xmm3=data1L - paddd xmm1,xmm6 ; xmm1=data1H - psubd xmm4,xmm2 ; xmm4=data6L - psubd xmm7,xmm6 ; xmm7=data6H - - movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm2=[PD_DESCALE_P2] - - paddd xmm3,xmm2 - paddd xmm1,xmm2 - psrad xmm3,DESCALE_P2 - psrad xmm1,DESCALE_P2 - paddd xmm4,xmm2 - paddd xmm7,xmm2 - psrad xmm4,DESCALE_P2 - psrad xmm7,DESCALE_P2 - - packssdw xmm3,xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71) - packssdw xmm4,xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76) - - packsswb xmm5,xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) - packsswb xmm3,xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) - - movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L - movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H - movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L - movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H - - movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) - movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) - - movdqa xmm4,xmm6 - movdqa xmm0,xmm2 - paddd xmm6,xmm1 ; xmm6=data2L - paddd xmm2,xmm7 ; xmm2=data2H - psubd xmm4,xmm1 ; xmm4=data5L - psubd xmm0,xmm7 ; xmm0=data5H - - movdqa xmm5,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm5=[PD_DESCALE_P2] - - paddd xmm6,xmm5 - paddd xmm2,xmm5 - psrad xmm6,DESCALE_P2 - psrad xmm2,DESCALE_P2 - paddd xmm4,xmm5 - paddd xmm0,xmm5 - psrad xmm4,DESCALE_P2 - psrad xmm0,DESCALE_P2 - - packssdw xmm6,xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72) - packssdw xmm4,xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75) - - movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L - movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H - movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L - movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H - - movdqa xmm2,xmm3 - movdqa xmm0,xmm1 - paddd 
xmm3,xmm7 ; xmm3=data3L - paddd xmm1,xmm5 ; xmm1=data3H - psubd xmm2,xmm7 ; xmm2=data4L - psubd xmm0,xmm5 ; xmm0=data4H - - movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm7=[PD_DESCALE_P2] - - paddd xmm3,xmm7 - paddd xmm1,xmm7 - psrad xmm3,DESCALE_P2 - psrad xmm1,DESCALE_P2 - paddd xmm2,xmm7 - paddd xmm0,xmm7 - psrad xmm2,DESCALE_P2 - psrad xmm0,DESCALE_P2 - - movdqa xmm5,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm5=[PB_CENTERJSAMP] - - packssdw xmm3,xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73) - packssdw xmm2,xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74) - - movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) - movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) - - packsswb xmm6,xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) - packsswb xmm3,xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) - - paddb xmm7,xmm5 - paddb xmm1,xmm5 - paddb xmm6,xmm5 - paddb xmm3,xmm5 - - movdqa xmm0,xmm7 ; transpose coefficients(phase 1) - punpcklbw xmm7,xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) - punpckhbw xmm0,xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) - movdqa xmm2,xmm6 ; transpose coefficients(phase 1) - punpcklbw xmm6,xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) - punpckhbw xmm2,xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) - - movdqa xmm4,xmm7 ; transpose coefficients(phase 2) - punpcklwd xmm7,xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) - punpckhwd xmm4,xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) - movdqa xmm5,xmm2 ; transpose coefficients(phase 2) - punpcklwd xmm2,xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) - punpckhwd xmm5,xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) - - movdqa xmm1,xmm7 ; transpose coefficients(phase 3) - punpckldq xmm7,xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) - punpckhdq xmm1,xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 
32 33 34 35 36 37) - movdqa xmm3,xmm4 ; transpose coefficients(phase 3) - punpckldq xmm4,xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) - punpckhdq xmm3,xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) - - pshufd xmm6,xmm7,0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) - pshufd xmm0,xmm1,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) - pshufd xmm2,xmm4,0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) - pshufd xmm5,xmm3,0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) - - mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW] - movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm7 - movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1 - mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW] - movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 - movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 - - mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] - movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 - movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0 - mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW] - movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2 - movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5 - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; unused - poppic ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + ; -- Prefetch the next coefficient block + + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. 
+ + mov eax, [original_ebp] + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + + ; -- Even part + + ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6 + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + movdqa xmm6,xmm1 ; xmm1=in2=z2 + movdqa xmm5,xmm1 + punpcklwd xmm6,xmm2 ; xmm2=in6=z3 + punpckhwd xmm5,xmm2 + movdqa xmm1,xmm6 + movdqa xmm2,xmm5 + pmaddwd xmm6,[GOTOFF(ebx,PW_F130_F054)] ; xmm6=tmp3L + pmaddwd xmm5,[GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H + pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L + pmaddwd xmm2,[GOTOFF(ebx,PW_F054_MF130)] ; xmm2=tmp2H + + movdqa xmm3,xmm7 + paddw xmm7,xmm0 ; xmm7=in0+in4 + psubw xmm3,xmm0 ; xmm3=in0-in4 + + pxor xmm4,xmm4 + pxor xmm0,xmm0 + punpcklwd xmm4,xmm7 ; xmm4=tmp0L + punpckhwd xmm0,xmm7 ; xmm0=tmp0H + psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS + psrad xmm0,(16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS + + movdqa xmm7,xmm4 + paddd xmm4,xmm6 ; xmm4=tmp10L + psubd xmm7,xmm6 ; xmm7=tmp13L + movdqa xmm6,xmm0 + paddd xmm0,xmm5 ; xmm0=tmp10H + psubd xmm6,xmm5 ; xmm6=tmp13H + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H + movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L + movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H + + pxor xmm5,xmm5 + pxor xmm4,xmm4 + punpcklwd xmm5,xmm3 ; xmm5=tmp1L + punpckhwd xmm4,xmm3 ; xmm4=tmp1H + psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS + psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS + + movdqa xmm0,xmm5 + paddd xmm5,xmm1 ; xmm5=tmp11L + psubd xmm0,xmm1 ; xmm0=tmp12L + movdqa xmm7,xmm4 + paddd xmm4,xmm2 ; xmm4=tmp11H + psubd xmm7,xmm2 ; xmm7=tmp12H + + movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L + movdqa 
XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H + movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L + movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H + + ; -- Odd part + + movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3 + movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1 + movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7 + movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5 + + movdqa xmm5,xmm6 + movdqa xmm4,xmm3 + paddw xmm5,xmm1 ; xmm5=z3 + paddw xmm4,xmm2 ; xmm4=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm0,xmm5 + movdqa xmm7,xmm5 + punpcklwd xmm0,xmm4 + punpckhwd xmm7,xmm4 + movdqa xmm5,xmm0 + movdqa xmm4,xmm7 + pmaddwd xmm0,[GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3L + pmaddwd xmm7,[GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3H + pmaddwd xmm5,[GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L + pmaddwd xmm4,[GOTOFF(ebx,PW_F117_F078)] ; xmm4=z4H + + movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L + movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + movdqa xmm0,xmm1 + movdqa xmm7,xmm1 + punpcklwd xmm0,xmm3 + punpckhwd xmm7,xmm3 + movdqa xmm1,xmm0 + movdqa xmm3,xmm7 + pmaddwd xmm0,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0L + pmaddwd 
xmm7,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp0H + pmaddwd xmm1,[GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp3L + pmaddwd xmm3,[GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3H + + paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L + paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H + paddd xmm1,xmm5 ; xmm1=tmp3L + paddd xmm3,xmm4 ; xmm3=tmp3H + + movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L + movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H + + movdqa xmm0,xmm2 + movdqa xmm7,xmm2 + punpcklwd xmm0,xmm6 + punpckhwd xmm7,xmm6 + movdqa xmm2,xmm0 + movdqa xmm6,xmm7 + pmaddwd xmm0,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1L + pmaddwd xmm7,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm7=tmp1H + pmaddwd xmm2,[GOTOFF(ebx,PW_MF256_F050)] ; xmm2=tmp2L + pmaddwd xmm6,[GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H + + paddd xmm0,xmm5 ; xmm0=tmp1L + paddd xmm7,xmm4 ; xmm7=tmp1H + paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L + paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H + + movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L + movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H + + ; -- Final output stage + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L + movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H + + movdqa xmm0,xmm5 + movdqa xmm7,xmm4 + paddd xmm5,xmm1 ; xmm5=data0L + paddd xmm4,xmm3 ; xmm4=data0H + psubd xmm0,xmm1 ; xmm0=data7L + psubd xmm7,xmm3 ; xmm7=data7H + + movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm1=[PD_DESCALE_P2] + + paddd xmm5,xmm1 + paddd xmm4,xmm1 + psrad xmm5,DESCALE_P2 + psrad xmm4,DESCALE_P2 + paddd xmm0,xmm1 + paddd xmm7,xmm1 + psrad xmm0,DESCALE_P2 + psrad xmm7,DESCALE_P2 + + packssdw xmm5,xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70) + packssdw xmm0,xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77) + + movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L + movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H + + movdqa xmm4,xmm3 + movdqa xmm7,xmm1 + paddd xmm3,xmm2 ; xmm3=data1L + paddd xmm1,xmm6 ; xmm1=data1H + psubd xmm4,xmm2 ; xmm4=data6L + psubd xmm7,xmm6 ; xmm7=data6H + + movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm2=[PD_DESCALE_P2] + + paddd xmm3,xmm2 + 
paddd xmm1,xmm2 + psrad xmm3,DESCALE_P2 + psrad xmm1,DESCALE_P2 + paddd xmm4,xmm2 + paddd xmm7,xmm2 + psrad xmm4,DESCALE_P2 + psrad xmm7,DESCALE_P2 + + packssdw xmm3,xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71) + packssdw xmm4,xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76) + + packsswb xmm5,xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + packsswb xmm3,xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L + movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H + movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L + movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H + + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm4,xmm6 + movdqa xmm0,xmm2 + paddd xmm6,xmm1 ; xmm6=data2L + paddd xmm2,xmm7 ; xmm2=data2H + psubd xmm4,xmm1 ; xmm4=data5L + psubd xmm0,xmm7 ; xmm0=data5H + + movdqa xmm5,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm5=[PD_DESCALE_P2] + + paddd xmm6,xmm5 + paddd xmm2,xmm5 + psrad xmm6,DESCALE_P2 + psrad xmm2,DESCALE_P2 + paddd xmm4,xmm5 + paddd xmm0,xmm5 + psrad xmm4,DESCALE_P2 + psrad xmm0,DESCALE_P2 + + packssdw xmm6,xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72) + packssdw xmm4,xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75) + + movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L + movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H + movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L + movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H + + movdqa xmm2,xmm3 + movdqa xmm0,xmm1 + paddd xmm3,xmm7 ; xmm3=data3L + paddd xmm1,xmm5 ; xmm1=data3H + psubd xmm2,xmm7 ; xmm2=data4L + psubd xmm0,xmm5 ; xmm0=data4H + + movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm7=[PD_DESCALE_P2] + + paddd xmm3,xmm7 + paddd xmm1,xmm7 + psrad xmm3,DESCALE_P2 + psrad xmm1,DESCALE_P2 + paddd xmm2,xmm7 + paddd xmm0,xmm7 + psrad xmm2,DESCALE_P2 + psrad xmm0,DESCALE_P2 + + movdqa xmm5,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm5=[PB_CENTERJSAMP] + + packssdw xmm3,xmm1 ; 
xmm3=data3=(03 13 23 33 43 53 63 73) + packssdw xmm2,xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + packsswb xmm6,xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) + packsswb xmm3,xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) + + paddb xmm7,xmm5 + paddb xmm1,xmm5 + paddb xmm6,xmm5 + paddb xmm3,xmm5 + + movdqa xmm0,xmm7 ; transpose coefficients(phase 1) + punpcklbw xmm7,xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) + punpckhbw xmm0,xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) + movdqa xmm2,xmm6 ; transpose coefficients(phase 1) + punpcklbw xmm6,xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) + punpckhbw xmm2,xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) + + movdqa xmm4,xmm7 ; transpose coefficients(phase 2) + punpcklwd xmm7,xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) + punpckhwd xmm4,xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) + movdqa xmm5,xmm2 ; transpose coefficients(phase 2) + punpcklwd xmm2,xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + punpckhwd xmm5,xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) + + movdqa xmm1,xmm7 ; transpose coefficients(phase 3) + punpckldq xmm7,xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm1,xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + movdqa xmm3,xmm4 ; transpose coefficients(phase 3) + punpckldq xmm4,xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) + punpckhdq xmm3,xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) + + pshufd xmm6,xmm7,0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + pshufd xmm0,xmm1,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + pshufd xmm2,xmm4,0x4E ; xmm2=(50 51 52 53 54 55 56 57 
40 41 42 43 44 45 46 47) + pshufd xmm5,xmm3,0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm7 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1 + mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 + + mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0 + mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW] + movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2 + movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5 + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused + poppic ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jiss2red-64.asm b/simd/jiss2red-64.asm index 6807f17ce..bd7c35c45 100644 --- a/simd/jiss2red-64.asm +++ b/simd/jiss2red-64.asm @@ -27,74 +27,74 @@ ; -------------------------------------------------------------------------- -%define CONST_BITS 13 -%define PASS1_BITS 2 +%define CONST_BITS 13 +%define PASS1_BITS 2 -%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1) -%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1) -%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2) -%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2) +%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1) +%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1) +%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2) +%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2) %if CONST_BITS == 13 -F_0_211 equ 1730 ; FIX(0.211164243) -F_0_509 equ 4176 ; FIX(0.509795579) -F_0_601 equ 4926 ; FIX(0.601344887) -F_0_720 equ 5906 ; FIX(0.720959822) -F_0_765 equ 6270 ; FIX(0.765366865) -F_0_850 equ 6967 ; FIX(0.850430095) -F_0_899 equ 7373 ; FIX(0.899976223) -F_1_061 equ 8697 ; FIX(1.061594337) -F_1_272 equ 10426 ; FIX(1.272758580) -F_1_451 equ 11893 ; FIX(1.451774981) -F_1_847 equ 15137 ; FIX(1.847759065) -F_2_172 equ 17799 ; FIX(2.172734803) -F_2_562 equ 20995 ; FIX(2.562915447) -F_3_624 equ 29692 ; FIX(3.624509785) +F_0_211 equ 1730 ; FIX(0.211164243) +F_0_509 equ 4176 ; FIX(0.509795579) +F_0_601 equ 4926 ; FIX(0.601344887) +F_0_720 equ 5906 ; FIX(0.720959822) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_850 equ 6967 ; FIX(0.850430095) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_061 equ 8697 ; FIX(1.061594337) +F_1_272 equ 10426 ; FIX(1.272758580) +F_1_451 equ 11893 ; FIX(1.451774981) +F_1_847 equ 15137 ; FIX(1.847759065) +F_2_172 equ 17799 ; FIX(2.172734803) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_624 equ 29692 ; FIX(3.624509785) %else ; NASM cannot do compile-time arithmetic on floating-point constants. 
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) -F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243) -F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579) -F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887) -F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822) -F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) -F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095) -F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) -F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337) -F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580) -F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981) -F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) -F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803) -F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) -F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785) +F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243) +F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579) +F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887) +F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822) +F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) +F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095) +F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) +F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337) +F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580) +F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981) +F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) +F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803) +F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) +F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785) %endif ; -------------------------------------------------------------------------- - SECTION SEG_CONST 
+ SECTION SEG_CONST - alignz 16 - global EXTN(jconst_idct_red_sse2) + alignz 16 + global EXTN(jconst_idct_red_sse2) EXTN(jconst_idct_red_sse2): -PW_F184_MF076 times 4 dw F_1_847,-F_0_765 -PW_F256_F089 times 4 dw F_2_562, F_0_899 -PW_F106_MF217 times 4 dw F_1_061,-F_2_172 -PW_MF060_MF050 times 4 dw -F_0_601,-F_0_509 -PW_F145_MF021 times 4 dw F_1_451,-F_0_211 -PW_F362_MF127 times 4 dw F_3_624,-F_1_272 -PW_F085_MF072 times 4 dw F_0_850,-F_0_720 -PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4-1) -PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4-1) -PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2-1) -PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2-1) -PB_CENTERJSAMP times 16 db CENTERJSAMPLE - - alignz 16 +PW_F184_MF076 times 4 dw F_1_847,-F_0_765 +PW_F256_F089 times 4 dw F_2_562, F_0_899 +PW_F106_MF217 times 4 dw F_1_061,-F_2_172 +PW_MF060_MF050 times 4 dw -F_0_601,-F_0_509 +PW_F145_MF021 times 4 dw F_1_451,-F_0_211 +PW_F362_MF127 times 4 dw F_3_624,-F_1_272 +PW_F085_MF072 times 4 dw F_0_850,-F_0_720 +PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4-1) +PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4-1) +PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2-1) +PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2-1) +PB_CENTERJSAMP times 16 db CENTERJSAMPLE + + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 64 + SECTION SEG_TEXT + BITS 64 ; ; Perform dequantization and inverse DCT on one block of coefficients, ; producing a reduced-size 4x4 output block. 
@@ -109,292 +109,292 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; r12 = JSAMPARRAY output_buf ; r13 = JDIMENSION output_col -%define original_rbp rbp+0 -%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 2 +%define original_rbp rbp+0 +%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 - align 16 - global EXTN(jsimd_idct_4x4_sse2) + align 16 + global EXTN(jsimd_idct_4x4_sse2) EXTN(jsimd_idct_4x4_sse2): - push rbp - mov rax,rsp ; rax = original rbp - sub rsp, byte 4 - and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [rsp],rax - mov rbp,rsp ; rbp = aligned rbp - lea rsp, [wk(0)] - collect_args + push rbp + mov rax,rsp ; rax = original rbp + sub rsp, byte 4 + and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [rsp],rax + mov rbp,rsp ; rbp = aligned rbp + lea rsp, [wk(0)] + collect_args - ; ---- Pass 1: process columns from input. + ; ---- Pass 1: process columns from input. - mov rdx, r10 ; quantptr - mov rsi, r11 ; inptr + mov rdx, r10 ; quantptr + mov rsi, r11 ; inptr %ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2 - mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] - jnz short .columnDCT - - movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] - por xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] - por xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] - por xmm0,xmm1 - packsswb xmm0,xmm0 - packsswb xmm0,xmm0 - movd eax,xmm0 - test rax,rax - jnz short .columnDCT - - ; -- AC terms all zero - - movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] - pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - - psllw xmm0,PASS1_BITS - - movdqa xmm3,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) - punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) - punpckhwd xmm3,xmm3 ; xmm3=(04 04 05 05 06 06 07 
07) - - pshufd xmm1,xmm0,0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01) - pshufd xmm0,xmm0,0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03) - pshufd xmm6,xmm3,0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05) - pshufd xmm3,xmm3,0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07) - - jmp near .column_end + mov eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)] + jnz short .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + por xmm0,xmm1 + packsswb xmm0,xmm0 + packsswb xmm0,xmm0 + movd eax,xmm0 + test rax,rax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + psllw xmm0,PASS1_BITS + + movdqa xmm3,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) + punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + punpckhwd xmm3,xmm3 ; xmm3=(04 04 05 05 06 06 07 07) + + pshufd xmm1,xmm0,0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01) + pshufd xmm0,xmm0,0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03) + pshufd xmm6,xmm3,0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05) + pshufd xmm3,xmm3,0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07) + + jmp near .column_end %endif .columnDCT: - ; -- Odd part - - movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] - pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] - movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] - pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm3, XMMWORD 
[XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - - movdqa xmm4,xmm0 - movdqa xmm5,xmm0 - punpcklwd xmm4,xmm1 - punpckhwd xmm5,xmm1 - movdqa xmm0,xmm4 - movdqa xmm1,xmm5 - pmaddwd xmm4,[rel PW_F256_F089] ; xmm4=(tmp2L) - pmaddwd xmm5,[rel PW_F256_F089] ; xmm5=(tmp2H) - pmaddwd xmm0,[rel PW_F106_MF217] ; xmm0=(tmp0L) - pmaddwd xmm1,[rel PW_F106_MF217] ; xmm1=(tmp0H) - - movdqa xmm6,xmm2 - movdqa xmm7,xmm2 - punpcklwd xmm6,xmm3 - punpckhwd xmm7,xmm3 - movdqa xmm2,xmm6 - movdqa xmm3,xmm7 - pmaddwd xmm6,[rel PW_MF060_MF050] ; xmm6=(tmp2L) - pmaddwd xmm7,[rel PW_MF060_MF050] ; xmm7=(tmp2H) - pmaddwd xmm2,[rel PW_F145_MF021] ; xmm2=(tmp0L) - pmaddwd xmm3,[rel PW_F145_MF021] ; xmm3=(tmp0H) - - paddd xmm6,xmm4 ; xmm6=tmp2L - paddd xmm7,xmm5 ; xmm7=tmp2H - paddd xmm2,xmm0 ; xmm2=tmp0L - paddd xmm3,xmm1 ; xmm3=tmp0H - - movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L - movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H - - ; -- Even part - - movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] - movdqa xmm5, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] - movdqa xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] - pmullw xmm4, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - - pxor xmm1,xmm1 - pxor xmm2,xmm2 - punpcklwd xmm1,xmm4 ; xmm1=tmp0L - punpckhwd xmm2,xmm4 ; xmm2=tmp0H - psrad xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1 - psrad xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1 - - movdqa xmm3,xmm5 ; xmm5=in2=z2 - punpcklwd xmm5,xmm0 ; xmm0=in6=z3 - punpckhwd xmm3,xmm0 - pmaddwd xmm5,[rel PW_F184_MF076] ; xmm5=tmp2L - pmaddwd xmm3,[rel PW_F184_MF076] ; xmm3=tmp2H - - movdqa xmm4,xmm1 - movdqa xmm0,xmm2 - paddd xmm1,xmm5 ; xmm1=tmp10L - paddd xmm2,xmm3 ; xmm2=tmp10H - psubd xmm4,xmm5 ; xmm4=tmp12L - psubd xmm0,xmm3 ; xmm0=tmp12H - - ; -- Final output stage - - movdqa xmm5,xmm1 - movdqa xmm3,xmm2 - paddd xmm1,xmm6 ; 
xmm1=data0L - paddd xmm2,xmm7 ; xmm2=data0H - psubd xmm5,xmm6 ; xmm5=data3L - psubd xmm3,xmm7 ; xmm3=data3H - - movdqa xmm6,[rel PD_DESCALE_P1_4] ; xmm6=[rel PD_DESCALE_P1_4] - - paddd xmm1,xmm6 - paddd xmm2,xmm6 - psrad xmm1,DESCALE_P1_4 - psrad xmm2,DESCALE_P1_4 - paddd xmm5,xmm6 - paddd xmm3,xmm6 - psrad xmm5,DESCALE_P1_4 - psrad xmm3,DESCALE_P1_4 - - packssdw xmm1,xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07) - packssdw xmm5,xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37) - - movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L - movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H - - movdqa xmm2,xmm4 - movdqa xmm3,xmm0 - paddd xmm4,xmm7 ; xmm4=data1L - paddd xmm0,xmm6 ; xmm0=data1H - psubd xmm2,xmm7 ; xmm2=data2L - psubd xmm3,xmm6 ; xmm3=data2H - - movdqa xmm7,[rel PD_DESCALE_P1_4] ; xmm7=[rel PD_DESCALE_P1_4] - - paddd xmm4,xmm7 - paddd xmm0,xmm7 - psrad xmm4,DESCALE_P1_4 - psrad xmm0,DESCALE_P1_4 - paddd xmm2,xmm7 - paddd xmm3,xmm7 - psrad xmm2,DESCALE_P1_4 - psrad xmm3,DESCALE_P1_4 - - packssdw xmm4,xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17) - packssdw xmm2,xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27) - - movdqa xmm6,xmm1 ; transpose coefficients(phase 1) - punpcklwd xmm1,xmm4 ; xmm1=(00 10 01 11 02 12 03 13) - punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17) - movdqa xmm7,xmm2 ; transpose coefficients(phase 1) - punpcklwd xmm2,xmm5 ; xmm2=(20 30 21 31 22 32 23 33) - punpckhwd xmm7,xmm5 ; xmm7=(24 34 25 35 26 36 27 37) - - movdqa xmm0,xmm1 ; transpose coefficients(phase 2) - punpckldq xmm1,xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31) - punpckhdq xmm0,xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33) - movdqa xmm3,xmm6 ; transpose coefficients(phase 2) - punpckldq xmm6,xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35) - punpckhdq xmm3,xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37) + ; -- Odd part + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] 
+ pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + movdqa xmm4,xmm0 + movdqa xmm5,xmm0 + punpcklwd xmm4,xmm1 + punpckhwd xmm5,xmm1 + movdqa xmm0,xmm4 + movdqa xmm1,xmm5 + pmaddwd xmm4,[rel PW_F256_F089] ; xmm4=(tmp2L) + pmaddwd xmm5,[rel PW_F256_F089] ; xmm5=(tmp2H) + pmaddwd xmm0,[rel PW_F106_MF217] ; xmm0=(tmp0L) + pmaddwd xmm1,[rel PW_F106_MF217] ; xmm1=(tmp0H) + + movdqa xmm6,xmm2 + movdqa xmm7,xmm2 + punpcklwd xmm6,xmm3 + punpckhwd xmm7,xmm3 + movdqa xmm2,xmm6 + movdqa xmm3,xmm7 + pmaddwd xmm6,[rel PW_MF060_MF050] ; xmm6=(tmp2L) + pmaddwd xmm7,[rel PW_MF060_MF050] ; xmm7=(tmp2H) + pmaddwd xmm2,[rel PW_F145_MF021] ; xmm2=(tmp0L) + pmaddwd xmm3,[rel PW_F145_MF021] ; xmm3=(tmp0H) + + paddd xmm6,xmm4 ; xmm6=tmp2L + paddd xmm7,xmm5 ; xmm7=tmp2H + paddd xmm2,xmm0 ; xmm2=tmp0L + paddd xmm3,xmm1 ; xmm3=tmp0H + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H + + ; -- Even part + + movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + movdqa xmm5, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)] + movdqa xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)] + pmullw xmm4, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + + pxor xmm1,xmm1 + pxor xmm2,xmm2 + punpcklwd xmm1,xmm4 ; xmm1=tmp0L + punpckhwd xmm2,xmm4 ; xmm2=tmp0H + psrad xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1 + psrad xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1 + + movdqa xmm3,xmm5 ; xmm5=in2=z2 + punpcklwd xmm5,xmm0 ; xmm0=in6=z3 + punpckhwd xmm3,xmm0 + pmaddwd xmm5,[rel PW_F184_MF076] ; xmm5=tmp2L + pmaddwd xmm3,[rel PW_F184_MF076] ; xmm3=tmp2H 
+ + movdqa xmm4,xmm1 + movdqa xmm0,xmm2 + paddd xmm1,xmm5 ; xmm1=tmp10L + paddd xmm2,xmm3 ; xmm2=tmp10H + psubd xmm4,xmm5 ; xmm4=tmp12L + psubd xmm0,xmm3 ; xmm0=tmp12H + + ; -- Final output stage + + movdqa xmm5,xmm1 + movdqa xmm3,xmm2 + paddd xmm1,xmm6 ; xmm1=data0L + paddd xmm2,xmm7 ; xmm2=data0H + psubd xmm5,xmm6 ; xmm5=data3L + psubd xmm3,xmm7 ; xmm3=data3H + + movdqa xmm6,[rel PD_DESCALE_P1_4] ; xmm6=[rel PD_DESCALE_P1_4] + + paddd xmm1,xmm6 + paddd xmm2,xmm6 + psrad xmm1,DESCALE_P1_4 + psrad xmm2,DESCALE_P1_4 + paddd xmm5,xmm6 + paddd xmm3,xmm6 + psrad xmm5,DESCALE_P1_4 + psrad xmm3,DESCALE_P1_4 + + packssdw xmm1,xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07) + packssdw xmm5,xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H + + movdqa xmm2,xmm4 + movdqa xmm3,xmm0 + paddd xmm4,xmm7 ; xmm4=data1L + paddd xmm0,xmm6 ; xmm0=data1H + psubd xmm2,xmm7 ; xmm2=data2L + psubd xmm3,xmm6 ; xmm3=data2H + + movdqa xmm7,[rel PD_DESCALE_P1_4] ; xmm7=[rel PD_DESCALE_P1_4] + + paddd xmm4,xmm7 + paddd xmm0,xmm7 + psrad xmm4,DESCALE_P1_4 + psrad xmm0,DESCALE_P1_4 + paddd xmm2,xmm7 + paddd xmm3,xmm7 + psrad xmm2,DESCALE_P1_4 + psrad xmm3,DESCALE_P1_4 + + packssdw xmm4,xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17) + packssdw xmm2,xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27) + + movdqa xmm6,xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1,xmm4 ; xmm1=(00 10 01 11 02 12 03 13) + punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17) + movdqa xmm7,xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2,xmm5 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm7,xmm5 ; xmm7=(24 34 25 35 26 36 27 37) + + movdqa xmm0,xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1,xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31) + punpckhdq xmm0,xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33) + movdqa xmm3,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35) + 
punpckhdq xmm3,xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37) .column_end: - ; -- Prefetch the next coefficient block + ; -- Prefetch the next coefficient block - prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] - prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] - prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] - prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] - ; ---- Pass 2: process rows, store into output array. + ; ---- Pass 2: process rows, store into output array. - mov rax, [original_rbp] - mov rdi, r12 ; (JSAMPROW *) - mov rax, r13 + mov rax, [original_rbp] + mov rdi, r12 ; (JSAMPROW *) + mov rax, r13 - ; -- Even part + ; -- Even part - pxor xmm4,xmm4 - punpcklwd xmm4,xmm1 ; xmm4=tmp0 - psrad xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1 + pxor xmm4,xmm4 + punpcklwd xmm4,xmm1 ; xmm4=tmp0 + psrad xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1 - ; -- Odd part + ; -- Odd part - punpckhwd xmm1,xmm0 - punpckhwd xmm6,xmm3 - movdqa xmm5,xmm1 - movdqa xmm2,xmm6 - pmaddwd xmm1,[rel PW_F256_F089] ; xmm1=(tmp2) - pmaddwd xmm6,[rel PW_MF060_MF050] ; xmm6=(tmp2) - pmaddwd xmm5,[rel PW_F106_MF217] ; xmm5=(tmp0) - pmaddwd xmm2,[rel PW_F145_MF021] ; xmm2=(tmp0) + punpckhwd xmm1,xmm0 + punpckhwd xmm6,xmm3 + movdqa xmm5,xmm1 + movdqa xmm2,xmm6 + pmaddwd xmm1,[rel PW_F256_F089] ; xmm1=(tmp2) + pmaddwd xmm6,[rel PW_MF060_MF050] ; xmm6=(tmp2) + pmaddwd xmm5,[rel PW_F106_MF217] ; xmm5=(tmp0) + pmaddwd xmm2,[rel PW_F145_MF021] ; xmm2=(tmp0) - paddd xmm6,xmm1 ; xmm6=tmp2 - paddd xmm2,xmm5 ; xmm2=tmp0 + paddd xmm6,xmm1 ; xmm6=tmp2 + paddd xmm2,xmm5 ; xmm2=tmp0 - ; -- Even part + ; -- Even part - punpcklwd xmm0,xmm3 - pmaddwd xmm0,[rel PW_F184_MF076] ; xmm0=tmp2 + punpcklwd xmm0,xmm3 + pmaddwd xmm0,[rel PW_F184_MF076] ; xmm0=tmp2 - movdqa 
xmm7,xmm4 - paddd xmm4,xmm0 ; xmm4=tmp10 - psubd xmm7,xmm0 ; xmm7=tmp12 + movdqa xmm7,xmm4 + paddd xmm4,xmm0 ; xmm4=tmp10 + psubd xmm7,xmm0 ; xmm7=tmp12 - ; -- Final output stage + ; -- Final output stage - movdqa xmm1,[rel PD_DESCALE_P2_4] ; xmm1=[rel PD_DESCALE_P2_4] + movdqa xmm1,[rel PD_DESCALE_P2_4] ; xmm1=[rel PD_DESCALE_P2_4] - movdqa xmm5,xmm4 - movdqa xmm3,xmm7 - paddd xmm4,xmm6 ; xmm4=data0=(00 10 20 30) - paddd xmm7,xmm2 ; xmm7=data1=(01 11 21 31) - psubd xmm5,xmm6 ; xmm5=data3=(03 13 23 33) - psubd xmm3,xmm2 ; xmm3=data2=(02 12 22 32) + movdqa xmm5,xmm4 + movdqa xmm3,xmm7 + paddd xmm4,xmm6 ; xmm4=data0=(00 10 20 30) + paddd xmm7,xmm2 ; xmm7=data1=(01 11 21 31) + psubd xmm5,xmm6 ; xmm5=data3=(03 13 23 33) + psubd xmm3,xmm2 ; xmm3=data2=(02 12 22 32) - paddd xmm4,xmm1 - paddd xmm7,xmm1 - psrad xmm4,DESCALE_P2_4 - psrad xmm7,DESCALE_P2_4 - paddd xmm5,xmm1 - paddd xmm3,xmm1 - psrad xmm5,DESCALE_P2_4 - psrad xmm3,DESCALE_P2_4 + paddd xmm4,xmm1 + paddd xmm7,xmm1 + psrad xmm4,DESCALE_P2_4 + psrad xmm7,DESCALE_P2_4 + paddd xmm5,xmm1 + paddd xmm3,xmm1 + psrad xmm5,DESCALE_P2_4 + psrad xmm3,DESCALE_P2_4 - packssdw xmm4,xmm3 ; xmm4=(00 10 20 30 02 12 22 32) - packssdw xmm7,xmm5 ; xmm7=(01 11 21 31 03 13 23 33) + packssdw xmm4,xmm3 ; xmm4=(00 10 20 30 02 12 22 32) + packssdw xmm7,xmm5 ; xmm7=(01 11 21 31 03 13 23 33) - movdqa xmm0,xmm4 ; transpose coefficients(phase 1) - punpcklwd xmm4,xmm7 ; xmm4=(00 01 10 11 20 21 30 31) - punpckhwd xmm0,xmm7 ; xmm0=(02 03 12 13 22 23 32 33) + movdqa xmm0,xmm4 ; transpose coefficients(phase 1) + punpcklwd xmm4,xmm7 ; xmm4=(00 01 10 11 20 21 30 31) + punpckhwd xmm0,xmm7 ; xmm0=(02 03 12 13 22 23 32 33) - movdqa xmm6,xmm4 ; transpose coefficients(phase 2) - punpckldq xmm4,xmm0 ; xmm4=(00 01 02 03 10 11 12 13) - punpckhdq xmm6,xmm0 ; xmm6=(20 21 22 23 30 31 32 33) + movdqa xmm6,xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4,xmm0 ; xmm4=(00 01 02 03 10 11 12 13) + punpckhdq xmm6,xmm0 ; xmm6=(20 21 22 23 30 31 32 33) - 
packsswb xmm4,xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..) - paddb xmm4,[rel PB_CENTERJSAMP] + packsswb xmm4,xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..) + paddb xmm4,[rel PB_CENTERJSAMP] - pshufd xmm2,xmm4,0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..) - pshufd xmm1,xmm4,0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..) - pshufd xmm3,xmm4,0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..) + pshufd xmm2,xmm4,0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..) + pshufd xmm1,xmm4,0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..) + pshufd xmm3,xmm4,0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..) - mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] - mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] - movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4 - movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2 - mov rdx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] - mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] - movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1 - movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 + mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] + movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4 + movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2 + mov rdx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW] + movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1 + movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 - uncollect_args - mov rsp,rbp ; rsp <- aligned rbp - pop rsp ; rsp <- original rbp - pop rbp - ret + uncollect_args + mov rsp,rbp ; rsp <- aligned rbp + pop rsp ; rsp <- original rbp + pop rbp + ret ; -------------------------------------------------------------------------- @@ -412,165 +412,165 @@ EXTN(jsimd_idct_4x4_sse2): ; r12 = JSAMPARRAY output_buf ; r13 = JDIMENSION output_col - align 16 - global EXTN(jsimd_idct_2x2_sse2) + align 16 + global EXTN(jsimd_idct_2x2_sse2) EXTN(jsimd_idct_2x2_sse2): - push rbp - mov rax,rsp - mov rbp,rsp - collect_args - push rbx - - ; ---- Pass 1: process columns from input. 
- - mov rdx, r10 ; quantptr - mov rsi, r11 ; inptr - - ; | input: | result: | - ; | 00 01 ** 03 ** 05 ** 07 | | - ; | 10 11 ** 13 ** 15 ** 17 | | - ; | ** ** ** ** ** ** ** ** | | - ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 | - ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 | - ; | 50 51 ** 53 ** 55 ** 57 | | - ; | ** ** ** ** ** ** ** ** | | - ; | 70 71 ** 73 ** 75 ** 77 | | - - ; -- Odd part + push rbp + mov rax,rsp + mov rbp,rsp + collect_args + push rbx + + ; ---- Pass 1: process columns from input. + + mov rdx, r10 ; quantptr + mov rsi, r11 ; inptr + + ; | input: | result: | + ; | 00 01 ** 03 ** 05 ** 07 | | + ; | 10 11 ** 13 ** 15 ** 17 | | + ; | ** ** ** ** ** ** ** ** | | + ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 | + ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 | + ; | 50 51 ** 53 ** 55 ** 57 | | + ; | ** ** ** ** ** ** ** ** | | + ; | 70 71 ** 73 ** 75 ** 77 | | + + ; -- Odd part - movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] - pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] - movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] - pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37) - ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 
** 73 ** 75 ** 77) + ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37) + ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77) - pcmpeqd xmm7,xmm7 - pslld xmm7,WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..} + pcmpeqd xmm7,xmm7 + pslld xmm7,WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..} - movdqa xmm4,xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17) - movdqa xmm5,xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57) - punpcklwd xmm4,xmm1 ; xmm4=(10 30 11 31 ** ** 13 33) - punpcklwd xmm5,xmm3 ; xmm5=(50 70 51 71 ** ** 53 73) - pmaddwd xmm4,[rel PW_F362_MF127] - pmaddwd xmm5,[rel PW_F085_MF072] + movdqa xmm4,xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17) + movdqa xmm5,xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57) + punpcklwd xmm4,xmm1 ; xmm4=(10 30 11 31 ** ** 13 33) + punpcklwd xmm5,xmm3 ; xmm5=(50 70 51 71 ** ** 53 73) + pmaddwd xmm4,[rel PW_F362_MF127] + pmaddwd xmm5,[rel PW_F085_MF072] - psrld xmm0,WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --) - pand xmm1,xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37) - psrld xmm2,WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --) - pand xmm3,xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77) - por xmm0,xmm1 ; xmm0=(11 31 13 33 15 35 17 37) - por xmm2,xmm3 ; xmm2=(51 71 53 73 55 75 57 77) - pmaddwd xmm0,[rel PW_F362_MF127] - pmaddwd xmm2,[rel PW_F085_MF072] + psrld xmm0,WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --) + pand xmm1,xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37) + psrld xmm2,WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --) + pand xmm3,xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77) + por xmm0,xmm1 ; xmm0=(11 31 13 33 15 35 17 37) + por xmm2,xmm3 ; xmm2=(51 71 53 73 55 75 57 77) + pmaddwd xmm0,[rel PW_F362_MF127] + pmaddwd xmm2,[rel PW_F085_MF072] - paddd xmm4,xmm5 ; xmm4=tmp0[col0 col1 **** col3] - paddd xmm0,xmm2 ; xmm0=tmp0[col1 col3 col5 col7] + paddd xmm4,xmm5 ; xmm4=tmp0[col0 col1 **** col3] + paddd xmm0,xmm2 ; xmm0=tmp0[col1 col3 col5 col7] - ; -- Even part + ; -- Even part - movdqa xmm6, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] - pmullw xmm6, XMMWORD 
[XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm6, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)] + pmullw xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)] - ; xmm6=(00 01 ** 03 ** 05 ** 07) + ; xmm6=(00 01 ** 03 ** 05 ** 07) - movdqa xmm1,xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07) - pslld xmm6,WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **) - pand xmm1,xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07) - psrad xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****] - psrad xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7] + movdqa xmm1,xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07) + pslld xmm6,WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **) + pand xmm1,xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07) + psrad xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****] + psrad xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7] - ; -- Final output stage + ; -- Final output stage - movdqa xmm3,xmm6 - movdqa xmm5,xmm1 - paddd xmm6,xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **) - paddd xmm1,xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7) - psubd xmm3,xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **) - psubd xmm5,xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7) + movdqa xmm3,xmm6 + movdqa xmm5,xmm1 + paddd xmm6,xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **) + paddd xmm1,xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7) + psubd xmm3,xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **) + psubd xmm5,xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7) - movdqa xmm2,[rel PD_DESCALE_P1_2] ; xmm2=[rel PD_DESCALE_P1_2] + movdqa xmm2,[rel PD_DESCALE_P1_2] ; xmm2=[rel PD_DESCALE_P1_2] - punpckldq xmm6,xmm3 ; xmm6=(A0 B0 ** **) + punpckldq xmm6,xmm3 ; xmm6=(A0 B0 ** **) - movdqa xmm7,xmm1 - punpcklqdq xmm1,xmm5 ; xmm1=(A1 A3 B1 B3) - punpckhqdq xmm7,xmm5 ; xmm7=(A5 A7 B5 B7) + movdqa xmm7,xmm1 + punpcklqdq xmm1,xmm5 ; xmm1=(A1 A3 B1 B3) + punpckhqdq xmm7,xmm5 ; xmm7=(A5 A7 B5 B7) - paddd xmm6,xmm2 - psrad xmm6,DESCALE_P1_2 + paddd xmm6,xmm2 + psrad 
xmm6,DESCALE_P1_2 - paddd xmm1,xmm2 - paddd xmm7,xmm2 - psrad xmm1,DESCALE_P1_2 - psrad xmm7,DESCALE_P1_2 + paddd xmm1,xmm2 + paddd xmm7,xmm2 + psrad xmm1,DESCALE_P1_2 + psrad xmm7,DESCALE_P1_2 - ; -- Prefetch the next coefficient block + ; -- Prefetch the next coefficient block - prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] - prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] - prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] - prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32] - ; ---- Pass 2: process rows, store into output array. + ; ---- Pass 2: process rows, store into output array. - mov rdi, r12 ; (JSAMPROW *) - mov rax, r13 + mov rdi, r12 ; (JSAMPROW *) + mov rax, r13 - ; | input:| result:| - ; | A0 B0 | | - ; | A1 B1 | C0 C1 | - ; | A3 B3 | D0 D1 | - ; | A5 B5 | | - ; | A7 B7 | | + ; | input:| result:| + ; | A0 B0 | | + ; | A1 B1 | C0 C1 | + ; | A3 B3 | D0 D1 | + ; | A5 B5 | | + ; | A7 B7 | | - ; -- Odd part + ; -- Odd part - packssdw xmm1,xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3) - packssdw xmm7,xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7) - pmaddwd xmm1,[rel PW_F362_MF127] - pmaddwd xmm7,[rel PW_F085_MF072] + packssdw xmm1,xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3) + packssdw xmm7,xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7) + pmaddwd xmm1,[rel PW_F362_MF127] + pmaddwd xmm7,[rel PW_F085_MF072] - paddd xmm1,xmm7 ; xmm1=tmp0[row0 row1 row0 row1] + paddd xmm1,xmm7 ; xmm1=tmp0[row0 row1 row0 row1] - ; -- Even part + ; -- Even part - pslld xmm6,(CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****] + pslld xmm6,(CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****] - ; -- Final output stage + ; -- Final output stage - movdqa xmm4,xmm6 - paddd xmm6,xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **) - psubd xmm4,xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **) + movdqa xmm4,xmm6 + paddd 
xmm6,xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **) + psubd xmm4,xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **) - punpckldq xmm6,xmm4 ; xmm6=(C0 D0 C1 D1) + punpckldq xmm6,xmm4 ; xmm6=(C0 D0 C1 D1) - paddd xmm6,[rel PD_DESCALE_P2_2] - psrad xmm6,DESCALE_P2_2 + paddd xmm6,[rel PD_DESCALE_P2_2] + psrad xmm6,DESCALE_P2_2 - packssdw xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1) - packsswb xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..) - paddb xmm6,[rel PB_CENTERJSAMP] + packssdw xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1) + packsswb xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..) + paddb xmm6,[rel PB_CENTERJSAMP] - pextrw ebx,xmm6,0x00 ; ebx=(C0 D0 -- --) - pextrw ecx,xmm6,0x01 ; ecx=(C1 D1 -- --) + pextrw ebx,xmm6,0x00 ; ebx=(C0 D0 -- --) + pextrw ecx,xmm6,0x01 ; ecx=(C1 D1 -- --) - mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] - mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] - mov WORD [rdx+rax*SIZEOF_JSAMPLE], bx - mov WORD [rsi+rax*SIZEOF_JSAMPLE], cx + mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] + mov rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] + mov WORD [rdx+rax*SIZEOF_JSAMPLE], bx + mov WORD [rsi+rax*SIZEOF_JSAMPLE], cx - pop rbx - uncollect_args - pop rbp - ret + pop rbx + uncollect_args + pop rbp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jiss2red.asm b/simd/jiss2red.asm index 238c61d07..886d79d7f 100644 --- a/simd/jiss2red.asm +++ b/simd/jiss2red.asm @@ -26,74 +26,74 @@ ; -------------------------------------------------------------------------- -%define CONST_BITS 13 -%define PASS1_BITS 2 +%define CONST_BITS 13 +%define PASS1_BITS 2 -%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1) -%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1) -%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2) -%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2) +%define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1) +%define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1) +%define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2) +%define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2) %if CONST_BITS == 13 -F_0_211 equ 1730 ; FIX(0.211164243) -F_0_509 equ 4176 ; FIX(0.509795579) -F_0_601 equ 4926 ; FIX(0.601344887) -F_0_720 equ 5906 ; FIX(0.720959822) -F_0_765 equ 6270 ; FIX(0.765366865) -F_0_850 equ 6967 ; FIX(0.850430095) -F_0_899 equ 7373 ; FIX(0.899976223) -F_1_061 equ 8697 ; FIX(1.061594337) -F_1_272 equ 10426 ; FIX(1.272758580) -F_1_451 equ 11893 ; FIX(1.451774981) -F_1_847 equ 15137 ; FIX(1.847759065) -F_2_172 equ 17799 ; FIX(2.172734803) -F_2_562 equ 20995 ; FIX(2.562915447) -F_3_624 equ 29692 ; FIX(3.624509785) +F_0_211 equ 1730 ; FIX(0.211164243) +F_0_509 equ 4176 ; FIX(0.509795579) +F_0_601 equ 4926 ; FIX(0.601344887) +F_0_720 equ 5906 ; FIX(0.720959822) +F_0_765 equ 6270 ; FIX(0.765366865) +F_0_850 equ 6967 ; FIX(0.850430095) +F_0_899 equ 7373 ; FIX(0.899976223) +F_1_061 equ 8697 ; FIX(1.061594337) +F_1_272 equ 10426 ; FIX(1.272758580) +F_1_451 equ 11893 ; FIX(1.451774981) +F_1_847 equ 15137 ; FIX(1.847759065) +F_2_172 equ 17799 ; FIX(2.172734803) +F_2_562 equ 20995 ; FIX(2.562915447) +F_3_624 equ 29692 ; FIX(3.624509785) %else ; NASM cannot do compile-time arithmetic on floating-point constants. 
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) -F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243) -F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579) -F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887) -F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822) -F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) -F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095) -F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) -F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337) -F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580) -F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981) -F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) -F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803) -F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) -F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785) +F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243) +F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579) +F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887) +F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822) +F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) +F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095) +F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) +F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337) +F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580) +F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981) +F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) +F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803) +F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) +F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785) %endif ; -------------------------------------------------------------------------- - SECTION SEG_CONST 
+ SECTION SEG_CONST - alignz 16 - global EXTN(jconst_idct_red_sse2) + alignz 16 + global EXTN(jconst_idct_red_sse2) EXTN(jconst_idct_red_sse2): -PW_F184_MF076 times 4 dw F_1_847,-F_0_765 -PW_F256_F089 times 4 dw F_2_562, F_0_899 -PW_F106_MF217 times 4 dw F_1_061,-F_2_172 -PW_MF060_MF050 times 4 dw -F_0_601,-F_0_509 -PW_F145_MF021 times 4 dw F_1_451,-F_0_211 -PW_F362_MF127 times 4 dw F_3_624,-F_1_272 -PW_F085_MF072 times 4 dw F_0_850,-F_0_720 -PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4-1) -PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4-1) -PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2-1) -PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2-1) -PB_CENTERJSAMP times 16 db CENTERJSAMPLE - - alignz 16 +PW_F184_MF076 times 4 dw F_1_847,-F_0_765 +PW_F256_F089 times 4 dw F_2_562, F_0_899 +PW_F106_MF217 times 4 dw F_1_061,-F_2_172 +PW_MF060_MF050 times 4 dw -F_0_601,-F_0_509 +PW_F145_MF021 times 4 dw F_1_451,-F_0_211 +PW_F362_MF127 times 4 dw F_3_624,-F_1_272 +PW_F085_MF072 times 4 dw F_0_850,-F_0_720 +PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4-1) +PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4-1) +PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2-1) +PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2-1) +PB_CENTERJSAMP times 16 db CENTERJSAMPLE + + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 ; ; Perform dequantization and inverse DCT on one block of coefficients, ; producing a reduced-size 4x4 output block. 
@@ -103,309 +103,309 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; JSAMPARRAY output_buf, JDIMENSION output_col) ; -%define dct_table(b) (b)+8 ; void * dct_table -%define coef_block(b) (b)+12 ; JCOEFPTR coef_block -%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf -%define output_col(b) (b)+20 ; JDIMENSION output_col +%define dct_table(b) (b)+8 ; void * dct_table +%define coef_block(b) (b)+12 ; JCOEFPTR coef_block +%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf +%define output_col(b) (b)+20 ; JDIMENSION output_col -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 2 +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 - align 16 - global EXTN(jsimd_idct_4x4_sse2) + align 16 + global EXTN(jsimd_idct_4x4_sse2) EXTN(jsimd_idct_4x4_sse2): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [wk(0)] - pushpic ebx -; push ecx ; unused -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process columns from input. - -; mov eax, [original_ebp] - mov edx, POINTER [dct_table(eax)] ; quantptr - mov esi, JCOEFPTR [coef_block(eax)] ; inptr + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx +; push ecx ; unused +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input. 
+ +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr %ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2 - mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] - jnz short .columnDCT - - movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] - por xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] - por xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] - por xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] - por xmm0,xmm1 - packsswb xmm0,xmm0 - packsswb xmm0,xmm0 - movd eax,xmm0 - test eax,eax - jnz short .columnDCT - - ; -- AC terms all zero - - movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] - pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - - psllw xmm0,PASS1_BITS - - movdqa xmm3,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) - punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) - punpckhwd xmm3,xmm3 ; xmm3=(04 04 05 05 06 06 07 07) - - pshufd xmm1,xmm0,0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01) - pshufd xmm0,xmm0,0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03) - pshufd xmm6,xmm3,0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05) - pshufd xmm3,xmm3,0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07) - - jmp near .column_end - alignx 16,7 + mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz short .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por xmm0,xmm1 + packsswb xmm0,xmm0 + packsswb xmm0,xmm0 + movd eax,xmm0 + test eax,eax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm0, XMMWORD 
[XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + psllw xmm0,PASS1_BITS + + movdqa xmm3,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) + punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + punpckhwd xmm3,xmm3 ; xmm3=(04 04 05 05 06 06 07 07) + + pshufd xmm1,xmm0,0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01) + pshufd xmm0,xmm0,0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03) + pshufd xmm6,xmm3,0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05) + pshufd xmm3,xmm3,0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07) + + jmp near .column_end + alignx 16,7 %endif .columnDCT: - ; -- Odd part - - movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] - pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] - movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] - pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - - movdqa xmm4,xmm0 - movdqa xmm5,xmm0 - punpcklwd xmm4,xmm1 - punpckhwd xmm5,xmm1 - movdqa xmm0,xmm4 - movdqa xmm1,xmm5 - pmaddwd xmm4,[GOTOFF(ebx,PW_F256_F089)] ; xmm4=(tmp2L) - pmaddwd xmm5,[GOTOFF(ebx,PW_F256_F089)] ; xmm5=(tmp2H) - pmaddwd xmm0,[GOTOFF(ebx,PW_F106_MF217)] ; xmm0=(tmp0L) - pmaddwd xmm1,[GOTOFF(ebx,PW_F106_MF217)] ; xmm1=(tmp0H) - - movdqa xmm6,xmm2 - movdqa xmm7,xmm2 - punpcklwd xmm6,xmm3 - punpckhwd xmm7,xmm3 - movdqa xmm2,xmm6 - movdqa xmm3,xmm7 - pmaddwd xmm6,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2L) - pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm7=(tmp2H) - pmaddwd xmm2,[GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0L) - pmaddwd xmm3,[GOTOFF(ebx,PW_F145_MF021)] ; xmm3=(tmp0H) - - paddd xmm6,xmm4 ; xmm6=tmp2L - paddd xmm7,xmm5 ; xmm7=tmp2H - paddd xmm2,xmm0 ; xmm2=tmp0L - paddd xmm3,xmm1 ; xmm3=tmp0H - - movdqa XMMWORD [wk(0)], 
xmm2 ; wk(0)=tmp0L - movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H - - ; -- Even part - - movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] - movdqa xmm5, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] - movdqa xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] - pmullw xmm4, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - - pxor xmm1,xmm1 - pxor xmm2,xmm2 - punpcklwd xmm1,xmm4 ; xmm1=tmp0L - punpckhwd xmm2,xmm4 ; xmm2=tmp0H - psrad xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1 - psrad xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1 - - movdqa xmm3,xmm5 ; xmm5=in2=z2 - punpcklwd xmm5,xmm0 ; xmm0=in6=z3 - punpckhwd xmm3,xmm0 - pmaddwd xmm5,[GOTOFF(ebx,PW_F184_MF076)] ; xmm5=tmp2L - pmaddwd xmm3,[GOTOFF(ebx,PW_F184_MF076)] ; xmm3=tmp2H - - movdqa xmm4,xmm1 - movdqa xmm0,xmm2 - paddd xmm1,xmm5 ; xmm1=tmp10L - paddd xmm2,xmm3 ; xmm2=tmp10H - psubd xmm4,xmm5 ; xmm4=tmp12L - psubd xmm0,xmm3 ; xmm0=tmp12H - - ; -- Final output stage - - movdqa xmm5,xmm1 - movdqa xmm3,xmm2 - paddd xmm1,xmm6 ; xmm1=data0L - paddd xmm2,xmm7 ; xmm2=data0H - psubd xmm5,xmm6 ; xmm5=data3L - psubd xmm3,xmm7 ; xmm3=data3H - - movdqa xmm6,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm6=[PD_DESCALE_P1_4] - - paddd xmm1,xmm6 - paddd xmm2,xmm6 - psrad xmm1,DESCALE_P1_4 - psrad xmm2,DESCALE_P1_4 - paddd xmm5,xmm6 - paddd xmm3,xmm6 - psrad xmm5,DESCALE_P1_4 - psrad xmm3,DESCALE_P1_4 - - packssdw xmm1,xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07) - packssdw xmm5,xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37) - - movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L - movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H - - movdqa xmm2,xmm4 - movdqa xmm3,xmm0 - paddd xmm4,xmm7 ; xmm4=data1L - paddd xmm0,xmm6 ; xmm0=data1H - psubd xmm2,xmm7 ; xmm2=data2L - psubd xmm3,xmm6 ; xmm3=data2H - - movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm7=[PD_DESCALE_P1_4] - - paddd xmm4,xmm7 - paddd 
xmm0,xmm7 - psrad xmm4,DESCALE_P1_4 - psrad xmm0,DESCALE_P1_4 - paddd xmm2,xmm7 - paddd xmm3,xmm7 - psrad xmm2,DESCALE_P1_4 - psrad xmm3,DESCALE_P1_4 - - packssdw xmm4,xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17) - packssdw xmm2,xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27) - - movdqa xmm6,xmm1 ; transpose coefficients(phase 1) - punpcklwd xmm1,xmm4 ; xmm1=(00 10 01 11 02 12 03 13) - punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17) - movdqa xmm7,xmm2 ; transpose coefficients(phase 1) - punpcklwd xmm2,xmm5 ; xmm2=(20 30 21 31 22 32 23 33) - punpckhwd xmm7,xmm5 ; xmm7=(24 34 25 35 26 36 27 37) - - movdqa xmm0,xmm1 ; transpose coefficients(phase 2) - punpckldq xmm1,xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31) - punpckhdq xmm0,xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33) - movdqa xmm3,xmm6 ; transpose coefficients(phase 2) - punpckldq xmm6,xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35) - punpckhdq xmm3,xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37) + ; -- Odd part + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + movdqa xmm4,xmm0 + movdqa xmm5,xmm0 + punpcklwd xmm4,xmm1 + punpckhwd xmm5,xmm1 + movdqa xmm0,xmm4 + movdqa xmm1,xmm5 + pmaddwd xmm4,[GOTOFF(ebx,PW_F256_F089)] ; xmm4=(tmp2L) + pmaddwd xmm5,[GOTOFF(ebx,PW_F256_F089)] ; xmm5=(tmp2H) + pmaddwd xmm0,[GOTOFF(ebx,PW_F106_MF217)] ; xmm0=(tmp0L) + pmaddwd xmm1,[GOTOFF(ebx,PW_F106_MF217)] ; xmm1=(tmp0H) + + movdqa xmm6,xmm2 + movdqa xmm7,xmm2 + punpcklwd xmm6,xmm3 + punpckhwd xmm7,xmm3 + movdqa xmm2,xmm6 + movdqa xmm3,xmm7 + pmaddwd xmm6,[GOTOFF(ebx,PW_MF060_MF050)] 
; xmm6=(tmp2L) + pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm7=(tmp2H) + pmaddwd xmm2,[GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0L) + pmaddwd xmm3,[GOTOFF(ebx,PW_F145_MF021)] ; xmm3=(tmp0H) + + paddd xmm6,xmm4 ; xmm6=tmp2L + paddd xmm7,xmm5 ; xmm7=tmp2H + paddd xmm2,xmm0 ; xmm2=tmp0L + paddd xmm3,xmm1 ; xmm3=tmp0H + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H + + ; -- Even part + + movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movdqa xmm5, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movdqa xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + pmullw xmm4, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + pxor xmm1,xmm1 + pxor xmm2,xmm2 + punpcklwd xmm1,xmm4 ; xmm1=tmp0L + punpckhwd xmm2,xmm4 ; xmm2=tmp0H + psrad xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1 + psrad xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1 + + movdqa xmm3,xmm5 ; xmm5=in2=z2 + punpcklwd xmm5,xmm0 ; xmm0=in6=z3 + punpckhwd xmm3,xmm0 + pmaddwd xmm5,[GOTOFF(ebx,PW_F184_MF076)] ; xmm5=tmp2L + pmaddwd xmm3,[GOTOFF(ebx,PW_F184_MF076)] ; xmm3=tmp2H + + movdqa xmm4,xmm1 + movdqa xmm0,xmm2 + paddd xmm1,xmm5 ; xmm1=tmp10L + paddd xmm2,xmm3 ; xmm2=tmp10H + psubd xmm4,xmm5 ; xmm4=tmp12L + psubd xmm0,xmm3 ; xmm0=tmp12H + + ; -- Final output stage + + movdqa xmm5,xmm1 + movdqa xmm3,xmm2 + paddd xmm1,xmm6 ; xmm1=data0L + paddd xmm2,xmm7 ; xmm2=data0H + psubd xmm5,xmm6 ; xmm5=data3L + psubd xmm3,xmm7 ; xmm3=data3H + + movdqa xmm6,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm6=[PD_DESCALE_P1_4] + + paddd xmm1,xmm6 + paddd xmm2,xmm6 + psrad xmm1,DESCALE_P1_4 + psrad xmm2,DESCALE_P1_4 + paddd xmm5,xmm6 + paddd xmm3,xmm6 + psrad xmm5,DESCALE_P1_4 + psrad xmm3,DESCALE_P1_4 + + packssdw xmm1,xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07) + packssdw xmm5,xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37) + + movdqa 
xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H + + movdqa xmm2,xmm4 + movdqa xmm3,xmm0 + paddd xmm4,xmm7 ; xmm4=data1L + paddd xmm0,xmm6 ; xmm0=data1H + psubd xmm2,xmm7 ; xmm2=data2L + psubd xmm3,xmm6 ; xmm3=data2H + + movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm7=[PD_DESCALE_P1_4] + + paddd xmm4,xmm7 + paddd xmm0,xmm7 + psrad xmm4,DESCALE_P1_4 + psrad xmm0,DESCALE_P1_4 + paddd xmm2,xmm7 + paddd xmm3,xmm7 + psrad xmm2,DESCALE_P1_4 + psrad xmm3,DESCALE_P1_4 + + packssdw xmm4,xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17) + packssdw xmm2,xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27) + + movdqa xmm6,xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1,xmm4 ; xmm1=(00 10 01 11 02 12 03 13) + punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17) + movdqa xmm7,xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2,xmm5 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm7,xmm5 ; xmm7=(24 34 25 35 26 36 27 37) + + movdqa xmm0,xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1,xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31) + punpckhdq xmm0,xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33) + movdqa xmm3,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35) + punpckhdq xmm3,xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37) .column_end: - ; -- Prefetch the next coefficient block + ; -- Prefetch the next coefficient block - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] - ; ---- Pass 2: process rows, store into output array. + ; ---- Pass 2: process rows, store into output array. 
- mov eax, [original_ebp] - mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) - mov eax, JDIMENSION [output_col(eax)] + mov eax, [original_ebp] + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] - ; -- Even part + ; -- Even part - pxor xmm4,xmm4 - punpcklwd xmm4,xmm1 ; xmm4=tmp0 - psrad xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1 + pxor xmm4,xmm4 + punpcklwd xmm4,xmm1 ; xmm4=tmp0 + psrad xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1 - ; -- Odd part + ; -- Odd part - punpckhwd xmm1,xmm0 - punpckhwd xmm6,xmm3 - movdqa xmm5,xmm1 - movdqa xmm2,xmm6 - pmaddwd xmm1,[GOTOFF(ebx,PW_F256_F089)] ; xmm1=(tmp2) - pmaddwd xmm6,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2) - pmaddwd xmm5,[GOTOFF(ebx,PW_F106_MF217)] ; xmm5=(tmp0) - pmaddwd xmm2,[GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0) + punpckhwd xmm1,xmm0 + punpckhwd xmm6,xmm3 + movdqa xmm5,xmm1 + movdqa xmm2,xmm6 + pmaddwd xmm1,[GOTOFF(ebx,PW_F256_F089)] ; xmm1=(tmp2) + pmaddwd xmm6,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2) + pmaddwd xmm5,[GOTOFF(ebx,PW_F106_MF217)] ; xmm5=(tmp0) + pmaddwd xmm2,[GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0) - paddd xmm6,xmm1 ; xmm6=tmp2 - paddd xmm2,xmm5 ; xmm2=tmp0 + paddd xmm6,xmm1 ; xmm6=tmp2 + paddd xmm2,xmm5 ; xmm2=tmp0 - ; -- Even part + ; -- Even part - punpcklwd xmm0,xmm3 - pmaddwd xmm0,[GOTOFF(ebx,PW_F184_MF076)] ; xmm0=tmp2 + punpcklwd xmm0,xmm3 + pmaddwd xmm0,[GOTOFF(ebx,PW_F184_MF076)] ; xmm0=tmp2 - movdqa xmm7,xmm4 - paddd xmm4,xmm0 ; xmm4=tmp10 - psubd xmm7,xmm0 ; xmm7=tmp12 + movdqa xmm7,xmm4 + paddd xmm4,xmm0 ; xmm4=tmp10 + psubd xmm7,xmm0 ; xmm7=tmp12 - ; -- Final output stage + ; -- Final output stage - movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P2_4)] ; xmm1=[PD_DESCALE_P2_4] + movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P2_4)] ; xmm1=[PD_DESCALE_P2_4] - movdqa xmm5,xmm4 - movdqa xmm3,xmm7 - paddd xmm4,xmm6 ; xmm4=data0=(00 10 20 30) - paddd xmm7,xmm2 ; xmm7=data1=(01 11 21 31) - psubd xmm5,xmm6 ; xmm5=data3=(03 
13 23 33) - psubd xmm3,xmm2 ; xmm3=data2=(02 12 22 32) + movdqa xmm5,xmm4 + movdqa xmm3,xmm7 + paddd xmm4,xmm6 ; xmm4=data0=(00 10 20 30) + paddd xmm7,xmm2 ; xmm7=data1=(01 11 21 31) + psubd xmm5,xmm6 ; xmm5=data3=(03 13 23 33) + psubd xmm3,xmm2 ; xmm3=data2=(02 12 22 32) - paddd xmm4,xmm1 - paddd xmm7,xmm1 - psrad xmm4,DESCALE_P2_4 - psrad xmm7,DESCALE_P2_4 - paddd xmm5,xmm1 - paddd xmm3,xmm1 - psrad xmm5,DESCALE_P2_4 - psrad xmm3,DESCALE_P2_4 + paddd xmm4,xmm1 + paddd xmm7,xmm1 + psrad xmm4,DESCALE_P2_4 + psrad xmm7,DESCALE_P2_4 + paddd xmm5,xmm1 + paddd xmm3,xmm1 + psrad xmm5,DESCALE_P2_4 + psrad xmm3,DESCALE_P2_4 - packssdw xmm4,xmm3 ; xmm4=(00 10 20 30 02 12 22 32) - packssdw xmm7,xmm5 ; xmm7=(01 11 21 31 03 13 23 33) + packssdw xmm4,xmm3 ; xmm4=(00 10 20 30 02 12 22 32) + packssdw xmm7,xmm5 ; xmm7=(01 11 21 31 03 13 23 33) - movdqa xmm0,xmm4 ; transpose coefficients(phase 1) - punpcklwd xmm4,xmm7 ; xmm4=(00 01 10 11 20 21 30 31) - punpckhwd xmm0,xmm7 ; xmm0=(02 03 12 13 22 23 32 33) + movdqa xmm0,xmm4 ; transpose coefficients(phase 1) + punpcklwd xmm4,xmm7 ; xmm4=(00 01 10 11 20 21 30 31) + punpckhwd xmm0,xmm7 ; xmm0=(02 03 12 13 22 23 32 33) - movdqa xmm6,xmm4 ; transpose coefficients(phase 2) - punpckldq xmm4,xmm0 ; xmm4=(00 01 02 03 10 11 12 13) - punpckhdq xmm6,xmm0 ; xmm6=(20 21 22 23 30 31 32 33) + movdqa xmm6,xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4,xmm0 ; xmm4=(00 01 02 03 10 11 12 13) + punpckhdq xmm6,xmm0 ; xmm6=(20 21 22 23 30 31 32 33) - packsswb xmm4,xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..) - paddb xmm4,[GOTOFF(ebx,PB_CENTERJSAMP)] + packsswb xmm4,xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..) + paddb xmm4,[GOTOFF(ebx,PB_CENTERJSAMP)] - pshufd xmm2,xmm4,0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..) - pshufd xmm1,xmm4,0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..) - pshufd xmm3,xmm4,0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..) + pshufd xmm2,xmm4,0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..) 
+ pshufd xmm1,xmm4,0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..) + pshufd xmm3,xmm4,0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..) - mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] - movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 - movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2 - mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] - movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1 - movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 + movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2 + mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1 + movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; unused - poppic ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; unused + poppic ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret ; -------------------------------------------------------------------------- @@ -418,177 +418,177 @@ EXTN(jsimd_idct_4x4_sse2): ; JSAMPARRAY output_buf, JDIMENSION output_col) ; -%define dct_table(b) (b)+8 ; void * dct_table -%define coef_block(b) (b)+12 ; JCOEFPTR coef_block -%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf -%define output_col(b) (b)+20 ; JDIMENSION output_col +%define dct_table(b) (b)+8 ; void * dct_table +%define coef_block(b) (b)+12 ; JCOEFPTR coef_block +%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf +%define output_col(b) (b)+20 ; JDIMENSION output_col - align 16 - global EXTN(jsimd_idct_2x2_sse2) + align 16 + global EXTN(jsimd_idct_2x2_sse2) EXTN(jsimd_idct_2x2_sse2): - push ebp - mov ebp,esp - push ebx -; push ecx ; need not be preserved -; push edx 
; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process columns from input. - - mov edx, POINTER [dct_table(ebp)] ; quantptr - mov esi, JCOEFPTR [coef_block(ebp)] ; inptr - - ; | input: | result: | - ; | 00 01 ** 03 ** 05 ** 07 | | - ; | 10 11 ** 13 ** 15 ** 17 | | - ; | ** ** ** ** ** ** ** ** | | - ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 | - ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 | - ; | 50 51 ** 53 ** 55 ** 57 | | - ; | ** ** ** ** ** ** ** ** | | - ; | 70 71 ** 73 ** 75 ** 77 | | - - ; -- Odd part + push ebp + mov ebp,esp + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input. + + mov edx, POINTER [dct_table(ebp)] ; quantptr + mov esi, JCOEFPTR [coef_block(ebp)] ; inptr + + ; | input: | result: | + ; | 00 01 ** 03 ** 05 ** 07 | | + ; | 10 11 ** 13 ** 15 ** 17 | | + ; | ** ** ** ** ** ** ** ** | | + ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 | + ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 | + ; | 50 51 ** 53 ** 55 ** 57 | | + ; | ** ** ** ** ** ** ** ** | | + ; | 70 71 ** 73 ** 75 ** 77 | | + + ; -- Odd part - movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] - pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] - movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] - pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm2, XMMWORD 
[XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37) - ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77) + ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37) + ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77) - pcmpeqd xmm7,xmm7 - pslld xmm7,WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..} + pcmpeqd xmm7,xmm7 + pslld xmm7,WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..} - movdqa xmm4,xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17) - movdqa xmm5,xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57) - punpcklwd xmm4,xmm1 ; xmm4=(10 30 11 31 ** ** 13 33) - punpcklwd xmm5,xmm3 ; xmm5=(50 70 51 71 ** ** 53 73) - pmaddwd xmm4,[GOTOFF(ebx,PW_F362_MF127)] - pmaddwd xmm5,[GOTOFF(ebx,PW_F085_MF072)] + movdqa xmm4,xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17) + movdqa xmm5,xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57) + punpcklwd xmm4,xmm1 ; xmm4=(10 30 11 31 ** ** 13 33) + punpcklwd xmm5,xmm3 ; xmm5=(50 70 51 71 ** ** 53 73) + pmaddwd xmm4,[GOTOFF(ebx,PW_F362_MF127)] + pmaddwd xmm5,[GOTOFF(ebx,PW_F085_MF072)] - psrld xmm0,WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --) - pand xmm1,xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37) - psrld xmm2,WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --) - pand xmm3,xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77) - por xmm0,xmm1 ; xmm0=(11 31 13 33 15 35 17 37) - por xmm2,xmm3 ; xmm2=(51 71 53 73 55 75 57 77) - pmaddwd xmm0,[GOTOFF(ebx,PW_F362_MF127)] - pmaddwd xmm2,[GOTOFF(ebx,PW_F085_MF072)] + psrld xmm0,WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --) + pand xmm1,xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37) + psrld xmm2,WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --) + pand xmm3,xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77) + por xmm0,xmm1 ; xmm0=(11 31 13 33 15 35 17 37) + por xmm2,xmm3 ; xmm2=(51 71 53 73 55 75 57 77) + pmaddwd xmm0,[GOTOFF(ebx,PW_F362_MF127)] + 
pmaddwd xmm2,[GOTOFF(ebx,PW_F085_MF072)] - paddd xmm4,xmm5 ; xmm4=tmp0[col0 col1 **** col3] - paddd xmm0,xmm2 ; xmm0=tmp0[col1 col3 col5 col7] + paddd xmm4,xmm5 ; xmm4=tmp0[col0 col1 **** col3] + paddd xmm0,xmm2 ; xmm0=tmp0[col1 col3 col5 col7] - ; -- Even part + ; -- Even part - movdqa xmm6, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] - pmullw xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm6, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] - ; xmm6=(00 01 ** 03 ** 05 ** 07) + ; xmm6=(00 01 ** 03 ** 05 ** 07) - movdqa xmm1,xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07) - pslld xmm6,WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **) - pand xmm1,xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07) - psrad xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****] - psrad xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7] + movdqa xmm1,xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07) + pslld xmm6,WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **) + pand xmm1,xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07) + psrad xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****] + psrad xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7] - ; -- Final output stage + ; -- Final output stage - movdqa xmm3,xmm6 - movdqa xmm5,xmm1 - paddd xmm6,xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **) - paddd xmm1,xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7) - psubd xmm3,xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **) - psubd xmm5,xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7) + movdqa xmm3,xmm6 + movdqa xmm5,xmm1 + paddd xmm6,xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **) + paddd xmm1,xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7) + psubd xmm3,xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **) + psubd xmm5,xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7) - movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P1_2)] ; xmm2=[PD_DESCALE_P1_2] + movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P1_2)] ; xmm2=[PD_DESCALE_P1_2] - punpckldq 
xmm6,xmm3 ; xmm6=(A0 B0 ** **) + punpckldq xmm6,xmm3 ; xmm6=(A0 B0 ** **) - movdqa xmm7,xmm1 - punpcklqdq xmm1,xmm5 ; xmm1=(A1 A3 B1 B3) - punpckhqdq xmm7,xmm5 ; xmm7=(A5 A7 B5 B7) + movdqa xmm7,xmm1 + punpcklqdq xmm1,xmm5 ; xmm1=(A1 A3 B1 B3) + punpckhqdq xmm7,xmm5 ; xmm7=(A5 A7 B5 B7) - paddd xmm6,xmm2 - psrad xmm6,DESCALE_P1_2 + paddd xmm6,xmm2 + psrad xmm6,DESCALE_P1_2 - paddd xmm1,xmm2 - paddd xmm7,xmm2 - psrad xmm1,DESCALE_P1_2 - psrad xmm7,DESCALE_P1_2 + paddd xmm1,xmm2 + paddd xmm7,xmm2 + psrad xmm1,DESCALE_P1_2 + psrad xmm7,DESCALE_P1_2 - ; -- Prefetch the next coefficient block + ; -- Prefetch the next coefficient block - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] - prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] - ; ---- Pass 2: process rows, store into output array. + ; ---- Pass 2: process rows, store into output array. 
- mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *) - mov eax, JDIMENSION [output_col(ebp)] + mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(ebp)] - ; | input:| result:| - ; | A0 B0 | | - ; | A1 B1 | C0 C1 | - ; | A3 B3 | D0 D1 | - ; | A5 B5 | | - ; | A7 B7 | | + ; | input:| result:| + ; | A0 B0 | | + ; | A1 B1 | C0 C1 | + ; | A3 B3 | D0 D1 | + ; | A5 B5 | | + ; | A7 B7 | | - ; -- Odd part + ; -- Odd part - packssdw xmm1,xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3) - packssdw xmm7,xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7) - pmaddwd xmm1,[GOTOFF(ebx,PW_F362_MF127)] - pmaddwd xmm7,[GOTOFF(ebx,PW_F085_MF072)] + packssdw xmm1,xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3) + packssdw xmm7,xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7) + pmaddwd xmm1,[GOTOFF(ebx,PW_F362_MF127)] + pmaddwd xmm7,[GOTOFF(ebx,PW_F085_MF072)] - paddd xmm1,xmm7 ; xmm1=tmp0[row0 row1 row0 row1] + paddd xmm1,xmm7 ; xmm1=tmp0[row0 row1 row0 row1] - ; -- Even part + ; -- Even part - pslld xmm6,(CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****] + pslld xmm6,(CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****] - ; -- Final output stage + ; -- Final output stage - movdqa xmm4,xmm6 - paddd xmm6,xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **) - psubd xmm4,xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **) + movdqa xmm4,xmm6 + paddd xmm6,xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **) + psubd xmm4,xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **) - punpckldq xmm6,xmm4 ; xmm6=(C0 D0 C1 D1) + punpckldq xmm6,xmm4 ; xmm6=(C0 D0 C1 D1) - paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P2_2)] - psrad xmm6,DESCALE_P2_2 + paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P2_2)] + psrad xmm6,DESCALE_P2_2 - packssdw xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1) - packsswb xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..) - paddb xmm6,[GOTOFF(ebx,PB_CENTERJSAMP)] + packssdw xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1) + packsswb xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..) 
+ paddb xmm6,[GOTOFF(ebx,PB_CENTERJSAMP)] - pextrw ebx,xmm6,0x00 ; ebx=(C0 D0 -- --) - pextrw ecx,xmm6,0x01 ; ecx=(C1 D1 -- --) + pextrw ebx,xmm6,0x00 ; ebx=(C0 D0 -- --) + pextrw ecx,xmm6,0x01 ; ecx=(C1 D1 -- --) - mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] - mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] - mov WORD [edx+eax*SIZEOF_JSAMPLE], bx - mov WORD [esi+eax*SIZEOF_JSAMPLE], cx + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + mov WORD [edx+eax*SIZEOF_JSAMPLE], bx + mov WORD [esi+eax*SIZEOF_JSAMPLE], cx - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - pop ebp - ret + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/jisseflt.asm b/simd/jisseflt.asm index d6147c12d..8b8135552 100644 --- a/simd/jisseflt.asm +++ b/simd/jisseflt.asm @@ -25,34 +25,34 @@ ; -------------------------------------------------------------------------- -%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) - shufps %1,%2,0x44 +%macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) + shufps %1,%2,0x44 %endmacro -%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) - shufps %1,%2,0xEE +%macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) + shufps %1,%2,0xEE %endmacro ; -------------------------------------------------------------------------- - SECTION SEG_CONST + SECTION SEG_CONST - alignz 16 - global EXTN(jconst_idct_float_sse) + alignz 16 + global EXTN(jconst_idct_float_sse) EXTN(jconst_idct_float_sse): -PD_1_414 times 4 dd 1.414213562373095048801689 -PD_1_847 times 4 dd 1.847759065022573512256366 -PD_1_082 times 4 dd 1.082392200292393968799446 -PD_M2_613 times 4 dd -2.613125929752753055713286 -PD_0_125 times 4 dd 0.125 ; 1/8 -PB_CENTERJSAMP times 8 db 
CENTERJSAMPLE +PD_1_414 times 4 dd 1.414213562373095048801689 +PD_1_847 times 4 dd 1.847759065022573512256366 +PD_1_082 times 4 dd 1.082392200292393968799446 +PD_M2_613 times 4 dd -2.613125929752753055713286 +PD_0_125 times 4 dd 0.125 ; 1/8 +PB_CENTERJSAMP times 8 db CENTERJSAMPLE - alignz 16 + alignz 16 ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 ; ; Perform dequantization and inverse DCT on one block of coefficients. ; @@ -61,512 +61,512 @@ PB_CENTERJSAMP times 8 db CENTERJSAMPLE ; JSAMPARRAY output_buf, JDIMENSION output_col) ; -%define dct_table(b) (b)+8 ; void * dct_table -%define coef_block(b) (b)+12 ; JCOEFPTR coef_block -%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf -%define output_col(b) (b)+20 ; JDIMENSION output_col +%define dct_table(b) (b)+8 ; void * dct_table +%define coef_block(b) (b)+12 ; JCOEFPTR coef_block +%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf +%define output_col(b) (b)+20 ; JDIMENSION output_col -%define original_ebp ebp+0 -%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] -%define WK_NUM 2 -%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT - ; FAST_FLOAT workspace[DCTSIZE2] +%define original_ebp ebp+0 +%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] +%define WK_NUM 2 +%define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT + ; FAST_FLOAT workspace[DCTSIZE2] - align 16 - global EXTN(jsimd_idct_float_sse) + align 16 + global EXTN(jsimd_idct_float_sse) EXTN(jsimd_idct_float_sse): - push ebp - mov eax,esp ; eax = original ebp - sub esp, byte 4 - and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits - mov [esp],eax - mov ebp,esp ; ebp = aligned ebp - lea esp, [workspace] - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - - ; ---- Pass 1: process columns from input, store into work array. 
- -; mov eax, [original_ebp] - mov edx, POINTER [dct_table(eax)] ; quantptr - mov esi, JCOEFPTR [coef_block(eax)] ; inptr - lea edi, [workspace] ; FAST_FLOAT * wsptr - mov ecx, DCTSIZE/4 ; ctr - alignx 16,7 + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [workspace] + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input, store into work array. + +; mov eax, [original_ebp] + mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + lea edi, [workspace] ; FAST_FLOAT * wsptr + mov ecx, DCTSIZE/4 ; ctr + alignx 16,7 .columnloop: %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE - mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] - or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] - jnz near .columnDCT - - movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] - por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] - por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] - por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] - por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] - por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] - por mm1,mm0 - packsswb mm1,mm1 - movd eax,mm1 - test eax,eax - jnz short .columnDCT - - ; -- AC terms all zero - - movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] - - punpckhwd mm1,mm0 ; mm1=(** 02 ** 03) - punpcklwd mm0,mm0 ; mm0=(00 00 01 01) - psrad mm1,(DWORD_BIT-WORD_BIT) ; mm1=in0H=(02 03) - psrad mm0,(DWORD_BIT-WORD_BIT) ; mm0=in0L=(00 01) - cvtpi2ps xmm3,mm1 ; xmm3=(02 03 ** **) - cvtpi2ps xmm0,mm0 ; xmm0=(00 01 ** **) - movlhps xmm0,xmm3 ; xmm0=in0=(00 01 02 03) - - mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - - movaps xmm1,xmm0 - movaps xmm2,xmm0 - movaps xmm3,xmm0 - - shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00) - shufps 
xmm1,xmm1,0x55 ; xmm1=(01 01 01 01) - shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02) - shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03) - - movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0 - movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0 - movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1 - movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1 - movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2 - movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2 - movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3 - movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 - jmp near .nextcolumn - alignx 16,7 + mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por mm1,mm0 + packsswb mm1,mm1 + movd eax,mm1 + test eax,eax + jnz short .columnDCT + + ; -- AC terms all zero + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + + punpckhwd mm1,mm0 ; mm1=(** 02 ** 03) + punpcklwd mm0,mm0 ; mm0=(00 00 01 01) + psrad mm1,(DWORD_BIT-WORD_BIT) ; mm1=in0H=(02 03) + psrad mm0,(DWORD_BIT-WORD_BIT) ; mm0=in0L=(00 01) + cvtpi2ps xmm3,mm1 ; xmm3=(02 03 ** **) + cvtpi2ps xmm0,mm0 ; xmm0=(00 01 ** **) + movlhps xmm0,xmm3 ; xmm0=in0=(00 01 02 03) + + mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm1,xmm0 + movaps xmm2,xmm0 + movaps xmm3,xmm0 + + shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00) + shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01) + shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02) + shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 03) + + movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD 
[XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 + jmp near .nextcolumn + alignx 16,7 %endif .columnDCT: - ; -- Even part - - movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] - movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] - movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] - movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] - - punpckhwd mm4,mm0 ; mm4=(** 02 ** 03) - punpcklwd mm0,mm0 ; mm0=(00 00 01 01) - punpckhwd mm5,mm1 ; mm5=(** 22 ** 23) - punpcklwd mm1,mm1 ; mm1=(20 20 21 21) - - psrad mm4,(DWORD_BIT-WORD_BIT) ; mm4=in0H=(02 03) - psrad mm0,(DWORD_BIT-WORD_BIT) ; mm0=in0L=(00 01) - cvtpi2ps xmm4,mm4 ; xmm4=(02 03 ** **) - cvtpi2ps xmm0,mm0 ; xmm0=(00 01 ** **) - psrad mm5,(DWORD_BIT-WORD_BIT) ; mm5=in2H=(22 23) - psrad mm1,(DWORD_BIT-WORD_BIT) ; mm1=in2L=(20 21) - cvtpi2ps xmm5,mm5 ; xmm5=(22 23 ** **) - cvtpi2ps xmm1,mm1 ; xmm1=(20 21 ** **) - - punpckhwd mm6,mm2 ; mm6=(** 42 ** 43) - punpcklwd mm2,mm2 ; mm2=(40 40 41 41) - punpckhwd mm7,mm3 ; mm7=(** 62 ** 63) - punpcklwd mm3,mm3 ; mm3=(60 60 61 61) - - psrad mm6,(DWORD_BIT-WORD_BIT) ; mm6=in4H=(42 43) - psrad mm2,(DWORD_BIT-WORD_BIT) ; mm2=in4L=(40 41) - cvtpi2ps xmm6,mm6 ; xmm6=(42 43 ** **) - cvtpi2ps xmm2,mm2 ; xmm2=(40 41 ** **) - psrad mm7,(DWORD_BIT-WORD_BIT) ; mm7=in6H=(62 63) - psrad mm3,(DWORD_BIT-WORD_BIT) ; mm3=in6L=(60 61) - cvtpi2ps xmm7,mm7 ; xmm7=(62 63 ** **) - cvtpi2ps xmm3,mm3 ; xmm3=(60 61 ** **) - - movlhps xmm0,xmm4 ; xmm0=in0=(00 01 02 03) - movlhps xmm1,xmm5 ; xmm1=in2=(20 21 22 23) - mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - - movlhps 
xmm2,xmm6 ; xmm2=in4=(40 41 42 43) - movlhps xmm3,xmm7 ; xmm3=in6=(60 61 62 63) - mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - - movaps xmm4,xmm0 - movaps xmm5,xmm1 - subps xmm0,xmm2 ; xmm0=tmp11 - subps xmm1,xmm3 - addps xmm4,xmm2 ; xmm4=tmp10 - addps xmm5,xmm3 ; xmm5=tmp13 - - mulps xmm1,[GOTOFF(ebx,PD_1_414)] - subps xmm1,xmm5 ; xmm1=tmp12 - - movaps xmm6,xmm4 - movaps xmm7,xmm0 - subps xmm4,xmm5 ; xmm4=tmp3 - subps xmm0,xmm1 ; xmm0=tmp2 - addps xmm6,xmm5 ; xmm6=tmp0 - addps xmm7,xmm1 ; xmm7=tmp1 - - movaps XMMWORD [wk(1)], xmm4 ; tmp3 - movaps XMMWORD [wk(0)], xmm0 ; tmp2 - - ; -- Odd part - - movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movq mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] - movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] - movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] - - punpckhwd mm6,mm4 ; mm6=(** 12 ** 13) - punpcklwd mm4,mm4 ; mm4=(10 10 11 11) - punpckhwd mm2,mm0 ; mm2=(** 32 ** 33) - punpcklwd mm0,mm0 ; mm0=(30 30 31 31) - - psrad mm6,(DWORD_BIT-WORD_BIT) ; mm6=in1H=(12 13) - psrad mm4,(DWORD_BIT-WORD_BIT) ; mm4=in1L=(10 11) - cvtpi2ps xmm4,mm6 ; xmm4=(12 13 ** **) - cvtpi2ps xmm2,mm4 ; xmm2=(10 11 ** **) - psrad mm2,(DWORD_BIT-WORD_BIT) ; mm2=in3H=(32 33) - psrad mm0,(DWORD_BIT-WORD_BIT) ; mm0=in3L=(30 31) - cvtpi2ps xmm0,mm2 ; xmm0=(32 33 ** **) - cvtpi2ps xmm3,mm0 ; xmm3=(30 31 ** **) - - punpckhwd mm7,mm5 ; mm7=(** 52 ** 53) - punpcklwd mm5,mm5 ; mm5=(50 50 51 51) - punpckhwd mm3,mm1 ; mm3=(** 72 ** 73) - punpcklwd mm1,mm1 ; mm1=(70 70 71 71) - - movlhps xmm2,xmm4 ; xmm2=in1=(10 11 12 13) - movlhps xmm3,xmm0 ; xmm3=in3=(30 31 32 33) - - psrad mm7,(DWORD_BIT-WORD_BIT) ; mm7=in5H=(52 53) - psrad mm5,(DWORD_BIT-WORD_BIT) ; mm5=in5L=(50 51) - cvtpi2ps xmm4,mm7 ; xmm4=(52 53 ** **) - cvtpi2ps xmm5,mm5 ; xmm5=(50 51 ** **) - psrad mm3,(DWORD_BIT-WORD_BIT) ; mm3=in7H=(72 73) - psrad mm1,(DWORD_BIT-WORD_BIT) ; mm1=in7L=(70 71) - cvtpi2ps xmm0,mm3 ; 
xmm0=(72 73 ** **) - cvtpi2ps xmm1,mm1 ; xmm1=(70 71 ** **) - - mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - - movlhps xmm5,xmm4 ; xmm5=in5=(50 51 52 53) - movlhps xmm1,xmm0 ; xmm1=in7=(70 71 72 73) - mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)] - - movaps xmm4,xmm2 - movaps xmm0,xmm5 - addps xmm2,xmm1 ; xmm2=z11 - addps xmm5,xmm3 ; xmm5=z13 - subps xmm4,xmm1 ; xmm4=z12 - subps xmm0,xmm3 ; xmm0=z10 - - movaps xmm1,xmm2 - subps xmm2,xmm5 - addps xmm1,xmm5 ; xmm1=tmp7 - - mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 - - movaps xmm3,xmm0 - addps xmm0,xmm4 - mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5 - mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) - mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) - addps xmm3,xmm0 ; xmm3=tmp12 - subps xmm4,xmm0 ; xmm4=tmp10 - - ; -- Final output stage - - subps xmm3,xmm1 ; xmm3=tmp6 - movaps xmm5,xmm6 - movaps xmm0,xmm7 - addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03) - addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13) - subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73) - subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63) - subps xmm2,xmm3 ; xmm2=tmp5 - - movaps xmm1,xmm6 ; transpose coefficients(phase 1) - unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11) - unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13) - movaps xmm3,xmm0 ; transpose coefficients(phase 1) - unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71) - unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73) - - movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 - movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3 - - movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71) - movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73) - - addps xmm4,xmm2 ; xmm4=tmp4 - movaps xmm0,xmm7 - movaps xmm3,xmm5 - addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23) - addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43) - subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53) - subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33) - - 
movaps xmm2,xmm7 ; transpose coefficients(phase 1) - unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31) - unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33) - movaps xmm4,xmm5 ; transpose coefficients(phase 1) - unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51) - unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53) - - movaps xmm3,xmm6 ; transpose coefficients(phase 2) - unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30) - unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31) - movaps xmm0,xmm1 ; transpose coefficients(phase 2) - unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32) - unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33) - - movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) - movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) - - movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6 - movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3 - movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1 - movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0 - - movaps xmm6,xmm5 ; transpose coefficients(phase 2) - unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70) - unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71) - movaps xmm3,xmm4 ; transpose coefficients(phase 2) - unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72) - unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73) - - movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5 - movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 - movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4 - movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 + ; -- Even part + + movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + + punpckhwd mm4,mm0 ; mm4=(** 02 ** 03) + punpcklwd mm0,mm0 ; mm0=(00 00 01 01) + punpckhwd mm5,mm1 ; mm5=(** 22 ** 23) + punpcklwd mm1,mm1 ; mm1=(20 20 21 21) + + psrad mm4,(DWORD_BIT-WORD_BIT) ; mm4=in0H=(02 03) + psrad mm0,(DWORD_BIT-WORD_BIT) ; mm0=in0L=(00 01) + cvtpi2ps xmm4,mm4 ; xmm4=(02 03 ** **) + cvtpi2ps xmm0,mm0 ; xmm0=(00 01 ** 
**) + psrad mm5,(DWORD_BIT-WORD_BIT) ; mm5=in2H=(22 23) + psrad mm1,(DWORD_BIT-WORD_BIT) ; mm1=in2L=(20 21) + cvtpi2ps xmm5,mm5 ; xmm5=(22 23 ** **) + cvtpi2ps xmm1,mm1 ; xmm1=(20 21 ** **) + + punpckhwd mm6,mm2 ; mm6=(** 42 ** 43) + punpcklwd mm2,mm2 ; mm2=(40 40 41 41) + punpckhwd mm7,mm3 ; mm7=(** 62 ** 63) + punpcklwd mm3,mm3 ; mm3=(60 60 61 61) + + psrad mm6,(DWORD_BIT-WORD_BIT) ; mm6=in4H=(42 43) + psrad mm2,(DWORD_BIT-WORD_BIT) ; mm2=in4L=(40 41) + cvtpi2ps xmm6,mm6 ; xmm6=(42 43 ** **) + cvtpi2ps xmm2,mm2 ; xmm2=(40 41 ** **) + psrad mm7,(DWORD_BIT-WORD_BIT) ; mm7=in6H=(62 63) + psrad mm3,(DWORD_BIT-WORD_BIT) ; mm3=in6L=(60 61) + cvtpi2ps xmm7,mm7 ; xmm7=(62 63 ** **) + cvtpi2ps xmm3,mm3 ; xmm3=(60 61 ** **) + + movlhps xmm0,xmm4 ; xmm0=in0=(00 01 02 03) + movlhps xmm1,xmm5 ; xmm1=in2=(20 21 22 23) + mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movlhps xmm2,xmm6 ; xmm2=in4=(40 41 42 43) + movlhps xmm3,xmm7 ; xmm3=in6=(60 61 62 63) + mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm4,xmm0 + movaps xmm5,xmm1 + subps xmm0,xmm2 ; xmm0=tmp11 + subps xmm1,xmm3 + addps xmm4,xmm2 ; xmm4=tmp10 + addps xmm5,xmm3 ; xmm5=tmp13 + + mulps xmm1,[GOTOFF(ebx,PD_1_414)] + subps xmm1,xmm5 ; xmm1=tmp12 + + movaps xmm6,xmm4 + movaps xmm7,xmm0 + subps xmm4,xmm5 ; xmm4=tmp3 + subps xmm0,xmm1 ; xmm0=tmp2 + addps xmm6,xmm5 ; xmm6=tmp0 + addps xmm7,xmm1 ; xmm7=tmp1 + + movaps XMMWORD [wk(1)], xmm4 ; tmp3 + movaps XMMWORD [wk(0)], xmm0 ; tmp2 + + ; -- Odd part + + movq mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movq mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] + movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + + punpckhwd mm6,mm4 ; mm6=(** 12 ** 13) + punpcklwd mm4,mm4 ; mm4=(10 10 11 11) + punpckhwd mm2,mm0 ; mm2=(** 32 ** 33) + punpcklwd mm0,mm0 ; 
mm0=(30 30 31 31) + + psrad mm6,(DWORD_BIT-WORD_BIT) ; mm6=in1H=(12 13) + psrad mm4,(DWORD_BIT-WORD_BIT) ; mm4=in1L=(10 11) + cvtpi2ps xmm4,mm6 ; xmm4=(12 13 ** **) + cvtpi2ps xmm2,mm4 ; xmm2=(10 11 ** **) + psrad mm2,(DWORD_BIT-WORD_BIT) ; mm2=in3H=(32 33) + psrad mm0,(DWORD_BIT-WORD_BIT) ; mm0=in3L=(30 31) + cvtpi2ps xmm0,mm2 ; xmm0=(32 33 ** **) + cvtpi2ps xmm3,mm0 ; xmm3=(30 31 ** **) + + punpckhwd mm7,mm5 ; mm7=(** 52 ** 53) + punpcklwd mm5,mm5 ; mm5=(50 50 51 51) + punpckhwd mm3,mm1 ; mm3=(** 72 ** 73) + punpcklwd mm1,mm1 ; mm1=(70 70 71 71) + + movlhps xmm2,xmm4 ; xmm2=in1=(10 11 12 13) + movlhps xmm3,xmm0 ; xmm3=in3=(30 31 32 33) + + psrad mm7,(DWORD_BIT-WORD_BIT) ; mm7=in5H=(52 53) + psrad mm5,(DWORD_BIT-WORD_BIT) ; mm5=in5L=(50 51) + cvtpi2ps xmm4,mm7 ; xmm4=(52 53 ** **) + cvtpi2ps xmm5,mm5 ; xmm5=(50 51 ** **) + psrad mm3,(DWORD_BIT-WORD_BIT) ; mm3=in7H=(72 73) + psrad mm1,(DWORD_BIT-WORD_BIT) ; mm1=in7L=(70 71) + cvtpi2ps xmm0,mm3 ; xmm0=(72 73 ** **) + cvtpi2ps xmm1,mm1 ; xmm1=(70 71 ** **) + + mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movlhps xmm5,xmm4 ; xmm5=in5=(50 51 52 53) + movlhps xmm1,xmm0 ; xmm1=in7=(70 71 72 73) + mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm4,xmm2 + movaps xmm0,xmm5 + addps xmm2,xmm1 ; xmm2=z11 + addps xmm5,xmm3 ; xmm5=z13 + subps xmm4,xmm1 ; xmm4=z12 + subps xmm0,xmm3 ; xmm0=z10 + + movaps xmm1,xmm2 + subps xmm2,xmm5 + addps xmm1,xmm5 ; xmm1=tmp7 + + mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 + + movaps xmm3,xmm0 + addps xmm0,xmm4 + mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5 + mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) + mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) + addps xmm3,xmm0 ; xmm3=tmp12 + subps xmm4,xmm0 ; xmm4=tmp10 + + ; -- Final output stage + + subps xmm3,xmm1 ; xmm3=tmp6 + movaps 
xmm5,xmm6 + movaps xmm0,xmm7 + addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03) + addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13) + subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73) + subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63) + subps xmm2,xmm3 ; xmm2=tmp5 + + movaps xmm1,xmm6 ; transpose coefficients(phase 1) + unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11) + unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13) + movaps xmm3,xmm0 ; transpose coefficients(phase 1) + unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71) + unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73) + + movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 + movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3 + + movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71) + movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73) + + addps xmm4,xmm2 ; xmm4=tmp4 + movaps xmm0,xmm7 + movaps xmm3,xmm5 + addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23) + addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43) + subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53) + subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33) + + movaps xmm2,xmm7 ; transpose coefficients(phase 1) + unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31) + unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33) + movaps xmm4,xmm5 ; transpose coefficients(phase 1) + unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51) + unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53) + + movaps xmm3,xmm6 ; transpose coefficients(phase 2) + unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30) + unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31) + movaps xmm0,xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32) + unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33) + + movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) + movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) + + movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6 + movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0 + + movaps xmm6,xmm5 ; transpose coefficients(phase 2) + unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70) + unpckhps2 xmm6,xmm7 ; xmm6=(41 
51 61 71) + movaps xmm3,xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72) + unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73) + + movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 + movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4 + movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 .nextcolumn: - add esi, byte 4*SIZEOF_JCOEF ; coef_block - add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr - add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr - dec ecx ; ctr - jnz near .columnloop - - ; -- Prefetch the next coefficient block - - prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] - prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] - prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] - prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] - - ; ---- Pass 2: process rows from work array, store into output array. - - mov eax, [original_ebp] - lea esi, [workspace] ; FAST_FLOAT * wsptr - mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) - mov eax, JDIMENSION [output_col(eax)] - mov ecx, DCTSIZE/4 ; ctr - alignx 16,7 + add esi, byte 4*SIZEOF_JCOEF ; coef_block + add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr + add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr + dec ecx ; ctr + jnz near .columnloop + + ; -- Prefetch the next coefficient block + + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. 
+ + mov eax, [original_ebp] + lea esi, [workspace] ; FAST_FLOAT * wsptr + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + mov ecx, DCTSIZE/4 ; ctr + alignx 16,7 .rowloop: - ; -- Even part - - movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)] - - movaps xmm4,xmm0 - movaps xmm5,xmm1 - subps xmm0,xmm2 ; xmm0=tmp11 - subps xmm1,xmm3 - addps xmm4,xmm2 ; xmm4=tmp10 - addps xmm5,xmm3 ; xmm5=tmp13 - - mulps xmm1,[GOTOFF(ebx,PD_1_414)] - subps xmm1,xmm5 ; xmm1=tmp12 - - movaps xmm6,xmm4 - movaps xmm7,xmm0 - subps xmm4,xmm5 ; xmm4=tmp3 - subps xmm0,xmm1 ; xmm0=tmp2 - addps xmm6,xmm5 ; xmm6=tmp0 - addps xmm7,xmm1 ; xmm7=tmp1 - - movaps XMMWORD [wk(1)], xmm4 ; tmp3 - movaps XMMWORD [wk(0)], xmm0 ; tmp2 - - ; -- Odd part - - movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)] - movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)] - - movaps xmm4,xmm2 - movaps xmm0,xmm5 - addps xmm2,xmm1 ; xmm2=z11 - addps xmm5,xmm3 ; xmm5=z13 - subps xmm4,xmm1 ; xmm4=z12 - subps xmm0,xmm3 ; xmm0=z10 - - movaps xmm1,xmm2 - subps xmm2,xmm5 - addps xmm1,xmm5 ; xmm1=tmp7 - - mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 - - movaps xmm3,xmm0 - addps xmm0,xmm4 - mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5 - mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) - mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) - addps xmm3,xmm0 ; xmm3=tmp12 - subps xmm4,xmm0 ; xmm4=tmp10 - - ; -- Final output stage - - subps xmm3,xmm1 ; xmm3=tmp6 - movaps xmm5,xmm6 - movaps xmm0,xmm7 - addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30) - addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31) - subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37) - subps 
xmm0,xmm3 ; xmm0=data6=(06 16 26 36) - subps xmm2,xmm3 ; xmm2=tmp5 - - movaps xmm1,[GOTOFF(ebx,PD_0_125)] ; xmm1=[PD_0_125] - - mulps xmm6,xmm1 ; descale(1/8) - mulps xmm7,xmm1 ; descale(1/8) - mulps xmm5,xmm1 ; descale(1/8) - mulps xmm0,xmm1 ; descale(1/8) - - movhlps xmm3,xmm6 - movhlps xmm1,xmm7 - cvtps2pi mm0,xmm6 ; round to int32, mm0=data0L=(00 10) - cvtps2pi mm1,xmm7 ; round to int32, mm1=data1L=(01 11) - cvtps2pi mm2,xmm3 ; round to int32, mm2=data0H=(20 30) - cvtps2pi mm3,xmm1 ; round to int32, mm3=data1H=(21 31) - packssdw mm0,mm2 ; mm0=data0=(00 10 20 30) - packssdw mm1,mm3 ; mm1=data1=(01 11 21 31) - - movhlps xmm6,xmm5 - movhlps xmm7,xmm0 - cvtps2pi mm4,xmm5 ; round to int32, mm4=data7L=(07 17) - cvtps2pi mm5,xmm0 ; round to int32, mm5=data6L=(06 16) - cvtps2pi mm6,xmm6 ; round to int32, mm6=data7H=(27 37) - cvtps2pi mm7,xmm7 ; round to int32, mm7=data6H=(26 36) - packssdw mm4,mm6 ; mm4=data7=(07 17 27 37) - packssdw mm5,mm7 ; mm5=data6=(06 16 26 36) - - packsswb mm0,mm5 ; mm0=(00 10 20 30 06 16 26 36) - packsswb mm1,mm4 ; mm1=(01 11 21 31 07 17 27 37) - - movaps xmm3, XMMWORD [wk(0)] ; xmm3=tmp2 - movaps xmm1, XMMWORD [wk(1)] ; xmm1=tmp3 - - movaps xmm6,[GOTOFF(ebx,PD_0_125)] ; xmm6=[PD_0_125] - - addps xmm4,xmm2 ; xmm4=tmp4 - movaps xmm5,xmm3 - movaps xmm0,xmm1 - addps xmm3,xmm2 ; xmm3=data2=(02 12 22 32) - addps xmm1,xmm4 ; xmm1=data4=(04 14 24 34) - subps xmm5,xmm2 ; xmm5=data5=(05 15 25 35) - subps xmm0,xmm4 ; xmm0=data3=(03 13 23 33) - - mulps xmm3,xmm6 ; descale(1/8) - mulps xmm1,xmm6 ; descale(1/8) - mulps xmm5,xmm6 ; descale(1/8) - mulps xmm0,xmm6 ; descale(1/8) - - movhlps xmm7,xmm3 - movhlps xmm2,xmm1 - cvtps2pi mm2,xmm3 ; round to int32, mm2=data2L=(02 12) - cvtps2pi mm3,xmm1 ; round to int32, mm3=data4L=(04 14) - cvtps2pi mm6,xmm7 ; round to int32, mm6=data2H=(22 32) - cvtps2pi mm7,xmm2 ; round to int32, mm7=data4H=(24 34) - packssdw mm2,mm6 ; mm2=data2=(02 12 22 32) - packssdw mm3,mm7 ; mm3=data4=(04 14 24 34) - - movhlps xmm4,xmm5 - 
movhlps xmm6,xmm0 - cvtps2pi mm5,xmm5 ; round to int32, mm5=data5L=(05 15) - cvtps2pi mm4,xmm0 ; round to int32, mm4=data3L=(03 13) - cvtps2pi mm6,xmm4 ; round to int32, mm6=data5H=(25 35) - cvtps2pi mm7,xmm6 ; round to int32, mm7=data3H=(23 33) - packssdw mm5,mm6 ; mm5=data5=(05 15 25 35) - packssdw mm4,mm7 ; mm4=data3=(03 13 23 33) - - movq mm6,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm6=[PB_CENTERJSAMP] - - packsswb mm2,mm3 ; mm2=(02 12 22 32 04 14 24 34) - packsswb mm4,mm5 ; mm4=(03 13 23 33 05 15 25 35) - - paddb mm0,mm6 - paddb mm1,mm6 - paddb mm2,mm6 - paddb mm4,mm6 - - movq mm7,mm0 ; transpose coefficients(phase 1) - punpcklbw mm0,mm1 ; mm0=(00 01 10 11 20 21 30 31) - punpckhbw mm7,mm1 ; mm7=(06 07 16 17 26 27 36 37) - movq mm3,mm2 ; transpose coefficients(phase 1) - punpcklbw mm2,mm4 ; mm2=(02 03 12 13 22 23 32 33) - punpckhbw mm3,mm4 ; mm3=(04 05 14 15 24 25 34 35) - - movq mm5,mm0 ; transpose coefficients(phase 2) - punpcklwd mm0,mm2 ; mm0=(00 01 02 03 10 11 12 13) - punpckhwd mm5,mm2 ; mm5=(20 21 22 23 30 31 32 33) - movq mm6,mm3 ; transpose coefficients(phase 2) - punpcklwd mm3,mm7 ; mm3=(04 05 06 07 14 15 16 17) - punpckhwd mm6,mm7 ; mm6=(24 25 26 27 34 35 36 37) - - movq mm1,mm0 ; transpose coefficients(phase 3) - punpckldq mm0,mm3 ; mm0=(00 01 02 03 04 05 06 07) - punpckhdq mm1,mm3 ; mm1=(10 11 12 13 14 15 16 17) - movq mm4,mm5 ; transpose coefficients(phase 3) - punpckldq mm5,mm6 ; mm5=(20 21 22 23 24 25 26 27) - punpckhdq mm4,mm6 ; mm4=(30 31 32 33 34 35 36 37) - - pushpic ebx ; save GOT address - - mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] - mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] - movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0 - movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1 - mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] - mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] - movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5 - movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4 - - poppic ebx ; restore GOT address - - add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr - add edi, byte 
4*SIZEOF_JSAMPROW - dec ecx ; ctr - jnz near .rowloop - - emms ; empty MMX state - - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - mov esp,ebp ; esp <- aligned ebp - pop esp ; esp <- original ebp - pop ebp - ret + ; -- Even part + + movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)] + + movaps xmm4,xmm0 + movaps xmm5,xmm1 + subps xmm0,xmm2 ; xmm0=tmp11 + subps xmm1,xmm3 + addps xmm4,xmm2 ; xmm4=tmp10 + addps xmm5,xmm3 ; xmm5=tmp13 + + mulps xmm1,[GOTOFF(ebx,PD_1_414)] + subps xmm1,xmm5 ; xmm1=tmp12 + + movaps xmm6,xmm4 + movaps xmm7,xmm0 + subps xmm4,xmm5 ; xmm4=tmp3 + subps xmm0,xmm1 ; xmm0=tmp2 + addps xmm6,xmm5 ; xmm6=tmp0 + addps xmm7,xmm1 ; xmm7=tmp1 + + movaps XMMWORD [wk(1)], xmm4 ; tmp3 + movaps XMMWORD [wk(0)], xmm0 ; tmp2 + + ; -- Odd part + + movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)] + + movaps xmm4,xmm2 + movaps xmm0,xmm5 + addps xmm2,xmm1 ; xmm2=z11 + addps xmm5,xmm3 ; xmm5=z13 + subps xmm4,xmm1 ; xmm4=z12 + subps xmm0,xmm3 ; xmm0=z10 + + movaps xmm1,xmm2 + subps xmm2,xmm5 + addps xmm1,xmm5 ; xmm1=tmp7 + + mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 + + movaps xmm3,xmm0 + addps xmm0,xmm4 + mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5 + mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) + mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) + addps xmm3,xmm0 ; xmm3=tmp12 + subps xmm4,xmm0 ; xmm4=tmp10 + + ; -- Final output stage + + subps xmm3,xmm1 ; xmm3=tmp6 + movaps xmm5,xmm6 + movaps xmm0,xmm7 + addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30) + addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31) + subps 
xmm5,xmm1 ; xmm5=data7=(07 17 27 37) + subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36) + subps xmm2,xmm3 ; xmm2=tmp5 + + movaps xmm1,[GOTOFF(ebx,PD_0_125)] ; xmm1=[PD_0_125] + + mulps xmm6,xmm1 ; descale(1/8) + mulps xmm7,xmm1 ; descale(1/8) + mulps xmm5,xmm1 ; descale(1/8) + mulps xmm0,xmm1 ; descale(1/8) + + movhlps xmm3,xmm6 + movhlps xmm1,xmm7 + cvtps2pi mm0,xmm6 ; round to int32, mm0=data0L=(00 10) + cvtps2pi mm1,xmm7 ; round to int32, mm1=data1L=(01 11) + cvtps2pi mm2,xmm3 ; round to int32, mm2=data0H=(20 30) + cvtps2pi mm3,xmm1 ; round to int32, mm3=data1H=(21 31) + packssdw mm0,mm2 ; mm0=data0=(00 10 20 30) + packssdw mm1,mm3 ; mm1=data1=(01 11 21 31) + + movhlps xmm6,xmm5 + movhlps xmm7,xmm0 + cvtps2pi mm4,xmm5 ; round to int32, mm4=data7L=(07 17) + cvtps2pi mm5,xmm0 ; round to int32, mm5=data6L=(06 16) + cvtps2pi mm6,xmm6 ; round to int32, mm6=data7H=(27 37) + cvtps2pi mm7,xmm7 ; round to int32, mm7=data6H=(26 36) + packssdw mm4,mm6 ; mm4=data7=(07 17 27 37) + packssdw mm5,mm7 ; mm5=data6=(06 16 26 36) + + packsswb mm0,mm5 ; mm0=(00 10 20 30 06 16 26 36) + packsswb mm1,mm4 ; mm1=(01 11 21 31 07 17 27 37) + + movaps xmm3, XMMWORD [wk(0)] ; xmm3=tmp2 + movaps xmm1, XMMWORD [wk(1)] ; xmm1=tmp3 + + movaps xmm6,[GOTOFF(ebx,PD_0_125)] ; xmm6=[PD_0_125] + + addps xmm4,xmm2 ; xmm4=tmp4 + movaps xmm5,xmm3 + movaps xmm0,xmm1 + addps xmm3,xmm2 ; xmm3=data2=(02 12 22 32) + addps xmm1,xmm4 ; xmm1=data4=(04 14 24 34) + subps xmm5,xmm2 ; xmm5=data5=(05 15 25 35) + subps xmm0,xmm4 ; xmm0=data3=(03 13 23 33) + + mulps xmm3,xmm6 ; descale(1/8) + mulps xmm1,xmm6 ; descale(1/8) + mulps xmm5,xmm6 ; descale(1/8) + mulps xmm0,xmm6 ; descale(1/8) + + movhlps xmm7,xmm3 + movhlps xmm2,xmm1 + cvtps2pi mm2,xmm3 ; round to int32, mm2=data2L=(02 12) + cvtps2pi mm3,xmm1 ; round to int32, mm3=data4L=(04 14) + cvtps2pi mm6,xmm7 ; round to int32, mm6=data2H=(22 32) + cvtps2pi mm7,xmm2 ; round to int32, mm7=data4H=(24 34) + packssdw mm2,mm6 ; mm2=data2=(02 12 22 32) + packssdw mm3,mm7 ; 
mm3=data4=(04 14 24 34) + + movhlps xmm4,xmm5 + movhlps xmm6,xmm0 + cvtps2pi mm5,xmm5 ; round to int32, mm5=data5L=(05 15) + cvtps2pi mm4,xmm0 ; round to int32, mm4=data3L=(03 13) + cvtps2pi mm6,xmm4 ; round to int32, mm6=data5H=(25 35) + cvtps2pi mm7,xmm6 ; round to int32, mm7=data3H=(23 33) + packssdw mm5,mm6 ; mm5=data5=(05 15 25 35) + packssdw mm4,mm7 ; mm4=data3=(03 13 23 33) + + movq mm6,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm6=[PB_CENTERJSAMP] + + packsswb mm2,mm3 ; mm2=(02 12 22 32 04 14 24 34) + packsswb mm4,mm5 ; mm4=(03 13 23 33 05 15 25 35) + + paddb mm0,mm6 + paddb mm1,mm6 + paddb mm2,mm6 + paddb mm4,mm6 + + movq mm7,mm0 ; transpose coefficients(phase 1) + punpcklbw mm0,mm1 ; mm0=(00 01 10 11 20 21 30 31) + punpckhbw mm7,mm1 ; mm7=(06 07 16 17 26 27 36 37) + movq mm3,mm2 ; transpose coefficients(phase 1) + punpcklbw mm2,mm4 ; mm2=(02 03 12 13 22 23 32 33) + punpckhbw mm3,mm4 ; mm3=(04 05 14 15 24 25 34 35) + + movq mm5,mm0 ; transpose coefficients(phase 2) + punpcklwd mm0,mm2 ; mm0=(00 01 02 03 10 11 12 13) + punpckhwd mm5,mm2 ; mm5=(20 21 22 23 30 31 32 33) + movq mm6,mm3 ; transpose coefficients(phase 2) + punpcklwd mm3,mm7 ; mm3=(04 05 06 07 14 15 16 17) + punpckhwd mm6,mm7 ; mm6=(24 25 26 27 34 35 36 37) + + movq mm1,mm0 ; transpose coefficients(phase 3) + punpckldq mm0,mm3 ; mm0=(00 01 02 03 04 05 06 07) + punpckhdq mm1,mm3 ; mm1=(10 11 12 13 14 15 16 17) + movq mm4,mm5 ; transpose coefficients(phase 3) + punpckldq mm5,mm6 ; mm5=(20 21 22 23 24 25 26 27) + punpckhdq mm4,mm6 ; mm4=(30 31 32 33 34 35 36 37) + + pushpic ebx ; save GOT address + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0 + movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1 + mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] + movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5 + movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4 + + poppic ebx ; restore GOT address + + add esi, byte 
4*SIZEOF_FAST_FLOAT ; wsptr + add edi, byte 4*SIZEOF_JSAMPROW + dec ecx ; ctr + jnz near .rowloop + + emms ; empty MMX state + + pop edi + pop esi +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. - align 16 + align 16 diff --git a/simd/jsimd_arm_neon_64.S b/simd/jsimd_arm_neon_64.S index 2d9e95efc..8806abc6e 100644 --- a/simd/jsimd_arm_neon_64.S +++ b/simd/jsimd_arm_neon_64.S @@ -293,7 +293,7 @@ asm_function jsimd_idct_islow_neon mul v30.4h, v30.4h, v6.4h mul v31.4h, v31.4h, v7.4h ins v30.2d[1], v31.2d[0] /* 128 bit q15 */ - sub sp, sp, #32 + sub sp, sp, #32 st1 {v8.4h-v11.4h}, [sp] /* save NEON registers */ sub sp, sp, #32 st1 {v12.4h-v15.4h}, [sp] @@ -365,7 +365,7 @@ asm_function jsimd_idct_islow_neon add v10.4s, v2.4s, v8.4s orr x0, x4, x5 sub v6.4s, v2.4s, v8.4s - /* pop {x4, x5} */ + /* pop {x4, x5} */ ldp x4, x5, [sp], 16 rshrn ROW7L.4h, v4.4s, #11 rshrn ROW3L.4h, v10.4s, #11 @@ -1125,21 +1125,21 @@ jsimd_idct_4x4_neon_consts: smull v28.4s, \x4, v2.4h[2] smlal v28.4s, \x8, v0.4h[0] smlal v28.4s, \x14, v0.4h[1] - + smull v26.4s, \x16, v1.4h[2] smlal v26.4s, \x12, v1.4h[3] smlal v26.4s, \x10, v2.4h[0] smlal v26.4s, \x6, v2.4h[1] - + smull v30.4s, \x4, v2.4h[2] smlsl v30.4s, \x8, v0.4h[0] smlsl v30.4s, \x14, v0.4h[1] - + smull v24.4s, \x16, v0.4h[2] smlal v24.4s, \x12, v0.4h[3] smlal v24.4s, \x10, v1.4h[0] smlal v24.4s, \x6, v1.4h[1] - + add v20.4s, v28.4s, v26.4s sub v28.4s, v28.4s, v26.4s @@ -1148,11 +1148,11 @@ jsimd_idct_4x4_neon_consts: srshr v28.4s, v28.4s, #\shift xtn \y26, v20.4s xtn \y29, v28.4s -.else +.else rshrn \y26, v20.4s, #\shift rshrn \y29, v28.4s, #\shift -.endif - +.endif + add v20.4s, v30.4s, v24.4s sub v30.4s, v30.4s, v24.4s @@ -1279,7 +1279,7 @@ asm_function jsimd_idct_4x4_neon st1 {v27.b}[2], [TMP3], 1 st1 {v26.b}[3], [TMP1], 
1 st1 {v27.b}[3], [TMP3], 1 - + st1 {v26.b}[4], [TMP2], 1 st1 {v27.b}[4], [TMP4], 1 st1 {v26.b}[5], [TMP2], 1 diff --git a/simd/jsimdcpu.asm b/simd/jsimdcpu.asm index bdbcc2317..c42c4ad62 100644 --- a/simd/jsimdcpu.asm +++ b/simd/jsimdcpu.asm @@ -19,8 +19,8 @@ %include "jsimdext.inc" ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 + SECTION SEG_TEXT + BITS 32 ; ; Check if the CPU supports SIMD instructions ; @@ -28,78 +28,78 @@ ; jpeg_simd_cpu_support (void) ; - align 16 - global EXTN(jpeg_simd_cpu_support) + align 16 + global EXTN(jpeg_simd_cpu_support) EXTN(jpeg_simd_cpu_support): - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved -; push esi ; unused - push edi + push ebx +; push ecx ; need not be preserved +; push edx ; need not be preserved +; push esi ; unused + push edi - xor edi,edi ; simd support flag + xor edi,edi ; simd support flag - pushfd - pop eax - mov edx,eax - xor eax, 1<<21 ; flip ID bit in EFLAGS - push eax - popfd - pushfd - pop eax - xor eax,edx - jz short .return ; CPUID is not supported + pushfd + pop eax + mov edx,eax + xor eax, 1<<21 ; flip ID bit in EFLAGS + push eax + popfd + pushfd + pop eax + xor eax,edx + jz short .return ; CPUID is not supported - ; Check for MMX instruction support - xor eax,eax - cpuid - test eax,eax - jz short .return + ; Check for MMX instruction support + xor eax,eax + cpuid + test eax,eax + jz short .return - xor eax,eax - inc eax - cpuid - mov eax,edx ; eax = Standard feature flags + xor eax,eax + inc eax + cpuid + mov eax,edx ; eax = Standard feature flags - test eax, 1<<23 ; bit23:MMX - jz short .no_mmx - or edi, byte JSIMD_MMX + test eax, 1<<23 ; bit23:MMX + jz short .no_mmx + or edi, byte JSIMD_MMX .no_mmx: - test eax, 1<<25 ; bit25:SSE - jz short .no_sse - or edi, byte JSIMD_SSE + test eax, 1<<25 ; bit25:SSE + jz short .no_sse + or edi, byte JSIMD_SSE .no_sse: - test eax, 1<<26 ; bit26:SSE2 - jz short .no_sse2 - or 
edi, byte JSIMD_SSE2 + test eax, 1<<26 ; bit26:SSE2 + jz short .no_sse2 + or edi, byte JSIMD_SSE2 .no_sse2: - ; Check for 3DNow! instruction support - mov eax, 0x80000000 - cpuid - cmp eax, 0x80000000 - jbe short .return + ; Check for 3DNow! instruction support + mov eax, 0x80000000 + cpuid + cmp eax, 0x80000000 + jbe short .return - mov eax, 0x80000001 - cpuid - mov eax,edx ; eax = Extended feature flags + mov eax, 0x80000001 + cpuid + mov eax,edx ; eax = Extended feature flags - test eax, 1<<31 ; bit31:3DNow!(vendor independent) - jz short .no_3dnow - or edi, byte JSIMD_3DNOW + test eax, 1<<31 ; bit31:3DNow!(vendor independent) + jz short .no_3dnow + or edi, byte JSIMD_3DNOW .no_3dnow: .return: - mov eax,edi + mov eax,edi - pop edi -; pop esi ; unused -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - ret + pop edi +; pop esi ; unused +; pop edx ; need not be preserved +; pop ecx ; need not be preserved + pop ebx + ret ; For some reason, the OS X linker does not honor the request to align the ; segment unless we do this. 
- align 16 + align 16 diff --git a/simd/jsimdext.inc b/simd/jsimdext.inc index 253b8972f..0f1a8dacc 100644 --- a/simd/jsimdext.inc +++ b/simd/jsimdext.inc @@ -30,7 +30,7 @@ ; ========================================================================== ; System-dependent configurations -%ifdef WIN32 ; ----(nasm -fwin32 -DWIN32 ...)-------- +%ifdef WIN32 ; ----(nasm -fwin32 -DWIN32 ...)-------- ; * Microsoft Visual C++ ; * MinGW (Minimalist GNU for Windows) ; * CygWin @@ -46,7 +46,7 @@ %define SEG_CONST .rdata align=16 public use32 class=CONST %endif -%elifdef WIN64 ; ----(nasm -fwin64 -DWIN64 ...)-------- +%elifdef WIN64 ; ----(nasm -fwin64 -DWIN64 ...)-------- ; * Microsoft Visual C++ ; -- segment definition -- @@ -58,9 +58,9 @@ %define SEG_TEXT .text align=16 public use64 class=CODE %define SEG_CONST .rdata align=16 public use64 class=CONST %endif -%define EXTN(name) name ; foo() -> foo +%define EXTN(name) name ; foo() -> foo -%elifdef OBJ32 ; ----(nasm -fobj -DOBJ32 ...)---------- +%elifdef OBJ32 ; ----(nasm -fobj -DOBJ32 ...)---------- ; * Borland C++ (Win32) ; -- segment definition -- @@ -68,7 +68,7 @@ %define SEG_TEXT .text align=16 public use32 class=CODE %define SEG_CONST .data align=16 public use32 class=DATA -%elifdef ELF ; ----(nasm -felf[64] -DELF ...)------------ +%elifdef ELF ; ----(nasm -felf[64] -DELF ...)------------ ; * Linux ; * *BSD family Unix using elf format ; * Unix System V, including Solaris x86, UnixWare and SCO Unix @@ -88,10 +88,10 @@ section .note.GNU-stack noalloc noexec nowrite progbits ; To make the code position-independent, append -DPIC to the commandline ; -%define GOT_SYMBOL _GLOBAL_OFFSET_TABLE_ ; ELF supports PIC -%define EXTN(name) name ; foo() -> foo +%define GOT_SYMBOL _GLOBAL_OFFSET_TABLE_ ; ELF supports PIC +%define EXTN(name) name ; foo() -> foo -%elifdef AOUT ; ----(nasm -faoutb/aout -DAOUT ...)---- +%elifdef AOUT ; ----(nasm -faoutb/aout -DAOUT ...)---- ; * Older Linux using a.out format (nasm -f aout -DAOUT ...) 
; * *BSD family Unix using a.out format (nasm -f aoutb -DAOUT ...) @@ -102,29 +102,29 @@ section .note.GNU-stack noalloc noexec nowrite progbits ; To make the code position-independent, append -DPIC to the commandline ; -%define GOT_SYMBOL __GLOBAL_OFFSET_TABLE_ ; BSD-style a.out supports PIC +%define GOT_SYMBOL __GLOBAL_OFFSET_TABLE_ ; BSD-style a.out supports PIC -%elifdef MACHO ; ----(nasm -fmacho -DMACHO ...)-------- +%elifdef MACHO ; ----(nasm -fmacho -DMACHO ...)-------- ; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format) ; -- segment definition -- ; -%define SEG_TEXT .text ;align=16 ; nasm doesn't accept align=16. why? +%define SEG_TEXT .text ;align=16 ; nasm doesn't accept align=16. why? %define SEG_CONST .rodata align=16 ; The generation of position-independent code (PIC) is the default on Darwin. ; %define PIC -%define GOT_SYMBOL _MACHO_PIC_ ; Mach-O style code-relative addressing +%define GOT_SYMBOL _MACHO_PIC_ ; Mach-O style code-relative addressing -%else ; ----(Other case)---------------------- +%else ; ----(Other case)---------------------- ; -- segment definition -- ; %define SEG_TEXT .text %define SEG_CONST .data -%endif ; ---------------------------------------------- +%endif ; ---------------------------------------------- ; ========================================================================== @@ -179,7 +179,7 @@ section .note.GNU-stack noalloc noexec nowrite progbits ; External Symbol Name ; %ifndef EXTN -%define EXTN(name) _ %+ name ; foo() -> _foo +%define EXTN(name) _ %+ name ; foo() -> _foo %endif ; -------------------------------------------------------------------------- @@ -196,79 +196,79 @@ section .note.GNU-stack noalloc noexec nowrite progbits ; At present, nasm doesn't seem to support PIC generation for Mach-O. ; The PIC support code below is a little tricky. 
- SECTION SEG_CONST + SECTION SEG_CONST const_base: %define GOTOFF(got,sym) (got) + (sym) - const_base -%imacro get_GOT 1 - ; NOTE: this macro destroys ecx resister. - call %%geteip - add ecx, byte (%%ref - $) - jmp short %%adjust +%imacro get_GOT 1 + ; NOTE: this macro destroys ecx resister. + call %%geteip + add ecx, byte (%%ref - $) + jmp short %%adjust %%geteip: - mov ecx, POINTER [esp] - ret + mov ecx, POINTER [esp] + ret %%adjust: - push ebp - xor ebp,ebp ; ebp = 0 -%ifidni %1,ebx ; (%1 == ebx) - ; db 0x8D,0x9C + jmp near const_base = - ; lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32) - db 0x8D,0x9C ; 8D,9C - jmp near const_base ; E9,(const_base-%%ref) + push ebp + xor ebp,ebp ; ebp = 0 +%ifidni %1,ebx ; (%1 == ebx) + ; db 0x8D,0x9C + jmp near const_base = + ; lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32) + db 0x8D,0x9C ; 8D,9C + jmp near const_base ; E9,(const_base-%%ref) %%ref: %else ; (%1 != ebx) - ; db 0x8D,0x8C + jmp near const_base = - ; lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32) - db 0x8D,0x8C ; 8D,8C - jmp near const_base ; E9,(const_base-%%ref) -%%ref: mov %1, ecx + ; db 0x8D,0x8C + jmp near const_base = + ; lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32) + db 0x8D,0x8C ; 8D,8C + jmp near const_base ; E9,(const_base-%%ref) +%%ref: mov %1, ecx %endif ; (%1 == ebx) - pop ebp + pop ebp %endmacro -%else ; GOT_SYMBOL != _MACHO_PIC_ ---------------- +%else ; GOT_SYMBOL != _MACHO_PIC_ ---------------- %define GOTOFF(got,sym) (got) + (sym) wrt ..gotoff -%imacro get_GOT 1 - extern GOT_SYMBOL - call %%geteip - add %1, GOT_SYMBOL + $$ - $ wrt ..gotpc - jmp short %%done +%imacro get_GOT 1 + extern GOT_SYMBOL + call %%geteip + add %1, GOT_SYMBOL + $$ - $ wrt ..gotpc + jmp short %%done %%geteip: - mov %1, POINTER [esp] - ret + mov %1, POINTER [esp] + ret %%done: %endmacro -%endif ; GOT_SYMBOL == _MACHO_PIC_ ---------------- +%endif ; GOT_SYMBOL == _MACHO_PIC_ ---------------- -%imacro pushpic 1.nolist 
- push %1 +%imacro pushpic 1.nolist + push %1 %endmacro -%imacro poppic 1.nolist - pop %1 +%imacro poppic 1.nolist + pop %1 %endmacro -%imacro movpic 2.nolist - mov %1,%2 +%imacro movpic 2.nolist + mov %1,%2 %endmacro -%else ; !PIC ----------------------------------------- +%else ; !PIC ----------------------------------------- %define GOTOFF(got,sym) (sym) -%imacro get_GOT 1.nolist +%imacro get_GOT 1.nolist %endmacro -%imacro pushpic 1.nolist +%imacro pushpic 1.nolist %endmacro -%imacro poppic 1.nolist +%imacro poppic 1.nolist %endmacro -%imacro movpic 2.nolist +%imacro movpic 2.nolist %endmacro -%endif ; PIC ----------------------------------------- +%endif ; PIC ----------------------------------------- ; -------------------------------------------------------------------------- ; Align the next instruction on {2,4,8,16,..}-byte boundary. @@ -278,28 +278,28 @@ const_base: %define FILLB(b,n) (($$-(b)) & ((n)-1)) %imacro alignx 1-2.nolist 0xFFFF -%%bs: times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \ - db 0x90 ; nop - times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \ - db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00 ; lea ebx,[ebx+0x00000000] - times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/7 \ - db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000] - times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \ - db 0x8D,0xAD,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000] - times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \ - db 0x8D,0x6C,0x25,0x00 ; lea ebp,[ebp+0x00] - times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \ - db 0x8D,0x6D,0x00 ; lea ebp,[ebp+0x00] - times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \ - db 0x8B,0xED ; mov ebp,ebp - times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \ - db 0x90 ; nop +%%bs: times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \ + db 0x90 ; nop + times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \ + db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00 ; lea ebx,[ebx+0x00000000] + times MSKLE(FILLB(%%bs,%1),%2) & 
FILLB($,%1)/7 \ + db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000] + times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \ + db 0x8D,0xAD,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000] + times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \ + db 0x8D,0x6C,0x25,0x00 ; lea ebp,[ebp+0x00] + times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \ + db 0x8D,0x6D,0x00 ; lea ebp,[ebp+0x00] + times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \ + db 0x8B,0xED ; mov ebp,ebp + times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \ + db 0x90 ; nop %endmacro ; Align the next data on {2,4,8,16,..}-byte boundary. ; %imacro alignz 1.nolist - align %1, db 0 ; filling zeros + align %1, db 0 ; filling zeros %endmacro %ifdef __x86_64__ @@ -307,61 +307,61 @@ const_base: %ifdef WIN64 %imacro collect_args 0 - push r12 - push r13 - push r14 - push r15 - mov r10, rcx - mov r11, rdx - mov r12, r8 - mov r13, r9 - mov r14, [rax+48] - mov r15, [rax+56] - push rsi - push rdi - sub rsp, SIZEOF_XMMWORD - movaps XMMWORD [rsp], xmm6 - sub rsp, SIZEOF_XMMWORD - movaps XMMWORD [rsp], xmm7 + push r12 + push r13 + push r14 + push r15 + mov r10, rcx + mov r11, rdx + mov r12, r8 + mov r13, r9 + mov r14, [rax+48] + mov r15, [rax+56] + push rsi + push rdi + sub rsp, SIZEOF_XMMWORD + movaps XMMWORD [rsp], xmm6 + sub rsp, SIZEOF_XMMWORD + movaps XMMWORD [rsp], xmm7 %endmacro %imacro uncollect_args 0 - movaps xmm7, XMMWORD [rsp] - add rsp, SIZEOF_XMMWORD - movaps xmm6, XMMWORD [rsp] - add rsp, SIZEOF_XMMWORD - pop rdi - pop rsi - pop r15 - pop r14 - pop r13 - pop r12 + movaps xmm7, XMMWORD [rsp] + add rsp, SIZEOF_XMMWORD + movaps xmm6, XMMWORD [rsp] + add rsp, SIZEOF_XMMWORD + pop rdi + pop rsi + pop r15 + pop r14 + pop r13 + pop r12 %endmacro %else %imacro collect_args 0 - push r10 - push r11 - push r12 - push r13 - push r14 - push r15 - mov r10, rdi - mov r11, rsi - mov r12, rdx - mov r13, rcx - mov r14, r8 - mov r15, r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + mov r10, rdi + mov r11, rsi + 
mov r12, rdx + mov r13, rcx + mov r14, r8 + mov r15, r9 %endmacro %imacro uncollect_args 0 - pop r15 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 %endmacro %endif diff --git a/structure.txt b/structure.txt index 12549e0f7..21bbc7a05 100644 --- a/structure.txt +++ b/structure.txt @@ -24,8 +24,8 @@ In this document, JPEG-specific terminology follows the JPEG standard: A "coefficient" is a frequency coefficient (a DCT transform output number). A "block" is an 8x8 group of samples or coefficients. An "MCU" (minimum coded unit) is an interleaved set of blocks of size - determined by the sampling factors, or a single block in a - noninterleaved scan. + determined by the sampling factors, or a single block in a + noninterleaved scan. We do not use the terms "pixel" and "sample" interchangeably. When we say pixel, we mean an element of the full-size image, while a sample is an element of the downsampled image. Thus the number of samples may vary across @@ -264,14 +264,14 @@ responsibilities: 1B. Per-pass control. This determines how many passes will be performed and calls each active processing module to configure itself appropriately at the beginning of each pass. End-of-pass processing, - where necessary, is also invoked from the master control module. + where necessary, is also invoked from the master control module. Method selection is partially distributed, in that a particular processing module may contain several possible implementations of a particular method, which it will select among when given its initialization call. The master control code need only be concerned with decisions that affect more than one module. - + 2. Data buffering control. A separate control module exists for each inter-processing-step data buffer. This module is responsible for invoking the processing steps that write or read that data buffer. @@ -573,10 +573,10 @@ there isn't any real need for it. 
Arrays of pixel sample values use the following data structure: - typedef something JSAMPLE; a pixel component value, 0..MAXJSAMPLE - typedef JSAMPLE *JSAMPROW; ptr to a row of samples - typedef JSAMPROW *JSAMPARRAY; ptr to a list of rows - typedef JSAMPARRAY *JSAMPIMAGE; ptr to a list of color-component arrays + typedef something JSAMPLE; a pixel component value, 0..MAXJSAMPLE + typedef JSAMPLE *JSAMPROW; ptr to a row of samples + typedef JSAMPROW *JSAMPARRAY; ptr to a list of rows + typedef JSAMPARRAY *JSAMPIMAGE; ptr to a list of color-component arrays The basic element type JSAMPLE will typically be one of unsigned char, (signed) char, or short. Short will be used if samples wider than 8 bits are @@ -617,7 +617,7 @@ we can read or write each component to a temporary file independently, which is helpful when dealing with noninterleaved JPEG files. In general, a specific sample value is accessed by code such as - GETJSAMPLE(image[colorcomponent][row][col]) + GETJSAMPLE(image[colorcomponent][row][col]) where col is measured from the image left edge, but row is measured from the first sample row currently in memory. Either of the first two indexings can be precomputed by copying the relevant pointer. @@ -636,11 +636,11 @@ with component-wise storage.) 
Arrays of DCT-coefficient values use the following data structure: - typedef short JCOEF; a 16-bit signed integer - typedef JCOEF JBLOCK[DCTSIZE2]; an 8x8 block of coefficients - typedef JBLOCK *JBLOCKROW; ptr to one horizontal row of 8x8 blocks - typedef JBLOCKROW *JBLOCKARRAY; ptr to a list of such rows - typedef JBLOCKARRAY *JBLOCKIMAGE; ptr to a list of color component arrays + typedef short JCOEF; a 16-bit signed integer + typedef JCOEF JBLOCK[DCTSIZE2]; an 8x8 block of coefficients + typedef JBLOCK *JBLOCKROW; ptr to one horizontal row of 8x8 blocks + typedef JBLOCKROW *JBLOCKARRAY; ptr to a list of such rows + typedef JBLOCKARRAY *JBLOCKIMAGE; ptr to a list of color component arrays The underlying type is at least a 16-bit signed integer; while "short" is big enough on all machines of interest, on some machines it is preferable to use @@ -759,8 +759,8 @@ can be freed at once. This approach helps prevent storage-leak bugs, and it speeds up operations whenever malloc/free are slow (as they often are). The pools can be regarded as lifetime identifiers for objects. Two pools/lifetimes are defined: - * JPOOL_PERMANENT lasts until master record is destroyed - * JPOOL_IMAGE lasts until done with image (JPEG datastream) + * JPOOL_PERMANENT lasts until master record is destroyed + * JPOOL_IMAGE lasts until done with image (JPEG datastream) Permanent lifetime is used for parameters and tables that should be carried across from one datastream to another; this includes all application-visible parameters. Image lifetime is used for everything else. (A third lifetime, @@ -857,20 +857,20 @@ outlined above are implemented by the front end. 
The back end provides the following routines for use by the front end (none of these routines are known to the rest of the JPEG code): -jpeg_mem_init, jpeg_mem_term system-dependent initialization/shutdown +jpeg_mem_init, jpeg_mem_term system-dependent initialization/shutdown -jpeg_get_small, jpeg_free_small interface to malloc and free library routines - (or their equivalents) +jpeg_get_small, jpeg_free_small interface to malloc and free library routines + (or their equivalents) -jpeg_get_large, jpeg_free_large interface to FAR malloc/free in MSDOS machines; - else usually the same as - jpeg_get_small/jpeg_free_small +jpeg_get_large, jpeg_free_large interface to FAR malloc/free in MSDOS machines; + else usually the same as + jpeg_get_small/jpeg_free_small -jpeg_mem_available estimate available memory +jpeg_mem_available estimate available memory -jpeg_open_backing_store create a backing-store object +jpeg_open_backing_store create a backing-store object -read_backing_store, manipulate a backing-store object +read_backing_store, manipulate a backing-store object write_backing_store, close_backing_store diff --git a/transupp.c b/transupp.c index a16b20a5b..72ef8d81f 100644 --- a/transupp.c +++ b/transupp.c @@ -21,9 +21,9 @@ #include "jinclude.h" #include "jpeglib.h" -#include "transupp.h" /* My own external interface */ +#include "transupp.h" /* My own external interface */ #include "jpegcomp.h" -#include /* to declare isdigit() */ +#include /* to declare isdigit() */ #if JPEG_LIB_VERSION >= 70 @@ -89,9 +89,9 @@ LOCAL(void) do_crop (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, - JDIMENSION x_crop_offset, JDIMENSION y_crop_offset, - jvirt_barray_ptr *src_coef_arrays, - jvirt_barray_ptr *dst_coef_arrays) + JDIMENSION x_crop_offset, JDIMENSION y_crop_offset, + jvirt_barray_ptr *src_coef_arrays, + jvirt_barray_ptr *dst_coef_arrays) /* Crop. This is only used when no rotate/flip is requested with the crop. 
*/ { JDIMENSION dst_blk_y, x_crop_blocks, y_crop_blocks; @@ -107,18 +107,18 @@ do_crop (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, x_crop_blocks = x_crop_offset * compptr->h_samp_factor; y_crop_blocks = y_crop_offset * compptr->v_samp_factor; for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks; - dst_blk_y += compptr->v_samp_factor) { + dst_blk_y += compptr->v_samp_factor) { dst_buffer = (*srcinfo->mem->access_virt_barray) - ((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y, - (JDIMENSION) compptr->v_samp_factor, TRUE); + ((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y, + (JDIMENSION) compptr->v_samp_factor, TRUE); src_buffer = (*srcinfo->mem->access_virt_barray) - ((j_common_ptr) srcinfo, src_coef_arrays[ci], - dst_blk_y + y_crop_blocks, - (JDIMENSION) compptr->v_samp_factor, FALSE); + ((j_common_ptr) srcinfo, src_coef_arrays[ci], + dst_blk_y + y_crop_blocks, + (JDIMENSION) compptr->v_samp_factor, FALSE); for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) { - jcopy_block_row(src_buffer[offset_y] + x_crop_blocks, - dst_buffer[offset_y], - compptr->width_in_blocks); + jcopy_block_row(src_buffer[offset_y] + x_crop_blocks, + dst_buffer[offset_y], + compptr->width_in_blocks); } } } @@ -127,8 +127,8 @@ do_crop (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, LOCAL(void) do_flip_h_no_crop (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, - JDIMENSION x_crop_offset, - jvirt_barray_ptr *src_coef_arrays) + JDIMENSION x_crop_offset, + jvirt_barray_ptr *src_coef_arrays) /* Horizontal flip; done in-place, so no separate dest array is required. * NB: this only works when y_crop_offset is zero. 
*/ @@ -153,39 +153,39 @@ do_flip_h_no_crop (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, comp_width = MCU_cols * compptr->h_samp_factor; x_crop_blocks = x_crop_offset * compptr->h_samp_factor; for (blk_y = 0; blk_y < compptr->height_in_blocks; - blk_y += compptr->v_samp_factor) { + blk_y += compptr->v_samp_factor) { buffer = (*srcinfo->mem->access_virt_barray) - ((j_common_ptr) srcinfo, src_coef_arrays[ci], blk_y, - (JDIMENSION) compptr->v_samp_factor, TRUE); + ((j_common_ptr) srcinfo, src_coef_arrays[ci], blk_y, + (JDIMENSION) compptr->v_samp_factor, TRUE); for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) { - /* Do the mirroring */ - for (blk_x = 0; blk_x * 2 < comp_width; blk_x++) { - ptr1 = buffer[offset_y][blk_x]; - ptr2 = buffer[offset_y][comp_width - blk_x - 1]; - /* this unrolled loop doesn't need to know which row it's on... */ - for (k = 0; k < DCTSIZE2; k += 2) { - temp1 = *ptr1; /* swap even column */ - temp2 = *ptr2; - *ptr1++ = temp2; - *ptr2++ = temp1; - temp1 = *ptr1; /* swap odd column with sign change */ - temp2 = *ptr2; - *ptr1++ = -temp2; - *ptr2++ = -temp1; - } - } - if (x_crop_blocks > 0) { - /* Now left-justify the portion of the data to be kept. - * We can't use a single jcopy_block_row() call because that routine - * depends on memcpy(), whose behavior is unspecified for overlapping - * source and destination areas. Sigh. - */ - for (blk_x = 0; blk_x < compptr->width_in_blocks; blk_x++) { - jcopy_block_row(buffer[offset_y] + blk_x + x_crop_blocks, - buffer[offset_y] + blk_x, - (JDIMENSION) 1); - } - } + /* Do the mirroring */ + for (blk_x = 0; blk_x * 2 < comp_width; blk_x++) { + ptr1 = buffer[offset_y][blk_x]; + ptr2 = buffer[offset_y][comp_width - blk_x - 1]; + /* this unrolled loop doesn't need to know which row it's on... 
*/ + for (k = 0; k < DCTSIZE2; k += 2) { + temp1 = *ptr1; /* swap even column */ + temp2 = *ptr2; + *ptr1++ = temp2; + *ptr2++ = temp1; + temp1 = *ptr1; /* swap odd column with sign change */ + temp2 = *ptr2; + *ptr1++ = -temp2; + *ptr2++ = -temp1; + } + } + if (x_crop_blocks > 0) { + /* Now left-justify the portion of the data to be kept. + * We can't use a single jcopy_block_row() call because that routine + * depends on memcpy(), whose behavior is unspecified for overlapping + * source and destination areas. Sigh. + */ + for (blk_x = 0; blk_x < compptr->width_in_blocks; blk_x++) { + jcopy_block_row(buffer[offset_y] + blk_x + x_crop_blocks, + buffer[offset_y] + blk_x, + (JDIMENSION) 1); + } + } } } } @@ -194,9 +194,9 @@ do_flip_h_no_crop (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, LOCAL(void) do_flip_h (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, - JDIMENSION x_crop_offset, JDIMENSION y_crop_offset, - jvirt_barray_ptr *src_coef_arrays, - jvirt_barray_ptr *dst_coef_arrays) + JDIMENSION x_crop_offset, JDIMENSION y_crop_offset, + jvirt_barray_ptr *src_coef_arrays, + jvirt_barray_ptr *dst_coef_arrays) /* Horizontal flip in general cropping case */ { JDIMENSION MCU_cols, comp_width, dst_blk_x, dst_blk_y; @@ -220,34 +220,34 @@ do_flip_h (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, x_crop_blocks = x_crop_offset * compptr->h_samp_factor; y_crop_blocks = y_crop_offset * compptr->v_samp_factor; for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks; - dst_blk_y += compptr->v_samp_factor) { + dst_blk_y += compptr->v_samp_factor) { dst_buffer = (*srcinfo->mem->access_virt_barray) - ((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y, - (JDIMENSION) compptr->v_samp_factor, TRUE); + ((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y, + (JDIMENSION) compptr->v_samp_factor, TRUE); src_buffer = (*srcinfo->mem->access_virt_barray) - ((j_common_ptr) srcinfo, src_coef_arrays[ci], - dst_blk_y + y_crop_blocks, - (JDIMENSION) compptr->v_samp_factor, 
FALSE); + ((j_common_ptr) srcinfo, src_coef_arrays[ci], + dst_blk_y + y_crop_blocks, + (JDIMENSION) compptr->v_samp_factor, FALSE); for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) { - dst_row_ptr = dst_buffer[offset_y]; - src_row_ptr = src_buffer[offset_y]; - for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks; dst_blk_x++) { - if (x_crop_blocks + dst_blk_x < comp_width) { - /* Do the mirrorable blocks */ - dst_ptr = dst_row_ptr[dst_blk_x]; - src_ptr = src_row_ptr[comp_width - x_crop_blocks - dst_blk_x - 1]; - /* this unrolled loop doesn't need to know which row it's on... */ - for (k = 0; k < DCTSIZE2; k += 2) { - *dst_ptr++ = *src_ptr++; /* copy even column */ - *dst_ptr++ = - *src_ptr++; /* copy odd column with sign change */ - } - } else { - /* Copy last partial block(s) verbatim */ - jcopy_block_row(src_row_ptr + dst_blk_x + x_crop_blocks, - dst_row_ptr + dst_blk_x, - (JDIMENSION) 1); - } - } + dst_row_ptr = dst_buffer[offset_y]; + src_row_ptr = src_buffer[offset_y]; + for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks; dst_blk_x++) { + if (x_crop_blocks + dst_blk_x < comp_width) { + /* Do the mirrorable blocks */ + dst_ptr = dst_row_ptr[dst_blk_x]; + src_ptr = src_row_ptr[comp_width - x_crop_blocks - dst_blk_x - 1]; + /* this unrolled loop doesn't need to know which row it's on... 
*/ + for (k = 0; k < DCTSIZE2; k += 2) { + *dst_ptr++ = *src_ptr++; /* copy even column */ + *dst_ptr++ = - *src_ptr++; /* copy odd column with sign change */ + } + } else { + /* Copy last partial block(s) verbatim */ + jcopy_block_row(src_row_ptr + dst_blk_x + x_crop_blocks, + dst_row_ptr + dst_blk_x, + (JDIMENSION) 1); + } + } } } } @@ -256,9 +256,9 @@ do_flip_h (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, LOCAL(void) do_flip_v (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, - JDIMENSION x_crop_offset, JDIMENSION y_crop_offset, - jvirt_barray_ptr *src_coef_arrays, - jvirt_barray_ptr *dst_coef_arrays) + JDIMENSION x_crop_offset, JDIMENSION y_crop_offset, + jvirt_barray_ptr *src_coef_arrays, + jvirt_barray_ptr *dst_coef_arrays) /* Vertical flip */ { JDIMENSION MCU_rows, comp_height, dst_blk_x, dst_blk_y; @@ -285,49 +285,49 @@ do_flip_v (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, x_crop_blocks = x_crop_offset * compptr->h_samp_factor; y_crop_blocks = y_crop_offset * compptr->v_samp_factor; for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks; - dst_blk_y += compptr->v_samp_factor) { + dst_blk_y += compptr->v_samp_factor) { dst_buffer = (*srcinfo->mem->access_virt_barray) - ((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y, - (JDIMENSION) compptr->v_samp_factor, TRUE); + ((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y, + (JDIMENSION) compptr->v_samp_factor, TRUE); if (y_crop_blocks + dst_blk_y < comp_height) { - /* Row is within the mirrorable area. */ - src_buffer = (*srcinfo->mem->access_virt_barray) - ((j_common_ptr) srcinfo, src_coef_arrays[ci], - comp_height - y_crop_blocks - dst_blk_y - - (JDIMENSION) compptr->v_samp_factor, - (JDIMENSION) compptr->v_samp_factor, FALSE); + /* Row is within the mirrorable area. 
*/ + src_buffer = (*srcinfo->mem->access_virt_barray) + ((j_common_ptr) srcinfo, src_coef_arrays[ci], + comp_height - y_crop_blocks - dst_blk_y - + (JDIMENSION) compptr->v_samp_factor, + (JDIMENSION) compptr->v_samp_factor, FALSE); } else { - /* Bottom-edge blocks will be copied verbatim. */ - src_buffer = (*srcinfo->mem->access_virt_barray) - ((j_common_ptr) srcinfo, src_coef_arrays[ci], - dst_blk_y + y_crop_blocks, - (JDIMENSION) compptr->v_samp_factor, FALSE); + /* Bottom-edge blocks will be copied verbatim. */ + src_buffer = (*srcinfo->mem->access_virt_barray) + ((j_common_ptr) srcinfo, src_coef_arrays[ci], + dst_blk_y + y_crop_blocks, + (JDIMENSION) compptr->v_samp_factor, FALSE); } for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) { - if (y_crop_blocks + dst_blk_y < comp_height) { - /* Row is within the mirrorable area. */ - dst_row_ptr = dst_buffer[offset_y]; - src_row_ptr = src_buffer[compptr->v_samp_factor - offset_y - 1]; - src_row_ptr += x_crop_blocks; - for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks; - dst_blk_x++) { - dst_ptr = dst_row_ptr[dst_blk_x]; - src_ptr = src_row_ptr[dst_blk_x]; - for (i = 0; i < DCTSIZE; i += 2) { - /* copy even row */ - for (j = 0; j < DCTSIZE; j++) - *dst_ptr++ = *src_ptr++; - /* copy odd row with sign change */ - for (j = 0; j < DCTSIZE; j++) - *dst_ptr++ = - *src_ptr++; - } - } - } else { - /* Just copy row verbatim. */ - jcopy_block_row(src_buffer[offset_y] + x_crop_blocks, - dst_buffer[offset_y], - compptr->width_in_blocks); - } + if (y_crop_blocks + dst_blk_y < comp_height) { + /* Row is within the mirrorable area. 
*/ + dst_row_ptr = dst_buffer[offset_y]; + src_row_ptr = src_buffer[compptr->v_samp_factor - offset_y - 1]; + src_row_ptr += x_crop_blocks; + for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks; + dst_blk_x++) { + dst_ptr = dst_row_ptr[dst_blk_x]; + src_ptr = src_row_ptr[dst_blk_x]; + for (i = 0; i < DCTSIZE; i += 2) { + /* copy even row */ + for (j = 0; j < DCTSIZE; j++) + *dst_ptr++ = *src_ptr++; + /* copy odd row with sign change */ + for (j = 0; j < DCTSIZE; j++) + *dst_ptr++ = - *src_ptr++; + } + } + } else { + /* Just copy row verbatim. */ + jcopy_block_row(src_buffer[offset_y] + x_crop_blocks, + dst_buffer[offset_y], + compptr->width_in_blocks); + } } } } @@ -336,9 +336,9 @@ do_flip_v (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, LOCAL(void) do_transpose (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, - JDIMENSION x_crop_offset, JDIMENSION y_crop_offset, - jvirt_barray_ptr *src_coef_arrays, - jvirt_barray_ptr *dst_coef_arrays) + JDIMENSION x_crop_offset, JDIMENSION y_crop_offset, + jvirt_barray_ptr *src_coef_arrays, + jvirt_barray_ptr *dst_coef_arrays) /* Transpose source into destination */ { JDIMENSION dst_blk_x, dst_blk_y, x_crop_blocks, y_crop_blocks; @@ -357,25 +357,25 @@ do_transpose (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, x_crop_blocks = x_crop_offset * compptr->h_samp_factor; y_crop_blocks = y_crop_offset * compptr->v_samp_factor; for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks; - dst_blk_y += compptr->v_samp_factor) { + dst_blk_y += compptr->v_samp_factor) { dst_buffer = (*srcinfo->mem->access_virt_barray) - ((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y, - (JDIMENSION) compptr->v_samp_factor, TRUE); + ((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y, + (JDIMENSION) compptr->v_samp_factor, TRUE); for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) { - for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks; - dst_blk_x += compptr->h_samp_factor) { - src_buffer = 
(*srcinfo->mem->access_virt_barray) - ((j_common_ptr) srcinfo, src_coef_arrays[ci], - dst_blk_x + x_crop_blocks, - (JDIMENSION) compptr->h_samp_factor, FALSE); - for (offset_x = 0; offset_x < compptr->h_samp_factor; offset_x++) { - dst_ptr = dst_buffer[offset_y][dst_blk_x + offset_x]; - src_ptr = src_buffer[offset_x][dst_blk_y + offset_y + y_crop_blocks]; - for (i = 0; i < DCTSIZE; i++) - for (j = 0; j < DCTSIZE; j++) - dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j]; - } - } + for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks; + dst_blk_x += compptr->h_samp_factor) { + src_buffer = (*srcinfo->mem->access_virt_barray) + ((j_common_ptr) srcinfo, src_coef_arrays[ci], + dst_blk_x + x_crop_blocks, + (JDIMENSION) compptr->h_samp_factor, FALSE); + for (offset_x = 0; offset_x < compptr->h_samp_factor; offset_x++) { + dst_ptr = dst_buffer[offset_y][dst_blk_x + offset_x]; + src_ptr = src_buffer[offset_x][dst_blk_y + offset_y + y_crop_blocks]; + for (i = 0; i < DCTSIZE; i++) + for (j = 0; j < DCTSIZE; j++) + dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j]; + } + } } } } @@ -384,9 +384,9 @@ do_transpose (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, LOCAL(void) do_rot_90 (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, - JDIMENSION x_crop_offset, JDIMENSION y_crop_offset, - jvirt_barray_ptr *src_coef_arrays, - jvirt_barray_ptr *dst_coef_arrays) + JDIMENSION x_crop_offset, JDIMENSION y_crop_offset, + jvirt_barray_ptr *src_coef_arrays, + jvirt_barray_ptr *dst_coef_arrays) /* 90 degree rotation is equivalent to * 1. Transposing the image; * 2. Horizontal mirroring. 
@@ -413,50 +413,50 @@ do_rot_90 (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, x_crop_blocks = x_crop_offset * compptr->h_samp_factor; y_crop_blocks = y_crop_offset * compptr->v_samp_factor; for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks; - dst_blk_y += compptr->v_samp_factor) { + dst_blk_y += compptr->v_samp_factor) { dst_buffer = (*srcinfo->mem->access_virt_barray) - ((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y, - (JDIMENSION) compptr->v_samp_factor, TRUE); + ((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y, + (JDIMENSION) compptr->v_samp_factor, TRUE); for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) { - for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks; - dst_blk_x += compptr->h_samp_factor) { - if (x_crop_blocks + dst_blk_x < comp_width) { - /* Block is within the mirrorable area. */ - src_buffer = (*srcinfo->mem->access_virt_barray) - ((j_common_ptr) srcinfo, src_coef_arrays[ci], - comp_width - x_crop_blocks - dst_blk_x - - (JDIMENSION) compptr->h_samp_factor, - (JDIMENSION) compptr->h_samp_factor, FALSE); - } else { - /* Edge blocks are transposed but not mirrored. */ - src_buffer = (*srcinfo->mem->access_virt_barray) - ((j_common_ptr) srcinfo, src_coef_arrays[ci], - dst_blk_x + x_crop_blocks, - (JDIMENSION) compptr->h_samp_factor, FALSE); - } - for (offset_x = 0; offset_x < compptr->h_samp_factor; offset_x++) { - dst_ptr = dst_buffer[offset_y][dst_blk_x + offset_x]; - if (x_crop_blocks + dst_blk_x < comp_width) { - /* Block is within the mirrorable area. */ - src_ptr = src_buffer[compptr->h_samp_factor - offset_x - 1] - [dst_blk_y + offset_y + y_crop_blocks]; - for (i = 0; i < DCTSIZE; i++) { - for (j = 0; j < DCTSIZE; j++) - dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j]; - i++; - for (j = 0; j < DCTSIZE; j++) - dst_ptr[j*DCTSIZE+i] = -src_ptr[i*DCTSIZE+j]; - } - } else { - /* Edge blocks are transposed but not mirrored. 
*/ - src_ptr = src_buffer[offset_x] - [dst_blk_y + offset_y + y_crop_blocks]; - for (i = 0; i < DCTSIZE; i++) - for (j = 0; j < DCTSIZE; j++) - dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j]; - } - } - } + for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks; + dst_blk_x += compptr->h_samp_factor) { + if (x_crop_blocks + dst_blk_x < comp_width) { + /* Block is within the mirrorable area. */ + src_buffer = (*srcinfo->mem->access_virt_barray) + ((j_common_ptr) srcinfo, src_coef_arrays[ci], + comp_width - x_crop_blocks - dst_blk_x - + (JDIMENSION) compptr->h_samp_factor, + (JDIMENSION) compptr->h_samp_factor, FALSE); + } else { + /* Edge blocks are transposed but not mirrored. */ + src_buffer = (*srcinfo->mem->access_virt_barray) + ((j_common_ptr) srcinfo, src_coef_arrays[ci], + dst_blk_x + x_crop_blocks, + (JDIMENSION) compptr->h_samp_factor, FALSE); + } + for (offset_x = 0; offset_x < compptr->h_samp_factor; offset_x++) { + dst_ptr = dst_buffer[offset_y][dst_blk_x + offset_x]; + if (x_crop_blocks + dst_blk_x < comp_width) { + /* Block is within the mirrorable area. */ + src_ptr = src_buffer[compptr->h_samp_factor - offset_x - 1] + [dst_blk_y + offset_y + y_crop_blocks]; + for (i = 0; i < DCTSIZE; i++) { + for (j = 0; j < DCTSIZE; j++) + dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j]; + i++; + for (j = 0; j < DCTSIZE; j++) + dst_ptr[j*DCTSIZE+i] = -src_ptr[i*DCTSIZE+j]; + } + } else { + /* Edge blocks are transposed but not mirrored. 
*/ + src_ptr = src_buffer[offset_x] + [dst_blk_y + offset_y + y_crop_blocks]; + for (i = 0; i < DCTSIZE; i++) + for (j = 0; j < DCTSIZE; j++) + dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j]; + } + } + } } } } @@ -465,9 +465,9 @@ do_rot_90 (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, LOCAL(void) do_rot_270 (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, - JDIMENSION x_crop_offset, JDIMENSION y_crop_offset, - jvirt_barray_ptr *src_coef_arrays, - jvirt_barray_ptr *dst_coef_arrays) + JDIMENSION x_crop_offset, JDIMENSION y_crop_offset, + jvirt_barray_ptr *src_coef_arrays, + jvirt_barray_ptr *dst_coef_arrays) /* 270 degree rotation is equivalent to * 1. Horizontal mirroring; * 2. Transposing the image. @@ -494,40 +494,40 @@ do_rot_270 (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, x_crop_blocks = x_crop_offset * compptr->h_samp_factor; y_crop_blocks = y_crop_offset * compptr->v_samp_factor; for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks; - dst_blk_y += compptr->v_samp_factor) { + dst_blk_y += compptr->v_samp_factor) { dst_buffer = (*srcinfo->mem->access_virt_barray) - ((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y, - (JDIMENSION) compptr->v_samp_factor, TRUE); + ((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y, + (JDIMENSION) compptr->v_samp_factor, TRUE); for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) { - for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks; - dst_blk_x += compptr->h_samp_factor) { - src_buffer = (*srcinfo->mem->access_virt_barray) - ((j_common_ptr) srcinfo, src_coef_arrays[ci], - dst_blk_x + x_crop_blocks, - (JDIMENSION) compptr->h_samp_factor, FALSE); - for (offset_x = 0; offset_x < compptr->h_samp_factor; offset_x++) { - dst_ptr = dst_buffer[offset_y][dst_blk_x + offset_x]; - if (y_crop_blocks + dst_blk_y < comp_height) { - /* Block is within the mirrorable area. 
*/ - src_ptr = src_buffer[offset_x] - [comp_height - y_crop_blocks - dst_blk_y - offset_y - 1]; - for (i = 0; i < DCTSIZE; i++) { - for (j = 0; j < DCTSIZE; j++) { - dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j]; - j++; - dst_ptr[j*DCTSIZE+i] = -src_ptr[i*DCTSIZE+j]; - } - } - } else { - /* Edge blocks are transposed but not mirrored. */ - src_ptr = src_buffer[offset_x] - [dst_blk_y + offset_y + y_crop_blocks]; - for (i = 0; i < DCTSIZE; i++) - for (j = 0; j < DCTSIZE; j++) - dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j]; - } - } - } + for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks; + dst_blk_x += compptr->h_samp_factor) { + src_buffer = (*srcinfo->mem->access_virt_barray) + ((j_common_ptr) srcinfo, src_coef_arrays[ci], + dst_blk_x + x_crop_blocks, + (JDIMENSION) compptr->h_samp_factor, FALSE); + for (offset_x = 0; offset_x < compptr->h_samp_factor; offset_x++) { + dst_ptr = dst_buffer[offset_y][dst_blk_x + offset_x]; + if (y_crop_blocks + dst_blk_y < comp_height) { + /* Block is within the mirrorable area. */ + src_ptr = src_buffer[offset_x] + [comp_height - y_crop_blocks - dst_blk_y - offset_y - 1]; + for (i = 0; i < DCTSIZE; i++) { + for (j = 0; j < DCTSIZE; j++) { + dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j]; + j++; + dst_ptr[j*DCTSIZE+i] = -src_ptr[i*DCTSIZE+j]; + } + } + } else { + /* Edge blocks are transposed but not mirrored. 
*/ + src_ptr = src_buffer[offset_x] + [dst_blk_y + offset_y + y_crop_blocks]; + for (i = 0; i < DCTSIZE; i++) + for (j = 0; j < DCTSIZE; j++) + dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j]; + } + } + } } } } @@ -536,9 +536,9 @@ do_rot_270 (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, LOCAL(void) do_rot_180 (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, - JDIMENSION x_crop_offset, JDIMENSION y_crop_offset, - jvirt_barray_ptr *src_coef_arrays, - jvirt_barray_ptr *dst_coef_arrays) + JDIMENSION x_crop_offset, JDIMENSION y_crop_offset, + jvirt_barray_ptr *src_coef_arrays, + jvirt_barray_ptr *dst_coef_arrays) /* 180 degree rotation is equivalent to * 1. Vertical mirroring; * 2. Horizontal mirroring. @@ -565,77 +565,77 @@ do_rot_180 (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, x_crop_blocks = x_crop_offset * compptr->h_samp_factor; y_crop_blocks = y_crop_offset * compptr->v_samp_factor; for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks; - dst_blk_y += compptr->v_samp_factor) { + dst_blk_y += compptr->v_samp_factor) { dst_buffer = (*srcinfo->mem->access_virt_barray) - ((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y, - (JDIMENSION) compptr->v_samp_factor, TRUE); + ((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y, + (JDIMENSION) compptr->v_samp_factor, TRUE); if (y_crop_blocks + dst_blk_y < comp_height) { - /* Row is within the vertically mirrorable area. */ - src_buffer = (*srcinfo->mem->access_virt_barray) - ((j_common_ptr) srcinfo, src_coef_arrays[ci], - comp_height - y_crop_blocks - dst_blk_y - - (JDIMENSION) compptr->v_samp_factor, - (JDIMENSION) compptr->v_samp_factor, FALSE); + /* Row is within the vertically mirrorable area. */ + src_buffer = (*srcinfo->mem->access_virt_barray) + ((j_common_ptr) srcinfo, src_coef_arrays[ci], + comp_height - y_crop_blocks - dst_blk_y - + (JDIMENSION) compptr->v_samp_factor, + (JDIMENSION) compptr->v_samp_factor, FALSE); } else { - /* Bottom-edge rows are only mirrored horizontally. 
*/ - src_buffer = (*srcinfo->mem->access_virt_barray) - ((j_common_ptr) srcinfo, src_coef_arrays[ci], - dst_blk_y + y_crop_blocks, - (JDIMENSION) compptr->v_samp_factor, FALSE); + /* Bottom-edge rows are only mirrored horizontally. */ + src_buffer = (*srcinfo->mem->access_virt_barray) + ((j_common_ptr) srcinfo, src_coef_arrays[ci], + dst_blk_y + y_crop_blocks, + (JDIMENSION) compptr->v_samp_factor, FALSE); } for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) { - dst_row_ptr = dst_buffer[offset_y]; - if (y_crop_blocks + dst_blk_y < comp_height) { - /* Row is within the mirrorable area. */ - src_row_ptr = src_buffer[compptr->v_samp_factor - offset_y - 1]; - for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks; dst_blk_x++) { - dst_ptr = dst_row_ptr[dst_blk_x]; - if (x_crop_blocks + dst_blk_x < comp_width) { - /* Process the blocks that can be mirrored both ways. */ - src_ptr = src_row_ptr[comp_width - x_crop_blocks - dst_blk_x - 1]; - for (i = 0; i < DCTSIZE; i += 2) { - /* For even row, negate every odd column. */ - for (j = 0; j < DCTSIZE; j += 2) { - *dst_ptr++ = *src_ptr++; - *dst_ptr++ = - *src_ptr++; - } - /* For odd row, negate every even column. */ - for (j = 0; j < DCTSIZE; j += 2) { - *dst_ptr++ = - *src_ptr++; - *dst_ptr++ = *src_ptr++; - } - } - } else { - /* Any remaining right-edge blocks are only mirrored vertically. */ - src_ptr = src_row_ptr[x_crop_blocks + dst_blk_x]; - for (i = 0; i < DCTSIZE; i += 2) { - for (j = 0; j < DCTSIZE; j++) - *dst_ptr++ = *src_ptr++; - for (j = 0; j < DCTSIZE; j++) - *dst_ptr++ = - *src_ptr++; - } - } - } - } else { - /* Remaining rows are just mirrored horizontally. */ - src_row_ptr = src_buffer[offset_y]; - for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks; dst_blk_x++) { - if (x_crop_blocks + dst_blk_x < comp_width) { - /* Process the blocks that can be mirrored. 
*/ - dst_ptr = dst_row_ptr[dst_blk_x]; - src_ptr = src_row_ptr[comp_width - x_crop_blocks - dst_blk_x - 1]; - for (i = 0; i < DCTSIZE2; i += 2) { - *dst_ptr++ = *src_ptr++; - *dst_ptr++ = - *src_ptr++; - } - } else { - /* Any remaining right-edge blocks are only copied. */ - jcopy_block_row(src_row_ptr + dst_blk_x + x_crop_blocks, - dst_row_ptr + dst_blk_x, - (JDIMENSION) 1); - } - } - } + dst_row_ptr = dst_buffer[offset_y]; + if (y_crop_blocks + dst_blk_y < comp_height) { + /* Row is within the mirrorable area. */ + src_row_ptr = src_buffer[compptr->v_samp_factor - offset_y - 1]; + for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks; dst_blk_x++) { + dst_ptr = dst_row_ptr[dst_blk_x]; + if (x_crop_blocks + dst_blk_x < comp_width) { + /* Process the blocks that can be mirrored both ways. */ + src_ptr = src_row_ptr[comp_width - x_crop_blocks - dst_blk_x - 1]; + for (i = 0; i < DCTSIZE; i += 2) { + /* For even row, negate every odd column. */ + for (j = 0; j < DCTSIZE; j += 2) { + *dst_ptr++ = *src_ptr++; + *dst_ptr++ = - *src_ptr++; + } + /* For odd row, negate every even column. */ + for (j = 0; j < DCTSIZE; j += 2) { + *dst_ptr++ = - *src_ptr++; + *dst_ptr++ = *src_ptr++; + } + } + } else { + /* Any remaining right-edge blocks are only mirrored vertically. */ + src_ptr = src_row_ptr[x_crop_blocks + dst_blk_x]; + for (i = 0; i < DCTSIZE; i += 2) { + for (j = 0; j < DCTSIZE; j++) + *dst_ptr++ = *src_ptr++; + for (j = 0; j < DCTSIZE; j++) + *dst_ptr++ = - *src_ptr++; + } + } + } + } else { + /* Remaining rows are just mirrored horizontally. */ + src_row_ptr = src_buffer[offset_y]; + for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks; dst_blk_x++) { + if (x_crop_blocks + dst_blk_x < comp_width) { + /* Process the blocks that can be mirrored. 
*/ + dst_ptr = dst_row_ptr[dst_blk_x]; + src_ptr = src_row_ptr[comp_width - x_crop_blocks - dst_blk_x - 1]; + for (i = 0; i < DCTSIZE2; i += 2) { + *dst_ptr++ = *src_ptr++; + *dst_ptr++ = - *src_ptr++; + } + } else { + /* Any remaining right-edge blocks are only copied. */ + jcopy_block_row(src_row_ptr + dst_blk_x + x_crop_blocks, + dst_row_ptr + dst_blk_x, + (JDIMENSION) 1); + } + } + } } } } @@ -644,9 +644,9 @@ do_rot_180 (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, LOCAL(void) do_transverse (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, - JDIMENSION x_crop_offset, JDIMENSION y_crop_offset, - jvirt_barray_ptr *src_coef_arrays, - jvirt_barray_ptr *dst_coef_arrays) + JDIMENSION x_crop_offset, JDIMENSION y_crop_offset, + jvirt_barray_ptr *src_coef_arrays, + jvirt_barray_ptr *dst_coef_arrays) /* Transverse transpose is equivalent to * 1. 180 degree rotation; * 2. Transposition; @@ -676,81 +676,81 @@ do_transverse (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, x_crop_blocks = x_crop_offset * compptr->h_samp_factor; y_crop_blocks = y_crop_offset * compptr->v_samp_factor; for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks; - dst_blk_y += compptr->v_samp_factor) { + dst_blk_y += compptr->v_samp_factor) { dst_buffer = (*srcinfo->mem->access_virt_barray) - ((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y, - (JDIMENSION) compptr->v_samp_factor, TRUE); + ((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y, + (JDIMENSION) compptr->v_samp_factor, TRUE); for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) { - for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks; - dst_blk_x += compptr->h_samp_factor) { - if (x_crop_blocks + dst_blk_x < comp_width) { - /* Block is within the mirrorable area. 
*/ - src_buffer = (*srcinfo->mem->access_virt_barray) - ((j_common_ptr) srcinfo, src_coef_arrays[ci], - comp_width - x_crop_blocks - dst_blk_x - - (JDIMENSION) compptr->h_samp_factor, - (JDIMENSION) compptr->h_samp_factor, FALSE); - } else { - src_buffer = (*srcinfo->mem->access_virt_barray) - ((j_common_ptr) srcinfo, src_coef_arrays[ci], - dst_blk_x + x_crop_blocks, - (JDIMENSION) compptr->h_samp_factor, FALSE); - } - for (offset_x = 0; offset_x < compptr->h_samp_factor; offset_x++) { - dst_ptr = dst_buffer[offset_y][dst_blk_x + offset_x]; - if (y_crop_blocks + dst_blk_y < comp_height) { - if (x_crop_blocks + dst_blk_x < comp_width) { - /* Block is within the mirrorable area. */ - src_ptr = src_buffer[compptr->h_samp_factor - offset_x - 1] - [comp_height - y_crop_blocks - dst_blk_y - offset_y - 1]; - for (i = 0; i < DCTSIZE; i++) { - for (j = 0; j < DCTSIZE; j++) { - dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j]; - j++; - dst_ptr[j*DCTSIZE+i] = -src_ptr[i*DCTSIZE+j]; - } - i++; - for (j = 0; j < DCTSIZE; j++) { - dst_ptr[j*DCTSIZE+i] = -src_ptr[i*DCTSIZE+j]; - j++; - dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j]; - } - } - } else { - /* Right-edge blocks are mirrored in y only */ - src_ptr = src_buffer[offset_x] - [comp_height - y_crop_blocks - dst_blk_y - offset_y - 1]; - for (i = 0; i < DCTSIZE; i++) { - for (j = 0; j < DCTSIZE; j++) { - dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j]; - j++; - dst_ptr[j*DCTSIZE+i] = -src_ptr[i*DCTSIZE+j]; - } - } - } - } else { - if (x_crop_blocks + dst_blk_x < comp_width) { - /* Bottom-edge blocks are mirrored in x only */ - src_ptr = src_buffer[compptr->h_samp_factor - offset_x - 1] - [dst_blk_y + offset_y + y_crop_blocks]; - for (i = 0; i < DCTSIZE; i++) { - for (j = 0; j < DCTSIZE; j++) - dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j]; - i++; - for (j = 0; j < DCTSIZE; j++) - dst_ptr[j*DCTSIZE+i] = -src_ptr[i*DCTSIZE+j]; - } - } else { - /* At lower right corner, just transpose, no mirroring */ - src_ptr = src_buffer[offset_x] - 
[dst_blk_y + offset_y + y_crop_blocks]; - for (i = 0; i < DCTSIZE; i++) - for (j = 0; j < DCTSIZE; j++) - dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j]; - } - } - } - } + for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks; + dst_blk_x += compptr->h_samp_factor) { + if (x_crop_blocks + dst_blk_x < comp_width) { + /* Block is within the mirrorable area. */ + src_buffer = (*srcinfo->mem->access_virt_barray) + ((j_common_ptr) srcinfo, src_coef_arrays[ci], + comp_width - x_crop_blocks - dst_blk_x - + (JDIMENSION) compptr->h_samp_factor, + (JDIMENSION) compptr->h_samp_factor, FALSE); + } else { + src_buffer = (*srcinfo->mem->access_virt_barray) + ((j_common_ptr) srcinfo, src_coef_arrays[ci], + dst_blk_x + x_crop_blocks, + (JDIMENSION) compptr->h_samp_factor, FALSE); + } + for (offset_x = 0; offset_x < compptr->h_samp_factor; offset_x++) { + dst_ptr = dst_buffer[offset_y][dst_blk_x + offset_x]; + if (y_crop_blocks + dst_blk_y < comp_height) { + if (x_crop_blocks + dst_blk_x < comp_width) { + /* Block is within the mirrorable area. 
*/ + src_ptr = src_buffer[compptr->h_samp_factor - offset_x - 1] + [comp_height - y_crop_blocks - dst_blk_y - offset_y - 1]; + for (i = 0; i < DCTSIZE; i++) { + for (j = 0; j < DCTSIZE; j++) { + dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j]; + j++; + dst_ptr[j*DCTSIZE+i] = -src_ptr[i*DCTSIZE+j]; + } + i++; + for (j = 0; j < DCTSIZE; j++) { + dst_ptr[j*DCTSIZE+i] = -src_ptr[i*DCTSIZE+j]; + j++; + dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j]; + } + } + } else { + /* Right-edge blocks are mirrored in y only */ + src_ptr = src_buffer[offset_x] + [comp_height - y_crop_blocks - dst_blk_y - offset_y - 1]; + for (i = 0; i < DCTSIZE; i++) { + for (j = 0; j < DCTSIZE; j++) { + dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j]; + j++; + dst_ptr[j*DCTSIZE+i] = -src_ptr[i*DCTSIZE+j]; + } + } + } + } else { + if (x_crop_blocks + dst_blk_x < comp_width) { + /* Bottom-edge blocks are mirrored in x only */ + src_ptr = src_buffer[compptr->h_samp_factor - offset_x - 1] + [dst_blk_y + offset_y + y_crop_blocks]; + for (i = 0; i < DCTSIZE; i++) { + for (j = 0; j < DCTSIZE; j++) + dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j]; + i++; + for (j = 0; j < DCTSIZE; j++) + dst_ptr[j*DCTSIZE+i] = -src_ptr[i*DCTSIZE+j]; + } + } else { + /* At lower right corner, just transpose, no mirroring */ + src_ptr = src_buffer[offset_x] + [dst_blk_y + offset_y + y_crop_blocks]; + for (i = 0; i < DCTSIZE; i++) + for (j = 0; j < DCTSIZE; j++) + dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j]; + } + } + } + } } } } @@ -773,7 +773,7 @@ jt_read_integer (const char ** strptr, JDIMENSION * result) } *result = val; if (ptr == *strptr) - return FALSE; /* oops, no digits */ + return FALSE; /* oops, no digits */ *strptr = ptr; return TRUE; } @@ -783,7 +783,7 @@ jt_read_integer (const char ** strptr, JDIMENSION * result) * The routine returns TRUE if the spec string is valid, FALSE if not. 
* * The crop spec string should have the format - * [f]x[f]{+-}{+-} + * [f]x[f]{+-}{+-} * where width, height, xoffset, and yoffset are unsigned integers. * Each of the elements can be omitted to indicate a default value. * (A weakness of this style is that it is not possible to omit xoffset @@ -888,7 +888,7 @@ trim_bottom_edge (jpeg_transform_info *info, JDIMENSION full_height) GLOBAL(boolean) jtransform_request_workspace (j_decompress_ptr srcinfo, - jpeg_transform_info *info) + jpeg_transform_info *info) { jvirt_barray_ptr *coef_arrays; boolean need_workspace, transpose_it; @@ -921,18 +921,18 @@ jtransform_request_workspace (j_decompress_ptr srcinfo, if (info->perfect) { if (info->num_components == 1) { if (!jtransform_perfect_transform(srcinfo->output_width, - srcinfo->output_height, - srcinfo->_min_DCT_h_scaled_size, - srcinfo->_min_DCT_v_scaled_size, - info->transform)) - return FALSE; + srcinfo->output_height, + srcinfo->_min_DCT_h_scaled_size, + srcinfo->_min_DCT_v_scaled_size, + info->transform)) + return FALSE; } else { if (!jtransform_perfect_transform(srcinfo->output_width, - srcinfo->output_height, - srcinfo->max_h_samp_factor * srcinfo->_min_DCT_h_scaled_size, - srcinfo->max_v_samp_factor * srcinfo->_min_DCT_v_scaled_size, - info->transform)) - return FALSE; + srcinfo->output_height, + srcinfo->max_h_samp_factor * srcinfo->_min_DCT_h_scaled_size, + srcinfo->max_v_samp_factor * srcinfo->_min_DCT_v_scaled_size, + info->transform)) + return FALSE; } } @@ -953,9 +953,9 @@ jtransform_request_workspace (j_decompress_ptr srcinfo, info->iMCU_sample_height = srcinfo->_min_DCT_h_scaled_size; } else { info->iMCU_sample_width = - srcinfo->max_v_samp_factor * srcinfo->_min_DCT_v_scaled_size; + srcinfo->max_v_samp_factor * srcinfo->_min_DCT_v_scaled_size; info->iMCU_sample_height = - srcinfo->max_h_samp_factor * srcinfo->_min_DCT_h_scaled_size; + srcinfo->max_h_samp_factor * srcinfo->_min_DCT_h_scaled_size; } break; default: @@ -966,9 +966,9 @@ 
jtransform_request_workspace (j_decompress_ptr srcinfo, info->iMCU_sample_height = srcinfo->_min_DCT_v_scaled_size; } else { info->iMCU_sample_width = - srcinfo->max_h_samp_factor * srcinfo->_min_DCT_h_scaled_size; + srcinfo->max_h_samp_factor * srcinfo->_min_DCT_h_scaled_size; info->iMCU_sample_height = - srcinfo->max_v_samp_factor * srcinfo->_min_DCT_v_scaled_size; + srcinfo->max_v_samp_factor * srcinfo->_min_DCT_v_scaled_size; } break; } @@ -979,11 +979,11 @@ jtransform_request_workspace (j_decompress_ptr srcinfo, if (info->crop) { /* Insert default values for unset crop parameters */ if (info->crop_xoffset_set == JCROP_UNSET) - info->crop_xoffset = 0; /* default to +0 */ + info->crop_xoffset = 0; /* default to +0 */ if (info->crop_yoffset_set == JCROP_UNSET) - info->crop_yoffset = 0; /* default to +0 */ + info->crop_yoffset = 0; /* default to +0 */ if (info->crop_xoffset >= info->output_width || - info->crop_yoffset >= info->output_height) + info->crop_yoffset >= info->output_height) ERREXIT(srcinfo, JERR_BAD_CROP_SPEC); if (info->crop_width_set == JCROP_UNSET) info->crop_width = info->output_width - info->crop_xoffset; @@ -991,9 +991,9 @@ jtransform_request_workspace (j_decompress_ptr srcinfo, info->crop_height = info->output_height - info->crop_yoffset; /* Ensure parameters are valid */ if (info->crop_width <= 0 || info->crop_width > info->output_width || - info->crop_height <= 0 || info->crop_height > info->output_height || - info->crop_xoffset > info->output_width - info->crop_width || - info->crop_yoffset > info->output_height - info->crop_height) + info->crop_height <= 0 || info->crop_height > info->output_height || + info->crop_xoffset > info->output_width - info->crop_width || + info->crop_yoffset > info->output_height - info->crop_height) ERREXIT(srcinfo, JERR_BAD_CROP_SPEC); /* Convert negative crop offsets into regular offsets */ if (info->crop_xoffset_set == JCROP_NEG) @@ -1093,30 +1093,30 @@ jtransform_request_workspace (j_decompress_ptr srcinfo, 
if (need_workspace) { coef_arrays = (jvirt_barray_ptr *) (*srcinfo->mem->alloc_small) ((j_common_ptr) srcinfo, JPOOL_IMAGE, - SIZEOF(jvirt_barray_ptr) * info->num_components); + SIZEOF(jvirt_barray_ptr) * info->num_components); width_in_iMCUs = (JDIMENSION) jdiv_round_up((long) info->output_width, - (long) info->iMCU_sample_width); + (long) info->iMCU_sample_width); height_in_iMCUs = (JDIMENSION) jdiv_round_up((long) info->output_height, - (long) info->iMCU_sample_height); + (long) info->iMCU_sample_height); for (ci = 0; ci < info->num_components; ci++) { compptr = srcinfo->comp_info + ci; if (info->num_components == 1) { - /* we're going to force samp factors to 1x1 in this case */ - h_samp_factor = v_samp_factor = 1; + /* we're going to force samp factors to 1x1 in this case */ + h_samp_factor = v_samp_factor = 1; } else if (transpose_it) { - h_samp_factor = compptr->v_samp_factor; - v_samp_factor = compptr->h_samp_factor; + h_samp_factor = compptr->v_samp_factor; + v_samp_factor = compptr->h_samp_factor; } else { - h_samp_factor = compptr->h_samp_factor; - v_samp_factor = compptr->v_samp_factor; + h_samp_factor = compptr->h_samp_factor; + v_samp_factor = compptr->v_samp_factor; } width_in_blocks = width_in_iMCUs * h_samp_factor; height_in_blocks = height_in_iMCUs * v_samp_factor; coef_arrays[ci] = (*srcinfo->mem->request_virt_barray) - ((j_common_ptr) srcinfo, JPOOL_IMAGE, FALSE, - width_in_blocks, height_in_blocks, (JDIMENSION) v_samp_factor); + ((j_common_ptr) srcinfo, JPOOL_IMAGE, FALSE, + width_in_blocks, height_in_blocks, (JDIMENSION) v_samp_factor); } info->workspace_coef_arrays = coef_arrays; } else @@ -1160,11 +1160,11 @@ transpose_critical_parameters (j_compress_ptr dstinfo) qtblptr = dstinfo->quant_tbl_ptrs[tblno]; if (qtblptr != NULL) { for (i = 0; i < DCTSIZE; i++) { - for (j = 0; j < i; j++) { - qtemp = qtblptr->quantval[i*DCTSIZE+j]; - qtblptr->quantval[i*DCTSIZE+j] = qtblptr->quantval[j*DCTSIZE+i]; - qtblptr->quantval[j*DCTSIZE+i] = qtemp; - } + 
for (j = 0; j < i; j++) { + qtemp = qtblptr->quantval[i*DCTSIZE+j]; + qtblptr->quantval[i*DCTSIZE+j] = qtblptr->quantval[j*DCTSIZE+i]; + qtblptr->quantval[j*DCTSIZE+i] = qtemp; + } } } } @@ -1179,7 +1179,7 @@ transpose_critical_parameters (j_compress_ptr dstinfo) #if JPEG_LIB_VERSION >= 70 LOCAL(void) adjust_exif_parameters (JOCTET FAR * data, unsigned int length, - JDIMENSION new_width, JDIMENSION new_height) + JDIMENSION new_width, JDIMENSION new_height) { boolean is_motorola; /* Flag for byte order */ unsigned int number_of_tags, tagnum; @@ -1296,31 +1296,31 @@ adjust_exif_parameters (JOCTET FAR * data, unsigned int length, } if (tagnum == 0xA002 || tagnum == 0xA003) { if (tagnum == 0xA002) - new_value = new_width; /* ExifImageWidth Tag */ + new_value = new_width; /* ExifImageWidth Tag */ else - new_value = new_height; /* ExifImageHeight Tag */ + new_value = new_height; /* ExifImageHeight Tag */ if (is_motorola) { - data[offset+2] = 0; /* Format = unsigned long (4 octets) */ - data[offset+3] = 4; - data[offset+4] = 0; /* Number Of Components = 1 */ - data[offset+5] = 0; - data[offset+6] = 0; - data[offset+7] = 1; - data[offset+8] = 0; - data[offset+9] = 0; - data[offset+10] = (JOCTET)((new_value >> 8) & 0xFF); - data[offset+11] = (JOCTET)(new_value & 0xFF); + data[offset+2] = 0; /* Format = unsigned long (4 octets) */ + data[offset+3] = 4; + data[offset+4] = 0; /* Number Of Components = 1 */ + data[offset+5] = 0; + data[offset+6] = 0; + data[offset+7] = 1; + data[offset+8] = 0; + data[offset+9] = 0; + data[offset+10] = (JOCTET)((new_value >> 8) & 0xFF); + data[offset+11] = (JOCTET)(new_value & 0xFF); } else { - data[offset+2] = 4; /* Format = unsigned long (4 octets) */ - data[offset+3] = 0; - data[offset+4] = 1; /* Number Of Components = 1 */ - data[offset+5] = 0; - data[offset+6] = 0; - data[offset+7] = 0; - data[offset+8] = (JOCTET)(new_value & 0xFF); - data[offset+9] = (JOCTET)((new_value >> 8) & 0xFF); - data[offset+10] = 0; - data[offset+11] = 0; + 
data[offset+2] = 4; /* Format = unsigned long (4 octets) */ + data[offset+3] = 0; + data[offset+4] = 1; /* Number Of Components = 1 */ + data[offset+5] = 0; + data[offset+6] = 0; + data[offset+7] = 0; + data[offset+8] = (JOCTET)(new_value & 0xFF); + data[offset+9] = (JOCTET)((new_value >> 8) & 0xFF); + data[offset+10] = 0; + data[offset+11] = 0; } } offset += 12; @@ -1342,9 +1342,9 @@ adjust_exif_parameters (JOCTET FAR * data, unsigned int length, GLOBAL(jvirt_barray_ptr *) jtransform_adjust_parameters (j_decompress_ptr srcinfo, - j_compress_ptr dstinfo, - jvirt_barray_ptr *src_coef_arrays, - jpeg_transform_info *info) + j_compress_ptr dstinfo, + jvirt_barray_ptr *src_coef_arrays, + jpeg_transform_info *info) { /* If force-to-grayscale is requested, adjust destination parameters */ if (info->force_grayscale) { @@ -1354,11 +1354,11 @@ jtransform_adjust_parameters (j_decompress_ptr srcinfo, * isn't worth extra code space. But we check it to avoid crashing.) */ if (((dstinfo->jpeg_color_space == JCS_YCbCr && - dstinfo->num_components == 3) || - (dstinfo->jpeg_color_space == JCS_GRAYSCALE && - dstinfo->num_components == 1)) && - srcinfo->comp_info[0].h_samp_factor == srcinfo->max_h_samp_factor && - srcinfo->comp_info[0].v_samp_factor == srcinfo->max_v_samp_factor) { + dstinfo->num_components == 3) || + (dstinfo->jpeg_color_space == JCS_GRAYSCALE && + dstinfo->num_components == 1)) && + srcinfo->comp_info[0].h_samp_factor == srcinfo->max_h_samp_factor && + srcinfo->comp_info[0].v_samp_factor == srcinfo->max_v_samp_factor) { /* We use jpeg_set_colorspace to make sure subsidiary settings get fixed * properly. Among other things, it sets the target h_samp_factor & * v_samp_factor to 1, which typically won't match the source. 
@@ -1423,11 +1423,11 @@ jtransform_adjust_parameters (j_decompress_ptr srcinfo, #if JPEG_LIB_VERSION >= 70 /* Adjust Exif image parameters */ if (dstinfo->jpeg_width != srcinfo->image_width || - dstinfo->jpeg_height != srcinfo->image_height) + dstinfo->jpeg_height != srcinfo->image_height) /* Align data segment to start of TIFF structure for parsing */ adjust_exif_parameters(srcinfo->marker_list->data + 6, - srcinfo->marker_list->data_length - 6, - dstinfo->jpeg_width, dstinfo->jpeg_height); + srcinfo->marker_list->data_length - 6, + dstinfo->jpeg_width, dstinfo->jpeg_height); #endif } @@ -1449,9 +1449,9 @@ jtransform_adjust_parameters (j_decompress_ptr srcinfo, GLOBAL(void) jtransform_execute_transform (j_decompress_ptr srcinfo, - j_compress_ptr dstinfo, - jvirt_barray_ptr *src_coef_arrays, - jpeg_transform_info *info) + j_compress_ptr dstinfo, + jvirt_barray_ptr *src_coef_arrays, + jpeg_transform_info *info) { jvirt_barray_ptr *dst_coef_arrays = info->workspace_coef_arrays; @@ -1462,39 +1462,39 @@ jtransform_execute_transform (j_decompress_ptr srcinfo, case JXFORM_NONE: if (info->x_crop_offset != 0 || info->y_crop_offset != 0) do_crop(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset, - src_coef_arrays, dst_coef_arrays); + src_coef_arrays, dst_coef_arrays); break; case JXFORM_FLIP_H: if (info->y_crop_offset != 0 || info->slow_hflip) do_flip_h(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset, - src_coef_arrays, dst_coef_arrays); + src_coef_arrays, dst_coef_arrays); else do_flip_h_no_crop(srcinfo, dstinfo, info->x_crop_offset, - src_coef_arrays); + src_coef_arrays); break; case JXFORM_FLIP_V: do_flip_v(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset, - src_coef_arrays, dst_coef_arrays); + src_coef_arrays, dst_coef_arrays); break; case JXFORM_TRANSPOSE: do_transpose(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset, - src_coef_arrays, dst_coef_arrays); + src_coef_arrays, dst_coef_arrays); break; case JXFORM_TRANSVERSE: 
do_transverse(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset, - src_coef_arrays, dst_coef_arrays); + src_coef_arrays, dst_coef_arrays); break; case JXFORM_ROT_90: do_rot_90(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset, - src_coef_arrays, dst_coef_arrays); + src_coef_arrays, dst_coef_arrays); break; case JXFORM_ROT_180: do_rot_180(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset, - src_coef_arrays, dst_coef_arrays); + src_coef_arrays, dst_coef_arrays); break; case JXFORM_ROT_270: do_rot_270(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset, - src_coef_arrays, dst_coef_arrays); + src_coef_arrays, dst_coef_arrays); break; } } @@ -1522,8 +1522,8 @@ jtransform_execute_transform (j_decompress_ptr srcinfo, GLOBAL(boolean) jtransform_perfect_transform(JDIMENSION image_width, JDIMENSION image_height, - int MCU_width, int MCU_height, - JXFORM_CODE transform) + int MCU_width, int MCU_height, + JXFORM_CODE transform) { boolean result = TRUE; /* initialize TRUE */ @@ -1586,7 +1586,7 @@ jcopy_markers_setup (j_decompress_ptr srcinfo, JCOPY_OPTION option) GLOBAL(void) jcopy_markers_execute (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, - JCOPY_OPTION option) + JCOPY_OPTION option) { jpeg_saved_marker_ptr marker; @@ -1597,34 +1597,34 @@ jcopy_markers_execute (j_decompress_ptr srcinfo, j_compress_ptr dstinfo, */ for (marker = srcinfo->marker_list; marker != NULL; marker = marker->next) { if (dstinfo->write_JFIF_header && - marker->marker == JPEG_APP0 && - marker->data_length >= 5 && - GETJOCTET(marker->data[0]) == 0x4A && - GETJOCTET(marker->data[1]) == 0x46 && - GETJOCTET(marker->data[2]) == 0x49 && - GETJOCTET(marker->data[3]) == 0x46 && - GETJOCTET(marker->data[4]) == 0) - continue; /* reject duplicate JFIF */ + marker->marker == JPEG_APP0 && + marker->data_length >= 5 && + GETJOCTET(marker->data[0]) == 0x4A && + GETJOCTET(marker->data[1]) == 0x46 && + GETJOCTET(marker->data[2]) == 0x49 && + GETJOCTET(marker->data[3]) == 0x46 && + 
GETJOCTET(marker->data[4]) == 0) + continue; /* reject duplicate JFIF */ if (dstinfo->write_Adobe_marker && - marker->marker == JPEG_APP0+14 && - marker->data_length >= 5 && - GETJOCTET(marker->data[0]) == 0x41 && - GETJOCTET(marker->data[1]) == 0x64 && - GETJOCTET(marker->data[2]) == 0x6F && - GETJOCTET(marker->data[3]) == 0x62 && - GETJOCTET(marker->data[4]) == 0x65) - continue; /* reject duplicate Adobe */ + marker->marker == JPEG_APP0+14 && + marker->data_length >= 5 && + GETJOCTET(marker->data[0]) == 0x41 && + GETJOCTET(marker->data[1]) == 0x64 && + GETJOCTET(marker->data[2]) == 0x6F && + GETJOCTET(marker->data[3]) == 0x62 && + GETJOCTET(marker->data[4]) == 0x65) + continue; /* reject duplicate Adobe */ #ifdef NEED_FAR_POINTERS /* We could use jpeg_write_marker if the data weren't FAR... */ { unsigned int i; jpeg_write_m_header(dstinfo, marker->marker, marker->data_length); for (i = 0; i < marker->data_length; i++) - jpeg_write_m_byte(dstinfo, marker->data[i]); + jpeg_write_m_byte(dstinfo, marker->data[i]); } #else jpeg_write_marker(dstinfo, marker->marker, - marker->data, marker->data_length); + marker->data, marker->data_length); #endif } } diff --git a/transupp.h b/transupp.h index cfbaca40f..710b345b0 100644 --- a/transupp.h +++ b/transupp.h @@ -19,7 +19,7 @@ /* If you happen not to want the image transform support, disable it here */ #ifndef TRANSFORMS_SUPPORTED -#define TRANSFORMS_SUPPORTED 1 /* 0 disables transform code */ +#define TRANSFORMS_SUPPORTED 1 /* 0 disables transform code */ #endif /* @@ -80,13 +80,13 @@ /* Short forms of external names for systems with brain-damaged linkers. 
*/ #ifdef NEED_SHORT_EXTERNAL_NAMES -#define jtransform_parse_crop_spec jTrParCrop -#define jtransform_request_workspace jTrRequest -#define jtransform_adjust_parameters jTrAdjust -#define jtransform_execute_transform jTrExec -#define jtransform_perfect_transform jTrPerfect -#define jcopy_markers_setup jCMrkSetup -#define jcopy_markers_execute jCMrkExec +#define jtransform_parse_crop_spec jTrParCrop +#define jtransform_request_workspace jTrRequest +#define jtransform_adjust_parameters jTrAdjust +#define jtransform_execute_transform jTrExec +#define jtransform_perfect_transform jTrPerfect +#define jcopy_markers_setup jCMrkSetup +#define jcopy_markers_execute jCMrkExec #endif /* NEED_SHORT_EXTERNAL_NAMES */ @@ -95,14 +95,14 @@ */ typedef enum { - JXFORM_NONE, /* no transformation */ - JXFORM_FLIP_H, /* horizontal flip */ - JXFORM_FLIP_V, /* vertical flip */ - JXFORM_TRANSPOSE, /* transpose across UL-to-LR axis */ - JXFORM_TRANSVERSE, /* transpose across UR-to-LL axis */ - JXFORM_ROT_90, /* 90-degree clockwise rotation */ - JXFORM_ROT_180, /* 180-degree rotation */ - JXFORM_ROT_270 /* 270-degree clockwise (or 90 ccw) */ + JXFORM_NONE, /* no transformation */ + JXFORM_FLIP_H, /* horizontal flip */ + JXFORM_FLIP_V, /* vertical flip */ + JXFORM_TRANSPOSE, /* transpose across UL-to-LR axis */ + JXFORM_TRANSVERSE, /* transpose across UR-to-LL axis */ + JXFORM_ROT_90, /* 90-degree clockwise rotation */ + JXFORM_ROT_180, /* 180-degree rotation */ + JXFORM_ROT_270 /* 270-degree clockwise (or 90 ccw) */ } JXFORM_CODE; /* @@ -126,11 +126,11 @@ typedef enum { typedef struct { /* Options: set by caller */ - JXFORM_CODE transform; /* image transform operator */ - boolean perfect; /* if TRUE, fail if partial MCUs are requested */ - boolean trim; /* if TRUE, trim partial MCUs as needed */ - boolean force_grayscale; /* if TRUE, convert color image to grayscale */ - boolean crop; /* if TRUE, crop source image */ + JXFORM_CODE transform; /* image transform operator */ + boolean 
perfect; /* if TRUE, fail if partial MCUs are requested */ + boolean trim; /* if TRUE, trim partial MCUs as needed */ + boolean force_grayscale; /* if TRUE, convert color image to grayscale */ + boolean crop; /* if TRUE, crop source image */ boolean slow_hflip; /* For best performance, the JXFORM_FLIP_H transform normally modifies the source coefficients in place. Setting this to TRUE will instead use a slower, @@ -142,23 +142,23 @@ typedef struct { /* Crop parameters: application need not set these unless crop is TRUE. * These can be filled in by jtransform_parse_crop_spec(). */ - JDIMENSION crop_width; /* Width of selected region */ - JCROP_CODE crop_width_set; /* (forced disables adjustment) */ - JDIMENSION crop_height; /* Height of selected region */ - JCROP_CODE crop_height_set; /* (forced disables adjustment) */ - JDIMENSION crop_xoffset; /* X offset of selected region */ - JCROP_CODE crop_xoffset_set; /* (negative measures from right edge) */ - JDIMENSION crop_yoffset; /* Y offset of selected region */ - JCROP_CODE crop_yoffset_set; /* (negative measures from bottom edge) */ + JDIMENSION crop_width; /* Width of selected region */ + JCROP_CODE crop_width_set; /* (forced disables adjustment) */ + JDIMENSION crop_height; /* Height of selected region */ + JCROP_CODE crop_height_set; /* (forced disables adjustment) */ + JDIMENSION crop_xoffset; /* X offset of selected region */ + JCROP_CODE crop_xoffset_set; /* (negative measures from right edge) */ + JDIMENSION crop_yoffset; /* Y offset of selected region */ + JCROP_CODE crop_yoffset_set; /* (negative measures from bottom edge) */ /* Internal workspace: caller should not touch these */ - int num_components; /* # of components in workspace */ + int num_components; /* # of components in workspace */ jvirt_barray_ptr * workspace_coef_arrays; /* workspace for transformations */ - JDIMENSION output_width; /* cropped destination dimensions */ + JDIMENSION output_width; /* cropped destination dimensions */ JDIMENSION 
output_height; - JDIMENSION x_crop_offset; /* destination crop offsets measured in iMCUs */ + JDIMENSION x_crop_offset; /* destination crop offsets measured in iMCUs */ JDIMENSION y_crop_offset; - int iMCU_sample_width; /* destination iMCU size */ + int iMCU_sample_width; /* destination iMCU size */ int iMCU_sample_height; } jpeg_transform_info; @@ -167,34 +167,34 @@ typedef struct { /* Parse a crop specification (written in X11 geometry style) */ EXTERN(boolean) jtransform_parse_crop_spec - JPP((jpeg_transform_info *info, const char *spec)); + JPP((jpeg_transform_info *info, const char *spec)); /* Request any required workspace */ EXTERN(boolean) jtransform_request_workspace - JPP((j_decompress_ptr srcinfo, jpeg_transform_info *info)); + JPP((j_decompress_ptr srcinfo, jpeg_transform_info *info)); /* Adjust output image parameters */ EXTERN(jvirt_barray_ptr *) jtransform_adjust_parameters - JPP((j_decompress_ptr srcinfo, j_compress_ptr dstinfo, - jvirt_barray_ptr *src_coef_arrays, - jpeg_transform_info *info)); + JPP((j_decompress_ptr srcinfo, j_compress_ptr dstinfo, + jvirt_barray_ptr *src_coef_arrays, + jpeg_transform_info *info)); /* Execute the actual transformation, if any */ EXTERN(void) jtransform_execute_transform - JPP((j_decompress_ptr srcinfo, j_compress_ptr dstinfo, - jvirt_barray_ptr *src_coef_arrays, - jpeg_transform_info *info)); + JPP((j_decompress_ptr srcinfo, j_compress_ptr dstinfo, + jvirt_barray_ptr *src_coef_arrays, + jpeg_transform_info *info)); /* Determine whether lossless transformation is perfectly * possible for a specified image and transformation. 
*/ EXTERN(boolean) jtransform_perfect_transform - JPP((JDIMENSION image_width, JDIMENSION image_height, - int MCU_width, int MCU_height, - JXFORM_CODE transform)); + JPP((JDIMENSION image_width, JDIMENSION image_height, + int MCU_width, int MCU_height, + JXFORM_CODE transform)); /* jtransform_execute_transform used to be called * jtransform_execute_transformation, but some compilers complain about * routine names that long. This macro is here to avoid breaking any * old source code that uses the original name... */ -#define jtransform_execute_transformation jtransform_execute_transform +#define jtransform_execute_transformation jtransform_execute_transform #endif /* TRANSFORMS_SUPPORTED */ @@ -204,17 +204,17 @@ EXTERN(boolean) jtransform_perfect_transform */ typedef enum { - JCOPYOPT_NONE, /* copy no optional markers */ - JCOPYOPT_COMMENTS, /* copy only comment (COM) markers */ - JCOPYOPT_ALL /* copy all optional markers */ + JCOPYOPT_NONE, /* copy no optional markers */ + JCOPYOPT_COMMENTS, /* copy only comment (COM) markers */ + JCOPYOPT_ALL /* copy all optional markers */ } JCOPY_OPTION; -#define JCOPYOPT_DEFAULT JCOPYOPT_COMMENTS /* recommended default */ +#define JCOPYOPT_DEFAULT JCOPYOPT_COMMENTS /* recommended default */ /* Setup decompression object to save desired markers in memory */ EXTERN(void) jcopy_markers_setup - JPP((j_decompress_ptr srcinfo, JCOPY_OPTION option)); + JPP((j_decompress_ptr srcinfo, JCOPY_OPTION option)); /* Copy markers saved in the given source object to the destination object */ EXTERN(void) jcopy_markers_execute - JPP((j_decompress_ptr srcinfo, j_compress_ptr dstinfo, - JCOPY_OPTION option)); + JPP((j_decompress_ptr srcinfo, j_compress_ptr dstinfo, + JCOPY_OPTION option)); diff --git a/usage.txt b/usage.txt index 775a5440b..14ab77b2f 100644 --- a/usage.txt +++ b/usage.txt @@ -26,27 +26,27 @@ We provide two programs, cjpeg to compress an image file into JPEG format, and djpeg to decompress a JPEG file back into a conventional image 
format. On Unix-like systems, you say: - cjpeg [switches] [imagefile] >jpegfile + cjpeg [switches] [imagefile] >jpegfile or - djpeg [switches] [jpegfile] >imagefile + djpeg [switches] [jpegfile] >imagefile The programs read the specified input file, or standard input if none is named. They always write to standard output (with trace/error messages to standard error). These conventions are handy for piping images between programs. On most non-Unix systems, you say: - cjpeg [switches] imagefile jpegfile + cjpeg [switches] imagefile jpegfile or - djpeg [switches] jpegfile imagefile + djpeg [switches] jpegfile imagefile i.e., both the input and output files are named on the command line. This style is a little more foolproof, and it loses no functionality if you don't have pipes. (You can get this style on Unix too, if you prefer, by defining TWO_FILE_COMMANDLINE when you compile the programs; see install.txt.) You can also say: - cjpeg [switches] -outfile jpegfile imagefile + cjpeg [switches] -outfile jpegfile imagefile or - djpeg [switches] -outfile imagefile jpegfile + djpeg [switches] -outfile imagefile jpegfile This syntax works on all systems, so it is useful for scripts. The currently supported image file formats are: PPM (PBMPLUS color format), @@ -69,35 +69,35 @@ CJPEG DETAILS The basic command line switches for cjpeg are: - -quality N[,...] Scale quantization tables to adjust image quality. - Quality is 0 (worst) to 100 (best); default is 75. - (See below for more info.) + -quality N[,...] Scale quantization tables to adjust image quality. + Quality is 0 (worst) to 100 (best); default is 75. + (See below for more info.) - -grayscale Create monochrome JPEG file from color input. - Be sure to use this switch when compressing a grayscale - BMP file, because cjpeg isn't bright enough to notice - whether a BMP file uses only shades of gray. By - saying -grayscale, you'll get a smaller JPEG file that - takes less time to process. 
+ -grayscale Create monochrome JPEG file from color input. + Be sure to use this switch when compressing a grayscale + BMP file, because cjpeg isn't bright enough to notice + whether a BMP file uses only shades of gray. By + saying -grayscale, you'll get a smaller JPEG file that + takes less time to process. - -rgb Create RGB JPEG file. - Using this switch suppresses the conversion from RGB - colorspace input to the default YCbCr JPEG colorspace. + -rgb Create RGB JPEG file. + Using this switch suppresses the conversion from RGB + colorspace input to the default YCbCr JPEG colorspace. - -optimize Perform optimization of entropy encoding parameters. - Without this, default encoding parameters are used. - -optimize usually makes the JPEG file a little smaller, - but cjpeg runs somewhat slower and needs much more - memory. Image quality and speed of decompression are - unaffected by -optimize. + -optimize Perform optimization of entropy encoding parameters. + Without this, default encoding parameters are used. + -optimize usually makes the JPEG file a little smaller, + but cjpeg runs somewhat slower and needs much more + memory. Image quality and speed of decompression are + unaffected by -optimize. - -progressive Create progressive JPEG file (see below). + -progressive Create progressive JPEG file (see below). - -targa Input file is Targa format. Targa files that contain - an "identification" field will not be automatically - recognized by cjpeg; for such files you must specify - -targa to make cjpeg treat the input as Targa format. - For most Targa files, you won't need this switch. + -targa Input file is Targa format. Targa files that contain + an "identification" field will not be automatically + recognized by cjpeg; for such files you must specify + -targa to make cjpeg treat the input as Targa format. + For most Targa files, you won't need this switch. 
The -quality switch lets you trade off compressed file size against quality of the reconstructed image: the higher the quality setting, the larger the JPEG @@ -164,38 +164,38 @@ file size is about the same --- often a little smaller. Switches for advanced users: - -arithmetic Use arithmetic coding. CAUTION: arithmetic coded JPEG - is not yet widely implemented, so many decoders will - be unable to view an arithmetic coded JPEG file at - all. - - -dct int Use integer DCT method (default). - -dct fast Use fast integer DCT (less accurate). - -dct float Use floating-point DCT method. - The float method is very slightly more accurate than - the int method, but is much slower unless your machine - has very fast floating-point hardware. Also note that - results of the floating-point method may vary slightly - across machines, while the integer methods should give - the same results everywhere. The fast integer method - is much less accurate than the other two. - - -restart N Emit a JPEG restart marker every N MCU rows, or every - N MCU blocks if "B" is attached to the number. - -restart 0 (the default) means no restart markers. - - -smooth N Smooth the input image to eliminate dithering noise. - N, ranging from 1 to 100, indicates the strength of - smoothing. 0 (the default) means no smoothing. - - -maxmemory N Set limit for amount of memory to use in processing - large images. Value is in thousands of bytes, or - millions of bytes if "M" is attached to the number. - For example, -max 4m selects 4000000 bytes. If more - space is needed, temporary files will be used. - - -verbose Enable debug printout. More -v's give more printout. - or -debug Also, version information is printed at startup. + -arithmetic Use arithmetic coding. CAUTION: arithmetic coded JPEG + is not yet widely implemented, so many decoders will + be unable to view an arithmetic coded JPEG file at + all. + + -dct int Use integer DCT method (default). + -dct fast Use fast integer DCT (less accurate). 
+ -dct float Use floating-point DCT method. + The float method is very slightly more accurate than + the int method, but is much slower unless your machine + has very fast floating-point hardware. Also note that + results of the floating-point method may vary slightly + across machines, while the integer methods should give + the same results everywhere. The fast integer method + is much less accurate than the other two. + + -restart N Emit a JPEG restart marker every N MCU rows, or every + N MCU blocks if "B" is attached to the number. + -restart 0 (the default) means no restart markers. + + -smooth N Smooth the input image to eliminate dithering noise. + N, ranging from 1 to 100, indicates the strength of + smoothing. 0 (the default) means no smoothing. + + -maxmemory N Set limit for amount of memory to use in processing + large images. Value is in thousands of bytes, or + millions of bytes if "M" is attached to the number. + For example, -max 4m selects 4000000 bytes. If more + space is needed, temporary files will be used. + + -verbose Enable debug printout. More -v's give more printout. + or -debug Also, version information is printed at startup. The -restart option inserts extra markers that allow a JPEG decoder to resynchronize after a transmission error. Without restart markers, any damage @@ -213,22 +213,22 @@ factor will visibly blur the image, however. Switches for wizards: - -baseline Force baseline-compatible quantization tables to be - generated. This clamps quantization values to 8 bits - even at low quality settings. (This switch is poorly - named, since it does not ensure that the output is - actually baseline JPEG. For example, you can use - -baseline and -progressive together.) + -baseline Force baseline-compatible quantization tables to be + generated. This clamps quantization values to 8 bits + even at low quality settings. (This switch is poorly + named, since it does not ensure that the output is + actually baseline JPEG. 
For example, you can use + -baseline and -progressive together.) - -qtables file Use the quantization tables given in the specified - text file. + -qtables file Use the quantization tables given in the specified + text file. - -qslots N[,...] Select which quantization table to use for each color - component. + -qslots N[,...] Select which quantization table to use for each color + component. - -sample HxV[,...] Set JPEG sampling factors for each color component. + -sample HxV[,...] Set JPEG sampling factors for each color component. - -scans file Use the scan script given in the specified text file. + -scans file Use the scan script given in the specified text file. The "wizard" switches are intended for experimentation with JPEG. If you don't know what you are doing, DON'T USE THEM. These switches are documented @@ -239,106 +239,106 @@ DJPEG DETAILS The basic command line switches for djpeg are: - -colors N Reduce image to at most N colors. This reduces the - or -quantize N number of colors used in the output image, so that it - can be displayed on a colormapped display or stored in - a colormapped file format. For example, if you have - an 8-bit display, you'd need to reduce to 256 or fewer - colors. (-colors is the recommended name, -quantize - is provided only for backwards compatibility.) - - -fast Select recommended processing options for fast, low - quality output. (The default options are chosen for - highest quality output.) Currently, this is equivalent - to "-dct fast -nosmooth -onepass -dither ordered". - - -grayscale Force gray-scale output even if JPEG file is color. - Useful for viewing on monochrome displays; also, - djpeg runs noticeably faster in this mode. - - -scale M/N Scale the output image by a factor M/N. Currently - the scale factor must be M/8, where M is an integer - between 1 and 16 inclusive, or any reduced fraction - thereof (such as 1/2, 3/4, etc. 
Scaling is handy if - the image is larger than your screen; also, djpeg runs - much faster when scaling down the output. - - -bmp Select BMP output format (Windows flavor). 8-bit - colormapped format is emitted if -colors or -grayscale - is specified, or if the JPEG file is gray-scale; - otherwise, 24-bit full-color format is emitted. - - -gif Select GIF output format. Since GIF does not support - more than 256 colors, -colors 256 is assumed (unless - you specify a smaller number of colors). If you - specify -fast, the default number of colors is 216. - - -os2 Select BMP output format (OS/2 1.x flavor). 8-bit - colormapped format is emitted if -colors or -grayscale - is specified, or if the JPEG file is gray-scale; - otherwise, 24-bit full-color format is emitted. - - -pnm Select PBMPLUS (PPM/PGM) output format (this is the - default format). PGM is emitted if the JPEG file is - gray-scale or if -grayscale is specified; otherwise - PPM is emitted. - - -rle Select RLE output format. (Requires URT library.) - - -targa Select Targa output format. Gray-scale format is - emitted if the JPEG file is gray-scale or if - -grayscale is specified; otherwise, colormapped format - is emitted if -colors is specified; otherwise, 24-bit - full-color format is emitted. + -colors N Reduce image to at most N colors. This reduces the + or -quantize N number of colors used in the output image, so that it + can be displayed on a colormapped display or stored in + a colormapped file format. For example, if you have + an 8-bit display, you'd need to reduce to 256 or fewer + colors. (-colors is the recommended name, -quantize + is provided only for backwards compatibility.) + + -fast Select recommended processing options for fast, low + quality output. (The default options are chosen for + highest quality output.) Currently, this is equivalent + to "-dct fast -nosmooth -onepass -dither ordered". + + -grayscale Force gray-scale output even if JPEG file is color. 
+ Useful for viewing on monochrome displays; also, + djpeg runs noticeably faster in this mode. + + -scale M/N Scale the output image by a factor M/N. Currently + the scale factor must be M/8, where M is an integer + between 1 and 16 inclusive, or any reduced fraction + thereof (such as 1/2, 3/4, etc. Scaling is handy if + the image is larger than your screen; also, djpeg runs + much faster when scaling down the output. + + -bmp Select BMP output format (Windows flavor). 8-bit + colormapped format is emitted if -colors or -grayscale + is specified, or if the JPEG file is gray-scale; + otherwise, 24-bit full-color format is emitted. + + -gif Select GIF output format. Since GIF does not support + more than 256 colors, -colors 256 is assumed (unless + you specify a smaller number of colors). If you + specify -fast, the default number of colors is 216. + + -os2 Select BMP output format (OS/2 1.x flavor). 8-bit + colormapped format is emitted if -colors or -grayscale + is specified, or if the JPEG file is gray-scale; + otherwise, 24-bit full-color format is emitted. + + -pnm Select PBMPLUS (PPM/PGM) output format (this is the + default format). PGM is emitted if the JPEG file is + gray-scale or if -grayscale is specified; otherwise + PPM is emitted. + + -rle Select RLE output format. (Requires URT library.) + + -targa Select Targa output format. Gray-scale format is + emitted if the JPEG file is gray-scale or if + -grayscale is specified; otherwise, colormapped format + is emitted if -colors is specified; otherwise, 24-bit + full-color format is emitted. Switches for advanced users: - -dct int Use integer DCT method (default). - -dct fast Use fast integer DCT (less accurate). - -dct float Use floating-point DCT method. - The float method is very slightly more accurate than - the int method, but is much slower unless your machine - has very fast floating-point hardware. 
Also note that - results of the floating-point method may vary slightly - across machines, while the integer methods should give - the same results everywhere. The fast integer method - is much less accurate than the other two. - - -dither fs Use Floyd-Steinberg dithering in color quantization. - -dither ordered Use ordered dithering in color quantization. - -dither none Do not use dithering in color quantization. - By default, Floyd-Steinberg dithering is applied when - quantizing colors; this is slow but usually produces - the best results. Ordered dither is a compromise - between speed and quality; no dithering is fast but - usually looks awful. Note that these switches have - no effect unless color quantization is being done. - Ordered dither is only available in -onepass mode. - - -map FILE Quantize to the colors used in the specified image - file. This is useful for producing multiple files - with identical color maps, or for forcing a predefined - set of colors to be used. The FILE must be a GIF - or PPM file. This option overrides -colors and - -onepass. - - -nosmooth Use a faster, lower-quality upsampling routine. - - -onepass Use one-pass instead of two-pass color quantization. - The one-pass method is faster and needs less memory, - but it produces a lower-quality image. -onepass is - ignored unless you also say -colors N. Also, - the one-pass method is always used for gray-scale - output (the two-pass method is no improvement then). - - -maxmemory N Set limit for amount of memory to use in processing - large images. Value is in thousands of bytes, or - millions of bytes if "M" is attached to the number. - For example, -max 4m selects 4000000 bytes. If more - space is needed, temporary files will be used. - - -verbose Enable debug printout. More -v's give more printout. - or -debug Also, version information is printed at startup. + -dct int Use integer DCT method (default). + -dct fast Use fast integer DCT (less accurate). 
+ -dct float Use floating-point DCT method. + The float method is very slightly more accurate than + the int method, but is much slower unless your machine + has very fast floating-point hardware. Also note that + results of the floating-point method may vary slightly + across machines, while the integer methods should give + the same results everywhere. The fast integer method + is much less accurate than the other two. + + -dither fs Use Floyd-Steinberg dithering in color quantization. + -dither ordered Use ordered dithering in color quantization. + -dither none Do not use dithering in color quantization. + By default, Floyd-Steinberg dithering is applied when + quantizing colors; this is slow but usually produces + the best results. Ordered dither is a compromise + between speed and quality; no dithering is fast but + usually looks awful. Note that these switches have + no effect unless color quantization is being done. + Ordered dither is only available in -onepass mode. + + -map FILE Quantize to the colors used in the specified image + file. This is useful for producing multiple files + with identical color maps, or for forcing a predefined + set of colors to be used. The FILE must be a GIF + or PPM file. This option overrides -colors and + -onepass. + + -nosmooth Use a faster, lower-quality upsampling routine. + + -onepass Use one-pass instead of two-pass color quantization. + The one-pass method is faster and needs less memory, + but it produces a lower-quality image. -onepass is + ignored unless you also say -colors N. Also, + the one-pass method is always used for gray-scale + output (the two-pass method is no improvement then). + + -maxmemory N Set limit for amount of memory to use in processing + large images. Value is in thousands of bytes, or + millions of bytes if "M" is attached to the number. + For example, -max 4m selects 4000000 bytes. If more + space is needed, temporary files will be used. + + -verbose Enable debug printout. 
More -v's give more printout. + or -debug Also, version information is printed at startup. HINTS FOR CJPEG @@ -446,31 +446,31 @@ quality. jpegtran uses a command line syntax similar to cjpeg or djpeg. On Unix-like systems, you say: - jpegtran [switches] [inputfile] >outputfile + jpegtran [switches] [inputfile] >outputfile On most non-Unix systems, you say: - jpegtran [switches] inputfile outputfile + jpegtran [switches] inputfile outputfile where both the input and output files are JPEG files. To specify the coded JPEG representation used in the output file, jpegtran accepts a subset of the switches recognized by cjpeg: - -optimize Perform optimization of entropy encoding parameters. - -progressive Create progressive JPEG file. - -arithmetic Use arithmetic coding. - -restart N Emit a JPEG restart marker every N MCU rows, or every - N MCU blocks if "B" is attached to the number. - -scans file Use the scan script given in the specified text file. + -optimize Perform optimization of entropy encoding parameters. + -progressive Create progressive JPEG file. + -arithmetic Use arithmetic coding. + -restart N Emit a JPEG restart marker every N MCU rows, or every + N MCU blocks if "B" is attached to the number. + -scans file Use the scan script given in the specified text file. See the previous discussion of cjpeg for more details about these switches. If you specify none of these switches, you get a plain baseline-JPEG output file. The quality setting and so forth are determined by the input file. The image can be losslessly transformed by giving one of these switches: - -flip horizontal Mirror image horizontally (left-right). - -flip vertical Mirror image vertically (top-bottom). - -rotate 90 Rotate image 90 degrees clockwise. - -rotate 180 Rotate image 180 degrees. - -rotate 270 Rotate image 270 degrees clockwise (or 90 ccw). - -transpose Transpose image (across UL-to-LR axis). - -transverse Transverse transpose (across UR-to-LL axis). 
+ -flip horizontal Mirror image horizontally (left-right). + -flip vertical Mirror image vertically (top-bottom). + -rotate 90 Rotate image 90 degrees clockwise. + -rotate 180 Rotate image 180 degrees. + -rotate 270 Rotate image 270 degrees clockwise (or 90 ccw). + -transpose Transpose image (across UL-to-LR axis). + -transverse Transverse transpose (across UR-to-LL axis). The transpose transformation has no restrictions regarding image dimensions. The other transformations operate rather oddly if the image dimensions are not @@ -491,7 +491,7 @@ transpose-and-flip sequence. For practical use, you may prefer to discard any untransformable edge pixels rather than having a strange-looking strip along the right and/or bottom edges of a transformed image. To do this, add the -trim switch: - -trim Drop non-transformable edge blocks. + -trim Drop non-transformable edge blocks. Obviously, a transformation with -trim is not reversible, so strictly speaking jpegtran with this switch is not lossless. Also, the expected mathematical equivalences between the transformations no longer hold. For example, @@ -499,8 +499,8 @@ equivalences between the transformations no longer hold. For example, "-rot 180 -trim" trims both edges. If you are only interested in perfect transformations, add the -perfect switch: - -perfect Fail with an error if the transformation is not - perfect. + -perfect Fail with an error if the transformation is not + perfect. For example, you may want to do jpegtran -rot 90 -perfect foo.jpg || djpeg foo.jpg | pnmflip -r90 | cjpeg to do a perfect rotation, if available, or an approximated one if not. @@ -513,12 +513,12 @@ boundary. If it doesn't, then it is silently moved up and/or left to the nearest iMCU boundary (the lower right corner is unchanged.) The image can be losslessly cropped by giving the switch: - -crop WxH+X+Y Crop to a rectangular region of width W and height H, - starting at point X,Y. 
+ -crop WxH+X+Y Crop to a rectangular region of width W and height H, + starting at point X,Y. Other not-strictly-lossless transformation switches are: - -grayscale Force grayscale output. + -grayscale Force grayscale output. This option discards the chrominance channels if the input image is YCbCr (ie, a standard color JPEG), resulting in a grayscale JPEG file. The luminance channel is preserved exactly, so this is a better method of reducing @@ -530,24 +530,24 @@ a grayscale JPEG is substantially less than that for a color JPEG.) jpegtran also recognizes these switches that control what to do with "extra" markers, such as comment blocks: - -copy none Copy no extra markers from source file. This setting - suppresses all comments and other excess baggage - present in the source file. - -copy comments Copy only comment markers. This setting copies - comments from the source file but discards - any other data that is inessential for image display. - -copy all Copy all extra markers. This setting preserves - miscellaneous markers found in the source file, such - as JFIF thumbnails, Exif data, and Photoshop settings. - In some files, these extra markers can be sizable. + -copy none Copy no extra markers from source file. This setting + suppresses all comments and other excess baggage + present in the source file. + -copy comments Copy only comment markers. This setting copies + comments from the source file but discards + any other data that is inessential for image display. + -copy all Copy all extra markers. This setting preserves + miscellaneous markers found in the source file, such + as JFIF thumbnails, Exif data, and Photoshop settings. + In some files, these extra markers can be sizable. The default behavior is -copy comments. (Note: in IJG releases v6 and v6a, jpegtran always did the equivalent of -copy none.) 
Additional switches recognized by jpegtran are: - -outfile filename - -maxmemory N - -verbose - -debug + -outfile filename + -maxmemory N + -verbose + -debug These work the same as in cjpeg or djpeg. @@ -566,7 +566,7 @@ blocks to a JPEG file. rdjpgcom searches a JPEG file and prints the contents of any COM blocks on standard output. The command line syntax is - rdjpgcom [-raw] [-verbose] [inputfilename] + rdjpgcom [-raw] [-verbose] [inputfilename] The switch "-raw" (or just "-r") causes rdjpgcom to output non-printable characters in JPEG comments. These characters are normally escaped for security reasons. @@ -584,18 +584,18 @@ just destroy your file. The command line syntax for wrjpgcom is similar to cjpeg's. On Unix-like systems, it is - wrjpgcom [switches] [inputfilename] + wrjpgcom [switches] [inputfilename] The output file is written to standard output. The input file comes from the named file, or from standard input if no input file is named. On most non-Unix systems, the syntax is - wrjpgcom [switches] inputfilename outputfilename + wrjpgcom [switches] inputfilename outputfilename where both input and output file names must be given explicitly. wrjpgcom understands three switches: - -replace Delete any existing COM blocks from the file. - -comment "Comment text" Supply new COM text on command line. - -cfile name Read text for new COM block from named file. + -replace Delete any existing COM blocks from the file. + -comment "Comment text" Supply new COM text on command line. + -cfile name Read text for new COM block from named file. (Switch names can be abbreviated.) If you have only one line of comment text to add, you can provide it on the command line with -comment. The comment text must be surrounded with quotes so that it is treated as a single diff --git a/wizard.txt b/wizard.txt index 54170b227..ede721e72 100644 --- a/wizard.txt +++ b/wizard.txt @@ -30,7 +30,7 @@ size, since more bits than necessary are expended on higher AC coefficients. 
You can substitute a different set of quantization values by using the -qtables switch: - -qtables file Use the quantization tables given in the named file. + -qtables file Use the quantization tables given in the named file. The specified file should be a text file containing decimal quantization values. The file should contain one to four tables, each of 64 elements. @@ -43,27 +43,27 @@ appear between numbers. Also, comments can be included: a comment starts with '#' and extends to the end of the line. Here is an example file that duplicates the default quantization tables: - # Quantization tables given in JPEG spec, section K.1 - - # This is table 0 (the luminance table): - 16 11 10 16 24 40 51 61 - 12 12 14 19 26 58 60 55 - 14 13 16 24 40 57 69 56 - 14 17 22 29 51 87 80 62 - 18 22 37 56 68 109 103 77 - 24 35 55 64 81 104 113 92 - 49 64 78 87 103 121 120 101 - 72 92 95 98 112 100 103 99 - - # This is table 1 (the chrominance table): - 17 18 24 47 99 99 99 99 - 18 21 26 66 99 99 99 99 - 24 26 56 99 99 99 99 99 - 47 66 99 99 99 99 99 99 - 99 99 99 99 99 99 99 99 - 99 99 99 99 99 99 99 99 - 99 99 99 99 99 99 99 99 - 99 99 99 99 99 99 99 99 + # Quantization tables given in JPEG spec, section K.1 + + # This is table 0 (the luminance table): + 16 11 10 16 24 40 51 61 + 12 12 14 19 26 58 60 55 + 14 13 16 24 40 57 69 56 + 14 17 22 29 51 87 80 62 + 18 22 37 56 68 109 103 77 + 24 35 55 64 81 104 113 92 + 49 64 78 87 103 121 120 101 + 72 92 95 98 112 100 103 99 + + # This is table 1 (the chrominance table): + 17 18 24 47 99 99 99 99 + 18 21 26 66 99 99 99 99 + 24 26 56 99 99 99 99 99 + 47 66 99 99 99 99 99 99 + 99 99 99 99 99 99 99 99 + 99 99 99 99 99 99 99 99 + 99 99 99 99 99 99 99 99 + 99 99 99 99 99 99 99 99 If the -qtables switch is used without -quality, then the specified tables are used exactly as-is. 
If both -qtables and -quality are used, then the @@ -75,8 +75,8 @@ By default, cjpeg will use quantization table 0 for luminance components and table 1 for chrominance components. To override this choice, use the -qslots switch: - -qslots N[,...] Select which quantization table to use for - each color component. + -qslots N[,...] Select which quantization table to use for + each color component. The -qslots switch specifies a quantization table number for each color component, in the order in which the components appear in the JPEG SOF marker. @@ -93,8 +93,8 @@ By default, cjpeg uses 2:1 horizontal and vertical downsampling when compressing YCbCr data, and no downsampling for all other color spaces. You can override this default with the -sample switch: - -sample HxV[,...] Set JPEG sampling factors for each color - component. + -sample HxV[,...] Set JPEG sampling factors for each color + component. The -sample switch specifies the JPEG sampling factors for each color component, in the order in which they appear in the JPEG SOF marker. @@ -119,7 +119,7 @@ of progression parameters. You can create multiple-scan sequential JPEG files or progressive JPEG files with custom progression parameters by using the -scans switch: - -scans file Use the scan sequence given in the named file. + -scans file Use the scan sequence given in the named file. The specified file should be a text file containing a "scan script". The script specifies the contents and ordering of the scans to be emitted. @@ -138,10 +138,10 @@ indexes are not the "component ID" codes assigned to the components, just positional indexes.) 
The progression parameters for each scan are: - Ss Zigzag index of first coefficient included in scan - Se Zigzag index of last coefficient included in scan - Ah Zero for first scan of a coefficient, else Al of prior scan - Al Successive approximation low bit position for scan + Ss Zigzag index of first coefficient included in scan + Se Zigzag index of last coefficient included in scan + Ah Zero for first scan of a coefficient, else Al of prior scan + Al Successive approximation low bit position for scan If the progression parameters are omitted, the values 0,63,0,0 are used, producing a sequential JPEG file. cjpeg automatically determines whether the script represents a progressive or sequential file, by observing whether @@ -156,52 +156,52 @@ comment starts with '#' and extends to the end of the line. For additional legibility, commas or dashes can be placed between values. (Actually, any single punctuation character other than ':' or ';' can be inserted.) For example, the following two scan definitions are equivalent: - 0 1 2: 0 63 0 0; - 0,1,2 : 0-63, 0,0 ; + 0 1 2: 0 63 0 0; + 0,1,2 : 0-63, 0,0 ; Here is an example of a scan script that generates a partially interleaved sequential JPEG file: - 0; # Y only in first scan - 1 2; # Cb and Cr in second scan + 0; # Y only in first scan + 1 2; # Cb and Cr in second scan Here is an example of a progressive scan script using only spectral selection (no successive approximation): - # Interleaved DC scan for Y,Cb,Cr: - 0,1,2: 0-0, 0, 0 ; - # AC scans: - 0: 1-2, 0, 0 ; # First two Y AC coefficients - 0: 3-5, 0, 0 ; # Three more - 1: 1-63, 0, 0 ; # All AC coefficients for Cb - 2: 1-63, 0, 0 ; # All AC coefficients for Cr - 0: 6-9, 0, 0 ; # More Y coefficients - 0: 10-63, 0, 0 ; # Remaining Y coefficients + # Interleaved DC scan for Y,Cb,Cr: + 0,1,2: 0-0, 0, 0 ; + # AC scans: + 0: 1-2, 0, 0 ; # First two Y AC coefficients + 0: 3-5, 0, 0 ; # Three more + 1: 1-63, 0, 0 ; # All AC coefficients for Cb + 2: 1-63, 0, 0 ; # All AC 
coefficients for Cr + 0: 6-9, 0, 0 ; # More Y coefficients + 0: 10-63, 0, 0 ; # Remaining Y coefficients Here is an example of a successive-approximation script. This is equivalent to the default script used by "cjpeg -progressive" for YCbCr images: - # Initial DC scan for Y,Cb,Cr (lowest bit not sent) - 0,1,2: 0-0, 0, 1 ; - # First AC scan: send first 5 Y AC coefficients, minus 2 lowest bits: - 0: 1-5, 0, 2 ; - # Send all Cr,Cb AC coefficients, minus lowest bit: - # (chroma data is usually too small to be worth subdividing further; - # but note we send Cr first since eye is least sensitive to Cb) - 2: 1-63, 0, 1 ; - 1: 1-63, 0, 1 ; - # Send remaining Y AC coefficients, minus 2 lowest bits: - 0: 6-63, 0, 2 ; - # Send next-to-lowest bit of all Y AC coefficients: - 0: 1-63, 2, 1 ; - # At this point we've sent all but the lowest bit of all coefficients. - # Send lowest bit of DC coefficients - 0,1,2: 0-0, 1, 0 ; - # Send lowest bit of AC coefficients - 2: 1-63, 1, 0 ; - 1: 1-63, 1, 0 ; - # Y AC lowest bit scan is last; it's usually the largest scan - 0: 1-63, 1, 0 ; + # Initial DC scan for Y,Cb,Cr (lowest bit not sent) + 0,1,2: 0-0, 0, 1 ; + # First AC scan: send first 5 Y AC coefficients, minus 2 lowest bits: + 0: 1-5, 0, 2 ; + # Send all Cr,Cb AC coefficients, minus lowest bit: + # (chroma data is usually too small to be worth subdividing further; + # but note we send Cr first since eye is least sensitive to Cb) + 2: 1-63, 0, 1 ; + 1: 1-63, 0, 1 ; + # Send remaining Y AC coefficients, minus 2 lowest bits: + 0: 6-63, 0, 2 ; + # Send next-to-lowest bit of all Y AC coefficients: + 0: 1-63, 2, 1 ; + # At this point we've sent all but the lowest bit of all coefficients. 
+ # Send lowest bit of DC coefficients + 0,1,2: 0-0, 1, 0 ; + # Send lowest bit of AC coefficients + 2: 1-63, 1, 0 ; + 1: 1-63, 1, 0 ; + # Y AC lowest bit scan is last; it's usually the largest scan + 0: 1-63, 1, 0 ; It may be worth pointing out that this script is tuned for quality settings of around 50 to 75. For lower quality settings, you'd probably want to use diff --git a/wrbmp.c b/wrbmp.c index 3283b0f15..b8e213b2e 100644 --- a/wrbmp.c +++ b/wrbmp.c @@ -17,7 +17,7 @@ * This code contributed by James Arthur Boucher. */ -#include "cdjpeg.h" /* Common decls for cjpeg/djpeg applications */ +#include "cdjpeg.h" /* Common decls for cjpeg/djpeg applications */ #ifdef BMP_SUPPORTED @@ -42,15 +42,15 @@ /* Private version of data destination object */ typedef struct { - struct djpeg_dest_struct pub; /* public fields */ + struct djpeg_dest_struct pub; /* public fields */ - boolean is_os2; /* saves the OS2 format request flag */ + boolean is_os2; /* saves the OS2 format request flag */ - jvirt_sarray_ptr whole_image; /* needed to reverse row order */ - JDIMENSION data_width; /* JSAMPLEs per row */ - JDIMENSION row_width; /* physical width of one row in the BMP file */ - int pad_bytes; /* number of padding bytes needed per row */ - JDIMENSION cur_output_row; /* next row# to write to virtual array */ + jvirt_sarray_ptr whole_image; /* needed to reverse row order */ + JDIMENSION data_width; /* JSAMPLEs per row */ + JDIMENSION row_width; /* physical width of one row in the BMP file */ + int pad_bytes; /* number of padding bytes needed per row */ + JDIMENSION cur_output_row; /* next row# to write to virtual array */ } bmp_dest_struct; typedef bmp_dest_struct * bmp_dest_ptr; @@ -58,8 +58,8 @@ typedef bmp_dest_struct * bmp_dest_ptr; /* Forward declarations */ LOCAL(void) write_colormap - JPP((j_decompress_ptr cinfo, bmp_dest_ptr dest, - int map_colors, int map_entry_size)); + JPP((j_decompress_ptr cinfo, bmp_dest_ptr dest, + int map_colors, int map_entry_size)); /* @@ -69,7 
+69,7 @@ LOCAL(void) write_colormap METHODDEF(void) put_pixel_rows (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo, - JDIMENSION rows_supplied) + JDIMENSION rows_supplied) /* This version is for writing 24-bit pixels */ { bmp_dest_ptr dest = (bmp_dest_ptr) dinfo; @@ -90,7 +90,7 @@ put_pixel_rows (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo, inptr = dest->pub.buffer[0]; outptr = image_ptr[0]; for (col = cinfo->output_width; col > 0; col--) { - outptr[2] = *inptr++; /* can omit GETJSAMPLE() safely */ + outptr[2] = *inptr++; /* can omit GETJSAMPLE() safely */ outptr[1] = *inptr++; outptr[0] = *inptr++; outptr += 3; @@ -104,7 +104,7 @@ put_pixel_rows (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo, METHODDEF(void) put_gray_rows (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo, - JDIMENSION rows_supplied) + JDIMENSION rows_supplied) /* This version is for grayscale OR quantized color output */ { bmp_dest_ptr dest = (bmp_dest_ptr) dinfo; @@ -123,7 +123,7 @@ put_gray_rows (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo, inptr = dest->pub.buffer[0]; outptr = image_ptr[0]; for (col = cinfo->output_width; col > 0; col--) { - *outptr++ = *inptr++; /* can omit GETJSAMPLE() safely */ + *outptr++ = *inptr++; /* can omit GETJSAMPLE() safely */ } /* Zero out the pad bytes. 
*/ @@ -160,13 +160,13 @@ write_bmp_header (j_decompress_ptr cinfo, bmp_dest_ptr dest) char bmpfileheader[14]; char bmpinfoheader[40]; #define PUT_2B(array,offset,value) \ - (array[offset] = (char) ((value) & 0xFF), \ - array[offset+1] = (char) (((value) >> 8) & 0xFF)) + (array[offset] = (char) ((value) & 0xFF), \ + array[offset+1] = (char) (((value) >> 8) & 0xFF)) #define PUT_4B(array,offset,value) \ - (array[offset] = (char) ((value) & 0xFF), \ - array[offset+1] = (char) (((value) >> 8) & 0xFF), \ - array[offset+2] = (char) (((value) >> 16) & 0xFF), \ - array[offset+3] = (char) (((value) >> 24) & 0xFF)) + (array[offset] = (char) ((value) & 0xFF), \ + array[offset+1] = (char) (((value) >> 8) & 0xFF), \ + array[offset+2] = (char) (((value) >> 16) & 0xFF), \ + array[offset+3] = (char) (((value) >> 24) & 0xFF)) INT32 headersize, bfSize; int bits_per_pixel, cmap_entries; @@ -189,23 +189,23 @@ write_bmp_header (j_decompress_ptr cinfo, bmp_dest_ptr dest) /* File size */ headersize = 14 + 40 + cmap_entries * 4; /* Header and colormap */ bfSize = headersize + (INT32) dest->row_width * (INT32) cinfo->output_height; - + /* Set unused fields of header to 0 */ MEMZERO(bmpfileheader, SIZEOF(bmpfileheader)); MEMZERO(bmpinfoheader, SIZEOF(bmpinfoheader)); /* Fill the file header */ - bmpfileheader[0] = 0x42; /* first 2 bytes are ASCII 'B', 'M' */ + bmpfileheader[0] = 0x42; /* first 2 bytes are ASCII 'B', 'M' */ bmpfileheader[1] = 0x4D; PUT_4B(bmpfileheader, 2, bfSize); /* bfSize */ /* we leave bfReserved1 & bfReserved2 = 0 */ PUT_4B(bmpfileheader, 10, headersize); /* bfOffBits */ /* Fill the info header (Microsoft calls this a BITMAPINFOHEADER) */ - PUT_2B(bmpinfoheader, 0, 40); /* biSize */ + PUT_2B(bmpinfoheader, 0, 40); /* biSize */ PUT_4B(bmpinfoheader, 4, cinfo->output_width); /* biWidth */ PUT_4B(bmpinfoheader, 8, cinfo->output_height); /* biHeight */ - PUT_2B(bmpinfoheader, 12, 1); /* biPlanes - must be 1 */ + PUT_2B(bmpinfoheader, 12, 1); /* biPlanes - must be 1 */ 
PUT_2B(bmpinfoheader, 14, bits_per_pixel); /* biBitCount */ /* we leave biCompression = 0, for none */ /* we leave biSizeImage = 0; this is correct for uncompressed data */ @@ -254,23 +254,23 @@ write_os2_header (j_decompress_ptr cinfo, bmp_dest_ptr dest) /* File size */ headersize = 14 + 12 + cmap_entries * 3; /* Header and colormap */ bfSize = headersize + (INT32) dest->row_width * (INT32) cinfo->output_height; - + /* Set unused fields of header to 0 */ MEMZERO(bmpfileheader, SIZEOF(bmpfileheader)); MEMZERO(bmpcoreheader, SIZEOF(bmpcoreheader)); /* Fill the file header */ - bmpfileheader[0] = 0x42; /* first 2 bytes are ASCII 'B', 'M' */ + bmpfileheader[0] = 0x42; /* first 2 bytes are ASCII 'B', 'M' */ bmpfileheader[1] = 0x4D; PUT_4B(bmpfileheader, 2, bfSize); /* bfSize */ /* we leave bfReserved1 & bfReserved2 = 0 */ PUT_4B(bmpfileheader, 10, headersize); /* bfOffBits */ /* Fill the info header (Microsoft calls this a BITMAPCOREHEADER) */ - PUT_2B(bmpcoreheader, 0, 12); /* bcSize */ + PUT_2B(bmpcoreheader, 0, 12); /* bcSize */ PUT_2B(bmpcoreheader, 4, cinfo->output_width); /* bcWidth */ PUT_2B(bmpcoreheader, 6, cinfo->output_height); /* bcHeight */ - PUT_2B(bmpcoreheader, 8, 1); /* bcPlanes - must be 1 */ + PUT_2B(bmpcoreheader, 8, 1); /* bcPlanes - must be 1 */ PUT_2B(bmpcoreheader, 10, bits_per_pixel); /* bcBitCount */ if (JFWRITE(dest->pub.output_file, bmpfileheader, 14) != (size_t) 14) @@ -290,7 +290,7 @@ write_os2_header (j_decompress_ptr cinfo, bmp_dest_ptr dest) LOCAL(void) write_colormap (j_decompress_ptr cinfo, bmp_dest_ptr dest, - int map_colors, int map_entry_size) + int map_colors, int map_entry_size) { JSAMPARRAY colormap = cinfo->colormap; int num_colors = cinfo->actual_number_of_colors; @@ -301,20 +301,20 @@ write_colormap (j_decompress_ptr cinfo, bmp_dest_ptr dest, if (cinfo->out_color_components == 3) { /* Normal case with RGB colormap */ for (i = 0; i < num_colors; i++) { - putc(GETJSAMPLE(colormap[2][i]), outfile); - 
putc(GETJSAMPLE(colormap[1][i]), outfile); - putc(GETJSAMPLE(colormap[0][i]), outfile); - if (map_entry_size == 4) - putc(0, outfile); + putc(GETJSAMPLE(colormap[2][i]), outfile); + putc(GETJSAMPLE(colormap[1][i]), outfile); + putc(GETJSAMPLE(colormap[0][i]), outfile); + if (map_entry_size == 4) + putc(0, outfile); } } else { /* Grayscale colormap (only happens with grayscale quantization) */ for (i = 0; i < num_colors; i++) { - putc(GETJSAMPLE(colormap[0][i]), outfile); - putc(GETJSAMPLE(colormap[0][i]), outfile); - putc(GETJSAMPLE(colormap[0][i]), outfile); - if (map_entry_size == 4) - putc(0, outfile); + putc(GETJSAMPLE(colormap[0][i]), outfile); + putc(GETJSAMPLE(colormap[0][i]), outfile); + putc(GETJSAMPLE(colormap[0][i]), outfile); + if (map_entry_size == 4) + putc(0, outfile); } } } else { @@ -324,10 +324,10 @@ write_colormap (j_decompress_ptr cinfo, bmp_dest_ptr dest, putc(i, outfile); putc(i, outfile); if (map_entry_size == 4) - putc(0, outfile); + putc(0, outfile); } } - /* Pad colormap with zeros to ensure specified number of colormap entries */ + /* Pad colormap with zeros to ensure specified number of colormap entries */ if (i > map_colors) ERREXIT1(cinfo, JERR_TOO_MANY_COLORS, i); for (; i < map_colors; i++) { @@ -395,7 +395,7 @@ jinit_write_bmp (j_decompress_ptr cinfo, boolean is_os2) /* Create module interface object, fill in method pointers */ dest = (bmp_dest_ptr) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(bmp_dest_struct)); + SIZEOF(bmp_dest_struct)); dest->pub.start_output = start_output_bmp; dest->pub.finish_output = finish_output_bmp; dest->is_os2 = is_os2; diff --git a/wrgif.c b/wrgif.c index 5fe832839..0533f6423 100644 --- a/wrgif.c +++ b/wrgif.c @@ -37,7 +37,7 @@ * CompuServe Incorporated." 
*/ -#include "cdjpeg.h" /* Common decls for cjpeg/djpeg applications */ +#include "cdjpeg.h" /* Common decls for cjpeg/djpeg applications */ #ifdef GIF_SUPPORTED @@ -45,31 +45,31 @@ /* Private version of data destination object */ typedef struct { - struct djpeg_dest_struct pub; /* public fields */ + struct djpeg_dest_struct pub; /* public fields */ - j_decompress_ptr cinfo; /* back link saves passing separate parm */ + j_decompress_ptr cinfo; /* back link saves passing separate parm */ /* State for packing variable-width codes into a bitstream */ - int n_bits; /* current number of bits/code */ - int maxcode; /* maximum code, given n_bits */ - INT32 cur_accum; /* holds bits not yet output */ - int cur_bits; /* # of bits in cur_accum */ + int n_bits; /* current number of bits/code */ + int maxcode; /* maximum code, given n_bits */ + INT32 cur_accum; /* holds bits not yet output */ + int cur_bits; /* # of bits in cur_accum */ /* State for GIF code assignment */ - int ClearCode; /* clear code (doesn't change) */ - int EOFCode; /* EOF code (ditto) */ - int code_counter; /* counts output symbols */ + int ClearCode; /* clear code (doesn't change) */ + int EOFCode; /* EOF code (ditto) */ + int code_counter; /* counts output symbols */ /* GIF data packet construction buffer */ - int bytesinpkt; /* # of bytes in current packet */ - char packetbuf[256]; /* workspace for accumulating packet */ + int bytesinpkt; /* # of bytes in current packet */ + char packetbuf[256]; /* workspace for accumulating packet */ } gif_dest_struct; typedef gif_dest_struct * gif_dest_ptr; /* Largest value that will fit in N bits */ -#define MAXCODE(n_bits) ((1 << (n_bits)) - 1) +#define MAXCODE(n_bits) ((1 << (n_bits)) - 1) /* @@ -81,10 +81,10 @@ LOCAL(void) flush_packet (gif_dest_ptr dinfo) /* flush any accumulated data */ { - if (dinfo->bytesinpkt > 0) { /* never write zero-length packet */ + if (dinfo->bytesinpkt > 0) { /* never write zero-length packet */ dinfo->packetbuf[0] = (char) 
dinfo->bytesinpkt++; if (JFWRITE(dinfo->pub.output_file, dinfo->packetbuf, dinfo->bytesinpkt) - != (size_t) dinfo->bytesinpkt) + != (size_t) dinfo->bytesinpkt) ERREXIT(dinfo->cinfo, JERR_FILE_WRITE); dinfo->bytesinpkt = 0; } @@ -93,10 +93,10 @@ flush_packet (gif_dest_ptr dinfo) /* Add a character to current packet; flush to disk if necessary */ #define CHAR_OUT(dinfo,c) \ - { (dinfo)->packetbuf[++(dinfo)->bytesinpkt] = (char) (c); \ - if ((dinfo)->bytesinpkt >= 255) \ - flush_packet(dinfo); \ - } + { (dinfo)->packetbuf[++(dinfo)->bytesinpkt] = (char) (c); \ + if ((dinfo)->bytesinpkt >= 255) \ + flush_packet(dinfo); \ + } /* Routine to convert variable-width codes into a byte stream */ @@ -173,7 +173,7 @@ compress_pixel (gif_dest_ptr dinfo, int c) dinfo->code_counter++; } else { output(dinfo, dinfo->ClearCode); - dinfo->code_counter = dinfo->ClearCode + 2; /* reset the counter */ + dinfo->code_counter = dinfo->ClearCode + 2; /* reset the counter */ } } @@ -248,9 +248,9 @@ emit_header (gif_dest_ptr dinfo, int num_colors, JSAMPARRAY colormap) /* Write the Logical Screen Descriptor */ put_word(dinfo, (unsigned int) dinfo->cinfo->output_width); put_word(dinfo, (unsigned int) dinfo->cinfo->output_height); - FlagByte = 0x80; /* Yes, there is a global color table */ + FlagByte = 0x80; /* Yes, there is a global color table */ FlagByte |= (BitsPerPixel-1) << 4; /* color resolution */ - FlagByte |= (BitsPerPixel-1); /* size of global color table */ + FlagByte |= (BitsPerPixel-1); /* size of global color table */ putc(FlagByte, dinfo->pub.output_file); putc(0, dinfo->pub.output_file); /* Background color index */ putc(0, dinfo->pub.output_file); /* Reserved (aspect ratio in GIF89) */ @@ -260,18 +260,18 @@ emit_header (gif_dest_ptr dinfo, int num_colors, JSAMPARRAY colormap) for (i=0; i < ColorMapSize; i++) { if (i < num_colors) { if (colormap != NULL) { - if (dinfo->cinfo->out_color_space == JCS_RGB) { - /* Normal case: RGB color map */ - putc(GETJSAMPLE(colormap[0][i]) >> 
cshift, dinfo->pub.output_file); - putc(GETJSAMPLE(colormap[1][i]) >> cshift, dinfo->pub.output_file); - putc(GETJSAMPLE(colormap[2][i]) >> cshift, dinfo->pub.output_file); - } else { - /* Grayscale "color map": possible if quantizing grayscale image */ - put_3bytes(dinfo, GETJSAMPLE(colormap[0][i]) >> cshift); - } + if (dinfo->cinfo->out_color_space == JCS_RGB) { + /* Normal case: RGB color map */ + putc(GETJSAMPLE(colormap[0][i]) >> cshift, dinfo->pub.output_file); + putc(GETJSAMPLE(colormap[1][i]) >> cshift, dinfo->pub.output_file); + putc(GETJSAMPLE(colormap[2][i]) >> cshift, dinfo->pub.output_file); + } else { + /* Grayscale "color map": possible if quantizing grayscale image */ + put_3bytes(dinfo, GETJSAMPLE(colormap[0][i]) >> cshift); + } } else { - /* Create a gray-scale map of num_colors values, range 0..255 */ - put_3bytes(dinfo, (i * 255 + (num_colors-1)/2) / (num_colors-1)); + /* Create a gray-scale map of num_colors values, range 0..255 */ + put_3bytes(dinfo, (i * 255 + (num_colors-1)/2) / (num_colors-1)); } } else { /* fill out the map to a power of 2 */ @@ -280,7 +280,7 @@ emit_header (gif_dest_ptr dinfo, int num_colors, JSAMPARRAY colormap) } /* Write image separator and Image Descriptor */ putc(',', dinfo->pub.output_file); /* separator */ - put_word(dinfo, 0); /* left/top offset */ + put_word(dinfo, 0); /* left/top offset */ put_word(dinfo, 0); put_word(dinfo, (unsigned int) dinfo->cinfo->output_width); /* image size */ put_word(dinfo, (unsigned int) dinfo->cinfo->output_height); @@ -317,7 +317,7 @@ start_output_gif (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo) METHODDEF(void) put_pixel_rows (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo, - JDIMENSION rows_supplied) + JDIMENSION rows_supplied) { gif_dest_ptr dest = (gif_dest_ptr) dinfo; register JSAMPROW ptr; @@ -364,8 +364,8 @@ jinit_write_gif (j_decompress_ptr cinfo) /* Create module interface object, fill in method pointers */ dest = (gif_dest_ptr) (*cinfo->mem->alloc_small) ((j_common_ptr) 
cinfo, JPOOL_IMAGE, - SIZEOF(gif_dest_struct)); - dest->cinfo = cinfo; /* make back link for subroutines */ + SIZEOF(gif_dest_struct)); + dest->cinfo = cinfo; /* make back link for subroutines */ dest->pub.start_output = start_output_gif; dest->pub.put_pixel_rows = put_pixel_rows; dest->pub.finish_output = finish_output_gif; diff --git a/wrjpgcom.c b/wrjpgcom.c index 8c04b0551..1055d862d 100644 --- a/wrjpgcom.c +++ b/wrjpgcom.c @@ -11,48 +11,48 @@ * JPEG markers. */ -#define JPEG_CJPEG_DJPEG /* to get the command-line config symbols */ -#include "jinclude.h" /* get auto-config symbols, */ +#define JPEG_CJPEG_DJPEG /* to get the command-line config symbols */ +#include "jinclude.h" /* get auto-config symbols, */ -#ifndef HAVE_STDLIB_H /* should declare malloc() */ +#ifndef HAVE_STDLIB_H /* should declare malloc() */ extern void * malloc (); #endif -#include /* to declare isupper(), tolower() */ +#include /* to declare isupper(), tolower() */ #ifdef USE_SETMODE -#include /* to declare setmode()'s parameter macros */ +#include /* to declare setmode()'s parameter macros */ /* If you have setmode() but not , just delete this line: */ -#include /* to declare setmode() */ +#include /* to declare setmode() */ #endif -#ifdef USE_CCOMMAND /* command-line reader for Macintosh */ +#ifdef USE_CCOMMAND /* command-line reader for Macintosh */ #ifdef __MWERKS__ #include /* Metrowerks needs this */ -#include /* ... and this */ +#include /* ... 
and this */ #endif #ifdef THINK_C -#include /* Think declares it here */ +#include /* Think declares it here */ #endif #endif -#ifdef DONT_USE_B_MODE /* define mode parameters for fopen() */ -#define READ_BINARY "r" -#define WRITE_BINARY "w" +#ifdef DONT_USE_B_MODE /* define mode parameters for fopen() */ +#define READ_BINARY "r" +#define WRITE_BINARY "w" #else -#ifdef VMS /* VMS is very nonstandard */ -#define READ_BINARY "rb", "ctx=stm" -#define WRITE_BINARY "wb", "ctx=stm" -#else /* standard ANSI-compliant case */ -#define READ_BINARY "rb" -#define WRITE_BINARY "wb" +#ifdef VMS /* VMS is very nonstandard */ +#define READ_BINARY "rb", "ctx=stm" +#define WRITE_BINARY "wb", "ctx=stm" +#else /* standard ANSI-compliant case */ +#define READ_BINARY "rb" +#define WRITE_BINARY "wb" #endif #endif -#ifndef EXIT_FAILURE /* define exit() codes if not provided */ +#ifndef EXIT_FAILURE /* define exit() codes if not provided */ #define EXIT_FAILURE 1 #endif #ifndef EXIT_SUCCESS #ifdef VMS -#define EXIT_SUCCESS 1 /* VMS is very nonstandard */ +#define EXIT_SUCCESS 1 /* VMS is very nonstandard */ #else #define EXIT_SUCCESS 0 #endif @@ -63,7 +63,7 @@ extern void * malloc (); */ #ifndef MAX_COM_LENGTH -#define MAX_COM_LENGTH 65000L /* must be <= 65533 in any case */ +#define MAX_COM_LENGTH 65000L /* must be <= 65533 in any case */ #endif @@ -72,12 +72,12 @@ extern void * malloc (); * To reuse this code in another application, you might need to change these. */ -static FILE * infile; /* input JPEG file */ +static FILE * infile; /* input JPEG file */ /* Return next input byte, or EOF if no more */ #define NEXTBYTE() getc(infile) -static FILE * outfile; /* output JPEG file */ +static FILE * outfile; /* output JPEG file */ /* Emit an output byte */ #define PUTBYTE(x) putc((x), outfile) @@ -154,11 +154,11 @@ copy_rest_of_file (void) * in this program. (See jdmarker.c for a more complete list.) 
*/ -#define M_SOF0 0xC0 /* Start Of Frame N */ -#define M_SOF1 0xC1 /* N indicates which compression process */ -#define M_SOF2 0xC2 /* Only SOF0-SOF2 are now in common use */ +#define M_SOF0 0xC0 /* Start Of Frame N */ +#define M_SOF1 0xC1 /* N indicates which compression process */ +#define M_SOF2 0xC2 /* Only SOF0-SOF2 are now in common use */ #define M_SOF3 0xC3 -#define M_SOF5 0xC5 /* NB: codes C4 and CC are NOT SOF markers */ +#define M_SOF5 0xC5 /* NB: codes C4 and CC are NOT SOF markers */ #define M_SOF6 0xC6 #define M_SOF7 0xC7 #define M_SOF9 0xC9 @@ -167,10 +167,10 @@ copy_rest_of_file (void) #define M_SOF13 0xCD #define M_SOF14 0xCE #define M_SOF15 0xCF -#define M_SOI 0xD8 /* Start Of Image (beginning of datastream) */ -#define M_EOI 0xD9 /* End Of Image (end of datastream) */ -#define M_SOS 0xDA /* Start Of Scan (begins compressed data) */ -#define M_COM 0xFE /* COMment */ +#define M_SOI 0xD8 /* Start Of Image (beginning of datastream) */ +#define M_EOI 0xD9 /* End Of Image (end of datastream) */ +#define M_SOS 0xDA /* Start Of Scan (begins compressed data) */ +#define M_COM 0xFE /* COMment */ /* @@ -302,40 +302,40 @@ scan_JPEG_header (int keep_COM) /* Note that marker codes 0xC4, 0xC8, 0xCC are not, and must not be, * treated as SOFn. C4 in particular is actually DHT. 
*/ - case M_SOF0: /* Baseline */ - case M_SOF1: /* Extended sequential, Huffman */ - case M_SOF2: /* Progressive, Huffman */ - case M_SOF3: /* Lossless, Huffman */ - case M_SOF5: /* Differential sequential, Huffman */ - case M_SOF6: /* Differential progressive, Huffman */ - case M_SOF7: /* Differential lossless, Huffman */ - case M_SOF9: /* Extended sequential, arithmetic */ - case M_SOF10: /* Progressive, arithmetic */ - case M_SOF11: /* Lossless, arithmetic */ - case M_SOF13: /* Differential sequential, arithmetic */ - case M_SOF14: /* Differential progressive, arithmetic */ - case M_SOF15: /* Differential lossless, arithmetic */ + case M_SOF0: /* Baseline */ + case M_SOF1: /* Extended sequential, Huffman */ + case M_SOF2: /* Progressive, Huffman */ + case M_SOF3: /* Lossless, Huffman */ + case M_SOF5: /* Differential sequential, Huffman */ + case M_SOF6: /* Differential progressive, Huffman */ + case M_SOF7: /* Differential lossless, Huffman */ + case M_SOF9: /* Extended sequential, arithmetic */ + case M_SOF10: /* Progressive, arithmetic */ + case M_SOF11: /* Lossless, arithmetic */ + case M_SOF13: /* Differential sequential, arithmetic */ + case M_SOF14: /* Differential progressive, arithmetic */ + case M_SOF15: /* Differential lossless, arithmetic */ return marker; - case M_SOS: /* should not see compressed data before SOF */ + case M_SOS: /* should not see compressed data before SOF */ ERREXIT("SOS without prior SOFn"); break; - case M_EOI: /* in case it's a tables-only JPEG stream */ + case M_EOI: /* in case it's a tables-only JPEG stream */ return marker; - case M_COM: /* Existing COM: conditionally discard */ + case M_COM: /* Existing COM: conditionally discard */ if (keep_COM) { - write_marker(marker); - copy_variable(); + write_marker(marker); + copy_variable(); } else { - skip_variable(); + skip_variable(); } break; - default: /* Anything else just gets copied */ + default: /* Anything else just gets copied */ write_marker(marker); - copy_variable(); 
/* we assume it has a parameter count... */ + copy_variable(); /* we assume it has a parameter count... */ break; } } /* end loop */ @@ -344,7 +344,7 @@ scan_JPEG_header (int keep_COM) /* Command line parsing code */ -static const char * progname; /* program name for error messages */ +static const char * progname; /* program name for error messages */ static void @@ -370,7 +370,7 @@ usage (void) fprintf(stderr, "If you do not give either -comment or -cfile on the command line,\n"); fprintf(stderr, "then the comment text is read from standard input.\n"); fprintf(stderr, "It can be multiple lines, up to %u characters total.\n", - (unsigned int) MAX_COM_LENGTH); + (unsigned int) MAX_COM_LENGTH); #ifndef TWO_FILE_COMMANDLINE fprintf(stderr, "You must specify an input JPEG file name when supplying\n"); fprintf(stderr, "comment text from standard input.\n"); @@ -391,17 +391,17 @@ keymatch (char * arg, const char * keyword, int minchars) while ((ca = *arg++) != '\0') { if ((ck = *keyword++) == '\0') - return 0; /* arg longer than keyword, no good */ - if (isupper(ca)) /* force arg to lcase (assume ck is already) */ + return 0; /* arg longer than keyword, no good */ + if (isupper(ca)) /* force arg to lcase (assume ck is already) */ ca = tolower(ca); if (ca != ck) - return 0; /* no good */ - nmatched++; /* count matched characters */ + return 0; /* no good */ + nmatched++; /* count matched characters */ } /* reached end of argument; fail if it's too short for unique abbrev */ if (nmatched < minchars) return 0; - return 1; /* A-OK */ + return 1; /* A-OK */ } @@ -427,21 +427,21 @@ main (int argc, char **argv) progname = argv[0]; if (progname == NULL || progname[0] == 0) - progname = "wrjpgcom"; /* in case C library doesn't provide it */ + progname = "wrjpgcom"; /* in case C library doesn't provide it */ /* Parse switches, if any */ for (argn = 1; argn < argc; argn++) { arg = argv[argn]; if (arg[0] != '-') - break; /* not switch, must be file name */ - arg++; /* advance over 
'-' */ + break; /* not switch, must be file name */ + arg++; /* advance over '-' */ if (keymatch(arg, "replace", 1)) { keep_COM = 0; } else if (keymatch(arg, "cfile", 2)) { if (++argn >= argc) usage(); if ((comment_file = fopen(argv[argn], "r")) == NULL) { - fprintf(stderr, "%s: can't open %s\n", progname, argv[argn]); - exit(EXIT_FAILURE); + fprintf(stderr, "%s: can't open %s\n", progname, argv[argn]); + exit(EXIT_FAILURE); } } else if (keymatch(arg, "comment", 1)) { if (++argn >= argc) usage(); @@ -450,21 +450,21 @@ main (int argc, char **argv) * under MS-DOG and must parse out the quoted string ourselves. Sigh. */ if (comment_arg[0] == '"') { - comment_arg = (char *) malloc((size_t) MAX_COM_LENGTH); - if (comment_arg == NULL) - ERREXIT("Insufficient memory"); - strcpy(comment_arg, argv[argn]+1); - for (;;) { - comment_length = (unsigned int) strlen(comment_arg); - if (comment_length > 0 && comment_arg[comment_length-1] == '"') { - comment_arg[comment_length-1] = '\0'; /* zap terminating quote */ - break; - } - if (++argn >= argc) - ERREXIT("Missing ending quote mark"); - strcat(comment_arg, " "); - strcat(comment_arg, argv[argn]); - } + comment_arg = (char *) malloc((size_t) MAX_COM_LENGTH); + if (comment_arg == NULL) + ERREXIT("Insufficient memory"); + strcpy(comment_arg, argv[argn]+1); + for (;;) { + comment_length = (unsigned int) strlen(comment_arg); + if (comment_length > 0 && comment_arg[comment_length-1] == '"') { + comment_arg[comment_length-1] = '\0'; /* zap terminating quote */ + break; + } + if (++argn >= argc) + ERREXIT("Missing ending quote mark"); + strcat(comment_arg, " "); + strcat(comment_arg, argv[argn]); + } } comment_length = (unsigned int) strlen(comment_arg); } else @@ -488,10 +488,10 @@ main (int argc, char **argv) } } else { /* default input file is stdin */ -#ifdef USE_SETMODE /* need to hack file mode? */ +#ifdef USE_SETMODE /* need to hack file mode? 
*/ setmode(fileno(stdin), O_BINARY); #endif -#ifdef USE_FDOPEN /* need to re-open in binary mode? */ +#ifdef USE_FDOPEN /* need to re-open in binary mode? */ if ((infile = fdopen(fileno(stdin), READ_BINARY)) == NULL) { fprintf(stderr, "%s: can't open stdin\n", progname); exit(EXIT_FAILURE); @@ -506,7 +506,7 @@ main (int argc, char **argv) /* Must have explicit output file name */ if (argn != argc-2) { fprintf(stderr, "%s: must name one input and one output file\n", - progname); + progname); usage(); } if ((outfile = fopen(argv[argn+1], WRITE_BINARY)) == NULL) { @@ -520,10 +520,10 @@ main (int argc, char **argv) usage(); } /* default output file is stdout */ -#ifdef USE_SETMODE /* need to hack file mode? */ +#ifdef USE_SETMODE /* need to hack file mode? */ setmode(fileno(stdout), O_BINARY); #endif -#ifdef USE_FDOPEN /* need to re-open in binary mode? */ +#ifdef USE_FDOPEN /* need to re-open in binary mode? */ if ((outfile = fdopen(fileno(stdout), WRITE_BINARY)) == NULL) { fprintf(stderr, "%s: can't open stdout\n", progname); exit(EXIT_FAILURE); @@ -545,9 +545,9 @@ main (int argc, char **argv) src_file = (comment_file != NULL ? comment_file : stdin); while ((c = getc(src_file)) != EOF) { if (comment_length >= (unsigned int) MAX_COM_LENGTH) { - fprintf(stderr, "Comment text may not exceed %u bytes\n", - (unsigned int) MAX_COM_LENGTH); - exit(EXIT_FAILURE); + fprintf(stderr, "Comment text may not exceed %u bytes\n", + (unsigned int) MAX_COM_LENGTH); + exit(EXIT_FAILURE); } comment_arg[comment_length++] = (char) c; } @@ -579,5 +579,5 @@ main (int argc, char **argv) /* All done. */ exit(EXIT_SUCCESS); - return 0; /* suppress no-return-value warnings */ + return 0; /* suppress no-return-value warnings */ } diff --git a/wrppm.c b/wrppm.c index 68e0c85c3..6b7ebf42a 100644 --- a/wrppm.c +++ b/wrppm.c @@ -16,7 +16,7 @@ * an ordinary stdio stream. 
*/ -#include "cdjpeg.h" /* Common decls for cjpeg/djpeg applications */ +#include "cdjpeg.h" /* Common decls for cjpeg/djpeg applications */ #ifdef PPM_SUPPORTED @@ -42,11 +42,11 @@ #define PPM_MAXVAL 255 #else /* The word-per-sample format always puts the MSB first. */ -#define PUTPPMSAMPLE(ptr,v) \ - { register int val_ = v; \ - *ptr++ = (char) ((val_ >> 8) & 0xFF); \ - *ptr++ = (char) (val_ & 0xFF); \ - } +#define PUTPPMSAMPLE(ptr,v) \ + { register int val_ = v; \ + *ptr++ = (char) ((val_ >> 8) & 0xFF); \ + *ptr++ = (char) (val_ & 0xFF); \ + } #define BYTESPERSAMPLE 2 #define PPM_MAXVAL ((1<pub.output_file, "P5\n%ld %ld\n%d\n", - (long) cinfo->output_width, (long) cinfo->output_height, - PPM_MAXVAL); + (long) cinfo->output_width, (long) cinfo->output_height, + PPM_MAXVAL); break; case JCS_RGB: /* emit header for raw PPM format */ fprintf(dest->pub.output_file, "P6\n%ld %ld\n%d\n", - (long) cinfo->output_width, (long) cinfo->output_height, - PPM_MAXVAL); + (long) cinfo->output_width, (long) cinfo->output_height, + PPM_MAXVAL); break; default: ERREXIT(cinfo, JERR_PPM_COLORSPACE); @@ -224,7 +224,7 @@ jinit_write_ppm (j_decompress_ptr cinfo) /* Create module interface object, fill in method pointers */ dest = (ppm_dest_ptr) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(ppm_dest_struct)); + SIZEOF(ppm_dest_struct)); dest->pub.start_output = start_output_ppm; dest->pub.finish_output = finish_output_ppm; diff --git a/wrrle.c b/wrrle.c index a4e73372d..4fdf372a0 100644 --- a/wrrle.c +++ b/wrrle.c @@ -16,7 +16,7 @@ * with updates from Robert Hutchinson. */ -#include "cdjpeg.h" /* Common decls for cjpeg/djpeg applications */ +#include "cdjpeg.h" /* Common decls for cjpeg/djpeg applications */ #ifdef RLE_SUPPORTED @@ -47,15 +47,15 @@ * though not all of the entries need be used. 
*/ -#define CMAPBITS 8 -#define CMAPLENGTH (1<<(CMAPBITS)) +#define CMAPBITS 8 +#define CMAPLENGTH (1<<(CMAPBITS)) typedef struct { struct djpeg_dest_struct pub; /* public fields */ - jvirt_sarray_ptr image; /* virtual array to store the output image */ - rle_map *colormap; /* RLE-style color map, or NULL if none */ - rle_pixel **rle_row; /* To pass rows to rle_putrow() */ + jvirt_sarray_ptr image; /* virtual array to store the output image */ + rle_map *colormap; /* RLE-style color map, or NULL if none */ + rle_pixel **rle_row; /* To pass rows to rle_putrow() */ } rle_dest_struct; @@ -64,7 +64,7 @@ typedef rle_dest_struct * rle_dest_ptr; /* Forward declarations */ METHODDEF(void) rle_put_pixel_rows JPP((j_decompress_ptr cinfo, djpeg_dest_ptr dinfo, - JDIMENSION rows_supplied)); + JDIMENSION rows_supplied)); /* @@ -97,8 +97,8 @@ start_output_rle (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo) */ if (cinfo->output_width > 32767 || cinfo->output_height > 32767) - ERREXIT2(cinfo, JERR_RLE_DIMENSIONS, cinfo->output_width, - cinfo->output_height); + ERREXIT2(cinfo, JERR_RLE_DIMENSIONS, cinfo->output_width, + cinfo->output_height); if (cinfo->out_color_space != JCS_GRAYSCALE && cinfo->out_color_space != JCS_RGB) @@ -151,7 +151,7 @@ start_output_rle (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo) METHODDEF(void) rle_put_pixel_rows (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo, - JDIMENSION rows_supplied) + JDIMENSION rows_supplied) { rle_dest_ptr dest = (rle_dest_ptr) dinfo; @@ -172,7 +172,7 @@ METHODDEF(void) finish_output_rle (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo) { rle_dest_ptr dest = (rle_dest_ptr) dinfo; - rle_hdr header; /* Output file information */ + rle_hdr header; /* Output file information */ rle_pixel **rle_row, *red, *green, *blue; JSAMPROW output_row; char cmapcomment[80]; @@ -223,7 +223,7 @@ finish_output_rle (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo) for (row = cinfo->output_height-1; row >= 0; row--) { rle_row = (rle_pixel **) 
(*cinfo->mem->access_virt_sarray) ((j_common_ptr) cinfo, dest->image, - (JDIMENSION) row, (JDIMENSION) 1, FALSE); + (JDIMENSION) row, (JDIMENSION) 1, FALSE); rle_putrow(rle_row, (int) cinfo->output_width, &header); #ifdef PROGRESS_REPORT if (progress != NULL) { @@ -237,7 +237,7 @@ finish_output_rle (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo) rle_row = (rle_pixel **) dest->rle_row; output_row = * (*cinfo->mem->access_virt_sarray) ((j_common_ptr) cinfo, dest->image, - (JDIMENSION) row, (JDIMENSION) 1, FALSE); + (JDIMENSION) row, (JDIMENSION) 1, FALSE); red = rle_row[0]; green = rle_row[1]; blue = rle_row[2]; diff --git a/wrtarga.c b/wrtarga.c index cf104d2de..84084be24 100644 --- a/wrtarga.c +++ b/wrtarga.c @@ -14,7 +14,7 @@ * Based on code contributed by Lee Daniel Crocker. */ -#include "cdjpeg.h" /* Common decls for cjpeg/djpeg applications */ +#include "cdjpeg.h" /* Common decls for cjpeg/djpeg applications */ #ifdef TARGA_SUPPORTED @@ -41,10 +41,10 @@ /* Private version of data destination object */ typedef struct { - struct djpeg_dest_struct pub; /* public fields */ + struct djpeg_dest_struct pub; /* public fields */ - char *iobuffer; /* physical I/O buffer */ - JDIMENSION buffer_width; /* width of one row */ + char *iobuffer; /* physical I/O buffer */ + JDIMENSION buffer_width; /* width of one row */ } tga_dest_struct; typedef tga_dest_struct * tga_dest_ptr; @@ -60,27 +60,27 @@ write_header (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo, int num_colors) MEMZERO(targaheader, SIZEOF(targaheader)); if (num_colors > 0) { - targaheader[1] = 1; /* color map type 1 */ + targaheader[1] = 1; /* color map type 1 */ targaheader[5] = (char) (num_colors & 0xFF); targaheader[6] = (char) (num_colors >> 8); - targaheader[7] = 24; /* 24 bits per cmap entry */ + targaheader[7] = 24; /* 24 bits per cmap entry */ } targaheader[12] = (char) (cinfo->output_width & 0xFF); targaheader[13] = (char) (cinfo->output_width >> 8); targaheader[14] = (char) (cinfo->output_height & 0xFF); 
targaheader[15] = (char) (cinfo->output_height >> 8); - targaheader[17] = 0x20; /* Top-down, non-interlaced */ + targaheader[17] = 0x20; /* Top-down, non-interlaced */ if (cinfo->out_color_space == JCS_GRAYSCALE) { - targaheader[2] = 3; /* image type = uncompressed gray-scale */ - targaheader[16] = 8; /* bits per pixel */ - } else { /* must be RGB */ + targaheader[2] = 3; /* image type = uncompressed gray-scale */ + targaheader[16] = 8; /* bits per pixel */ + } else { /* must be RGB */ if (num_colors > 0) { - targaheader[2] = 1; /* image type = colormapped RGB */ + targaheader[2] = 1; /* image type = colormapped RGB */ targaheader[16] = 8; } else { - targaheader[2] = 2; /* image type = uncompressed RGB */ + targaheader[2] = 2; /* image type = uncompressed RGB */ targaheader[16] = 24; } } @@ -97,7 +97,7 @@ write_header (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo, int num_colors) METHODDEF(void) put_pixel_rows (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo, - JDIMENSION rows_supplied) + JDIMENSION rows_supplied) /* used for unquantized full-color output */ { tga_dest_ptr dest = (tga_dest_ptr) dinfo; @@ -118,7 +118,7 @@ put_pixel_rows (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo, METHODDEF(void) put_gray_rows (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo, - JDIMENSION rows_supplied) + JDIMENSION rows_supplied) /* used for grayscale OR quantized color output */ { tga_dest_ptr dest = (tga_dest_ptr) dinfo; @@ -142,7 +142,7 @@ put_gray_rows (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo, METHODDEF(void) put_demapped_gray (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo, - JDIMENSION rows_supplied) + JDIMENSION rows_supplied) { tga_dest_ptr dest = (tga_dest_ptr) dinfo; register JSAMPROW inptr; @@ -183,14 +183,14 @@ start_output_tga (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo) /* We only support 8-bit colormap indexes, so only 256 colors */ num_colors = cinfo->actual_number_of_colors; if (num_colors > 256) - ERREXIT1(cinfo, JERR_TOO_MANY_COLORS, num_colors); + 
ERREXIT1(cinfo, JERR_TOO_MANY_COLORS, num_colors); write_header(cinfo, dinfo, num_colors); /* Write the colormap. Note Targa uses BGR byte order */ outfile = dest->pub.output_file; for (i = 0; i < num_colors; i++) { - putc(GETJSAMPLE(cinfo->colormap[2][i]), outfile); - putc(GETJSAMPLE(cinfo->colormap[1][i]), outfile); - putc(GETJSAMPLE(cinfo->colormap[0][i]), outfile); + putc(GETJSAMPLE(cinfo->colormap[2][i]), outfile); + putc(GETJSAMPLE(cinfo->colormap[1][i]), outfile); + putc(GETJSAMPLE(cinfo->colormap[0][i]), outfile); } dest->pub.put_pixel_rows = put_gray_rows; } else { @@ -229,7 +229,7 @@ jinit_write_targa (j_decompress_ptr cinfo) /* Create module interface object, fill in method pointers */ dest = (tga_dest_ptr) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - SIZEOF(tga_dest_struct)); + SIZEOF(tga_dest_struct)); dest->pub.start_output = start_output_tga; dest->pub.finish_output = finish_output_tga; @@ -240,7 +240,7 @@ jinit_write_targa (j_decompress_ptr cinfo) dest->buffer_width = cinfo->output_width * cinfo->output_components; dest->iobuffer = (char *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - (size_t) (dest->buffer_width * SIZEOF(char))); + (size_t) (dest->buffer_width * SIZEOF(char))); /* Create decompressor output buffer. */ dest->pub.buffer = (*cinfo->mem->alloc_sarray)