diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..775f878 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +Release +*.la +*.o diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..b60e702 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,99 @@ +cmake_minimum_required(VERSION 3.16) + +project(debug_h265) + +include(CMakePackageConfigHelpers) + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + +include_directories(libde265) +include_directories(.) + +set (libde265_sources + alloc_pool.cc + bitstream.cc + cabac.cc + configparam.cc + contextmodel.cc + de265.cc + deblock.cc + decctx.cc + dpb.cc + # en265.cc + fallback-dct.cc + fallback-motion.cc + fallback.cc + image-io.cc + image.cc + intrapred.cc + md5.cc + motion.cc + nal-parser.cc + nal.cc + pps.cc + quality.cc + refpic.cc + sao.cc + scan.cc + sei.cc + slice.cc + sps.cc + threads.cc + transform.cc + util.cc + visualize.cc + vps.cc + vui.cc +) + +set (libde265_headers + acceleration.h + alloc_pool.h + bitstream.h + cabac.h + configparam.h + deblock.h + decctx.h + dpb.h + en265.h + fallback-dct.h + fallback-motion.h + fallback.h + image-io.h + image.h + intrapred.h + md5.h + motion.h + nal-parser.h + nal.h + pps.h + quality.h + refpic.h + sao.h + scan.h + sei.h + slice.h + sps.h + threads.h + transform.h + util.h + visualize.h + vps.h + vui.h +) + + +add_definitions(-DLIBDE265_EXPORTS) + +#add_subdirectory (encoder) + +if(SUPPORTS_SSE4_1) + add_definitions(-DHAVE_SSE4_1) + add_subdirectory (x86) +endif() + +add_library(${PROJECT_NAME} STATIC ${libde265_sources} ${ENCODER_OBJECTS} ${X86_OBJECTS}) +find_package(Threads) +target_link_libraries(${PROJECT_NAME} PRIVATE Threads::Threads) diff --git a/alloc_pool.cc b/alloc_pool.cc new file mode 100644 index 0000000..b056397 --- /dev/null +++ b/alloc_pool.cc @@ -0,0 +1,100 @@ +/* + * H.265 video codec. 
+ * Copyright (c) 2014 struktur AG, Dirk Farin + * + * Authors: Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include +#include "libde265/alloc_pool.h" +#include "libde265/util.h" +#include +#include + +#define DEBUG_MEMORY 1 + + +alloc_pool::alloc_pool(size_t objSize, int poolSize, bool grow) + : mObjSize(objSize), + mPoolSize(poolSize), + mGrow(grow) +{ + m_freeList.reserve(poolSize); + m_memBlocks.reserve(8); + + add_memory_block(); +} + + +void alloc_pool::add_memory_block() +{ + uint8_t* p = new uint8_t[mObjSize * mPoolSize]; + m_memBlocks.push_back(p); + + for (int i=0;i&2; \ + exit 1;; \ + esac; \ + has_opt=no; \ + sane_makeflags=$$MAKEFLAGS; \ + if $(am__is_gnu_make); then \ + sane_makeflags=$$MFLAGS; \ + else \ + case $$MAKEFLAGS in \ + *\\[\ \ ]*) \ + bs=\\; \ + sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \ + | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \ + esac; \ + fi; \ + skip_next=no; \ + strip_trailopt () \ + { \ + flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \ + }; \ + for flg in $$sane_makeflags; do \ + test $$skip_next = yes && { skip_next=no; continue; }; \ + case $$flg in \ + *=*|--*) continue;; \ + -*I) strip_trailopt 'I'; skip_next=yes;; \ + -*I?*) strip_trailopt 'I';; \ + -*O) strip_trailopt 'O'; skip_next=yes;; \ + -*O?*) strip_trailopt 'O';; \ + -*l) strip_trailopt 'l'; skip_next=yes;; \ + -*l?*) 
strip_trailopt 'l';; \ + -[dEDm]) skip_next=yes;; \ + -[JT]) skip_next=yes;; \ + esac; \ + case $$flg in \ + *$$target_option*) has_opt=yes; break;; \ + esac; \ + done; \ + test $$has_opt = yes +am__make_dryrun = (target_option=n; $(am__make_running_with_option)) +am__make_keepgoing = (target_option=k; $(am__make_running_with_option)) +pkgdatadir = $(datadir)/libde265 +pkgincludedir = $(includedir)/libde265 +pkglibdir = $(libdir)/libde265 +pkglibexecdir = $(libexecdir)/libde265 +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +build_triplet = x86_64-pc-linux-gnu +host_triplet = x86_64-pc-linux-gnu +target_triplet = x86_64-pc-linux-gnu + +# NEON specific functions +#am__append_1 = libde265_arm_neon.la +#am__append_2 = libde265_arm_neon.la +subdir = libde265/arm +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/m4/ax_compare_version.m4 \ + $(top_srcdir)/m4/ax_cxx_compile_stdcxx_11.m4 \ + $(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \ + $(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \ + $(top_srcdir)/m4/lt~obsolete.m4 \ + $(top_srcdir)/m4/m4_ax_check_compile_flag.m4 \ + $(top_srcdir)/configure.ac +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON) +mkinstalldirs = $(install_sh) -d +CONFIG_HEADER = $(top_builddir)/config.h +CONFIG_CLEAN_FILES = +CONFIG_CLEAN_VPATH_FILES = +LTLIBRARIES = $(noinst_LTLIBRARIES) +libde265_arm_la_DEPENDENCIES = $(am__append_2) +am_libde265_arm_la_OBJECTS = libde265_arm_la-arm.lo +libde265_arm_la_OBJECTS = $(am_libde265_arm_la_OBJECTS) +AM_V_lt = $(am__v_lt_$(V)) +am__v_lt_ = 
$(am__v_lt_$(AM_DEFAULT_VERBOSITY)) +am__v_lt_0 = --silent +am__v_lt_1 = +libde265_arm_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \ + $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \ + $(libde265_arm_la_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \ + $(LDFLAGS) -o $@ +libde265_arm_neon_la_LIBADD = +am__libde265_arm_neon_la_SOURCES_DIST = asm.S cpudetect.S \ + hevcdsp_qpel_neon.S neon.S +#am_libde265_arm_neon_la_OBJECTS = \ +# libde265_arm_neon_la-asm.lo \ +# libde265_arm_neon_la-cpudetect.lo \ +# libde265_arm_neon_la-hevcdsp_qpel_neon.lo \ +# libde265_arm_neon_la-neon.lo +libde265_arm_neon_la_OBJECTS = $(am_libde265_arm_neon_la_OBJECTS) +#am_libde265_arm_neon_la_rpath = +AM_V_P = $(am__v_P_$(V)) +am__v_P_ = $(am__v_P_$(AM_DEFAULT_VERBOSITY)) +am__v_P_0 = false +am__v_P_1 = : +AM_V_GEN = $(am__v_GEN_$(V)) +am__v_GEN_ = $(am__v_GEN_$(AM_DEFAULT_VERBOSITY)) +am__v_GEN_0 = @echo " GEN " $@; +am__v_GEN_1 = +AM_V_at = $(am__v_at_$(V)) +am__v_at_ = $(am__v_at_$(AM_DEFAULT_VERBOSITY)) +am__v_at_0 = @ +am__v_at_1 = +DEFAULT_INCLUDES = -I. 
-I$(top_builddir) +depcomp = $(SHELL) $(top_srcdir)/depcomp +am__maybe_remake_depfiles = depfiles +am__depfiles_remade = ./$(DEPDIR)/libde265_arm_la-arm.Plo \ + ./$(DEPDIR)/libde265_arm_neon_la-asm.Plo \ + ./$(DEPDIR)/libde265_arm_neon_la-cpudetect.Plo \ + ./$(DEPDIR)/libde265_arm_neon_la-hevcdsp_qpel_neon.Plo \ + ./$(DEPDIR)/libde265_arm_neon_la-neon.Plo +am__mv = mv -f +CPPASCOMPILE = $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ + $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CCASFLAGS) $(CCASFLAGS) +LTCPPASCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CCASFLAGS) $(CCASFLAGS) +AM_V_CPPAS = $(am__v_CPPAS_$(V)) +am__v_CPPAS_ = $(am__v_CPPAS_$(AM_DEFAULT_VERBOSITY)) +am__v_CPPAS_0 = @echo " CPPAS " $@; +am__v_CPPAS_1 = +CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ + $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) +LTCXXCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CXXFLAGS) $(CXXFLAGS) +AM_V_CXX = $(am__v_CXX_$(V)) +am__v_CXX_ = $(am__v_CXX_$(AM_DEFAULT_VERBOSITY)) +am__v_CXX_0 = @echo " CXX " $@; +am__v_CXX_1 = +CXXLD = $(CXX) +CXXLINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CXXLD) $(AM_CXXFLAGS) \ + $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +AM_V_CXXLD = $(am__v_CXXLD_$(V)) +am__v_CXXLD_ = $(am__v_CXXLD_$(AM_DEFAULT_VERBOSITY)) +am__v_CXXLD_0 = @echo " CXXLD " $@; +am__v_CXXLD_1 = +COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CFLAGS) $(CFLAGS) +AM_V_CC = $(am__v_CC_$(V)) +am__v_CC_ = 
$(am__v_CC_$(AM_DEFAULT_VERBOSITY)) +am__v_CC_0 = @echo " CC " $@; +am__v_CC_1 = +CCLD = $(CC) +LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ +AM_V_CCLD = $(am__v_CCLD_$(V)) +am__v_CCLD_ = $(am__v_CCLD_$(AM_DEFAULT_VERBOSITY)) +am__v_CCLD_0 = @echo " CCLD " $@; +am__v_CCLD_1 = +SOURCES = $(libde265_arm_la_SOURCES) $(libde265_arm_neon_la_SOURCES) +DIST_SOURCES = $(libde265_arm_la_SOURCES) \ + $(am__libde265_arm_neon_la_SOURCES_DIST) +am__can_run_installinfo = \ + case $$AM_UPDATE_INFO_DIR in \ + n|no|NO) false;; \ + *) (install-info --version) >/dev/null 2>&1;; \ + esac +am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) +# Read a list of newline-separated strings from the standard input, +# and print each of them once, without duplicates. Input order is +# *not* preserved. +am__uniquify_input = $(AWK) '\ + BEGIN { nonempty = 0; } \ + { items[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in items) print i; }; } \ +' +# Make sure the list of sources is unique. This is necessary because, +# e.g., the same source file might be shared among _SOURCES variables +# for different programs/libraries. 
+am__define_uniq_tagged_files = \ + list='$(am__tagged_files)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | $(am__uniquify_input)` +ETAGS = etags +CTAGS = ctags +am__DIST_COMMON = $(srcdir)/Makefile.in $(top_srcdir)/depcomp +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +ACLOCAL = ${SHELL} /home/dima/git/libde265/missing aclocal-1.16 +ALLOCA = +AMTAR = $${TAR-tar} +AM_DEFAULT_VERBOSITY = 1 +AR = ar +AUTOCONF = ${SHELL} /home/dima/git/libde265/missing autoconf +AUTOHEADER = ${SHELL} /home/dima/git/libde265/missing autoheader +AUTOMAKE = ${SHELL} /home/dima/git/libde265/missing automake-1.16 +AWK = gawk +CC = gcc +CCAS = gcc +CCASDEPMODE = depmode=gcc3 +CCASFLAGS = -g -O2 +CCDEPMODE = depmode=gcc3 +CFLAGS = -g -O2 -std=c99 -Wall +CPP = gcc -E +CPPFLAGS = +CXX = g++ +CXXCPP = g++ -E +CXXDEPMODE = depmode=gcc3 +CXXFLAGS = -g -O2 -Werror=return-type -Werror=unused-result -Werror=reorder -DDE265_LOG_ERROR +CYGPATH_W = echo +DEFS = -DHAVE_CONFIG_H +DEPDIR = .deps +DLLTOOL = false +DSYMUTIL = +DUMPBIN = +ECHO_C = +ECHO_N = -n +ECHO_T = +EGREP = /usr/bin/grep -E +EXEEXT = +FGREP = /usr/bin/grep -F +GREP = /usr/bin/grep +HAVE_CXX11 = +INSTALL = /usr/bin/install -c +INSTALL_DATA = ${INSTALL} -m 644 +INSTALL_PROGRAM = ${INSTALL} +INSTALL_SCRIPT = ${INSTALL} +INSTALL_STRIP_PROGRAM = $(install_sh) -c -s +LD = /usr/bin/ld -m elf_x86_64 +LDFLAGS = +LIBDE265_AGE = 0 +LIBDE265_CURRENT = 0 +LIBDE265_REVISION = 12 +LIBOBJS = +LIBS = -lpthread -lm +LIBTOOL = $(SHELL) $(top_builddir)/libtool +LIPO = +LN_S = ln -s +LTLIBOBJS = +LT_SYS_LIBRARY_PATH = +MAKEINFO = ${SHELL} /home/dima/git/libde265/missing makeinfo +MANIFEST_TOOL = : +MKDIR_P = /usr/bin/mkdir -p +NM = /usr/bin/nm -B +NMEDIT = +NUMERIC_VERSION = 0x01000500 +OBJDUMP = objdump +OBJEXT = o +OTOOL = +OTOOL64 = +PACKAGE = libde265 +PACKAGE_BUGREPORT = farin@struktur.de +PACKAGE_NAME = libde265 +PACKAGE_STRING = libde265 1.0.5 +PACKAGE_TARNAME = 
libde265 +PACKAGE_URL = +PACKAGE_VERSION = 1.0.5 +PATH_SEPARATOR = : +PKG_CONFIG = /usr/bin/pkg-config +PKG_CONFIG_LIBDIR = +PKG_CONFIG_PATH = +QTCHOOSER = +QTMOC = /usr/bin/moc-qt5 +QT_CFLAGS = -I/usr/include/qt/QtCore -I/usr/include/qt -I/usr/include/qt/QtGui -DQT_WIDGETS_LIB -I/usr/include/qt/QtWidgets -DQT_GUI_LIB -DQT_CORE_LIB +QT_LIBS = -lQt5Widgets -lQt5Gui -lQt5Core +RANLIB = ranlib +SDL_CFLAGS = -I/usr/include/SDL -D_GNU_SOURCE=1 -D_REENTRANT +SDL_LIBS = -lSDL -lpthread +SED = /usr/bin/sed +SET_MAKE = +SHELL = /bin/sh +STRIP = strip +SWSCALE_CFLAGS = +SWSCALE_LIBS = -lswscale +VERSION = 1.0.5 +VIDEOGFX_CFLAGS = +VIDEOGFX_LIBS = +abs_builddir = /home/dima/git/libde265/libde265/arm +abs_srcdir = /home/dima/git/libde265/libde265/arm +abs_top_builddir = /home/dima/git/libde265 +abs_top_srcdir = /home/dima/git/libde265 +ac_ct_AR = ar +ac_ct_CC = gcc +ac_ct_CXX = g++ +ac_ct_DUMPBIN = +am__include = include +am__leading_dot = . +am__quote = +am__tar = $${TAR-tar} chof - "$$tardir" +am__untar = $${TAR-tar} xf - +bindir = ${exec_prefix}/bin +build = x86_64-pc-linux-gnu +build_alias = +build_cpu = x86_64 +build_os = linux-gnu +build_vendor = pc +builddir = . +datadir = ${datarootdir} +datarootdir = ${prefix}/share +docdir = ${datarootdir}/doc/${PACKAGE_TARNAME} +dvidir = ${docdir} +exec_prefix = ${prefix} +host = x86_64-pc-linux-gnu +host_alias = +host_cpu = x86_64 +host_os = linux-gnu +host_vendor = pc +htmldir = ${docdir} +includedir = ${prefix}/include +infodir = ${datarootdir}/info +install_sh = ${SHELL} /home/dima/git/libde265/install-sh +libdir = ${exec_prefix}/lib +libexecdir = ${exec_prefix}/libexec +localedir = ${datarootdir}/locale +localstatedir = ${prefix}/var +mandir = ${datarootdir}/man +mkdir_p = $(MKDIR_P) +oldincludedir = /usr/include +pdfdir = ${docdir} +prefix = /usr/local +program_transform_name = s,x,x, +psdir = ${docdir} +sbindir = ${exec_prefix}/sbin +sharedstatedir = ${prefix}/com +srcdir = . 
+sysconfdir = ${prefix}/etc +target = x86_64-pc-linux-gnu +target_alias = +target_cpu = x86_64 +target_os = linux-gnu +target_vendor = pc +top_build_prefix = ../../ +top_builddir = ../.. +top_srcdir = ../.. +noinst_LTLIBRARIES = libde265_arm.la $(am__append_1) +libde265_arm_la_CXXFLAGS = -I.. $(CFLAG_VISIBILITY) +libde265_arm_la_SOURCES = arm.cc arm.h +libde265_arm_la_LIBADD = $(am__append_2) +#libde265_arm_neon_la_CXXFLAGS = -mfpu=neon -I.. $(CFLAG_VISIBILITY) +#libde265_arm_neon_la_CCASFLAGS = -mfpu=neon -I.. \ +# -DHAVE_NEON \ +# -DEXTERN_ASM= \ +# -DHAVE_AS_FUNC \ +# -DHAVE_SECTION_DATA_REL_RO + +#libde265_arm_neon_la_SOURCES = \ +# asm.S \ +# cpudetect.S \ +# hevcdsp_qpel_neon.S \ +# neon.S + +all: all-am + +.SUFFIXES: +.SUFFIXES: .S .cc .lo .o .obj +$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ + && { if test -f $@; then exit 0; else break; fi; }; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu libde265/arm/Makefile'; \ + $(am__cd) $(top_srcdir) && \ + $(AUTOMAKE) --gnu libde265/arm/Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' 
in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \ + esac; + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(am__aclocal_m4_deps): + +clean-noinstLTLIBRARIES: + -test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES) + @list='$(noinst_LTLIBRARIES)'; \ + locs=`for p in $$list; do echo $$p; done | \ + sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \ + sort -u`; \ + test -z "$$locs" || { \ + echo rm -f $${locs}; \ + rm -f $${locs}; \ + } + +libde265_arm.la: $(libde265_arm_la_OBJECTS) $(libde265_arm_la_DEPENDENCIES) $(EXTRA_libde265_arm_la_DEPENDENCIES) + $(AM_V_CXXLD)$(libde265_arm_la_LINK) $(libde265_arm_la_OBJECTS) $(libde265_arm_la_LIBADD) $(LIBS) + +libde265_arm_neon.la: $(libde265_arm_neon_la_OBJECTS) $(libde265_arm_neon_la_DEPENDENCIES) $(EXTRA_libde265_arm_neon_la_DEPENDENCIES) + $(AM_V_CCLD)$(LINK) $(am_libde265_arm_neon_la_rpath) $(libde265_arm_neon_la_OBJECTS) $(libde265_arm_neon_la_LIBADD) $(LIBS) + +mostlyclean-compile: + -rm -f *.$(OBJEXT) + +distclean-compile: + -rm -f *.tab.c + +include ./$(DEPDIR)/libde265_arm_la-arm.Plo # am--include-marker +include ./$(DEPDIR)/libde265_arm_neon_la-asm.Plo # am--include-marker +include ./$(DEPDIR)/libde265_arm_neon_la-cpudetect.Plo # am--include-marker +include ./$(DEPDIR)/libde265_arm_neon_la-hevcdsp_qpel_neon.Plo # am--include-marker +include ./$(DEPDIR)/libde265_arm_neon_la-neon.Plo # am--include-marker + +$(am__depfiles_remade): + @$(MKDIR_P) $(@D) + 
@echo '# dummy' >$@-t && $(am__mv) $@-t $@ + +am--depfiles: $(am__depfiles_remade) + +.S.o: + $(AM_V_CPPAS)$(CPPASCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< + $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +# $(AM_V_CPPAS)source='$<' object='$@' libtool=no \ +# DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) \ +# $(AM_V_CPPAS_no)$(CPPASCOMPILE) -c -o $@ $< + +.S.obj: + $(AM_V_CPPAS)$(CPPASCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` + $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +# $(AM_V_CPPAS)source='$<' object='$@' libtool=no \ +# DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) \ +# $(AM_V_CPPAS_no)$(CPPASCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` + +.S.lo: + $(AM_V_CPPAS)$(LTCPPASCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< + $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo +# $(AM_V_CPPAS)source='$<' object='$@' libtool=yes \ +# DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) \ +# $(AM_V_CPPAS_no)$(LTCPPASCOMPILE) -c -o $@ $< + +libde265_arm_neon_la-asm.lo: asm.S + $(AM_V_CPPAS)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) $(CCASFLAGS) -MT libde265_arm_neon_la-asm.lo -MD -MP -MF $(DEPDIR)/libde265_arm_neon_la-asm.Tpo -c -o libde265_arm_neon_la-asm.lo `test -f 'asm.S' || echo '$(srcdir)/'`asm.S + $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_arm_neon_la-asm.Tpo $(DEPDIR)/libde265_arm_neon_la-asm.Plo +# $(AM_V_CPPAS)source='asm.S' object='libde265_arm_neon_la-asm.lo' libtool=yes \ +# DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) \ +# $(AM_V_CPPAS_no)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) $(CCASFLAGS) -c -o libde265_arm_neon_la-asm.lo `test -f 'asm.S' || echo '$(srcdir)/'`asm.S + +libde265_arm_neon_la-cpudetect.lo: cpudetect.S + 
$(AM_V_CPPAS)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) $(CCASFLAGS) -MT libde265_arm_neon_la-cpudetect.lo -MD -MP -MF $(DEPDIR)/libde265_arm_neon_la-cpudetect.Tpo -c -o libde265_arm_neon_la-cpudetect.lo `test -f 'cpudetect.S' || echo '$(srcdir)/'`cpudetect.S + $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_arm_neon_la-cpudetect.Tpo $(DEPDIR)/libde265_arm_neon_la-cpudetect.Plo +# $(AM_V_CPPAS)source='cpudetect.S' object='libde265_arm_neon_la-cpudetect.lo' libtool=yes \ +# DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) \ +# $(AM_V_CPPAS_no)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) $(CCASFLAGS) -c -o libde265_arm_neon_la-cpudetect.lo `test -f 'cpudetect.S' || echo '$(srcdir)/'`cpudetect.S + +libde265_arm_neon_la-hevcdsp_qpel_neon.lo: hevcdsp_qpel_neon.S + $(AM_V_CPPAS)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) $(CCASFLAGS) -MT libde265_arm_neon_la-hevcdsp_qpel_neon.lo -MD -MP -MF $(DEPDIR)/libde265_arm_neon_la-hevcdsp_qpel_neon.Tpo -c -o libde265_arm_neon_la-hevcdsp_qpel_neon.lo `test -f 'hevcdsp_qpel_neon.S' || echo '$(srcdir)/'`hevcdsp_qpel_neon.S + $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_arm_neon_la-hevcdsp_qpel_neon.Tpo $(DEPDIR)/libde265_arm_neon_la-hevcdsp_qpel_neon.Plo +# $(AM_V_CPPAS)source='hevcdsp_qpel_neon.S' object='libde265_arm_neon_la-hevcdsp_qpel_neon.lo' libtool=yes \ +# DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) \ +# $(AM_V_CPPAS_no)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) 
$(CCASFLAGS) -c -o libde265_arm_neon_la-hevcdsp_qpel_neon.lo `test -f 'hevcdsp_qpel_neon.S' || echo '$(srcdir)/'`hevcdsp_qpel_neon.S + +libde265_arm_neon_la-neon.lo: neon.S + $(AM_V_CPPAS)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) $(CCASFLAGS) -MT libde265_arm_neon_la-neon.lo -MD -MP -MF $(DEPDIR)/libde265_arm_neon_la-neon.Tpo -c -o libde265_arm_neon_la-neon.lo `test -f 'neon.S' || echo '$(srcdir)/'`neon.S + $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_arm_neon_la-neon.Tpo $(DEPDIR)/libde265_arm_neon_la-neon.Plo +# $(AM_V_CPPAS)source='neon.S' object='libde265_arm_neon_la-neon.lo' libtool=yes \ +# DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) \ +# $(AM_V_CPPAS_no)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) $(CCASFLAGS) -c -o libde265_arm_neon_la-neon.lo `test -f 'neon.S' || echo '$(srcdir)/'`neon.S + +.cc.o: + $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< + $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +# $(AM_V_CXX)source='$<' object='$@' libtool=no \ +# DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) \ +# $(AM_V_CXX_no)$(CXXCOMPILE) -c -o $@ $< + +.cc.obj: + $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` + $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +# $(AM_V_CXX)source='$<' object='$@' libtool=no \ +# DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) \ +# $(AM_V_CXX_no)$(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` + +.cc.lo: + $(AM_V_CXX)$(LTCXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< + $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo +# $(AM_V_CXX)source='$<' object='$@' libtool=yes \ +# DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) \ +# $(AM_V_CXX_no)$(LTCXXCOMPILE) -c -o $@ $< + 
+libde265_arm_la-arm.lo: arm.cc + $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_arm_la-arm.lo -MD -MP -MF $(DEPDIR)/libde265_arm_la-arm.Tpo -c -o libde265_arm_la-arm.lo `test -f 'arm.cc' || echo '$(srcdir)/'`arm.cc + $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_arm_la-arm.Tpo $(DEPDIR)/libde265_arm_la-arm.Plo +# $(AM_V_CXX)source='arm.cc' object='libde265_arm_la-arm.lo' libtool=yes \ +# DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) \ +# $(AM_V_CXX_no)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_arm_la-arm.lo `test -f 'arm.cc' || echo '$(srcdir)/'`arm.cc + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs + +ID: $(am__tagged_files) + $(am__define_uniq_tagged_files); mkid -fID $$unique +tags: tags-am +TAGS: tags + +tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + set x; \ + here=`pwd`; \ + $(am__define_uniq_tagged_files); \ + shift; \ + if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ + test -n "$$unique" || unique=$$empty_fix; \ + if test $$# -gt 0; then \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + "$$@" $$unique; \ + else \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$unique; \ + fi; \ + fi +ctags: ctags-am + +CTAGS: ctags +ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + $(am__define_uniq_tagged_files); \ + test -z "$(CTAGS_ARGS)$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && $(am__cd) $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) "$$here" +cscopelist: cscopelist-am + +cscopelist-am: $(am__tagged_files) + list='$(am__tagged_files)'; \ + case "$(srcdir)" in \ + [\\/]* | ?:[\\/]*) 
sdir="$(srcdir)" ;; \ + *) sdir=$(subdir)/$(srcdir) ;; \ + esac; \ + for i in $$list; do \ + if test -f "$$i"; then \ + echo "$(subdir)/$$i"; \ + else \ + echo "$$sdir/$$i"; \ + fi; \ + done >> $(top_builddir)/cscope.files + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags + +distdir: $(BUILT_SOURCES) + $(MAKE) $(AM_MAKEFLAGS) distdir-am + +distdir-am: $(DISTFILES) + @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + list='$(DISTFILES)'; \ + dist_files=`for file in $$list; do echo $$file; done | \ + sed -e "s|^$$srcdirstrip/||;t" \ + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ + case $$dist_files in \ + */*) $(MKDIR_P) `echo "$$dist_files" | \ + sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ + sort -u` ;; \ + esac; \ + for file in $$dist_files; do \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + if test -d $$d/$$file; then \ + dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test -d "$(distdir)/$$file"; then \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ + find "$(distdir)/$$file" -type d ! 
-perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ + else \ + test -f "$(distdir)/$$file" \ + || cp -p $$d/$$file "$(distdir)/$$file" \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +all-am: Makefile $(LTLIBRARIES) +installdirs: +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + if test -z '$(STRIP)'; then \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + install; \ + else \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ + fi +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." 
+clean: clean-am + +clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \ + mostlyclean-am + +distclean: distclean-am + -rm -f ./$(DEPDIR)/libde265_arm_la-arm.Plo + -rm -f ./$(DEPDIR)/libde265_arm_neon_la-asm.Plo + -rm -f ./$(DEPDIR)/libde265_arm_neon_la-cpudetect.Plo + -rm -f ./$(DEPDIR)/libde265_arm_neon_la-hevcdsp_qpel_neon.Plo + -rm -f ./$(DEPDIR)/libde265_arm_neon_la-neon.Plo + -rm -f Makefile +distclean-am: clean-am distclean-compile distclean-generic \ + distclean-tags + +dvi: dvi-am + +dvi-am: + +html: html-am + +html-am: + +info: info-am + +info-am: + +install-data-am: + +install-dvi: install-dvi-am + +install-dvi-am: + +install-exec-am: + +install-html: install-html-am + +install-html-am: + +install-info: install-info-am + +install-info-am: + +install-man: + +install-pdf: install-pdf-am + +install-pdf-am: + +install-ps: install-ps-am + +install-ps-am: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -f ./$(DEPDIR)/libde265_arm_la-arm.Plo + -rm -f ./$(DEPDIR)/libde265_arm_neon_la-asm.Plo + -rm -f ./$(DEPDIR)/libde265_arm_neon_la-cpudetect.Plo + -rm -f ./$(DEPDIR)/libde265_arm_neon_la-hevcdsp_qpel_neon.Plo + -rm -f ./$(DEPDIR)/libde265_arm_neon_la-neon.Plo + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-compile mostlyclean-generic \ + mostlyclean-libtool + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: + +.MAKE: install-am install-strip + +.PHONY: CTAGS GTAGS TAGS all all-am am--depfiles check check-am clean \ + clean-generic clean-libtool clean-noinstLTLIBRARIES \ + cscopelist-am ctags ctags-am distclean distclean-compile \ + distclean-generic distclean-libtool distclean-tags distdir dvi \ + dvi-am html html-am info info-am install install-am \ + install-data install-data-am install-dvi install-dvi-am \ + install-exec install-exec-am install-html install-html-am \ + install-info install-info-am install-man install-pdf \ + 
install-pdf-am install-ps install-ps-am install-strip \ + installcheck installcheck-am installdirs maintainer-clean \ + maintainer-clean-generic mostlyclean mostlyclean-compile \ + mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \ + tags tags-am uninstall uninstall-am + +.PRECIOUS: Makefile + + +# libde265_arm_la_CXXFLAGS += -DHAVE_VISIBILITY + +# libde265_arm_neon_la_CCASFLAGS += -DCONFIG_THUMB + +## libde265_arm_neon_la_CXXFLAGS += -DHAVE_VISIBILITY + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/arm/Makefile.am b/arm/Makefile.am new file mode 100644 index 0000000..9ef62d9 --- /dev/null +++ b/arm/Makefile.am @@ -0,0 +1,38 @@ +noinst_LTLIBRARIES = libde265_arm.la + +libde265_arm_la_CXXFLAGS = -I.. $(CFLAG_VISIBILITY) +libde265_arm_la_SOURCES = arm.cc arm.h +libde265_arm_la_LIBADD = + +if HAVE_VISIBILITY + libde265_arm_la_CXXFLAGS += -DHAVE_VISIBILITY +endif + + +if ENABLE_NEON_OPT +# NEON specific functions + +noinst_LTLIBRARIES += libde265_arm_neon.la +libde265_arm_la_LIBADD += libde265_arm_neon.la +libde265_arm_neon_la_CXXFLAGS = -mfpu=neon -I.. $(CFLAG_VISIBILITY) +libde265_arm_neon_la_CCASFLAGS = -mfpu=neon -I.. \ + -DHAVE_NEON \ + -DEXTERN_ASM= \ + -DHAVE_AS_FUNC \ + -DHAVE_SECTION_DATA_REL_RO + +if ENABLE_ARM_THUMB + libde265_arm_neon_la_CCASFLAGS += -DCONFIG_THUMB +endif + +libde265_arm_neon_la_SOURCES = \ + asm.S \ + cpudetect.S \ + hevcdsp_qpel_neon.S \ + neon.S + +if HAVE_VISIBILITY + libde265_arm_neon_la_CXXFLAGS += -DHAVE_VISIBILITY +endif + +endif diff --git a/arm/Makefile.in b/arm/Makefile.in new file mode 100644 index 0000000..fb1575b --- /dev/null +++ b/arm/Makefile.in @@ -0,0 +1,770 @@ +# Makefile.in generated by automake 1.16.1 from Makefile.am. +# @configure_input@ + +# Copyright (C) 1994-2018 Free Software Foundation, Inc. 
+ +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + +@SET_MAKE@ + +VPATH = @srcdir@ +am__is_gnu_make = { \ + if test -z '$(MAKELEVEL)'; then \ + false; \ + elif test -n '$(MAKE_HOST)'; then \ + true; \ + elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \ + true; \ + else \ + false; \ + fi; \ +} +am__make_running_with_option = \ + case $${target_option-} in \ + ?) ;; \ + *) echo "am__make_running_with_option: internal error: invalid" \ + "target option '$${target_option-}' specified" >&2; \ + exit 1;; \ + esac; \ + has_opt=no; \ + sane_makeflags=$$MAKEFLAGS; \ + if $(am__is_gnu_make); then \ + sane_makeflags=$$MFLAGS; \ + else \ + case $$MAKEFLAGS in \ + *\\[\ \ ]*) \ + bs=\\; \ + sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \ + | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \ + esac; \ + fi; \ + skip_next=no; \ + strip_trailopt () \ + { \ + flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \ + }; \ + for flg in $$sane_makeflags; do \ + test $$skip_next = yes && { skip_next=no; continue; }; \ + case $$flg in \ + *=*|--*) continue;; \ + -*I) strip_trailopt 'I'; skip_next=yes;; \ + -*I?*) strip_trailopt 'I';; \ + -*O) strip_trailopt 'O'; skip_next=yes;; \ + -*O?*) strip_trailopt 'O';; \ + -*l) strip_trailopt 'l'; skip_next=yes;; \ + -*l?*) strip_trailopt 'l';; \ + -[dEDm]) skip_next=yes;; \ + -[JT]) skip_next=yes;; \ + esac; \ + case $$flg in \ + *$$target_option*) has_opt=yes; break;; \ + esac; \ + done; \ + test $$has_opt = yes +am__make_dryrun = (target_option=n; $(am__make_running_with_option)) +am__make_keepgoing = (target_option=k; $(am__make_running_with_option)) +pkgdatadir = 
$(datadir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkglibexecdir = $(libexecdir)/@PACKAGE@ +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +build_triplet = @build@ +host_triplet = @host@ +target_triplet = @target@ + +# NEON specific functions +@ENABLE_NEON_OPT_TRUE@am__append_1 = libde265_arm_neon.la +@ENABLE_NEON_OPT_TRUE@am__append_2 = libde265_arm_neon.la +subdir = libde265/arm +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/m4/ax_compare_version.m4 \ + $(top_srcdir)/m4/ax_cxx_compile_stdcxx_11.m4 \ + $(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \ + $(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \ + $(top_srcdir)/m4/lt~obsolete.m4 \ + $(top_srcdir)/m4/m4_ax_check_compile_flag.m4 \ + $(top_srcdir)/configure.ac +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON) +mkinstalldirs = $(install_sh) -d +CONFIG_HEADER = $(top_builddir)/config.h +CONFIG_CLEAN_FILES = +CONFIG_CLEAN_VPATH_FILES = +LTLIBRARIES = $(noinst_LTLIBRARIES) +libde265_arm_la_DEPENDENCIES = $(am__append_2) +am_libde265_arm_la_OBJECTS = libde265_arm_la-arm.lo +libde265_arm_la_OBJECTS = $(am_libde265_arm_la_OBJECTS) +AM_V_lt = $(am__v_lt_@AM_V@) +am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@) +am__v_lt_0 = --silent +am__v_lt_1 = +libde265_arm_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \ + $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \ + $(libde265_arm_la_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \ + $(LDFLAGS) -o $@ +libde265_arm_neon_la_LIBADD = +am__libde265_arm_neon_la_SOURCES_DIST = asm.S cpudetect.S \ 
+ hevcdsp_qpel_neon.S neon.S +@ENABLE_NEON_OPT_TRUE@am_libde265_arm_neon_la_OBJECTS = \ +@ENABLE_NEON_OPT_TRUE@ libde265_arm_neon_la-asm.lo \ +@ENABLE_NEON_OPT_TRUE@ libde265_arm_neon_la-cpudetect.lo \ +@ENABLE_NEON_OPT_TRUE@ libde265_arm_neon_la-hevcdsp_qpel_neon.lo \ +@ENABLE_NEON_OPT_TRUE@ libde265_arm_neon_la-neon.lo +libde265_arm_neon_la_OBJECTS = $(am_libde265_arm_neon_la_OBJECTS) +@ENABLE_NEON_OPT_TRUE@am_libde265_arm_neon_la_rpath = +AM_V_P = $(am__v_P_@AM_V@) +am__v_P_ = $(am__v_P_@AM_DEFAULT_V@) +am__v_P_0 = false +am__v_P_1 = : +AM_V_GEN = $(am__v_GEN_@AM_V@) +am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@) +am__v_GEN_0 = @echo " GEN " $@; +am__v_GEN_1 = +AM_V_at = $(am__v_at_@AM_V@) +am__v_at_ = $(am__v_at_@AM_DEFAULT_V@) +am__v_at_0 = @ +am__v_at_1 = +DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir) +depcomp = $(SHELL) $(top_srcdir)/depcomp +am__maybe_remake_depfiles = depfiles +am__depfiles_remade = ./$(DEPDIR)/libde265_arm_la-arm.Plo \ + ./$(DEPDIR)/libde265_arm_neon_la-asm.Plo \ + ./$(DEPDIR)/libde265_arm_neon_la-cpudetect.Plo \ + ./$(DEPDIR)/libde265_arm_neon_la-hevcdsp_qpel_neon.Plo \ + ./$(DEPDIR)/libde265_arm_neon_la-neon.Plo +am__mv = mv -f +CPPASCOMPILE = $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ + $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CCASFLAGS) $(CCASFLAGS) +LTCPPASCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CCASFLAGS) $(CCASFLAGS) +AM_V_CPPAS = $(am__v_CPPAS_@AM_V@) +am__v_CPPAS_ = $(am__v_CPPAS_@AM_DEFAULT_V@) +am__v_CPPAS_0 = @echo " CPPAS " $@; +am__v_CPPAS_1 = +CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ + $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) +LTCXXCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CXXFLAGS) $(CXXFLAGS) +AM_V_CXX = 
$(am__v_CXX_@AM_V@) +am__v_CXX_ = $(am__v_CXX_@AM_DEFAULT_V@) +am__v_CXX_0 = @echo " CXX " $@; +am__v_CXX_1 = +CXXLD = $(CXX) +CXXLINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CXXLD) $(AM_CXXFLAGS) \ + $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +AM_V_CXXLD = $(am__v_CXXLD_@AM_V@) +am__v_CXXLD_ = $(am__v_CXXLD_@AM_DEFAULT_V@) +am__v_CXXLD_0 = @echo " CXXLD " $@; +am__v_CXXLD_1 = +COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CFLAGS) $(CFLAGS) +AM_V_CC = $(am__v_CC_@AM_V@) +am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@) +am__v_CC_0 = @echo " CC " $@; +am__v_CC_1 = +CCLD = $(CC) +LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ +AM_V_CCLD = $(am__v_CCLD_@AM_V@) +am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@) +am__v_CCLD_0 = @echo " CCLD " $@; +am__v_CCLD_1 = +SOURCES = $(libde265_arm_la_SOURCES) $(libde265_arm_neon_la_SOURCES) +DIST_SOURCES = $(libde265_arm_la_SOURCES) \ + $(am__libde265_arm_neon_la_SOURCES_DIST) +am__can_run_installinfo = \ + case $$AM_UPDATE_INFO_DIR in \ + n|no|NO) false;; \ + *) (install-info --version) >/dev/null 2>&1;; \ + esac +am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) +# Read a list of newline-separated strings from the standard input, +# and print each of them once, without duplicates. Input order is +# *not* preserved. +am__uniquify_input = $(AWK) '\ + BEGIN { nonempty = 0; } \ + { items[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in items) print i; }; } \ +' +# Make sure the list of sources is unique. 
This is necessary because, +# e.g., the same source file might be shared among _SOURCES variables +# for different programs/libraries. +am__define_uniq_tagged_files = \ + list='$(am__tagged_files)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | $(am__uniquify_input)` +ETAGS = etags +CTAGS = ctags +am__DIST_COMMON = $(srcdir)/Makefile.in $(top_srcdir)/depcomp +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +ACLOCAL = @ACLOCAL@ +ALLOCA = @ALLOCA@ +AMTAR = @AMTAR@ +AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ +AR = @AR@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +CC = @CC@ +CCAS = @CCAS@ +CCASDEPMODE = @CCASDEPMODE@ +CCASFLAGS = @CCASFLAGS@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CXX = @CXX@ +CXXCPP = @CXXCPP@ +CXXDEPMODE = @CXXDEPMODE@ +CXXFLAGS = @CXXFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +DLLTOOL = @DLLTOOL@ +DSYMUTIL = @DSYMUTIL@ +DUMPBIN = @DUMPBIN@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ +EXEEXT = @EXEEXT@ +FGREP = @FGREP@ +GREP = @GREP@ +HAVE_CXX11 = @HAVE_CXX11@ +INSTALL = @INSTALL@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +LD = @LD@ +LDFLAGS = @LDFLAGS@ +LIBDE265_AGE = @LIBDE265_AGE@ +LIBDE265_CURRENT = @LIBDE265_CURRENT@ +LIBDE265_REVISION = @LIBDE265_REVISION@ +LIBOBJS = @LIBOBJS@ +LIBS = @LIBS@ +LIBTOOL = @LIBTOOL@ +LIPO = @LIPO@ +LN_S = @LN_S@ +LTLIBOBJS = @LTLIBOBJS@ +LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@ +MAKEINFO = @MAKEINFO@ +MANIFEST_TOOL = @MANIFEST_TOOL@ +MKDIR_P = @MKDIR_P@ +NM = @NM@ +NMEDIT = @NMEDIT@ +NUMERIC_VERSION = @NUMERIC_VERSION@ +OBJDUMP = @OBJDUMP@ +OBJEXT = @OBJEXT@ +OTOOL = @OTOOL@ +OTOOL64 = @OTOOL64@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = 
@PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_URL = @PACKAGE_URL@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +PKG_CONFIG = @PKG_CONFIG@ +PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@ +PKG_CONFIG_PATH = @PKG_CONFIG_PATH@ +QTCHOOSER = @QTCHOOSER@ +QTMOC = @QTMOC@ +QT_CFLAGS = @QT_CFLAGS@ +QT_LIBS = @QT_LIBS@ +RANLIB = @RANLIB@ +SDL_CFLAGS = @SDL_CFLAGS@ +SDL_LIBS = @SDL_LIBS@ +SED = @SED@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +STRIP = @STRIP@ +SWSCALE_CFLAGS = @SWSCALE_CFLAGS@ +SWSCALE_LIBS = @SWSCALE_LIBS@ +VERSION = @VERSION@ +VIDEOGFX_CFLAGS = @VIDEOGFX_CFLAGS@ +VIDEOGFX_LIBS = @VIDEOGFX_LIBS@ +abs_builddir = @abs_builddir@ +abs_srcdir = @abs_srcdir@ +abs_top_builddir = @abs_top_builddir@ +abs_top_srcdir = @abs_top_srcdir@ +ac_ct_AR = @ac_ct_AR@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_CXX = @ac_ct_CXX@ +ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ +am__tar = @am__tar@ +am__untar = @am__untar@ +bindir = @bindir@ +build = @build@ +build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ +build_vendor = @build_vendor@ +builddir = @builddir@ +datadir = @datadir@ +datarootdir = @datarootdir@ +docdir = @docdir@ +dvidir = @dvidir@ +exec_prefix = @exec_prefix@ +host = @host@ +host_alias = @host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ +htmldir = @htmldir@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = @libexecdir@ +localedir = @localedir@ +localstatedir = @localstatedir@ +mandir = @mandir@ +mkdir_p = @mkdir_p@ +oldincludedir = @oldincludedir@ +pdfdir = @pdfdir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +psdir = @psdir@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +srcdir = @srcdir@ +sysconfdir = @sysconfdir@ +target = @target@ +target_alias = @target_alias@ +target_cpu = 
@target_cpu@ +target_os = @target_os@ +target_vendor = @target_vendor@ +top_build_prefix = @top_build_prefix@ +top_builddir = @top_builddir@ +top_srcdir = @top_srcdir@ +noinst_LTLIBRARIES = libde265_arm.la $(am__append_1) +libde265_arm_la_CXXFLAGS = -I.. $(CFLAG_VISIBILITY) +libde265_arm_la_SOURCES = arm.cc arm.h +libde265_arm_la_LIBADD = $(am__append_2) +@ENABLE_NEON_OPT_TRUE@libde265_arm_neon_la_CXXFLAGS = -mfpu=neon -I.. $(CFLAG_VISIBILITY) +@ENABLE_NEON_OPT_TRUE@libde265_arm_neon_la_CCASFLAGS = -mfpu=neon -I.. \ +@ENABLE_NEON_OPT_TRUE@ -DHAVE_NEON \ +@ENABLE_NEON_OPT_TRUE@ -DEXTERN_ASM= \ +@ENABLE_NEON_OPT_TRUE@ -DHAVE_AS_FUNC \ +@ENABLE_NEON_OPT_TRUE@ -DHAVE_SECTION_DATA_REL_RO + +@ENABLE_NEON_OPT_TRUE@libde265_arm_neon_la_SOURCES = \ +@ENABLE_NEON_OPT_TRUE@ asm.S \ +@ENABLE_NEON_OPT_TRUE@ cpudetect.S \ +@ENABLE_NEON_OPT_TRUE@ hevcdsp_qpel_neon.S \ +@ENABLE_NEON_OPT_TRUE@ neon.S + +all: all-am + +.SUFFIXES: +.SUFFIXES: .S .cc .lo .o .obj +$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ + && { if test -f $@; then exit 0; else break; fi; }; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu libde265/arm/Makefile'; \ + $(am__cd) $(top_srcdir) && \ + $(AUTOMAKE) --gnu libde265/arm/Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' 
in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \ + esac; + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(am__aclocal_m4_deps): + +clean-noinstLTLIBRARIES: + -test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES) + @list='$(noinst_LTLIBRARIES)'; \ + locs=`for p in $$list; do echo $$p; done | \ + sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \ + sort -u`; \ + test -z "$$locs" || { \ + echo rm -f $${locs}; \ + rm -f $${locs}; \ + } + +libde265_arm.la: $(libde265_arm_la_OBJECTS) $(libde265_arm_la_DEPENDENCIES) $(EXTRA_libde265_arm_la_DEPENDENCIES) + $(AM_V_CXXLD)$(libde265_arm_la_LINK) $(libde265_arm_la_OBJECTS) $(libde265_arm_la_LIBADD) $(LIBS) + +libde265_arm_neon.la: $(libde265_arm_neon_la_OBJECTS) $(libde265_arm_neon_la_DEPENDENCIES) $(EXTRA_libde265_arm_neon_la_DEPENDENCIES) + $(AM_V_CCLD)$(LINK) $(am_libde265_arm_neon_la_rpath) $(libde265_arm_neon_la_OBJECTS) $(libde265_arm_neon_la_LIBADD) $(LIBS) + +mostlyclean-compile: + -rm -f *.$(OBJEXT) + +distclean-compile: + -rm -f *.tab.c + +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_arm_la-arm.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_arm_neon_la-asm.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_arm_neon_la-cpudetect.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ 
@am__quote@./$(DEPDIR)/libde265_arm_neon_la-hevcdsp_qpel_neon.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_arm_neon_la-neon.Plo@am__quote@ # am--include-marker + +$(am__depfiles_remade): + @$(MKDIR_P) $(@D) + @echo '# dummy' >$@-t && $(am__mv) $@-t $@ + +am--depfiles: $(am__depfiles_remade) + +.S.o: +@am__fastdepCCAS_TRUE@ $(AM_V_CPPAS)$(CPPASCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCCAS_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS@am__nodep@)$(CPPASCOMPILE) -c -o $@ $< + +.S.obj: +@am__fastdepCCAS_TRUE@ $(AM_V_CPPAS)$(CPPASCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` +@am__fastdepCCAS_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS@am__nodep@)$(CPPASCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` + +.S.lo: +@am__fastdepCCAS_TRUE@ $(AM_V_CPPAS)$(LTCPPASCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCCAS_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS@am__nodep@)$(LTCPPASCOMPILE) -c -o $@ $< + +libde265_arm_neon_la-asm.lo: asm.S +@am__fastdepCCAS_TRUE@ $(AM_V_CPPAS)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) 
$(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) $(CCASFLAGS) -MT libde265_arm_neon_la-asm.lo -MD -MP -MF $(DEPDIR)/libde265_arm_neon_la-asm.Tpo -c -o libde265_arm_neon_la-asm.lo `test -f 'asm.S' || echo '$(srcdir)/'`asm.S +@am__fastdepCCAS_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_arm_neon_la-asm.Tpo $(DEPDIR)/libde265_arm_neon_la-asm.Plo +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS)source='asm.S' object='libde265_arm_neon_la-asm.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) $(CCASFLAGS) -c -o libde265_arm_neon_la-asm.lo `test -f 'asm.S' || echo '$(srcdir)/'`asm.S + +libde265_arm_neon_la-cpudetect.lo: cpudetect.S +@am__fastdepCCAS_TRUE@ $(AM_V_CPPAS)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) $(CCASFLAGS) -MT libde265_arm_neon_la-cpudetect.lo -MD -MP -MF $(DEPDIR)/libde265_arm_neon_la-cpudetect.Tpo -c -o libde265_arm_neon_la-cpudetect.lo `test -f 'cpudetect.S' || echo '$(srcdir)/'`cpudetect.S +@am__fastdepCCAS_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_arm_neon_la-cpudetect.Tpo $(DEPDIR)/libde265_arm_neon_la-cpudetect.Plo +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS)source='cpudetect.S' object='libde265_arm_neon_la-cpudetect.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) 
$(CCASFLAGS) -c -o libde265_arm_neon_la-cpudetect.lo `test -f 'cpudetect.S' || echo '$(srcdir)/'`cpudetect.S + +libde265_arm_neon_la-hevcdsp_qpel_neon.lo: hevcdsp_qpel_neon.S +@am__fastdepCCAS_TRUE@ $(AM_V_CPPAS)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) $(CCASFLAGS) -MT libde265_arm_neon_la-hevcdsp_qpel_neon.lo -MD -MP -MF $(DEPDIR)/libde265_arm_neon_la-hevcdsp_qpel_neon.Tpo -c -o libde265_arm_neon_la-hevcdsp_qpel_neon.lo `test -f 'hevcdsp_qpel_neon.S' || echo '$(srcdir)/'`hevcdsp_qpel_neon.S +@am__fastdepCCAS_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_arm_neon_la-hevcdsp_qpel_neon.Tpo $(DEPDIR)/libde265_arm_neon_la-hevcdsp_qpel_neon.Plo +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS)source='hevcdsp_qpel_neon.S' object='libde265_arm_neon_la-hevcdsp_qpel_neon.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) $(CCASFLAGS) -c -o libde265_arm_neon_la-hevcdsp_qpel_neon.lo `test -f 'hevcdsp_qpel_neon.S' || echo '$(srcdir)/'`hevcdsp_qpel_neon.S + +libde265_arm_neon_la-neon.lo: neon.S +@am__fastdepCCAS_TRUE@ $(AM_V_CPPAS)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) $(CCASFLAGS) -MT libde265_arm_neon_la-neon.lo -MD -MP -MF $(DEPDIR)/libde265_arm_neon_la-neon.Tpo -c -o libde265_arm_neon_la-neon.lo `test -f 'neon.S' || echo '$(srcdir)/'`neon.S +@am__fastdepCCAS_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_arm_neon_la-neon.Tpo $(DEPDIR)/libde265_arm_neon_la-neon.Plo 
+@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS)source='neon.S' object='libde265_arm_neon_la-neon.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@ DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCCAS_FALSE@ $(AM_V_CPPAS@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) $(CCASFLAGS) -c -o libde265_arm_neon_la-neon.lo `test -f 'neon.S' || echo '$(srcdir)/'`neon.S + +.cc.o: +@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ $< + +.cc.obj: +@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` +@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` + +.cc.lo: +@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LTCXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LTCXXCOMPILE) -c -o $@ $< + +libde265_arm_la-arm.lo: arm.cc 
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_arm_la-arm.lo -MD -MP -MF $(DEPDIR)/libde265_arm_la-arm.Tpo -c -o libde265_arm_la-arm.lo `test -f 'arm.cc' || echo '$(srcdir)/'`arm.cc +@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_arm_la-arm.Tpo $(DEPDIR)/libde265_arm_la-arm.Plo +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='arm.cc' object='libde265_arm_la-arm.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_arm_la-arm.lo `test -f 'arm.cc' || echo '$(srcdir)/'`arm.cc + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs + +ID: $(am__tagged_files) + $(am__define_uniq_tagged_files); mkid -fID $$unique +tags: tags-am +TAGS: tags + +tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + set x; \ + here=`pwd`; \ + $(am__define_uniq_tagged_files); \ + shift; \ + if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ + test -n "$$unique" || unique=$$empty_fix; \ + if test $$# -gt 0; then \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + "$$@" $$unique; \ + else \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$unique; \ + fi; \ + fi +ctags: ctags-am + +CTAGS: ctags +ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + $(am__define_uniq_tagged_files); \ + test -z "$(CTAGS_ARGS)$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && $(am__cd) $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) "$$here" 
+cscopelist: cscopelist-am + +cscopelist-am: $(am__tagged_files) + list='$(am__tagged_files)'; \ + case "$(srcdir)" in \ + [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \ + *) sdir=$(subdir)/$(srcdir) ;; \ + esac; \ + for i in $$list; do \ + if test -f "$$i"; then \ + echo "$(subdir)/$$i"; \ + else \ + echo "$$sdir/$$i"; \ + fi; \ + done >> $(top_builddir)/cscope.files + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags + +distdir: $(BUILT_SOURCES) + $(MAKE) $(AM_MAKEFLAGS) distdir-am + +distdir-am: $(DISTFILES) + @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + list='$(DISTFILES)'; \ + dist_files=`for file in $$list; do echo $$file; done | \ + sed -e "s|^$$srcdirstrip/||;t" \ + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ + case $$dist_files in \ + */*) $(MKDIR_P) `echo "$$dist_files" | \ + sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ + sort -u` ;; \ + esac; \ + for file in $$dist_files; do \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + if test -d $$d/$$file; then \ + dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test -d "$(distdir)/$$file"; then \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ + find "$(distdir)/$$file" -type d ! 
-perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ + else \ + test -f "$(distdir)/$$file" \ + || cp -p $$d/$$file "$(distdir)/$$file" \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +all-am: Makefile $(LTLIBRARIES) +installdirs: +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + if test -z '$(STRIP)'; then \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + install; \ + else \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ + fi +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." 
+clean: clean-am + +clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \ + mostlyclean-am + +distclean: distclean-am + -rm -f ./$(DEPDIR)/libde265_arm_la-arm.Plo + -rm -f ./$(DEPDIR)/libde265_arm_neon_la-asm.Plo + -rm -f ./$(DEPDIR)/libde265_arm_neon_la-cpudetect.Plo + -rm -f ./$(DEPDIR)/libde265_arm_neon_la-hevcdsp_qpel_neon.Plo + -rm -f ./$(DEPDIR)/libde265_arm_neon_la-neon.Plo + -rm -f Makefile +distclean-am: clean-am distclean-compile distclean-generic \ + distclean-tags + +dvi: dvi-am + +dvi-am: + +html: html-am + +html-am: + +info: info-am + +info-am: + +install-data-am: + +install-dvi: install-dvi-am + +install-dvi-am: + +install-exec-am: + +install-html: install-html-am + +install-html-am: + +install-info: install-info-am + +install-info-am: + +install-man: + +install-pdf: install-pdf-am + +install-pdf-am: + +install-ps: install-ps-am + +install-ps-am: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -f ./$(DEPDIR)/libde265_arm_la-arm.Plo + -rm -f ./$(DEPDIR)/libde265_arm_neon_la-asm.Plo + -rm -f ./$(DEPDIR)/libde265_arm_neon_la-cpudetect.Plo + -rm -f ./$(DEPDIR)/libde265_arm_neon_la-hevcdsp_qpel_neon.Plo + -rm -f ./$(DEPDIR)/libde265_arm_neon_la-neon.Plo + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-compile mostlyclean-generic \ + mostlyclean-libtool + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: + +.MAKE: install-am install-strip + +.PHONY: CTAGS GTAGS TAGS all all-am am--depfiles check check-am clean \ + clean-generic clean-libtool clean-noinstLTLIBRARIES \ + cscopelist-am ctags ctags-am distclean distclean-compile \ + distclean-generic distclean-libtool distclean-tags distdir dvi \ + dvi-am html html-am info info-am install install-am \ + install-data install-data-am install-dvi install-dvi-am \ + install-exec install-exec-am install-html install-html-am \ + install-info install-info-am install-man install-pdf \ + 
install-pdf-am install-ps install-ps-am install-strip \ + installcheck installcheck-am installdirs maintainer-clean \ + maintainer-clean-generic mostlyclean mostlyclean-compile \ + mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \ + tags tags-am uninstall uninstall-am + +.PRECIOUS: Makefile + + +@HAVE_VISIBILITY_TRUE@ libde265_arm_la_CXXFLAGS += -DHAVE_VISIBILITY + +@ENABLE_ARM_THUMB_TRUE@@ENABLE_NEON_OPT_TRUE@ libde265_arm_neon_la_CCASFLAGS += -DCONFIG_THUMB + +@ENABLE_NEON_OPT_TRUE@@HAVE_VISIBILITY_TRUE@ libde265_arm_neon_la_CXXFLAGS += -DHAVE_VISIBILITY + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/arm/arm.cc b/arm/arm.cc new file mode 100644 index 0000000..9791f15 --- /dev/null +++ b/arm/arm.cc @@ -0,0 +1,123 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2015 struktur AG, Joachim Bauch + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see <http://www.gnu.org/licenses/>.
+ */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "arm.h" + +#ifdef HAVE_NEON +/* QPEL_FUNC(name): declares the external ff_<name> routine and defines libde265_<name>, a wrapper that forwards to it, passing height before width as the ff_ prototype expects and ignoring mcbuffer. */ +#define QPEL_FUNC(name) \ + extern "C" void ff_##name(int16_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, \ + int height, int width); \ + void libde265_##name(int16_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, \ + int width, int height, int16_t* mcbuffer) { \ + ff_##name(dst, dststride, src, srcstride, height, width); \ + } + +QPEL_FUNC(hevc_put_qpel_v1_neon_8); +QPEL_FUNC(hevc_put_qpel_v2_neon_8); +QPEL_FUNC(hevc_put_qpel_v3_neon_8); +QPEL_FUNC(hevc_put_qpel_h1_neon_8); +QPEL_FUNC(hevc_put_qpel_h2_neon_8); +QPEL_FUNC(hevc_put_qpel_h3_neon_8); +QPEL_FUNC(hevc_put_qpel_h1v1_neon_8); +QPEL_FUNC(hevc_put_qpel_h1v2_neon_8); +QPEL_FUNC(hevc_put_qpel_h1v3_neon_8); +QPEL_FUNC(hevc_put_qpel_h2v1_neon_8); +QPEL_FUNC(hevc_put_qpel_h2v2_neon_8); +QPEL_FUNC(hevc_put_qpel_h2v3_neon_8); +QPEL_FUNC(hevc_put_qpel_h3v1_neon_8); +QPEL_FUNC(hevc_put_qpel_h3v2_neon_8); +QPEL_FUNC(hevc_put_qpel_h3v3_neon_8); +#undef QPEL_FUNC + +#if defined(HAVE_SIGNAL_H) && defined(HAVE_SETJMP_H) + +#include <signal.h> +#include <setjmp.h> +/* Probe routine with C linkage, defined outside this file; has_NEON() calls it while a SIGILL handler is installed. NOTE(review): presumably implemented in cpudetect.S - confirm. */ +extern "C" void libde265_detect_neon(void); +/* Jump target that sighandler() returns to when SIGILL is raised during the probe. */ +static jmp_buf jump_env; +/* SIGILL handler: abandons the probe by jumping back to the setjmp() call in has_NEON(). */ +static void sighandler(int sig) { + (void)sig; + longjmp(jump_env, 1); +} +/* Returns true if libde265_detect_neon() ran without raising SIGILL. The probe runs on the first call only; later calls return the cached result. */ +static bool has_NEON() { + static bool checked_NEON = false; + static bool have_NEON = false; + + if (!checked_NEON) { + void (*oldsignal)(int); + /* NOTE(review): longjmp() out of a signal handler may leave SIGILL blocked on some platforms; sigsetjmp()/siglongjmp() would restore the mask - confirm whether that matters here. */ + checked_NEON = true; + oldsignal = signal(SIGILL, sighandler); + if (setjmp(jump_env)) { + signal(SIGILL, oldsignal); + have_NEON = false; + return false; + } + libde265_detect_neon(); + signal(SIGILL, oldsignal); + have_NEON = true; + } + + return have_NEON; +} + +#else // #if defined(HAVE_SIGNAL_H) && defined(HAVE_SETJMP_H) + +#warning "Don't know how to detect NEON support at runtime- will be disabled" + +static bool has_NEON() { + return false; +} + +#endif + +#endif // #ifdef HAVE_NEON +/* Points accel->put_hevc_qpel_8[h][v] (indexed as in the h<h>v<v> function names; every entry except [0][0]) at the NEON wrappers when built with HAVE_NEON and has_NEON() returns true; otherwise accel is left unchanged. */ +void init_acceleration_functions_arm(struct
acceleration_functions* accel) +{ +#ifdef HAVE_NEON + if (has_NEON()) { + accel->put_hevc_qpel_8[0][1] = libde265_hevc_put_qpel_v1_neon_8; + accel->put_hevc_qpel_8[0][2] = libde265_hevc_put_qpel_v2_neon_8; + accel->put_hevc_qpel_8[0][3] = libde265_hevc_put_qpel_v3_neon_8; + accel->put_hevc_qpel_8[1][0] = libde265_hevc_put_qpel_h1_neon_8; + accel->put_hevc_qpel_8[1][1] = libde265_hevc_put_qpel_h1v1_neon_8; + accel->put_hevc_qpel_8[1][2] = libde265_hevc_put_qpel_h1v2_neon_8; + accel->put_hevc_qpel_8[1][3] = libde265_hevc_put_qpel_h1v3_neon_8; + accel->put_hevc_qpel_8[2][0] = libde265_hevc_put_qpel_h2_neon_8; + accel->put_hevc_qpel_8[2][1] = libde265_hevc_put_qpel_h2v1_neon_8; + accel->put_hevc_qpel_8[2][2] = libde265_hevc_put_qpel_h2v2_neon_8; + accel->put_hevc_qpel_8[2][3] = libde265_hevc_put_qpel_h2v3_neon_8; + accel->put_hevc_qpel_8[3][0] = libde265_hevc_put_qpel_h3_neon_8; + accel->put_hevc_qpel_8[3][1] = libde265_hevc_put_qpel_h3v1_neon_8; + accel->put_hevc_qpel_8[3][2] = libde265_hevc_put_qpel_h3v2_neon_8; + accel->put_hevc_qpel_8[3][3] = libde265_hevc_put_qpel_h3v3_neon_8; + } +#endif // #ifdef HAVE_NEON +} diff --git a/arm/arm.h b/arm/arm.h new file mode 100644 index 0000000..d64172a --- /dev/null +++ b/arm/arm.h @@ -0,0 +1,28 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2015 struktur AG, Joachim Bauch + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef LIBDE265_ARM_H
+#define LIBDE265_ARM_H
+
+#include "acceleration.h"
+
+void init_acceleration_functions_arm(struct acceleration_functions* accel);  // fills accel with ARM-specific kernels
+
+#endif // LIBDE265_ARM_H
diff --git a/arm/asm.S b/arm/asm.S
new file mode 100644
index 0000000..1d0e5a9
--- /dev/null
+++ b/arm/asm.S
@@ -0,0 +1,325 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#ifdef __ELF__ +# define ELF +#else +# define ELF @ +#endif + +#if CONFIG_THUMB +# define A @ +# define T +#else +# define A +# define T @ +#endif + +#if HAVE_AS_FUNC +# define FUNC +#else +# define FUNC @ +#endif + +#if HAVE_NEON + .arch armv7-a +#elif HAVE_ARMV6T2 + .arch armv6t2 +#elif HAVE_ARMV6 + .arch armv6 +#elif HAVE_ARMV5TE + .arch armv5te +#endif + +#if HAVE_NEON + .fpu neon +#elif HAVE_VFP + .fpu vfp +#endif + + .syntax unified +T .thumb +ELF .eabi_attribute 25, 1 @ Tag_ABI_align_preserved +ELF .section .note.GNU-stack,"",%progbits @ Mark stack as non-executable + +.macro function name, export=0, align=2 + .set .Lpic_idx, 0 + .set .Lpic_gp, 0 + .macro endfunc + .if .Lpic_idx + .align 2 + .altmacro + put_pic %(.Lpic_idx - 1) + .noaltmacro + .endif +ELF .size \name, . - \name +FUNC .endfunc + .purgem endfunc + .endm + .text + .align \align + .if \export + .global EXTERN_ASM\name +ELF .type EXTERN_ASM\name, %function +FUNC .func EXTERN_ASM\name +EXTERN_ASM\name: + .else +ELF .type \name, %function +FUNC .func \name +\name: + .endif +.endm + +.macro const name, align=2, relocate=0 + .macro endconst +ELF .size \name, . 
- \name + .purgem endconst + .endm +.if HAVE_SECTION_DATA_REL_RO && \relocate + .section .data.rel.ro +.else + .section .rodata +.endif + .align \align +\name: +.endm + +#if !HAVE_ARMV6T2_EXTERNAL +.macro movw rd, val + mov \rd, \val & 255 + orr \rd, \val & ~255 +.endm +#endif + +.macro mov32 rd, val +#if HAVE_ARMV6T2_EXTERNAL + movw \rd, #(\val) & 0xffff + .if (\val) >> 16 + movt \rd, #(\val) >> 16 + .endif +#else + ldr \rd, =\val +#endif +.endm + +.macro put_pic num + put_pic_\num +.endm + +.macro do_def_pic num, val, label + .macro put_pic_\num + .if \num + .altmacro + put_pic %(\num - 1) + .noaltmacro + .endif +\label: .word \val + .purgem put_pic_\num + .endm +.endm + +.macro def_pic val, label + .altmacro + do_def_pic %.Lpic_idx, \val, \label + .noaltmacro + .set .Lpic_idx, .Lpic_idx + 1 +.endm + +.macro ldpic rd, val, indir=0 + ldr \rd, .Lpicoff\@ +.Lpic\@: + .if \indir +A ldr \rd, [pc, \rd] +T add \rd, pc +T ldr \rd, [\rd] + .else + add \rd, pc + .endif + def_pic \val - (.Lpic\@ + (8 >> CONFIG_THUMB)), .Lpicoff\@ +.endm + +.macro movrel rd, val +#if CONFIG_PIC + ldpic \rd, \val +#elif HAVE_ARMV6T2_EXTERNAL && !defined(__APPLE__) + movw \rd, #:lower16:\val + movt \rd, #:upper16:\val +#else + ldr \rd, =\val +#endif +.endm + +.macro movrelx rd, val, gp +#if CONFIG_PIC && defined(__ELF__) + .ifnb \gp + .if .Lpic_gp + .unreq gp + .endif + gp .req \gp + ldpic gp, _GLOBAL_OFFSET_TABLE_ + .elseif !.Lpic_gp + gp .req r12 + ldpic gp, _GLOBAL_OFFSET_TABLE_ + .endif + .set .Lpic_gp, 1 + ldr \rd, .Lpicoff\@ + ldr \rd, [gp, \rd] + def_pic \val(GOT), .Lpicoff\@ +#elif CONFIG_PIC && defined(__APPLE__) + ldpic \rd, .Lpic\@, indir=1 + .non_lazy_symbol_pointer +.Lpic\@: + .indirect_symbol \val + .word 0 + .text +#else + movrel \rd, \val +#endif +.endm + +.macro add_sh rd, rn, rm, sh:vararg +A add \rd, \rn, \rm, \sh +T mov \rm, \rm, \sh +T add \rd, \rn, \rm +.endm + +.macro ldr_pre rt, rn, rm:vararg +A ldr \rt, [\rn, \rm]! 
+T add \rn, \rn, \rm +T ldr \rt, [\rn] +.endm + +.macro ldr_dpre rt, rn, rm:vararg +A ldr \rt, [\rn, -\rm]! +T sub \rn, \rn, \rm +T ldr \rt, [\rn] +.endm + +.macro ldr_nreg rt, rn, rm:vararg +A ldr \rt, [\rn, -\rm] +T sub \rt, \rn, \rm +T ldr \rt, [\rt] +.endm + +.macro ldr_post rt, rn, rm:vararg +A ldr \rt, [\rn], \rm +T ldr \rt, [\rn] +T add \rn, \rn, \rm +.endm + +.macro ldrc_pre cc, rt, rn, rm:vararg +A ldr\cc \rt, [\rn, \rm]! +T itt \cc +T add\cc \rn, \rn, \rm +T ldr\cc \rt, [\rn] +.endm + +.macro ldrd_reg rt, rt2, rn, rm +A ldrd \rt, \rt2, [\rn, \rm] +T add \rt, \rn, \rm +T ldrd \rt, \rt2, [\rt] +.endm + +.macro ldrd_post rt, rt2, rn, rm +A ldrd \rt, \rt2, [\rn], \rm +T ldrd \rt, \rt2, [\rn] +T add \rn, \rn, \rm +.endm + +.macro ldrh_pre rt, rn, rm +A ldrh \rt, [\rn, \rm]! +T add \rn, \rn, \rm +T ldrh \rt, [\rn] +.endm + +.macro ldrh_dpre rt, rn, rm +A ldrh \rt, [\rn, -\rm]! +T sub \rn, \rn, \rm +T ldrh \rt, [\rn] +.endm + +.macro ldrh_post rt, rn, rm +A ldrh \rt, [\rn], \rm +T ldrh \rt, [\rn] +T add \rn, \rn, \rm +.endm + +.macro ldrb_post rt, rn, rm +A ldrb \rt, [\rn], \rm +T ldrb \rt, [\rn] +T add \rn, \rn, \rm +.endm + +.macro str_post rt, rn, rm:vararg +A str \rt, [\rn], \rm +T str \rt, [\rn] +T add \rn, \rn, \rm +.endm + +.macro strb_post rt, rn, rm:vararg +A strb \rt, [\rn], \rm +T strb \rt, [\rn] +T add \rn, \rn, \rm +.endm + +.macro strd_post rt, rt2, rn, rm +A strd \rt, \rt2, [\rn], \rm +T strd \rt, \rt2, [\rn] +T add \rn, \rn, \rm +.endm + +.macro strh_pre rt, rn, rm +A strh \rt, [\rn, \rm]! +T add \rn, \rn, \rm +T strh \rt, [\rn] +.endm + +.macro strh_dpre rt, rn, rm +A strh \rt, [\rn, -\rm]! 
+T sub \rn, \rn, \rm +T strh \rt, [\rn] +.endm + +.macro strh_post rt, rn, rm +A strh \rt, [\rn], \rm +T strh \rt, [\rn] +T add \rn, \rn, \rm +.endm + +.macro strh_dpost rt, rn, rm +A strh \rt, [\rn], -\rm +T strh \rt, [\rn] +T sub \rn, \rn, \rm +.endm + +#if HAVE_VFP_ARGS +ELF .eabi_attribute 28, 1 +# define VFP +# define NOVFP @ +#else +# define VFP @ +# define NOVFP +#endif + +#define GLUE(a, b) a ## b +#define JOIN(a, b) GLUE(a, b) +#define X(s) JOIN(EXTERN_ASM, s) diff --git a/arm/cpudetect.S b/arm/cpudetect.S new file mode 100644 index 0000000..45600a8 --- /dev/null +++ b/arm/cpudetect.S @@ -0,0 +1,29 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2015 struktur AG, Joachim Bauch + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "asm.S" +#include "neon.S" + +// we execute a simple NEON instruction and check if SIGILL is triggered to +// detect if the CPU support NEON code +function libde265_detect_neon, export=1 + vand q0, q0, q0 + bx lr +endfunc diff --git a/arm/hevcdsp_qpel_neon.S b/arm/hevcdsp_qpel_neon.S new file mode 100644 index 0000000..4e438a9 --- /dev/null +++ b/arm/hevcdsp_qpel_neon.S @@ -0,0 +1,1004 @@ +/* + * Copyright (c) 2014 - 2015 Seppo Tomperi + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * This is commit 63ca0fe8288dbd300c9bb814cb671e5d889f691c from + * https://github.com/FFmpeg/FFmpeg/blob/master/libavcodec/arm/hevcdsp_qpel_neon.S + */ + +#include "asm.S" +#include "neon.S" + +#define MAX_PB_SIZE #64 + +.macro regshuffle_d8 + vmov d16, d17 + vmov d17, d18 + vmov d18, d19 + vmov d19, d20 + vmov d20, d21 + vmov d21, d22 + vmov d22, d23 +.endm + +.macro regshuffle_q8 + vmov q0, q1 + vmov q1, q2 + vmov q2, q3 + vmov q3, q4 + vmov q4, q5 + vmov q5, q6 + vmov q6, q7 +.endm + +.macro vextin8 + pld [r2] + vld1.8 {q11}, [r2], r3 + vext.8 d16, d22, d23, #1 + vext.8 d17, d22, d23, #2 + vext.8 d18, d22, d23, #3 + vext.8 d19, d22, d23, #4 + vext.8 d20, d22, d23, #5 + vext.8 d21, d22, d23, #6 + vext.8 d22, d22, d23, #7 +.endm + +.macro loadin8 + pld [r2] + vld1.8 {d16}, [r2], r3 + pld [r2] + vld1.8 {d17}, [r2], r3 + pld [r2] + vld1.8 {d18}, [r2], r3 + pld [r2] + vld1.8 {d19}, [r2], r3 + pld [r2] + vld1.8 {d20}, [r2], r3 + pld [r2] + vld1.8 {d21}, [r2], r3 + pld [r2] + vld1.8 {d22}, [r2], r3 + pld [r2] + vld1.8 {d23}, [r2], r3 +.endm + +.macro qpel_filter_1_32b + vmov.i16 d16, #58 + vmov.i16 d17, #10 + vmull.s16 q9, d6, d16 // 58 * d0 + vmull.s16 q10, d7, d16 // 58 * d1 + vmov.i16 d16, #17 + vmull.s16 q11, d4, d17 // 10 * c0 + 
vmull.s16 q12, d5, d17 // 10 * c1 + vmov.i16 d17, #5 + vmull.s16 q13, d8, d16 // 17 * e0 + vmull.s16 q14, d9, d16 // 17 * e1 + vmull.s16 q15, d10, d17 // 5 * f0 + vmull.s16 q8, d11, d17 // 5 * f1 + vsub.s32 q9, q11 // 58 * d0 - 10 * c0 + vsub.s32 q10, q12 // 58 * d1 - 10 * c1 + vshll.s16 q11, d2, #2 // 4 * b0 + vshll.s16 q12, d3, #2 // 4 * b1 + vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 * e0 + vadd.s32 q10, q14 // 58 * d1 - 10 * c1 + 17 * e1 + vsubl.s16 q13, d12, d0 // g0 - a0 + vsubl.s16 q14, d13, d1 // g1 - a1 + vadd.s32 q9, q11 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + vadd.s32 q10, q12 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + vsub.s32 q13, q15 // g0 - a0 - 5 * f0 + vsub.s32 q14, q8 // g1 - a1 - 5 * f1 + vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + g0 - a0 - 5 * f0 + vadd.s32 q10, q14 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + g1 - a1 - 5 * f1 + vqshrn.s32 d16, q9, #6 + vqshrn.s32 d17, q10, #6 +.endm + +// input q0 - q7 +// output q8 +.macro qpel_filter_2_32b + vmov.i32 q8, #11 + vaddl.s16 q9, d6, d8 // d0 + e0 + vaddl.s16 q10, d7, d9 // d1 + e1 + vaddl.s16 q11, d4, d10 // c0 + f0 + vaddl.s16 q12, d5, d11 // c1 + f1 + vmul.s32 q11, q8 // 11 * (c0 + f0) + vmul.s32 q12, q8 // 11 * (c1 + f1) + vmov.i32 q8, #40 + vaddl.s16 q15, d2, d12 // b0 + g0 + vmul.s32 q9, q8 // 40 * (d0 + e0) + vmul.s32 q10, q8 // 40 * (d1 + e1) + vaddl.s16 q8, d3, d13 // b1 + g1 + vaddl.s16 q13, d0, d14 // a0 + h0 + vaddl.s16 q14, d1, d15 // a1 + h1 + vshl.s32 q15, #2 // 4*(b0+g0) + vshl.s32 q8, #2 // 4*(b1+g1) + vadd.s32 q11, q13 // 11 * (c0 + f0) + a0 + h0 + vadd.s32 q12, q14 // 11 * (c1 + f1) + a1 + h1 + vadd.s32 q9, q15 // 40 * (d0 + e0) + 4*(b0+g0) + vadd.s32 q10, q8 // 40 * (d1 + e1) + 4*(b1+g1) + vsub.s32 q9, q11 // 40 * (d0 + e0) + 4*(b0+g0) - (11 * (c0 + f0) + a0 + h0) + vsub.s32 q10, q12 // 40 * (d1 + e1) + 4*(b1+g1) - (11 * (c1 + f1) + a1 + h1) + vqshrn.s32 d16, q9, #6 + vqshrn.s32 d17, q10, #6 +.endm + +.macro qpel_filter_3_32b + vmov.i16 d16, #58 + vmov.i16 d17, #10 
+ vmull.s16 q9, d8, d16 // 58 * d0 + vmull.s16 q10, d9, d16 // 58 * d1 + vmov.i16 d16, #17 + vmull.s16 q11, d10, d17 // 10 * c0 + vmull.s16 q12, d11, d17 // 10 * c1 + vmov.i16 d17, #5 + vmull.s16 q13, d6, d16 // 17 * e0 + vmull.s16 q14, d7, d16 // 17 * e1 + vmull.s16 q15, d4, d17 // 5 * f0 + vmull.s16 q8, d5, d17 // 5 * f1 + vsub.s32 q9, q11 // 58 * d0 - 10 * c0 + vsub.s32 q10, q12 // 58 * d1 - 10 * c1 + vshll.s16 q11, d12, #2 // 4 * b0 + vshll.s16 q12, d13, #2 // 4 * b1 + vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 * e0 + vadd.s32 q10, q14 // 58 * d1 - 10 * c1 + 17 * e1 + vsubl.s16 q13, d2, d14 // g0 - a0 + vsubl.s16 q14, d3, d15 // g1 - a1 + vadd.s32 q9, q11 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + vadd.s32 q10, q12 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + vsub.s32 q13, q15 // g0 - a0 - 5 * f0 + vsub.s32 q14, q8 // g1 - a1 - 5 * f1 + vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + g0 - a0 - 5 * f0 + vadd.s32 q10, q14 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + g1 - a1 - 5 * f1 + vqshrn.s32 d16, q9, #6 + vqshrn.s32 d17, q10, #6 +.endm + +.macro qpel_filter_1 out=q7 + vmov.u8 d24, #58 + vmov.u8 d25, #10 + vshll.u8 q13, d20, #4 // 16*e + vshll.u8 q14, d21, #2 // 4*f + vmull.u8 \out, d19, d24 // 58*d + vaddw.u8 q13, q13, d20 // 17*e + vmull.u8 q15, d18, d25 // 10*c + vaddw.u8 q14, q14, d21 // 5*f + vsubl.u8 q12, d22, d16 // g - a + vadd.u16 \out, q13 // 58d + 17e + vshll.u8 q13, d17, #2 // 4*b + vadd.u16 q15, q14 // 10*c + 5*f + vadd.s16 q13, q12 // - a + 4*b + g + vsub.s16 \out, q15 // -10*c + 58*d + 17*e -5*f + vadd.s16 \out, q13 // -a + 4*b -10*c + 58*d + 17*e -5*f +.endm + +.macro qpel_filter_2 out=q7 + vmov.i16 q12, #10 + vmov.i16 q14, #11 + vaddl.u8 q13, d19, d20 // d + e + vaddl.u8 q15, d18, d21 // c + f + vmul.u16 q13, q12 // 10 * (d+e) + vmul.u16 q15, q14 // 11 * ( c + f) + vaddl.u8 \out, d17, d22 // b + g + vaddl.u8 q12, d16, d23 // a + h + vadd.u16 \out, q13 // b + 10 * (d + e) + g + vadd.s16 q12, q15 + vshl.u16 \out, #2 // 4 * (b + 10 * (d + e) 
+ g) + vsub.s16 \out, q12 +.endm + +.macro qpel_filter_3 out=q7 + vmov.u8 d24, #58 + vmov.u8 d25, #10 + vshll.u8 q13, d19, #4 // 16*e + vshll.u8 q14, d18, #2 // 4*f + vmull.u8 \out, d20, d24 // 58*d + vaddw.u8 q13, q13, d19 // 17*e + vmull.u8 q15, d21, d25 // 10*c + vaddw.u8 q14, q14, d18 // 5*f + vsubl.u8 q12, d17, d23 // g - a + vadd.u16 \out, q13 // 58d + 17e + vshll.u8 q13, d22, #2 // 4*b + vadd.u16 q15, q14 // 10*c + 5*f + vadd.s16 q13, q12 // - a + 4*b + g + vsub.s16 \out, q15 // -10*c + 58*d + 17*e -5*f + vadd.s16 \out, q13 // -a + 4*b -10*c + 58*d + 17*e -5*f +.endm + +.macro hevc_put_qpel_vX_neon_8 filter + push {r4, r5, r6, r7} + ldr r4, [sp, #16] // height + ldr r5, [sp, #20] // width + vpush {d8-d15} + sub r2, r2, r3, lsl #1 + sub r2, r3 + mov r12, r4 + mov r6, r0 + mov r7, r2 + lsl r1, #1 +0: loadin8 + cmp r5, #4 + beq 4f +8: subs r4, #1 + \filter + vst1.16 {q7}, [r0], r1 + regshuffle_d8 + vld1.8 {d23}, [r2], r3 + bne 8b + subs r5, #8 + beq 99f + mov r4, r12 + add r6, #16 + mov r0, r6 + add r7, #8 + mov r2, r7 + b 0b +4: subs r4, #1 + \filter + vst1.16 d14, [r0], r1 + regshuffle_d8 + vld1.32 {d23[0]}, [r2], r3 + bne 4b +99: vpop {d8-d15} + pop {r4, r5, r6, r7} + bx lr +.endm + +.macro hevc_put_qpel_uw_vX_neon_8 filter + push {r4-r10} + ldr r5, [sp, #28] // width + ldr r4, [sp, #32] // height + ldr r8, [sp, #36] // src2 + ldr r9, [sp, #40] // src2stride + vpush {d8-d15} + sub r2, r2, r3, lsl #1 + sub r2, r3 + mov r12, r4 + mov r6, r0 + mov r7, r2 + cmp r8, #0 + bne .Lbi\@ +0: loadin8 + cmp r5, #4 + beq 4f +8: subs r4, #1 + \filter + vqrshrun.s16 d0, q7, #6 + vst1.8 d0, [r0], r1 + regshuffle_d8 + vld1.8 {d23}, [r2], r3 + bne 8b + subs r5, #8 + beq 99f + mov r4, r12 + add r6, #8 + mov r0, r6 + add r7, #8 + mov r2, r7 + b 0b +4: subs r4, #1 + \filter + vqrshrun.s16 d0, q7, #6 + vst1.32 d0[0], [r0], r1 + regshuffle_d8 + vld1.32 {d23[0]}, [r2], r3 + bne 4b + b 99f +.Lbi\@: lsl r9, #1 + mov r10, r8 +0: loadin8 + cmp r5, #4 + beq 4f +8: subs r4, #1 + \filter + 
vld1.16 {q0}, [r8], r9 + vqadd.s16 q0, q7 + vqrshrun.s16 d0, q0, #7 + vst1.8 d0, [r0], r1 + regshuffle_d8 + vld1.8 {d23}, [r2], r3 + bne 8b + subs r5, #8 + beq 99f + mov r4, r12 + add r6, #8 + mov r0, r6 + add r10, #16 + mov r8, r10 + add r7, #8 + mov r2, r7 + b 0b +4: subs r4, #1 + \filter + vld1.16 d0, [r8], r9 + vqadd.s16 d0, d14 + vqrshrun.s16 d0, q0, #7 + vst1.32 d0[0], [r0], r1 + regshuffle_d8 + vld1.32 {d23[0]}, [r2], r3 + bne 4b +99: vpop {d8-d15} + pop {r4-r10} + bx lr +.endm + +function ff_hevc_put_qpel_v1_neon_8, export=1 + hevc_put_qpel_vX_neon_8 qpel_filter_1 +endfunc + +function ff_hevc_put_qpel_v2_neon_8, export=1 + hevc_put_qpel_vX_neon_8 qpel_filter_2 +endfunc + +function ff_hevc_put_qpel_v3_neon_8, export=1 + hevc_put_qpel_vX_neon_8 qpel_filter_3 +endfunc + + +function ff_hevc_put_qpel_uw_v1_neon_8, export=1 + hevc_put_qpel_uw_vX_neon_8 qpel_filter_1 +endfunc + +function ff_hevc_put_qpel_uw_v2_neon_8, export=1 + hevc_put_qpel_uw_vX_neon_8 qpel_filter_2 +endfunc + +function ff_hevc_put_qpel_uw_v3_neon_8, export=1 + hevc_put_qpel_uw_vX_neon_8 qpel_filter_3 +endfunc + +.macro hevc_put_qpel_hX_neon_8 filter + push {r4, r5, r6, r7} + ldr r4, [sp, #16] // height + ldr r5, [sp, #20] // width + + vpush {d8-d15} + sub r2, #4 + lsl r1, #1 + mov r12, r4 + mov r6, r0 + mov r7, r2 + cmp r5, #4 + beq 4f +8: subs r4, #1 + vextin8 + \filter + vst1.16 {q7}, [r0], r1 + bne 8b + subs r5, #8 + beq 99f + mov r4, r12 + add r6, #16 + mov r0, r6 + add r7, #8 + mov r2, r7 + cmp r5, #4 + bne 8b +4: subs r4, #1 + vextin8 + \filter + vst1.16 d14, [r0], r1 + bne 4b +99: vpop {d8-d15} + pop {r4, r5, r6, r7} + bx lr +.endm + +.macro hevc_put_qpel_uw_hX_neon_8 filter + push {r4-r10} + ldr r5, [sp, #28] // width + ldr r4, [sp, #32] // height + ldr r8, [sp, #36] // src2 + ldr r9, [sp, #40] // src2stride + vpush {d8-d15} + sub r2, #4 + mov r12, r4 + mov r6, r0 + mov r7, r2 + cmp r8, #0 + bne .Lbi\@ + cmp r5, #4 + beq 4f +8: subs r4, #1 + vextin8 + \filter + vqrshrun.s16 d0, q7, #6 
+ vst1.8 d0, [r0], r1 + bne 8b + subs r5, #8 + beq 99f + mov r4, r12 + add r6, #8 + mov r0, r6 + add r7, #8 + mov r2, r7 + cmp r5, #4 + bne 8b +4: subs r4, #1 + vextin8 + \filter + vqrshrun.s16 d0, q7, #6 + vst1.32 d0[0], [r0], r1 + bne 4b + b 99f +.Lbi\@: + lsl r9, #1 + cmp r5, #4 + beq 4f + mov r10, r8 +8: subs r4, #1 + vextin8 + \filter + vld1.16 {q0}, [r8], r9 + vqadd.s16 q0, q7 + vqrshrun.s16 d0, q0, #7 + vst1.8 d0, [r0], r1 + bne 8b + subs r5, #8 + beq 99f + mov r4, r12 + add r6, #8 + add r10, #16 + mov r8, r10 + mov r0, r6 + add r7, #8 + mov r2, r7 + cmp r5, #4 + bne 8b +4: subs r4, #1 + vextin8 + \filter + vld1.16 d0, [r8], r9 + vqadd.s16 d0, d14 + vqrshrun.s16 d0, q0, #7 + vst1.32 d0[0], [r0], r1 + bne 4b +99: vpop {d8-d15} + pop {r4-r10} + bx lr +.endm + +function ff_hevc_put_qpel_h1_neon_8, export=1 + hevc_put_qpel_hX_neon_8 qpel_filter_1 +endfunc + +function ff_hevc_put_qpel_h2_neon_8, export=1 + hevc_put_qpel_hX_neon_8 qpel_filter_2 +endfunc + +function ff_hevc_put_qpel_h3_neon_8, export=1 + hevc_put_qpel_hX_neon_8 qpel_filter_3 +endfunc + + +function ff_hevc_put_qpel_uw_h1_neon_8, export=1 + hevc_put_qpel_uw_hX_neon_8 qpel_filter_1 +endfunc + +function ff_hevc_put_qpel_uw_h2_neon_8, export=1 + hevc_put_qpel_uw_hX_neon_8 qpel_filter_2 +endfunc + +function ff_hevc_put_qpel_uw_h3_neon_8, export=1 + hevc_put_qpel_uw_hX_neon_8 qpel_filter_3 +endfunc + +.macro hevc_put_qpel_hXvY_neon_8 filterh filterv + push {r4, r5, r6, r7} + ldr r4, [sp, #16] // height + ldr r5, [sp, #20] // width + + vpush {d8-d15} + sub r2, #4 + sub r2, r2, r3, lsl #1 + sub r2, r3 // extra_before 3 + lsl r1, #1 + mov r12, r4 + mov r6, r0 + mov r7, r2 +0: vextin8 + \filterh q0 + vextin8 + \filterh q1 + vextin8 + \filterh q2 + vextin8 + \filterh q3 + vextin8 + \filterh q4 + vextin8 + \filterh q5 + vextin8 + \filterh q6 + vextin8 + \filterh q7 + cmp r5, #4 + beq 4f +8: subs r4, #1 + \filterv + vst1.16 {q8}, [r0], r1 + regshuffle_q8 + vextin8 + \filterh q7 + bne 8b + subs r5, #8 + beq 99f + 
mov r4, r12 + add r6, #16 + mov r0, r6 + add r7, #8 + mov r2, r7 + b 0b +4: subs r4, #1 + \filterv + vst1.16 d16, [r0], r1 + regshuffle_q8 + vextin8 + \filterh q7 + bne 4b +99: vpop {d8-d15} + pop {r4, r5, r6, r7} + bx lr +.endm + +.macro hevc_put_qpel_uw_hXvY_neon_8 filterh filterv + push {r4-r10} + ldr r5, [sp, #28] // width + ldr r4, [sp, #32] // height + ldr r8, [sp, #36] // src2 + ldr r9, [sp, #40] // src2stride + vpush {d8-d15} + sub r2, #4 + sub r2, r2, r3, lsl #1 + sub r2, r3 // extra_before 3 + mov r12, r4 + mov r6, r0 + mov r7, r2 + cmp r8, #0 + bne .Lbi\@ +0: vextin8 + \filterh q0 + vextin8 + \filterh q1 + vextin8 + \filterh q2 + vextin8 + \filterh q3 + vextin8 + \filterh q4 + vextin8 + \filterh q5 + vextin8 + \filterh q6 + vextin8 + \filterh q7 + cmp r5, #4 + beq 4f +8: subs r4, #1 + \filterv + vqrshrun.s16 d0, q8, #6 + vst1.8 d0, [r0], r1 + regshuffle_q8 + vextin8 + \filterh q7 + bne 8b + subs r5, #8 + beq 99f + mov r4, r12 + add r6, #8 + mov r0, r6 + add r7, #8 + mov r2, r7 + b 0b +4: subs r4, #1 + \filterv + vqrshrun.s16 d0, q8, #6 + vst1.32 d0[0], [r0], r1 + regshuffle_q8 + vextin8 + \filterh q7 + bne 4b + b 99f +.Lbi\@: lsl r9, #1 + mov r10, r8 +0: vextin8 + \filterh q0 + vextin8 + \filterh q1 + vextin8 + \filterh q2 + vextin8 + \filterh q3 + vextin8 + \filterh q4 + vextin8 + \filterh q5 + vextin8 + \filterh q6 + vextin8 + \filterh q7 + cmp r5, #4 + beq 4f +8: subs r4, #1 + \filterv + vld1.16 {q0}, [r8], r9 + vqadd.s16 q0, q8 + vqrshrun.s16 d0, q0, #7 + vst1.8 d0, [r0], r1 + regshuffle_q8 + vextin8 + \filterh q7 + bne 8b + subs r5, #8 + beq 99f + mov r4, r12 + add r6, #8 + mov r0, r6 + add r10, #16 + mov r8, r10 + add r7, #8 + mov r2, r7 + b 0b +4: subs r4, #1 + \filterv + vld1.16 d0, [r8], r9 + vqadd.s16 d0, d16 + vqrshrun.s16 d0, q0, #7 + vst1.32 d0[0], [r0], r1 + regshuffle_q8 + vextin8 + \filterh q7 + bne 4b +99: vpop {d8-d15} + pop {r4-r10} + bx lr +.endm + + +function ff_hevc_put_qpel_h1v1_neon_8, export=1 + hevc_put_qpel_hXvY_neon_8 
qpel_filter_1 qpel_filter_1_32b +endfunc + +function ff_hevc_put_qpel_h2v1_neon_8, export=1 + hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_1_32b +endfunc + +function ff_hevc_put_qpel_h3v1_neon_8, export=1 + hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_1_32b +endfunc + +function ff_hevc_put_qpel_h1v2_neon_8, export=1 + hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_2_32b +endfunc + +function ff_hevc_put_qpel_h2v2_neon_8, export=1 + hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_2_32b +endfunc + +function ff_hevc_put_qpel_h3v2_neon_8, export=1 + hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_2_32b +endfunc + +function ff_hevc_put_qpel_h1v3_neon_8, export=1 + hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_3_32b +endfunc + +function ff_hevc_put_qpel_h2v3_neon_8, export=1 + hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_3_32b +endfunc + +function ff_hevc_put_qpel_h3v3_neon_8, export=1 + hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_3_32b +endfunc + + +function ff_hevc_put_qpel_uw_h1v1_neon_8, export=1 + hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_1_32b +endfunc + +function ff_hevc_put_qpel_uw_h2v1_neon_8, export=1 + hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_1_32b +endfunc + +function ff_hevc_put_qpel_uw_h3v1_neon_8, export=1 + hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_1_32b +endfunc + +function ff_hevc_put_qpel_uw_h1v2_neon_8, export=1 + hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_2_32b +endfunc + +function ff_hevc_put_qpel_uw_h2v2_neon_8, export=1 + hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_2_32b +endfunc + +function ff_hevc_put_qpel_uw_h3v2_neon_8, export=1 + hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_2_32b +endfunc + +function ff_hevc_put_qpel_uw_h1v3_neon_8, export=1 + hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_3_32b +endfunc + +function ff_hevc_put_qpel_uw_h2v3_neon_8, export=1 + hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 
qpel_filter_3_32b +endfunc + +function ff_hevc_put_qpel_uw_h3v3_neon_8, export=1 + hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_3_32b +endfunc + +.macro init_put_pixels + pld [r1] + pld [r1, r2] + mov r12, MAX_PB_SIZE + lsl r12, #1 +.endm + +function ff_hevc_put_pixels_w2_neon_8, export=1 + init_put_pixels + vmov.u8 d5, #255 + vshr.u64 d5, #32 +0: subs r3, #1 + vld1.32 {d0[0]}, [r1], r2 + pld [r1] + vld1.32 d6, [r0] + vshll.u8 q0, d0, #6 + vbit d6, d0, d5 + vst1.32 d6, [r0], r12 + bne 0b + bx lr +endfunc + +function ff_hevc_put_pixels_w4_neon_8, export=1 + init_put_pixels +0: subs r3, #2 + vld1.32 {d0[0]}, [r1], r2 + vld1.32 {d0[1]}, [r1], r2 + pld [r1] + pld [r1, r2] + vshll.u8 q0, d0, #6 + vst1.64 {d0}, [r0], r12 + vst1.64 {d1}, [r0], r12 + bne 0b + bx lr +endfunc + +function ff_hevc_put_pixels_w6_neon_8, export=1 + init_put_pixels + vmov.u8 q10, #255 + vshr.u64 d21, #32 +0: subs r3, #1 + vld1.16 {d0}, [r1], r2 + pld [r1] + vshll.u8 q0, d0, #6 + vld1.8 {q12}, [r0] + vbit q12, q0, q10 + vst1.8 {q12}, [r0], r12 + bne 0b + bx lr +endfunc + +function ff_hevc_put_pixels_w8_neon_8, export=1 + init_put_pixels +0: subs r3, #2 + vld1.8 {d0}, [r1], r2 + vld1.8 {d2}, [r1], r2 + pld [r1] + pld [r1, r2] + vshll.u8 q0, d0, #6 + vshll.u8 q1, d2, #6 + vst1.16 {q0}, [r0], r12 + vst1.16 {q1}, [r0], r12 + bne 0b + bx lr +endfunc + +function ff_hevc_put_pixels_w12_neon_8, export=1 + init_put_pixels +0: subs r3, #2 + vld1.64 {d0}, [r1] + add r1, #8 + vld1.32 {d1[0]}, [r1], r2 + sub r1, #8 + vld1.64 {d2}, [r1] + add r1, #8 + vld1.32 {d1[1]}, [r1], r2 + sub r1, #8 + pld [r1] + pld [r1, r2] + vshll.u8 q8, d0, #6 + vshll.u8 q9, d1, #6 + vshll.u8 q10, d2, #6 + vmov d22, d19 + vst1.64 {d16, d17, d18}, [r0], r12 + vst1.64 {d20, d21, d22}, [r0], r12 + bne 0b + bx lr +endfunc + +function ff_hevc_put_pixels_w16_neon_8, export=1 + init_put_pixels +0: subs r3, #2 + vld1.8 {q0}, [r1], r2 + vld1.8 {q1}, [r1], r2 + pld [r1] + pld [r1, r2] + vshll.u8 q8, d0, #6 + vshll.u8 q9, d1, #6 + 
vshll.u8 q10, d2, #6 + vshll.u8 q11, d3, #6 + vst1.8 {q8, q9}, [r0], r12 + vst1.8 {q10, q11}, [r0], r12 + bne 0b + bx lr +endfunc + +function ff_hevc_put_pixels_w24_neon_8, export=1 + init_put_pixels +0: subs r3, #1 + vld1.8 {d0, d1, d2}, [r1], r2 + pld [r1] + vshll.u8 q10, d0, #6 + vshll.u8 q11, d1, #6 + vshll.u8 q12, d2, #6 + vstm r0, {q10, q11, q12} + add r0, r12 + bne 0b + bx lr +endfunc + +function ff_hevc_put_pixels_w32_neon_8, export=1 + init_put_pixels +0: subs r3, #1 + vld1.8 {q0, q1}, [r1], r2 + pld [r1] + vshll.u8 q8, d0, #6 + vshll.u8 q9, d1, #6 + vshll.u8 q10, d2, #6 + vshll.u8 q11, d3, #6 + vstm r0, {q8, q9, q10, q11} + add r0, r12 + bne 0b + bx lr +endfunc + +function ff_hevc_put_pixels_w48_neon_8, export=1 + init_put_pixels +0: subs r3, #1 + vld1.8 {q0, q1}, [r1] + add r1, #32 + vld1.8 {q2}, [r1], r2 + sub r1, #32 + pld [r1] + vshll.u8 q8, d0, #6 + vshll.u8 q9, d1, #6 + vshll.u8 q10, d2, #6 + vshll.u8 q11, d3, #6 + vshll.u8 q12, d4, #6 + vshll.u8 q13, d5, #6 + vstm r0, {q8, q9, q10, q11, q12, q13} + add r0, r12 + bne 0b + bx lr +endfunc + +function ff_hevc_put_pixels_w64_neon_8, export=1 + init_put_pixels +0: subs r3, #1 + vld1.8 {q0, q1}, [r1] + add r1, #32 + vld1.8 {q2, q3}, [r1], r2 + sub r1, #32 + pld [r1] + vshll.u8 q8, d0, #6 + vshll.u8 q9, d1, #6 + vshll.u8 q10, d2, #6 + vshll.u8 q11, d3, #6 + vshll.u8 q12, d4, #6 + vshll.u8 q13, d5, #6 + vshll.u8 q14, d6, #6 + vshll.u8 q15, d7, #6 + vstm r0, {q8, q9, q10, q11, q12, q13, q14, q15} + add r0, r12 + bne 0b + bx lr +endfunc + +function ff_hevc_put_qpel_uw_pixels_neon_8, export=1 + push {r4-r9} + ldr r5, [sp, #24] // width + ldr r4, [sp, #28] // height + ldr r8, [sp, #32] // src2 + ldr r9, [sp, #36] // src2stride + vpush {d8-d15} + cmp r8, #0 + bne 2f +1: subs r4, #1 + vld1.8 {d0}, [r2], r3 + vst1.8 d0, [r0], r1 + bne 1b + vpop {d8-d15} + pop {r4-r9} + bx lr +2: subs r4, #1 + vld1.8 {d0}, [r2], r3 + vld1.16 {q1}, [r8], r9 + vshll.u8 q0, d0, #6 + vqadd.s16 q0, q1 + vqrshrun.s16 d0, q0, #7 + vst1.8 
d0, [r0], r1 + bne 2b + vpop {d8-d15} + pop {r4-r9} + bx lr +endfunc + +.macro put_qpel_uw_pixels width, regs, regs2, regs3, regs4 +function ff_hevc_put_qpel_uw_pixels_w\width\()_neon_8, export=1 + ldr r12, [sp] // height +1: subs r12, #4 + vld1.32 {\regs} , [r2], r3 + vld1.32 {\regs2} , [r2], r3 + vld1.32 {\regs3} , [r2], r3 + vld1.32 {\regs4} , [r2], r3 + vst1.32 {\regs} , [r0], r1 + vst1.32 {\regs2} , [r0], r1 + vst1.32 {\regs3} , [r0], r1 + vst1.32 {\regs4} , [r0], r1 + bne 1b + bx lr +endfunc +.endm + +.macro put_qpel_uw_pixels_m width, regs, regs2, regs3, regs4 +function ff_hevc_put_qpel_uw_pixels_w\width\()_neon_8, export=1 + push {r4-r5} + ldr r12, [sp, #8] // height +1: subs r12, #2 + mov r4, r2 + vld1.32 {\regs} , [r2]! + vld1.32 {\regs2} , [r2] + add r2, r4, r3 + mov r4, r2 + vld1.32 {\regs3} , [r2]! + vld1.32 {\regs4} , [r2] + add r2, r4, r3 + mov r5, r0 + vst1.32 {\regs} , [r0]! + vst1.32 {\regs2} , [r0] + add r0, r5, r1 + mov r5, r0 + vst1.32 {\regs3} , [r0]! + vst1.32 {\regs4} , [r0] + add r0, r5, r1 + bne 1b + pop {r4-r5} + bx lr +endfunc +.endm + +put_qpel_uw_pixels 4, d0[0], d0[1], d1[0], d1[1] +put_qpel_uw_pixels 8, d0, d1, d2, d3 +put_qpel_uw_pixels_m 12, d0, d1[0], d2, d3[0] +put_qpel_uw_pixels 16, q0, q1, q2, q3 +put_qpel_uw_pixels 24, d0-d2, d3-d5, d16-d18, d19-d21 +put_qpel_uw_pixels 32, q0-q1, q2-q3, q8-q9, q10-q11 +put_qpel_uw_pixels_m 48, q0-q1, q2, q8-q9, q10 +put_qpel_uw_pixels_m 64, q0-q1, q2-q3, q8-q9, q10-q11 diff --git a/arm/neon.S b/arm/neon.S new file mode 100644 index 0000000..787bc4b --- /dev/null +++ b/arm/neon.S @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2008 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +.macro transpose_8x8 r0, r1, r2, r3, r4, r5, r6, r7 + vtrn.32 \r0, \r4 + vtrn.32 \r1, \r5 + vtrn.32 \r2, \r6 + vtrn.32 \r3, \r7 + vtrn.16 \r0, \r2 + vtrn.16 \r1, \r3 + vtrn.16 \r4, \r6 + vtrn.16 \r5, \r7 + vtrn.8 \r0, \r1 + vtrn.8 \r2, \r3 + vtrn.8 \r4, \r5 + vtrn.8 \r6, \r7 +.endm + +.macro transpose_4x4 r0, r1, r2, r3 + vtrn.16 \r0, \r2 + vtrn.16 \r1, \r3 + vtrn.8 \r0, \r1 + vtrn.8 \r2, \r3 +.endm + +.macro swap4 r0, r1, r2, r3, r4, r5, r6, r7 + vswp \r0, \r4 + vswp \r1, \r5 + vswp \r2, \r6 + vswp \r3, \r7 +.endm + +.macro transpose16_4x4 r0, r1, r2, r3, r4, r5, r6, r7 + vtrn.32 \r0, \r2 + vtrn.32 \r1, \r3 + vtrn.32 \r4, \r6 + vtrn.32 \r5, \r7 + vtrn.16 \r0, \r1 + vtrn.16 \r2, \r3 + vtrn.16 \r4, \r5 + vtrn.16 \r6, \r7 +.endm diff --git a/bitstream.cc b/bitstream.cc new file mode 100644 index 0000000..0298be9 --- /dev/null +++ b/bitstream.cc @@ -0,0 +1,176 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "bitstream.h"
+#include "de265.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+
+// Bit reader: nextbits is a 64-bit window with unread bits at the top, nextbits_cnt is how many of them are valid.
+void bitreader_init(bitreader* br, unsigned char* buffer, int len)  // start reading at buffer[0], len bytes available
+{
+  br->data = buffer;
+  br->bytes_remaining = len;
+
+  br->nextbits=0;
+  br->nextbits_cnt=0;
+
+  bitreader_refill(br);
+}
+
+void bitreader_refill(bitreader* br)  // append whole input bytes below the valid bits while room and input remain
+{
+  int shift = 64-br->nextbits_cnt;
+
+  while (shift >= 8 && br->bytes_remaining) {
+    uint64_t newval = *br->data++;
+    br->bytes_remaining--;
+
+    shift -= 8;
+    newval <<= shift;
+    br->nextbits |= newval;
+  }
+
+  br->nextbits_cnt = 64-shift;
+}
+
+int get_bits(bitreader* br, int n)  // return the next n bits (earliest bit most significant) and consume them
+{
+  if (br->nextbits_cnt < n) {
+    bitreader_refill(br);
+  }
+
+  uint64_t val = br->nextbits;
+  val >>= 64-n;  // n==0 would shift by 64 (undefined), so callers pass n>=1
+
+  br->nextbits <<= n;
+  br->nextbits_cnt -= n;
+
+  return val;
+}
+
+int get_bits_fast(bitreader* br, int n)  // get_bits() without the refill; the caller guarantees n buffered bits
+{
+  assert(br->nextbits_cnt >= n);
+
+  uint64_t val = br->nextbits;
+  val >>= 64-n;
+
+  br->nextbits <<= n;
+  br->nextbits_cnt -= n;
+
+  return val;
+}
+
+int peek_bits(bitreader* br, int n)  // like get_bits(), but the bits stay in the window
+{
+  if (br->nextbits_cnt < n) {
+    bitreader_refill(br);
+  }
+
+  uint64_t val = br->nextbits;
+  val >>= 64-n;
+
+  return val;
+}
+
+void skip_bits(bitreader* br, int n)  // consume n bits, refilling first if fewer are buffered
+{
+  if (br->nextbits_cnt < n) {
+    bitreader_refill(br);
+  }
+
+  br->nextbits <<= n;
+  br->nextbits_cnt -= n;
+}
+
+void skip_bits_fast(bitreader* br, int n)  // consume n bits without checking how many are buffered
+{
+  br->nextbits <<= n;
+  br->nextbits_cnt -= n;
+}
+
+void skip_to_byte_boundary(bitreader* br)  // drop the (nextbits_cnt & 7) bits left over from a partly read byte
+{
+  int nskip = (br->nextbits_cnt & 7);
+
+  br->nextbits <<= nskip;
+  br->nextbits_cnt -= nskip;
+}
+
+void prepare_for_CABAC(bitreader* br)  // byte-align, then hand the still-buffered whole bytes back to data
+{
+  skip_to_byte_boundary(br);
+
+  int rewind = br->nextbits_cnt/8;
+  br->data -= rewind;
+  br->bytes_remaining += rewind;
+  br->nextbits = 0;
+  br->nextbits_cnt = 0;
+}
+
+int get_uvlc(bitreader* br)  // unsigned Exp-Golomb value, or UVLC_ERROR if the zero prefix is too long
+{
+  int num_zeros=0;
+
+  while (get_bits(br,1)==0) {
+    num_zeros++;
+
+    if (num_zeros > MAX_UVLC_LEADING_ZEROS) { return UVLC_ERROR; }
+  }
+
+  int offset = 0;
+  if (num_zeros != 0) {
+    offset = get_bits(br, num_zeros);
+    int value = offset + (1<<num_zeros)-1;
+    assert(value>0);
+    return value;
+  } else {
+    return 0;
+  }
+}
+
+int get_svlc(bitreader* br)  // signed Exp-Golomb: odd codes map to positive values, even codes to negative
+{
+  int v = get_uvlc(br);
+  if (v==0) return v;
+  if (v==UVLC_ERROR) return UVLC_ERROR;
+
+  bool negative = ((v&1)==0);
+  return negative ? -v/2 : (v+1)/2;
+}
+
+bool check_rbsp_trailing_bits(bitreader* br)  // after the stop bit, true only if every remaining bit is zero
+{
+  int stop_bit = get_bits(br,1);
+  assert(stop_bit==1);
+
+  while (br->nextbits_cnt>0 || br->bytes_remaining>0) {
+    int filler = get_bits(br,1);
+    if (filler!=0) {
+      return false;
+    }
+  }
+
+  return true;
+}
diff --git a/cabac.cc b/cabac.cc
new file mode 100644
index 0000000..102bc57
--- /dev/null
+++ b/cabac.cc
@@ -0,0 +1,1033 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */ + +#include "cabac.h" +#include "util.h" + +#include +#include +#include +#include + +#define INITIAL_CABAC_BUFFER_CAPACITY 4096 + + +static const uint8_t LPS_table[64][4] = + { + { 128, 176, 208, 240}, + { 128, 167, 197, 227}, + { 128, 158, 187, 216}, + { 123, 150, 178, 205}, + { 116, 142, 169, 195}, + { 111, 135, 160, 185}, + { 105, 128, 152, 175}, + { 100, 122, 144, 166}, + { 95, 116, 137, 158}, + { 90, 110, 130, 150}, + { 85, 104, 123, 142}, + { 81, 99, 117, 135}, + { 77, 94, 111, 128}, + { 73, 89, 105, 122}, + { 69, 85, 100, 116}, + { 66, 80, 95, 110}, + { 62, 76, 90, 104}, + { 59, 72, 86, 99}, + { 56, 69, 81, 94}, + { 53, 65, 77, 89}, + { 51, 62, 73, 85}, + { 48, 59, 69, 80}, + { 46, 56, 66, 76}, + { 43, 53, 63, 72}, + { 41, 50, 59, 69}, + { 39, 48, 56, 65}, + { 37, 45, 54, 62}, + { 35, 43, 51, 59}, + { 33, 41, 48, 56}, + { 32, 39, 46, 53}, + { 30, 37, 43, 50}, + { 29, 35, 41, 48}, + { 27, 33, 39, 45}, + { 26, 31, 37, 43}, + { 24, 30, 35, 41}, + { 23, 28, 33, 39}, + { 22, 27, 32, 37}, + { 21, 26, 30, 35}, + { 20, 24, 29, 33}, + { 19, 23, 27, 31}, + { 18, 22, 26, 30}, + { 17, 21, 25, 28}, + { 16, 20, 23, 27}, + { 15, 19, 22, 25}, + { 14, 18, 21, 24}, + { 14, 17, 20, 23}, + { 13, 16, 19, 22}, + { 12, 15, 18, 21}, + { 12, 14, 17, 20}, + { 11, 14, 16, 19}, + { 11, 13, 15, 18}, + { 10, 12, 15, 17}, + { 10, 12, 14, 16}, + { 9, 11, 13, 15}, + { 9, 11, 12, 14}, + { 8, 10, 12, 14}, + { 8, 9, 11, 13}, + { 7, 9, 11, 12}, + { 7, 9, 10, 12}, + { 7, 8, 10, 11}, + { 6, 8, 9, 11}, + { 6, 7, 9, 10}, + { 6, 7, 8, 9}, + { 2, 2, 2, 2} + }; + +static const uint8_t renorm_table[32] = + { + 6, 5, 4, 4, + 3, 3, 3, 3, + 2, 2, 2, 2, + 2, 2, 2, 2, + 1, 1, 1, 1, + 1, 1, 1, 1, + 1, 1, 1, 1, + 1, 1, 1, 1 + }; + +static const uint8_t next_state_MPS[64] = + { + 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16, + 17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32, + 33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48, + 49,50,51,52,53,54,55,56,57,58,59,60,61,62,62,63 + }; + +static const uint8_t 
next_state_LPS[64] = + { + 0,0,1,2,2,4,4,5,6,7,8,9,9,11,11,12, + 13,13,15,15,16,16,18,18,19,19,21,21,22,22,23,24, + 24,25,26,26,27,27,28,29,29,30,30,30,31,32,32,33, + 33,33,34,34,35,35,35,36,36,36,37,37,37,38,38,63 + }; + + + + + +#ifdef DE265_LOG_TRACE +int logcnt=1; +#endif + +void init_CABAC_decoder(CABAC_decoder* decoder, uint8_t* bitstream, int length) +{ + assert(length >= 0); + + decoder->bitstream_start = bitstream; + decoder->bitstream_curr = bitstream; + decoder->bitstream_end = bitstream+length; +} + +void init_CABAC_decoder_2(CABAC_decoder* decoder) +{ + int length = decoder->bitstream_end - decoder->bitstream_curr; + + decoder->range = 510; + decoder->bits_needed = 8; + + decoder->value = 0; + + if (length>0) { decoder->value = (*decoder->bitstream_curr++) << 8; decoder->bits_needed-=8; } + if (length>1) { decoder->value |= (*decoder->bitstream_curr++); decoder->bits_needed-=8; } + + logtrace(LogCABAC,"[%3d] init_CABAC_decode_2 r:%x v:%x\n", logcnt, decoder->range, decoder->value); +} + + +int decode_CABAC_bit(CABAC_decoder* decoder, context_model* model) +{ + logtrace(LogCABAC,"[%3d] decodeBin r:%x v:%x state:%d\n",logcnt,decoder->range, decoder->value, model->state); + + int decoded_bit; + int LPS = LPS_table[model->state][ ( decoder->range >> 6 ) - 4 ]; + decoder->range -= LPS; + + uint32_t scaled_range = decoder->range << 7; + + logtrace(LogCABAC,"[%3d] sr:%x v:%x\n",logcnt,scaled_range, decoder->value); + + if (decoder->value < scaled_range) + { + logtrace(LogCABAC,"[%3d] MPS\n",logcnt); + + // MPS path + + decoded_bit = model->MPSbit; + model->state = next_state_MPS[model->state]; + + if (scaled_range < ( 256 << 7 ) ) + { + // scaled range, highest bit (15) not set + + decoder->range = scaled_range >> 6; // shift range by one bit + decoder->value <<= 1; // shift value by one bit + decoder->bits_needed++; + + if (decoder->bits_needed == 0) + { + decoder->bits_needed = -8; + if (decoder->bitstream_curr < decoder->bitstream_end) + { decoder->value 
|= *decoder->bitstream_curr++; } + } + } + } + else + { + logtrace(LogCABAC,"[%3d] LPS\n",logcnt); + //printf("%d %d\n", model->state, 0); + + // LPS path + + decoder->value = (decoder->value - scaled_range); + + int num_bits = renorm_table[ LPS >> 3 ]; + decoder->value <<= num_bits; + decoder->range = LPS << num_bits; /* this is always >= 0x100 except for state 63, + but state 63 is never used */ + + int num_bitsTab = renorm_table[ LPS >> 3 ]; + + assert(num_bits == num_bitsTab); + + decoded_bit = 1 - model->MPSbit; + + if (model->state==0) { model->MPSbit = 1-model->MPSbit; } + model->state = next_state_LPS[model->state]; + + decoder->bits_needed += num_bits; + + if (decoder->bits_needed >= 0) + { + logtrace(LogCABAC,"bits_needed: %d\n", decoder->bits_needed); + if (decoder->bitstream_curr < decoder->bitstream_end) + { decoder->value |= (*decoder->bitstream_curr++) << decoder->bits_needed; } + + decoder->bits_needed -= 8; + } + } + + logtrace(LogCABAC,"[%3d] -> bit %d r:%x v:%x\n", logcnt, decoded_bit, decoder->range, decoder->value); +#ifdef DE265_LOG_TRACE + logcnt++; +#endif + + return decoded_bit; +} + +int decode_CABAC_term_bit(CABAC_decoder* decoder) +{ + logtrace(LogCABAC,"CABAC term: range=%x\n", decoder->range); + + decoder->range -= 2; + uint32_t scaledRange = decoder->range << 7; + + if (decoder->value >= scaledRange) + { + return 1; + } + else + { + // there is a while loop in the standard, but it will always be executed only once + + if (scaledRange < (256<<7)) + { + decoder->range = scaledRange >> 6; + decoder->value *= 2; + + decoder->bits_needed++; + if (decoder->bits_needed==0) + { + decoder->bits_needed = -8; + + if (decoder->bitstream_curr < decoder->bitstream_end) { + decoder->value += (*decoder->bitstream_curr++); + } + } + } + + return 0; + } +} + + + +int decode_CABAC_bypass(CABAC_decoder* decoder) +{ + logtrace(LogCABAC,"[%3d] bypass r:%x v:%x\n",logcnt,decoder->range, decoder->value); + + decoder->value <<= 1; + decoder->bits_needed++; + 
+ if (decoder->bits_needed >= 0) + { + if (decoder->bitstream_end > decoder->bitstream_curr) { + decoder->bits_needed = -8; + decoder->value |= *decoder->bitstream_curr++; + } + } + + int bit; + uint32_t scaled_range = decoder->range << 7; + if (decoder->value >= scaled_range) + { + decoder->value -= scaled_range; + bit=1; + } + else + { + bit=0; + } + + logtrace(LogCABAC,"[%3d] -> bit %d r:%x v:%x\n", logcnt, bit, decoder->range, decoder->value); +#ifdef DE265_LOG_TRACE + logcnt++; +#endif + + return bit; +} + + +int decode_CABAC_TU_bypass(CABAC_decoder* decoder, int cMax) +{ + for (int i=0;irange, decoder->value, nBits); + + decoder->value <<= nBits; + decoder->bits_needed+=nBits; + + if (decoder->bits_needed >= 0) + { + if (decoder->bitstream_end > decoder->bitstream_curr) { + int input = *decoder->bitstream_curr++; + input <<= decoder->bits_needed; + + decoder->bits_needed -= 8; + decoder->value |= input; + } + } + + uint32_t scaled_range = decoder->range << 7; + int value = decoder->value / scaled_range; + if (unlikely(value>=(1<value -= value * scaled_range; + + logtrace(LogCABAC,"[%3d] -> value %d r:%x v:%x\n", logcnt+nBits-1, + value, decoder->range, decoder->value); + +#ifdef DE265_LOG_TRACE + logcnt+=nBits; +#endif + + return value; +} + + +int decode_CABAC_FL_bypass(CABAC_decoder* decoder, int nBits) +{ + int value=0; + + if (likely(nBits<=8)) { + if (nBits==0) { + return 0; + } + // we could use decode_CABAC_bypass() for a single bit, but this seems to be slower +#if 0 + else if (nBits==1) { + value = decode_CABAC_bypass(decoder); + } +#endif + else { + value = decode_CABAC_FL_bypass_parallel(decoder,nBits); + } + } + else { + value = decode_CABAC_FL_bypass_parallel(decoder,8); + nBits-=8; + + while (nBits--) { + value <<= 1; + value |= decode_CABAC_bypass(decoder); + } + } + logtrace(LogCABAC," -> FL: %d\n", value); + + return value; +} + +int decode_CABAC_TR_bypass(CABAC_decoder* decoder, int cRiceParam, int cTRMax) +{ + int prefix = 
decode_CABAC_TU_bypass(decoder, cTRMax>>cRiceParam); + if (prefix==4) { // TODO check: constant 4 only works for coefficient decoding + return cTRMax; + } + + int suffix = decode_CABAC_FL_bypass(decoder, cRiceParam); + + return (prefix << cRiceParam) | suffix; +} + + +#define MAX_PREFIX 32 + +int decode_CABAC_EGk_bypass(CABAC_decoder* decoder, int k) +{ + int base=0; + int n=k; + + for (;;) + { + int bit = decode_CABAC_bypass(decoder); + if (bit==0) + break; + else { + base += 1<=8) { + append_byte((vlc_buffer >> (vlc_buffer_len-8)) & 0xFF); + vlc_buffer_len -= 8; + } +} + +void CABAC_encoder::write_uvlc(int value) +{ + assert(value>=0); + + int nLeadingZeros=0; + int base=0; + int range=1; + + while (value>=base+range) { + base += range; + range <<= 1; + nLeadingZeros++; + } + + write_bits((1<0) write_uvlc(2*value-1); + else write_uvlc(-2*value); +} + +void CABAC_encoder_bitstream::flush_VLC() +{ + while (vlc_buffer_len>=8) { + append_byte((vlc_buffer >> (vlc_buffer_len-8)) & 0xFF); + vlc_buffer_len -= 8; + } + + if (vlc_buffer_len>0) { + append_byte(vlc_buffer << (8-vlc_buffer_len)); + vlc_buffer_len = 0; + } + + vlc_buffer = 0; +} + +void CABAC_encoder_bitstream::skip_bits(int nBits) +{ + while (nBits>=8) { + write_bits(0,8); + nBits-=8; + } + + if (nBits>0) { + write_bits(0,nBits); + } +} + + +int CABAC_encoder_bitstream::number_free_bits_in_byte() const +{ + if ((vlc_buffer_len % 8)==0) return 0; + return 8- (vlc_buffer_len % 8); +} + + +void CABAC_encoder_bitstream::check_size_and_resize(int nBytes) +{ + if (data_size+nBytes > data_capacity) { // 1 extra byte for stuffing + if (data_capacity==0) { + data_capacity = INITIAL_CABAC_BUFFER_CAPACITY; + } else { + data_capacity *= 2; + } + + data_mem = (uint8_t*)realloc(data_mem,data_capacity); + } +} + + +void CABAC_encoder_bitstream::append_byte(int byte) +{ + check_size_and_resize(2); + + // --- emulation prevention --- + + /* These byte sequences may never occur in the bitstream: + 0x000000 / 0x000001 / 
0x000002 + + Hence, we have to add a 0x03 before the third byte. + We also have to add a 0x03 for this sequence: 0x000003, because + the escape byte itself also has to be escaped. + */ + + // S0 --(0)--> S1 --(0)--> S2 --(0,1,2,3)--> add stuffing + + if (byte<=3) { + /**/ if (state< 2 && byte==0) { state++; } + else if (state==2 && byte<=3) { + data_mem[ data_size++ ] = 3; + + if (byte==0) state=1; + else state=0; + } + else { state=0; } + } + else { state=0; } + + + // write actual data byte + + data_mem[ data_size++ ] = byte; +} + + +void CABAC_encoder_bitstream::write_startcode() +{ + check_size_and_resize(3); + + data_mem[ data_size+0 ] = 0; + data_mem[ data_size+1 ] = 0; + data_mem[ data_size+2 ] = 1; + data_size+=3; +} + +void CABAC_encoder_bitstream::init_CABAC() +{ + range = 510; + low = 0; + + bits_left = 23; + buffered_byte = 0xFF; + num_buffered_bytes = 0; +} + +void CABAC_encoder_bitstream::flush_CABAC() +{ + if (low >> (32 - bits_left)) + { + append_byte(buffered_byte + 1); + while (num_buffered_bytes > 1) + { + append_byte(0x00); + num_buffered_bytes--; + } + + low -= 1 << (32 - bits_left); + } + else + { + if (num_buffered_bytes > 0) + { + append_byte(buffered_byte); + } + + while (num_buffered_bytes > 1) + { + append_byte(0xff); + num_buffered_bytes--; + } + } + + // printf("low: %08x nbits left:%d filled:%d\n",low,bits_left,32-bits_left); + + write_bits(low >> 8, 24-bits_left); +} + + +void CABAC_encoder_bitstream::write_out() +{ + //logtrace(LogCABAC,"low = %08x (bits_left=%d)\n",low,bits_left); + int leadByte = low >> (24 - bits_left); + bits_left += 8; + low &= 0xffffffffu >> bits_left; + + //logtrace(LogCABAC,"write byte %02x\n",leadByte); + //logtrace(LogCABAC,"-> low = %08x\n",low); + + if (leadByte == 0xff) + { + num_buffered_bytes++; + } + else + { + if (num_buffered_bytes > 0) + { + int carry = leadByte >> 8; + int byte = buffered_byte + carry; + buffered_byte = leadByte & 0xff; + append_byte(byte); + + byte = ( 0xff + carry ) & 0xff; + 
while ( num_buffered_bytes > 1 ) + { + append_byte(byte); + num_buffered_bytes--; + } + } + else + { + num_buffered_bytes = 1; + buffered_byte = leadByte; + } + } +} + +void CABAC_encoder_bitstream::testAndWriteOut() +{ + // logtrace(LogCABAC,"bits_left = %d\n",bits_left); + + if (bits_left < 12) + { + write_out(); + } +} + + +#ifdef DE265_LOG_TRACE +int encBinCnt=1; +#endif + +void CABAC_encoder_bitstream::write_CABAC_bit(int modelIdx, int bin) +{ + context_model* model = &(*mCtxModels)[modelIdx]; + //m_uiBinsCoded += m_binCountIncrement; + //rcCtxModel.setBinsCoded( 1 ); + + logtrace(LogCABAC,"[%d] range=%x low=%x state=%d, bin=%d\n", + encBinCnt, range,low, model->state,bin); + + /* + printf("[%d] range=%x low=%x state=%d, bin=%d\n", + encBinCnt, range,low, model->state,bin); + + printf("%d %d X\n",model->state,bin != model->MPSbit); + */ + +#ifdef DE265_LOG_TRACE + encBinCnt++; +#endif + + uint32_t LPS = LPS_table[model->state][ ( range >> 6 ) - 4 ]; + range -= LPS; + + if (bin != model->MPSbit) + { + //logtrace(LogCABAC,"LPS\n"); + + int num_bits = renorm_table[ LPS >> 3 ]; + low = (low + range) << num_bits; + range = LPS << num_bits; + + if (model->state==0) { model->MPSbit = 1-model->MPSbit; } + + model->state = next_state_LPS[model->state]; + + bits_left -= num_bits; + } + else + { + //logtrace(LogCABAC,"MPS\n"); + + model->state = next_state_MPS[model->state]; + + + // renorm + + if (range >= 256) { return; } + + low <<= 1; + range <<= 1; + bits_left--; + } + + testAndWriteOut(); +} + +void CABAC_encoder_bitstream::write_CABAC_bypass(int bin) +{ + logtrace(LogCABAC,"[%d] bypass = %d, range=%x\n",encBinCnt,bin,range); + /* + printf("[%d] bypass = %d, range=%x\n",encBinCnt,bin,range); + printf("%d %d X\n",64, -1); + */ + +#ifdef DE265_LOG_TRACE + encBinCnt++; +#endif + + // BinsCoded += m_binCountIncrement; + low <<= 1; + + if (bin) + { + low += range; + } + bits_left--; + + testAndWriteOut(); +} + +void CABAC_encoder::write_CABAC_TU_bypass(int value, int 
cMax) +{ + for (int i=0;i0) { + n--; + write_CABAC_bypass(value & (1<= 256) + { + return; + } + else + { + low <<= 1; + range <<= 1; + bits_left--; + } + + testAndWriteOut(); +} + + + + +static const uint32_t entropy_table[128] = { + // -------------------- 200 -------------------- + /* state= 0 */ 0x07d13 /* 0.977164 */, 0x08255 /* 1.018237 */, + /* state= 1 */ 0x07738 /* 0.931417 */, 0x086ef /* 1.054179 */, + /* state= 2 */ 0x0702b /* 0.876323 */, 0x0935a /* 1.151195 */, + /* state= 3 */ 0x069e6 /* 0.827333 */, 0x09c7f /* 1.222650 */, + /* state= 4 */ 0x062e8 /* 0.772716 */, 0x0a2c7 /* 1.271708 */, + /* state= 5 */ 0x05c18 /* 0.719488 */, 0x0ae25 /* 1.360532 */, + /* state= 6 */ 0x05632 /* 0.673414 */, 0x0b724 /* 1.430793 */, + /* state= 7 */ 0x05144 /* 0.634904 */, 0x0c05d /* 1.502850 */, + /* state= 8 */ 0x04bdf /* 0.592754 */, 0x0ccf2 /* 1.601145 */, + /* state= 9 */ 0x0478d /* 0.559012 */, 0x0d57b /* 1.667843 */, + /* state=10 */ 0x042ad /* 0.520924 */, 0x0de81 /* 1.738336 */, + /* state=11 */ 0x03f4d /* 0.494564 */, 0x0e4b8 /* 1.786871 */, + /* state=12 */ 0x03a9d /* 0.457945 */, 0x0f471 /* 1.909721 */, + /* state=13 */ 0x037d5 /* 0.436201 */, 0x0fc56 /* 1.971385 */, + /* state=14 */ 0x034c2 /* 0.412177 */, 0x10236 /* 2.017284 */, + /* state=15 */ 0x031a6 /* 0.387895 */, 0x10d5c /* 2.104394 */, + /* state=16 */ 0x02e62 /* 0.362383 */, 0x11b34 /* 2.212552 */, + /* state=17 */ 0x02c20 /* 0.344752 */, 0x120b4 /* 2.255512 */, + /* state=18 */ 0x029b8 /* 0.325943 */, 0x1294d /* 2.322672 */, + /* state=19 */ 0x02791 /* 0.309143 */, 0x135e1 /* 2.420959 */, + /* state=20 */ 0x02562 /* 0.292057 */, 0x13e37 /* 2.486077 */, + /* state=21 */ 0x0230d /* 0.273846 */, 0x144fd /* 2.539000 */, + /* state=22 */ 0x02193 /* 0.262308 */, 0x150c9 /* 2.631150 */, + /* state=23 */ 0x01f5d /* 0.245026 */, 0x15ca0 /* 2.723641 */, + /* state=24 */ 0x01de7 /* 0.233617 */, 0x162f9 /* 2.773246 */, + /* state=25 */ 0x01c2f /* 0.220208 */, 0x16d99 /* 2.856259 */, + /* state=26 */ 0x01a8e 
/* 0.207459 */, 0x17a93 /* 2.957634 */, + /* state=27 */ 0x0195a /* 0.198065 */, 0x18051 /* 3.002477 */, + /* state=28 */ 0x01809 /* 0.187778 */, 0x18764 /* 3.057759 */, + /* state=29 */ 0x0164a /* 0.174144 */, 0x19460 /* 3.159206 */, + /* state=30 */ 0x01539 /* 0.165824 */, 0x19f20 /* 3.243181 */, + /* state=31 */ 0x01452 /* 0.158756 */, 0x1a465 /* 3.284334 */, + /* state=32 */ 0x0133b /* 0.150261 */, 0x1b422 /* 3.407303 */, + /* state=33 */ 0x0120c /* 0.140995 */, 0x1bce5 /* 3.475767 */, + /* state=34 */ 0x01110 /* 0.133315 */, 0x1c394 /* 3.527962 */, + /* state=35 */ 0x0104d /* 0.127371 */, 0x1d059 /* 3.627736 */, + /* state=36 */ 0x00f8b /* 0.121451 */, 0x1d74b /* 3.681983 */, + /* state=37 */ 0x00ef4 /* 0.116829 */, 0x1dfd0 /* 3.748540 */, + /* state=38 */ 0x00e10 /* 0.109864 */, 0x1e6d3 /* 3.803335 */, + /* state=39 */ 0x00d3f /* 0.103507 */, 0x1f925 /* 3.946462 */, + /* state=40 */ 0x00cc4 /* 0.099758 */, 0x1fda7 /* 3.981667 */, + /* state=41 */ 0x00c42 /* 0.095792 */, 0x203f8 /* 4.031012 */, + /* state=42 */ 0x00b78 /* 0.089610 */, 0x20f7d /* 4.121014 */, + /* state=43 */ 0x00afc /* 0.085830 */, 0x21dd6 /* 4.233102 */, + /* state=44 */ 0x00a5e /* 0.081009 */, 0x22419 /* 4.282016 */, + /* state=45 */ 0x00a1b /* 0.078950 */, 0x22a5e /* 4.331015 */, + /* state=46 */ 0x00989 /* 0.074514 */, 0x23756 /* 4.432323 */, + /* state=47 */ 0x0091b /* 0.071166 */, 0x24225 /* 4.516775 */, + /* state=48 */ 0x008cf /* 0.068837 */, 0x2471a /* 4.555487 */, + /* state=49 */ 0x00859 /* 0.065234 */, 0x25313 /* 4.649048 */, + /* state=50 */ 0x00814 /* 0.063140 */, 0x25d67 /* 4.729721 */, + /* state=51 */ 0x007b6 /* 0.060272 */, 0x2651f /* 4.790028 */, + /* state=52 */ 0x0076e /* 0.058057 */, 0x2687c /* 4.816294 */, + /* state=53 */ 0x00707 /* 0.054924 */, 0x27da7 /* 4.981661 */, + /* state=54 */ 0x006d5 /* 0.053378 */, 0x28172 /* 5.011294 */, + /* state=55 */ 0x00659 /* 0.049617 */, 0x28948 /* 5.072512 */, + /* state=56 */ 0x00617 /* 0.047598 */, 0x297c5 /* 5.185722 */, + /* 
state=57 */ 0x005dd /* 0.045814 */, 0x2a2df /* 5.272434 */, + /* state=58 */ 0x005c1 /* 0.044965 */, 0x2a581 /* 5.293019 */, + /* state=59 */ 0x00574 /* 0.042619 */, 0x2ad59 /* 5.354304 */, + /* state=60 */ 0x0053b /* 0.040882 */, 0x2bba5 /* 5.465973 */, + /* state=61 */ 0x0050c /* 0.039448 */, 0x2c596 /* 5.543651 */, + /* state=62 */ 0x004e9 /* 0.038377 */, 0x2cd88 /* 5.605741 */, + 0x00400 , 0x2d000 /* dummy, should never be used */ +}; + + +static const uint32_t entropy_table_orig[128] = { + 0x07b23, 0x085f9, 0x074a0, 0x08cbc, 0x06ee4, 0x09354, 0x067f4, 0x09c1b, + 0x060b0, 0x0a62a, 0x05a9c, 0x0af5b, 0x0548d, 0x0b955, 0x04f56, 0x0c2a9, + 0x04a87, 0x0cbf7, 0x045d6, 0x0d5c3, 0x04144, 0x0e01b, 0x03d88, 0x0e937, + 0x039e0, 0x0f2cd, 0x03663, 0x0fc9e, 0x03347, 0x10600, 0x03050, 0x10f95, + 0x02d4d, 0x11a02, 0x02ad3, 0x12333, 0x0286e, 0x12cad, 0x02604, 0x136df, + 0x02425, 0x13f48, 0x021f4, 0x149c4, 0x0203e, 0x1527b, 0x01e4d, 0x15d00, + 0x01c99, 0x166de, 0x01b18, 0x17017, 0x019a5, 0x17988, 0x01841, 0x18327, + 0x016df, 0x18d50, 0x015d9, 0x19547, 0x0147c, 0x1a083, 0x0138e, 0x1a8a3, + 0x01251, 0x1b418, 0x01166, 0x1bd27, 0x01068, 0x1c77b, 0x00f7f, 0x1d18e, + 0x00eda, 0x1d91a, 0x00e19, 0x1e254, 0x00d4f, 0x1ec9a, 0x00c90, 0x1f6e0, + 0x00c01, 0x1fef8, 0x00b5f, 0x208b1, 0x00ab6, 0x21362, 0x00a15, 0x21e46, + 0x00988, 0x2285d, 0x00934, 0x22ea8, 0x008a8, 0x239b2, 0x0081d, 0x24577, + 0x007c9, 0x24ce6, 0x00763, 0x25663, 0x00710, 0x25e8f, 0x006a0, 0x26a26, + 0x00672, 0x26f23, 0x005e8, 0x27ef8, 0x005ba, 0x284b5, 0x0055e, 0x29057, + 0x0050c, 0x29bab, 0x004c1, 0x2a674, 0x004a7, 0x2aa5e, 0x0046f, 0x2b32f, + 0x0041f, 0x2c0ad, 0x003e7, 0x2ca8d, 0x003ba, 0x2d323, 0x0010c, 0x3bfbb +}; + + +const uint32_t entropy_table_theory[128] = + { + 0x08000, 0x08000, 0x076da, 0x089a0, 0x06e92, 0x09340, 0x0670a, 0x09cdf, 0x06029, 0x0a67f, 0x059dd, 0x0b01f, 0x05413, 0x0b9bf, 0x04ebf, 0x0c35f, + 0x049d3, 0x0ccff, 0x04546, 0x0d69e, 0x0410d, 0x0e03e, 0x03d22, 0x0e9de, 0x0397d, 0x0f37e, 0x03619, 0x0fd1e, 
0x032ee, 0x106be, 0x02ffa, 0x1105d, + 0x02d37, 0x119fd, 0x02aa2, 0x1239d, 0x02836, 0x12d3d, 0x025f2, 0x136dd, 0x023d1, 0x1407c, 0x021d2, 0x14a1c, 0x01ff2, 0x153bc, 0x01e2f, 0x15d5c, + 0x01c87, 0x166fc, 0x01af7, 0x1709b, 0x0197f, 0x17a3b, 0x0181d, 0x183db, 0x016d0, 0x18d7b, 0x01595, 0x1971b, 0x0146c, 0x1a0bb, 0x01354, 0x1aa5a, + 0x0124c, 0x1b3fa, 0x01153, 0x1bd9a, 0x01067, 0x1c73a, 0x00f89, 0x1d0da, 0x00eb7, 0x1da79, 0x00df0, 0x1e419, 0x00d34, 0x1edb9, 0x00c82, 0x1f759, + 0x00bda, 0x200f9, 0x00b3c, 0x20a99, 0x00aa5, 0x21438, 0x00a17, 0x21dd8, 0x00990, 0x22778, 0x00911, 0x23118, 0x00898, 0x23ab8, 0x00826, 0x24458, + 0x007ba, 0x24df7, 0x00753, 0x25797, 0x006f2, 0x26137, 0x00696, 0x26ad7, 0x0063f, 0x27477, 0x005ed, 0x27e17, 0x0059f, 0x287b6, 0x00554, 0x29156, + 0x0050e, 0x29af6, 0x004cc, 0x2a497, 0x0048d, 0x2ae35, 0x00451, 0x2b7d6, 0x00418, 0x2c176, 0x003e2, 0x2cb15, 0x003af, 0x2d4b5, 0x0037f, 0x2de55 + }; + + +void CABAC_encoder_estim::write_CABAC_bit(int modelIdx, int bit) +{ + context_model* model = &(*mCtxModels)[modelIdx]; + //printf("[%d] state=%d, bin=%d\n", encBinCnt, model->state,bit); + //encBinCnt++; + + int idx = model->state<<1; + + if (bit==model->MPSbit) { + model->state = next_state_MPS[model->state]; + } + else { + idx++; + if (model->state==0) { model->MPSbit = 1-model->MPSbit; } + model->state = next_state_LPS[model->state]; + } + + mFracBits += entropy_table[idx]; + + //printf("-> %08lx %f\n",entropy_table[idx], entropy_table[idx] / float(1<<15)); +} + + +float CABAC_encoder::RDBits_for_CABAC_bin(int modelIdx, int bit) +{ + context_model* model = &(*mCtxModels)[modelIdx]; + int idx = model->state<<1; + + if (bit!=model->MPSbit) { + idx++; + } + + return entropy_table[idx] / float(1<<15); +} + + +void CABAC_encoder::write_CABAC_EGk(int val, int k) +{ + while (val >= ( 1 << k ) ) { + write_CABAC_bypass(1); + val = val - ( 1 << k ); + k++; + } + + write_CABAC_bypass(0); + + while (k) { + k--; + write_CABAC_bypass((val >> k) & 1); + } +} + + + +void 
CABAC_encoder_estim_constant::write_CABAC_bit(int modelIdx, int bit) +{ + context_model* model = &(*mCtxModels)[modelIdx]; + int idx = model->state<<1; + + if (bit!=model->MPSbit) { + idx++; + } + + mFracBits += entropy_table[idx]; +} + + + +#if 0 +void printtab(int idx,int s) +{ + printf("%d %f %f %f\n", s, + double(entropy_table[idx])/0x8000, + double(entropy_table_orig[idx])/0x8000, + double(entropy_table_f265[idx])/0x8000); +} + + +void plot_tables() +{ + for (int i=-62;i<=0;i++) { + int idx = -i *2; + int s = i; + printtab(idx,s); + } + + for (int i=0;i<=62;i++) { + int idx = 2*i +1; + int s = i; + printtab(idx,s); + } +} +#endif diff --git a/configparam.cc b/configparam.cc new file mode 100644 index 0000000..d944141 --- /dev/null +++ b/configparam.cc @@ -0,0 +1,491 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * Authors: struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "configparam.h" + +#include +#include +#include +#include +#include +#include +#include + +#ifndef RTTI_ENABLED +#error "Need to compile with RTTI enabled." 
+#endif + +static void remove_option(int* argc,char** argv,int idx, int n=1) +{ + for (int i=idx+n;i<*argc;i++) { + argv[i-n] = argv[i]; + } + + *argc-=n; +} + + +bool option_string::processCmdLineArguments(char** argv, int* argc, int idx) +{ + if (argv==NULL) { return false; } + if (idx >= *argc) { return false; } + + value = argv[idx]; + value_set = true; + + remove_option(argc,argv,idx,1); + + return true; +} + + +void option_int::set_range(int mini,int maxi) +{ + have_low_limit =true; + have_high_limit=true; + low_limit =mini; + high_limit=maxi; +} + +std::string option_int::getTypeDescr() const +{ + std::stringstream sstr; + sstr << "(int)"; + + if (have_low_limit || have_high_limit) { sstr << " "; } + if (have_low_limit) { sstr << low_limit << " <= "; } + if (have_low_limit || have_high_limit) { sstr << "x"; } + if (have_high_limit) { sstr << " <= " << high_limit; } + + if (!valid_values_set.empty()) { + sstr << " {"; + bool first=true; + FOR_LOOP(int, v, valid_values_set) { + if (!first) sstr << ","; else first=false; + sstr << v; + } + sstr << "}"; + } + + return sstr.str(); +} + +bool option_int::processCmdLineArguments(char** argv, int* argc, int idx) +{ + if (argv==NULL) { return false; } + if (idx >= *argc) { return false; } + + int v = atoi(argv[idx]); + if (!is_valid(v)) { return false; } + + value = v; + value_set = true; + + remove_option(argc,argv,idx,1); + + return true; +} + +bool option_int::is_valid(int v) const +{ + if (have_low_limit && vhigh_limit) { return false; } + + if (!valid_values_set.empty()) { + auto iter = std::find(valid_values_set.begin(), valid_values_set.end(), v); + if (iter==valid_values_set.end()) { return false; } + } + + return true; +} + +std::string option_int::get_default_string() const +{ + std::stringstream sstr; + sstr << default_value; + return sstr.str(); +} + + +std::string choice_option_base::getTypeDescr() const +{ + std::vector choices = get_choice_names(); + + std::stringstream sstr; + sstr << "{"; + + bool 
first=true; +#ifdef FOR_LOOP_AUTO_SUPPORT + FOR_LOOP(auto, c, choices) { +#else + FOR_LOOP(std::string, c, choices) { +#endif + if (first) { first=false; } + else { sstr << ","; } + + sstr << c; + } + + sstr << "}"; + return sstr.str(); +} + + +bool choice_option_base::processCmdLineArguments(char** argv, int* argc, int idx) +{ + if (argv==NULL) { return false; } + if (idx >= *argc) { return false; } + + std::string value = argv[idx]; + + std::cout << "set " << value << "\n"; + bool success = set_value(value); + std::cout << "success " << success << "\n"; + + remove_option(argc,argv,idx,1); + + return success; +} + + +static char* fill_strings_into_memory(const std::vector& strings_list) +{ + // calculate memory requirement + + int totalStringLengths = 0; +#ifdef FOR_LOOP_AUTO_SUPPORT + FOR_LOOP(auto, str, strings_list) { +#else + FOR_LOOP(std::string, str, strings_list) { +#endif + totalStringLengths += str.length() +1; // +1 for null termination + } + + int numStrings = strings_list.size(); + + int pointersSize = (numStrings+1) * sizeof(const char*); + + char* memory = new char[pointersSize + totalStringLengths]; + + + // copy strings to memory area + + char* stringPtr = memory + (numStrings+1) * sizeof(const char*); + const char** tablePtr = (const char**)memory; + +#ifdef FOR_LOOP_AUTO_SUPPORT + FOR_LOOP(auto, str, strings_list) { +#else + FOR_LOOP(std::string, str, strings_list) { +#endif + *tablePtr++ = stringPtr; + + strcpy(stringPtr, str.c_str()); + stringPtr += str.length()+1; + } + + *tablePtr = NULL; + + return memory; +} + + +const char** choice_option_base::get_choices_string_table() const +{ + if (choice_string_table==NULL) { + choice_string_table = fill_strings_into_memory(get_choice_names()); + } + + return (const char**)choice_string_table; +} + + + +bool config_parameters::parse_command_line_params(int* argc, char** argv, int* first_idx_ptr, + bool ignore_unknown_options) +{ + int first_idx=1; + if (first_idx_ptr) { first_idx = *first_idx_ptr; } + 
+ for (int i=first_idx;i < *argc;i++) { + + if (argv[i][0]=='-') { + // option + + if (argv[i][1]=='-') { + // long option + + bool option_found=false; + + for (int o=0;ohasLongOption() && strcmp(mOptions[o]->getLongOption().c_str(), + argv[i]+2)==0) { + option_found=true; + + printf("FOUND %s\n",argv[i]); + + bool success = mOptions[o]->processCmdLineArguments(argv,argc, i+1); + if (!success) { + if (first_idx_ptr) { *first_idx_ptr = i; } + return false; + } + + remove_option(argc,argv,i); + i--; + + break; + } + } + + if (option_found == false && !ignore_unknown_options) { + return false; + } + } + else { + // short option + + bool is_single_option = (argv[i][1] != 0 && argv[i][2]==0); + bool do_remove_option = true; + + for (int n=1; argv[i][n]; n++) { + char option = argv[i][n]; + + bool option_found=false; + + for (int o=0;ogetShortOption() == option) { + option_found=true; + + bool success; + if (is_single_option) { + success = mOptions[o]->processCmdLineArguments(argv,argc, i+1); + } + else { + success = mOptions[o]->processCmdLineArguments(NULL,NULL, 0); + } + + if (!success) { + if (first_idx_ptr) { *first_idx_ptr = i; } + return false; + } + + break; + } + } + + if (!option_found) { + if (!ignore_unknown_options) { + fprintf(stderr, "unknown option -%c\n",option); + return false; + } + else { + do_remove_option=false; + } + } + + } // all short options + + if (do_remove_option) { + remove_option(argc,argv,i); + i--; + } + } // is short option + } // is option + } // all command line arguments + + return true; +} + + +void config_parameters::print_params() const +{ + for (int i=0;ihasShortOption()) { + sstr << '-' << o->getShortOption(); + } else { + sstr << " "; + } + + if (o->hasShortOption() && o->hasLongOption()) { + sstr << ", "; + } else { + sstr << " "; + } + + if (o->hasLongOption()) { + sstr << "--" << std::setw(12) << std::left << o->getLongOption(); + } else { + sstr << " "; + } + + sstr << " "; + sstr << o->getTypeDescr(); + + if 
(o->has_default()) { + sstr << ", default=" << o->get_default_string(); + } + + if (o->has_description()) { + sstr << " : " << o->get_description(); + } + + sstr << "\n"; + + std::cerr << sstr.str(); + } +} + + +void config_parameters::add_option(option_base* o) +{ + mOptions.push_back(o); + delete[] param_string_table; // delete old table, since we got a new parameter + param_string_table = NULL; +} + + +std::vector config_parameters::get_parameter_IDs() const +{ + std::vector ids; + +#ifdef FOR_LOOP_AUTO_SUPPORT + FOR_LOOP(auto, option, mOptions) { +#else + FOR_LOOP(option_base*, option, mOptions) { +#endif + ids.push_back(option->get_name()); + } + + return ids; +} + + +enum en265_parameter_type config_parameters::get_parameter_type(const char* param) const +{ + option_base* option = find_option(param); + assert(option); + + if (dynamic_cast (option)) { return en265_parameter_int; } + if (dynamic_cast (option)) { return en265_parameter_bool; } + if (dynamic_cast(option)) { return en265_parameter_string; } + if (dynamic_cast(option)) { return en265_parameter_choice; } + + assert(false); + return en265_parameter_bool; +} + + +std::vector config_parameters::get_parameter_choices(const char* param) const +{ + option_base* option = find_option(param); + assert(option); + + choice_option_base* o = dynamic_cast(option); + assert(o); + + return o->get_choice_names(); +} + + +option_base* config_parameters::find_option(const char* param) const +{ +#ifdef FOR_LOOP_AUTO_SUPPORT + FOR_LOOP(auto, o, mOptions) { +#else + FOR_LOOP(option_base*, o, mOptions) { +#endif + if (strcmp(o->get_name().c_str(), param)==0) { return o; } + } + + return NULL; +} + + +bool config_parameters::set_bool(const char* param, bool value) +{ + option_base* option = find_option(param); + assert(option); + + option_bool* o = dynamic_cast(option); + assert(o); + + return o->set(value); +} + +bool config_parameters::set_int(const char* param, int value) +{ + option_base* option = find_option(param); + 
assert(option); + + option_int* o = dynamic_cast(option); + assert(o); + + return o->set(value); +} + +bool config_parameters::set_string(const char* param, const char* value) +{ + option_base* option = find_option(param); + assert(option); + + option_string* o = dynamic_cast(option); + assert(o); + + return o->set(value); +} + +bool config_parameters::set_choice(const char* param, const char* value) +{ + option_base* option = find_option(param); + assert(option); + + choice_option_base* o = dynamic_cast(option); + assert(o); + + return o->set(value); +} + + + +const char** config_parameters::get_parameter_choices_table(const char* param) const +{ + option_base* option = find_option(param); + assert(option); + + choice_option_base* o = dynamic_cast(option); + assert(o); + + return o->get_choices_string_table(); +} + +const char** config_parameters::get_parameter_string_table() const +{ + if (param_string_table==NULL) { + param_string_table = fill_strings_into_memory(get_parameter_IDs()); + } + + return (const char**)param_string_table; +} diff --git a/contextmodel.cc b/contextmodel.cc new file mode 100644 index 0000000..ec43228 --- /dev/null +++ b/contextmodel.cc @@ -0,0 +1,347 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#include "slice.h" +#include +#include +#include + +bool D = false; + +context_model_table::context_model_table() + : model(NULL), refcnt(NULL) +{ +} + + +context_model_table::context_model_table(const context_model_table& src) +{ + if (D) printf("%p c'tor = %p\n",this,&src); + + if (src.refcnt) { + (*(src.refcnt))++; + } + + refcnt = src.refcnt; + model = src.model; +} + + +context_model_table::~context_model_table() +{ + if (D) printf("%p destructor\n",this); + + if (refcnt) { + (*refcnt)--; + if (*refcnt==0) { + if (D) printf("mfree %p\n",model); + delete[] model; + delete refcnt; + } + } +} + + +void context_model_table::init(int initType, int QPY) +{ + if (D) printf("%p init\n",this); + + decouple_or_alloc_with_empty_data(); + + initialize_CABAC_models(model, initType, QPY); +} + + +void context_model_table::release() +{ + if (D) printf("%p release %p\n",this,refcnt); + + if (!refcnt) { return; } + + // if (*refcnt == 1) { return; } <- keep memory for later, but does not work when we believe that we freed the memory and nulled all references + + (*refcnt)--; + if (*refcnt==0) { + delete[] model; + delete refcnt; + } + + model = nullptr; + refcnt= nullptr; +} + + +void context_model_table::decouple() +{ + if (D) printf("%p decouple (%p)\n",this,refcnt); + + assert(refcnt); // not necessarily so, but we never use it on an unitialized object + + if (*refcnt > 1) { + (*refcnt)--; + + context_model* oldModel = model; + + model = new context_model[CONTEXT_MODEL_TABLE_LENGTH]; + refcnt= new int; + *refcnt=1; + + memcpy(model,oldModel,sizeof(context_model)*CONTEXT_MODEL_TABLE_LENGTH); + } +} + + +context_model_table context_model_table::transfer() +{ + context_model_table newtable; + newtable.model = model; + newtable.refcnt= refcnt; + + model =nullptr; + refcnt=nullptr; + + return newtable; +} + + +context_model_table& context_model_table::operator=(const context_model_table& src) +{ + if (D) printf("%p assign = %p\n",this,&src); + + // assert(src.refcnt); // 
not necessarily so, but we never use it on an unitialized object + + if (!src.refcnt) { + release(); + return *this; + } + + (*(src.refcnt))++; + + release(); + + model = src.model; + refcnt= src.refcnt; + + return *this; +} + + +bool context_model_table::operator==(const context_model_table& b) const +{ + if (b.model == model) return true; + if (b.model == nullptr || model == nullptr) return false; + + for (int i=0;i1); + (*refcnt)--; + } + + if (D) printf("%p (alloc)\n",this); + + model = new context_model[CONTEXT_MODEL_TABLE_LENGTH]; + refcnt= new int; + *refcnt=1; +} + + + + + + +static void set_initValue(int SliceQPY, + context_model* model, int initValue, int nContexts) +{ + int slopeIdx = initValue >> 4; + int intersecIdx = initValue & 0xF; + int m = slopeIdx*5 - 45; + int n = (intersecIdx<<3) - 16; + int preCtxState = Clip3(1,126, ((m*Clip3(0,51, SliceQPY))>>4)+n); + + // logtrace(LogSlice,"QP=%d slopeIdx=%d intersecIdx=%d m=%d n=%d\n",SliceQPY,slopeIdx,intersecIdx,m,n); + + for (int i=0;i 0) { + init_context(QPY, cm+CONTEXT_MODEL_CU_SKIP_FLAG, initValue_cu_skip_flag[initType-1], 3); + init_context(QPY, cm+CONTEXT_MODEL_PRED_MODE_FLAG, &initValue_pred_mode_flag[initType-1], 1); + init_context(QPY, cm+CONTEXT_MODEL_MERGE_FLAG, &initValue_merge_flag[initType-1],1); + init_context(QPY, cm+CONTEXT_MODEL_MERGE_IDX, &initValue_merge_idx[initType-1], 1); + init_context(QPY, cm+CONTEXT_MODEL_INTER_PRED_IDC, initValue_inter_pred_idc, 5); + init_context(QPY, cm+CONTEXT_MODEL_REF_IDX_LX, initValue_ref_idx_lX, 2); + init_context(QPY, cm+CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG, &initValue_abs_mvd_greater01_flag[initType == 1 ? 
0 : 2], 2); + init_context(QPY, cm+CONTEXT_MODEL_MVP_LX_FLAG, initValue_mvp_lx_flag, 1); + init_context(QPY, cm+CONTEXT_MODEL_RQT_ROOT_CBF, initValue_rqt_root_cbf, 1); + + init_context_const(QPY, cm+CONTEXT_MODEL_RDPCM_FLAG, 139, 2); + init_context_const(QPY, cm+CONTEXT_MODEL_RDPCM_DIR, 139, 2); + } + + init_context(QPY, cm+CONTEXT_MODEL_SPLIT_CU_FLAG, initValue_split_cu_flag[initType], 3); + init_context(QPY, cm+CONTEXT_MODEL_PART_MODE, &initValue_part_mode[(initType!=2 ? initType : 5)], 4); + init_context(QPY, cm+CONTEXT_MODEL_PREV_INTRA_LUMA_PRED_FLAG, &initValue_prev_intra_luma_pred_flag[initType], 1); + init_context(QPY, cm+CONTEXT_MODEL_INTRA_CHROMA_PRED_MODE, &initValue_intra_chroma_pred_mode[initType], 1); + init_context(QPY, cm+CONTEXT_MODEL_CBF_LUMA, &initValue_cbf_luma[initType == 0 ? 0 : 2], 2); + init_context(QPY, cm+CONTEXT_MODEL_CBF_CHROMA, &initValue_cbf_chroma[initType * 4], 4); + init_context(QPY, cm+CONTEXT_MODEL_SPLIT_TRANSFORM_FLAG, &initValue_split_transform_flag[initType * 3], 3); + init_context(QPY, cm+CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_X_PREFIX, &initValue_last_significant_coefficient_prefix[initType * 18], 18); + init_context(QPY, cm+CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_Y_PREFIX, &initValue_last_significant_coefficient_prefix[initType * 18], 18); + init_context(QPY, cm+CONTEXT_MODEL_CODED_SUB_BLOCK_FLAG, &initValue_coded_sub_block_flag[initType * 4], 4); + init_context(QPY, cm+CONTEXT_MODEL_SIGNIFICANT_COEFF_FLAG, initValue_significant_coeff_flag[initType], 42); + init_context(QPY, cm+CONTEXT_MODEL_SIGNIFICANT_COEFF_FLAG+42, initValue_significant_coeff_flag_skipmode[initType], 2); + + init_context(QPY, cm+CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER1_FLAG, &initValue_coeff_abs_level_greater1_flag[initType * 24], 24); + init_context(QPY, cm+CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER2_FLAG, &initValue_coeff_abs_level_greater2_flag[initType * 6], 6); + init_context(QPY, cm+CONTEXT_MODEL_SAO_MERGE_FLAG, 
&initValue_sao_merge_leftUp_flag[initType], 1); + init_context(QPY, cm+CONTEXT_MODEL_SAO_TYPE_IDX, &initValue_sao_type_idx_lumaChroma_flag[initType], 1); + init_context(QPY, cm+CONTEXT_MODEL_CU_QP_DELTA_ABS, initValue_cu_qp_delta_abs, 2); + init_context(QPY, cm+CONTEXT_MODEL_TRANSFORM_SKIP_FLAG, initValue_transform_skip_flag, 2); + init_context(QPY, cm+CONTEXT_MODEL_CU_TRANSQUANT_BYPASS_FLAG, &initValue_cu_transquant_bypass_flag[initType], 1); + + init_context_const(QPY, cm+CONTEXT_MODEL_LOG2_RES_SCALE_ABS_PLUS1, 154, 8); + init_context_const(QPY, cm+CONTEXT_MODEL_RES_SCALE_SIGN_FLAG, 154, 2); + init_context_const(QPY, cm+CONTEXT_MODEL_CU_CHROMA_QP_OFFSET_FLAG, 154, 1); + init_context_const(QPY, cm+CONTEXT_MODEL_CU_CHROMA_QP_OFFSET_IDX, 154, 1); +} diff --git a/de265.cc b/de265.cc new file mode 100644 index 0000000..75dd0a8 --- /dev/null +++ b/de265.cc @@ -0,0 +1,711 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#define DEBUG_INSERT_STREAM_ERRORS 0 + + +#include "de265.h" +#include "decctx.h" +#include "util.h" +#include "scan.h" +#include "image.h" +#include "sei.h" + +#include +#include +#include +#include + + +// TODO: should be in some vps.c related header +de265_error read_vps(decoder_context* ctx, bitreader* reader, video_parameter_set* vps); + +extern "C" { +LIBDE265_API const char *de265_get_version(void) +{ + return (LIBDE265_VERSION); +} + +LIBDE265_API uint32_t de265_get_version_number(void) +{ + return (LIBDE265_NUMERIC_VERSION); +} + +LIBDE265_API int de265_get_version_number_major(void) +{ + return ((LIBDE265_NUMERIC_VERSION)>>24) & 0xFF; +} + +LIBDE265_API int de265_get_version_number_minor(void) +{ + return ((LIBDE265_NUMERIC_VERSION)>>16) & 0xFF; +} + +LIBDE265_API int de265_get_version_number_maintenance(void) +{ + return ((LIBDE265_NUMERIC_VERSION)>>8) & 0xFF; +} + + +LIBDE265_API const char* de265_get_error_text(de265_error err) +{ + switch (err) { + case DE265_OK: return "no error"; + case DE265_ERROR_NO_SUCH_FILE: return "no such file"; + //case DE265_ERROR_NO_STARTCODE: return "no startcode found"; + //case DE265_ERROR_EOF: return "end of file"; + case DE265_ERROR_COEFFICIENT_OUT_OF_IMAGE_BOUNDS: return "coefficient out of image bounds"; + case DE265_ERROR_CHECKSUM_MISMATCH: return "image checksum mismatch"; + case DE265_ERROR_CTB_OUTSIDE_IMAGE_AREA: return "CTB outside of image area"; + case DE265_ERROR_OUT_OF_MEMORY: return "out of memory"; + case DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE: return "coded parameter out of range"; + case DE265_ERROR_IMAGE_BUFFER_FULL: return "DPB/output queue full"; + case DE265_ERROR_CANNOT_START_THREADPOOL: return "cannot start decoding threads"; + case DE265_ERROR_LIBRARY_INITIALIZATION_FAILED: return "global library initialization failed"; + case DE265_ERROR_LIBRARY_NOT_INITIALIZED: return "cannot free library data (not initialized"; + + //case DE265_ERROR_MAX_THREAD_CONTEXTS_EXCEEDED: + // return "internal 
error: maximum number of thread contexts exceeded"; + //case DE265_ERROR_MAX_NUMBER_OF_SLICES_EXCEEDED: + // return "internal error: maximum number of slices exceeded"; + case DE265_ERROR_NOT_IMPLEMENTED_YET: + return "unimplemented decoder feature"; + //case DE265_ERROR_SCALING_LIST_NOT_IMPLEMENTED: + //return "scaling list not implemented"; + + case DE265_ERROR_WAITING_FOR_INPUT_DATA: + return "no more input data, decoder stalled"; + case DE265_ERROR_CANNOT_PROCESS_SEI: + return "SEI data cannot be processed"; + case DE265_ERROR_PARAMETER_PARSING: + return "command-line parameter error"; + case DE265_ERROR_NO_INITIAL_SLICE_HEADER: + return "first slice missing, cannot decode dependent slice"; + case DE265_ERROR_PREMATURE_END_OF_SLICE: + return "premature end of slice data"; + case DE265_ERROR_UNSPECIFIED_DECODING_ERROR: + return "unspecified decoding error"; + + case DE265_WARNING_NO_WPP_CANNOT_USE_MULTITHREADING: + return "Cannot run decoder multi-threaded because stream does not support WPP"; + case DE265_WARNING_WARNING_BUFFER_FULL: + return "Too many warnings queued"; + case DE265_WARNING_PREMATURE_END_OF_SLICE_SEGMENT: + return "Premature end of slice segment"; + case DE265_WARNING_INCORRECT_ENTRY_POINT_OFFSET: + return "Incorrect entry-point offsets"; + case DE265_WARNING_CTB_OUTSIDE_IMAGE_AREA: + return "CTB outside of image area (concealing stream error...)"; + case DE265_WARNING_SPS_HEADER_INVALID: + return "sps header invalid"; + case DE265_WARNING_PPS_HEADER_INVALID: + return "pps header invalid"; + case DE265_WARNING_SLICEHEADER_INVALID: + return "slice header invalid"; + case DE265_WARNING_INCORRECT_MOTION_VECTOR_SCALING: + return "impossible motion vector scaling"; + case DE265_WARNING_NONEXISTING_PPS_REFERENCED: + return "non-existing PPS referenced"; + case DE265_WARNING_NONEXISTING_SPS_REFERENCED: + return "non-existing SPS referenced"; + case DE265_WARNING_BOTH_PREDFLAGS_ZERO: + return "both predFlags[] are zero in MC"; + case 
DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED: + return "non-existing reference picture accessed"; + case DE265_WARNING_NUMMVP_NOT_EQUAL_TO_NUMMVQ: + return "numMV_P != numMV_Q in deblocking"; + case DE265_WARNING_NUMBER_OF_SHORT_TERM_REF_PIC_SETS_OUT_OF_RANGE: + return "number of short-term ref-pic-sets out of range"; + case DE265_WARNING_SHORT_TERM_REF_PIC_SET_OUT_OF_RANGE: + return "short-term ref-pic-set index out of range"; + case DE265_WARNING_FAULTY_REFERENCE_PICTURE_LIST: + return "faulty reference picture list"; + case DE265_WARNING_EOSS_BIT_NOT_SET: + return "end_of_sub_stream_one_bit not set to 1 when it should be"; + case DE265_WARNING_MAX_NUM_REF_PICS_EXCEEDED: + return "maximum number of reference pictures exceeded"; + case DE265_WARNING_INVALID_CHROMA_FORMAT: + return "invalid chroma format in SPS header"; + case DE265_WARNING_SLICE_SEGMENT_ADDRESS_INVALID: + return "slice segment address invalid"; + case DE265_WARNING_DEPENDENT_SLICE_WITH_ADDRESS_ZERO: + return "dependent slice with address 0"; + case DE265_WARNING_NUMBER_OF_THREADS_LIMITED_TO_MAXIMUM: + return "number of threads limited to maximum amount"; + case DE265_NON_EXISTING_LT_REFERENCE_CANDIDATE_IN_SLICE_HEADER: + return "non-existing long-term reference candidate specified in slice header"; + case DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY: + return "cannot apply SAO because we ran out of memory"; + case DE265_WARNING_SPS_MISSING_CANNOT_DECODE_SEI: + return "SPS header missing, cannot decode SEI"; + case DE265_WARNING_COLLOCATED_MOTION_VECTOR_OUTSIDE_IMAGE_AREA: + return "collocated motion-vector is outside image area"; + + default: return "unknown error"; + } +} + +LIBDE265_API int de265_isOK(de265_error err) +{ + return err == DE265_OK || err >= 1000; +} + + + +static int de265_init_count; + +static std::mutex de265_init_mutex; + + +LIBDE265_API de265_error de265_init() +{ + std::lock_guard lock(de265_init_mutex); + + de265_init_count++; + + if (de265_init_count > 1) { + // we 
are not the first -> already initialized + + return DE265_OK; + } + + + // do initializations + + init_scan_orders(); + + if (!alloc_and_init_significant_coeff_ctxIdx_lookupTable()) { + de265_init_count--; + return DE265_ERROR_LIBRARY_INITIALIZATION_FAILED; + } + + return DE265_OK; +} + +LIBDE265_API de265_error de265_free() +{ + std::lock_guard lock(de265_init_mutex); + + if (de265_init_count<=0) { + return DE265_ERROR_LIBRARY_NOT_INITIALIZED; + } + + de265_init_count--; + + if (de265_init_count==0) { + free_significant_coeff_ctxIdx_lookupTable(); + } + + return DE265_OK; +} + + +LIBDE265_API de265_decoder_context* de265_new_decoder() +{ + de265_error init_err = de265_init(); + if (init_err != DE265_OK) { + return NULL; + } + + decoder_context* ctx = new decoder_context; + if (!ctx) { + de265_free(); + return NULL; + } + + return (de265_decoder_context*)ctx; +} + + +LIBDE265_API de265_error de265_free_decoder(de265_decoder_context* de265ctx) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + + ctx->stop_thread_pool(); + + delete ctx; + + return de265_free(); +} + + +LIBDE265_API de265_error de265_start_worker_threads(de265_decoder_context* de265ctx, int number_of_threads) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + + if (number_of_threads > MAX_THREADS) { + number_of_threads = MAX_THREADS; + } + + if (number_of_threads>0) { + de265_error err = ctx->start_thread_pool(number_of_threads); + if (de265_isOK(err)) { + err = DE265_OK; + } + return err; + } + else { + return DE265_OK; + } +} + + +#ifndef LIBDE265_DISABLE_DEPRECATED +LIBDE265_API de265_error de265_decode_data(de265_decoder_context* de265ctx, + const void* data8, int len) +{ + //decoder_context* ctx = (decoder_context*)de265ctx; + de265_error err; + if (len > 0) { + err = de265_push_data(de265ctx, data8, len, 0, NULL); + } else { + err = de265_flush_data(de265ctx); + } + if (err != DE265_OK) { + return err; + } + + int more = 0; + do { + err = de265_decode(de265ctx, &more); + if (err != 
DE265_OK) { + more = 0; + } + + switch (err) { + case DE265_ERROR_WAITING_FOR_INPUT_DATA: + // ignore error (didn't exist in 0.4 and before) + err = DE265_OK; + break; + default: + break; + } + } while (more); + return err; +} +#endif + +static void dumpdata(const void* data, int len) +{ + for (int i=0;inal_parser.push_data(data,len,pts,user_data); +} + + +LIBDE265_API de265_error de265_push_NAL(de265_decoder_context* de265ctx, + const void* data8, int len, + de265_PTS pts, void* user_data) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + uint8_t* data = (uint8_t*)data8; + + //printf("push NAL (size %d)\n",len); + //dumpdata(data8,16); + + return ctx->nal_parser.push_NAL(data,len,pts,user_data); +} + + +LIBDE265_API de265_error de265_decode(de265_decoder_context* de265ctx, int* more) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + + return ctx->decode(more); +} + + +LIBDE265_API void de265_push_end_of_NAL(de265_decoder_context* de265ctx) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + + ctx->nal_parser.flush_data(); +} + + +LIBDE265_API void de265_push_end_of_frame(de265_decoder_context* de265ctx) +{ + de265_push_end_of_NAL(de265ctx); + + decoder_context* ctx = (decoder_context*)de265ctx; + ctx->nal_parser.mark_end_of_frame(); +} + + +LIBDE265_API de265_error de265_flush_data(de265_decoder_context* de265ctx) +{ + de265_push_end_of_NAL(de265ctx); + + decoder_context* ctx = (decoder_context*)de265ctx; + + ctx->nal_parser.flush_data(); + ctx->nal_parser.mark_end_of_stream(); + + return DE265_OK; +} + + +LIBDE265_API void de265_reset(de265_decoder_context* de265ctx) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + + //printf("--- reset ---\n"); + + ctx->reset(); +} + + +LIBDE265_API const struct de265_image* de265_get_next_picture(de265_decoder_context* de265ctx) +{ + const struct de265_image* img = de265_peek_next_picture(de265ctx); + if (img) { + de265_release_next_picture(de265ctx); + } + + return img; +} + + +LIBDE265_API const 
struct de265_image* de265_peek_next_picture(de265_decoder_context* de265ctx) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + + if (ctx->num_pictures_in_output_queue()>0) { + de265_image* img = ctx->get_next_picture_in_output_queue(); + return img; + } + else { + return NULL; + } +} + + +LIBDE265_API void de265_release_next_picture(de265_decoder_context* de265ctx) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + + // no active output picture -> ignore release request + + if (ctx->num_pictures_in_output_queue()==0) { return; } + + de265_image* next_image = ctx->get_next_picture_in_output_queue(); + + loginfo(LogDPB, "release DPB with POC=%d\n",next_image->PicOrderCntVal); + + next_image->PicOutputFlag = false; + + // TODO: actually, we want to release it here, but we cannot without breaking API + // compatibility, because get_next_picture calls this immediately. Hence, we release + // images while scanning for available slots in the DPB. + // if (next_image->can_be_released()) { next_image->release(); } + + // pop output queue + + ctx->pop_next_picture_in_output_queue(); +} + + + +LIBDE265_API int de265_get_highest_TID(de265_decoder_context* de265ctx) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + return ctx->get_highest_TID(); +} + +LIBDE265_API int de265_get_current_TID(de265_decoder_context* de265ctx) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + return ctx->get_current_TID(); +} + +LIBDE265_API void de265_set_limit_TID(de265_decoder_context* de265ctx,int max_tid) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + ctx->set_limit_TID(max_tid); +} + +LIBDE265_API void de265_set_framerate_ratio(de265_decoder_context* de265ctx,int percent) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + ctx->set_framerate_ratio(percent); +} + +LIBDE265_API int de265_change_framerate(de265_decoder_context* de265ctx,int more) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + return ctx->change_framerate(more); +} + + 
+LIBDE265_API de265_error de265_get_warning(de265_decoder_context* de265ctx) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + + return ctx->get_warning(); +} + +LIBDE265_API void de265_set_parameter_bool(de265_decoder_context* de265ctx, enum de265_param param, int value) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + + switch (param) + { + case DE265_DECODER_PARAM_BOOL_SEI_CHECK_HASH: + ctx->param_sei_check_hash = !!value; + break; + + case DE265_DECODER_PARAM_SUPPRESS_FAULTY_PICTURES: + ctx->param_suppress_faulty_pictures = !!value; + break; + + case DE265_DECODER_PARAM_DISABLE_DEBLOCKING: + ctx->param_disable_deblocking = !!value; + break; + + case DE265_DECODER_PARAM_DISABLE_SAO: + ctx->param_disable_sao = !!value; + break; + + /* + case DE265_DECODER_PARAM_DISABLE_MC_RESIDUAL_IDCT: + ctx->param_disable_mc_residual_idct = !!value; + break; + + case DE265_DECODER_PARAM_DISABLE_INTRA_RESIDUAL_IDCT: + ctx->param_disable_intra_residual_idct = !!value; + break; + */ + + default: + assert(false); + break; + } +} + + +LIBDE265_API void de265_set_parameter_int(de265_decoder_context* de265ctx, enum de265_param param, int value) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + + switch (param) + { + case DE265_DECODER_PARAM_DUMP_SPS_HEADERS: + ctx->param_sps_headers_fd = value; + break; + + case DE265_DECODER_PARAM_DUMP_VPS_HEADERS: + ctx->param_vps_headers_fd = value; + break; + + case DE265_DECODER_PARAM_DUMP_PPS_HEADERS: + ctx->param_pps_headers_fd = value; + break; + + case DE265_DECODER_PARAM_DUMP_SLICE_HEADERS: + ctx->param_slice_headers_fd = value; + break; + + case DE265_DECODER_PARAM_ACCELERATION_CODE: + ctx->set_acceleration_functions((enum de265_acceleration)value); + break; + + default: + assert(false); + break; + } +} + + + + +LIBDE265_API int de265_get_parameter_bool(de265_decoder_context* de265ctx, enum de265_param param) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + + switch (param) + { + case 
DE265_DECODER_PARAM_BOOL_SEI_CHECK_HASH: + return ctx->param_sei_check_hash; + + case DE265_DECODER_PARAM_SUPPRESS_FAULTY_PICTURES: + return ctx->param_suppress_faulty_pictures; + + case DE265_DECODER_PARAM_DISABLE_DEBLOCKING: + return ctx->param_disable_deblocking; + + case DE265_DECODER_PARAM_DISABLE_SAO: + return ctx->param_disable_sao; + + /* + case DE265_DECODER_PARAM_DISABLE_MC_RESIDUAL_IDCT: + return ctx->param_disable_mc_residual_idct; + + case DE265_DECODER_PARAM_DISABLE_INTRA_RESIDUAL_IDCT: + return ctx->param_disable_intra_residual_idct; + */ + + default: + assert(false); + return false; + } +} + + +LIBDE265_API int de265_get_number_of_input_bytes_pending(de265_decoder_context* de265ctx) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + + return ctx->nal_parser.bytes_in_input_queue(); +} + + +LIBDE265_API int de265_get_number_of_NAL_units_pending(de265_decoder_context* de265ctx) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + + return ctx->nal_parser.number_of_NAL_units_pending(); +} + + +LIBDE265_API int de265_get_image_width(const struct de265_image* img,int channel) +{ + switch (channel) { + case 0: + return img->width_confwin; + case 1: + case 2: + return img->chroma_width_confwin; + default: + return 0; + } +} + +LIBDE265_API int de265_get_image_height(const struct de265_image* img,int channel) +{ + switch (channel) { + case 0: + return img->height_confwin; + case 1: + case 2: + return img->chroma_height_confwin; + default: + return 0; + } +} + +LIBDE265_API int de265_get_bits_per_pixel(const struct de265_image* img,int channel) +{ + switch (channel) { + case 0: + return img->get_sps().BitDepth_Y; + case 1: + case 2: + return img->get_sps().BitDepth_C; + default: + return 0; + } +} + +LIBDE265_API enum de265_chroma de265_get_chroma_format(const struct de265_image* img) +{ + return img->get_chroma_format(); +} + +LIBDE265_API const uint8_t* de265_get_image_plane(const de265_image* img, int channel, int* stride) +{ + 
assert(channel>=0 && channel <= 2); + + uint8_t* data = img->pixels_confwin[channel]; + + if (stride) *stride = img->get_image_stride(channel) * ((de265_get_bits_per_pixel(img, channel)+7) / 8); + + return data; +} + +LIBDE265_API void *de265_get_image_plane_user_data(const struct de265_image* img, int channel) +{ + assert(channel>=0 && channel <= 2); + + return img->plane_user_data[channel]; +} + +LIBDE265_API void de265_set_image_plane(de265_image* img, int cIdx, void* mem, int stride, void *userdata) +{ + // The internal "stride" is the number of pixels per line. + stride = stride / ((de265_get_bits_per_pixel(img, cIdx)+7) / 8); + img->set_image_plane(cIdx, (uint8_t*)mem, stride, userdata); +} + +LIBDE265_API void de265_set_image_allocation_functions(de265_decoder_context* de265ctx, + de265_image_allocation* allocfunc, + void* userdata) +{ + decoder_context* ctx = (decoder_context*)de265ctx; + + ctx->set_image_allocation_functions(allocfunc, userdata); +} + +LIBDE265_API const struct de265_image_allocation *de265_get_default_image_allocation_functions(void) +{ + return &de265_image::default_image_allocation; +} + +LIBDE265_API de265_PTS de265_get_image_PTS(const struct de265_image* img) +{ + return img->pts; +} + +LIBDE265_API void* de265_get_image_user_data(const struct de265_image* img) +{ + return img->user_data; +} + +LIBDE265_API void de265_set_image_user_data(struct de265_image* img, void *user_data) +{ + img->user_data = user_data; +} + +LIBDE265_API void de265_get_image_NAL_header(const struct de265_image* img, + int* nal_unit_type, + const char** nal_unit_name, + int* nuh_layer_id, + int* nuh_temporal_id) +{ + if (nal_unit_type) *nal_unit_type = img->nal_hdr.nal_unit_type; + if (nal_unit_name) *nal_unit_name = get_NAL_name(img->nal_hdr.nal_unit_type); + if (nuh_layer_id) *nuh_layer_id = img->nal_hdr.nuh_layer_id; + if (nuh_temporal_id) *nuh_temporal_id = img->nal_hdr.nuh_temporal_id; +} +} diff --git a/deblock.cc b/deblock.cc new file mode 100644 index 
0000000..f64cd8e --- /dev/null +++ b/deblock.cc @@ -0,0 +1,1058 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "deblock.h" +#include "util.h" +#include "transform.h" +#include "de265.h" + +#include + + + +// 8.7.2.1 for both EDGE_HOR and EDGE_VER at the same time +void markTransformBlockBoundary(de265_image* img, int x0,int y0, + int log2TrafoSize,int trafoDepth, + int filterLeftCbEdge, int filterTopCbEdge) +{ + logtrace(LogDeblock,"markTransformBlockBoundary(%d,%d, %d,%d, %d,%d)\n",x0,y0, + log2TrafoSize,trafoDepth, filterLeftCbEdge,filterTopCbEdge); + + int split_transform = img->get_split_transform_flag(x0,y0,trafoDepth); + if (split_transform) { + int x1 = x0 + ((1<>1); + int y1 = y0 + ((1<>1); + + markTransformBlockBoundary(img,x0,y0,log2TrafoSize-1,trafoDepth+1, filterLeftCbEdge, filterTopCbEdge); + markTransformBlockBoundary(img,x1,y0,log2TrafoSize-1,trafoDepth+1, DEBLOCK_FLAG_VERTI, filterTopCbEdge); + markTransformBlockBoundary(img,x0,y1,log2TrafoSize-1,trafoDepth+1, filterLeftCbEdge, DEBLOCK_FLAG_HORIZ); + markTransformBlockBoundary(img,x1,y1,log2TrafoSize-1,trafoDepth+1, DEBLOCK_FLAG_VERTI, DEBLOCK_FLAG_HORIZ); + } + else { + // VER + + for (int k=0;k<(1<set_deblk_flags(x0,y0+k, filterLeftCbEdge); + } + + // HOR + + for (int 
k=0;k<(1<set_deblk_flags(x0+k,y0, filterTopCbEdge); + } + } +} + + + +// 8.7.2.2 for both EDGE_HOR and EDGE_VER at the same time +void markPredictionBlockBoundary(de265_image* img, int x0,int y0, + int log2CbSize, + int filterLeftCbEdge, int filterTopCbEdge) +{ + logtrace(LogDeblock,"markPredictionBlockBoundary(%d,%d, %d, %d,%d)\n",x0,y0, + log2CbSize, filterLeftCbEdge,filterTopCbEdge); + + enum PartMode partMode = img->get_PartMode(x0,y0); + + int cbSize = 1<set_deblk_flags(x0+cbSize2,y0+k, DEBLOCK_PB_EDGE_VERTI); + img->set_deblk_flags(x0+k,y0+cbSize2, DEBLOCK_PB_EDGE_HORIZ); + } + break; + + case PART_Nx2N: + for (int k=0;kset_deblk_flags(x0+cbSize2,y0+k, DEBLOCK_PB_EDGE_VERTI); + } + break; + + case PART_2NxN: + for (int k=0;kset_deblk_flags(x0+k,y0+cbSize2, DEBLOCK_PB_EDGE_HORIZ); + } + break; + + case PART_nLx2N: + for (int k=0;kset_deblk_flags(x0+cbSize4,y0+k, DEBLOCK_PB_EDGE_VERTI); + } + break; + + case PART_nRx2N: + for (int k=0;kset_deblk_flags(x0+cbSize2+cbSize4,y0+k, DEBLOCK_PB_EDGE_VERTI); + } + break; + + case PART_2NxnU: + for (int k=0;kset_deblk_flags(x0+k,y0+cbSize4, DEBLOCK_PB_EDGE_HORIZ); + } + break; + + case PART_2NxnD: + for (int k=0;kset_deblk_flags(x0+k,y0+cbSize2+cbSize4, DEBLOCK_PB_EDGE_HORIZ); + } + break; + + case PART_2Nx2N: + // NOP + break; + } +} + + +bool derive_edgeFlags_CTBRow(de265_image* img, int ctby) +{ + const seq_parameter_set& sps = img->get_sps(); + const pic_parameter_set& pps = img->get_pps(); + + const int minCbSize = sps.MinCbSizeY; + bool deblocking_enabled=false; // whether deblocking is enabled in some part of the image + + int ctb_mask = (1<> sps.Log2MinCbSizeY; + int cb_y_end = ((ctby+1) << sps.Log2CtbSizeY) >> sps.Log2MinCbSizeY; + + cb_y_end = std::min(cb_y_end, sps.PicHeightInMinCbsY); + + for (int cb_y=cb_y_start;cb_yget_sps().PicWidthInMinCbsY;cb_x++) + { + int log2CbSize = img->get_log2CbSize_cbUnits(cb_x,cb_y); + if (log2CbSize==0) { + continue; + } + + // we are now at the top corner of a CB + + int x0 = 
cb_x * minCbSize; + int y0 = cb_y * minCbSize; + + int x0ctb = x0 >> ctbshift; + int y0ctb = y0 >> ctbshift; + + // check for corrupted streams + if (img->is_SliceHeader_available(x0,y0)==false) { + return false; + } + + // check whether we should filter this slice + + slice_segment_header* shdr = img->get_SliceHeader(x0,y0); + + // check whether to filter left and top edge + + uint8_t filterLeftCbEdge = DEBLOCK_FLAG_VERTI; + uint8_t filterTopCbEdge = DEBLOCK_FLAG_HORIZ; + if (x0 == 0) filterLeftCbEdge = 0; + if (y0 == 0) filterTopCbEdge = 0; + + // check for slice and tile boundaries (8.7.2, step 2 in both processes) + + if (x0 && ((x0 & ctb_mask) == 0)) { // left edge at CTB boundary + if (shdr->slice_loop_filter_across_slices_enabled_flag == 0 && + img->is_SliceHeader_available(x0-1,y0) && // for corrupted streams + shdr->SliceAddrRS != img->get_SliceHeader(x0-1,y0)->SliceAddrRS) + { + filterLeftCbEdge = 0; + } + else if (pps.loop_filter_across_tiles_enabled_flag == 0 && + pps.TileIdRS[ x0ctb +y0ctb*picWidthInCtbs] != + pps.TileIdRS[((x0-1)>>ctbshift)+y0ctb*picWidthInCtbs]) { + filterLeftCbEdge = 0; + } + } + + if (y0 && ((y0 & ctb_mask) == 0)) { // top edge at CTB boundary + if (shdr->slice_loop_filter_across_slices_enabled_flag == 0 && + img->is_SliceHeader_available(x0,y0-1) && // for corrupted streams + shdr->SliceAddrRS != img->get_SliceHeader(x0,y0-1)->SliceAddrRS) + { + filterTopCbEdge = 0; + } + else if (pps.loop_filter_across_tiles_enabled_flag == 0 && + pps.TileIdRS[x0ctb+ y0ctb *picWidthInCtbs] != + pps.TileIdRS[x0ctb+((y0-1)>>ctbshift)*picWidthInCtbs]) { + filterTopCbEdge = 0; + } + } + + + // mark edges + + if (shdr->slice_deblocking_filter_disabled_flag==0) { + deblocking_enabled=true; + + markTransformBlockBoundary(img, x0,y0, log2CbSize,0, + filterLeftCbEdge, filterTopCbEdge); + + markPredictionBlockBoundary(img, x0,y0, log2CbSize, + filterLeftCbEdge, filterTopCbEdge); + } + } + + return deblocking_enabled; +} + + +bool 
derive_edgeFlags(de265_image* img) +{ + bool deblocking_enabled=false; + + for (int y=0;yget_sps().PicHeightInCtbsY;y++) { + deblocking_enabled |= derive_edgeFlags_CTBRow(img,y); + } + + return deblocking_enabled; +} + + +// 8.7.2.3 (both, EDGE_VER and EDGE_HOR) +void derive_boundaryStrength(de265_image* img, bool vertical, int yStart,int yEnd, + int xStart,int xEnd) +{ + int xIncr = vertical ? 2 : 1; + int yIncr = vertical ? 1 : 2; + int xOffs = vertical ? 1 : 0; + int yOffs = vertical ? 0 : 1; + int edgeMask = vertical ? + (DEBLOCK_FLAG_VERTI | DEBLOCK_PB_EDGE_VERTI) : + (DEBLOCK_FLAG_HORIZ | DEBLOCK_PB_EDGE_HORIZ); + int transformEdgeMask = vertical ? DEBLOCK_FLAG_VERTI : DEBLOCK_FLAG_HORIZ; + + xEnd = libde265_min(xEnd,img->get_deblk_width()); + yEnd = libde265_min(yEnd,img->get_deblk_height()); + + int TUShift = img->get_sps().Log2MinTrafoSize; + int TUStride= img->get_sps().PicWidthInTbsY; + + for (int y=yStart;yget_deblk_flags(xDi,yDi) & edgeMask) ? "edge" : "..."); + + uint8_t edgeFlags = img->get_deblk_flags(xDi,yDi); + + if (edgeFlags & edgeMask) { + bool p_is_intra_pred = (img->get_pred_mode(xDi-xOffs, yDi-yOffs) == MODE_INTRA); + bool q_is_intra_pred = (img->get_pred_mode(xDi, yDi ) == MODE_INTRA); + + int bS; + + if (p_is_intra_pred || q_is_intra_pred) { + bS = 2; + } + else { + // opposing site + int xDiOpp = xDi-xOffs; + int yDiOpp = yDi-yOffs; + + if ((edgeFlags & transformEdgeMask) && + (img->get_nonzero_coefficient(xDi ,yDi) || + img->get_nonzero_coefficient(xDiOpp,yDiOpp))) { + bS = 1; + } + else { + + bS = 0; + + const PBMotion& mviP = img->get_mv_info(xDiOpp,yDiOpp); + const PBMotion& mviQ = img->get_mv_info(xDi ,yDi); + + slice_segment_header* shdrP = img->get_SliceHeader(xDiOpp,yDiOpp); + slice_segment_header* shdrQ = img->get_SliceHeader(xDi ,yDi); + + int refPicP0 = mviP.predFlag[0] ? shdrP->RefPicList[0][ mviP.refIdx[0] ] : -1; + int refPicP1 = mviP.predFlag[1] ? 
shdrP->RefPicList[1][ mviP.refIdx[1] ] : -1; + int refPicQ0 = mviQ.predFlag[0] ? shdrQ->RefPicList[0][ mviQ.refIdx[0] ] : -1; + int refPicQ1 = mviQ.predFlag[1] ? shdrQ->RefPicList[1][ mviQ.refIdx[1] ] : -1; + + bool samePics = ((refPicP0==refPicQ0 && refPicP1==refPicQ1) || + (refPicP0==refPicQ1 && refPicP1==refPicQ0)); + + if (!samePics) { + bS = 1; + } + else { + MotionVector mvP0 = mviP.mv[0]; if (!mviP.predFlag[0]) { mvP0.x=mvP0.y=0; } + MotionVector mvP1 = mviP.mv[1]; if (!mviP.predFlag[1]) { mvP1.x=mvP1.y=0; } + MotionVector mvQ0 = mviQ.mv[0]; if (!mviQ.predFlag[0]) { mvQ0.x=mvQ0.y=0; } + MotionVector mvQ1 = mviQ.mv[1]; if (!mviQ.predFlag[1]) { mvQ1.x=mvQ1.y=0; } + + int numMV_P = mviP.predFlag[0] + mviP.predFlag[1]; + int numMV_Q = mviQ.predFlag[0] + mviQ.predFlag[1]; + + if (numMV_P!=numMV_Q) { + img->decctx->add_warning(DE265_WARNING_NUMMVP_NOT_EQUAL_TO_NUMMVQ, false); + img->integrity = INTEGRITY_DECODING_ERRORS; + } + + // two different reference pictures or only one reference picture + if (refPicP0 != refPicP1) { + + if (refPicP0 == refPicQ0) { + if (abs_value(mvP0.x-mvQ0.x) >= 4 || + abs_value(mvP0.y-mvQ0.y) >= 4 || + abs_value(mvP1.x-mvQ1.x) >= 4 || + abs_value(mvP1.y-mvQ1.y) >= 4) { + bS = 1; + } + } + else { + if (abs_value(mvP0.x-mvQ1.x) >= 4 || + abs_value(mvP0.y-mvQ1.y) >= 4 || + abs_value(mvP1.x-mvQ0.x) >= 4 || + abs_value(mvP1.y-mvQ0.y) >= 4) { + bS = 1; + } + } + } + else { + assert(refPicQ0==refPicQ1); + + if ((abs_value(mvP0.x-mvQ0.x) >= 4 || + abs_value(mvP0.y-mvQ0.y) >= 4 || + abs_value(mvP1.x-mvQ1.x) >= 4 || + abs_value(mvP1.y-mvQ1.y) >= 4) + && + (abs_value(mvP0.x-mvQ1.x) >= 4 || + abs_value(mvP0.y-mvQ1.y) >= 4 || + abs_value(mvP1.x-mvQ0.x) >= 4 || + abs_value(mvP1.y-mvQ0.y) >= 4)) { + bS = 1; + } + } + } + + /* + printf("unimplemented deblocking code for CU at %d;%d\n",xDi,yDi); + + logerror(LogDeblock, "unimplemented code reached (file %s, line %d)\n", + __FILE__, __LINE__); + */ + } + } + + img->set_deblk_bS(xDi,yDi, bS); + } + else { 
+ img->set_deblk_bS(xDi,yDi, 0); + } + } +} + + +void derive_boundaryStrength_CTB(de265_image* img, bool vertical, int xCtb,int yCtb) +{ + int ctbSize = img->get_sps().CtbSizeY; + int deblkSize = ctbSize/4; + + derive_boundaryStrength(img,vertical, + yCtb*deblkSize, (yCtb+1)*deblkSize, + xCtb*deblkSize, (xCtb+1)*deblkSize); +} + + +static uint8_t table_8_23_beta[52] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8, + 9,10,11,12,13,14,15,16,17,18,20,22,24,26,28,30,32,34,36, + 38,40,42,44,46,48,50,52,54,56,58,60,62,64 +}; + +static uint8_t table_8_23_tc[54] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, + 5, 5, 6, 6, 7, 8, 9,10,11,13,14,16,18,20,22,24 +}; + + + +// 8.7.2.4 +template +void edge_filtering_luma_internal(de265_image* img, bool vertical, + int yStart,int yEnd, int xStart,int xEnd) +{ + //printf("luma %d-%d %d-%d\n",xStart,xEnd,yStart,yEnd); + + const seq_parameter_set& sps = img->get_sps(); + + int xIncr = vertical ? 2 : 1; + int yIncr = vertical ? 1 : 2; + + const int stride = img->get_image_stride(0); + + int bitDepth_Y = sps.BitDepth_Y; + + xEnd = libde265_min(xEnd,img->get_deblk_width()); + yEnd = libde265_min(yEnd,img->get_deblk_height()); + + for (int y=yStart;y pixel resolution + int yDi = y<<2; // *4 -> pixel resolution + int bS = img->get_deblk_bS(xDi,yDi); + + //printf("x,y:%d,%d xDi,yDi:%d,%d\n",x,y,xDi,yDi); + + logtrace(LogDeblock,"deblock POC=%d %c --- x:%d y:%d bS:%d---\n", + img->PicOrderCntVal,vertical ? 
'V':'H',xDi,yDi,bS); + +#if 0 + { + uint8_t* ptr = img->y + stride*yDi + xDi; + + for (int dy=-4;dy<4;dy++) { + for (int dx=-4;dx<4;dx++) { + printf("%02x ", ptr[dy*stride + dx]); + if (dx==-1) printf("| "); + } + printf("\n"); + if (dy==-1) printf("-------------------------\n"); + } + } +#endif + +#if 0 + if (!vertical) + { + uint8_t* ptr = img->y + stride*yDi + xDi; + + for (int dy=-4;dy<4;dy++) { + for (int dx=0;dx<4;dx++) { + printf("%02x ", ptr[dy*stride + dx]); + if (dx==-1) printf("| "); + } + printf("\n"); + if (dy==-1) printf("-------------------------\n"); + } + } +#endif + + if (bS>0) { + + // 8.7.2.4.3 + + pixel_t* ptr = img->get_image_plane_at_pos_NEW(0, xDi,yDi); + + pixel_t q[4][4], p[4][4]; + for (int k=0;k<4;k++) + for (int i=0;i<4;i++) + { + if (vertical) { + q[k][i] = ptr[ i +k*stride]; + p[k][i] = ptr[-i-1+k*stride]; + } + else { + q[k][i] = ptr[k + i *stride]; + p[k][i] = ptr[k -(i+1)*stride]; + } + } + +#if 0 + for (int k=0;k<4;k++) + { + for (int i=0;i<4;i++) + { + printf("%02x ", p[k][3-i]); + } + + printf("| "); + + for (int i=0;i<4;i++) + { + printf("%02x ", q[k][i]); + } + printf("\n"); + } +#endif + + + int QP_Q = img->get_QPY(xDi,yDi); + int QP_P = (vertical ? 
+ img->get_QPY(xDi-1,yDi) : + img->get_QPY(xDi,yDi-1) ); + int qP_L = (QP_Q+QP_P+1)>>1; + + logtrace(LogDeblock,"QP: %d & %d -> %d\n",QP_Q,QP_P,qP_L); + + int sliceIndexQ00 = img->get_SliceHeaderIndex(xDi,yDi); + int beta_offset = img->slices[sliceIndexQ00]->slice_beta_offset; + int tc_offset = img->slices[sliceIndexQ00]->slice_tc_offset; + + int Q_beta = Clip3(0,51, qP_L + beta_offset); + int betaPrime = table_8_23_beta[Q_beta]; + int beta = betaPrime * (1<<(bitDepth_Y - 8)); + + int Q_tc = Clip3(0,53, qP_L + 2*(bS-1) + tc_offset); + int tcPrime = table_8_23_tc[Q_tc]; + int tc = tcPrime * (1<<(bitDepth_Y - 8)); + + logtrace(LogDeblock,"beta: %d (%d) tc: %d (%d)\n",beta,beta_offset, tc,tc_offset); + + int dE=0, dEp=0, dEq=0; + + if (vertical || !vertical) { + int dp0 = abs_value(p[0][2] - 2*p[0][1] + p[0][0]); + int dp3 = abs_value(p[3][2] - 2*p[3][1] + p[3][0]); + int dq0 = abs_value(q[0][2] - 2*q[0][1] + q[0][0]); + int dq3 = abs_value(q[3][2] - 2*q[3][1] + q[3][0]); + + int dpq0 = dp0 + dq0; + int dpq3 = dp3 + dq3; + + int dp = dp0 + dp3; + int dq = dq0 + dq3; + int d = dpq0+ dpq3; + + if (d>2) && + abs_value(p[0][3]-p[0][0])+abs_value(q[0][0]-q[0][3]) < (beta>>3) && + abs_value(p[0][0]-q[0][0]) < ((5*tc+1)>>1)); + + bool dSam3 = (2*dpq3 < (beta>>2) && + abs_value(p[3][3]-p[3][0])+abs_value(q[3][0]-q[3][3]) < (beta>>3) && + abs_value(p[3][0]-q[3][0]) < ((5*tc+1)>>1)); + + if (dSam0 && dSam3) { + dE=2; + } + else { + dE=1; + } + + if (dp < ((beta + (beta>>1))>>3)) { dEp=1; } + if (dq < ((beta + (beta>>1))>>3)) { dEq=1; } + + logtrace(LogDeblock,"dE:%d dEp:%d dEq:%d\n",dE,dEp,dEq); + } + } + else { + // TODO + assert(0); + } + + + // 8.7.2.4.4 + + if (dE != 0) { + bool filterP = true; + bool filterQ = true; + + if (vertical) { + if (sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(xDi-1,yDi)) filterP=false; + if (img->get_cu_transquant_bypass(xDi-1,yDi)) filterP=false; + + if (sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(xDi,yDi)) filterQ=false; + if 
(img->get_cu_transquant_bypass(xDi,yDi)) filterQ=false; + } + else { + if (sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(xDi,yDi-1)) filterP=false; + if (img->get_cu_transquant_bypass(xDi,yDi-1)) filterP=false; + + if (sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(xDi,yDi)) filterQ=false; + if (img->get_cu_transquant_bypass(xDi,yDi)) filterQ=false; + } + + for (int k=0;k<4;k++) { + //int nDp,nDq; + + logtrace(LogDeblock,"line:%d\n",k); + + const pixel_t p0 = p[k][0]; + const pixel_t p1 = p[k][1]; + const pixel_t p2 = p[k][2]; + const pixel_t p3 = p[k][3]; + const pixel_t q0 = q[k][0]; + const pixel_t q1 = q[k][1]; + const pixel_t q2 = q[k][2]; + const pixel_t q3 = q[k][3]; + + if (dE==2) { + // strong filtering + + //nDp=nDq=3; + + pixel_t pnew[3],qnew[3]; + pnew[0] = Clip3(p0-2*tc,p0+2*tc, (p2 + 2*p1 + 2*p0 + 2*q0 + q1 +4)>>3); + pnew[1] = Clip3(p1-2*tc,p1+2*tc, (p2 + p1 + p0 + q0+2)>>2); + pnew[2] = Clip3(p2-2*tc,p2+2*tc, (2*p3 + 3*p2 + p1 + p0 + q0 + 4)>>3); + qnew[0] = Clip3(q0-2*tc,q0+2*tc, (p1+2*p0+2*q0+2*q1+q2+4)>>3); + qnew[1] = Clip3(q1-2*tc,q1+2*tc, (p0+q0+q1+q2+2)>>2); + qnew[2] = Clip3(q2-2*tc,q2+2*tc, (p0+q0+q1+3*q2+2*q3+4)>>3); + + logtrace(LogDeblock,"strong filtering\n"); + + if (vertical) { + for (int i=0;i<3;i++) { + if (filterP) { ptr[-i-1+k*stride] = pnew[i]; } + if (filterQ) { ptr[ i + k*stride] = qnew[i]; } + } + + // ptr[-1+k*stride] = ptr[ 0+k*stride] = 200; + } + else { + for (int i=0;i<3;i++) { + if (filterP) { ptr[ k -(i+1)*stride] = pnew[i]; } + if (filterQ) { ptr[ k + i *stride] = qnew[i]; } + } + } + } + else { + // weak filtering + + //nDp=nDq=0; + + int delta = (9*(q0-p0) - 3*(q1-p1) + 8)>>4; + logtrace(LogDeblock,"delta=%d, tc=%d\n",delta,tc); + + if (abs_value(delta) < tc*10) { + + delta = Clip3(-tc,tc,delta); + logtrace(LogDeblock," deblk + %d;%d [%02x->%02x] - %d;%d [%02x->%02x] delta:%d\n", + vertical ? xDi-1 : xDi+k, + vertical ? yDi+k : yDi-1, p0,Clip_BitDepth(p0+delta, bitDepth_Y), + vertical ? 
xDi : xDi+k, + vertical ? yDi+k : yDi, q0,Clip_BitDepth(q0-delta, bitDepth_Y), + delta); + + if (vertical) { + if (filterP) { ptr[-0-1+k*stride] = Clip_BitDepth(p0+delta, bitDepth_Y); } + if (filterQ) { ptr[ 0 +k*stride] = Clip_BitDepth(q0-delta, bitDepth_Y); } + } + else { + if (filterP) { ptr[ k -1*stride] = Clip_BitDepth(p0+delta, bitDepth_Y); } + if (filterQ) { ptr[ k +0*stride] = Clip_BitDepth(q0-delta, bitDepth_Y); } + } + + //ptr[ 0+k*stride] = 200; + + if (dEp==1 && filterP) { + int delta_p = Clip3(-(tc>>1), tc>>1, (((p2+p0+1)>>1)-p1+delta)>>1); + + logtrace(LogDeblock," deblk dEp %d;%d delta:%d\n", + vertical ? xDi-2 : xDi+k, + vertical ? yDi+k : yDi-2, + delta_p); + + if (vertical) { ptr[-1-1+k*stride] = Clip_BitDepth(p1+delta_p, bitDepth_Y); } + else { ptr[ k -2*stride] = Clip_BitDepth(p1+delta_p, bitDepth_Y); } + } + + if (dEq==1 && filterQ) { + int delta_q = Clip3(-(tc>>1), tc>>1, (((q2+q0+1)>>1)-q1-delta)>>1); + + logtrace(LogDeblock," delkb dEq %d;%d delta:%d\n", + vertical ? xDi+1 : xDi+k, + vertical ? yDi+k : yDi+1, + delta_q); + + if (vertical) { ptr[ 1 +k*stride] = Clip_BitDepth(q1+delta_q, bitDepth_Y); } + else { ptr[ k +1*stride] = Clip_BitDepth(q1+delta_q, bitDepth_Y); } + } + + //nDp = dEp+1; + //nDq = dEq+1; + + //logtrace(LogDeblock,"weak filtering (%d:%d)\n",nDp,nDq); + } + } + } + } + } + } +} + + +void edge_filtering_luma(de265_image* img, bool vertical, + int yStart,int yEnd, int xStart,int xEnd) +{ + if (img->high_bit_depth(0)) { + edge_filtering_luma_internal(img,vertical,yStart,yEnd,xStart,xEnd); + } + else { + edge_filtering_luma_internal(img,vertical,yStart,yEnd,xStart,xEnd); + } +} + +void edge_filtering_luma_CTB(de265_image* img, bool vertical, int xCtb,int yCtb) +{ + int ctbSize = img->get_sps().CtbSizeY; + int deblkSize = ctbSize/4; + + edge_filtering_luma(img,vertical, + yCtb*deblkSize, (yCtb+1)*deblkSize, + xCtb*deblkSize, (xCtb+1)*deblkSize); +} + + + + +// 8.7.2.4 +/** ?Start and ?End values in 4-luma pixels resolution. 
+ */ +template +void edge_filtering_chroma_internal(de265_image* img, bool vertical, + int yStart,int yEnd, + int xStart,int xEnd) +{ + //printf("chroma %d-%d %d-%d\n",xStart,xEnd,yStart,yEnd); + + const seq_parameter_set& sps = img->get_sps(); + + const int SubWidthC = sps.SubWidthC; + const int SubHeightC = sps.SubHeightC; + + int xIncr = vertical ? 2 : 1; + int yIncr = vertical ? 1 : 2; + + xIncr *= SubWidthC; + yIncr *= SubHeightC; + + const int stride = img->get_image_stride(1); + + xEnd = libde265_min(xEnd,img->get_deblk_width()); + yEnd = libde265_min(yEnd,img->get_deblk_height()); + + int bitDepth_C = sps.BitDepth_C; + + for (int y=yStart;yget_deblk_bS(xDi*SubWidthC,yDi*SubHeightC); + + if (bS>1) { + // 8.7.2.4.5 + + for (int cplane=0;cplane<2;cplane++) { + int cQpPicOffset = (cplane==0 ? + img->get_pps().pic_cb_qp_offset : + img->get_pps().pic_cr_qp_offset); + + pixel_t* ptr = img->get_image_plane_at_pos_NEW(cplane+1, xDi,yDi); + + pixel_t p[2][4]; + pixel_t q[2][4]; + + logtrace(LogDeblock,"-%s- %d %d\n",cplane==0 ? "Cb" : "Cr",xDi,yDi); + + for (int i=0;i<2;i++) + for (int k=0;k<4;k++) + { + if (vertical) { + q[i][k] = ptr[ i +k*stride]; + p[i][k] = ptr[-i-1+k*stride]; + } + else { + q[i][k] = ptr[k + i *stride]; + p[i][k] = ptr[k -(i+1)*stride]; + } + } + +#if 0 + for (int k=0;k<4;k++) + { + for (int i=0;i<2;i++) + { + printf("%02x ", p[1-i][k]); + } + + printf("| "); + + for (int i=0;i<2;i++) + { + printf("%02x ", q[i][k]); + } + printf("\n"); + } +#endif + + int QP_Q = img->get_QPY(SubWidthC*xDi,SubHeightC*yDi); + int QP_P = (vertical ? 
+ img->get_QPY(SubWidthC*xDi-1,SubHeightC*yDi) : + img->get_QPY(SubWidthC*xDi,SubHeightC*yDi-1)); + int qP_i = ((QP_Q+QP_P+1)>>1) + cQpPicOffset; + int QP_C; + if (sps.ChromaArrayType == CHROMA_420) { + QP_C = table8_22(qP_i); + } else { + QP_C = libde265_min(qP_i, 51); + } + + + //printf("POC=%d\n",ctx->img->PicOrderCntVal); + logtrace(LogDeblock,"%d %d: ((%d+%d+1)>>1) + %d = qP_i=%d (QP_C=%d)\n", + SubWidthC*xDi,SubHeightC*yDi, QP_Q,QP_P,cQpPicOffset,qP_i,QP_C); + + int sliceIndexQ00 = img->get_SliceHeaderIndex(SubWidthC*xDi,SubHeightC*yDi); + int tc_offset = img->slices[sliceIndexQ00]->slice_tc_offset; + + int Q = Clip3(0,53, QP_C + 2*(bS-1) + tc_offset); + + int tcPrime = table_8_23_tc[Q]; + int tc = tcPrime * (1<<(sps.BitDepth_C - 8)); + + logtrace(LogDeblock,"tc_offset=%d Q=%d tc'=%d tc=%d\n",tc_offset,Q,tcPrime,tc); + + if (vertical) { + bool filterP = true; + if (sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(SubWidthC*xDi-1,SubHeightC*yDi)) filterP=false; + if (img->get_cu_transquant_bypass(SubWidthC*xDi-1,SubHeightC*yDi)) filterP=false; + + bool filterQ = true; + if (sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(SubWidthC*xDi,SubHeightC*yDi)) filterQ=false; + if (img->get_cu_transquant_bypass(SubWidthC*xDi,SubHeightC*yDi)) filterQ=false; + + + for (int k=0;k<4;k++) { + int delta = Clip3(-tc,tc, ((((q[0][k]-p[0][k])<<2)+p[1][k]-q[1][k]+4)>>3)); + logtrace(LogDeblock,"delta=%d\n",delta); + if (filterP) { ptr[-1+k*stride] = Clip_BitDepth(p[0][k]+delta, bitDepth_C); } + if (filterQ) { ptr[ 0+k*stride] = Clip_BitDepth(q[0][k]-delta, bitDepth_C); } + } + } + else { + bool filterP = true; + if (sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(SubWidthC*xDi,SubHeightC*yDi-1)) filterP=false; + if (img->get_cu_transquant_bypass(SubWidthC*xDi,SubHeightC*yDi-1)) filterP=false; + + bool filterQ = true; + if (sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(SubWidthC*xDi,SubHeightC*yDi)) filterQ=false; + if 
(img->get_cu_transquant_bypass(SubWidthC*xDi,SubHeightC*yDi)) filterQ=false; + + for (int k=0;k<4;k++) { + int delta = Clip3(-tc,tc, ((((q[0][k]-p[0][k])<<2)+p[1][k]-q[1][k]+4)>>3)); + if (filterP) { ptr[ k-1*stride] = Clip_BitDepth(p[0][k]+delta, bitDepth_C); } + if (filterQ) { ptr[ k+0*stride] = Clip_BitDepth(q[0][k]-delta, bitDepth_C); } + } + } + } + } + } +} + + +void edge_filtering_chroma(de265_image* img, bool vertical, int yStart,int yEnd, + int xStart,int xEnd) +{ + if (img->high_bit_depth(1)) { + edge_filtering_chroma_internal(img,vertical,yStart,yEnd,xStart,xEnd); + } + else { + edge_filtering_chroma_internal(img,vertical,yStart,yEnd,xStart,xEnd); + } +} + + +void edge_filtering_chroma_CTB(de265_image* img, bool vertical, int xCtb,int yCtb) +{ + int ctbSize = img->get_sps().CtbSizeY; + int deblkSize = ctbSize/4; + + edge_filtering_chroma(img,vertical, + yCtb*deblkSize, (yCtb+1)*deblkSize, + xCtb*deblkSize, (xCtb+1)*deblkSize); +} + + + +class thread_task_deblock_CTBRow : public thread_task +{ +public: + struct de265_image* img; + int ctb_y; + bool vertical; + + virtual void work(); + virtual std::string name() const { + char buf[100]; + sprintf(buf,"deblock-%d",ctb_y); + return buf; + } +}; + + +void thread_task_deblock_CTBRow::work() +{ + state = Running; + img->thread_run(this); + + int xStart=0; + int xEnd = img->get_deblk_width(); + + int ctbSize = img->get_sps().CtbSizeY; + int deblkSize = ctbSize/4; + + int first = ctb_y * deblkSize; + int last = (ctb_y+1) * deblkSize; + if (last > img->get_deblk_height()) { + last = img->get_deblk_height(); + } + + int finalProgress = CTB_PROGRESS_DEBLK_V; + if (!vertical) finalProgress = CTB_PROGRESS_DEBLK_H; + + int rightCtb = img->get_sps().PicWidthInCtbsY-1; + + if (vertical) { + // pass 1: vertical + + int CtbRow = std::min(ctb_y+1 , img->get_sps().PicHeightInCtbsY-1); + img->wait_for_progress(this, rightCtb,CtbRow, CTB_PROGRESS_PREFILTER); + } + else { + // pass 2: horizontal + + if (ctb_y>0) { + 
img->wait_for_progress(this, rightCtb,ctb_y-1, CTB_PROGRESS_DEBLK_V); + } + + img->wait_for_progress(this, rightCtb,ctb_y, CTB_PROGRESS_DEBLK_V); + + if (ctb_y+1get_sps().PicHeightInCtbsY) { + img->wait_for_progress(this, rightCtb,ctb_y+1, CTB_PROGRESS_DEBLK_V); + } + } + + //printf("deblock %d to %d orientation: %d\n",first,last,vertical); + + bool deblocking_enabled; + + // first pass: check edge flags and whether we have to deblock + if (vertical) { + deblocking_enabled = derive_edgeFlags_CTBRow(img, ctb_y); + + //for (int x=0;x<=rightCtb;x++) { + int x=0; img->set_CtbDeblockFlag(x,ctb_y, deblocking_enabled); + //} + } + else { + int x=0; deblocking_enabled=img->get_CtbDeblockFlag(x,ctb_y); + } + + if (deblocking_enabled) { + derive_boundaryStrength(img, vertical, first,last, xStart,xEnd); + + edge_filtering_luma(img, vertical, first,last, xStart,xEnd); + + if (img->get_sps().ChromaArrayType != CHROMA_MONO) { + edge_filtering_chroma(img, vertical, first,last, xStart,xEnd); + } + } + + for (int x=0;x<=rightCtb;x++) { + const int CtbWidth = img->get_sps().PicWidthInCtbsY; + img->ctb_progress[x+ctb_y*CtbWidth].set_progress(finalProgress); + } + + state = Finished; + img->thread_finishes(this); +} + + +void add_deblocking_tasks(image_unit* imgunit) +{ + de265_image* img = imgunit->img; + decoder_context* ctx = img->decctx; + + int nRows = img->get_sps().PicHeightInCtbsY; + + int n=0; + img->thread_start(nRows*2); + + for (int pass=0;pass<2;pass++) + { + for (int y=0;yget_sps().PicHeightInCtbsY;y++) + { + thread_task_deblock_CTBRow* task = new thread_task_deblock_CTBRow; + + task->img = img; + task->ctb_y = y; + task->vertical = (pass==0); + + imgunit->tasks.push_back(task); + add_task(&ctx->thread_pool_, task); + n++; + } + } +} + + +void apply_deblocking_filter(de265_image* img) // decoder_context* ctx) +{ + decoder_context* ctx = img->decctx; + + char enabled_deblocking = derive_edgeFlags(img); + + if (enabled_deblocking) + { + // vertical filtering + + 
logtrace(LogDeblock,"VERTICAL\n"); + derive_boundaryStrength(img, true ,0,img->get_deblk_height(),0,img->get_deblk_width()); + edge_filtering_luma (img, true ,0,img->get_deblk_height(),0,img->get_deblk_width()); + + if (img->get_sps().ChromaArrayType != CHROMA_MONO) { + edge_filtering_chroma (img, true ,0,img->get_deblk_height(),0,img->get_deblk_width()); + } +#if 0 + char buf[1000]; + sprintf(buf,"lf-after-V-%05d.yuv", ctx->img->PicOrderCntVal); + write_picture_to_file(ctx->img, buf); +#endif + + // horizontal filtering + + logtrace(LogDeblock,"HORIZONTAL\n"); + derive_boundaryStrength(img, false ,0,img->get_deblk_height(),0,img->get_deblk_width()); + edge_filtering_luma (img, false ,0,img->get_deblk_height(),0,img->get_deblk_width()); + + if (img->get_sps().ChromaArrayType != CHROMA_MONO) { + edge_filtering_chroma (img, false ,0,img->get_deblk_height(),0,img->get_deblk_width()); + } + +#if 0 + sprintf(buf,"lf-after-H-%05d.yuv", ctx->img->PicOrderCntVal); + write_picture_to_file(ctx->img, buf); +#endif + } +} diff --git a/decctx.cc b/decctx.cc new file mode 100644 index 0000000..edebb71 --- /dev/null +++ b/decctx.cc @@ -0,0 +1,2285 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#include "decctx.h" +#include "util.h" +#include "sao.h" +#include "sei.h" +#include "deblock.h" + +#include +#include +#include +#include +#include + +#include "fallback.h" + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#ifdef HAVE_SSE4_1 +#include "x86/sse.h" +#endif + +#ifdef HAVE_ARM +#include "arm/arm.h" +#endif + +#define SAVE_INTERMEDIATE_IMAGES 0 + +#if SAVE_INTERMEDIATE_IMAGES +#include "visualize.h" +#endif + +extern void thread_decode_CTB_row(void* d); +extern void thread_decode_slice_segment(void* d); + + +thread_context::thread_context() +{ + /* + CtbAddrInRS = 0; + CtbAddrInTS = 0; + + CtbX = 0; + CtbY = 0; + */ + + /* + refIdx[0] = refIdx[1] = 0; + mvd[0][0] = mvd[0][1] = mvd[1][0] = mvd[1][1] = 0; + merge_flag = 0; + merge_idx = 0; + mvp_lX_flag[0] = mvp_lX_flag[1] = 0; + inter_pred_idc = 0; + */ + + /* + enum IntraPredMode IntraPredModeC; // chroma intra-prediction mode for current CB + */ + + /* + cu_transquant_bypass_flag = false; + memset(transform_skip_flag,0, 3*sizeof(uint8_t)); + */ + + + //memset(coeffList,0,sizeof(int16_t)*3*32*32); + //memset(coeffPos,0,sizeof(int16_t)*3*32*32); + //memset(nCoeff,0,sizeof(int16_t)*3); + + + + IsCuQpDeltaCoded = false; + CuQpDelta = 0; + + IsCuChromaQpOffsetCoded = false; + CuQpOffsetCb = 0; + CuQpOffsetCr = 0; + + /* + currentQPY = 0; + currentQG_x = 0; + currentQG_y = 0; + lastQPYinPreviousQG = 0; + */ + + /* + qPYPrime = 0; + qPCbPrime = 0; + qPCrPrime = 0; + */ + + /* + memset(&cabac_decoder, 0, sizeof(CABAC_decoder)); + memset(&ctx_model, 0, sizeof(ctx_model)); + */ + + decctx = NULL; + img = NULL; + shdr = NULL; + + imgunit = NULL; + sliceunit = NULL; + + + //memset(this,0,sizeof(thread_context)); + + // There is a interesting issue here. When aligning _coeffBuf to 16 bytes offset with + // __attribute__((align(16))), the following statement is optimized away since the + // compiler assumes that the pointer would be 16-byte aligned. 
However, this is not the + // case when the structure has been dynamically allocated. In this case, the base can + // also be at 8 byte offsets (at least with MingW,32 bit). + int offset = ((uintptr_t)_coeffBuf) & 0xf; + + if (offset == 0) { + coeffBuf = _coeffBuf; // correctly aligned already + } + else { + coeffBuf = (int16_t *) (((uint8_t *)_coeffBuf) + (16-offset)); + } + + memset(coeffBuf, 0, 32*32*sizeof(int16_t)); +} + + +slice_unit::slice_unit(decoder_context* decctx) + : nal(NULL), + shdr(NULL), + imgunit(NULL), + flush_reorder_buffer(false), + nThreads(0), + first_decoded_CTB_RS(-1), + last_decoded_CTB_RS(-1), + thread_contexts(NULL), + ctx(decctx) +{ + state = Unprocessed; + nThreadContexts = 0; +} + +slice_unit::~slice_unit() +{ + ctx->nal_parser.free_NAL_unit(nal); + + if (thread_contexts) { + delete[] thread_contexts; + } +} + + +void slice_unit::allocate_thread_contexts(int n) +{ + assert(thread_contexts==NULL); + + thread_contexts = new thread_context[n]; + nThreadContexts = n; +} + + +image_unit::image_unit() +{ + img=NULL; + role=Invalid; + state=Unprocessed; +} + + +image_unit::~image_unit() +{ + for (int i=0;iFirstAfterEndOfSequenceNAL = true; + //ctx->last_RAP_picture_NAL_type = NAL_UNIT_UNDEFINED; + + //de265_init_image(&ctx->coeff); + + // --- decoded picture buffer --- + + current_image_poc_lsb = -1; // any invalid number +} + + +decoder_context::~decoder_context() +{ + while (!image_units.empty()) { + delete image_units.back(); + image_units.pop_back(); + } +} + + +void decoder_context::set_image_allocation_functions(de265_image_allocation* allocfunc, + void* userdata) +{ + if (allocfunc) { + param_image_allocation_functions = *allocfunc; + param_image_allocation_userdata = userdata; + } + else { + assert(false); // actually, it makes no sense to reset the allocation functions + + param_image_allocation_functions = de265_image::default_image_allocation; + param_image_allocation_userdata = NULL; + } +} + + +de265_error 
decoder_context::start_thread_pool(int nThreads) +{ + ::start_thread_pool(&thread_pool_, nThreads); + + num_worker_threads = nThreads; + + return DE265_OK; +} + + +void decoder_context::stop_thread_pool() +{ + if (get_num_worker_threads()>0) { + //flush_thread_pool(&ctx->thread_pool); + ::stop_thread_pool(&thread_pool_); + } +} + + +void decoder_context::reset() +{ + if (num_worker_threads>0) { + //flush_thread_pool(&ctx->thread_pool); + ::stop_thread_pool(&thread_pool_); + } + + // -------------------------------------------------- + +#if 0 + ctx->end_of_stream = false; + ctx->pending_input_NAL = NULL; + ctx->current_vps = NULL; + ctx->current_sps = NULL; + ctx->current_pps = NULL; + ctx->num_worker_threads = 0; + ctx->current_image_poc_lsb = 0; + ctx->first_decoded_picture = 0; + ctx->NoRaslOutputFlag = 0; + ctx->HandleCraAsBlaFlag = 0; + ctx->FirstAfterEndOfSequenceNAL = 0; + ctx->PicOrderCntMsb = 0; + ctx->prevPicOrderCntLsb = 0; + ctx->prevPicOrderCntMsb = 0; + ctx->NumPocStCurrBefore=0; + ctx->NumPocStCurrAfter=0; + ctx->NumPocStFoll=0; + ctx->NumPocLtCurr=0; + ctx->NumPocLtFoll=0; + ctx->nal_unit_type=0; + ctx->IdrPicFlag=0; + ctx->RapPicFlag=0; +#endif + + img = NULL; + + + // TODO: remove all pending image_units + + + // --- decoded picture buffer --- + + current_image_poc_lsb = -1; // any invalid number + first_decoded_picture = true; + + + // --- remove all pictures from output queue --- + + // there was a bug the peek_next_image did not return NULL on empty output queues. + // This was (indirectly) fixed by recreating the DPB buffer, but it should actually + // be sufficient to clear it like this. + // The error showed while scrubbing the ToS video in VLC. 
+ dpb.clear(); + + nal_parser.remove_pending_input_data(); + + + while (!image_units.empty()) { + delete image_units.back(); + image_units.pop_back(); + } + + // --- start threads again --- + + if (num_worker_threads>0) { + // TODO: need error checking + start_thread_pool(num_worker_threads); + } +} + +void base_context::set_acceleration_functions(enum de265_acceleration l) +{ + // fill scalar functions first (so that function table is completely filled) + + init_acceleration_functions_fallback(&acceleration); + + + // override functions with optimized variants + +#ifdef HAVE_SSE4_1 + if (l>=de265_acceleration_SSE) { + init_acceleration_functions_sse(&acceleration); + } +#endif +#ifdef HAVE_ARM + if (l>=de265_acceleration_ARM) { + init_acceleration_functions_arm(&acceleration); + } +#endif +} + + +void decoder_context::init_thread_context(thread_context* tctx) +{ + // zero scrap memory for coefficient blocks + memset(tctx->_coeffBuf, 0, sizeof(tctx->_coeffBuf)); // TODO: check if we can safely remove this + + tctx->currentQG_x = -1; + tctx->currentQG_y = -1; + + + + // --- find QPY that was active at the end of the previous slice --- + + // find the previous CTB in TS order + + const pic_parameter_set& pps = tctx->img->get_pps(); + const seq_parameter_set& sps = tctx->img->get_sps(); + + + if (tctx->shdr->slice_segment_address > 0) { + int prevCtb = pps.CtbAddrTStoRS[ pps.CtbAddrRStoTS[tctx->shdr->slice_segment_address] -1 ]; + + int ctbX = prevCtb % sps.PicWidthInCtbsY; + int ctbY = prevCtb / sps.PicWidthInCtbsY; + + + // take the pixel at the bottom right corner (but consider that the image size might be smaller) + + int x = ((ctbX+1) << sps.Log2CtbSizeY)-1; + int y = ((ctbY+1) << sps.Log2CtbSizeY)-1; + + x = std::min(x,sps.pic_width_in_luma_samples-1); + y = std::min(y,sps.pic_height_in_luma_samples-1); + + //printf("READ QPY: %d %d -> %d (should %d)\n",x,y,imgunit->img->get_QPY(x,y), tc.currentQPY); + + //if (tctx->shdr->dependent_slice_segment_flag) { // TODO: 
do we need this condition ? + tctx->currentQPY = tctx->img->get_QPY(x,y); + //} + } +} + + +void decoder_context::add_task_decode_CTB_row(thread_context* tctx, + bool firstSliceSubstream, + int ctbRow) +{ + thread_task_ctb_row* task = new thread_task_ctb_row; + task->firstSliceSubstream = firstSliceSubstream; + task->tctx = tctx; + task->debug_startCtbRow = ctbRow; + tctx->task = task; + + add_task(&thread_pool_, task); + + tctx->imgunit->tasks.push_back(task); +} + + +void decoder_context::add_task_decode_slice_segment(thread_context* tctx, bool firstSliceSubstream, + int ctbx,int ctby) +{ + thread_task_slice_segment* task = new thread_task_slice_segment; + task->firstSliceSubstream = firstSliceSubstream; + task->tctx = tctx; + task->debug_startCtbX = ctbx; + task->debug_startCtbY = ctby; + tctx->task = task; + + add_task(&thread_pool_, task); + + tctx->imgunit->tasks.push_back(task); +} + + +de265_error decoder_context::read_vps_NAL(bitreader& reader) +{ + logdebug(LogHeaders,"---> read VPS\n"); + + std::shared_ptr new_vps = std::make_shared(); + de265_error err = new_vps->read(this,&reader); + if (err != DE265_OK) { + return err; + } + + if (param_vps_headers_fd>=0) { + new_vps->dump(param_vps_headers_fd); + } + + vps[ new_vps->video_parameter_set_id ] = new_vps; + + return DE265_OK; +} + +de265_error decoder_context::read_sps_NAL(bitreader& reader) +{ + logdebug(LogHeaders,"----> read SPS\n"); + + std::shared_ptr new_sps = std::make_shared(); + de265_error err; + + if ((err=new_sps->read(this, &reader)) != DE265_OK) { + return err; + } + + if (param_sps_headers_fd>=0) { + new_sps->dump(param_sps_headers_fd); + } + + sps[ new_sps->seq_parameter_set_id ] = new_sps; + + return DE265_OK; +} + +de265_error decoder_context::read_pps_NAL(bitreader& reader) +{ + logdebug(LogHeaders,"----> read PPS\n"); + + std::shared_ptr new_pps = std::make_shared(); + + bool success = new_pps->read(&reader,this); + + if (param_pps_headers_fd>=0) { + 
new_pps->dump(param_pps_headers_fd); + } + + if (success) { + pps[ (int)new_pps->pic_parameter_set_id ] = new_pps; + } + + return success ? DE265_OK : DE265_WARNING_PPS_HEADER_INVALID; +} + +de265_error decoder_context::read_sei_NAL(bitreader& reader, bool suffix) +{ + logdebug(LogHeaders,"----> read SEI\n"); + + sei_message sei; + + //push_current_picture_to_output_queue(); + + de265_error err = DE265_OK; + + if ((err=read_sei(&reader,&sei, suffix, current_sps.get())) == DE265_OK) { + dump_sei(&sei, current_sps.get()); + + if (image_units.empty()==false && suffix) { + image_units.back()->suffix_SEIs.push_back(sei); + } + } + else { + add_warning(err, false); + } + + return err; +} + +de265_error decoder_context::read_eos_NAL(bitreader& reader) +{ + FirstAfterEndOfSequenceNAL = true; + return DE265_OK; +} + +de265_error decoder_context::read_slice_NAL(bitreader& reader, NAL_unit* nal, nal_header& nal_hdr) +{ + logdebug(LogHeaders,"---> read slice segment header\n"); + + + // --- read slice header --- + + slice_segment_header* shdr = new slice_segment_header; + bool continueDecoding; + de265_error err = shdr->read(&reader,this, &continueDecoding); + if (!continueDecoding) { + if (img) { img->integrity = INTEGRITY_NOT_DECODED; } + nal_parser.free_NAL_unit(nal); + delete shdr; + return err; + } + + if (param_slice_headers_fd>=0) { + shdr->dump_slice_segment_header(this, param_slice_headers_fd); + } + + + if (process_slice_segment_header(shdr, &err, nal->pts, &nal_hdr, nal->user_data) == false) + { + if (img!=NULL) img->integrity = INTEGRITY_NOT_DECODED; + nal_parser.free_NAL_unit(nal); + delete shdr; + return err; + } + + this->img->add_slice_segment_header(shdr); + + skip_bits(&reader,1); // TODO: why? 
+ prepare_for_CABAC(&reader); + + + // modify entry_point_offsets + + int headerLength = reader.data - nal->data(); + for (int i=0;inum_entry_point_offsets;i++) { + shdr->entry_point_offset[i] -= nal->num_skipped_bytes_before(shdr->entry_point_offset[i], + headerLength); + } + + + + // --- start a new image if this is the first slice --- + + if (shdr->first_slice_segment_in_pic_flag) { + image_unit* imgunit = new image_unit; + imgunit->img = this->img; + image_units.push_back(imgunit); + } + + + // --- add slice to current picture --- + + if ( ! image_units.empty() ) { + + slice_unit* sliceunit = new slice_unit(this); + sliceunit->nal = nal; + sliceunit->shdr = shdr; + sliceunit->reader = reader; + + sliceunit->flush_reorder_buffer = flush_reorder_buffer_at_this_frame; + + + image_units.back()->slice_units.push_back(sliceunit); + } + + bool did_work; + err = decode_some(&did_work); + + return DE265_OK; +} + + +template void pop_front(std::vector& vec) +{ + for (int i=1;islice_units.empty() ) { + + image_unit* imgunit = image_units[0]; + slice_unit* sliceunit = imgunit->get_next_unprocessed_slice_segment(); + + if (sliceunit != NULL) { + + //pop_front(imgunit->slice_units); + + if (sliceunit->flush_reorder_buffer) { + dpb.flush_reorder_buffer(); + } + + *did_work = true; + + //err = decode_slice_unit_sequential(imgunit, sliceunit); + err = decode_slice_unit_parallel(imgunit, sliceunit); + if (err) { + return err; + } + + //delete sliceunit; + } + } + + + + // if we decoded all slices of the current image and there will not + // be added any more slices to the image, output the image + + if ( ( image_units.size()>=2 && image_units[0]->all_slice_segments_processed()) || + ( image_units.size()>=1 && image_units[0]->all_slice_segments_processed() && + nal_parser.number_of_NAL_units_pending()==0 && + (nal_parser.is_end_of_stream() || nal_parser.is_end_of_frame()) )) { + + image_unit* imgunit = image_units[0]; + + *did_work=true; + + + // mark all CTBs as decoded even if 
they are not, because faulty input + // streams could miss part of the picture + // TODO: this will not work when slice decoding is parallel to post-filtering, + // so we will have to replace this with keeping track of which CTB should have + // been decoded (but aren't because of the input stream being faulty) + + imgunit->img->mark_all_CTB_progress(CTB_PROGRESS_PREFILTER); + + + + // run post-processing filters (deblocking & SAO) + + if (img->decctx->num_worker_threads) + run_postprocessing_filters_parallel(imgunit); + else + run_postprocessing_filters_sequential(imgunit->img); + + // process suffix SEIs + + for (int i=0;isuffix_SEIs.size();i++) { + const sei_message& sei = imgunit->suffix_SEIs[i]; + + err = process_sei(&sei, imgunit->img); + if (err != DE265_OK) + break; + } + + + push_picture_to_output_queue(imgunit); + + // remove just decoded image unit from queue + + delete imgunit; + + pop_front(image_units); + } + + return err; +} + + +de265_error decoder_context::decode_slice_unit_sequential(image_unit* imgunit, + slice_unit* sliceunit) +{ + de265_error err = DE265_OK; + + /* + printf("decode slice POC=%d addr=%d, img=%p\n", + sliceunit->shdr->slice_pic_order_cnt_lsb, + sliceunit->shdr->slice_segment_address, + imgunit->img); + */ + + remove_images_from_dpb(sliceunit->shdr->RemoveReferencesList); + + if (sliceunit->shdr->slice_segment_address >= imgunit->img->get_pps().CtbAddrRStoTS.size()) { + return DE265_ERROR_CTB_OUTSIDE_IMAGE_AREA; + } + + + struct thread_context tctx; + + tctx.shdr = sliceunit->shdr; + tctx.img = imgunit->img; + tctx.decctx = this; + tctx.imgunit = imgunit; + tctx.sliceunit= sliceunit; + tctx.CtbAddrInTS = imgunit->img->get_pps().CtbAddrRStoTS[tctx.shdr->slice_segment_address]; + tctx.task = NULL; + + init_thread_context(&tctx); + + if (sliceunit->reader.bytes_remaining <= 0) { + return DE265_ERROR_PREMATURE_END_OF_SLICE; + } + + init_CABAC_decoder(&tctx.cabac_decoder, + sliceunit->reader.data, + sliceunit->reader.bytes_remaining); 
+ + // alloc CABAC-model array if entropy_coding_sync is enabled + + if (imgunit->img->get_pps().entropy_coding_sync_enabled_flag && + sliceunit->shdr->first_slice_segment_in_pic_flag) { + imgunit->ctx_models.resize( (img->get_sps().PicHeightInCtbsY-1) ); //* CONTEXT_MODEL_TABLE_LENGTH ); + } + + sliceunit->nThreads=1; + + err=read_slice_segment_data(&tctx); + + sliceunit->finished_threads.set_progress(1); + + return err; +} + + +void decoder_context::mark_whole_slice_as_processed(image_unit* imgunit, + slice_unit* sliceunit, + int progress) +{ + //printf("mark whole slice\n"); + + + // mark all CTBs upto the next slice segment as processed + + slice_unit* nextSegment = imgunit->get_next_slice_segment(sliceunit); + if (nextSegment) { + /* + printf("mark whole slice between %d and %d\n", + sliceunit->shdr->slice_segment_address, + nextSegment->shdr->slice_segment_address); + */ + + for (int ctb=sliceunit->shdr->slice_segment_address; + ctb < nextSegment->shdr->slice_segment_address; + ctb++) + { + if (ctb >= imgunit->img->number_of_ctbs()) + break; + + imgunit->img->ctb_progress[ctb].set_progress(progress); + } + } +} + + +de265_error decoder_context::decode_slice_unit_parallel(image_unit* imgunit, + slice_unit* sliceunit) +{ + de265_error err = DE265_OK; + + remove_images_from_dpb(sliceunit->shdr->RemoveReferencesList); + + /* + printf("-------- decode --------\n"); + printf("IMAGE UNIT %p\n",imgunit); + sliceunit->shdr->dump_slice_segment_header(sliceunit->ctx, 1); + imgunit->dump_slices(); + */ + + de265_image* img = imgunit->img; + const pic_parameter_set& pps = img->get_pps(); + + sliceunit->state = slice_unit::InProgress; + + bool use_WPP = (img->decctx->num_worker_threads > 0 && + pps.entropy_coding_sync_enabled_flag); + + bool use_tiles = (img->decctx->num_worker_threads > 0 && + pps.tiles_enabled_flag); + + + // TODO: remove this warning later when we do frame-parallel decoding + if (img->decctx->num_worker_threads > 0 && + 
pps.entropy_coding_sync_enabled_flag == false && + pps.tiles_enabled_flag == false) { + + img->decctx->add_warning(DE265_WARNING_NO_WPP_CANNOT_USE_MULTITHREADING, true); + } + + + // If this is the first slice segment, mark all CTBs before this as processed + // (the real first slice segment could be missing). + + if (imgunit->is_first_slice_segment(sliceunit)) { + slice_segment_header* shdr = sliceunit->shdr; + int firstCTB = shdr->slice_segment_address; + + for (int ctb=0;ctbctb_progress[ctb].set_progress(CTB_PROGRESS_PREFILTER); + } + } + + + // if there is a previous slice that has been completely decoded, + // mark all CTBs until the start of this slice as completed + + //printf("this slice: %p\n",sliceunit); + slice_unit* prevSlice = imgunit->get_prev_slice_segment(sliceunit); + //if (prevSlice) printf("prev slice state: %d\n",prevSlice->state); + if (prevSlice && prevSlice->state == slice_unit::Decoded) { + mark_whole_slice_as_processed(imgunit,prevSlice,CTB_PROGRESS_PREFILTER); + } + + + // TODO: even though we cannot split this into several tasks, we should run it + // as a background thread + if (!use_WPP && !use_tiles) { + //printf("SEQ\n"); + err = decode_slice_unit_sequential(imgunit, sliceunit); + sliceunit->state = slice_unit::Decoded; + mark_whole_slice_as_processed(imgunit,sliceunit,CTB_PROGRESS_PREFILTER); + return err; + } + + + if (use_WPP && use_tiles) { + // TODO: this is not allowed ... 
output some warning or error + + return DE265_WARNING_PPS_HEADER_INVALID; + } + + + if (use_WPP) { + //printf("WPP\n"); + err = decode_slice_unit_WPP(imgunit, sliceunit); + sliceunit->state = slice_unit::Decoded; + mark_whole_slice_as_processed(imgunit,sliceunit,CTB_PROGRESS_PREFILTER); + return err; + } + else if (use_tiles) { + //printf("TILE\n"); + err = decode_slice_unit_tiles(imgunit, sliceunit); + sliceunit->state = slice_unit::Decoded; + mark_whole_slice_as_processed(imgunit,sliceunit,CTB_PROGRESS_PREFILTER); + return err; + } + + assert(false); + return err; +} + + +de265_error decoder_context::decode_slice_unit_WPP(image_unit* imgunit, + slice_unit* sliceunit) +{ + de265_error err = DE265_OK; + + de265_image* img = imgunit->img; + slice_segment_header* shdr = sliceunit->shdr; + const pic_parameter_set& pps = img->get_pps(); + + int nRows = shdr->num_entry_point_offsets +1; + int ctbsWidth = img->get_sps().PicWidthInCtbsY; + + + assert(img->num_threads_active() == 0); + + + // reserve space to store entropy coding context models for each CTB row + + if (shdr->first_slice_segment_in_pic_flag) { + // reserve space for nRows-1 because we don't need to save the CABAC model in the last CTB row + imgunit->ctx_models.resize( (img->get_sps().PicHeightInCtbsY-1) ); //* CONTEXT_MODEL_TABLE_LENGTH ); + } + + + sliceunit->allocate_thread_contexts(nRows); + + + // first CTB in this slice + int ctbAddrRS = shdr->slice_segment_address; + int ctbRow = ctbAddrRS / ctbsWidth; + + for (int entryPt=0;entryPt0) { + ctbRow++; + ctbAddrRS = ctbRow * ctbsWidth; + } + else if (nRows>1 && (ctbAddrRS % ctbsWidth) != 0) { + // If slice segment consists of several WPP rows, each of them + // has to start at a row. 
+ + //printf("does not start at start\n"); + + err = DE265_WARNING_SLICEHEADER_INVALID; + break; + } + + + // prepare thread context + + thread_context* tctx = sliceunit->get_thread_context(entryPt); + + tctx->shdr = shdr; + tctx->decctx = img->decctx; + tctx->img = img; + tctx->imgunit = imgunit; + tctx->sliceunit= sliceunit; + tctx->CtbAddrInTS = pps.CtbAddrRStoTS[ctbAddrRS]; + + init_thread_context(tctx); + + + // init CABAC + + int dataStartIndex; + if (entryPt==0) { dataStartIndex=0; } + else { dataStartIndex=shdr->entry_point_offset[entryPt-1]; } + + int dataEnd; + if (entryPt==nRows-1) dataEnd = sliceunit->reader.bytes_remaining; + else dataEnd = shdr->entry_point_offset[entryPt]; + + if (dataStartIndex<0 || dataEnd>sliceunit->reader.bytes_remaining || + dataEnd <= dataStartIndex) { + //printf("WPP premature end\n"); + err = DE265_ERROR_PREMATURE_END_OF_SLICE; + break; + } + + init_CABAC_decoder(&tctx->cabac_decoder, + &sliceunit->reader.data[dataStartIndex], + dataEnd-dataStartIndex); + + // add task + + //printf("start task for ctb-row: %d\n",ctbRow); + img->thread_start(1); + sliceunit->nThreads++; + add_task_decode_CTB_row(tctx, entryPt==0, ctbRow); + } + +#if 0 + for (;;) { + printf("q:%d r:%d b:%d f:%d\n", + img->nThreadsQueued, + img->nThreadsRunning, + img->nThreadsBlocked, + img->nThreadsFinished); + + if (img->debug_is_completed()) break; + + usleep(1000); + } +#endif + + img->wait_for_completion(); + + for (int i=0;itasks.size();i++) + delete imgunit->tasks[i]; + imgunit->tasks.clear(); + + return DE265_OK; +} + +de265_error decoder_context::decode_slice_unit_tiles(image_unit* imgunit, + slice_unit* sliceunit) +{ + de265_error err = DE265_OK; + + de265_image* img = imgunit->img; + slice_segment_header* shdr = sliceunit->shdr; + const pic_parameter_set& pps = img->get_pps(); + + int nTiles = shdr->num_entry_point_offsets +1; + int ctbsWidth = img->get_sps().PicWidthInCtbsY; + + + assert(img->num_threads_active() == 0); + + 
sliceunit->allocate_thread_contexts(nTiles); + + + // first CTB in this slice + int ctbAddrRS = shdr->slice_segment_address; + int tileID = pps.TileIdRS[ctbAddrRS]; + + for (int entryPt=0;entryPt0) { + tileID++; + + if (tileID >= pps.num_tile_columns * pps.num_tile_rows) { + err = DE265_WARNING_SLICEHEADER_INVALID; + break; + } + + int ctbX = pps.colBd[tileID % pps.num_tile_columns]; + int ctbY = pps.rowBd[tileID / pps.num_tile_columns]; + ctbAddrRS = ctbY * ctbsWidth + ctbX; + } + + // set thread context + + thread_context* tctx = sliceunit->get_thread_context(entryPt); + + tctx->shdr = shdr; + tctx->decctx = img->decctx; + tctx->img = img; + tctx->imgunit = imgunit; + tctx->sliceunit= sliceunit; + tctx->CtbAddrInTS = pps.CtbAddrRStoTS[ctbAddrRS]; + + init_thread_context(tctx); + + + // init CABAC + + int dataStartIndex; + if (entryPt==0) { dataStartIndex=0; } + else { dataStartIndex=shdr->entry_point_offset[entryPt-1]; } + + int dataEnd; + if (entryPt==nTiles-1) dataEnd = sliceunit->reader.bytes_remaining; + else dataEnd = shdr->entry_point_offset[entryPt]; + + if (dataStartIndex<0 || dataEnd>sliceunit->reader.bytes_remaining || + dataEnd <= dataStartIndex) { + err = DE265_ERROR_PREMATURE_END_OF_SLICE; + break; + } + + init_CABAC_decoder(&tctx->cabac_decoder, + &sliceunit->reader.data[dataStartIndex], + dataEnd-dataStartIndex); + + // add task + + //printf("add tiles thread\n"); + img->thread_start(1); + sliceunit->nThreads++; + add_task_decode_slice_segment(tctx, entryPt==0, + ctbAddrRS % ctbsWidth, + ctbAddrRS / ctbsWidth); + } + + img->wait_for_completion(); + + for (int i=0;itasks.size();i++) + delete imgunit->tasks[i]; + imgunit->tasks.clear(); + + return err; +} + + +de265_error decoder_context::decode_NAL(NAL_unit* nal) +{ + //return decode_NAL_OLD(nal); + + decoder_context* ctx = this; + + de265_error err = DE265_OK; + + bitreader reader; + bitreader_init(&reader, nal->data(), nal->size()); + + nal_header nal_hdr; + nal_hdr.read(&reader); + 
ctx->process_nal_hdr(&nal_hdr); + + if (nal_hdr.nuh_layer_id > 0) { + // Discard all NAL units with nuh_layer_id > 0 + // These will have to be handeled by an SHVC decoder. + nal_parser.free_NAL_unit(nal); + return DE265_OK; + } + + loginfo(LogHighlevel,"NAL: 0x%x 0x%x - unit type:%s temporal id:%d\n", + nal->data()[0], nal->data()[1], + get_NAL_name(nal_hdr.nal_unit_type), + nal_hdr.nuh_temporal_id); + + /* + printf("NAL: 0x%x 0x%x - unit type:%s temporal id:%d\n", + nal->data()[0], nal->data()[1], + get_NAL_name(nal_hdr.nal_unit_type), + nal_hdr.nuh_temporal_id); + */ + + // throw away NALs from higher TIDs than currently selected + // TODO: better online switching of HighestTID + + //printf("hTid: %d\n", current_HighestTid); + + if (nal_hdr.nuh_temporal_id > current_HighestTid) { + nal_parser.free_NAL_unit(nal); + return DE265_OK; + } + + + if (nal_hdr.nal_unit_type<32) { + err = read_slice_NAL(reader, nal, nal_hdr); + } + else switch (nal_hdr.nal_unit_type) { + case NAL_UNIT_VPS_NUT: + err = read_vps_NAL(reader); + nal_parser.free_NAL_unit(nal); + break; + + case NAL_UNIT_SPS_NUT: + err = read_sps_NAL(reader); + nal_parser.free_NAL_unit(nal); + break; + + case NAL_UNIT_PPS_NUT: + err = read_pps_NAL(reader); + nal_parser.free_NAL_unit(nal); + break; + + case NAL_UNIT_PREFIX_SEI_NUT: + case NAL_UNIT_SUFFIX_SEI_NUT: + err = read_sei_NAL(reader, nal_hdr.nal_unit_type==NAL_UNIT_SUFFIX_SEI_NUT); + nal_parser.free_NAL_unit(nal); + break; + + case NAL_UNIT_EOS_NUT: + ctx->FirstAfterEndOfSequenceNAL = true; + nal_parser.free_NAL_unit(nal); + break; + + default: + nal_parser.free_NAL_unit(nal); + break; + } + + return err; +} + + +de265_error decoder_context::decode(int* more) +{ + decoder_context* ctx = this; + + // if the stream has ended, and no more NALs are to be decoded, flush all pictures + + if (ctx->nal_parser.get_NAL_queue_length() == 0 && + (ctx->nal_parser.is_end_of_stream() || ctx->nal_parser.is_end_of_frame()) && + ctx->image_units.empty()) { + + // flush 
all pending pictures into output queue + + // ctx->push_current_picture_to_output_queue(); // TODO: not with new queue + ctx->dpb.flush_reorder_buffer(); + + if (more) { *more = ctx->dpb.num_pictures_in_output_queue(); } + + return DE265_OK; + } + + + // if NAL-queue is empty, we need more data + // -> input stalled + + if (ctx->nal_parser.is_end_of_stream() == false && + ctx->nal_parser.is_end_of_frame() == false && + ctx->nal_parser.get_NAL_queue_length() == 0) { + if (more) { *more=1; } + + return DE265_ERROR_WAITING_FOR_INPUT_DATA; + } + + + // when there are no free image buffers in the DPB, pause decoding + // -> output stalled + + if (!ctx->dpb.has_free_dpb_picture(false)) { + if (more) *more = 1; + return DE265_ERROR_IMAGE_BUFFER_FULL; + } + + + // decode one NAL from the queue + + de265_error err = DE265_OK; + bool did_work = false; + + if (ctx->nal_parser.get_NAL_queue_length()) { // number_of_NAL_units_pending()) { + NAL_unit* nal = ctx->nal_parser.pop_from_NAL_queue(); + assert(nal); + err = ctx->decode_NAL(nal); + // ctx->nal_parser.free_NAL_unit(nal); TODO: do not free NAL with new loop + did_work=true; + } + else if (ctx->nal_parser.is_end_of_frame() == true && + ctx->image_units.empty()) { + if (more) { *more=1; } + + return DE265_ERROR_WAITING_FOR_INPUT_DATA; + } + else { + err = decode_some(&did_work); + } + + if (more) { + // decoding error is assumed to be unrecoverable + *more = (err==DE265_OK && did_work); + } + + return err; +} + + +void decoder_context::process_nal_hdr(nal_header* nal) +{ + nal_unit_type = nal->nal_unit_type; + + IdrPicFlag = isIdrPic(nal->nal_unit_type); + RapPicFlag = isRapPic(nal->nal_unit_type); +} + + + +/* 8.3.1 + */ +void decoder_context::process_picture_order_count(slice_segment_header* hdr) +{ + loginfo(LogHeaders,"POC computation. 
lsb:%d prev.pic.lsb:%d msb:%d\n", + hdr->slice_pic_order_cnt_lsb, + prevPicOrderCntLsb, + PicOrderCntMsb); + + if (isIRAP(nal_unit_type) && + NoRaslOutputFlag) + { + PicOrderCntMsb=0; + + + // flush all images from reorder buffer + + flush_reorder_buffer_at_this_frame = true; + //ctx->dpb.flush_reorder_buffer(); + } + else + { + int MaxPicOrderCntLsb = current_sps->MaxPicOrderCntLsb; + + if ((hdr->slice_pic_order_cnt_lsb < prevPicOrderCntLsb) && + (prevPicOrderCntLsb - hdr->slice_pic_order_cnt_lsb) >= MaxPicOrderCntLsb/2) { + PicOrderCntMsb = prevPicOrderCntMsb + MaxPicOrderCntLsb; + } + else if ((hdr->slice_pic_order_cnt_lsb > prevPicOrderCntLsb) && + (hdr->slice_pic_order_cnt_lsb - prevPicOrderCntLsb) > MaxPicOrderCntLsb/2) { + PicOrderCntMsb = prevPicOrderCntMsb - MaxPicOrderCntLsb; + } + else { + PicOrderCntMsb = prevPicOrderCntMsb; + } + } + + img->PicOrderCntVal = PicOrderCntMsb + hdr->slice_pic_order_cnt_lsb; + img->picture_order_cnt_lsb = hdr->slice_pic_order_cnt_lsb; + + loginfo(LogHeaders,"POC computation. new msb:%d POC=%d\n", + PicOrderCntMsb, + img->PicOrderCntVal); + + if (img->nal_hdr.nuh_temporal_id==0 && + !isSublayerNonReference(nal_unit_type) && + !isRASL(nal_unit_type) && + !isRADL(nal_unit_type)) + { + loginfo(LogHeaders,"set prevPicOrderCntLsb/Msb\n"); + + prevPicOrderCntLsb = hdr->slice_pic_order_cnt_lsb; + prevPicOrderCntMsb = PicOrderCntMsb; + } +} + + +/* 8.3.3.2 + Returns DPB index of the generated picture. 
+ */ +int decoder_context::generate_unavailable_reference_picture(const seq_parameter_set* sps, + int POC, bool longTerm) +{ + assert(dpb.has_free_dpb_picture(true)); + + std::shared_ptr current_sps = this->sps[ (int)current_pps->seq_parameter_set_id ]; + + int idx = dpb.new_image(current_sps, this, 0,0, false); + assert(idx>=0); + //printf("-> fill with unavailable POC %d\n",POC); + + de265_image* img = dpb.get_image(idx); + + img->fill_image(1<<(sps->BitDepth_Y-1), + 1<<(sps->BitDepth_C-1), + 1<<(sps->BitDepth_C-1)); + + img->fill_pred_mode(MODE_INTRA); + + img->PicOrderCntVal = POC; + img->picture_order_cnt_lsb = POC & (sps->MaxPicOrderCntLsb-1); + img->PicOutputFlag = false; + img->PicState = (longTerm ? UsedForLongTermReference : UsedForShortTermReference); + img->integrity = INTEGRITY_UNAVAILABLE_REFERENCE; + + return idx; +} + + +/* 8.3.2 invoked once per picture + + This function will mark pictures in the DPB as 'unused' or 'used for long-term reference' + */ +void decoder_context::process_reference_picture_set(slice_segment_header* hdr) +{ + std::vector removeReferencesList; + + const int currentID = img->get_ID(); + + + if (isIRAP(nal_unit_type) && NoRaslOutputFlag) { + + int currentPOC = img->PicOrderCntVal; + + // reset DPB + + /* The standard says: "When the current picture is an IRAP picture with NoRaslOutputFlag + equal to 1, all reference pictures currently in the DPB (if any) are marked as + "unused for reference". + + This seems to be wrong as it also throws out the first CRA picture in a stream like + RAP_A (decoding order: CRA,POC=64, RASL,POC=60). Removing only the pictures with + lower POCs seems to be compliant to the reference decoder. 
+ */ + + for (int i=0;iPicState != UnusedForReference && + img->PicOrderCntVal < currentPOC && + img->removed_at_picture_id > img->get_ID()) { + + removeReferencesList.push_back(img->get_ID()); + img->removed_at_picture_id = img->get_ID(); + + //printf("will remove ID %d (a)\n",img->get_ID()); + } + } + } + + + if (isIDR(nal_unit_type)) { + + // clear all reference pictures + + NumPocStCurrBefore = 0; + NumPocStCurrAfter = 0; + NumPocStFoll = 0; + NumPocLtCurr = 0; + NumPocLtFoll = 0; + } + else { + const ref_pic_set* rps = &hdr->CurrRps; + + // (8-98) + + int i,j,k; + + // scan ref-pic-set for smaller POCs and fill into PocStCurrBefore / PocStFoll + + for (i=0, j=0, k=0; + iNumNegativePics; + i++) + { + if (rps->UsedByCurrPicS0[i]) { + PocStCurrBefore[j++] = img->PicOrderCntVal + rps->DeltaPocS0[i]; + //printf("PocStCurrBefore = %d\n",PocStCurrBefore[j-1]); + } + else { + PocStFoll[k++] = img->PicOrderCntVal + rps->DeltaPocS0[i]; + } + } + + NumPocStCurrBefore = j; + + + // scan ref-pic-set for larger POCs and fill into PocStCurrAfter / PocStFoll + + for (i=0, j=0; + iNumPositivePics; + i++) + { + if (rps->UsedByCurrPicS1[i]) { + PocStCurrAfter[j++] = img->PicOrderCntVal + rps->DeltaPocS1[i]; + //printf("PocStCurrAfter = %d\n",PocStCurrAfter[j-1]); + } + else { + PocStFoll[k++] = img->PicOrderCntVal + rps->DeltaPocS1[i]; + } + } + + NumPocStCurrAfter = j; + NumPocStFoll = k; + + + // find used / future long-term references + + for (i=0, j=0, k=0; + //inum_long_term_ref_pics_sps + hdr->num_long_term_pics; + inum_long_term_sps + hdr->num_long_term_pics; + i++) + { + int pocLt = PocLsbLt[i]; + + if (hdr->delta_poc_msb_present_flag[i]) { + int currentPictureMSB = img->PicOrderCntVal - hdr->slice_pic_order_cnt_lsb; + pocLt += currentPictureMSB + - DeltaPocMsbCycleLt[i] * current_sps->MaxPicOrderCntLsb; + } + + if (UsedByCurrPicLt[i]) { + PocLtCurr[j] = pocLt; + CurrDeltaPocMsbPresentFlag[j] = hdr->delta_poc_msb_present_flag[i]; + j++; + } + else { + PocLtFoll[k] = 
pocLt; + FollDeltaPocMsbPresentFlag[k] = hdr->delta_poc_msb_present_flag[i]; + k++; + } + } + + NumPocLtCurr = j; + NumPocLtFoll = k; + } + + + // (old 8-99) / (new 8-106) + // 1. + + std::vector picInAnyList(dpb.size(), false); + + + dpb.log_dpb_content(); + + for (int i=0;i=0) picInAnyList[k]=true; + else { + // TODO, CHECK: is it ok that we generate a picture with POC = LSB (PocLtCurr) + // We do not know the correct MSB + int concealedPicture = generate_unavailable_reference_picture(current_sps.get(), + PocLtCurr[i], true); + picInAnyList.resize(dpb.size(), false); // adjust size of array to hold new picture + + RefPicSetLtCurr[i] = k = concealedPicture; + picInAnyList[concealedPicture]=true; + } + + if (dpb.get_image(k)->integrity != INTEGRITY_CORRECT) { + img->integrity = INTEGRITY_DERIVED_FROM_FAULTY_REFERENCE; + } + } + + + for (int i=0;i=0) picInAnyList[k]=true; + else { + int concealedPicture = k = generate_unavailable_reference_picture(current_sps.get(), + PocLtFoll[i], true); + picInAnyList.resize(dpb.size(), false); // adjust size of array to hold new picture + + RefPicSetLtFoll[i] = concealedPicture; + picInAnyList[concealedPicture]=true; + } + } + + + // 2. Mark all pictures in RefPicSetLtCurr / RefPicSetLtFoll as UsedForLongTermReference + + for (int i=0;iPicState = UsedForLongTermReference; + } + + for (int i=0;iPicState = UsedForLongTermReference; + } + + + // 3. 
+ + for (int i=0;i idx=%d\n",PocStCurrBefore[i], k); + + RefPicSetStCurrBefore[i] = k; // -1 == "no reference picture" + if (k>=0) picInAnyList[k]=true; + else { + int concealedPicture = generate_unavailable_reference_picture(current_sps.get(), + PocStCurrBefore[i], false); + RefPicSetStCurrBefore[i] = k = concealedPicture; + + picInAnyList.resize(dpb.size(), false); // adjust size of array to hold new picture + picInAnyList[concealedPicture] = true; + + //printf(" concealed: %d\n", concealedPicture); + } + + if (dpb.get_image(k)->integrity != INTEGRITY_CORRECT) { + img->integrity = INTEGRITY_DERIVED_FROM_FAULTY_REFERENCE; + } + } + + for (int i=0;i idx=%d\n",PocStCurrAfter[i], k); + + RefPicSetStCurrAfter[i] = k; // -1 == "no reference picture" + if (k>=0) picInAnyList[k]=true; + else { + int concealedPicture = generate_unavailable_reference_picture(current_sps.get(), + PocStCurrAfter[i], false); + RefPicSetStCurrAfter[i] = k = concealedPicture; + + + picInAnyList.resize(dpb.size(), false); // adjust size of array to hold new picture + picInAnyList[concealedPicture]=true; + + //printf(" concealed: %d\n", concealedPicture); + } + + if (dpb.get_image(k)->integrity != INTEGRITY_CORRECT) { + img->integrity = INTEGRITY_DERIVED_FROM_FAULTY_REFERENCE; + } + } + + for (int i=0;i=0) picInAnyList[k]=true; + } + + // 4. 
any picture that is not marked for reference is put into the "UnusedForReference" state + + for (int i=0;i=picInAnyList.size() || !picInAnyList[i]) // no reference + { + de265_image* dpbimg = dpb.get_image(i); + if (dpbimg != img && // not the current picture + dpbimg->removed_at_picture_id > img->get_ID()) // has not been removed before + { + if (dpbimg->PicState != UnusedForReference) { + removeReferencesList.push_back(dpbimg->get_ID()); + //printf("will remove ID %d (b)\n",dpbimg->get_ID()); + + dpbimg->removed_at_picture_id = img->get_ID(); + } + } + } + + hdr->RemoveReferencesList = removeReferencesList; + + //remove_images_from_dpb(hdr->RemoveReferencesList); +} + + +// 8.3.4 +// Returns whether we can continue decoding (or whether there is a severe error). +/* Called at beginning of each slice. + + Constructs + - the RefPicList[2][], containing indices into the DPB, and + - the RefPicList_POC[2][], containing POCs. + - LongTermRefPic[2][] is also set to true if it is a long-term reference + */ +bool decoder_context::construct_reference_picture_lists(slice_segment_header* hdr) +{ + int NumPocTotalCurr = hdr->NumPocTotalCurr; + int NumRpsCurrTempList0 = libde265_max(hdr->num_ref_idx_l0_active, NumPocTotalCurr); + + // TODO: fold code for both lists together + + int RefPicListTemp0[3*MAX_NUM_REF_PICS]; // TODO: what would be the correct maximum ? + int RefPicListTemp1[3*MAX_NUM_REF_PICS]; // TODO: what would be the correct maximum ? 
+ char isLongTerm[2][3*MAX_NUM_REF_PICS]; + + memset(isLongTerm,0,2*3*MAX_NUM_REF_PICS); + + /* --- Fill RefPicListTmp0 with reference pictures in this order: + 1) short term, past POC + 2) short term, future POC + 3) long term + */ + + int rIdx=0; + while (rIdx < NumRpsCurrTempList0) { + for (int i=0;inum_ref_idx_l0_active > 16) { + add_warning(DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED, false); + return false; + } + */ + + assert(hdr->num_ref_idx_l0_active <= 16); + for (rIdx=0; rIdxnum_ref_idx_l0_active; rIdx++) { + int idx = hdr->ref_pic_list_modification_flag_l0 ? hdr->list_entry_l0[rIdx] : rIdx; + + hdr->RefPicList[0][rIdx] = RefPicListTemp0[idx]; + hdr->LongTermRefPic[0][rIdx] = isLongTerm[0][idx]; + + // remember POC of referenced image (needed in motion.c, derive_collocated_motion_vector) + de265_image* img_0_rIdx = dpb.get_image(hdr->RefPicList[0][rIdx]); + if (img_0_rIdx==NULL) { + return false; + } + hdr->RefPicList_POC[0][rIdx] = img_0_rIdx->PicOrderCntVal; + hdr->RefPicList_PicState[0][rIdx] = img_0_rIdx->PicState; + } + + + /* --- Fill RefPicListTmp1 with reference pictures in this order: + 1) short term, future POC + 2) short term, past POC + 3) long term + */ + + if (hdr->slice_type == SLICE_TYPE_B) { + int NumRpsCurrTempList1 = libde265_max(hdr->num_ref_idx_l1_active, NumPocTotalCurr); + + int rIdx=0; + while (rIdx < NumRpsCurrTempList1) { + for (int i=0;inum_ref_idx_l0_active > 16) { + add_warning(DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED, false); + return false; + } + + assert(hdr->num_ref_idx_l1_active <= 16); + for (rIdx=0; rIdxnum_ref_idx_l1_active; rIdx++) { + int idx = hdr->ref_pic_list_modification_flag_l1 ? 
hdr->list_entry_l1[rIdx] : rIdx; + + hdr->RefPicList[1][rIdx] = RefPicListTemp1[idx]; + hdr->LongTermRefPic[1][rIdx] = isLongTerm[1][idx]; + + // remember POC of referenced imaged (needed in motion.c, derive_collocated_motion_vector) + de265_image* img_1_rIdx = dpb.get_image(hdr->RefPicList[1][rIdx]); + if (img_1_rIdx == NULL) { return false; } + hdr->RefPicList_POC[1][rIdx] = img_1_rIdx->PicOrderCntVal; + hdr->RefPicList_PicState[1][rIdx] = img_1_rIdx->PicState; + } + } + + + // show reference picture lists + + loginfo(LogHeaders,"RefPicList[0] ="); + for (rIdx=0; rIdxnum_ref_idx_l0_active; rIdx++) { + loginfo(LogHeaders,"* [%d]=%d (LT=%d)", + hdr->RefPicList[0][rIdx], + hdr->RefPicList_POC[0][rIdx], + hdr->LongTermRefPic[0][rIdx] + ); + } + loginfo(LogHeaders,"*\n"); + + if (hdr->slice_type == SLICE_TYPE_B) { + loginfo(LogHeaders,"RefPicList[1] ="); + for (rIdx=0; rIdxnum_ref_idx_l1_active; rIdx++) { + loginfo(LogHeaders,"* [%d]=%d (LT=%d)", + hdr->RefPicList[1][rIdx], + hdr->RefPicList_POC[1][rIdx], + hdr->LongTermRefPic[1][rIdx] + ); + } + loginfo(LogHeaders,"*\n"); + } + + return true; +} + + + +void decoder_context::run_postprocessing_filters_sequential(de265_image* img) +{ +#if SAVE_INTERMEDIATE_IMAGES + char buf[1000]; + sprintf(buf,"pre-lf-%05d.yuv", img->PicOrderCntVal); + write_picture_to_file(img, buf); +#endif + + if (!img->decctx->param_disable_deblocking) { + apply_deblocking_filter(img); + } + +#if SAVE_INTERMEDIATE_IMAGES + sprintf(buf,"pre-sao-%05d.yuv", img->PicOrderCntVal); + write_picture_to_file(img, buf); +#endif + + if (!img->decctx->param_disable_sao) { + apply_sample_adaptive_offset_sequential(img); + } + +#if SAVE_INTERMEDIATE_IMAGES + sprintf(buf,"sao-%05d.yuv", img->PicOrderCntVal); + write_picture_to_file(img, buf); +#endif +} + + +void decoder_context::run_postprocessing_filters_parallel(image_unit* imgunit) +{ + de265_image* img = imgunit->img; + + int saoWaitsForProgress = CTB_PROGRESS_PREFILTER; + bool waitForCompletion = false; + 
+ if (!img->decctx->param_disable_deblocking) { + add_deblocking_tasks(imgunit); + saoWaitsForProgress = CTB_PROGRESS_DEBLK_H; + } + + if (!img->decctx->param_disable_sao) { + waitForCompletion |= add_sao_tasks(imgunit, saoWaitsForProgress); + //apply_sample_adaptive_offset(img); + } + + img->wait_for_completion(); +} + +/* +void decoder_context::push_current_picture_to_output_queue() +{ + push_picture_to_output_queue(img); +} +*/ + +de265_error decoder_context::push_picture_to_output_queue(image_unit* imgunit) +{ + de265_image* outimg = imgunit->img; + + if (outimg==NULL) { return DE265_OK; } + + + // push image into output queue + + if (outimg->PicOutputFlag) { + loginfo(LogDPB,"new picture has output-flag=true\n"); + + if (outimg->integrity != INTEGRITY_CORRECT && + param_suppress_faulty_pictures) { + } + else { + dpb.insert_image_into_reorder_buffer(outimg); + } + + loginfo(LogDPB,"push image %d into reordering queue\n", outimg->PicOrderCntVal); + } + + // check for full reorder buffers + + int maxNumPicsInReorderBuffer = 0; + + // TODO: I'd like to have the has_vps() check somewhere else (not decode the picture at all) + if (outimg->has_vps()) { + int sublayer = outimg->get_vps().vps_max_sub_layers -1; + maxNumPicsInReorderBuffer = outimg->get_vps().layer[sublayer].vps_max_num_reorder_pics; + } + + if (dpb.num_pictures_in_reorder_buffer() > maxNumPicsInReorderBuffer) { + dpb.output_next_picture_in_reorder_buffer(); + } + + dpb.log_dpb_queues(); + + return DE265_OK; +} + + +// returns whether we can continue decoding the stream or whether we should give up +bool decoder_context::process_slice_segment_header(slice_segment_header* hdr, + de265_error* err, de265_PTS pts, + nal_header* nal_hdr, + void* user_data) +{ + *err = DE265_OK; + + flush_reorder_buffer_at_this_frame = false; + + + // get PPS and SPS for this slice + + int pps_id = hdr->slice_pic_parameter_set_id; + if (pps[pps_id]->pps_read==false) { + logerror(LogHeaders, "PPS %d has not been read\n", 
pps_id); + assert(false); // TODO + } + + current_pps = pps[pps_id]; + current_sps = sps[ (int)current_pps->seq_parameter_set_id ]; + current_vps = vps[ (int)current_sps->video_parameter_set_id ]; + + calc_tid_and_framerate_ratio(); + + + // --- prepare decoding of new picture --- + + if (hdr->first_slice_segment_in_pic_flag) { + + // previous picture has been completely decoded + + //ctx->push_current_picture_to_output_queue(); + + current_image_poc_lsb = hdr->slice_pic_order_cnt_lsb; + + + seq_parameter_set* sps = current_sps.get(); + + + // --- find and allocate image buffer for decoding --- + + int image_buffer_idx; + bool isOutputImage = (!sps->sample_adaptive_offset_enabled_flag || param_disable_sao); + image_buffer_idx = dpb.new_image(current_sps, this, pts, user_data, isOutputImage); + if (image_buffer_idx == -1) { + *err = DE265_ERROR_IMAGE_BUFFER_FULL; + return false; + } + + /*de265_image* */ img = dpb.get_image(image_buffer_idx); + img->nal_hdr = *nal_hdr; + + // Note: sps is already set in new_image() -> ??? still the case with shared_ptr ? 
+ + img->set_headers(current_vps, current_sps, current_pps); + + img->decctx = this; + + img->clear_metadata(); + + + if (isIRAP(nal_unit_type)) { + if (isIDR(nal_unit_type) || + isBLA(nal_unit_type) || + first_decoded_picture || + FirstAfterEndOfSequenceNAL) + { + NoRaslOutputFlag = true; + FirstAfterEndOfSequenceNAL = false; + } + else if (0) // TODO: set HandleCraAsBlaFlag by external means + { + } + else + { + NoRaslOutputFlag = false; + HandleCraAsBlaFlag = false; + } + } + + + if (isRASL(nal_unit_type) && + NoRaslOutputFlag) + { + img->PicOutputFlag = false; + } + else + { + img->PicOutputFlag = !!hdr->pic_output_flag; + } + + process_picture_order_count(hdr); + + if (hdr->first_slice_segment_in_pic_flag) { + // mark picture so that it is not overwritten by unavailable reference frames + img->PicState = UsedForShortTermReference; + + process_reference_picture_set(hdr); + } + + img->PicState = UsedForShortTermReference; + + log_set_current_POC(img->PicOrderCntVal); + + + // next image is not the first anymore + + first_decoded_picture = false; + } + else { + // claims to be not the first slice, but there is no active image available + + if (img == NULL) { + return false; + } + } + + if (hdr->slice_type == SLICE_TYPE_B || + hdr->slice_type == SLICE_TYPE_P) + { + bool success = construct_reference_picture_lists(hdr); + if (!success) { + return false; + } + } + + //printf("process slice segment header\n"); + + loginfo(LogHeaders,"end of process-slice-header\n"); + dpb.log_dpb_content(); + + + if (hdr->dependent_slice_segment_flag==0) { + hdr->SliceAddrRS = hdr->slice_segment_address; + } else { + hdr->SliceAddrRS = previous_slice_header->SliceAddrRS; + } + + previous_slice_header = hdr; + + + loginfo(LogHeaders,"SliceAddrRS = %d\n",hdr->SliceAddrRS); + + return true; +} + + +void decoder_context::remove_images_from_dpb(const std::vector& removeImageList) +{ + for (int i=0;i=0) { + //printf("remove ID %d\n", removeImageList[i]); + de265_image* dpbimg = 
dpb.get_image( idx ); + dpbimg->PicState = UnusedForReference; + } + } +} + + + +/* + . 0 1 2 <- goal_HighestTid + +-----+-----+-----+ + | -0->| -1->| -2->| + +-----+-----+-----+ + 0 33 66 100 <- framerate_ratio + */ + +int decoder_context::get_highest_TID() const +{ + if (current_sps) { return current_sps->sps_max_sub_layers-1; } + if (current_vps) { return current_vps->vps_max_sub_layers-1; } + + return 6; +} + +void decoder_context::set_limit_TID(int max_tid) +{ + limit_HighestTid = max_tid; + calc_tid_and_framerate_ratio(); +} + +int decoder_context::change_framerate(int more) +{ + if (current_sps == NULL) { return framerate_ratio; } + + int highestTid = get_highest_TID(); + + assert(more>=-1 && more<=1); + + goal_HighestTid += more; + goal_HighestTid = std::max(goal_HighestTid, 0); + goal_HighestTid = std::min(goal_HighestTid, highestTid); + + framerate_ratio = framedrop_tid_index[goal_HighestTid]; + + calc_tid_and_framerate_ratio(); + + return framerate_ratio; +} + +void decoder_context::set_framerate_ratio(int percent) +{ + framerate_ratio = percent; + calc_tid_and_framerate_ratio(); +} + +void decoder_context::compute_framedrop_table() +{ + int highestTID = get_highest_TID(); + + for (int tid=highestTID ; tid>=0 ; tid--) { + int lower = 100 * tid /(highestTID+1); + int higher = 100 * (tid+1)/(highestTID+1); + + for (int l=lower; l<=higher; l++) { + int ratio = 100 * (l-lower) / (higher-lower); + + // if we would exceed our TID limit, decode the highest TID at full frame-rate + if (tid > limit_HighestTid) { + tid = limit_HighestTid; + ratio = 100; + } + + framedrop_tab[l].tid = tid; + framedrop_tab[l].ratio = ratio; + } + + framedrop_tid_index[tid] = higher; + } + +#if 0 + for (int i=0;i<=100;i++) { + printf("%d%%: %d/%d",i, framedrop_tab[i].tid, framedrop_tab[i].ratio); + for (int k=0;k<=highestTID;k++) { + if (framedrop_tid_index[k] == i) printf(" ** TID=%d **",k); + } + printf("\n"); + } +#endif +} + +void decoder_context::calc_tid_and_framerate_ratio() 
+{ + int highestTID = get_highest_TID(); + + + // if number of temporal layers changed, we have to recompute the framedrop table + + if (framedrop_tab[100].tid != highestTID) { + compute_framedrop_table(); + } + + goal_HighestTid = framedrop_tab[framerate_ratio].tid; + layer_framerate_ratio = framedrop_tab[framerate_ratio].ratio; + + // TODO: for now, we switch immediately + current_HighestTid = goal_HighestTid; +} + + +void error_queue::add_warning(de265_error warning, bool once) +{ + // check if warning was already shown + bool add=true; + if (once) { + for (int i=0;i + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "dpb.h" +#include "decctx.h" +#include +#include + + +#define DPB_DEFAULT_MAX_IMAGES 30 + + +decoded_picture_buffer::decoded_picture_buffer() +{ + max_images_in_DPB = DPB_DEFAULT_MAX_IMAGES; + norm_images_in_DPB = DPB_DEFAULT_MAX_IMAGES; +} + + +decoded_picture_buffer::~decoded_picture_buffer() +{ + for (int i=0;iPicOrderCntVal, + dpb[i]->get_ID(), + dpb[i]->PicState == UnusedForReference ? "unused" : + dpb[i]->PicState == UsedForShortTermReference ? "short-term" : "long-term", + dpb[i]->PicOutputFlag ? 
"output" : "---"); + } +} + + +bool decoded_picture_buffer::has_free_dpb_picture(bool high_priority) const +{ + // we will always adapt the buffer to insert high-priority images + if (high_priority) return true; + + // quick test to check for free slots + if (dpb.size() < max_images_in_DPB) return true; + + // scan for empty slots + for (int i=0;iPicOutputFlag==false && dpb[i]->PicState == UnusedForReference) { + return true; + } + } + + return false; +} + + +int decoded_picture_buffer::DPB_index_of_picture_with_POC(int poc, int currentID, bool preferLongTerm) const +{ + logdebug(LogHeaders,"DPB_index_of_picture_with_POC POC=%d\n",poc); + + //log_dpb_content(ctx); + //loginfo(LogDPB,"searching for short-term reference POC=%d\n",poc); + + if (preferLongTerm) { + for (int k=0;kPicOrderCntVal == poc && + dpb[k]->removed_at_picture_id > currentID && + dpb[k]->PicState == UsedForLongTermReference) { + return k; + } + } + } + + for (int k=0;kPicOrderCntVal == poc && + dpb[k]->removed_at_picture_id > currentID && + dpb[k]->PicState != UnusedForReference) { + return k; + } + } + + return -1; +} + + +int decoded_picture_buffer::DPB_index_of_picture_with_LSB(int lsb, int currentID, bool preferLongTerm) const +{ + logdebug(LogHeaders,"get access to picture with LSB %d from DPB\n",lsb); + + if (preferLongTerm) { + for (int k=0;kpicture_order_cnt_lsb == lsb && + dpb[k]->removed_at_picture_id > currentID && + dpb[k]->PicState == UsedForLongTermReference) { + return k; + } + } + } + + for (int k=0;kpicture_order_cnt_lsb == lsb && + dpb[k]->removed_at_picture_id > currentID && + dpb[k]->PicState != UnusedForReference) { + return k; + } + } + + return -1; +} + + +int decoded_picture_buffer::DPB_index_of_picture_with_ID(int id) const +{ + logdebug(LogHeaders,"get access to picture with ID %d from DPB\n",id); + + for (int k=0;kget_ID() == id) { + return k; + } + } + + return -1; +} + + +void decoded_picture_buffer::output_next_picture_in_reorder_buffer() +{ + 
assert(!reorder_output_queue.empty()); + + // search for picture in reorder buffer with minimum POC + + int minPOC = reorder_output_queue[0]->PicOrderCntVal; + int minIdx = 0; + for (int i=1;iPicOrderCntVal < minPOC) { + minPOC = reorder_output_queue[i]->PicOrderCntVal; + minIdx = i; + } + } + + + // put image into output queue + + image_output_queue.push_back(reorder_output_queue[minIdx]); + + + // remove image from reorder buffer + + reorder_output_queue[minIdx] = reorder_output_queue.back(); + reorder_output_queue.pop_back(); +} + + +bool decoded_picture_buffer::flush_reorder_buffer() +{ + // return 'false' when there are no pictures in reorder buffer + if (reorder_output_queue.empty()) return false; + + while (!reorder_output_queue.empty()) { + output_next_picture_in_reorder_buffer(); + } + + return true; +} + + +void decoded_picture_buffer::clear() +{ + for (int i=0;iPicOutputFlag || + dpb[i]->PicState != UnusedForReference) + { + dpb[i]->PicOutputFlag = false; + dpb[i]->PicState = UnusedForReference; + dpb[i]->release(); + } + } + + reorder_output_queue.clear(); + image_output_queue.clear(); +} + + +int decoded_picture_buffer::new_image(std::shared_ptr sps, + decoder_context* decctx, + de265_PTS pts, void* user_data, bool isOutputImage) +{ + loginfo(LogHeaders,"DPB::new_image\n"); + log_dpb_content(); + + // --- search for a free slot in the DPB --- + + int free_image_buffer_idx = -1; + for (int i=0;ican_be_released()) { + dpb[i]->release(); /* TODO: this is surely not the best place to free the image, but + we have to do it here because releasing it in de265_release_image() + would break the API compatibility. */ + + free_image_buffer_idx = i; + break; + } + } + + + // Try to free a buffer at the end if the DPB got too large. + /* This should also probably move to a better place as soon as the API allows for this. 
*/ + + if (dpb.size() > norm_images_in_DPB && // buffer too large + free_image_buffer_idx != dpb.size()-1 && // last slot not reused in this alloc + dpb.back()->can_be_released()) // last slot is free + { + delete dpb.back(); + dpb.pop_back(); + } + + + // create a new image slot if no empty slot remaining + + if (free_image_buffer_idx == -1) { + free_image_buffer_idx = dpb.size(); + dpb.push_back(new de265_image); + } + + + // --- allocate new image --- + + de265_image* img = dpb[free_image_buffer_idx]; + + int w = sps->pic_width_in_luma_samples; + int h = sps->pic_height_in_luma_samples; + + enum de265_chroma chroma; + switch (sps->chroma_format_idc) { + case 0: chroma = de265_chroma_mono; break; + case 1: chroma = de265_chroma_420; break; + case 2: chroma = de265_chroma_422; break; + case 3: chroma = de265_chroma_444; break; + default: chroma = de265_chroma_420; assert(0); break; // should never happen + } + + img->alloc_image(w,h, chroma, sps, true, decctx, /*NULL,*/ pts, user_data, isOutputImage); + + img->integrity = INTEGRITY_CORRECT; + + return free_image_buffer_idx; +} + + +void decoded_picture_buffer::pop_next_picture_in_output_queue() +{ + image_output_queue.pop_front(); + + + loginfo(LogDPB, "DPB output queue: "); + for (int i=0;iPicOrderCntVal); + } + loginfo(LogDPB,"*\n"); +} + + +void decoded_picture_buffer::log_dpb_queues() const +{ + loginfo(LogDPB, "DPB reorder queue (after push): "); + for (int i=0;iPicOrderCntVal); + } + loginfo(LogDPB,"*\n"); + + loginfo(LogDPB, "DPB output queue (after push): "); + for (int i=0;iPicOrderCntVal); + } + loginfo(LogDPB,"*\n"); +} diff --git a/fallback-dct.cc b/fallback-dct.cc new file mode 100644 index 0000000..2e99c7c --- /dev/null +++ b/fallback-dct.cc @@ -0,0 +1,1210 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. 
+ * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "fallback-dct.h" + +#if defined(_MSC_VER) || defined(__MINGW32__) +# include +#elif defined(HAVE_ALLOCA_H) +# include +#endif + +#include +#include + + +static void printMatrix(const char* name, const int16_t* v, int n) +{ + printf("--- %s ---\n",name); + for (int r=0;r>bdShift2; + + dst[y*stride+x] = Clip1_8bit(dst[y*stride+x] + c); + } +} + + +void transform_skip_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) +{ + int nT = 4; + int bdShift2 = 20-bit_depth; + + assert(0); // DEPRECATED, should not be used anymore because of fixed 4x4 size + + for (int y=0;y>bdShift2; + + dst[y*stride+x] = Clip_BitDepth(dst[y*stride+x] + c, bit_depth); + } +} + + +void transform_skip_residual_fallback(int32_t *residual, const int16_t *coeffs, int nT, + int tsShift,int bdShift) +{ + const int rnd = 1<<(bdShift-1); + + for (int y=0;y> bdShift; + } +} + + +void transform_skip_rdpcm_v_8_fallback(uint8_t *dst, const int16_t *coeffs, int log2nT, ptrdiff_t stride) +{ + int bitDepth = 8; + int bdShift2 = 20-bitDepth; + int offset = (1<<(bdShift2-1)); + int tsShift = 5 + log2nT; // TODO: extended_precision + int nT = 1<>bdShift2; + + dst[y*stride+x] = Clip1_8bit(dst[y*stride+x] + sum); + } + } +} + +void transform_skip_rdpcm_h_8_fallback(uint8_t *dst, const int16_t *coeffs, int log2nT, ptrdiff_t stride) +{ + int 
bitDepth = 8; + int bdShift2 = 20-bitDepth; + int offset = (1<<(bdShift2-1)); + int tsShift = 5 + log2nT; // TODO: extended_precision + int nT = 1<>bdShift2; + + dst[y*stride+x] = Clip1_8bit(dst[y*stride+x] + sum); + } + } +} + + +void transform_bypass_rdpcm_v_8_fallback(uint8_t *dst, const int16_t *coeffs,int nT,ptrdiff_t stride) +{ + for (int x=0;x>bdShift; + residual[y*nT+x] = sum; + } + } +} + + +void rdpcm_h_fallback(int32_t* residual, const int16_t* coeffs, int nT,int tsShift,int bdShift) +{ + int rnd = (1<<(bdShift-1)); + + for (int y=0;y>bdShift; + residual[y*nT+x] = sum; + } + } +} + + +void transform_bypass_fallback(int32_t *dst, const int16_t *coeffs, int nT) +{ + for (int y=0;y "); + */ + + for (int i=0;i<4;i++) { + int sum=0; + + for (int j=0;j<4;j++) { + sum += mat_8_357[j][i] * coeffs[c+j*4]; + } + + g[i][c] = Clip3(-32768,32767, (sum+rndV)>>7); + } + + /* + for (int y=0;y<4;y++) { + logtrace(LogTransform,"*%d ",g[y][c]); + } + logtrace(LogTransform,"*\n"); + */ + } + + + // --- H --- + + for (int y=0;y<4;y++) { + + /* + logtrace(LogTransform,"DST-H: "); + for (int c=0;c<4;c++) { + logtrace(LogTransform,"%d ",g[y][c]); + } + logtrace(LogTransform,"* -> "); + */ + + for (int i=0;i<4;i++) { + int sum=0; + + for (int j=0;j<4;j++) { + sum += mat_8_357[j][i] * g[y][j]; + } + + int out = Clip3(-32768,32767, (sum+rndH)>>postShift); + + dst[y*stride+i] = Clip1_8bit(dst[y*stride+i] + out); + + logtrace(LogTransform,"*%d ",out); + } + + logtrace(LogTransform,"*\n"); + } +} + + +void transform_4x4_luma_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, + int bit_depth) +{ + int16_t g[4][4]; + + int postShift = 20-bit_depth; + int rndV = 1<<(7-1); + int rndH = 1<<(postShift-1); + + + // --- V --- + + for (int c=0;c<4;c++) { + /* + logtrace(LogTransform,"DST-V: "); + for (int r=0;r<4;r++) { + logtrace(LogTransform,"%d ",coeffs[c+r*4]); + } + logtrace(LogTransform,"* -> "); + */ + + for (int i=0;i<4;i++) { + int sum=0; + + for (int 
j=0;j<4;j++) { + sum += mat_8_357[j][i] * coeffs[c+j*4]; + } + + g[i][c] = Clip3(-32768,32767, (sum+rndV)>>7); + } + + /* + for (int y=0;y<4;y++) { + logtrace(LogTransform,"*%d ",g[y][c]); + } + logtrace(LogTransform,"*\n"); + */ + } + + + // --- H --- + + for (int y=0;y<4;y++) { + + /* + logtrace(LogTransform,"DST-H: "); + for (int c=0;c<4;c++) { + logtrace(LogTransform,"%d ",g[y][c]); + } + logtrace(LogTransform,"* -> "); + */ + + for (int i=0;i<4;i++) { + int sum=0; + + for (int j=0;j<4;j++) { + sum += mat_8_357[j][i] * g[y][j]; + } + + int out = Clip3(-32768,32767, (sum+rndH)>>postShift); + + dst[y*stride+i] = Clip_BitDepth(dst[y*stride+i] + out, bit_depth); + + logtrace(LogTransform,"*%d ",out); + } + + logtrace(LogTransform,"*\n"); + } +} + + +void fdst_4x4_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride) +{ + int16_t g[4*4]; + + int BD = 8; + int shift1 = Log2(4) + BD -9; + int shift2 = Log2(4) + 6; + + int rnd1 = 1<<(shift1-1); + int rnd2 = 1<<(shift2-1); + + + // --- V --- + + for (int c=0;c<4;c++) { + + /* + logtrace(LogTransform,"DST-V: "); + for (int r=0;r<4;r++) { + logtrace(LogTransform,"%d ",coeffs[c+r*4]); + } + logtrace(LogTransform,"* -> "); + */ + + for (int i=0;i<4;i++) { + int sum=0; + + for (int j=0;j<4;j++) { + sum += mat_8_357[i][j] * input[c+j*stride]; + } + + g[c+4*i] = Clip3(-32768,32767, (sum+rnd1)>>shift1); + } + } + + + // --- H --- + + for (int y=0;y<4;y++) { + for (int i=0;i<4;i++) { + int sum=0; + + for (int j=0;j<4;j++) { + sum += mat_8_357[i][j] * g[y*4+j]; + } + + // TODO: do we need clipping ? 
+ int out = (sum+rnd2)>>shift2; // Clip3(-32768,32767, (sum+rndH)>>postShift); + + coeffs[y*4+i] = out; + + logtrace(LogTransform,"*%d ",out); + } + + logtrace(LogTransform,"*\n"); + } +} + + +void transform_idst_4x4_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits) +{ + int16_t g[4][4]; + + int rndV = 1<<(7-1); + int rndH = 1<<(bdShift-1); + + int CoeffMax = (1<>7); + } + } + + + // --- H --- + + for (int y=0;y<4;y++) { + for (int i=0;i<4;i++) { + int sum=0; + + for (int j=0;j<4;j++) { + sum += mat_8_357[j][i] * g[y][j]; + } + + dst[y*4+i] = (sum + rndH)>>bdShift; + } + } +} + + + +static int8_t mat_dct[32][32] = { + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64}, + { 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, -4,-13,-22,-31,-38,-46,-54,-61,-67,-73,-78,-82,-85,-88,-90,-90}, + { 90, 87, 80, 70, 57, 43, 25, 9, -9,-25,-43,-57,-70,-80,-87,-90, -90,-87,-80,-70,-57,-43,-25, -9, 9, 25, 43, 57, 70, 80, 87, 90}, + { 90, 82, 67, 46, 22, -4,-31,-54,-73,-85,-90,-88,-78,-61,-38,-13, 13, 38, 61, 78, 88, 90, 85, 73, 54, 31, 4,-22,-46,-67,-82,-90}, + { 89, 75, 50, 18,-18,-50,-75,-89,-89,-75,-50,-18, 18, 50, 75, 89, 89, 75, 50, 18,-18,-50,-75,-89,-89,-75,-50,-18, 18, 50, 75, 89}, + { 88, 67, 31,-13,-54,-82,-90,-78,-46, -4, 38, 73, 90, 85, 61, 22, -22,-61,-85,-90,-73,-38, 4, 46, 78, 90, 82, 54, 13,-31,-67,-88}, + { 87, 57, 9,-43,-80,-90,-70,-25, 25, 70, 90, 80, 43, -9,-57,-87, -87,-57, -9, 43, 80, 90, 70, 25,-25,-70,-90,-80,-43, 9, 57, 87}, + { 85, 46,-13,-67,-90,-73,-22, 38, 82, 88, 54, -4,-61,-90,-78,-31, 31, 78, 90, 61, 4,-54,-88,-82,-38, 22, 73, 90, 67, 13,-46,-85}, + { 83, 36,-36,-83,-83,-36, 36, 83, 83, 36,-36,-83,-83,-36, 36, 83, 83, 36,-36,-83,-83,-36, 36, 83, 83, 36,-36,-83,-83,-36, 36, 83}, + { 82, 22,-54,-90,-61, 13, 78, 85, 31,-46,-90,-67, 4, 73, 88, 38, -38,-88,-73, -4, 67, 90, 46,-31,-85,-78,-13, 61, 90, 54,-22,-82}, + { 80, 
9,-70,-87,-25, 57, 90, 43,-43,-90,-57, 25, 87, 70, -9,-80, -80, -9, 70, 87, 25,-57,-90,-43, 43, 90, 57,-25,-87,-70, 9, 80}, + { 78, -4,-82,-73, 13, 85, 67,-22,-88,-61, 31, 90, 54,-38,-90,-46, 46, 90, 38,-54,-90,-31, 61, 88, 22,-67,-85,-13, 73, 82, 4,-78}, + { 75,-18,-89,-50, 50, 89, 18,-75,-75, 18, 89, 50,-50,-89,-18, 75, 75,-18,-89,-50, 50, 89, 18,-75,-75, 18, 89, 50,-50,-89,-18, 75}, + { 73,-31,-90,-22, 78, 67,-38,-90,-13, 82, 61,-46,-88, -4, 85, 54, -54,-85, 4, 88, 46,-61,-82, 13, 90, 38,-67,-78, 22, 90, 31,-73}, + { 70,-43,-87, 9, 90, 25,-80,-57, 57, 80,-25,-90, -9, 87, 43,-70, -70, 43, 87, -9,-90,-25, 80, 57,-57,-80, 25, 90, 9,-87,-43, 70}, + { 67,-54,-78, 38, 85,-22,-90, 4, 90, 13,-88,-31, 82, 46,-73,-61, 61, 73,-46,-82, 31, 88,-13,-90, -4, 90, 22,-85,-38, 78, 54,-67}, + { 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64}, + { 61,-73,-46, 82, 31,-88,-13, 90, -4,-90, 22, 85,-38,-78, 54, 67, -67,-54, 78, 38,-85,-22, 90, 4,-90, 13, 88,-31,-82, 46, 73,-61}, + { 57,-80,-25, 90, -9,-87, 43, 70,-70,-43, 87, 9,-90, 25, 80,-57, -57, 80, 25,-90, 9, 87,-43,-70, 70, 43,-87, -9, 90,-25,-80, 57}, + { 54,-85, -4, 88,-46,-61, 82, 13,-90, 38, 67,-78,-22, 90,-31,-73, 73, 31,-90, 22, 78,-67,-38, 90,-13,-82, 61, 46,-88, 4, 85,-54}, + { 50,-89, 18, 75,-75,-18, 89,-50,-50, 89,-18,-75, 75, 18,-89, 50, 50,-89, 18, 75,-75,-18, 89,-50,-50, 89,-18,-75, 75, 18,-89, 50}, + { 46,-90, 38, 54,-90, 31, 61,-88, 22, 67,-85, 13, 73,-82, 4, 78, -78, -4, 82,-73,-13, 85,-67,-22, 88,-61,-31, 90,-54,-38, 90,-46}, + { 43,-90, 57, 25,-87, 70, 9,-80, 80, -9,-70, 87,-25,-57, 90,-43, -43, 90,-57,-25, 87,-70, -9, 80,-80, 9, 70,-87, 25, 57,-90, 43}, + { 38,-88, 73, -4,-67, 90,-46,-31, 85,-78, 13, 61,-90, 54, 22,-82, 82,-22,-54, 90,-61,-13, 78,-85, 31, 46,-90, 67, 4,-73, 88,-38}, + { 36,-83, 83,-36,-36, 83,-83, 36, 36,-83, 83,-36,-36, 83,-83, 36, 36,-83, 83,-36,-36, 83,-83, 36, 36,-83, 83,-36,-36, 83,-83, 36}, + { 31,-78, 90,-61, 
4, 54,-88, 82,-38,-22, 73,-90, 67,-13,-46, 85, -85, 46, 13,-67, 90,-73, 22, 38,-82, 88,-54, -4, 61,-90, 78,-31}, + { 25,-70, 90,-80, 43, 9,-57, 87,-87, 57, -9,-43, 80,-90, 70,-25, -25, 70,-90, 80,-43, -9, 57,-87, 87,-57, 9, 43,-80, 90,-70, 25}, + { 22,-61, 85,-90, 73,-38, -4, 46,-78, 90,-82, 54,-13,-31, 67,-88, 88,-67, 31, 13,-54, 82,-90, 78,-46, 4, 38,-73, 90,-85, 61,-22}, + { 18,-50, 75,-89, 89,-75, 50,-18,-18, 50,-75, 89,-89, 75,-50, 18, 18,-50, 75,-89, 89,-75, 50,-18,-18, 50,-75, 89,-89, 75,-50, 18}, + { 13,-38, 61,-78, 88,-90, 85,-73, 54,-31, 4, 22,-46, 67,-82, 90, -90, 82,-67, 46,-22, -4, 31,-54, 73,-85, 90,-88, 78,-61, 38,-13}, + { 9,-25, 43,-57, 70,-80, 87,-90, 90,-87, 80,-70, 57,-43, 25, -9, -9, 25,-43, 57,-70, 80,-87, 90,-90, 87,-80, 70,-57, 43,-25, 9}, + { 4,-13, 22,-31, 38,-46, 54,-61, 67,-73, 78,-82, 85,-88, 90,-90, 90,-90, 88,-85, 82,-78, 73,-67, 61,-54, 46,-38, 31,-22, 13, -4} +}; + + + + +template +void transform_idct_add(pixel_t *dst, ptrdiff_t stride, + int nT, const int16_t *coeffs, int bit_depth) +{ + /* + The effective shift is + 7 bits right for bit-depth 8, + 6 bits right for bit-depth 9, + 5 bits right for bit-depth 10. + + Computation is independent of the block size. + Each multiplication with the table includes a left shift of 6 bits. + Hence, we have 2* 6 bits = 12 bits left shift. + V-pass has fixed 7 bit right shift. 
+ H-pass has 20-BitDepth bit right shift; + + Effective shift 's' means: residual value 1 gives DC-coeff (1< "); + */ + + + // find last non-zero coefficient to reduce computations carried out in DCT + + int lastCol = nT-1; + for (;lastCol>=0;lastCol--) { + if (coeffs[c+lastCol*nT]) { break; } + } + + for (int i=0;i>7); + + logtrace(LogTransform,"*%d ",g[c+i*nT]); + } + logtrace(LogTransform,"*\n"); + } + + /* + printf("--- temp\n"); + for (int r=0;r "); + */ + + + // find last non-zero coefficient to reduce computations carried out in DCT + + int lastCol = nT-1; + for (;lastCol>=0;lastCol--) { + if (g[y*nT+lastCol]) { break; } + } + + + for (int i=0;i>postShift); + int out = (sum+rnd2)>>postShift; + + //fprintf(stderr,"%d*%d+%d = %d\n",y,stride,i,y*stride+i); + //fprintf(stderr,"[%p]=%d\n",&dst[y*stride+i], Clip1_8bit(dst[y*stride+i])); + dst[y*stride+i] = Clip_BitDepth(dst[y*stride+i] + out, bit_depth); + + logtrace(LogTransform,"*%d ",out); + } + logtrace(LogTransform,"*\n"); + } +} + + + +void transform_idct_fallback(int32_t *dst, int nT, const int16_t *coeffs, int bdShift, int max_coeff_bits) +{ + /* + The effective shift is + 7 bits right for bit-depth 8, + 6 bits right for bit-depth 9, + 5 bits right for bit-depth 10. + + One transformation with raw transform filter values increases range be 2048 (=32*64). + This equals 11 bits. + + Computation is independent of the block size. + Each multiplication with the table includes a left shift of 6 bits. + Hence, we have 2* 6 bits = 12 bits left shift. + V-pass has fixed 7 bit right shift. 
+ H-pass has 20-BitDepth bit right shift; + + Effective shift 's' means: residual value 1 gives DC-coeff (1< "); + */ + + + // find last non-zero coefficient to reduce computations carried out in DCT + + int lastCol = nT-1; + for (;lastCol>=0;lastCol--) { + if (coeffs[c+lastCol*nT]) { break; } + } + + for (int i=0;i>7); + + logtrace(LogTransform,"*%d ",g[c+i*nT]); + } + logtrace(LogTransform,"*\n"); + } + + /* + printf("--- temp\n"); + for (int r=0;r "); + */ + + + // find last non-zero coefficient to reduce computations carried out in DCT + + int lastCol = nT-1; + for (;lastCol>=0;lastCol--) { + if (g[y*nT+lastCol]) { break; } + } + + + for (int i=0;i>bdShift; + + logtrace(LogTransform,"*%d ",sum); + } + logtrace(LogTransform,"*\n"); + } +} + + +void transform_idct_4x4_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits) +{ + transform_idct_fallback(dst,4,coeffs,bdShift,max_coeff_bits); +} + +void transform_idct_8x8_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits) +{ + transform_idct_fallback(dst,8,coeffs,bdShift,max_coeff_bits); +} + +void transform_idct_16x16_fallback(int32_t *dst, const int16_t *coeffs, + int bdShift, int max_coeff_bits) +{ + transform_idct_fallback(dst,16,coeffs,bdShift,max_coeff_bits); +} + +void transform_idct_32x32_fallback(int32_t *dst, const int16_t *coeffs, + int bdShift, int max_coeff_bits) +{ + transform_idct_fallback(dst,32,coeffs,bdShift,max_coeff_bits); +} + + + + +void transform_4x4_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride) +{ + transform_idct_add(dst,stride, 4, coeffs, 8); +} + +void transform_8x8_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride) +{ + transform_idct_add(dst,stride, 8, coeffs, 8); +} + +void transform_16x16_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride) +{ + transform_idct_add(dst,stride, 16, coeffs, 8); +} + +void transform_32x32_add_8_fallback(uint8_t *dst, const int16_t *coeffs, 
ptrdiff_t stride) +{ + transform_idct_add(dst,stride, 32, coeffs, 8); +} + + +void transform_4x4_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) +{ + transform_idct_add(dst,stride, 4, coeffs, bit_depth); +} + +void transform_8x8_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) +{ + transform_idct_add(dst,stride, 8, coeffs, bit_depth); +} + +void transform_16x16_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) +{ + transform_idct_add(dst,stride, 16, coeffs, bit_depth); +} + +void transform_32x32_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) +{ + transform_idct_add(dst,stride, 32, coeffs, bit_depth); +} + + +static void transform_fdct_8(int16_t* coeffs, int nT, + const int16_t *input, ptrdiff_t stride) +{ + /* + Each sum over a basis vector sums nT elements, which is compensated by + shifting right by Log2(nT), effectively dividing by 2^Log2(nT) = nT. + Do this in each of the H/V passes. + + Each multiplication with the table includes a left shift of 6 bits. + Hence, we have in total 2* 6 bits = 12 bits left shift because of the + multiplications. + + We carry out shifts after each pass: + First (V) pass has BitDepth-9 bit right shift, + Second (H) pass has fixed 6 bit right shift. + + For bit-depth 8, the total shift is 7 bits left. + For bit-depth 9, the total shift is 6 bits left. + For bit-depth 10, the total shift is 5 bits left. + + I.e.: a constant residual value 1 gives DC-coeff (1< 4 bits and we are down to 16 bits again. + After the second pass, we need 16+5+6=27 bits for the intermediate sum + (16 bit input, 5 bit because we sum 2^5 elements, 6 bit because of coefficient multiplication). + The second pass shift is Log2(32)+6 = 11 and we are down again to 16 bits. 
+ + For larger input bit-depths, the intermediate result after the first pass + will be wider accordingly, but the widths after the shifts are the same. + */ + + int BitDepth = 8; + + // / compensate everything | / effective word length | + int shift1 = Log2(nT) + 6 + BitDepth - 15; + int shift2 = Log2(nT) + 6; + + int rnd1 = 1<<(shift1-1); + int rnd2 = 1<<(shift2-1); + int fact = (1<<(5-Log2(nT))); + + int16_t g[32*32]; // actually, only [nT*nT] used + + for (int c=0;c>shift1; // clipping to -32768;32767 unnecessary + } + } + + + for (int y=0;y>shift2; + + coeffs[y*nT+i] = out; + } + } +} + + +void fdct_4x4_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride) +{ + transform_fdct_8(coeffs, 4, input,stride); +} + +void fdct_8x8_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride) +{ + transform_fdct_8(coeffs, 8, input,stride); +} + +void fdct_16x16_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride) +{ + transform_fdct_8(coeffs, 16, input,stride); +} + +void fdct_32x32_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride) +{ + transform_fdct_8(coeffs, 32, input,stride); +} + + + + +void hadamard_transform_8(int16_t *coeffs, int n, const int16_t *input, ptrdiff_t stride) +{ + int16_t tmp[32*32]; + + // row transforms + + //printMatrix("input",input,n); + + int16_t am[32],bm[32]; + int16_t *a = am, *b = bm; + for (int row=0;row>1);i++) { + a[ i] = input[i+rs] + input[i+(n>>1)+rs]; + a[(n>>1)+i] = input[i+rs] - input[i+(n>>1)+rs]; + } + + int iOuter=(n>>1); + int nInner=(n>>2); + + while (nInner>=2) { + std::swap(a,b); + + for (int k=0;k>=1; + nInner>>=1; + } + + for (int k=0;k>1);i++) { + a[ i] = tmp[i*n+col] + tmp[(i+(n>>1))*n+col]; + a[(n>>1)+i] = tmp[i*n+col] - tmp[(i+(n>>1))*n+col]; + } + + int iOuter=(n>>1); + int nInner=(n>>2); + + while (nInner>=2) { + std::swap(a,b); + + for (int k=0;k>=1; + nInner>>=1; + } + + for (int k=0;k + * + * This file is part of libde265. 
+ * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "fallback-motion.h" +#include "util.h" + +#if defined(_MSC_VER) || defined(__MINGW32__) +# include +#elif defined(HAVE_ALLOCA_H) +# include +#endif + +#include + + +void put_unweighted_pred_8_fallback(uint8_t *dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height) +{ + int offset8bit = 32; + int shift8bit = 6; + + assert((width&1)==0); + + for (int y=0;y>shift8bit); + out[1] = Clip1_8bit((in[1] + offset8bit)>>shift8bit); + out+=2; in+=2; + } + } +} + + +void put_weighted_pred_8_fallback(uint8_t *dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height, + int w,int o,int log2WD) +{ + assert(log2WD>=1); // TODO + + const int rnd = (1<<(log2WD-1)); + + for (int y=0;y>log2WD) + o); + out++; in++; + } + } +} + +void put_weighted_bipred_8_fallback(uint8_t *dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, + int width, int height, + int w1,int o1, int w2,int o2, int log2WD) +{ + assert(log2WD>=1); // TODO + + const int rnd = ((o1+o2+1) << log2WD); + + for (int y=0;y>(log2WD+1)); + out++; in1++; in2++; + } + } +} + + +void put_weighted_pred_avg_8_fallback(uint8_t *dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, + ptrdiff_t srcstride, int width, + int height) +{ + int offset8bit = 64; + int 
shift8bit = 7; + + assert((width&1)==0); + + // I had a special case for 8-pixel parallel, unrolled code, + // but I did not see any speedup. + +#if 0 + for (int y=0;y>shift8bit); + out++; in1++; in2++; + } + } +#endif + +#if 0 + if ((width&7)==0) { + for (int y=0;y>shift8bit); + out[1] = Clip1_8bit((in1[1] + in2[1] + offset8bit)>>shift8bit); + out[2] = Clip1_8bit((in1[2] + in2[2] + offset8bit)>>shift8bit); + out[3] = Clip1_8bit((in1[3] + in2[3] + offset8bit)>>shift8bit); + out[4] = Clip1_8bit((in1[4] + in2[4] + offset8bit)>>shift8bit); + out[5] = Clip1_8bit((in1[5] + in2[5] + offset8bit)>>shift8bit); + out[6] = Clip1_8bit((in1[6] + in2[6] + offset8bit)>>shift8bit); + out[7] = Clip1_8bit((in1[7] + in2[7] + offset8bit)>>shift8bit); + out+=8; in1+=8; in2+=8; + } + } + } + else +#endif + { + for (int y=0;y>shift8bit); + out[1] = Clip1_8bit((in1[1] + in2[1] + offset8bit)>>shift8bit); + out+=2; in1+=2; in2+=2; + } + } + } +} + + + + + +void put_unweighted_pred_16_fallback(uint16_t *dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height, int bit_depth) +{ + int shift1 = 14-bit_depth; + int offset1 = 0; + if (shift1>0) { offset1 = 1<<(shift1-1); } + + assert((width&1)==0); + + for (int y=0;y>shift1, bit_depth); + out[1] = Clip_BitDepth((in[1] + offset1)>>shift1, bit_depth); + out+=2; in+=2; + } + } +} + +#include + +void put_weighted_pred_16_fallback(uint16_t *dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height, + int w,int o,int log2WD, int bit_depth) +{ + assert(log2WD>=1); // TODO + + const int rnd = (1<<(log2WD-1)); + + for (int y=0;y>log2WD) + o, bit_depth); + out++; in++; + } + } +} + +void put_weighted_bipred_16_fallback(uint16_t *dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, + int width, int height, + int w1,int o1, int w2,int o2, int log2WD, int bit_depth) +{ + assert(log2WD>=1); // TODO + + const int rnd = ((o1+o2+1) << log2WD); + + for 
(int y=0;y>(log2WD+1), bit_depth); + out++; in1++; in2++; + } + } +} + + +void put_weighted_pred_avg_16_fallback(uint16_t *dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, + ptrdiff_t srcstride, int width, + int height, int bit_depth) +{ + int shift2 = 15-bit_depth; + int offset2 = 1<<(shift2-1); + + assert((width&1)==0); + + for (int y=0;y>shift2, bit_depth); + out[1] = Clip_BitDepth((in1[1] + in2[1] + offset2)>>shift2, bit_depth); + out+=2; in1+=2; in2+=2; + } + } +} + + + + + +void put_epel_8_fallback(int16_t *out, ptrdiff_t out_stride, + const uint8_t *src, ptrdiff_t src_stride, + int width, int height, + int mx, int my, int16_t* mcbuffer) +{ + int shift3 = 6; + + for (int y=0;y +void put_epel_hv_fallback(int16_t *dst, ptrdiff_t dst_stride, + const pixel_t *src, ptrdiff_t src_stride, + int nPbWC, int nPbHC, + int xFracC, int yFracC, int16_t* mcbuffer, int bit_depth) +{ + const int shift1 = bit_depth-8; + const int shift2 = 6; + //const int shift3 = 6; + + int extra_left = 1; + int extra_top = 1; + // int extra_right = 2; + int extra_bottom= 2; + + + int nPbH_extra = extra_top + nPbHC + extra_bottom; + + int16_t* tmp2buf = (int16_t*)alloca( nPbWC * nPbH_extra * sizeof(int16_t) ); + + /* + int nPbW_extra = extra_left + nPbWC + extra_right; + + + printf("x,y FracC: %d/%d\n",xFracC,yFracC); + + printf("---IN---\n"); + + for (int y=-extra_top;y>shift1; break; + case 2: v = (-4*p[0]+54*p[1]+16*p[2]-2*p[3])>>shift1; break; + case 3: v = (-6*p[0]+46*p[1]+28*p[2]-4*p[3])>>shift1; break; + case 4: v = (-4*p[0]+36*p[1]+36*p[2]-4*p[3])>>shift1; break; + case 5: v = (-4*p[0]+28*p[1]+46*p[2]-6*p[3])>>shift1; break; + case 6: v = (-2*p[0]+16*p[1]+54*p[2]-4*p[3])>>shift1; break; + default: + case 7: v = (-2*p[0]+10*p[1]+58*p[2]-2*p[3])>>shift1; break; + } + + //printf("%d %d %d %d -> %d\n",p[0],p[1],p[2],p[3],v); + + tmp2buf[y+extra_top + x*nPbH_extra] = v; + p++; + + //printf("%05d ",tmp2buf[y+extra_top + x*nPbH_extra]); + } + //printf("\n"); + } + + // 
V-filters + + int vshift = (xFracC==0 ? shift1 : shift2); + + for (int x=0;x>vshift; break; + case 2: v = (-4*p[0]+54*p[1]+16*p[2]-2*p[3])>>vshift; break; + case 3: v = (-6*p[0]+46*p[1]+28*p[2]-4*p[3])>>vshift; break; + case 4: v = (-4*p[0]+36*p[1]+36*p[2]-4*p[3])>>vshift; break; + case 5: v = (-4*p[0]+28*p[1]+46*p[2]-6*p[3])>>vshift; break; + case 6: v = (-2*p[0]+16*p[1]+54*p[2]-4*p[3])>>vshift; break; + default: + case 7: v = (-2*p[0]+10*p[1]+58*p[2]-2*p[3])>>vshift; break; + } + + dst[x + y*dst_stride] = v; + p++; + } + + } + + /* + printf("---V---\n"); + for (int y=0;y(int16_t *dst, ptrdiff_t dst_stride, + const uint8_t *src, ptrdiff_t src_stride, + int nPbWC, int nPbHC, + int xFracC, int yFracC, int16_t* mcbuffer, int bit_depth); +template +void put_epel_hv_fallback(int16_t *dst, ptrdiff_t dst_stride, + const uint16_t *src, ptrdiff_t src_stride, + int nPbWC, int nPbHC, + int xFracC, int yFracC, int16_t* mcbuffer, int bit_depth); + + + +void put_qpel_0_0_fallback(int16_t *out, ptrdiff_t out_stride, + const uint8_t *src, ptrdiff_t srcstride, + int nPbW, int nPbH, int16_t* mcbuffer) +{ + //const int shift1 = 0; // sps->BitDepth_Y-8; + const int shift2 = 6; + + // straight copy + + for (int y=0;y +void put_qpel_fallback(int16_t *out, ptrdiff_t out_stride, + const pixel_t *src, ptrdiff_t srcstride, + int nPbW, int nPbH, int16_t* mcbuffer, + int xFracL, int yFracL, int bit_depth) +{ + int extra_left = extra_before[xFracL]; + //int extra_right = extra_after [xFracL]; + int extra_top = extra_before[yFracL]; + int extra_bottom = extra_after [yFracL]; + + //int nPbW_extra = extra_left + nPbW + extra_right; + int nPbH_extra = extra_top + nPbH + extra_bottom; + + const int shift1 = bit_depth-8; + const int shift2 = 6; + + + // H-filters + + switch (xFracL) { + case 0: + for (int y=-extra_top;y>shift1; + o += nPbH_extra; + p++; + } + } + break; + case 2: + for (int y=-extra_top;y>shift1; + o += nPbH_extra; + p++; + } + } + break; + case 3: + for (int y=-extra_top;y>shift1; 
+ o += nPbH_extra; + p++; + } + } + break; + } + + + logtrace(LogMotion,"---H---\n"); + + for (int y=-extra_top;y>vshift; + o+=out_stride; + p++; + } + } + break; + case 2: + for (int x=0;x>vshift; + o+=out_stride; + p++; + } + } + break; + case 3: + for (int x=0;x>vshift; + o+=out_stride; + p++; + } + } + break; + } + + + logtrace(LogMotion,"---V---\n"); + for (int y=0;y + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#include "fallback.h" +#include "fallback-motion.h" +#include "fallback-dct.h" + + +void init_acceleration_functions_fallback(struct acceleration_functions* accel) +{ + accel->put_weighted_pred_avg_8 = put_weighted_pred_avg_8_fallback; + accel->put_unweighted_pred_8 = put_unweighted_pred_8_fallback; + accel->put_weighted_pred_8 = put_weighted_pred_8_fallback; + accel->put_weighted_bipred_8 = put_weighted_bipred_8_fallback; + + accel->put_weighted_pred_avg_16 = put_weighted_pred_avg_16_fallback; + accel->put_unweighted_pred_16 = put_unweighted_pred_16_fallback; + accel->put_weighted_pred_16 = put_weighted_pred_16_fallback; + accel->put_weighted_bipred_16 = put_weighted_bipred_16_fallback; + + + accel->put_hevc_epel_8 = put_epel_8_fallback; + accel->put_hevc_epel_h_8 = put_epel_hv_fallback; + accel->put_hevc_epel_v_8 = put_epel_hv_fallback; + accel->put_hevc_epel_hv_8 = put_epel_hv_fallback; + + accel->put_hevc_qpel_8[0][0] = put_qpel_0_0_fallback; + accel->put_hevc_qpel_8[0][1] = put_qpel_0_1_fallback; + accel->put_hevc_qpel_8[0][2] = put_qpel_0_2_fallback; + accel->put_hevc_qpel_8[0][3] = put_qpel_0_3_fallback; + accel->put_hevc_qpel_8[1][0] = put_qpel_1_0_fallback; + accel->put_hevc_qpel_8[1][1] = put_qpel_1_1_fallback; + accel->put_hevc_qpel_8[1][2] = put_qpel_1_2_fallback; + accel->put_hevc_qpel_8[1][3] = put_qpel_1_3_fallback; + accel->put_hevc_qpel_8[2][0] = put_qpel_2_0_fallback; + accel->put_hevc_qpel_8[2][1] = put_qpel_2_1_fallback; + accel->put_hevc_qpel_8[2][2] = put_qpel_2_2_fallback; + accel->put_hevc_qpel_8[2][3] = put_qpel_2_3_fallback; + accel->put_hevc_qpel_8[3][0] = put_qpel_3_0_fallback; + accel->put_hevc_qpel_8[3][1] = put_qpel_3_1_fallback; + accel->put_hevc_qpel_8[3][2] = put_qpel_3_2_fallback; + accel->put_hevc_qpel_8[3][3] = put_qpel_3_3_fallback; + + accel->put_hevc_epel_16 = put_epel_16_fallback; + accel->put_hevc_epel_h_16 = put_epel_hv_fallback; + accel->put_hevc_epel_v_16 = put_epel_hv_fallback; + accel->put_hevc_epel_hv_16 = 
put_epel_hv_fallback; + + accel->put_hevc_qpel_16[0][0] = put_qpel_0_0_fallback_16; + accel->put_hevc_qpel_16[0][1] = put_qpel_0_1_fallback_16; + accel->put_hevc_qpel_16[0][2] = put_qpel_0_2_fallback_16; + accel->put_hevc_qpel_16[0][3] = put_qpel_0_3_fallback_16; + accel->put_hevc_qpel_16[1][0] = put_qpel_1_0_fallback_16; + accel->put_hevc_qpel_16[1][1] = put_qpel_1_1_fallback_16; + accel->put_hevc_qpel_16[1][2] = put_qpel_1_2_fallback_16; + accel->put_hevc_qpel_16[1][3] = put_qpel_1_3_fallback_16; + accel->put_hevc_qpel_16[2][0] = put_qpel_2_0_fallback_16; + accel->put_hevc_qpel_16[2][1] = put_qpel_2_1_fallback_16; + accel->put_hevc_qpel_16[2][2] = put_qpel_2_2_fallback_16; + accel->put_hevc_qpel_16[2][3] = put_qpel_2_3_fallback_16; + accel->put_hevc_qpel_16[3][0] = put_qpel_3_0_fallback_16; + accel->put_hevc_qpel_16[3][1] = put_qpel_3_1_fallback_16; + accel->put_hevc_qpel_16[3][2] = put_qpel_3_2_fallback_16; + accel->put_hevc_qpel_16[3][3] = put_qpel_3_3_fallback_16; + + + + accel->transform_skip_8 = transform_skip_8_fallback; + accel->transform_skip_rdpcm_h_8 = transform_skip_rdpcm_h_8_fallback; + accel->transform_skip_rdpcm_v_8 = transform_skip_rdpcm_v_8_fallback; + accel->transform_bypass = transform_bypass_fallback; + accel->transform_bypass_rdpcm_h = transform_bypass_rdpcm_h_fallback; + accel->transform_bypass_rdpcm_v = transform_bypass_rdpcm_v_fallback; + accel->transform_4x4_dst_add_8 = transform_4x4_luma_add_8_fallback; + accel->transform_add_8[0] = transform_4x4_add_8_fallback; + accel->transform_add_8[1] = transform_8x8_add_8_fallback; + accel->transform_add_8[2] = transform_16x16_add_8_fallback; + accel->transform_add_8[3] = transform_32x32_add_8_fallback; + + accel->transform_skip_16 = transform_skip_16_fallback; + accel->transform_4x4_dst_add_16 = transform_4x4_luma_add_16_fallback; + accel->transform_add_16[0] = transform_4x4_add_16_fallback; + accel->transform_add_16[1] = transform_8x8_add_16_fallback; + accel->transform_add_16[2] = 
transform_16x16_add_16_fallback; + accel->transform_add_16[3] = transform_32x32_add_16_fallback; + + accel->rotate_coefficients = rotate_coefficients_fallback; + accel->add_residual_8 = add_residual_fallback; + accel->add_residual_16 = add_residual_fallback; + accel->rdpcm_h = rdpcm_h_fallback; + accel->rdpcm_v = rdpcm_v_fallback; + accel->transform_skip_residual = transform_skip_residual_fallback; + + accel->transform_idst_4x4 = transform_idst_4x4_fallback; + accel->transform_idct_4x4 = transform_idct_4x4_fallback; + accel->transform_idct_8x8 = transform_idct_8x8_fallback; + accel->transform_idct_16x16 = transform_idct_16x16_fallback; + accel->transform_idct_32x32 = transform_idct_32x32_fallback; + + accel->fwd_transform_4x4_dst_8 = fdst_4x4_8_fallback; + accel->fwd_transform_8[0] = fdct_4x4_8_fallback; + accel->fwd_transform_8[1] = fdct_8x8_8_fallback; + accel->fwd_transform_8[2] = fdct_16x16_8_fallback; + accel->fwd_transform_8[3] = fdct_32x32_8_fallback; + + accel->hadamard_transform_8[0] = hadamard_4x4_8_fallback; + accel->hadamard_transform_8[1] = hadamard_8x8_8_fallback; + accel->hadamard_transform_8[2] = hadamard_16x16_8_fallback; + accel->hadamard_transform_8[3] = hadamard_32x32_8_fallback; +} diff --git a/image-io.cc b/image-io.cc new file mode 100644 index 0000000..60f4e6c --- /dev/null +++ b/image-io.cc @@ -0,0 +1,220 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * Authors: struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "libde265/image-io.h" +#include + + + +ImageSource::ImageSource() +{ +} + + +ImageSource_YUV::ImageSource_YUV() + : mFH(NULL) +{ +} + + +ImageSource_YUV::~ImageSource_YUV() +{ + if (mFH) { + fclose(mFH); + } +} + + +bool ImageSource_YUV::set_input_file(const char* filename, int w,int h) +{ + assert(mFH==NULL); + + mFH = fopen(filename,"rb"); + if (mFH==NULL) { + return false; + } + + width =w; + height=h; + mReachedEndOfFile = false; + + return true; +} + + +de265_image* ImageSource_YUV::read_next_image() +{ + if (mReachedEndOfFile) return NULL; + + de265_image* img = new de265_image; + img->alloc_image(width,height,de265_chroma_420, NULL, false, + NULL, /*NULL,*/ 0, NULL, false); + assert(img); // TODO: error handling + + // --- load image --- + + uint8_t* p; + int stride; + + p = img->get_image_plane(0); stride = img->get_image_stride(0); + for (int y=0;yget_image_plane(1); stride = img->get_image_stride(1); + for (int y=0;yget_image_plane(2); stride = img->get_image_stride(2); + for (int y=0;yget_width(); + int height= img->get_height(); + + p = img->get_image_plane(0); stride = img->get_image_stride(0); + for (int y=0;yget_image_plane(1); stride = img->get_image_stride(1); + for (int y=0;yget_image_plane(2); stride = img->get_image_stride(2); + for (int y=0;y + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "image.h" +#include "decctx.h" + +#include + +#include +#include +#include + +#include + + +#ifdef HAVE_MALLOC_H +#include +#endif + +#ifdef HAVE_SSE4_1 +// SSE code processes 128bit per iteration and thus might read more data +// than is later actually used. +#define MEMORY_PADDING 16 +#else +#define MEMORY_PADDING 0 +#endif + +#define STANDARD_ALIGNMENT 16 + +#ifdef HAVE___MINGW_ALIGNED_MALLOC +#define ALLOC_ALIGNED(alignment, size) __mingw_aligned_malloc((size), (alignment)) +#define FREE_ALIGNED(mem) __mingw_aligned_free((mem)) +#elif _WIN32 +#define ALLOC_ALIGNED(alignment, size) _aligned_malloc((size), (alignment)) +#define FREE_ALIGNED(mem) _aligned_free((mem)) +#elif defined(HAVE_POSIX_MEMALIGN) +static inline void *ALLOC_ALIGNED(size_t alignment, size_t size) { + void *mem = NULL; + if (posix_memalign(&mem, alignment, size) != 0) { + return NULL; + } + return mem; +}; +#define FREE_ALIGNED(mem) free((mem)) +#else +#define ALLOC_ALIGNED(alignment, size) memalign((alignment), (size)) +#define FREE_ALIGNED(mem) free((mem)) +#endif + +#define ALLOC_ALIGNED_16(size) ALLOC_ALIGNED(16, size) + +static const int alignment = 16; + +LIBDE265_API void* de265_alloc_image_plane(struct de265_image* img, int cIdx, + void* inputdata, int inputstride, void *userdata) +{ + int alignment = STANDARD_ALIGNMENT; + int stride = (img->get_width(cIdx) + alignment-1) / alignment * alignment; + int height = img->get_height(cIdx); + + uint8_t* p = (uint8_t *)ALLOC_ALIGNED_16(stride * height + MEMORY_PADDING); + + if (p==NULL) { return NULL; } + + img->set_image_plane(cIdx, p, stride, userdata); + + // copy input data if provided + + if (inputdata != NULL) { + if (inputstride == stride) { + memcpy(p, inputdata, stride*height); + } + else { + for (int y=0;yget_image_plane(cIdx); + 
assert(p); + FREE_ALIGNED(p); +} + + +static int de265_image_get_buffer(de265_decoder_context* ctx, + de265_image_spec* spec, de265_image* img, void* userdata) +{ + const int rawChromaWidth = spec->width / img->SubWidthC; + const int rawChromaHeight = spec->height / img->SubHeightC; + + int luma_stride = (spec->width + spec->alignment-1) / spec->alignment * spec->alignment; + int chroma_stride = (rawChromaWidth + spec->alignment-1) / spec->alignment * spec->alignment; + + assert(img->BitDepth_Y >= 8 && img->BitDepth_Y <= 16); + assert(img->BitDepth_C >= 8 && img->BitDepth_C <= 16); + + int luma_bpl = luma_stride * ((img->BitDepth_Y+7)/8); + int chroma_bpl = chroma_stride * ((img->BitDepth_C+7)/8); + + int luma_height = spec->height; + int chroma_height = rawChromaHeight; + + bool alloc_failed = false; + + uint8_t* p[3] = { 0,0,0 }; + p[0] = (uint8_t *)ALLOC_ALIGNED_16(luma_height * luma_bpl + MEMORY_PADDING); + if (p[0]==NULL) { alloc_failed=true; } + + if (img->get_chroma_format() != de265_chroma_mono) { + p[1] = (uint8_t *)ALLOC_ALIGNED_16(chroma_height * chroma_bpl + MEMORY_PADDING); + p[2] = (uint8_t *)ALLOC_ALIGNED_16(chroma_height * chroma_bpl + MEMORY_PADDING); + + if (p[1]==NULL || p[2]==NULL) { alloc_failed=true; } + } + else { + p[1] = NULL; + p[2] = NULL; + chroma_stride = 0; + } + + if (alloc_failed) { + for (int i=0;i<3;i++) + if (p[i]) { + FREE_ALIGNED(p[i]); + } + + return 0; + } + + img->set_image_plane(0, p[0], luma_stride, NULL); + img->set_image_plane(1, p[1], chroma_stride, NULL); + img->set_image_plane(2, p[2], chroma_stride, NULL); + + return 1; +} + +static void de265_image_release_buffer(de265_decoder_context* ctx, + de265_image* img, void* userdata) +{ + for (int i=0;i<3;i++) { + uint8_t* p = (uint8_t*)img->get_image_plane(i); + if (p) { + FREE_ALIGNED(p); + } + } +} + + +de265_image_allocation de265_image::default_image_allocation = { + de265_image_get_buffer, + de265_image_release_buffer +}; + + +void de265_image::set_image_plane(int 
cIdx, uint8_t* mem, int stride, void *userdata) +{ + pixels[cIdx] = mem; + plane_user_data[cIdx] = userdata; + + if (cIdx==0) { this->stride = stride; } + else { this->chroma_stride = stride; } +} + + +uint32_t de265_image::s_next_image_ID = 0; + +de265_image::de265_image() +{ + ID = -1; + removed_at_picture_id = 0; // picture not used, so we can assume it has been removed + + decctx = NULL; + //encctx = NULL; + + //encoder_image_release_func = NULL; + + //alloc_functions.get_buffer = NULL; + //alloc_functions.release_buffer = NULL; + + for (int c=0;c<3;c++) { + pixels[c] = NULL; + pixels_confwin[c] = NULL; + plane_user_data[c] = NULL; + } + + width=height=0; + + pts = 0; + user_data = NULL; + + ctb_progress = NULL; + + integrity = INTEGRITY_NOT_DECODED; + + picture_order_cnt_lsb = -1; // undefined + PicOrderCntVal = -1; // undefined + PicState = UnusedForReference; + PicOutputFlag = false; + + nThreadsQueued = 0; + nThreadsRunning = 0; + nThreadsBlocked = 0; + nThreadsFinished = 0; + nThreadsTotal = 0; + + de265_mutex_init(&mutex); + de265_cond_init(&finished_cond); +} + + +de265_error de265_image::alloc_image(int w,int h, enum de265_chroma c, + std::shared_ptr sps, bool allocMetadata, + decoder_context* dctx, + //encoder_context* ectx, + de265_PTS pts, void* user_data, + bool useCustomAllocFunc) +{ + //if (allocMetadata) { assert(sps); } + if (allocMetadata) { assert(sps); } + + if (sps) { this->sps = sps; } + + release(); /* TODO: review code for efficient allocation when arrays are already + allocated to the requested size. Without the release, the old image-data + will not be freed. 
*/ + + ID = s_next_image_ID++; + removed_at_picture_id = std::numeric_limits::max(); + + decctx = dctx; + //encctx = ectx; + + // --- allocate image buffer --- + + chroma_format= c; + + width = w; + height = h; + chroma_width = w; + chroma_height= h; + + this->user_data = user_data; + this->pts = pts; + + de265_image_spec spec; + + int WinUnitX, WinUnitY; + + switch (chroma_format) { + case de265_chroma_mono: WinUnitX=1; WinUnitY=1; break; + case de265_chroma_420: WinUnitX=2; WinUnitY=2; break; + case de265_chroma_422: WinUnitX=2; WinUnitY=1; break; + case de265_chroma_444: WinUnitX=1; WinUnitY=1; break; + default: + assert(0); + } + + switch (chroma_format) { + case de265_chroma_420: + spec.format = de265_image_format_YUV420P8; + chroma_width = (chroma_width +1)/2; + chroma_height = (chroma_height+1)/2; + SubWidthC = 2; + SubHeightC = 2; + break; + + case de265_chroma_422: + spec.format = de265_image_format_YUV422P8; + chroma_width = (chroma_width+1)/2; + SubWidthC = 2; + SubHeightC = 1; + break; + + case de265_chroma_444: + spec.format = de265_image_format_YUV444P8; + SubWidthC = 1; + SubHeightC = 1; + break; + + case de265_chroma_mono: + spec.format = de265_image_format_mono8; + chroma_width = 0; + chroma_height= 0; + SubWidthC = 1; + SubHeightC = 1; + break; + + default: + assert(false); + break; + } + + if (chroma_format != de265_chroma_mono && sps) { + assert(sps->SubWidthC == SubWidthC); + assert(sps->SubHeightC == SubHeightC); + } + + spec.width = w; + spec.height = h; + spec.alignment = STANDARD_ALIGNMENT; + + + // conformance window cropping + + int left = sps ? sps->conf_win_left_offset : 0; + int right = sps ? sps->conf_win_right_offset : 0; + int top = sps ? sps->conf_win_top_offset : 0; + int bottom = sps ? 
sps->conf_win_bottom_offset : 0; + + width_confwin = width - (left+right)*WinUnitX; + height_confwin= height- (top+bottom)*WinUnitY; + chroma_width_confwin = chroma_width -left-right; + chroma_height_confwin= chroma_height-top-bottom; + + spec.crop_left = left *WinUnitX; + spec.crop_right = right*WinUnitX; + spec.crop_top = top *WinUnitY; + spec.crop_bottom= bottom*WinUnitY; + + spec.visible_width = width_confwin; + spec.visible_height= height_confwin; + + + BitDepth_Y = (sps==NULL) ? 8 : sps->BitDepth_Y; + BitDepth_C = (sps==NULL) ? 8 : sps->BitDepth_C; + + bpp_shift[0] = (BitDepth_Y <= 8) ? 0 : 1; + bpp_shift[1] = (BitDepth_C <= 8) ? 0 : 1; + bpp_shift[2] = bpp_shift[1]; + + + // allocate memory and set conformance window pointers + + void* alloc_userdata = NULL; + if (decctx) alloc_userdata = decctx->param_image_allocation_userdata; + // if (encctx) alloc_userdata = encctx->param_image_allocation_userdata; // actually not needed + + /* + if (encctx && useCustomAllocFunc) { + encoder_image_release_func = encctx->release_func; + + // if we do not provide a release function, use our own + + if (encoder_image_release_func == NULL) { + image_allocation_functions = de265_image::default_image_allocation; + } + else { + image_allocation_functions.get_buffer = NULL; + image_allocation_functions.release_buffer = NULL; + } + } + else*/ if (decctx && useCustomAllocFunc) { + image_allocation_functions = decctx->param_image_allocation_functions; + } + else { + image_allocation_functions = de265_image::default_image_allocation; + } + + bool mem_alloc_success = true; + + if (image_allocation_functions.get_buffer != NULL) { + mem_alloc_success = image_allocation_functions.get_buffer(decctx, &spec, this, + alloc_userdata); + + pixels_confwin[0] = pixels[0] + left*WinUnitX + top*WinUnitY*stride; + + if (chroma_format != de265_chroma_mono) { + pixels_confwin[1] = pixels[1] + left + top*chroma_stride; + pixels_confwin[2] = pixels[2] + left + top*chroma_stride; + } + else { + 
pixels_confwin[1] = NULL; + pixels_confwin[2] = NULL; + } + + // check for memory shortage + + if (!mem_alloc_success) + { + return DE265_ERROR_OUT_OF_MEMORY; + } + } + + //alloc_functions = *allocfunc; + //alloc_userdata = userdata; + + // --- allocate decoding info arrays --- + + if (allocMetadata) { + // intra pred mode + + mem_alloc_success &= intraPredMode.alloc(sps->PicWidthInMinPUs, sps->PicHeightInMinPUs, + sps->Log2MinPUSize); + + mem_alloc_success &= intraPredModeC.alloc(sps->PicWidthInMinPUs, sps->PicHeightInMinPUs, + sps->Log2MinPUSize); + + // cb info + + mem_alloc_success &= cb_info.alloc(sps->PicWidthInMinCbsY, sps->PicHeightInMinCbsY, + sps->Log2MinCbSizeY); + + // pb info + + int puWidth = sps->PicWidthInMinCbsY << (sps->Log2MinCbSizeY -2); + int puHeight = sps->PicHeightInMinCbsY << (sps->Log2MinCbSizeY -2); + + mem_alloc_success &= pb_info.alloc(puWidth,puHeight, 2); + + + // tu info + + mem_alloc_success &= tu_info.alloc(sps->PicWidthInTbsY, sps->PicHeightInTbsY, + sps->Log2MinTrafoSize); + + // deblk info + + int deblk_w = (sps->pic_width_in_luma_samples +3)/4; + int deblk_h = (sps->pic_height_in_luma_samples+3)/4; + + mem_alloc_success &= deblk_info.alloc(deblk_w, deblk_h, 2); + + // CTB info + + if (ctb_info.data_size != sps->PicSizeInCtbsY) + { + delete[] ctb_progress; + + mem_alloc_success &= ctb_info.alloc(sps->PicWidthInCtbsY, sps->PicHeightInCtbsY, + sps->Log2CtbSizeY); + + ctb_progress = new de265_progress_lock[ ctb_info.data_size ]; + } + + + // check for memory shortage + + if (!mem_alloc_success) + { + return DE265_ERROR_OUT_OF_MEMORY; + } + } + + return DE265_OK; +} + + +de265_image::~de265_image() +{ + release(); + + // free progress locks + + if (ctb_progress) { + delete[] ctb_progress; + } + + de265_cond_destroy(&finished_cond); + de265_mutex_destroy(&mutex); +} + + +void de265_image::release() +{ + // free image memory + + if (pixels[0]) + { + /* + if (encoder_image_release_func != NULL) { + encoder_image_release_func(encctx, 
this, + encctx->param_image_allocation_userdata); + } + else*/ { + image_allocation_functions.release_buffer(decctx, this, + decctx ? + decctx->param_image_allocation_userdata : + NULL); + } + + for (int i=0;i<3;i++) + { + pixels[i] = NULL; + pixels_confwin[i] = NULL; + } + } + + // free slices + + for (int i=0;i=0) { + memset(pixels[0], y, stride * height); + } + + if (cb>=0) { + memset(pixels[1], cb, chroma_stride * chroma_height); + } + + if (cr>=0) { + memset(pixels[2], cr, chroma_stride * chroma_height); + } +} + + +de265_error de265_image::copy_image(const de265_image* src) +{ + /* TODO: actually, since we allocate the image only for internal purpose, we + do not have to call the external allocation routines for this. However, then + we have to track for each image how to release it again. + Another option would be to safe the copied data not in an de265_image at all. + */ + + de265_error err = alloc_image(src->width, src->height, src->chroma_format, src->sps, false, + src->decctx, /*src->encctx,*/ src->pts, src->user_data, false); + if (err != DE265_OK) { + return err; + } + + copy_lines_from(src, 0, src->height); + + return err; +} + + +// end = last line + 1 +void de265_image::copy_lines_from(const de265_image* src, int first, int end) +{ + if (end > src->height) end=src->height; + + assert(first % 2 == 0); + assert(end % 2 == 0); + + int luma_bpp = (sps->BitDepth_Y+7)/8; + int chroma_bpp = (sps->BitDepth_C+7)/8; + + if (src->stride == stride) { + memcpy(pixels[0] + first*stride * luma_bpp, + src->pixels[0] + first*src->stride * luma_bpp, + (end-first)*stride * luma_bpp); + } + else { + for (int yp=first;yppixels[0]+yp*src->stride * luma_bpp, + src->width * luma_bpp); + } + } + + int first_chroma = first / src->SubHeightC; + int end_chroma = end / src->SubHeightC; + + if (src->chroma_format != de265_chroma_mono) { + if (src->chroma_stride == chroma_stride) { + memcpy(pixels[1] + first_chroma*chroma_stride * chroma_bpp, + src->pixels[1] + 
first_chroma*chroma_stride * chroma_bpp, + (end_chroma-first_chroma) * chroma_stride * chroma_bpp); + memcpy(pixels[2] + first_chroma*chroma_stride * chroma_bpp, + src->pixels[2] + first_chroma*chroma_stride * chroma_bpp, + (end_chroma-first_chroma) * chroma_stride * chroma_bpp); + } + else { + for (int y=first_chroma;ypixels[1]+y*src->chroma_stride * chroma_bpp, + src->chroma_width * chroma_bpp); + memcpy(pixels[2]+y*chroma_stride * chroma_bpp, + src->pixels[2]+y*src->chroma_stride * chroma_bpp, + src->chroma_width * chroma_bpp); + } + } + } +} + + +void de265_image::exchange_pixel_data_with(de265_image& b) +{ + for (int i=0;i<3;i++) { + std::swap(pixels[i], b.pixels[i]); + std::swap(pixels_confwin[i], b.pixels_confwin[i]); + std::swap(plane_user_data[i], b.plane_user_data[i]); + } + + std::swap(stride, b.stride); + std::swap(chroma_stride, b.chroma_stride); + std::swap(image_allocation_functions, b.image_allocation_functions); +} + + +void de265_image::thread_start(int nThreads) +{ + de265_mutex_lock(&mutex); + + //printf("nThreads before: %d %d\n",nThreadsQueued, nThreadsTotal); + + nThreadsQueued += nThreads; + nThreadsTotal += nThreads; + + //printf("nThreads after: %d %d\n",nThreadsQueued, nThreadsTotal); + + de265_mutex_unlock(&mutex); +} + +void de265_image::thread_run(const thread_task* task) +{ + //printf("run thread %s\n", task->name().c_str()); + + de265_mutex_lock(&mutex); + nThreadsQueued--; + nThreadsRunning++; + de265_mutex_unlock(&mutex); +} + +void de265_image::thread_blocks() +{ + de265_mutex_lock(&mutex); + nThreadsRunning--; + nThreadsBlocked++; + de265_mutex_unlock(&mutex); +} + +void de265_image::thread_unblocks() +{ + de265_mutex_lock(&mutex); + nThreadsBlocked--; + nThreadsRunning++; + de265_mutex_unlock(&mutex); +} + +void de265_image::thread_finishes(const thread_task* task) +{ + //printf("finish thread %s\n", task->name().c_str()); + + de265_mutex_lock(&mutex); + + nThreadsRunning--; + nThreadsFinished++; + assert(nThreadsRunning >= 0); 
+ + if (nThreadsFinished==nThreadsTotal) { + de265_cond_broadcast(&finished_cond, &mutex); + } + + de265_mutex_unlock(&mutex); +} + +void de265_image::wait_for_progress(thread_task* task, int ctbx,int ctby, int progress) +{ + const int ctbW = sps->PicWidthInCtbsY; + + wait_for_progress(task, ctbx + ctbW*ctby, progress); +} + +void de265_image::wait_for_progress(thread_task* task, int ctbAddrRS, int progress) +{ + if (task==NULL) { return; } + + de265_progress_lock* progresslock = &ctb_progress[ctbAddrRS]; + if (progresslock->get_progress() < progress) { + thread_blocks(); + + assert(task!=NULL); + task->state = thread_task::Blocked; + + /* TODO: check whether we are the first blocked task in the list. + If we are, we have to conceal input errors. + Simplest concealment: do not block. + */ + + progresslock->wait_for_progress(progress); + task->state = thread_task::Running; + thread_unblocks(); + } +} + + +void de265_image::wait_for_completion() +{ + de265_mutex_lock(&mutex); + while (nThreadsFinished!=nThreadsTotal) { + de265_cond_wait(&finished_cond, &mutex); + } + de265_mutex_unlock(&mutex); +} + +bool de265_image::debug_is_completed() const +{ + return nThreadsFinished==nThreadsTotal; +} + + + +void de265_image::clear_metadata() +{ + // TODO: maybe we could avoid the memset by ensuring that all data is written to + // during decoding (especially log2CbSize), but it is unlikely to be faster than the memset. 
+ + cb_info.clear(); + //tu_info.clear(); // done on the fly + ctb_info.clear(); + deblk_info.clear(); + + // --- reset CTB progresses --- + + for (int i=0;i> log2PuSize; + int yPu = y >> log2PuSize; + int wPu = nPbW >> log2PuSize; + int hPu = nPbH >> log2PuSize; + + int stride = pb_info.width_in_units; + + for (int pby=0;pby=sps->pic_width_in_luma_samples || + yN>=sps->pic_height_in_luma_samples) return false; + + int minBlockAddrN = pps->MinTbAddrZS[ (xN>>sps->Log2MinTrafoSize) + + (yN>>sps->Log2MinTrafoSize) * sps->PicWidthInTbsY ]; + int minBlockAddrCurr = pps->MinTbAddrZS[ (xCurr>>sps->Log2MinTrafoSize) + + (yCurr>>sps->Log2MinTrafoSize) * sps->PicWidthInTbsY ]; + + if (minBlockAddrN > minBlockAddrCurr) return false; + + int xCurrCtb = xCurr >> sps->Log2CtbSizeY; + int yCurrCtb = yCurr >> sps->Log2CtbSizeY; + int xNCtb = xN >> sps->Log2CtbSizeY; + int yNCtb = yN >> sps->Log2CtbSizeY; + + if (get_SliceAddrRS(xCurrCtb,yCurrCtb) != + get_SliceAddrRS(xNCtb, yNCtb)) { + return false; + } + + if (pps->TileIdRS[xCurrCtb + yCurrCtb*sps->PicWidthInCtbsY] != + pps->TileIdRS[xNCtb + yNCtb *sps->PicWidthInCtbsY]) { + return false; + } + + return true; +} + + +bool de265_image::available_pred_blk(int xC,int yC, int nCbS, int xP, int yP, + int nPbW, int nPbH, int partIdx, int xN,int yN) const +{ + logtrace(LogMotion,"C:%d;%d P:%d;%d N:%d;%d size=%d;%d\n",xC,yC,xP,yP,xN,yN,nPbW,nPbH); + + int sameCb = (xC <= xN && xN < xC+nCbS && + yC <= yN && yN < yC+nCbS); + + bool availableN; + + if (!sameCb) { + availableN = available_zscan(xP,yP,xN,yN); + } + else { + availableN = !(nPbW<<1 == nCbS && nPbH<<1 == nCbS && // NxN + partIdx==1 && + yN >= yC+nPbH && xN < xC+nPbW); // xN/yN inside partIdx 2 + } + + if (availableN && get_pred_mode(xN,yN) == MODE_INTRA) { + availableN = false; + } + + return availableN; +} diff --git a/intrapred.cc b/intrapred.cc new file mode 100644 index 0000000..cf049b8 --- /dev/null +++ b/intrapred.cc @@ -0,0 +1,364 @@ +/* + * H.265 video codec. 
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "intrapred.h" +#include "transform.h" +#include "util.h" +#include + + +#include +#include + + + +void fillIntraPredModeCandidates(enum IntraPredMode candModeList[3], + enum IntraPredMode candIntraPredModeA, + enum IntraPredMode candIntraPredModeB) +{ + // build candidate list + + if (candIntraPredModeA == candIntraPredModeB) { + if (candIntraPredModeA < 2) { + candModeList[0] = INTRA_PLANAR; + candModeList[1] = INTRA_DC; + candModeList[2] = INTRA_ANGULAR_26; + } + else { + candModeList[0] = candIntraPredModeA; + candModeList[1] = (enum IntraPredMode)(2 + ((candIntraPredModeA-2 -1 +32) % 32)); + candModeList[2] = (enum IntraPredMode)(2 + ((candIntraPredModeA-2 +1 ) % 32)); + } + } + else { + candModeList[0] = candIntraPredModeA; + candModeList[1] = candIntraPredModeB; + + if (candIntraPredModeA != INTRA_PLANAR && + candIntraPredModeB != INTRA_PLANAR) { + candModeList[2] = INTRA_PLANAR; + } + else if (candIntraPredModeA != INTRA_DC && + candIntraPredModeB != INTRA_DC) { + candModeList[2] = INTRA_DC; + } + else { + candModeList[2] = INTRA_ANGULAR_26; + } + } + + /* + printf("candModeList: %d %d %d\n", + candModeList[0], + candModeList[1], + candModeList[2] + ); + */ +} + + +void fillIntraPredModeCandidates(enum IntraPredMode 
candModeList[3], int x,int y, int PUidx, + bool availableA, // left + bool availableB, // top + const de265_image* img) +{ + const seq_parameter_set* sps = &img->get_sps(); + + // block on left side + + enum IntraPredMode candIntraPredModeA, candIntraPredModeB; + if (availableA==false) { + candIntraPredModeA=INTRA_DC; + } + else if (img->get_pred_mode(x-1,y) != MODE_INTRA || + img->get_pcm_flag (x-1,y)) { + candIntraPredModeA=INTRA_DC; + } + else { + candIntraPredModeA = img->get_IntraPredMode_atIndex(PUidx-1); + } + + // block above + + if (availableB==false) { + candIntraPredModeB=INTRA_DC; + } + else if (img->get_pred_mode(x,y-1) != MODE_INTRA || + img->get_pcm_flag (x,y-1)) { + candIntraPredModeB=INTRA_DC; + } + else if (y-1 < ((y >> sps->Log2CtbSizeY) << sps->Log2CtbSizeY)) { + candIntraPredModeB=INTRA_DC; + } + else { + candIntraPredModeB = img->get_IntraPredMode_atIndex(PUidx-sps->PicWidthInMinPUs); + } + + + logtrace(LogSlice,"%d;%d candA:%d / candB:%d\n", x,y, + availableA ? candIntraPredModeA : -999, + availableB ? 
candIntraPredModeB : -999); + + + fillIntraPredModeCandidates(candModeList, + candIntraPredModeA, + candIntraPredModeB); +} + + +int find_intra_pred_mode(enum IntraPredMode mode, + enum IntraPredMode candModeList[3]) +{ + // check whether the mode is in the candidate list + + for (int i=0;i<3;i++) { + if (candModeList[i] == mode) { + return i; + } + } + + // sort candModeList + + if (candModeList[0] > candModeList[1]) { + std::swap(candModeList[0],candModeList[1]); + } + if (candModeList[0] > candModeList[2]) { + std::swap(candModeList[0],candModeList[2]); + } + if (candModeList[1] > candModeList[2]) { + std::swap(candModeList[1],candModeList[2]); + } + + // skip modes already in the candidate list + + int intraMode = mode; + + for (int i=2;i>=0;i--) { + if (intraMode >= candModeList[i]) { intraMode--; } + } + + return -intraMode-1; +} + + +void list_chroma_pred_candidates(enum IntraPredMode chroma_mode[5], + enum IntraPredMode luma_mode) +{ + enum IntraPredMode chroma_cand[5]; + chroma_cand[0] = INTRA_PLANAR; + chroma_cand[1] = INTRA_ANGULAR_26; + chroma_cand[2] = INTRA_ANGULAR_10; + chroma_cand[3] = INTRA_DC; + chroma_cand[4] = luma_mode; + + switch (luma_mode) { + case INTRA_PLANAR: chroma_cand[0] = INTRA_ANGULAR_34; break; + case INTRA_ANGULAR_26: chroma_cand[1] = INTRA_ANGULAR_34; break; + case INTRA_ANGULAR_10: chroma_cand[2] = INTRA_ANGULAR_34; break; + case INTRA_DC: chroma_cand[3] = INTRA_ANGULAR_34; break; + default: + // use defaults from above + break; + } +} + + +int get_intra_scan_idx(int log2TrafoSize, enum IntraPredMode intraPredMode, int cIdx, + const seq_parameter_set* sps) +{ + if (log2TrafoSize==2 || + (log2TrafoSize==3 && (cIdx==0 || + sps->ChromaArrayType==CHROMA_444))) { + /**/ if (intraPredMode >= 6 && intraPredMode <= 14) return 2; + else if (intraPredMode >= 22 && intraPredMode <= 30) return 1; + else return 0; + } + else { return 0; } +} + + +int get_intra_scan_idx_luma(int log2TrafoSize, enum IntraPredMode intraPredMode) +{ + if 
(log2TrafoSize==2 || log2TrafoSize==3) { + /**/ if (intraPredMode >= 6 && intraPredMode <= 14) return 2; + else if (intraPredMode >= 22 && intraPredMode <= 30) return 1; + else return 0; + } + else { return 0; } +} + +int get_intra_scan_idx_chroma(int log2TrafoSize, enum IntraPredMode intraPredMode) +{ + if (log2TrafoSize==1 || log2TrafoSize==2) { + /**/ if (intraPredMode >= 6 && intraPredMode <= 14) return 2; + else if (intraPredMode >= 22 && intraPredMode <= 30) return 1; + else return 0; + } + else { return 0; } +} + + +enum IntraPredMode lumaPredMode_to_chromaPredMode(enum IntraPredMode luma, + enum IntraChromaPredMode chroma) +{ + switch (chroma) { + case INTRA_CHROMA_LIKE_LUMA: + return luma; + + case INTRA_CHROMA_PLANAR_OR_34: + if (luma==INTRA_PLANAR) return INTRA_ANGULAR_34; + else return INTRA_PLANAR; + + case INTRA_CHROMA_ANGULAR_26_OR_34: + if (luma==INTRA_ANGULAR_26) return INTRA_ANGULAR_34; + else return INTRA_ANGULAR_26; + + case INTRA_CHROMA_ANGULAR_10_OR_34: + if (luma==INTRA_ANGULAR_10) return INTRA_ANGULAR_34; + else return INTRA_ANGULAR_10; + + case INTRA_CHROMA_DC_OR_34: + if (luma==INTRA_DC) return INTRA_ANGULAR_34; + else return INTRA_DC; + } + + + assert(false); + return INTRA_DC; +} + + + + +// (8.4.4.2.2) +template +void fill_border_samples(de265_image* img, + int xB,int yB, // in component specific resolution + int nT, int cIdx, + pixel_t* out_border) +{ + intra_border_computer c; + c.init(out_border, img, nT, cIdx, xB, yB); + c.preproc(); + c.fill_from_image(); + c.reference_sample_substitution(); +} + + +const int intraPredAngle_table[1+34] = + { 0, 0,32,26,21,17,13, 9, 5, 2, 0,-2,-5,-9,-13,-17,-21,-26, + -32,-26,-21,-17,-13,-9,-5,-2,0,2,5,9,13,17,21,26,32 }; + +const int invAngle_table[25-10] = + { -4096,-1638,-910,-630,-482,-390,-315,-256, + -315,-390,-482,-630,-910,-1638,-4096 }; + + +template +void decode_intra_prediction_internal(de265_image* img, + int xB0,int yB0, + enum IntraPredMode intraPredMode, + pixel_t* dst, int dstStride, 
+ int nT, int cIdx) +{ + pixel_t border_pixels_mem[4*MAX_INTRA_PRED_BLOCK_SIZE+1]; + pixel_t* border_pixels = &border_pixels_mem[2*MAX_INTRA_PRED_BLOCK_SIZE]; + + fill_border_samples(img, xB0,yB0, nT, cIdx, border_pixels); + + if (img->get_sps().range_extension.intra_smoothing_disabled_flag == 0 && + (cIdx==0 || img->get_sps().ChromaArrayType==CHROMA_444)) + { + intra_prediction_sample_filtering(img->get_sps(), border_pixels, nT, cIdx, intraPredMode); + } + + + switch (intraPredMode) { + case INTRA_PLANAR: + intra_prediction_planar(dst,dstStride, nT,cIdx, border_pixels); + break; + case INTRA_DC: + intra_prediction_DC(dst,dstStride, nT,cIdx, border_pixels); + break; + default: + { + int bit_depth = img->get_bit_depth(cIdx); + bool disableIntraBoundaryFilter = + (img->get_sps().range_extension.implicit_rdpcm_enabled_flag && + img->get_cu_transquant_bypass(xB0,yB0)); + + intra_prediction_angular(dst,dstStride, bit_depth,disableIntraBoundaryFilter, + xB0,yB0,intraPredMode,nT,cIdx, border_pixels); + } + break; + } +} + + +// (8.4.4.2.1) +void decode_intra_prediction(de265_image* img, + int xB0,int yB0, + enum IntraPredMode intraPredMode, + int nT, int cIdx) +{ + logtrace(LogIntraPred,"decode_intra_prediction xy0:%d/%d mode=%d nT=%d, cIdx=%d\n", + xB0,yB0, intraPredMode, nT,cIdx); + /* + printf("decode_intra_prediction xy0:%d/%d mode=%d nT=%d, cIdx=%d\n", + xB0,yB0, intraPredMode, nT,cIdx); + */ + + if (img->high_bit_depth(cIdx)) { + decode_intra_prediction_internal(img,xB0,yB0, intraPredMode, + img->get_image_plane_at_pos_NEW(cIdx,xB0,yB0), + img->get_image_stride(cIdx), + nT,cIdx); + } + else { + decode_intra_prediction_internal(img,xB0,yB0, intraPredMode, + img->get_image_plane_at_pos_NEW(cIdx,xB0,yB0), + img->get_image_stride(cIdx), + nT,cIdx); + } +} + + +// TODO: remove this +template <> void decode_intra_prediction(de265_image* img, + int xB0,int yB0, + enum IntraPredMode intraPredMode, + uint8_t* dst, int nT, int cIdx) +{ + 
decode_intra_prediction_internal(img,xB0,yB0, intraPredMode, + dst,nT, + nT,cIdx); +} + + +// TODO: remove this +template <> void decode_intra_prediction(de265_image* img, + int xB0,int yB0, + enum IntraPredMode intraPredMode, + uint16_t* dst, int nT, int cIdx) +{ + decode_intra_prediction_internal(img,xB0,yB0, intraPredMode, + dst,nT, + nT,cIdx); +} diff --git a/libde265/acceleration.h b/libde265/acceleration.h new file mode 100644 index 0000000..2f1148b --- /dev/null +++ b/libde265/acceleration.h @@ -0,0 +1,359 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#ifndef DE265_ACCELERATION_H +#define DE265_ACCELERATION_H + +#include +#include +#include + + +struct acceleration_functions +{ + void (*put_weighted_pred_avg_8)(uint8_t *_dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, + int width, int height); + + void (*put_unweighted_pred_8)(uint8_t *_dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height); + + void (*put_weighted_pred_8)(uint8_t *_dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height, + int w,int o,int log2WD); + void (*put_weighted_bipred_8)(uint8_t *_dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, + int width, int height, + int w1,int o1, int w2,int o2, int log2WD); + + + void (*put_weighted_pred_avg_16)(uint16_t *_dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, + int width, int height, int bit_depth); + + void (*put_unweighted_pred_16)(uint16_t *_dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height, int bit_depth); + + void (*put_weighted_pred_16)(uint16_t *_dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height, + int w,int o,int log2WD, int bit_depth); + void (*put_weighted_bipred_16)(uint16_t *_dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, + int width, int height, + int w1,int o1, int w2,int o2, int log2WD, int bit_depth); + + + void put_weighted_pred_avg(void *_dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, + int width, int height, int bit_depth) const; + + void put_unweighted_pred(void *_dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height, int bit_depth) const; + + void put_weighted_pred(void *_dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height, 
+ int w,int o,int log2WD, int bit_depth) const; + void put_weighted_bipred(void *_dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, + int width, int height, + int w1,int o1, int w2,int o2, int log2WD, int bit_depth) const; + + + + + void (*put_hevc_epel_8)(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, int width, int height, + int mx, int my, int16_t* mcbuffer); + void (*put_hevc_epel_h_8)(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth); + void (*put_hevc_epel_v_8)(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth); + void (*put_hevc_epel_hv_8)(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth); + + void (*put_hevc_qpel_8[4][4])(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, int width, int height, + int16_t* mcbuffer); + + + void (*put_hevc_epel_16)(int16_t *dst, ptrdiff_t dststride, + const uint16_t *src, ptrdiff_t srcstride, int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth); + void (*put_hevc_epel_h_16)(int16_t *dst, ptrdiff_t dststride, + const uint16_t *src, ptrdiff_t srcstride, int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth); + void (*put_hevc_epel_v_16)(int16_t *dst, ptrdiff_t dststride, + const uint16_t *src, ptrdiff_t srcstride, int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth); + void (*put_hevc_epel_hv_16)(int16_t *dst, ptrdiff_t dststride, + const uint16_t *src, ptrdiff_t srcstride, int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth); + + void (*put_hevc_qpel_16[4][4])(int16_t *dst, ptrdiff_t dststride, + const uint16_t *src, ptrdiff_t srcstride, int 
width, int height, + int16_t* mcbuffer, int bit_depth); + + + void put_hevc_epel(int16_t *dst, ptrdiff_t dststride, + const void *src, ptrdiff_t srcstride, int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth) const; + void put_hevc_epel_h(int16_t *dst, ptrdiff_t dststride, + const void *src, ptrdiff_t srcstride, int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth) const; + void put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride, + const void *src, ptrdiff_t srcstride, int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth) const; + void put_hevc_epel_hv(int16_t *dst, ptrdiff_t dststride, + const void *src, ptrdiff_t srcstride, int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth) const; + + void put_hevc_qpel(int16_t *dst, ptrdiff_t dststride, + const void *src, ptrdiff_t srcstride, int width, int height, + int16_t* mcbuffer, int dX,int dY, int bit_depth) const; + + + // --- inverse transforms --- + + void (*transform_bypass)(int32_t *residual, const int16_t *coeffs, int nT); + void (*transform_bypass_rdpcm_v)(int32_t *r, const int16_t *coeffs, int nT); + void (*transform_bypass_rdpcm_h)(int32_t *r, const int16_t *coeffs, int nT); + + // 8 bit + + void (*transform_skip_8)(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t _stride); // no transform + void (*transform_skip_rdpcm_v_8)(uint8_t *_dst, const int16_t *coeffs, int nT, ptrdiff_t _stride); + void (*transform_skip_rdpcm_h_8)(uint8_t *_dst, const int16_t *coeffs, int nT, ptrdiff_t _stride); + void (*transform_4x4_dst_add_8)(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); // iDST + void (*transform_add_8[4])(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); // iDCT + + // 9-16 bit + + void (*transform_skip_16)(uint16_t *_dst, const int16_t *coeffs, ptrdiff_t _stride, int bit_depth); // no transform + void (*transform_4x4_dst_add_16)(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth); // iDST + void 
(*transform_add_16[4])(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth); // iDCT + + + void (*rotate_coefficients)(int16_t *coeff, int nT); + + void (*transform_idst_4x4)(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits); + void (*transform_idct_4x4)(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits); + void (*transform_idct_8x8)(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits); + void (*transform_idct_16x16)(int32_t *dst,const int16_t *coeffs,int bdShift, int max_coeff_bits); + void (*transform_idct_32x32)(int32_t *dst,const int16_t *coeffs,int bdShift, int max_coeff_bits); + void (*add_residual_8)(uint8_t *dst, ptrdiff_t stride, const int32_t* r, int nT, int bit_depth); + void (*add_residual_16)(uint16_t *dst,ptrdiff_t stride,const int32_t* r, int nT, int bit_depth); + + template + void add_residual(pixel_t *dst, ptrdiff_t stride, const int32_t* r, int nT, int bit_depth) const; + + void (*rdpcm_v)(int32_t* residual, const int16_t* coeffs, int nT,int tsShift,int bdShift); + void (*rdpcm_h)(int32_t* residual, const int16_t* coeffs, int nT,int tsShift,int bdShift); + + void (*transform_skip_residual)(int32_t *residual, const int16_t *coeffs, int nT, + int tsShift,int bdShift); + + + template void transform_skip(pixel_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) const; + template void transform_skip_rdpcm_v(pixel_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride, int bit_depth) const; + template void transform_skip_rdpcm_h(pixel_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride, int bit_depth) const; + template void transform_4x4_dst_add(pixel_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) const; + template void transform_add(int sizeIdx, pixel_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) const; + + + + // --- forward transforms --- + + void (*fwd_transform_4x4_dst_8)(int16_t *coeffs, const int16_t* src, ptrdiff_t 
stride); // fDST + + // indexed with (log2TbSize-2) + void (*fwd_transform_8[4]) (int16_t *coeffs, const int16_t *src, ptrdiff_t stride); // fDCT + + + // forward Hadamard transform (without scaling factor) + // (4x4,8x8,16x16,32x32) indexed with (log2TbSize-2) + void (*hadamard_transform_8[4]) (int16_t *coeffs, const int16_t *src, ptrdiff_t stride); +}; + + +/* +template <> inline void acceleration_functions::put_weighted_pred_avg(uint8_t *_dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, + int width, int height, int bit_depth) { put_weighted_pred_avg_8(_dst,dststride,src1,src2,srcstride,width,height); } +template <> inline void acceleration_functions::put_weighted_pred_avg(uint16_t *_dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, + int width, int height, int bit_depth) { put_weighted_pred_avg_16(_dst,dststride,src1,src2, + srcstride,width,height,bit_depth); } + +template <> inline void acceleration_functions::put_unweighted_pred(uint8_t *_dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height, int bit_depth) { put_unweighted_pred_8(_dst,dststride,src,srcstride,width,height); } +template <> inline void acceleration_functions::put_unweighted_pred(uint16_t *_dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height, int bit_depth) { put_unweighted_pred_16(_dst,dststride,src,srcstride,width,height,bit_depth); } + +template <> inline void acceleration_functions::put_weighted_pred(uint8_t *_dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height, + int w,int o,int log2WD, int bit_depth) { put_weighted_pred_8(_dst,dststride,src,srcstride,width,height,w,o,log2WD); } +template <> inline void acceleration_functions::put_weighted_pred(uint16_t *_dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height, + int w,int o,int log2WD, int bit_depth) { 
put_weighted_pred_16(_dst,dststride,src,srcstride,width,height,w,o,log2WD,bit_depth); } + +template <> inline void acceleration_functions::put_weighted_bipred(uint8_t *_dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, + int width, int height, + int w1,int o1, int w2,int o2, int log2WD, int bit_depth) { put_weighted_bipred_8(_dst,dststride,src1,src2,srcstride, + width,height, + w1,o1,w2,o2,log2WD); } +template <> inline void acceleration_functions::put_weighted_bipred(uint16_t *_dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, + int width, int height, + int w1,int o1, int w2,int o2, int log2WD, int bit_depth) { put_weighted_bipred_16(_dst,dststride,src1,src2,srcstride, + width,height, + w1,o1,w2,o2,log2WD,bit_depth); } +*/ + + +inline void acceleration_functions::put_weighted_pred_avg(void* _dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, + int width, int height, int bit_depth) const +{ + if (bit_depth <= 8) + put_weighted_pred_avg_8((uint8_t*)_dst,dststride,src1,src2,srcstride,width,height); + else + put_weighted_pred_avg_16((uint16_t*)_dst,dststride,src1,src2,srcstride,width,height,bit_depth); +} + + +inline void acceleration_functions::put_unweighted_pred(void* _dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height, int bit_depth) const +{ + if (bit_depth <= 8) + put_unweighted_pred_8((uint8_t*)_dst,dststride,src,srcstride,width,height); + else + put_unweighted_pred_16((uint16_t*)_dst,dststride,src,srcstride,width,height,bit_depth); +} + + +inline void acceleration_functions::put_weighted_pred(void* _dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height, + int w,int o,int log2WD, int bit_depth) const +{ + if (bit_depth <= 8) + put_weighted_pred_8((uint8_t*)_dst,dststride,src,srcstride,width,height,w,o,log2WD); + else + 
put_weighted_pred_16((uint16_t*)_dst,dststride,src,srcstride,width,height,w,o,log2WD,bit_depth); +} + + +inline void acceleration_functions::put_weighted_bipred(void* _dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, + int width, int height, + int w1,int o1, int w2,int o2, int log2WD, int bit_depth) const +{ + if (bit_depth <= 8) + put_weighted_bipred_8((uint8_t*)_dst,dststride,src1,src2,srcstride, width,height, w1,o1,w2,o2,log2WD); + else + put_weighted_bipred_16((uint16_t*)_dst,dststride,src1,src2,srcstride, width,height, w1,o1,w2,o2,log2WD,bit_depth); +} + + + +inline void acceleration_functions::put_hevc_epel(int16_t *dst, ptrdiff_t dststride, + const void *src, ptrdiff_t srcstride, int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth) const +{ + if (bit_depth <= 8) + put_hevc_epel_8(dst,dststride,(const uint8_t*)src,srcstride,width,height,mx,my,mcbuffer); + else + put_hevc_epel_16(dst,dststride,(const uint16_t*)src,srcstride,width,height,mx,my,mcbuffer, bit_depth); +} + +inline void acceleration_functions::put_hevc_epel_h(int16_t *dst, ptrdiff_t dststride, + const void *src, ptrdiff_t srcstride, int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth) const +{ + if (bit_depth <= 8) + put_hevc_epel_h_8(dst,dststride,(const uint8_t*)src,srcstride,width,height,mx,my,mcbuffer,bit_depth); + else + put_hevc_epel_h_16(dst,dststride,(const uint16_t*)src,srcstride,width,height,mx,my,mcbuffer,bit_depth); +} + +inline void acceleration_functions::put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride, + const void *src, ptrdiff_t srcstride, int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth) const +{ + if (bit_depth <= 8) + put_hevc_epel_v_8(dst,dststride,(const uint8_t*)src,srcstride,width,height,mx,my,mcbuffer,bit_depth); + else + put_hevc_epel_v_16(dst,dststride,(const uint16_t*)src,srcstride,width,height,mx,my,mcbuffer, bit_depth); +} + +inline void 
acceleration_functions::put_hevc_epel_hv(int16_t *dst, ptrdiff_t dststride, + const void *src, ptrdiff_t srcstride, int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth) const +{ + if (bit_depth <= 8) + put_hevc_epel_hv_8(dst,dststride,(const uint8_t*)src,srcstride,width,height,mx,my,mcbuffer,bit_depth); + else + put_hevc_epel_hv_16(dst,dststride,(const uint16_t*)src,srcstride,width,height,mx,my,mcbuffer, bit_depth); +} + +inline void acceleration_functions::put_hevc_qpel(int16_t *dst, ptrdiff_t dststride, + const void *src, ptrdiff_t srcstride, int width, int height, + int16_t* mcbuffer, int dX,int dY, int bit_depth) const +{ + if (bit_depth <= 8) + put_hevc_qpel_8[dX][dY](dst,dststride,(const uint8_t*)src,srcstride,width,height,mcbuffer); + else + put_hevc_qpel_16[dX][dY](dst,dststride,(const uint16_t*)src,srcstride,width,height,mcbuffer, bit_depth); +} + +template <> inline void acceleration_functions::transform_skip(uint8_t *dst, const int16_t *coeffs,ptrdiff_t stride, int bit_depth) const { transform_skip_8(dst,coeffs,stride); } +template <> inline void acceleration_functions::transform_skip(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) const { transform_skip_16(dst,coeffs,stride, bit_depth); } + +template <> inline void acceleration_functions::transform_skip_rdpcm_v(uint8_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride, int bit_depth) const { assert(bit_depth==8); transform_skip_rdpcm_v_8(dst,coeffs,nT,stride); } +template <> inline void acceleration_functions::transform_skip_rdpcm_h(uint8_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride, int bit_depth) const { assert(bit_depth==8); transform_skip_rdpcm_h_8(dst,coeffs,nT,stride); } +template <> inline void acceleration_functions::transform_skip_rdpcm_v(uint16_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride, int bit_depth) const { assert(false); /*transform_skip_rdpcm_v_8(dst,coeffs,nT,stride);*/ } +template <> inline void 
acceleration_functions::transform_skip_rdpcm_h(uint16_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride, int bit_depth) const { assert(false); /*transform_skip_rdpcm_h_8(dst,coeffs,nT,stride);*/ } + + +template <> inline void acceleration_functions::transform_4x4_dst_add(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride,int bit_depth) const { transform_4x4_dst_add_8(dst,coeffs,stride); } +template <> inline void acceleration_functions::transform_4x4_dst_add(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride,int bit_depth) const { transform_4x4_dst_add_16(dst,coeffs,stride,bit_depth); } + +template <> inline void acceleration_functions::transform_add(int sizeIdx, uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) const { transform_add_8[sizeIdx](dst,coeffs,stride); } +template <> inline void acceleration_functions::transform_add(int sizeIdx, uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) const { transform_add_16[sizeIdx](dst,coeffs,stride,bit_depth); } + +template <> inline void acceleration_functions::add_residual(uint8_t *dst, ptrdiff_t stride, const int32_t* r, int nT, int bit_depth) const { add_residual_8(dst,stride,r,nT,bit_depth); } +template <> inline void acceleration_functions::add_residual(uint16_t *dst, ptrdiff_t stride, const int32_t* r, int nT, int bit_depth) const { add_residual_16(dst,stride,r,nT,bit_depth); } + +#endif diff --git a/libde265/alloc_pool.h b/libde265/alloc_pool.h new file mode 100644 index 0000000..41dd4a4 --- /dev/null +++ b/libde265/alloc_pool.h @@ -0,0 +1,61 @@ +/* + * H.265 video codec. + * Copyright (c) 2014 struktur AG, Dirk Farin + * + * Authors: Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. 
+ * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#ifndef ALLOC_POOL_H +#define ALLOC_POOL_H + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include +#ifdef HAVE_STDINT_H +#include +#endif +#ifdef HAVE_CSTDINT +#include +#endif + + +class alloc_pool +{ + public: + alloc_pool(size_t objSize, int poolSize=1000, bool grow=true); + ~alloc_pool(); + + void* new_obj(const size_t size); + void delete_obj(void*); + void purge(); + + private: + size_t mObjSize; + int mPoolSize; + bool mGrow; + + std::vector m_memBlocks; + std::vector m_freeList; + + void add_memory_block(); +}; + +#endif diff --git a/libde265/bitstream.h b/libde265/bitstream.h new file mode 100644 index 0000000..3100b43 --- /dev/null +++ b/libde265/bitstream.h @@ -0,0 +1,63 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#ifndef DE265_BITSTREAM_H +#define DE265_BITSTREAM_H + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#ifdef HAVE_STDBOOL_H +#include +#endif +#include + + +#define MAX_UVLC_LEADING_ZEROS 20 +#define UVLC_ERROR -99999 + + +typedef struct { + uint8_t* data; + int bytes_remaining; + + uint64_t nextbits; // left-aligned bits + int nextbits_cnt; +} bitreader; + +void bitreader_init(bitreader*, unsigned char* buffer, int len); +void bitreader_refill(bitreader*); // refill to at least 56+1 bits +int next_bit(bitreader*); +int next_bit_norefill(bitreader*); +int get_bits(bitreader*, int n); +int get_bits_fast(bitreader*, int n); +int peek_bits(bitreader*, int n); +void skip_bits(bitreader*, int n); +void skip_bits_fast(bitreader*, int n); +void skip_to_byte_boundary(bitreader*); +void prepare_for_CABAC(bitreader*); +int get_uvlc(bitreader*); // may return UVLC_ERROR +int get_svlc(bitreader*); // may return UVLC_ERROR + +bool check_rbsp_trailing_bits(bitreader*); // return true if remaining filler bits are all zero + +#endif diff --git a/libde265/cabac.h b/libde265/cabac.h new file mode 100644 index 0000000..e28aeeb --- /dev/null +++ b/libde265/cabac.h @@ -0,0 +1,211 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#ifndef DE265_CABAC_H +#define DE265_CABAC_H + +#include +#include "contextmodel.h" + + +typedef struct { + uint8_t* bitstream_start; + uint8_t* bitstream_curr; + uint8_t* bitstream_end; + + uint32_t range; + uint32_t value; + int16_t bits_needed; +} CABAC_decoder; + + +void init_CABAC_decoder(CABAC_decoder* decoder, uint8_t* bitstream, int length); +void init_CABAC_decoder_2(CABAC_decoder* decoder); +int decode_CABAC_bit(CABAC_decoder* decoder, context_model* model); +int decode_CABAC_TU(CABAC_decoder* decoder, int cMax, context_model* model); +int decode_CABAC_term_bit(CABAC_decoder* decoder); + +int decode_CABAC_bypass(CABAC_decoder* decoder); +int decode_CABAC_TU_bypass(CABAC_decoder* decoder, int cMax); +int decode_CABAC_FL_bypass(CABAC_decoder* decoder, int nBits); +int decode_CABAC_TR_bypass(CABAC_decoder* decoder, int cRiceParam, int cTRMax); +int decode_CABAC_EGk_bypass(CABAC_decoder* decoder, int k); + + +// --------------------------------------------------------------------------- + +class CABAC_encoder +{ +public: + CABAC_encoder() : mCtxModels(NULL) { } + virtual ~CABAC_encoder() { } + + virtual int size() const = 0; + virtual void reset() = 0; + + // --- VLC --- + + virtual void write_bits(uint32_t bits,int n) = 0; + virtual void write_bit(int bit) { write_bits(bit,1); } + virtual void write_uvlc(int value); + virtual void write_svlc(int value); + virtual void write_startcode() = 0; + virtual void skip_bits(int nBits) = 0; + + virtual void add_trailing_bits(); + virtual int number_free_bits_in_byte() const = 0; + + // output all remaining bits and fill with zeros to next byte boundary + virtual void flush_VLC() { } + + + // --- CABAC --- + + void set_context_models(context_model_table* models) { mCtxModels=models; } + + virtual void init_CABAC() { } + virtual void write_CABAC_bit(int modelIdx, int bit) = 0; + virtual void write_CABAC_bypass(int bit) = 0; + virtual void write_CABAC_TU_bypass(int value, int cMax); + virtual void 
write_CABAC_FL_bypass(int value, int nBits); + virtual void write_CABAC_term_bit(int bit) = 0; + virtual void flush_CABAC() { } + + void write_CABAC_EGk(int absolute_symbol, int k); // absolute_symbol >= 0 + + virtual bool modifies_context() const = 0; + + float RDBits_for_CABAC_bin(int modelIdx, int bit); + + protected: + context_model_table* mCtxModels; +}; + + +class CABAC_encoder_bitstream : public CABAC_encoder +{ +public: + CABAC_encoder_bitstream(); + ~CABAC_encoder_bitstream(); + + virtual void reset(); + + virtual int size() const { return data_size; } + uint8_t* data() const { return data_mem; } + + // --- VLC --- + + virtual void write_bits(uint32_t bits,int n); + virtual void write_startcode(); + virtual void skip_bits(int nBits); + + virtual int number_free_bits_in_byte() const; + + // output all remaining bits and fill with zeros to next byte boundary + virtual void flush_VLC(); + + + // --- CABAC --- + + virtual void init_CABAC(); + virtual void write_CABAC_bit(int modelIdx, int bit); + virtual void write_CABAC_bypass(int bit); + virtual void write_CABAC_term_bit(int bit); + virtual void flush_CABAC(); + + virtual bool modifies_context() const { return true; } + +private: + // data buffer + + uint8_t* data_mem; + uint32_t data_capacity; + uint32_t data_size; + char state; // for inserting emulation-prevention bytes + + // VLC + + uint32_t vlc_buffer; + uint32_t vlc_buffer_len; + + + // CABAC + + uint32_t range; + uint32_t low; + int8_t bits_left; + uint8_t buffered_byte; + uint16_t num_buffered_bytes; + + + void check_size_and_resize(int nBytes); + void testAndWriteOut(); + void write_out(); + void append_byte(int byte); +}; + + +class CABAC_encoder_estim : public CABAC_encoder +{ +public: + CABAC_encoder_estim() : mFracBits(0) { } + + virtual void reset() { mFracBits=0; } + + virtual int size() const { return mFracBits>>(15+3); } + + uint64_t getFracBits() const { return mFracBits; } + float getRDBits() const { return mFracBits / float(1<<15); } + + 
// --- VLC --- + + virtual void write_bits(uint32_t bits,int n) { mFracBits += n<<15; } + virtual void write_bit(int bit) { mFracBits+=1<<15; } + virtual void write_startcode() { mFracBits += (1<<15)*8*3; } + virtual void skip_bits(int nBits) { mFracBits += nBits<<15; } + virtual int number_free_bits_in_byte() const { return 0; } // TODO, good enough for now + + // --- CABAC --- + + virtual void write_CABAC_bit(int modelIdx, int bit); + virtual void write_CABAC_bypass(int bit) { + mFracBits += 0x8000; + } + virtual void write_CABAC_FL_bypass(int value, int nBits) { + mFracBits += nBits<<15; + } + virtual void write_CABAC_term_bit(int bit) { /* not implemented (not needed) */ } + + virtual bool modifies_context() const { return true; } + + protected: + uint64_t mFracBits; +}; + + +class CABAC_encoder_estim_constant : public CABAC_encoder_estim +{ + public: + void write_CABAC_bit(int modelIdx, int bit); + + virtual bool modifies_context() const { return false; } +}; + +#endif diff --git a/libde265/configparam.h b/libde265/configparam.h new file mode 100644 index 0000000..58b1daa --- /dev/null +++ b/libde265/configparam.h @@ -0,0 +1,401 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * Authors: struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#ifndef CONFIG_PARAM_H +#define CONFIG_PARAM_H + +#include "en265.h" +#include "util.h" + +#include +#include +#include +#include +#include + + +/* Notes: probably best to keep cmd-line-options here. So it will be: + - automatically consistent even when having different combinations of algorithms + - no other place to edit + - if needed, one can still override it at another place + */ + +// TODO: set a stack of default prefixes in config_parameters, such that all options added +// will receive this namespace prefix. + +// TODO: add the possibility to remove long options again, i.e., not use the default id name +class option_base +{ + public: + option_base() : mShortOption(0), mLongOption(NULL) { } + option_base(const char* name) : mIDName(name), mShortOption(0), mLongOption(NULL) { } + virtual ~option_base() { } + + + // --- option identifier --- + + void set_ID(const char* name) { mIDName=name; } + void add_namespace_prefix(std::string prefix) { mPrefix = prefix + ":" + mPrefix; } + + std::string get_name() const { return mPrefix + mIDName; } + + + // --- description --- + + void set_description(std::string descr) { mDescription = descr; } + std::string get_description() const { return mDescription; } + bool has_description() const { return !mDescription.empty(); } + + + // --- value --- + + virtual bool is_defined() const = 0; + bool is_undefined() const { return !is_defined(); } + + virtual bool has_default() const = 0; + + + // --- command line options ---- + + void set_cmd_line_options(const char* long_option, char short_option = 0) + { + mShortOption = short_option; + mLongOption = long_option; + } + + void set_short_option(char short_option) { mShortOption=short_option; } + + void unsetCmdLineOption() + { + mShortOption = 0; + mLongOption = NULL; + } + + bool hasShortOption() const { return mShortOption!=0; } + char getShortOption() const { return mShortOption; } + bool hasLongOption() const { return true; } //mLongOption!=NULL; } + std::string 
getLongOption() const { return mLongOption ? std::string(mLongOption) : get_name(); } + + virtual LIBDE265_API bool processCmdLineArguments(char** argv, int* argc, int idx) { return false; } + + + + virtual std::string getTypeDescr() const = 0; + + virtual std::string get_default_string() const { return "N/A"; } + + private: + std::string mPrefix; + std::string mIDName; + + std::string mDescription; + + char mShortOption; + const char* mLongOption; +}; + + + +class option_bool : public option_base +{ +public: + option_bool() : value_set(false), default_set(false) { } + + operator bool() const { + assert(value_set || default_set); + return value_set ? value : default_value; + } + + virtual bool is_defined() const { return value_set || default_set; } + virtual bool has_default() const { return default_set; } + + void set_default(bool v) { default_value=v; default_set=true; } + virtual std::string get_default_string() const { return default_value ? "true":"false"; } + + virtual std::string getTypeDescr() const { return "(boolean)"; } + virtual LIBDE265_API bool processCmdLineArguments(char** argv, int* argc, int idx) { set(true); return true; } + + bool set(bool v) { value_set=true; value=v; return true; } + + private: + bool value_set; + bool value; + + bool default_set; + bool default_value; +}; + + +class option_string : public option_base +{ +public: + option_string() : value_set(false), default_set(false) { } + + const option_string& operator=(std::string v) { value=v; value_set=true; return *this; } + + operator std::string() const { return get(); } + std::string get() const { + assert(value_set || default_set); + return value_set ? 
value : default_value; + } + + virtual bool is_defined() const { return value_set || default_set; } + virtual bool has_default() const { return default_set; } + + void set_default(std::string v) { default_value=v; default_set=true; } + virtual LIBDE265_API std::string get_default_string() const { return default_value; } + + virtual LIBDE265_API std::string getTypeDescr() const { return "(string)"; } + virtual LIBDE265_API bool processCmdLineArguments(char** argv, int* argc, int idx); + + bool set(std::string v) { value_set=true; value=v; return true; } + + private: + bool value_set; + std::string value; + + bool default_set; + std::string default_value; +}; + + +class option_int : public option_base /* integer option; limits and a list of valid values can be configured through the setters below */ +{ +public: + option_int() : value_set(false), default_set(false), + have_low_limit(false), have_high_limit(false) { } + + void set_minimum(int mini) { have_low_limit =true; low_limit =mini; } + void set_maximum(int maxi) { have_high_limit=true; high_limit=maxi; } + void set_range(int mini,int maxi); + void set_valid_values(const std::vector<int>& v) { valid_values_set = v; } + + const option_int& operator=(int v) { value=v; value_set=true; return *this; } + + int operator() () const { + assert(value_set || default_set); + return value_set ? 
value : default_value; + } + operator int() const { return operator()(); } + + virtual bool is_defined() const { return value_set || default_set; } + virtual bool has_default() const { return default_set; } + + void set_default(int v) { default_value=v; default_set=true; } + virtual LIBDE265_API std::string get_default_string() const; + + virtual LIBDE265_API std::string getTypeDescr() const; + virtual LIBDE265_API bool processCmdLineArguments(char** argv, int* argc, int idx); + + bool set(int v) { + if (is_valid(v)) { value_set=true; value=v; return true; } + else { return false; } + } + + private: + bool value_set; + int value; + + bool default_set; + int default_value; + + bool have_low_limit, have_high_limit; + int low_limit, high_limit; + + std::vector<int> valid_values_set; + + bool is_valid(int v) const; +}; + + + +class choice_option_base : public option_base /* common base of choice options; owns choice_string_table, which is freed in the destructor and by invalidate_choices_string_table() */ +{ +public: + choice_option_base() : choice_string_table(NULL) { } + ~choice_option_base() { delete[] choice_string_table; } + + bool set(std::string v) { return set_value(v); } + virtual bool set_value(const std::string& val) = 0; + virtual std::vector<std::string> get_choice_names() const = 0; + + virtual std::string getTypeDescr() const; + virtual LIBDE265_API bool processCmdLineArguments(char** argv, int* argc, int idx); + + const char** get_choices_string_table() const; + + protected: + void invalidate_choices_string_table() { + delete[] choice_string_table; + choice_string_table = NULL; + } + + private: + mutable char* choice_string_table; +}; + + +template <class T> class choice_option : public choice_option_base /* option whose value is one of the (name, ID) pairs registered with add_choice() */ +{ + public: + choice_option() : default_set(false), value_set(false), validValue(false) { } /* validValue starts out false so isValidValue() is well-defined before set_value()/setID() */ + + // --- initialization --- + + void add_choice(const std::string& s, T id, bool default_value=false) { + choices.push_back( std::make_pair(s,id) ); + if (default_value) { + defaultID = id; + defaultValue = s; + default_set = true; + } + + invalidate_choices_string_table(); + } + + void set_default(T val) { +#ifdef FOR_LOOP_AUTO_SUPPORT + 
FOR_LOOP(auto, c, choices) { +#else + for (typename std::vector< std::pair<std::string,T> >::const_iterator it=choices.begin(); it!=choices.end(); ++it) { + const std::pair<std::string,T> & c = *it; +#endif + if (c.second == val) { + defaultID = val; + defaultValue = c.first; + default_set = true; + return; + } + } + + assert(false); // value does not exist + } + + + // --- usage --- + + bool set_value(const std::string& val) // returns false if it is not a valid option + { + value_set = true; + selectedValue=val; + + validValue = false; + +#ifdef FOR_LOOP_AUTO_SUPPORT + FOR_LOOP(auto, c, choices) { +#else + for (typename std::vector< std::pair<std::string,T> >::const_iterator it=choices.begin(); it!=choices.end(); ++it) { + const std::pair<std::string,T> & c = *it; +#endif + if (val == c.first) { + selectedID = c.second; + validValue = true; + } + } + + return validValue; + } + + bool isValidValue() const { return validValue; } + + const std::string& getValue() const { + assert(value_set || default_set); + return value_set ? selectedValue : defaultValue; + } + void setID(T id) { selectedID=id; validValue=true; } + const T getID() const { return value_set ? 
selectedID : defaultID; } + + virtual bool is_defined() const { return value_set || default_set; } + virtual bool has_default() const { return default_set; } + + std::vector<std::string> get_choice_names() const + { + std::vector<std::string> names; +#ifdef FOR_LOOP_AUTO_SUPPORT + FOR_LOOP(auto, p, choices) { +#else + for (typename std::vector< std::pair<std::string,T> >::const_iterator it=choices.begin(); it!=choices.end(); ++it) { + const std::pair<std::string,T> & p = *it; +#endif + names.push_back(p.first); + } + return names; + } + + std::string get_default_string() const { return defaultValue; } + + T operator() () const { return (T)getID(); } + + private: + std::vector< std::pair<std::string,T> > choices; + + bool default_set; + std::string defaultValue; + T defaultID; + + bool value_set; + std::string selectedValue; + T selectedID; + + bool validValue; +}; + + + + +class config_parameters /* registry of option_base pointers (mOptions) with command-line parsing and by-name setters */ +{ + public: + config_parameters() : param_string_table(NULL) { } + ~config_parameters() { delete[] param_string_table; } + + void LIBDE265_API add_option(option_base* o); + + void LIBDE265_API print_params() const; + bool LIBDE265_API parse_command_line_params(int* argc, char** argv, int* first_idx=NULL, + bool ignore_unknown_options=false); + + + // --- connection to C API --- + + std::vector<std::string> get_parameter_IDs() const; + enum en265_parameter_type get_parameter_type(const char* param) const; + + std::vector<std::string> get_parameter_choices(const char* param) const; + + bool set_bool(const char* param, bool value); + bool set_int(const char* param, int value); + bool set_string(const char* param, const char* value); + bool set_choice(const char* param, const char* value); + + const char** get_parameter_string_table() const; + const char** get_parameter_choices_table(const char* param) const; + + private: + std::vector<option_base*> mOptions; + + option_base* find_option(const char* param) const; + + mutable char* param_string_table; +}; + +#endif diff --git a/libde265/contextmodel.h b/libde265/contextmodel.h new file mode 100644 index 0000000..cde83e1 --- /dev/null 
+++ b/libde265/contextmodel.h @@ -0,0 +1,130 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * Authors: struktur AG, Dirk Farin + * Min Chen + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#ifndef DE265_CONTEXTMODEL_H +#define DE265_CONTEXTMODEL_H + +#include "libde265/cabac.h" +#include "libde265/de265.h" + +#include +#include + + +struct context_model { + uint8_t MPSbit : 1; + uint8_t state : 7; + + bool operator==(context_model b) const { return state==b.state && MPSbit==b.MPSbit; } + bool operator!=(context_model b) const { return state!=b.state || MPSbit!=b.MPSbit; } +}; + + +enum context_model_index { + // SAO + CONTEXT_MODEL_SAO_MERGE_FLAG = 0, + CONTEXT_MODEL_SAO_TYPE_IDX = CONTEXT_MODEL_SAO_MERGE_FLAG +1, + + // CB-tree + CONTEXT_MODEL_SPLIT_CU_FLAG = CONTEXT_MODEL_SAO_TYPE_IDX + 1, + CONTEXT_MODEL_CU_SKIP_FLAG = CONTEXT_MODEL_SPLIT_CU_FLAG + 3, + + // intra-prediction + CONTEXT_MODEL_PART_MODE = CONTEXT_MODEL_CU_SKIP_FLAG + 3, + CONTEXT_MODEL_PREV_INTRA_LUMA_PRED_FLAG = CONTEXT_MODEL_PART_MODE + 4, + CONTEXT_MODEL_INTRA_CHROMA_PRED_MODE = CONTEXT_MODEL_PREV_INTRA_LUMA_PRED_FLAG + 1, + + // transform-tree + CONTEXT_MODEL_CBF_LUMA = CONTEXT_MODEL_INTRA_CHROMA_PRED_MODE + 1, + CONTEXT_MODEL_CBF_CHROMA = CONTEXT_MODEL_CBF_LUMA + 2, + CONTEXT_MODEL_SPLIT_TRANSFORM_FLAG = CONTEXT_MODEL_CBF_CHROMA 
+ 4, + CONTEXT_MODEL_CU_CHROMA_QP_OFFSET_FLAG = CONTEXT_MODEL_SPLIT_TRANSFORM_FLAG + 3, + CONTEXT_MODEL_CU_CHROMA_QP_OFFSET_IDX = CONTEXT_MODEL_CU_CHROMA_QP_OFFSET_FLAG + 1, + + // residual + CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_X_PREFIX = CONTEXT_MODEL_CU_CHROMA_QP_OFFSET_IDX + 1, + CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_Y_PREFIX = CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_X_PREFIX + 18, + CONTEXT_MODEL_CODED_SUB_BLOCK_FLAG = CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_Y_PREFIX + 18, + CONTEXT_MODEL_SIGNIFICANT_COEFF_FLAG = CONTEXT_MODEL_CODED_SUB_BLOCK_FLAG + 4, + CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER1_FLAG = CONTEXT_MODEL_SIGNIFICANT_COEFF_FLAG + 42+2, + CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER2_FLAG = CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER1_FLAG + 24, + + CONTEXT_MODEL_CU_QP_DELTA_ABS = CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER2_FLAG + 6, + CONTEXT_MODEL_TRANSFORM_SKIP_FLAG = CONTEXT_MODEL_CU_QP_DELTA_ABS + 2, + CONTEXT_MODEL_RDPCM_FLAG = CONTEXT_MODEL_TRANSFORM_SKIP_FLAG + 2, + CONTEXT_MODEL_RDPCM_DIR = CONTEXT_MODEL_RDPCM_FLAG + 2, + + // motion + CONTEXT_MODEL_MERGE_FLAG = CONTEXT_MODEL_RDPCM_DIR + 2, + CONTEXT_MODEL_MERGE_IDX = CONTEXT_MODEL_MERGE_FLAG + 1, + CONTEXT_MODEL_PRED_MODE_FLAG = CONTEXT_MODEL_MERGE_IDX + 1, + CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG = CONTEXT_MODEL_PRED_MODE_FLAG + 1, + CONTEXT_MODEL_MVP_LX_FLAG = CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG + 2, + CONTEXT_MODEL_RQT_ROOT_CBF = CONTEXT_MODEL_MVP_LX_FLAG + 1, + CONTEXT_MODEL_REF_IDX_LX = CONTEXT_MODEL_RQT_ROOT_CBF + 1, + CONTEXT_MODEL_INTER_PRED_IDC = CONTEXT_MODEL_REF_IDX_LX + 2, + CONTEXT_MODEL_CU_TRANSQUANT_BYPASS_FLAG = CONTEXT_MODEL_INTER_PRED_IDC + 5, + CONTEXT_MODEL_LOG2_RES_SCALE_ABS_PLUS1 = CONTEXT_MODEL_CU_TRANSQUANT_BYPASS_FLAG + 1, + CONTEXT_MODEL_RES_SCALE_SIGN_FLAG = CONTEXT_MODEL_LOG2_RES_SCALE_ABS_PLUS1 + 8, + CONTEXT_MODEL_TABLE_LENGTH = CONTEXT_MODEL_RES_SCALE_SIGN_FLAG + 2 +}; + + + +void initialize_CABAC_models(context_model 
context_model_table[CONTEXT_MODEL_TABLE_LENGTH], + int initType, + int QPY); + + +class context_model_table /* table of CABAC context models; 'model' and 'refcnt' are held by pointer — presumably shared between copies until decouple() is called, TODO confirm in contextmodel.cc */ +{ + public: + context_model_table(); + context_model_table(const context_model_table&); + ~context_model_table(); + + void init(int initType, int QPY); + void release(); + void decouple(); + context_model_table transfer(); + context_model_table copy() const { context_model_table t=*this; t.decouple(); return t; } /* copies this table, then calls decouple() on the copy before returning it */ + + bool empty() const { return refcnt != NULL; } /* NOTE(review): returns true when refcnt is non-NULL, which reads inverted relative to the name 'empty' — confirm against callers before changing */ + + context_model& operator[](int i) { return model[i]; } /* no bounds check on i */ + + context_model_table& operator=(const context_model_table&); + + bool operator==(const context_model_table&) const; + + std::string debug_dump() const; + + private: + void decouple_or_alloc_with_empty_data(); + + context_model* model; // [CONTEXT_MODEL_TABLE_LENGTH] + int* refcnt; +}; + + +#endif diff --git a/libde265/de265-version.h b/libde265/de265-version.h new file mode 100644 index 0000000..0e22cbe --- /dev/null +++ b/libde265/de265-version.h @@ -0,0 +1,36 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +/* de265-version.h + * + * This file was generated by autoconf when libde265 was built. + * + * DO NOT EDIT THIS FILE. 
+ */ +#ifndef LIBDE265_VERSION_H +#define LIBDE265_VERSION_H + +/* Numeric representation of the version */ +#define LIBDE265_NUMERIC_VERSION 0x01000500 + +/* Version string */ +#define LIBDE265_VERSION "1.0.5" + +#endif diff --git a/libde265/de265.h b/libde265/de265.h new file mode 100644 index 0000000..6481d8f --- /dev/null +++ b/libde265/de265.h @@ -0,0 +1,437 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+*/ + + +#ifndef DE265_H +#define DE265_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +//#define inline static __inline + + +#ifndef __STDC_LIMIT_MACROS +#define __STDC_LIMIT_MACROS 1 +#endif +#include + +#if defined(_MSC_VER) && !defined(LIBDE265_STATIC_BUILD) + #ifdef LIBDE265_EXPORTS + #define LIBDE265_API __declspec(dllexport) + #else + #define LIBDE265_API __declspec(dllimport) + #endif +#elif HAVE_VISIBILITY + #ifdef LIBDE265_EXPORTS + #define LIBDE265_API __attribute__((__visibility__("default"))) + #else + #define LIBDE265_API + #endif +#else + #define LIBDE265_API +#endif + +#if __GNUC__ +#define LIBDE265_DEPRECATED __attribute__((deprecated)) +#elif defined(_MSC_VER) +#define LIBDE265_DEPRECATED __declspec(deprecated) +#else +#define LIBDE265_DEPRECATED +#endif + +#if defined(_MSC_VER) +#define LIBDE265_INLINE __inline +#else +#define LIBDE265_INLINE inline +#endif + +/* === version numbers === */ + +// version of linked libde265 library +LIBDE265_API const char *de265_get_version(void); +LIBDE265_API uint32_t de265_get_version_number(void); + +LIBDE265_API int de265_get_version_number_major(void); +LIBDE265_API int de265_get_version_number_minor(void); +LIBDE265_API int de265_get_version_number_maintenance(void); + + +/* === error codes === */ + +typedef enum { + DE265_OK = 0, + DE265_ERROR_NO_SUCH_FILE=1, + //DE265_ERROR_NO_STARTCODE=2, obsolet + //DE265_ERROR_EOF=3, + DE265_ERROR_COEFFICIENT_OUT_OF_IMAGE_BOUNDS=4, + DE265_ERROR_CHECKSUM_MISMATCH=5, + DE265_ERROR_CTB_OUTSIDE_IMAGE_AREA=6, + DE265_ERROR_OUT_OF_MEMORY=7, + DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE=8, + DE265_ERROR_IMAGE_BUFFER_FULL=9, + DE265_ERROR_CANNOT_START_THREADPOOL=10, + DE265_ERROR_LIBRARY_INITIALIZATION_FAILED=11, + DE265_ERROR_LIBRARY_NOT_INITIALIZED=12, + DE265_ERROR_WAITING_FOR_INPUT_DATA=13, + DE265_ERROR_CANNOT_PROCESS_SEI=14, + DE265_ERROR_PARAMETER_PARSING=15, + DE265_ERROR_NO_INITIAL_SLICE_HEADER=16, + DE265_ERROR_PREMATURE_END_OF_SLICE=17, + 
DE265_ERROR_UNSPECIFIED_DECODING_ERROR=18, + + // --- errors that should become obsolete in later libde265 versions --- + + //DE265_ERROR_MAX_THREAD_CONTEXTS_EXCEEDED = 500, obsolet + //DE265_ERROR_MAX_NUMBER_OF_SLICES_EXCEEDED = 501, obsolet + DE265_ERROR_NOT_IMPLEMENTED_YET = 502, + //DE265_ERROR_SCALING_LIST_NOT_IMPLEMENTED = 502, obsolet + + // --- warnings --- + + DE265_WARNING_NO_WPP_CANNOT_USE_MULTITHREADING = 1000, + DE265_WARNING_WARNING_BUFFER_FULL=1001, + DE265_WARNING_PREMATURE_END_OF_SLICE_SEGMENT=1002, + DE265_WARNING_INCORRECT_ENTRY_POINT_OFFSET=1003, + DE265_WARNING_CTB_OUTSIDE_IMAGE_AREA=1004, + DE265_WARNING_SPS_HEADER_INVALID=1005, + DE265_WARNING_PPS_HEADER_INVALID=1006, + DE265_WARNING_SLICEHEADER_INVALID=1007, + DE265_WARNING_INCORRECT_MOTION_VECTOR_SCALING=1008, + DE265_WARNING_NONEXISTING_PPS_REFERENCED=1009, + DE265_WARNING_NONEXISTING_SPS_REFERENCED=1010, + DE265_WARNING_BOTH_PREDFLAGS_ZERO=1011, + DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED=1012, + DE265_WARNING_NUMMVP_NOT_EQUAL_TO_NUMMVQ=1013, + DE265_WARNING_NUMBER_OF_SHORT_TERM_REF_PIC_SETS_OUT_OF_RANGE=1014, + DE265_WARNING_SHORT_TERM_REF_PIC_SET_OUT_OF_RANGE=1015, + DE265_WARNING_FAULTY_REFERENCE_PICTURE_LIST=1016, + DE265_WARNING_EOSS_BIT_NOT_SET=1017, + DE265_WARNING_MAX_NUM_REF_PICS_EXCEEDED=1018, + DE265_WARNING_INVALID_CHROMA_FORMAT=1019, + DE265_WARNING_SLICE_SEGMENT_ADDRESS_INVALID=1020, + DE265_WARNING_DEPENDENT_SLICE_WITH_ADDRESS_ZERO=1021, + DE265_WARNING_NUMBER_OF_THREADS_LIMITED_TO_MAXIMUM=1022, + DE265_NON_EXISTING_LT_REFERENCE_CANDIDATE_IN_SLICE_HEADER=1023, + DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY=1024, + DE265_WARNING_SPS_MISSING_CANNOT_DECODE_SEI=1025, + DE265_WARNING_COLLOCATED_MOTION_VECTOR_OUTSIDE_IMAGE_AREA=1026 +} de265_error; + +LIBDE265_API const char* de265_get_error_text(de265_error err); + +/* Returns true, if 'err' is DE265_OK or a warning. 
+ */ +LIBDE265_API int de265_isOK(de265_error err); + +LIBDE265_API void de265_disable_logging(); // DEPRECATED +LIBDE265_API void de265_set_verbosity(int level); + + +/* === image === */ + +/* The image is currently always 3-channel YCbCr, with 4:2:0 chroma. + But you may want to check the chroma format anyway for future compatibility. + */ + +struct de265_image; + +enum de265_chroma { + de265_chroma_mono=0, + de265_chroma_420=1, + de265_chroma_422=2, + de265_chroma_444=3 +}; + +typedef int64_t de265_PTS; + + +LIBDE265_API int de265_get_image_width(const struct de265_image*,int channel); +LIBDE265_API int de265_get_image_height(const struct de265_image*,int channel); +LIBDE265_API enum de265_chroma de265_get_chroma_format(const struct de265_image*); +LIBDE265_API int de265_get_bits_per_pixel(const struct de265_image*,int channel); +/* The |out_stride| is returned as "bytes per line" if a non-NULL parameter is given. */ +LIBDE265_API const uint8_t* de265_get_image_plane(const struct de265_image*, int channel, int* out_stride); +LIBDE265_API void* de265_get_image_plane_user_data(const struct de265_image*, int channel); +LIBDE265_API de265_PTS de265_get_image_PTS(const struct de265_image*); +LIBDE265_API void* de265_get_image_user_data(const struct de265_image*); +LIBDE265_API void de265_set_image_user_data(struct de265_image*, void *user_data); + +/* Get NAL-header information of this frame. You can pass in NULL pointers if you + do not need this piece of information. + */ +LIBDE265_API void de265_get_image_NAL_header(const struct de265_image*, + int* nal_unit_type, + const char** nal_unit_name, // textual description of 'nal_unit_type' + int* nuh_layer_id, + int* nuh_temporal_id); + + +/* === decoder === */ + +typedef void de265_decoder_context; // private structure + + + +/* Get a new decoder context. Must be freed with de265_free_decoder(). */ +LIBDE265_API de265_decoder_context* de265_new_decoder(void); + +/* Initialize background decoding threads. 
If this function is not called, + all decoding is done in the main thread (no multi-threading). */ +LIBDE265_API de265_error de265_start_worker_threads(de265_decoder_context*, int number_of_threads); + +/* Free decoder context. May only be called once on a context. */ +LIBDE265_API de265_error de265_free_decoder(de265_decoder_context*); + +#ifndef LIBDE265_DISABLE_DEPRECATED +/* Push more data into the decoder, must be raw h265. + All complete images in the data will be decoded, hence, do not push + too much data at once to prevent image buffer overflows. + The end of a picture can only be detected when the succeeding start-code + is read from the data. + If you want to flush the data and force decoding of the data so far + (e.g. at the end of a file), call de265_decode_data() with 'length' zero. + + NOTE: This method is deprecated and will be removed in a future version. + You should use "de265_push_data" or "de265_push_NAL" and "de265_decode" + instead. +*/ +LIBDE265_API LIBDE265_DEPRECATED de265_error de265_decode_data(de265_decoder_context*, const void* data, int length); +#endif + +/* Push more data into the decoder, must be a raw h265 bytestream with startcodes. + The PTS is assigned to all NALs whose start-code 0x000001 is contained in the data. + The bytestream must contain all stuffing-bytes. + This function only pushes data into the decoder, nothing will be decoded. +*/ +LIBDE265_API de265_error de265_push_data(de265_decoder_context*, const void* data, int length, + de265_PTS pts, void* user_data); + +/* Indicate that de265_push_data has just received data until the end of a NAL. + The remaining pending input data is put into a NAL package and forwarded to the decoder. +*/ +LIBDE265_API void de265_push_end_of_NAL(de265_decoder_context*); + +/* Indicate that de265_push_data has just received data until the end of a frame. + All data pending at the decoder input will be pushed into the decoder and + the decoded picture is pushed to the output queue. 
+*/ +LIBDE265_API void de265_push_end_of_frame(de265_decoder_context*); + +/* Push a complete NAL unit without startcode into the decoder. The data must still + contain all stuffing-bytes. + This function only pushes data into the decoder, nothing will be decoded. +*/ +LIBDE265_API de265_error de265_push_NAL(de265_decoder_context*, const void* data, int length, + de265_PTS pts, void* user_data); + +/* Indicate the end-of-stream. All data pending at the decoder input will be + pushed into the decoder and the decoded picture queue will be completely emptied. + */ +LIBDE265_API de265_error de265_flush_data(de265_decoder_context*); + +/* Return number of bytes pending at the decoder input. + Can be used to avoid overflowing the decoder with too much data. + */ +LIBDE265_API int de265_get_number_of_input_bytes_pending(de265_decoder_context*); + +/* Return number of NAL units pending at the decoder input. + Can be used to avoid overflowing the decoder with too much data. + */ +LIBDE265_API int de265_get_number_of_NAL_units_pending(de265_decoder_context*); + +/* Do some decoding. Returns status whether it did perform some decoding or + why it could not do so. If 'more' is non-null, indicates whether de265_decode() + should be called again (possibly after resolving the indicated problem). + DE265_OK - decoding ok + DE265_ERROR_IMAGE_BUFFER_FULL - DPB full, extract some images before continuing + DE265_ERROR_WAITING_FOR_INPUT_DATA - insert more data before continuing + + You have to consider these cases: + - decoding successful -> err = DE265_OK, more=true + - decoding stalled -> err != DE265_OK, more=true + - decoding finished -> err = DE265_OK, more=false + - unresolvable error -> err != DE265_OK, more=false + */ +LIBDE265_API de265_error de265_decode(de265_decoder_context*, int* more); + +/* Clear decoder state. Call this when skipping in the stream. + */ +LIBDE265_API void de265_reset(de265_decoder_context*); + +/* Return next decoded picture, if there is any. 
If no complete picture has been + decoded yet, NULL is returned. You should call de265_release_next_picture() to + advance to the next picture. */ +LIBDE265_API const struct de265_image* de265_peek_next_picture(de265_decoder_context*); // may return NULL + +/* Get next decoded picture and remove this picture from the decoder output queue. + Returns NULL if there is no decoded picture ready. + You can use the picture only until you call any other de265_* function. */ +LIBDE265_API const struct de265_image* de265_get_next_picture(de265_decoder_context*); // may return NULL + +/* Release the current decoded picture for reuse in the decoder. You should not + use the data anymore after calling this function. */ +LIBDE265_API void de265_release_next_picture(de265_decoder_context*); + + +LIBDE265_API de265_error de265_get_warning(de265_decoder_context*); + + +enum de265_image_format { + de265_image_format_mono8 = 1, + de265_image_format_YUV420P8 = 2, + de265_image_format_YUV422P8 = 3, + de265_image_format_YUV444P8 = 4 +}; + +struct de265_image_spec +{ + enum de265_image_format format; + int width; + int height; + int alignment; + + // conformance window + + int crop_left; + int crop_right; + int crop_top; + int crop_bottom; + + int visible_width; // convenience, width - crop_left - crop_right + int visible_height; // convenience, height - crop_top - crop_bottom +}; + +struct de265_image_allocation +{ + int (*get_buffer)(de265_decoder_context* ctx, // first parameter deprecated + struct de265_image_spec* spec, + struct de265_image* img, + void* userdata); + void (*release_buffer)(de265_decoder_context* ctx, // first parameter deprecated + struct de265_image* img, + void* userdata); +}; + +/* The user data pointer will be given to the get_buffer() and release_buffer() functions + in de265_image_allocation. 
*/ +LIBDE265_API void de265_set_image_allocation_functions(de265_decoder_context*, + struct de265_image_allocation*, + void* userdata); +LIBDE265_API const struct de265_image_allocation *de265_get_default_image_allocation_functions(void); + +LIBDE265_API void de265_set_image_plane(struct de265_image* img, int cIdx, void* mem, int stride, void *userdata); + + +/* --- frame dropping API --- + + To limit decoding to a maximum temporal layer (TID), use de265_set_limit_TID(). + The maximum layer ID in the stream can be queried with de265_get_highest_TID(). + Note that the maximum layer ID can change throughout the stream. + + For a fine-grained selection of the frame-rate, use de265_set_framerate_ratio(). + A percentage of 100% will decode all frames in all temporal layers. A lower percentage + will drop approximately as many frames. Note that this is only accurate if the frames + are distributed evenly among the layers. Otherwise, the mapping is non-linear. + + The limit_TID has a higher precedence than framerate_ratio. Hence, setting a higher + framerate-ratio will decode at limit_TID without dropping. + + With de265_change_framerate(), the output frame-rate can be increased/decreased to some + discrete preferable values. Currently, these are non-dropped decoding at various + TID layers. 
+*/ + +LIBDE265_API int de265_get_highest_TID(de265_decoder_context*); // highest temporal substream to decode +LIBDE265_API int de265_get_current_TID(de265_decoder_context*); // currently decoded temporal substream + +LIBDE265_API void de265_set_limit_TID(de265_decoder_context*,int max_tid); // highest temporal substream to decode +LIBDE265_API void de265_set_framerate_ratio(de265_decoder_context*,int percent); // percentage of frames to decode (approx) +LIBDE265_API int de265_change_framerate(de265_decoder_context*,int more_vs_less); // 1: more, -1: less, returns corresponding framerate_ratio + + +/* --- decoding parameters --- */ + +enum de265_param { + DE265_DECODER_PARAM_BOOL_SEI_CHECK_HASH=0, // (bool) Perform SEI hash check on decoded pictures. + DE265_DECODER_PARAM_DUMP_SPS_HEADERS=1, // (int) Dump headers to specified file-descriptor. + DE265_DECODER_PARAM_DUMP_VPS_HEADERS=2, + DE265_DECODER_PARAM_DUMP_PPS_HEADERS=3, + DE265_DECODER_PARAM_DUMP_SLICE_HEADERS=4, + DE265_DECODER_PARAM_ACCELERATION_CODE=5, // (int) enum de265_acceleration, default: AUTO + DE265_DECODER_PARAM_SUPPRESS_FAULTY_PICTURES=6, // (bool) do not output frames with decoding errors, default: no (output all images) + + DE265_DECODER_PARAM_DISABLE_DEBLOCKING=7, // (bool) disable deblocking + DE265_DECODER_PARAM_DISABLE_SAO=8 // (bool) disable SAO filter + //DE265_DECODER_PARAM_DISABLE_MC_RESIDUAL_IDCT=9, // (bool) disable decoding of IDCT residuals in MC blocks + //DE265_DECODER_PARAM_DISABLE_INTRA_RESIDUAL_IDCT=10 // (bool) disable decoding of IDCT residuals in MC blocks +}; + +// sorted such that a large ID includes all optimizations from lower IDs +enum de265_acceleration { + de265_acceleration_SCALAR = 0, // only fallback implementation + de265_acceleration_MMX = 10, + de265_acceleration_SSE = 20, + de265_acceleration_SSE2 = 30, + de265_acceleration_SSE4 = 40, + de265_acceleration_AVX = 50, // not implemented yet + de265_acceleration_AVX2 = 60, // not implemented yet + 
de265_acceleration_ARM = 70, + de265_acceleration_NEON = 80, + de265_acceleration_AUTO = 10000 +}; + + +/* Set decoding parameters. */ +LIBDE265_API void de265_set_parameter_bool(de265_decoder_context*, enum de265_param param, int value); + +LIBDE265_API void de265_set_parameter_int(de265_decoder_context*, enum de265_param param, int value); + +/* Get decoding parameters. */ +LIBDE265_API int de265_get_parameter_bool(de265_decoder_context*, enum de265_param param); + + + +/* --- optional library initialization --- */ + +/* Static library initialization. Must be paired with de265_free(). + Initialization is optional, since it will be done implicitly in de265_new_decoder(). + Return value is false if initialization failed. + Only call de265_free() when initialization was successful. + Multiple calls to 'init' are allowed, but must be matched with an equal number of 'free' calls. +*/ +LIBDE265_API de265_error de265_init(void); + +/* Free global library data. + An implicit free call is made in de265_free_decoder(). + Returns false if library was not initialized before, or if 'free' was called + more often than 'init'. + */ +LIBDE265_API de265_error de265_free(void); + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/libde265/deblock.h b/libde265/deblock.h new file mode 100644 index 0000000..b8f3781 --- /dev/null +++ b/libde265/deblock.h @@ -0,0 +1,29 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#ifndef DE265_DEBLOCK_H +#define DE265_DEBLOCK_H + +#include "libde265/decctx.h" + +void add_deblocking_tasks(image_unit* imgunit); +void apply_deblocking_filter(de265_image* img); //decoder_context* ctx); + +#endif diff --git a/libde265/decctx.h b/libde265/decctx.h new file mode 100644 index 0000000..c1acdce --- /dev/null +++ b/libde265/decctx.h @@ -0,0 +1,528 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#ifndef DE265_DECCTX_H +#define DE265_DECCTX_H + +#include "libde265/vps.h" +#include "libde265/sps.h" +#include "libde265/pps.h" +#include "libde265/nal.h" +#include "libde265/slice.h" +#include "libde265/image.h" +#include "libde265/motion.h" +#include "libde265/de265.h" +#include "libde265/dpb.h" +#include "libde265/sei.h" +#include "libde265/threads.h" +#include "libde265/acceleration.h" +#include "libde265/nal-parser.h" + +#include + +#define DE265_MAX_VPS_SETS 16 // this is the maximum as defined in the standard +#define DE265_MAX_SPS_SETS 16 // this is the maximum as defined in the standard +#define DE265_MAX_PPS_SETS 64 // this is the maximum as defined in the standard + +#define MAX_WARNINGS 20 + + +class slice_segment_header; +class image_unit; +class slice_unit; +class decoder_context; + + +class thread_context +{ +public: + thread_context(); + + int CtbAddrInRS; + int CtbAddrInTS; + + int CtbX, CtbY; + + + // motion vectors + + PBMotionCoding motion; + + + // prediction + + // enum IntraPredMode IntraPredModeC[4]; // chroma intra-prediction mode for current CB + int ResScaleVal; + + + // residual data + + uint8_t cu_transquant_bypass_flag; + uint8_t transform_skip_flag[3]; + uint8_t explicit_rdpcm_flag; + uint8_t explicit_rdpcm_dir; + + // we need 16 bytes of extra memory (8*int16) to shift the base for the + // alignment required for SSE code ! 
+ int16_t _coeffBuf[(32*32)+8]; + int16_t *coeffBuf; // the base pointer into _coeffBuf, aligned to 16 bytes + + int16_t coeffList[3][32*32]; + int16_t coeffPos[3][32*32]; + int16_t nCoeff[3]; + + int32_t residual_luma[32*32]; // only used when cross-comp-prediction is enabled + + + // quantization + + int IsCuQpDeltaCoded; + int CuQpDelta; + int IsCuChromaQpOffsetCoded; + int CuQpOffsetCb, CuQpOffsetCr; + + int currentQPY; + int currentQG_x, currentQG_y; + int lastQPYinPreviousQG; + + int qPYPrime, qPCbPrime, qPCrPrime; + + CABAC_decoder cabac_decoder; + + context_model_table ctx_model; + uint8_t StatCoeff[4]; + + decoder_context* decctx; + struct de265_image *img; + slice_segment_header* shdr; + + image_unit* imgunit; + slice_unit* sliceunit; + thread_task* task; // executing thread_task or NULL if not multi-threaded + +private: + thread_context(const thread_context&); // not allowed + const thread_context& operator=(const thread_context&); // not allowed +}; + + + +class error_queue +{ + public: + error_queue(); + + void add_warning(de265_error warning, bool once); + de265_error get_warning(); + + private: + de265_error warnings[MAX_WARNINGS]; + int nWarnings; + de265_error warnings_shown[MAX_WARNINGS]; // warnings that have already occurred + int nWarningsShown; +}; + + + +class slice_unit +{ +public: + slice_unit(decoder_context* decctx); + ~slice_unit(); + + NAL_unit* nal; // we are the owner + slice_segment_header* shdr; // not the owner (de265_image is owner) + bitreader reader; + + image_unit* imgunit; + + bool flush_reorder_buffer; + + + // decoding status + + enum SliceDecodingProgress { Unprocessed, + InProgress, + Decoded + } state; + + de265_progress_lock finished_threads; + int nThreads; + + int first_decoded_CTB_RS; // TODO + int last_decoded_CTB_RS; // TODO + + void allocate_thread_contexts(int n); + thread_context* get_thread_context(int n) { + assert(n < nThreadContexts); + return &thread_contexts[n]; + } + int num_thread_contexts() const {
return nThreadContexts; } + +private: + thread_context* thread_contexts; /* NOTE: cannot use std::vector, because thread_context has + no copy constructor. */ + int nThreadContexts; + +public: + decoder_context* ctx; + +private: + slice_unit(const slice_unit&); // not allowed + const slice_unit& operator=(const slice_unit&); // not allowed +}; + + +class image_unit +{ +public: + image_unit(); + ~image_unit(); + + de265_image* img; + de265_image sao_output; // if SAO is used, this is allocated and used as SAO output buffer + + std::vector slice_units; + std::vector suffix_SEIs; + + slice_unit* get_next_unprocessed_slice_segment() const { + for (int i=0;istate == slice_unit::Unprocessed) { + return slice_units[i]; + } + } + + return NULL; + } + + slice_unit* get_prev_slice_segment(slice_unit* s) const { + for (int i=1; istate != slice_unit::Unprocessed) return true; + return false; + } + + bool is_first_slice_segment(const slice_unit* s) const { + if (slice_units.size()==0) return false; + return (slice_units[0] == s); + } + + enum { Invalid, // headers not read yet + Unknown, // SPS/PPS available + Reference, // will be used as reference + Leaf // not a reference picture + } role; + + enum { Unprocessed, + InProgress, + Decoded, + Dropped // will not be decoded + } state; + + std::vector tasks; // we are the owner + + /* Saved context models for WPP. + There is one saved model for the initialization of each CTB row. + The array is unused for non-WPP streams. */ + std::vector ctx_models; // TODO: move this into image ? 
+}; + + +class base_context : public error_queue +{ + public: + base_context(); + virtual ~base_context() { } + + // --- accelerated DSP functions --- + + void set_acceleration_functions(enum de265_acceleration); + + struct acceleration_functions acceleration; // CPU optimized functions + + //virtual /* */ de265_image* get_image(int dpb_index) { return dpb.get_image(dpb_index); } + virtual const de265_image* get_image(int frame_id) const = 0; + virtual bool has_image(int frame_id) const = 0; +}; + + +class decoder_context : public base_context { + public: + decoder_context(); + ~decoder_context(); + + de265_error start_thread_pool(int nThreads); + void stop_thread_pool(); + + void reset(); + + bool has_sps(int id) const { return (bool)sps[id]; } + bool has_pps(int id) const { return (bool)pps[id]; } + + std::shared_ptr get_shared_sps(int id) { return sps[id]; } + std::shared_ptr get_shared_pps(int id) { return pps[id]; } + + /* */ seq_parameter_set* get_sps(int id) { return sps[id].get(); } + const seq_parameter_set* get_sps(int id) const { return sps[id].get(); } + /* */ pic_parameter_set* get_pps(int id) { return pps[id].get(); } + const pic_parameter_set* get_pps(int id) const { return pps[id].get(); } + + /* + const slice_segment_header* get_SliceHeader_atCtb(int ctb) { + return img->slices[img->get_SliceHeaderIndex_atIndex(ctb)]; + } + */ + + uint8_t get_nal_unit_type() const { return nal_unit_type; } + bool get_RapPicFlag() const { return RapPicFlag; } + + de265_error decode_NAL(NAL_unit* nal); + + de265_error decode(int* more); + de265_error decode_some(bool* did_work); + + de265_error decode_slice_unit_sequential(image_unit* imgunit, slice_unit* sliceunit); + de265_error decode_slice_unit_parallel(image_unit* imgunit, slice_unit* sliceunit); + de265_error decode_slice_unit_WPP(image_unit* imgunit, slice_unit* sliceunit); + de265_error decode_slice_unit_tiles(image_unit* imgunit, slice_unit* sliceunit); + + + void process_nal_hdr(nal_header*); + + bool 
process_slice_segment_header(slice_segment_header*, + de265_error*, de265_PTS pts, + nal_header* nal_hdr, void* user_data); + + //void push_current_picture_to_output_queue(); + de265_error push_picture_to_output_queue(image_unit*); + + + // --- parameters --- + + bool param_sei_check_hash; + bool param_conceal_stream_errors; + bool param_suppress_faulty_pictures; + + int param_sps_headers_fd; + int param_vps_headers_fd; + int param_pps_headers_fd; + int param_slice_headers_fd; + + bool param_disable_deblocking; + bool param_disable_sao; + //bool param_disable_mc_residual_idct; // not implemented yet + //bool param_disable_intra_residual_idct; // not implemented yet + + void set_image_allocation_functions(de265_image_allocation* allocfunc, void* userdata); + + de265_image_allocation param_image_allocation_functions; + void* param_image_allocation_userdata; + + + // --- input stream data --- + + NAL_Parser nal_parser; + + + int get_num_worker_threads() const { return num_worker_threads; } + + /* */ de265_image* get_image(int dpb_index) { return dpb.get_image(dpb_index); } + const de265_image* get_image(int dpb_index) const { return dpb.get_image(dpb_index); } + + bool has_image(int dpb_index) const { return dpb_index>=0 && dpb_index vps[ DE265_MAX_VPS_SETS ]; + std::shared_ptr sps[ DE265_MAX_SPS_SETS ]; + std::shared_ptr pps[ DE265_MAX_PPS_SETS ]; + + std::shared_ptr current_vps; + std::shared_ptr current_sps; + std::shared_ptr current_pps; + + public: + thread_pool thread_pool_; + + private: + int num_worker_threads; + + + public: + // --- frame dropping --- + + void set_limit_TID(int tid); + int get_highest_TID() const; + int get_current_TID() const { return current_HighestTid; } + int change_framerate(int more_vs_less); // 1: more, -1: less + void set_framerate_ratio(int percent); + + private: + // input parameters + int limit_HighestTid; // never switch to a layer above this one + int framerate_ratio; + + // current control parameters + int goal_HighestTid; // 
this is the layer we want to decode at + int layer_framerate_ratio; // ratio of frames to keep in the current layer + + int current_HighestTid; // the layer which we are currently decoding + + struct { + int8_t tid; + int8_t ratio; + } framedrop_tab[100+1]; + int framedrop_tid_index[6+1]; + + void compute_framedrop_table(); + void calc_tid_and_framerate_ratio(); + + private: + // --- decoded picture buffer --- + + decoded_picture_buffer dpb; + + int current_image_poc_lsb; + bool first_decoded_picture; + bool NoRaslOutputFlag; + bool HandleCraAsBlaFlag; + bool FirstAfterEndOfSequenceNAL; + + int PicOrderCntMsb; + int prevPicOrderCntLsb; // at prevTid0Pic + int prevPicOrderCntMsb; // at prevTid0Pic + + de265_image* img; + + public: + const slice_segment_header* previous_slice_header; /* Remember the last slice for a successive + dependent slice. */ + + + // --- motion compensation --- + + public: + int PocLsbLt[MAX_NUM_REF_PICS]; + int UsedByCurrPicLt[MAX_NUM_REF_PICS]; + int DeltaPocMsbCycleLt[MAX_NUM_REF_PICS]; + private: + int CurrDeltaPocMsbPresentFlag[MAX_NUM_REF_PICS]; + int FollDeltaPocMsbPresentFlag[MAX_NUM_REF_PICS]; + + // The number of entries in the lists below. + int NumPocStCurrBefore; + int NumPocStCurrAfter; + int NumPocStFoll; + int NumPocLtCurr; + int NumPocLtFoll; + + // These lists contain absolute POC values. + int PocStCurrBefore[MAX_NUM_REF_PICS]; // used for reference in current picture, smaller POC + int PocStCurrAfter[MAX_NUM_REF_PICS]; // used for reference in current picture, larger POC + int PocStFoll[MAX_NUM_REF_PICS]; // not used for reference in current picture, but in future picture + int PocLtCurr[MAX_NUM_REF_PICS]; // used in current picture + int PocLtFoll[MAX_NUM_REF_PICS]; // used in some future picture + + // These lists contain indices into the DPB.
+ int RefPicSetStCurrBefore[MAX_NUM_REF_PICS]; + int RefPicSetStCurrAfter[MAX_NUM_REF_PICS]; + int RefPicSetStFoll[MAX_NUM_REF_PICS]; + int RefPicSetLtCurr[MAX_NUM_REF_PICS]; + int RefPicSetLtFoll[MAX_NUM_REF_PICS]; + + + // --- parameters derived from parameter sets --- + + // NAL + + uint8_t nal_unit_type; + + char IdrPicFlag; + char RapPicFlag; + + + // --- image unit queue --- + + std::vector image_units; + + bool flush_reorder_buffer_at_this_frame; + + private: + void init_thread_context(thread_context* tctx); + void add_task_decode_CTB_row(thread_context* tctx, bool firstSliceSubstream, int ctbRow); + void add_task_decode_slice_segment(thread_context* tctx, bool firstSliceSubstream, + int ctbX,int ctbY); + + void mark_whole_slice_as_processed(image_unit* imgunit, + slice_unit* sliceunit, + int progress); + + void process_picture_order_count(slice_segment_header* hdr); + int generate_unavailable_reference_picture(const seq_parameter_set* sps, + int POC, bool longTerm); + void process_reference_picture_set(slice_segment_header* hdr); + bool construct_reference_picture_lists(slice_segment_header* hdr); + + + void remove_images_from_dpb(const std::vector& removeImageList); + void run_postprocessing_filters_sequential(struct de265_image* img); + void run_postprocessing_filters_parallel(image_unit* img); +}; + + +#endif diff --git a/libde265/dpb.h b/libde265/dpb.h new file mode 100644 index 0000000..fa8ff59 --- /dev/null +++ b/libde265/dpb.h @@ -0,0 +1,118 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. 
+ * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#ifndef DE265_DPB_H +#define DE265_DPB_H + +#include "libde265/image.h" +#include "libde265/sps.h" + +#include +#include + +class decoder_context; + +class decoded_picture_buffer { +public: + decoded_picture_buffer(); + ~decoded_picture_buffer(); + + void set_max_size_of_DPB(int n) { max_images_in_DPB=n; } + void set_norm_size_of_DPB(int n) { norm_images_in_DPB=n; } + + /* Alloc a new image in the DPB and return its index. + If there is no space for a new image, return -1. */ + int new_image(std::shared_ptr sps, decoder_context* decctx, + de265_PTS pts, void* user_data, bool isOutputImage); + + /* Check for a free slot in the DPB. There are some slots reserved for + unavailable reference frames. If high_priority==true, these reserved slots + are included in the check. */ + bool has_free_dpb_picture(bool high_priority) const; + + /* Remove all pictures from DPB and queues. Decoding should be stopped while calling this. */ + void clear(); + + int size() const { return dpb.size(); } + + /* Raw access to the images. */ + + /* */ de265_image* get_image(int index) { + if (index>=dpb.size()) return NULL; + return dpb[index]; + } + + const de265_image* get_image(int index) const { + if (index>=dpb.size()) return NULL; + return dpb[index]; + } + + /* Search DPB for the slot index of a specific picture. 
*/ + int DPB_index_of_picture_with_POC(int poc, int currentID, bool preferLongTerm=false) const; + int DPB_index_of_picture_with_LSB(int lsb, int currentID, bool preferLongTerm=false) const; + int DPB_index_of_picture_with_ID (int id) const; + + + // --- reorder buffer --- + + void insert_image_into_reorder_buffer(struct de265_image* img) { + reorder_output_queue.push_back(img); + } + + int num_pictures_in_reorder_buffer() const { return reorder_output_queue.size(); } + + // move next picture in reorder buffer to output queue + void output_next_picture_in_reorder_buffer(); + + // Move all pictures in reorder buffer to output buffer. Return true if there were any pictures. + bool flush_reorder_buffer(); + + + // --- output buffer --- + + int num_pictures_in_output_queue() const { return image_output_queue.size(); } + + /* Get the next picture in the output queue, but do not remove it from the queue. */ + struct de265_image* get_next_picture_in_output_queue() const { return image_output_queue.front(); } + + /* Remove the next picture in the output queue. */ + void pop_next_picture_in_output_queue(); + + + // --- debug --- + + void log_dpb_content() const; + void log_dpb_queues() const; + +private: + int max_images_in_DPB; + int norm_images_in_DPB; + + std::vector dpb; // decoded picture buffer + + std::vector reorder_output_queue; + std::deque image_output_queue; + +private: + decoded_picture_buffer(const decoded_picture_buffer&); // no copy + decoded_picture_buffer& operator=(const decoded_picture_buffer&); // no copy +}; + +#endif diff --git a/libde265/en265.h b/libde265/en265.h new file mode 100644 index 0000000..a22e5d1 --- /dev/null +++ b/libde265/en265.h @@ -0,0 +1,218 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * Authors: Dirk Farin + * + * This file is part of libde265. 
+ * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#ifndef EN265_H +#define EN265_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include + + +// ========== encoder context ========== + +struct en265_encoder_context; // private structure + +/* Get a new encoder context. Must be freed with en265_free_encoder(). */ +LIBDE265_API en265_encoder_context* en265_new_encoder(void); + +/* Free encoder context. May only be called once on a context. */ +LIBDE265_API de265_error en265_free_encoder(en265_encoder_context*); + +/* The alloc_userdata pointer will be given to the release_func(). 
*/ +/* +LIBDE265_API void en265_set_image_release_function(en265_encoder_context*, + void (*release_func)(en265_encoder_context*, + struct de265_image*, + void* userdata), + void* alloc_userdata); +*/ + +// ========== encoder parameters ========== + +LIBDE265_API de265_error en265_set_parameter_bool(en265_encoder_context*, + const char* parametername,int value); +LIBDE265_API de265_error en265_set_parameter_int(en265_encoder_context*, + const char* parametername,int value); +LIBDE265_API de265_error en265_set_parameter_string(en265_encoder_context*, + const char* parametername,const char* value); +LIBDE265_API de265_error en265_set_parameter_choice(en265_encoder_context*, + const char* parametername,const char* value); + + +LIBDE265_API const char** en265_list_parameters(en265_encoder_context*); + +enum en265_parameter_type { + en265_parameter_bool, + en265_parameter_int, + en265_parameter_string, + en265_parameter_choice +}; + +LIBDE265_API enum en265_parameter_type en265_get_parameter_type(en265_encoder_context*, + const char* parametername); + +LIBDE265_API const char** en265_list_parameter_choices(en265_encoder_context*, + const char* parametername); + + +// --- convenience functions for command-line parameters --- + +LIBDE265_API de265_error en265_parse_command_line_parameters(en265_encoder_context*, + int* argc, char** argv); +LIBDE265_API void en265_show_parameters(en265_encoder_context*); + + + +// ========== encoding loop ========== + +LIBDE265_API de265_error en265_start_encoder(en265_encoder_context*, int number_of_threads); + +// If we have provided our own memory release function, no image memory will be allocated. 
+LIBDE265_API struct de265_image* en265_allocate_image(en265_encoder_context*, + int width, int height, + enum de265_chroma chroma, + de265_PTS pts, void* image_userdata); + +LIBDE265_API void* de265_alloc_image_plane(struct de265_image* img, int cIdx, + void* inputdata, int inputstride, void *userdata); +LIBDE265_API void de265_free_image_plane(struct de265_image* img, int cIdx); + + +// Request a specification of the image memory layout for an image of the specified dimensions. +LIBDE265_API void en265_get_image_spec(en265_encoder_context*, + int width, int height, enum de265_chroma chroma, + struct de265_image_spec* out_spec); + +// Image memory layout specification for an image returned by en265_allocate_image(). +/* TODO: do we need this? +LIBDE265_API void de265_get_image_spec_from_image(de265_image* img, struct de265_image_spec* spec); +*/ + + +LIBDE265_API de265_error en265_push_image(en265_encoder_context*, + struct de265_image*); // non-blocking + +LIBDE265_API de265_error en265_push_eof(en265_encoder_context*); + +// block when there are more than max_pending_images in the input queue +LIBDE265_API de265_error en265_block_on_input_queue_length(en265_encoder_context*, + int max_pending_images, + int timeout_ms); + +LIBDE265_API de265_error en265_trim_input_queue(en265_encoder_context*, int max_pending_images); + +LIBDE265_API int en265_current_input_queue_length(en265_encoder_context*); + +// Run encoder in main thread. Only use this when not using background threads.
+LIBDE265_API de265_error en265_encode(en265_encoder_context*); + +enum en265_encoder_state +{ + EN265_STATE_IDLE, + EN265_STATE_WAITING_FOR_INPUT, + EN265_STATE_WORKING, + EN265_STATE_OUTPUT_QUEUE_FULL, + EN265_STATE_EOS +}; + + +LIBDE265_API enum en265_encoder_state en265_get_encoder_state(en265_encoder_context*); + + +enum en265_packet_content_type { + EN265_PACKET_VPS, + EN265_PACKET_SPS, + EN265_PACKET_PPS, + EN265_PACKET_SEI, + EN265_PACKET_SLICE, + EN265_PACKET_SKIPPED_IMAGE +}; + + +enum en265_nal_unit_type { + EN265_NUT_TRAIL_N = 0, + EN265_NUT_TRAIL_R = 1, + EN265_NUT_TSA_N = 2, + EN265_NUT_TSA_R = 3, + EN265_NUT_STSA_N = 4, + EN265_NUT_STSA_R = 5, + EN265_NUT_RADL_N = 6, + EN265_NUT_RADL_R = 7, + EN265_NUT_RASL_N = 8, + EN265_NUT_RASL_R = 9, + EN265_NUT_BLA_W_LP = 16, + EN265_NUT_BLA_W_RADL= 17, + EN265_NUT_BLA_N_LP = 18, + EN265_NUT_IDR_W_RADL= 19, + EN265_NUT_IDR_N_LP = 20, + EN265_NUT_CRA = 21, + EN265_NUT_VPS = 32, + EN265_NUT_SPS = 33, + EN265_NUT_PPS = 34, + EN265_NUT_AUD = 35, + EN265_NUT_EOS = 36, + EN265_NUT_EOB = 37, + EN265_NUT_FD = 38, + EN265_NUT_PREFIX_SEI = 39, + EN265_NUT_SUFFIX_SEI = 40 +}; + + +struct en265_packet +{ + int version; // currently: 1 + + const uint8_t* data; + int length; + + int frame_number; + + enum en265_packet_content_type content_type; + char complete_picture : 1; + char final_slice : 1; + char dependent_slice : 1; + + enum en265_nal_unit_type nal_unit_type; + unsigned char nuh_layer_id; + unsigned char nuh_temporal_id; + + en265_encoder_context* encoder_context; + + const struct de265_image* input_image; + const struct de265_image* reconstruction; +}; + +// timeout_ms - timeout in milliseconds. 
0 - no timeout, -1 - block forever +LIBDE265_API struct en265_packet* en265_get_packet(en265_encoder_context*, int timeout_ms); +LIBDE265_API void en265_free_packet(en265_encoder_context*, struct en265_packet*); + +LIBDE265_API int en265_number_of_queued_packets(en265_encoder_context*); + +#ifdef __cplusplus +} +#endif + + +#endif diff --git a/libde265/fallback-dct.h b/libde265/fallback-dct.h new file mode 100644 index 0000000..83d25c1 --- /dev/null +++ b/libde265/fallback-dct.h @@ -0,0 +1,96 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#ifndef FALLBACK_DCT_H +#define FALLBACK_DCT_H + +#include +#include + +#include "util.h" + + +// --- decoding --- + +void transform_skip_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); +void transform_bypass_fallback(int32_t *r, const int16_t *coeffs, int nT); + +void transform_skip_rdpcm_v_8_fallback(uint8_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride); +void transform_skip_rdpcm_h_8_fallback(uint8_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride); +void transform_bypass_rdpcm_v_fallback(int32_t *r, const int16_t *coeffs,int nT); +void transform_bypass_rdpcm_h_fallback(int32_t *r, const int16_t *coeffs,int nT); + +void transform_4x4_luma_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); +void transform_4x4_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); +void transform_8x8_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); +void transform_16x16_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); +void transform_32x32_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); + + +void transform_skip_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth); +void transform_bypass_16_fallback(uint16_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride, int bit_depth); + +void transform_4x4_luma_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth); +void transform_4x4_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth); +void transform_8x8_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth); +void transform_16x16_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth); +void transform_32x32_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth); + +void rotate_coefficients_fallback(int16_t *coeff, int nT); + + +void 
transform_idst_4x4_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits); +void transform_idct_4x4_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits); +void transform_idct_8x8_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits); +void transform_idct_16x16_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits); +void transform_idct_32x32_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits); + +template +void add_residual_fallback(pixel_t *dst, ptrdiff_t stride, + const int32_t* r, int nT, int bit_depth) +{ + for (int y=0;y + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#ifndef FALLBACK_MOTION_H +#define FALLBACK_MOTION_H + +/* NOTE(review): the header names of the two #include directives below are missing in this copy; restore them from the original header. */ #include +#include + + +/* Prototypes of the put_..._fallback routines. The _8_ variants take a uint8_t destination; the _16_ variants take a uint16_t destination plus an extra bit_depth argument. */ void put_weighted_pred_avg_8_fallback(uint8_t *dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, + ptrdiff_t srcstride, int width, + int height); + +void put_unweighted_pred_8_fallback(uint8_t *_dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height); + +void put_weighted_pred_8_fallback(uint8_t *_dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height, + int w,int o,int log2WD); +void put_weighted_bipred_8_fallback(uint8_t *_dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, + int width, int height, + int w1,int o1, int w2,int o2, int log2WD); + +void put_weighted_pred_avg_16_fallback(uint16_t *dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, + ptrdiff_t srcstride, int width, + int height, int bit_depth); + +void put_unweighted_pred_16_fallback(uint16_t *_dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height, int bit_depth); + +void put_weighted_pred_16_fallback(uint16_t *_dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height, + int w,int o,int log2WD, int bit_depth); +void put_weighted_bipred_16_fallback(uint16_t *_dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, + int width, int height, + int w1,int o1, int w2,int o2, int log2WD, int bit_depth); + + + +void put_epel_8_fallback(int16_t *dst, ptrdiff_t dststride, + const uint8_t *_src, ptrdiff_t srcstride, + int width, int height, + int mx, int my, int16_t* mcbuffer); + +void put_epel_16_fallback(int16_t *out, ptrdiff_t out_stride, + const uint16_t *src, ptrdiff_t src_stride, + int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth); + +/* NOTE(review): the template parameter list is missing in this copy; pixel_t is used as a type in the parameters, so it is presumably the template parameter -- confirm. */ template +void put_epel_hv_fallback(int16_t *dst, ptrdiff_t dststride, + const pixel_t *_src, ptrdiff_t 
srcstride, + int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth); + + +/* QPEL(x,y) expands to the prototype of put_qpel_X_Y_fallback (uint8_t source); the sixteen uses below declare every combination of x and y in 0..3. */ #define QPEL(x,y) void put_qpel_ ## x ## _ ## y ## _fallback(int16_t *out, ptrdiff_t out_stride, \ + const uint8_t *src, ptrdiff_t srcstride, \ + int nPbW, int nPbH, int16_t* mcbuffer) +QPEL(0,0); QPEL(0,1); QPEL(0,2); QPEL(0,3); +QPEL(1,0); QPEL(1,1); QPEL(1,2); QPEL(1,3); +QPEL(2,0); QPEL(2,1); QPEL(2,2); QPEL(2,3); +QPEL(3,0); QPEL(3,1); QPEL(3,2); QPEL(3,3); + +#undef QPEL + + +/* Same scheme for the put_qpel_X_Y_fallback_16 prototypes, which take a uint16_t source and an extra bit_depth argument. */ #define QPEL(x,y) void put_qpel_ ## x ## _ ## y ## _fallback_16(int16_t *out, ptrdiff_t out_stride, \ + const uint16_t *src, ptrdiff_t srcstride, \ + int nPbW, int nPbH, int16_t* mcbuffer, int bit_depth) +QPEL(0,0); QPEL(0,1); QPEL(0,2); QPEL(0,3); +QPEL(1,0); QPEL(1,1); QPEL(1,2); QPEL(1,3); +QPEL(2,0); QPEL(2,1); QPEL(2,2); QPEL(2,3); +QPEL(3,0); QPEL(3,1); QPEL(3,2); QPEL(3,3); + +#undef QPEL + +#endif diff --git a/libde265/fallback.h b/libde265/fallback.h new file mode 100644 index 0000000..4b0b83c --- /dev/null +++ b/libde265/fallback.h @@ -0,0 +1,28 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#ifndef DE265_FALLBACK_H +#define DE265_FALLBACK_H + +#include "acceleration.h" + +/* Receives a pointer to the acceleration_functions table. Presumably stores the fallback implementations into it -- confirm against the definition, which is not part of this header. */ void init_acceleration_functions_fallback(struct acceleration_functions* lowlevel); + +#endif diff --git a/libde265/image-io.h b/libde265/image-io.h new file mode 100644 index 0000000..1cc6c8d --- /dev/null +++ b/libde265/image-io.h @@ -0,0 +1,121 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * Authors: struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#ifndef IMAGE_IO_H +#define IMAGE_IO_H + +#include "libde265/image.h" +#include + + +/* Abstract source of de265_image pointers: get_image(), skip_frames(), get_width() and get_height() are all pure virtual. */ class ImageSource +{ + public: + LIBDE265_API ImageSource(); + virtual LIBDE265_API ~ImageSource() { } + + //enum ImageStatus { Available, Waiting, EndOfVideo }; + + //virtual ImageStatus get_status() = 0; + virtual LIBDE265_API de265_image* get_image(bool block=true) = 0; + virtual LIBDE265_API void skip_frames(int n) = 0; + + virtual LIBDE265_API int get_width() const = 0; + virtual LIBDE265_API int get_height() const = 0; +}; + + + +/* ImageSource implementation that keeps a FILE pointer (mFH) and a fixed width and height, which get_width() and get_height() return. NOTE(review): the class holds a raw FILE pointer and copying is not disabled; if the destructor closes mFH a copy would close it twice -- confirm in the implementation file. Ownership of the returned de265_image is not visible here. */ class ImageSource_YUV : public ImageSource +{ + public: + LIBDE265_API ImageSource_YUV(); + virtual LIBDE265_API ~ImageSource_YUV(); + + bool LIBDE265_API set_input_file(const char* filename, int w,int h); + + //virtual ImageStatus get_status(); + virtual LIBDE265_API de265_image* get_image(bool block=true); + virtual LIBDE265_API void skip_frames(int n); + + virtual LIBDE265_API int get_width() const { return width; } + virtual LIBDE265_API int get_height() const { return height; } + + private: + FILE* mFH; + bool mReachedEndOfFile; + + int width,height; + + de265_image* read_next_image(); +}; + + + +/* Abstract consumer of images: send_image() is pure virtual. */ class ImageSink +{ + public: + virtual LIBDE265_API ~ImageSink() { } + + virtual LIBDE265_API void send_image(const de265_image* img) = 0; +}; + +/* ImageSink implementation holding a FILE pointer (mFH), which the constructor sets to NULL; presumably set_filename() opens it -- confirm. */ class ImageSink_YUV : public ImageSink +{ + public: + LIBDE265_API ImageSink_YUV() : mFH(NULL) { } + LIBDE265_API ~ImageSink_YUV(); + + bool LIBDE265_API set_filename(const char* filename); + + virtual LIBDE265_API void send_image(const de265_image* img); + + private: + FILE* mFH; +}; + + + +/* Abstract consumer of byte packets: send_packet() takes a data pointer and a length n and is pure virtual. */ class PacketSink +{ + public: + virtual LIBDE265_API ~PacketSink() { } + + virtual LIBDE265_API void send_packet(const uint8_t* data, int n) = 0; +}; + + +/* PacketSink implementation holding a FILE pointer (mFH). Unlike ImageSink_YUV::set_filename(), set_filename() here returns void, so a failure cannot be reported through the return value. */ class PacketSink_File : public PacketSink +{ + public: + LIBDE265_API PacketSink_File(); + virtual LIBDE265_API ~PacketSink_File(); + + LIBDE265_API void set_filename(const char* filename); + + virtual LIBDE265_API void send_packet(const uint8_t* data, int n); + 
+ private: + FILE* mFH; +}; + +#endif diff --git a/libde265/image.h b/libde265/image.h new file mode 100644 index 0000000..5611b72 --- /dev/null +++ b/libde265/image.h @@ -0,0 +1,864 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#ifndef DE265_IMAGE_H +#define DE265_IMAGE_H + +#ifdef HAVE_CONFIG_H +#include +#endif + +/* NOTE(review): the header names of the system #include directives in this file are missing in this copy; restore them from the original header. */ #include +#include +#include +#include +#include +#ifdef HAVE_STDBOOL_H +#include +#endif + +#include "libde265/de265.h" +#include "libde265/sps.h" +#include "libde265/pps.h" +#include "libde265/motion.h" +#include "libde265/threads.h" +#include "libde265/slice.h" +#include "libde265/nal.h" + +struct en265_encoder_context; + +/* Reference-marking state of a picture; can_be_released() further down in this header compares PicState against UnusedForReference. */ enum PictureState { + UnusedForReference, + UsedForShortTermReference, + UsedForLongTermReference +}; + + +/* TODO: + At INTEGRITY_DERIVED_FROM_FAULTY_REFERENCE images, we can check the SEI hash, whether + the output image is correct despite the faulty reference, and set the state back to correct. 
+*/ +/* Values for the integrity field of an image (see the comment on that field further down). */ #define INTEGRITY_CORRECT 0 +#define INTEGRITY_UNAVAILABLE_REFERENCE 1 +#define INTEGRITY_NOT_DECODED 2 +#define INTEGRITY_DECODING_ERRORS 3 +#define INTEGRITY_DERIVED_FROM_FAULTY_REFERENCE 4 + +#define SEI_HASH_UNCHECKED 0 +#define SEI_HASH_CORRECT 1 +#define SEI_HASH_INCORRECT 2 + +#define TU_FLAG_NONZERO_COEFF (1<<7) +#define TU_FLAG_SPLIT_TRANSFORM_MASK 0x1F + +#define DEBLOCK_FLAG_VERTI (1<<4) +#define DEBLOCK_FLAG_HORIZ (1<<5) +#define DEBLOCK_PB_EDGE_VERTI (1<<6) +#define DEBLOCK_PB_EDGE_HORIZ (1<<7) +#define DEBLOCK_BS_MASK 0x03 + + +#define CTB_PROGRESS_NONE 0 +#define CTB_PROGRESS_PREFILTER 1 +#define CTB_PROGRESS_DEBLK_V 2 +#define CTB_PROGRESS_DEBLK_H 3 +#define CTB_PROGRESS_SAO 4 + +class decoder_context; + +/* Two-dimensional array of DataUnit stored row by row (index = unitX + unitY times width_in_units). get() and set() take coordinates that are shifted right by log2unitSize to obtain the unit position; operator[] takes a raw unit index. Storage comes from malloc in alloc() and is released with free in the destructor. NOTE(review): no copy constructor or copy assignment is declared, so a copied object would free the same data pointer twice -- confirm it is never copied. NOTE(review): the template parameter list is missing in this copy; DataUnit is presumably the template parameter. */ template class MetaDataArray +{ + public: + MetaDataArray() { data=NULL; data_size=0; log2unitSize=0; width_in_units=0; height_in_units=0; } + ~MetaDataArray() { free(data); } + + /* Reallocates only when w times h differs from the current data_size; otherwise the existing buffer and its contents are kept. The new buffer is not zeroed (use clear()). Returns false when malloc fails, and also whenever data is still NULL afterwards, for example for a zero-sized request on a fresh object. NOTE(review): w times h is computed in int without an overflow check. */ LIBDE265_CHECK_RESULT bool alloc(int w,int h, int _log2unitSize) { + int size = w*h; + + if (size != data_size) { + free(data); + data = (DataUnit*)malloc(size * sizeof(DataUnit)); + if (data == NULL) { + data_size = 0; + return false; + } + data_size = size; + } + + width_in_units = w; + height_in_units = h; + + log2unitSize = _log2unitSize; + + return data != NULL; + } + + void clear() { + if (data) memset(data, 0, sizeof(DataUnit) * data_size); + } + + /* Range checks in the accessors below are asserts only, so they vanish when asserts are compiled out. */ const DataUnit& get(int x,int y) const { + int unitX = x>>log2unitSize; + int unitY = y>>log2unitSize; + + assert(unitX >= 0 && unitX < width_in_units); + assert(unitY >= 0 && unitY < height_in_units); + + return data[ unitX + unitY*width_in_units ]; + } + + DataUnit& get(int x,int y) { + int unitX = x>>log2unitSize; + int unitY = y>>log2unitSize; + + assert(unitX >= 0 && unitX < width_in_units); + assert(unitY >= 0 && unitY < height_in_units); + + return data[ unitX + unitY*width_in_units ]; + } + + void set(int x,int y, const DataUnit& d) { + int unitX = x>>log2unitSize; + int unitY = y>>log2unitSize; + + 
assert(unitX >= 0 && unitX < width_in_units); + assert(unitY >= 0 && unitY < height_in_units); + + data[ unitX + unitY*width_in_units ] = d; + } + + DataUnit& operator[](int idx) { return data[idx]; } + const DataUnit& operator[](int idx) const { return data[idx]; } + + int size() const { return data_size; } + + // private: + DataUnit* data; + int data_size; + int log2unitSize; + int width_in_units; + int height_in_units; +}; + +/* NOTE(review): from the loop condition of SET_CB_BLK onwards the text is truncated in this copy: the rest of this macro, the start of the following macro and the opening of the image structure are missing. Restore from the original header before relying on anything between here and is_allocated(). */ #define SET_CB_BLK(x,y,log2BlkWidth, Field,value) \ + int cbX = x >> cb_info.log2unitSize; \ + int cbY = y >> cb_info.log2unitSize; \ + int width = 1 << (log2BlkWidth - cb_info.log2unitSize); \ + for (int cby=cbY;cby> tu_info.log2unitSize; \ + int tuY = y >> tu_info.log2unitSize; \ + int width = 1 << (log2BlkWidth - tu_info.log2unitSize); \ + for (int tuy=tuY;tuy sps, + bool allocMetadata, + decoder_context* dctx, + //class encoder_context* ectx, + de265_PTS pts, void* user_data, + bool useCustomAllocFunctions); + + //de265_error alloc_encoder_data(const seq_parameter_set* sps); + + /* The image counts as allocated when plane 0 has a non-NULL pixel pointer. */ bool is_allocated() const { return pixels[0] != NULL; } + + void release(); + + void set_headers(std::shared_ptr _vps, + std::shared_ptr _sps, + std::shared_ptr _pps) { + vps = _vps; + sps = _sps; + pps = _pps; + } + + void fill_image(int y,int u,int v); + de265_error copy_image(const de265_image* src); + void copy_lines_from(const de265_image* src, int first, int end); + void exchange_pixel_data_with(de265_image&); + + uint32_t get_ID() const { return ID; } + + + /* */ uint8_t* get_image_plane(int cIdx) { return pixels[cIdx]; } + const uint8_t* get_image_plane(int cIdx) const { return pixels[cIdx]; } + + void set_image_plane(int cIdx, uint8_t* mem, int stride, void *userdata); + + /* Adds xpos + ypos times stride directly to the byte pointer. Since the stride is counted in pixels (see get_image_stride), the result is only a valid sample address for planes with one byte per sample; the _NEW and _any_depth variants scale the offset by the sample size. */ uint8_t* get_image_plane_at_pos(int cIdx, int xpos,int ypos) + { + int stride = get_image_stride(cIdx); + return pixels[cIdx] + xpos + ypos*stride; + } + + + /// xpos;ypos in actual plane resolution + template + pixel_t* get_image_plane_at_pos_NEW(int cIdx, int xpos,int ypos) + { + 
int stride = get_image_stride(cIdx); + return (pixel_t*)(pixels[cIdx] + (xpos + ypos*stride)*sizeof(pixel_t)); + } + + const uint8_t* get_image_plane_at_pos(int cIdx, int xpos,int ypos) const + { + int stride = get_image_stride(cIdx); + return pixels[cIdx] + xpos + ypos*stride; + } + + /* Converts the pixel offset into a byte offset by shifting it left by bpp_shift[cIdx]. */ void* get_image_plane_at_pos_any_depth(int cIdx, int xpos,int ypos) + { + int stride = get_image_stride(cIdx); + return pixels[cIdx] + ((xpos + ypos*stride) << bpp_shift[cIdx]); + } + + const void* get_image_plane_at_pos_any_depth(int cIdx, int xpos,int ypos) const + { + int stride = get_image_stride(cIdx); + return pixels[cIdx] + ((xpos + ypos*stride) << bpp_shift[cIdx]); + } + + /* Number of pixels in one row (not number of bytes). + */ + int get_image_stride(int cIdx) const + { + if (cIdx==0) return stride; + else return chroma_stride; + } + + int get_luma_stride() const { return stride; } + int get_chroma_stride() const { return chroma_stride; } + + int get_width (int cIdx=0) const { return cIdx==0 ? width : chroma_width; } + int get_height(int cIdx=0) const { return cIdx==0 ? 
height : chroma_height; } + + enum de265_chroma get_chroma_format() const { return chroma_format; } + + /* Reads the bit depth from the attached SPS (BitDepth_Y for cIdx 0, BitDepth_C otherwise), not from the BitDepth_Y and BitDepth_C members declared below. The sps pointer is dereferenced without a null check. */ int get_bit_depth(int cIdx) const { + if (cIdx==0) return sps->BitDepth_Y; + else return sps->BitDepth_C; + } + + int get_bytes_per_pixel(int cIdx) const { + return (get_bit_depth(cIdx)+7)/8; + } + + bool high_bit_depth(int cIdx) const { + return get_bit_depth(cIdx)>8; + } + + bool can_be_released() const { return PicOutputFlag==false && PicState==UnusedForReference; } + + + /* Writes the position the header is about to occupy in slices into shdr->slice_index, then appends the pointer. Ownership of shdr is not visible here. */ void add_slice_segment_header(slice_segment_header* shdr) { + shdr->slice_index = slices.size(); + slices.push_back(shdr); + } + + + bool available_zscan(int xCurr,int yCurr, int xN,int yN) const; + + bool available_pred_blk(int xC,int yC, int nCbS, + int xP, int yP, int nPbW, int nPbH, int partIdx, + int xN,int yN) const; + + + static de265_image_allocation default_image_allocation; + + void printBlk(const char* title, int x0,int y0,int blkSize,int cIdx) const { + ::printBlk(title, get_image_plane_at_pos(cIdx,x0,y0), + blkSize, get_image_stride(cIdx)); + } + +private: + uint32_t ID; + static uint32_t s_next_image_ID; + + uint8_t* pixels[3]; + uint8_t bpp_shift[3]; // 0 for 8 bit, 1 for 16 bit + + enum de265_chroma chroma_format; + + int width, height; // size in luma pixels + + int chroma_width, chroma_height; + int stride, chroma_stride; + +public: + uint8_t BitDepth_Y, BitDepth_C; + uint8_t SubWidthC, SubHeightC; + /* NOTE(review): the element type of this vector is missing in this copy; the accessors above and below store and return slice_segment_header pointers. */ std::vector slices; + +public: + + // --- conformance cropping window --- + + uint8_t* pixels_confwin[3]; // pointer to pixels in the conformance window + + int width_confwin, height_confwin; + int chroma_width_confwin, chroma_height_confwin; + + // --- decoding info --- + + // If PicOutputFlag==false && PicState==UnusedForReference, image buffer is free. 
+ + int picture_order_cnt_lsb; + int PicOrderCntVal; + enum PictureState PicState; + bool PicOutputFlag; + + int32_t removed_at_picture_id; + + /* The three getters below dereference the shared pointers without a null check; test has_vps(), has_sps() or has_pps() first when the headers may not have been set. */ const video_parameter_set& get_vps() const { return *vps; } + const seq_parameter_set& get_sps() const { return *sps; } + const pic_parameter_set& get_pps() const { return *pps; } + + bool has_vps() const { return (bool)vps; } + bool has_sps() const { return (bool)sps; } + bool has_pps() const { return (bool)pps; } + + std::shared_ptr get_shared_sps() { return sps; } + + //std::shared_ptr get_shared_sps() const { return sps; } + //std::shared_ptr get_shared_pps() const { return pps; } + + decoder_context* decctx; + //class encoder_context* encctx; + + int number_of_ctbs() const { return ctb_info.size(); } + +private: + // The image also keeps a reference to VPS/SPS/PPS, because when decoding is delayed, + // the currently active parameter sets in the decctx might already have been replaced + // with new parameters. + /* NOTE(review): the template arguments of the shared_ptr and MetaDataArray members below are missing in this copy. */ std::shared_ptr vps; + std::shared_ptr sps; // the SPS used for decoding this image + std::shared_ptr pps; // the PPS used for decoding this image + + MetaDataArray ctb_info; + MetaDataArray cb_info; + MetaDataArray pb_info; + MetaDataArray intraPredMode; + MetaDataArray intraPredModeC; + MetaDataArray tu_info; + MetaDataArray deblk_info; + +public: + // --- meta information --- + + de265_PTS pts; + void* user_data; + void* plane_user_data[3]; // this is logically attached to the pixel data pointers + de265_image_allocation image_allocation_functions; // the functions used for memory allocation + + /* + void (*encoder_image_release_func)(en265_encoder_context*, + de265_image*, + void* userdata); + */ + + uint8_t integrity; /* Whether an error occurred while the image was decoded. + When generated, this is initialized to INTEGRITY_CORRECT, + and changed on decoding errors. 
+ */ + /* NOTE(review): declared bool although three SEI_HASH values (0, 1 and 2) are defined near the top of this header; a bool cannot keep the value 2 distinct from 1 -- confirm what is stored here. */ bool sei_hash_check_result; + + nal_header nal_hdr; + + // --- multi core --- + + de265_progress_lock* ctb_progress; // ctb_info_size + + /* NOTE(review): from the loop condition of mark_all_CTB_progress onwards large parts of the text are missing in this copy (several accessors run into each other). Restore from the original header; the fragments up to get_SliceHeader are not documented further. */ void mark_all_CTB_progress(int progress) { + for (int i=0;i> tu_info.log2unitSize; + const int tuY = y >> tu_info.log2unitSize; + const int width = 1 << (log2TrafoSize - tu_info.log2unitSize); + + for (int tuy=tuY;tuy>sps->Log2MinPUSize) + (y0>>sps->Log2MinPUSize)*sps->PicWidthInMinPUs; + + for (int y=0;yPicWidthInMinPUs); + assert(y < sps->PicHeightInMinPUs); + + int idx = PUidx + x + y*intraPredMode.width_in_units; + assert(idx>sps->Log2MinPUSize) + (y0>>sps->Log2MinPUSize)*sps->PicWidthInMinPUs; + + for (int y=0;yPicWidthInMinPUs); + assert(yPicHeightInMinPUs); + + int idx = PUidx + x + y*intraPredModeC.width_in_units; + assert(idx= 0 && idx < slices.size(); + } + + /* The three lookups below return NULL when the stored index is not below slices.size(). idx is a signed int compared against the unsigned size value. */ slice_segment_header* get_SliceHeader(int x, int y) + { + int idx = get_SliceHeaderIndex(x,y); + if (idx >= slices.size()) { return NULL; } + return slices[idx]; + } + + slice_segment_header* get_SliceHeaderCtb(int ctbX, int ctbY) + { + int idx = get_SliceHeaderIndexCtb(ctbX,ctbY); + if (idx >= slices.size()) { return NULL; } + return slices[idx]; + } + + const slice_segment_header* get_SliceHeaderCtb(int ctbX, int ctbY) const + { + int idx = get_SliceHeaderIndexCtb(ctbX,ctbY); + if (idx >= slices.size()) { return NULL; } + return slices[idx]; + } + + /* Copies the pointed-to sao_info by value into the entry of the given CTB. The CTB accessors from here on index ctb_info with ctbX + ctbY times width_in_units and do no range check of their own. */ void set_sao_info(int ctbX,int ctbY,const sao_info* saoinfo) + { + sao_info* sao = &ctb_info[ctbX + ctbY*ctb_info.width_in_units].saoInfo; + + memcpy(sao, + saoinfo, + sizeof(sao_info)); + } + + const sao_info* get_sao_info(int ctbX,int ctbY) const + { + return &ctb_info[ctbX + ctbY*ctb_info.width_in_units].saoInfo; + } + + + void set_CtbDeblockFlag(int ctbX, int ctbY, bool flag) + { + int idx = ctbX + ctbY*ctb_info.width_in_units; + ctb_info[idx].deblock = flag; + } + + bool get_CtbDeblockFlag(int ctbX, int ctbY) const + { + return ctb_info[ctbX + ctbY*ctb_info.width_in_units].deblock; + } + + + bool 
get_CTB_has_pcm_or_cu_transquant_bypass(int ctbX,int ctbY) const + { + int idx = ctbX + ctbY*ctb_info.width_in_units; + return ctb_info[idx].has_pcm_or_cu_transquant_bypass; + } + + + + // --- DEBLK metadata access --- + + int get_deblk_width() const { return deblk_info.width_in_units; } + int get_deblk_height() const { return deblk_info.height_in_units; } + + /* x0 and y0 are divided by 4 to obtain the position in deblk_info. NOTE(review): the text is truncated after the start of the range test; the remainder of this header and the first lines of the next file in the diff are missing in this copy. */ void set_deblk_flags(int x0,int y0, uint8_t flags) + { + const int xd = x0/4; + const int yd = y0/4; + + if (xd + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#ifndef DE265_INTRAPRED_H +#define DE265_INTRAPRED_H + +#include "libde265/decctx.h" + +/* Table with 35 entries; intra_prediction_angular() in this header indexes it with the intra prediction mode. */ extern const int intraPredAngle_table[1+34]; + + +/* Fill the three intra-pred-mode candidates into candModeList. + Block position is (x,y) and you also have to give the PUidx for this + block (which is (x>>Log2MinPUSize) + (y>>Log2MinPUSize)*PicWidthInMinPUs). + availableA/B is the output of check_CTB_available(). 
+ */ +void fillIntraPredModeCandidates(enum IntraPredMode candModeList[3], + int x,int y, int PUidx, + bool availableA, // left + bool availableB, // top + const de265_image* img); + + +/* Convenience overload: obtains PUidx from img->get_sps().getPUIndexRS(x,y) and forwards to the overload declared above. */ inline void fillIntraPredModeCandidates(enum IntraPredMode candModeList[3], int x,int y, + bool availableA, // left + bool availableB, // top + const de265_image* img) +{ + int PUidx = img->get_sps().getPUIndexRS(x,y); + fillIntraPredModeCandidates(candModeList, x,y, PUidx, availableA,availableB, img); +} + +/* Overload that takes the two candidate modes A and B directly instead of an image and a position. */ void fillIntraPredModeCandidates(enum IntraPredMode candModeList[3], + enum IntraPredMode candIntraPredModeA, + enum IntraPredMode candIntraPredModeB); + + +/* Return value >= 0 -> use mpm_idx(return value) + else -> use rem_intra(-return value-1) + + This function may modify the candModeList ! + */ +int find_intra_pred_mode(enum IntraPredMode mode, + enum IntraPredMode candModeList[3]); + +void list_chroma_pred_candidates(enum IntraPredMode chroma_mode[5], + enum IntraPredMode luma_mode); + +int get_intra_scan_idx(int log2TrafoSize, enum IntraPredMode intraPredMode, int cIdx, + const seq_parameter_set* sps); + +int get_intra_scan_idx_luma (int log2TrafoSize, enum IntraPredMode intraPredMode); // DEPRECATED +int get_intra_scan_idx_chroma(int log2TrafoSize, enum IntraPredMode intraPredMode); // DEPRECATED + +enum IntraPredMode lumaPredMode_to_chromaPredMode(enum IntraPredMode luma, + enum IntraChromaPredMode chroma); + +/* +void decode_intra_block(decoder_context* ctx, + thread_context* tctx, + int cIdx, + int xB0,int yB0, // position of TU in frame (chroma adapted) + int x0,int y0, // position of CU in frame (chroma adapted) + int log2TrafoSize, int trafoDepth, + enum IntraPredMode intraPredMode, + bool transform_skip_flag); +*/ + +//void fill_border_samples(decoder_context* ctx, int xB,int yB, +// int nT, int cIdx, uint8_t* out_border); + +void decode_intra_prediction(de265_image* img, + int xB0,int yB0, + enum IntraPredMode intraPredMode, + int nT, int cIdx); + +// TODO: 
remove this +template void decode_intra_prediction(de265_image* img, + int xB0,int yB0, + enum IntraPredMode intraPredMode, + pixel_t* dst, int nT, int cIdx); + + + + +// --- internal use only --- + +// Actually, the largest TB block can only be 32, but in some intra-pred-mode algorithms +// (e.g. min-residual), we may call intra prediction on the maximum CTB size (64). +static const int MAX_INTRA_PRED_BLOCK_SIZE = 64; + + +/* Gathers the border samples around one block into out_border, which is indexed from -2 nT to +2 nT. The members suggest the call order init(), preproc(), fill_from_image(), reference_sample_substitution() -- confirm against the caller. NOTE(review): the template parameter lists in this header are missing in this copy; pixel_t is presumably the template parameter. */ template +class intra_border_computer +{ + public: + pixel_t* out_border; + + const de265_image* img; + int nT; + int cIdx; + + int xB,yB; + + const seq_parameter_set* sps; + const pic_parameter_set* pps; + + uint8_t available_data[4*MAX_INTRA_PRED_BLOCK_SIZE + 1]; + uint8_t* available; + + int SubWidth; + int SubHeight; + + bool availableLeft; // is CTB at left side available? + bool availableTop; // is CTB at top side available? + bool availableTopRight; // is CTB at top-right side available? + bool availableTopLeft; // is CTB at top-left pixel available? + + int nBottom; + int nRight; + int nAvail; + pixel_t firstValue; + + /* Stores the arguments and sets all four availability flags to true; preproc() later clears the flags that do not hold. The remaining members (sps, pps, available, nBottom, nRight, nAvail and so on) are not set here but in preproc(). */ void init(pixel_t* _out_border, + const de265_image* _img, int _nT, int _cIdx, int _xB, int _yB) { + img=_img; nT=_nT; cIdx=_cIdx; + out_border=_out_border; xB=_xB; yB=_yB; + + assert(nT <= MAX_INTRA_PRED_BLOCK_SIZE); + + availableLeft=true; + availableTop=true; + availableTopRight=true; + availableTopLeft=true; + } + void preproc(); + void fill_from_image(); + + void reference_sample_substitution(); +}; + + +#ifdef DE265_LOG_TRACE +/* Trace helper: logs the entries -2 nT to +2 nT of data as two hex digits each, or two dashes where the availability array has a zero entry; a NULL availability pointer prints everything. Compiles to nothing when DE265_LOG_TRACE is not defined. */ template +void print_border(pixel_t* data, uint8_t* available, int nT) +{ + for (int i=-2*nT ; i<=2*nT ; i++) { + if (i==0 || i==1 || i==-nT || i==nT+1) { + logtrace(LogIntraPred,"|"); + } else { + logtrace(LogIntraPred," "); + } + + if (available==NULL || available[i]) { + logtrace(LogIntraPred,"%02x",data[i]); + } + else { + logtrace(LogIntraPred,"--"); + } + } +} +#else +#define print_border(data, available, nT) +#endif + + +// (8.4.4.2.3) +template +void 
intra_prediction_sample_filtering(const seq_parameter_set& sps, + pixel_t* p, + int nT, int cIdx, + enum IntraPredMode intraPredMode) +{ + /* Filters the border samples p[-2 nT .. 2 nT] in place when filterFlag is non-zero. NOTE(review): for an unexpected nT the default case sets filterFlag to -1; with asserts compiled out that value passes the if (filterFlag) test below, and the filter would then run on a local buffer sized for nT up to 32. */ int filterFlag; + + //printf("filtering, mode: %d\n",intraPredMode); + + if (intraPredMode==INTRA_DC || nT==4) { + filterFlag = 0; + } else { + // int-cast below prevents a typing problem that leads to wrong results when abs_value is a macro + int minDistVerHor = libde265_min( abs_value((int)intraPredMode-26), + abs_value((int)intraPredMode-10) ); + + //printf("mindist: %d\n",minDistVerHor); + + switch (nT) { + case 8: filterFlag = (minDistVerHor>7) ? 1 : 0; break; + case 16: filterFlag = (minDistVerHor>1) ? 1 : 0; break; + case 32: filterFlag = (minDistVerHor>0) ? 1 : 0; break; + // there is no official 64x64 TB block, but we call this for some intra-pred mode algorithms + // on the whole CB (2Nx2N mode for the whole CTB) + case 64: filterFlag = 0; break; + default: filterFlag = -1; assert(false); break; // should never happen + } + } + + + if (filterFlag) { + int biIntFlag = (sps.strong_intra_smoothing_enable_flag && + cIdx==0 && + nT==32 && + abs_value(p[0]+p[ 64]-2*p[ 32]) < (1<<(sps.bit_depth_luma-5)) && + abs_value(p[0]+p[-64]-2*p[-32]) < (1<<(sps.bit_depth_luma-5))) + ? 1 : 0; + + /* Sized for nT up to 32 (indices -64 to 64); the nT==64 case sets filterFlag to 0 above and does not reach this buffer. */ pixel_t pF_mem[4*32+1]; + pixel_t* pF = &pF_mem[2*32]; + + if (biIntFlag) { + pF[-2*nT] = p[-2*nT]; + pF[ 2*nT] = p[ 2*nT]; + pF[ 0] = p[ 0]; + + for (int i=1;i<=63;i++) { + pF[-i] = p[0] + ((i*(p[-64]-p[0])+32)>>6); + pF[ i] = p[0] + ((i*(p[ 64]-p[0])+32)>>6); + } + } else { + pF[-2*nT] = p[-2*nT]; + pF[ 2*nT] = p[ 2*nT]; + + for (int i=-(2*nT-1) ; i<=2*nT-1 ; i++) + { + pF[i] = (p[i+1] + 2*p[i] + p[i-1] + 2) >> 2; + } + } + + + // copy back to original array + + memcpy(p-2*nT, pF-2*nT, (4*nT+1) * sizeof(pixel_t)); + } + else { + // do nothing ? 
+ } + + + logtrace(LogIntraPred,"post filtering: "); + print_border(p,NULL,nT); + logtrace(LogIntraPred,"\n"); +} + + +/* NOTE(review): the bodies of intra_prediction_planar and intra_prediction_DC, and several loops of intra_prediction_angular, are truncated in this copy (loop conditions and the statements after them are missing). Restore from the original header; only the visible parts are commented. */ template +void intra_prediction_planar(pixel_t* dst, int dstStride, + int nT,int cIdx, + pixel_t* border) +{ + int Log2_nT = Log2(nT); + + for (int y=0;y> (Log2_nT+1); + } + + + logtrace(LogIntraPred,"result of planar prediction\n"); + + for (int y=0;y +void intra_prediction_DC(pixel_t* dst, int dstStride, + int nT,int cIdx, + pixel_t* border) +{ + int Log2_nT = Log2(nT); + + int dcVal = 0; + for (int i=0;i>= Log2_nT+1; + + if (cIdx==0 && nT<32) { + dst[0] = (border[-1] + 2*dcVal + border[1] +2) >> 2; + + for (int x=1;x>2; } + for (int y=1;y>2; } + for (int y=1;y +void intra_prediction_angular(pixel_t* dst, int dstStride, + int bit_depth, bool disableIntraBoundaryFilter, + int xB0,int yB0, + enum IntraPredMode intraPredMode, + int nT,int cIdx, + pixel_t* border) +{ + pixel_t ref_mem[4*MAX_INTRA_PRED_BLOCK_SIZE+1]; // TODO: what is the required range here ? + pixel_t* ref=&ref_mem[2*MAX_INTRA_PRED_BLOCK_SIZE]; + + assert(intraPredMode<35); + assert(intraPredMode>=2); + + int intraPredAngle = intraPredAngle_table[intraPredMode]; + + if (intraPredMode >= 18) { + + for (int x=0;x<=nT;x++) + { ref[x] = border[x]; } + + if (intraPredAngle<0) { + int invAngle = invAngle_table[intraPredMode-11]; + + if ((nT*intraPredAngle)>>5 < -1) { + for (int x=(nT*intraPredAngle)>>5; x<=-1; x++) { + ref[x] = border[0-((x*invAngle+128)>>8)]; + } + } + } else { + for (int x=nT+1; x<=2*nT;x++) { + ref[x] = border[x]; + } + } + + for (int y=0;y>5; + int iFact= ((y+1)*intraPredAngle)&31; + + if (iFact != 0) { + dst[x+y*dstStride] = ((32-iFact)*ref[x+iIdx+1] + iFact*ref[x+iIdx+2] + 16)>>5; + } else { + dst[x+y*dstStride] = ref[x+iIdx+1]; + } + } + + if (intraPredMode==26 && cIdx==0 && nT<32 && !disableIntraBoundaryFilter) { + for (int y=0;y>1), bit_depth); + } + } + } + else { // intraPredMode < 18 + + for (int x=0;x<=nT;x++) + { ref[x] = border[-x]; } // DIFF (neg) + + if 
(intraPredAngle<0) { + int invAngle = invAngle_table[intraPredMode-11]; + + if ((nT*intraPredAngle)>>5 < -1) { + for (int x=(nT*intraPredAngle)>>5; x<=-1; x++) { + ref[x] = border[((x*invAngle+128)>>8)]; // DIFF (neg) + } + } + } else { + for (int x=nT+1; x<=2*nT;x++) { + ref[x] = border[-x]; // DIFF (neg) + } + } + + for (int y=0;y>5; // DIFF (x<->y) + int iFact= ((x+1)*intraPredAngle)&31; // DIFF (x<->y) + + if (iFact != 0) { + dst[x+y*dstStride] = ((32-iFact)*ref[y+iIdx+1] + iFact*ref[y+iIdx+2] + 16)>>5; // DIFF (x<->y) + } else { + dst[x+y*dstStride] = ref[y+iIdx+1]; // DIFF (x<->y) + } + } + + if (intraPredMode==10 && cIdx==0 && nT<32 && !disableIntraBoundaryFilter) { // DIFF 26->10 + for (int x=0;xy) + dst[x] = Clip_BitDepth(border[-1] + ((border[1+x] - border[0])>>1), bit_depth); // DIFF (x<->y && neg) + } + } + } + + + logtrace(LogIntraPred,"result of angular intra prediction (mode=%d):\n",intraPredMode); + + for (int y=0;y +/* Takes sps and pps from the image, derives the subsampling factors for the plane, clears the availability flags for neighbours that lie outside the picture or whose CTB has a different slice address or tile id than the current CTB, computes nBottom and nRight (both capped at 2 nT), and resets nAvail and the availability array. */ void intra_border_computer::preproc() +{ + sps = &img->get_sps(); + pps = &img->get_pps(); + + SubWidth = (cIdx==0) ? 1 : sps->SubWidthC; + SubHeight = (cIdx==0) ? 
1 : sps->SubHeightC; + + // --- check for CTB boundaries --- + + int xBLuma = xB * SubWidth; + int yBLuma = yB * SubHeight; + + int log2CtbSize = sps->Log2CtbSizeY; + int picWidthInCtbs = sps->PicWidthInCtbsY; + + + //printf("xB/yB: %d %d\n",xB,yB); + + // are we at left image border + + if (xBLuma == 0) { + availableLeft = false; + availableTopLeft = false; + xBLuma = 0; // fake value, available flags are already set to false + } + + + // are we at top image border + + if (yBLuma == 0) { + availableTop = false; + availableTopLeft = false; + availableTopRight = false; + yBLuma = 0; // fake value, available flags are already set to false + } + + if (xBLuma+nT*SubWidth >= sps->pic_width_in_luma_samples) { + availableTopRight=false; + } + + // check for tile and slice boundaries + + int xCurrCtb = xBLuma >> log2CtbSize; + int yCurrCtb = yBLuma >> log2CtbSize; + int xLeftCtb = (xBLuma-1) >> log2CtbSize; + int xRightCtb = (xBLuma+nT*SubWidth) >> log2CtbSize; + int yTopCtb = (yBLuma-1) >> log2CtbSize; + + int currCTBSlice = img->get_SliceAddrRS(xCurrCtb,yCurrCtb); + int leftCTBSlice = availableLeft ? img->get_SliceAddrRS(xLeftCtb, yCurrCtb) : -1; + int topCTBSlice = availableTop ? img->get_SliceAddrRS(xCurrCtb, yTopCtb) : -1; + int toprightCTBSlice = availableTopRight ? img->get_SliceAddrRS(xRightCtb, yTopCtb) : -1; + int topleftCTBSlice = availableTopLeft ? img->get_SliceAddrRS(xLeftCtb, yTopCtb) : -1; + + /* + printf("size: %d\n",pps->TileIdRS.size()); + printf("curr: %d left: %d top: %d\n", + xCurrCtb+yCurrCtb*picWidthInCtbs, + availableLeft ? xLeftCtb+yCurrCtb*picWidthInCtbs : 9999, + availableTop ? xCurrCtb+yTopCtb*picWidthInCtbs : 9999); + */ + int currCTBTileID = pps->TileIdRS[xCurrCtb+yCurrCtb*picWidthInCtbs]; + int leftCTBTileID = availableLeft ? pps->TileIdRS[xLeftCtb+yCurrCtb*picWidthInCtbs] : -1; + int topCTBTileID = availableTop ? pps->TileIdRS[xCurrCtb+yTopCtb*picWidthInCtbs] : -1; + int topleftCTBTileID = availableTopLeft ? 
pps->TileIdRS[xLeftCtb+yTopCtb*picWidthInCtbs] : -1; + int toprightCTBTileID= availableTopRight? pps->TileIdRS[xRightCtb+yTopCtb*picWidthInCtbs] : -1; + + if (leftCTBSlice != currCTBSlice || leftCTBTileID != currCTBTileID ) availableLeft = false; + if (topCTBSlice != currCTBSlice || topCTBTileID != currCTBTileID ) availableTop = false; + if (topleftCTBSlice !=currCTBSlice||topleftCTBTileID!=currCTBTileID ) availableTopLeft = false; + if (toprightCTBSlice!=currCTBSlice||toprightCTBTileID!=currCTBTileID) availableTopRight= false; + + + // number of pixels that are in the valid image area to the right and to the bottom + + nBottom = sps->pic_height_in_luma_samples - yB*SubHeight; + nBottom=(nBottom+SubHeight-1)/SubHeight; + if (nBottom>2*nT) nBottom=2*nT; + + nRight = sps->pic_width_in_luma_samples - xB*SubWidth; + nRight =(nRight +SubWidth-1)/SubWidth; + if (nRight >2*nT) nRight=2*nT; + + nAvail=0; + + available = &available_data[2*MAX_INTRA_PRED_BLOCK_SIZE]; + + memset(available-2*nT, 0, 4*nT+1); +} + + +template +void intra_border_computer::fill_from_image() +{ + /* NOTE(review): this assert is stricter than the one in init(), which accepts nT up to MAX_INTRA_PRED_BLOCK_SIZE (64) -- confirm which limit is intended. The function copies the left column, the top-left sample and the top row from the image plane in steps of four samples, marking each copied entry in the availability array and counting it in nAvail. */ assert(nT<=32); + + pixel_t* image; + int stride; + image = (pixel_t*)img->get_image_plane(cIdx); + stride = img->get_image_stride(cIdx); + + int xBLuma = xB * SubWidth; + int yBLuma = yB * SubHeight; + + int currBlockAddr = pps->MinTbAddrZS[ (xBLuma>>sps->Log2MinTrafoSize) + + (yBLuma>>sps->Log2MinTrafoSize) * sps->PicWidthInTbsY ]; + + + // copy pixels at left column + + for (int y=nBottom-1 ; y>=0 ; y-=4) + if (availableLeft) + { + int NBlockAddr = pps->MinTbAddrZS[ (((xB-1)*SubWidth )>>sps->Log2MinTrafoSize) + + (((yB+y)*SubHeight)>>sps->Log2MinTrafoSize) + * sps->PicWidthInTbsY ]; + + bool availableN = NBlockAddr <= currBlockAddr; + + if (pps->constrained_intra_pred_flag) { + if (img->get_pred_mode((xB-1)*SubWidth,(yB+y)*SubHeight)!=MODE_INTRA) + availableN = false; + } + + if (availableN) { + if (!nAvail) firstValue = image[xB-1 + (yB+y)*stride]; + + for (int i=0;i<4;i++) { + available[-y+i-1] = 
availableN; + out_border[-y+i-1] = image[xB-1 + (yB+y-i)*stride]; + } + + nAvail+=4; + } + } + + // copy pixel at top-left position + + if (availableTopLeft) + { + int NBlockAddr = pps->MinTbAddrZS[ (((xB-1)*SubWidth )>>sps->Log2MinTrafoSize) + + (((yB-1)*SubHeight)>>sps->Log2MinTrafoSize) + * sps->PicWidthInTbsY ]; + + bool availableN = NBlockAddr <= currBlockAddr; + + if (pps->constrained_intra_pred_flag) { + if (img->get_pred_mode((xB-1)*SubWidth,(yB-1)*SubHeight)!=MODE_INTRA) { + availableN = false; + } + } + + if (availableN) { + if (!nAvail) firstValue = image[xB-1 + (yB-1)*stride]; + + out_border[0] = image[xB-1 + (yB-1)*stride]; + available[0] = availableN; + nAvail++; + } + } + + // copy pixels at top row + + for (int x=0 ; xMinTbAddrZS[ (((xB+x)*SubWidth )>>sps->Log2MinTrafoSize) + + (((yB-1)*SubHeight)>>sps->Log2MinTrafoSize) + * sps->PicWidthInTbsY ]; + + bool availableN = NBlockAddr <= currBlockAddr; + + if (pps->constrained_intra_pred_flag) { + if (img->get_pred_mode((xB+x)*SubWidth,(yB-1)*SubHeight)!=MODE_INTRA) { + availableN = false; + } + } + + + if (availableN) { + if (!nAvail) firstValue = image[xB+x + (yB-1)*stride]; + + for (int i=0;i<4;i++) { + out_border[x+i+1] = image[xB+x+i + (yB-1)*stride]; + available[x+i+1] = availableN; + } + + nAvail+=4; + } + } + } +} + + + +template +void intra_border_computer::reference_sample_substitution() +{ + // reference sample substitution + + /* Runs only when fewer than 4 nT + 1 samples were marked available. With no available sample at all, every border entry is set to 1 shifted left by (bit_depth - 1). Otherwise entry -2 nT takes firstValue if it is unavailable, and every later unavailable entry copies the entry before it. */ const int bit_depth = img->get_bit_depth(cIdx); + + if (nAvail!=4*nT+1) { + if (nAvail==0) { + if (sizeof(pixel_t)==1) { + memset(out_border-2*nT, 1<<(bit_depth-1), 4*nT+1); + } + else { + for (int i = -2*nT; i <= 2*nT ; i++) { + out_border[i] = 1<<(bit_depth-1); + } + } + } + else { + if (!available[-2*nT]) { + out_border[-2*nT] = firstValue; + } + + for (int i=-2*nT+1; i<=2*nT; i++) + if (!available[i]) { + out_border[i]=out_border[i-1]; + } + } + } + + logtrace(LogIntraPred,"availableN: "); + print_border(available,NULL,nT); + logtrace(LogIntraPred,"\n"); + + 
logtrace(LogIntraPred,"output: "); + print_border(out_border,NULL,nT); + logtrace(LogIntraPred,"\n"); +} + + +#endif diff --git a/libde265/md5.h b/libde265/md5.h new file mode 100644 index 0000000..f1a6857 --- /dev/null +++ b/libde265/md5.h @@ -0,0 +1,45 @@ +/* + * This is an OpenSSL-compatible implementation of the RSA Data Security, Inc. + * MD5 Message-Digest Algorithm (RFC 1321). + * + * Homepage: + * http://openwall.info/wiki/people/solar/software/public-domain-source-code/md5 + * + * Author: + * Alexander Peslyak, better known as Solar Designer + * + * This software was written by Alexander Peslyak in 2001. No copyright is + * claimed, and the software is hereby placed in the public domain. + * In case this attempt to disclaim copyright and place the software in the + * public domain is deemed null and void, then the software is + * Copyright (c) 2001 Alexander Peslyak and it is hereby released to the + * general public under the following terms: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted. + * + * There's ABSOLUTELY NO WARRANTY, express or implied. + * + * See md5.c for more information. + */ + +#ifdef HAVE_OPENSSL +#include +#elif !defined(_MD5_H) +#define _MD5_H + +/* Any 32-bit or wider unsigned integer data type will do */ +typedef unsigned int MD5_u32plus; + +typedef struct { + MD5_u32plus lo, hi; + MD5_u32plus a, b, c, d; + unsigned char buffer[64]; + MD5_u32plus block[16]; +} MD5_CTX; + +/* Init, Update and Final entry points working on an MD5_CTX; these declarations are only used when HAVE_OPENSSL is not defined. NOTE(review): the guard name _MD5_H starts with an underscore followed by a capital letter, a form that C and C++ reserve for the implementation. */ extern void MD5_Init(MD5_CTX *ctx); +extern void MD5_Update(MD5_CTX *ctx, void *data, unsigned long size); +extern void MD5_Final(unsigned char *result, MD5_CTX *ctx); + +#endif diff --git a/libde265/motion.h b/libde265/motion.h new file mode 100644 index 0000000..12d7791 --- /dev/null +++ b/libde265/motion.h @@ -0,0 +1,131 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. 
+ * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#ifndef DE265_MOTION_H +#define DE265_MOTION_H + +#include +#include "slice.h" + +class base_context; +class slice_segment_header; + +class MotionVector +{ + public: + int16_t x,y; +}; + + +class PBMotion +{ + public: + uint8_t predFlag[2]; // which of the two vectors is actually used + int8_t refIdx[2]; // index into RefPicList + MotionVector mv[2]; // the absolute motion vectors + + bool operator==(const PBMotion&) const; +}; + + +class PBMotionCoding +{ + public: + // index into RefPicList + int8_t refIdx[2]; + + // motion vector difference + int16_t mvd[2][2]; // [L0/L1][x/y] (only in top left position - ???) 
+ + // enum InterPredIdc, whether this is prediction from L0,L1, or BI + uint8_t inter_pred_idc : 2; + + // which of the two MVPs is used + uint8_t mvp_l0_flag : 1; + uint8_t mvp_l1_flag : 1; + + // whether merge mode is used + uint8_t merge_flag : 1; + uint8_t merge_idx : 3; +}; + + +void get_merge_candidate_list(base_context* ctx, + const slice_segment_header* shdr, + struct de265_image* img, + int xC,int yC, int xP,int yP, + int nCS, int nPbW,int nPbH, int partIdx, + PBMotion* mergeCandList); + +/* +int derive_spatial_merging_candidates(const struct de265_image* img, + int xC, int yC, int nCS, int xP, int yP, + uint8_t singleMCLFlag, + int nPbW, int nPbH, + int partIdx, + MotionVectorSpec* out_cand, + int maxCandidates); +*/ + +void generate_inter_prediction_samples(base_context* ctx, + const slice_segment_header* shdr, + struct de265_image* img, + int xC,int yC, + int xB,int yB, + int nCS, int nPbW,int nPbH, + const PBMotion* vi); + + +/* Fill list (two entries) of motion-vector predictors for MVD coding. 
+ */ +void fill_luma_motion_vector_predictors(base_context* ctx, + const slice_segment_header* shdr, + de265_image* img, + int xC,int yC,int nCS,int xP,int yP, + int nPbW,int nPbH, int l, + int refIdx, int partIdx, + MotionVector out_mvpList[2]); + + +void decode_prediction_unit(base_context* ctx,const slice_segment_header* shdr, + de265_image* img, const PBMotionCoding& motion, + int xC,int yC, int xB,int yB, int nCS, int nPbW,int nPbH, int partIdx); + + + + +class MotionVectorAccess +{ +public: + virtual enum PartMode get_PartMode(int x,int y) const = 0; + virtual const PBMotion& get_mv_info(int x,int y) const = 0; +}; + + +void get_merge_candidate_list_without_step_9(base_context* ctx, + const slice_segment_header* shdr, + const MotionVectorAccess& mvaccess, + de265_image* img, + int xC,int yC, int xP,int yP, + int nCS, int nPbW,int nPbH, int partIdx, + int max_merge_idx, + PBMotion* mergeCandList); + +#endif diff --git a/libde265/nal-parser.h b/libde265/nal-parser.h new file mode 100644 index 0000000..a63a7fd --- /dev/null +++ b/libde265/nal-parser.h @@ -0,0 +1,154 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#ifndef DE265_NAL_PARSER_H +#define DE265_NAL_PARSER_H + +#include "libde265/sps.h" +#include "libde265/pps.h" +#include "libde265/nal.h" +#include "libde265/util.h" + +#include +#include + +#define DE265_NAL_FREE_LIST_SIZE 16 +#define DE265_SKIPPED_BYTES_INITIAL_SIZE 16 + + +class NAL_unit { + public: + NAL_unit(); + ~NAL_unit(); + + nal_header header; + + de265_PTS pts; + void* user_data; + + + void clear(); + + // --- rbsp data --- + + LIBDE265_CHECK_RESULT bool resize(int new_size); + LIBDE265_CHECK_RESULT bool append(const unsigned char* data, int n); + LIBDE265_CHECK_RESULT bool set_data(const unsigned char* data, int n); + + int size() const { return data_size; } + void set_size(int s) { data_size=s; } + unsigned char* data() { return nal_data; } + const unsigned char* data() const { return nal_data; } + + + // --- skipped stuffing bytes --- + + int num_skipped_bytes_before(int byte_position, int headerLength) const; + int num_skipped_bytes() const { return skipped_bytes.size(); } + + //void clear_skipped_bytes() { skipped_bytes.clear(); } + + /* Mark a byte as skipped. It is assumed that the byte is already removed + from the input data. The NAL data is not modified. + */ + void insert_skipped_byte(int pos); + + /* Remove all stuffing bytes from NAL data. The NAL data is modified and + the removed bytes are marked as skipped bytes. 
+ */ + void remove_stuffing_bytes(); + + private: + unsigned char* nal_data; + int data_size; + int capacity; + + std::vector skipped_bytes; // up to position[x], there were 'x' skipped bytes +}; + + +class NAL_Parser +{ + public: + NAL_Parser(); + ~NAL_Parser(); + + de265_error push_data(const unsigned char* data, int len, + de265_PTS pts, void* user_data = NULL); + + de265_error push_NAL(const unsigned char* data, int len, + de265_PTS pts, void* user_data = NULL); + + NAL_unit* pop_from_NAL_queue(); + de265_error flush_data(); + void mark_end_of_stream() { end_of_stream=true; } + void mark_end_of_frame() { end_of_frame=true; } + void remove_pending_input_data(); + + int bytes_in_input_queue() const { + int size = nBytes_in_NAL_queue; + if (pending_input_NAL) { size += pending_input_NAL->size(); } + return size; + } + + int number_of_NAL_units_pending() const { + int size = NAL_queue.size(); + if (pending_input_NAL) { size++; } + return size; + } + + int number_of_complete_NAL_units_pending() const { + return NAL_queue.size(); + } + + void free_NAL_unit(NAL_unit*); + + + int get_NAL_queue_length() const { return NAL_queue.size(); } + bool is_end_of_stream() const { return end_of_stream; } + bool is_end_of_frame() const { return end_of_frame; } + + private: + // byte-stream level + + bool end_of_stream; // data in pending_input_data is end of stream + bool end_of_frame; // data in pending_input_data is end of frame + int input_push_state; + + NAL_unit* pending_input_NAL; + + + // NAL level + + std::queue NAL_queue; // enqueued NALs have stuffing bytes removed + int nBytes_in_NAL_queue; // data bytes currently in NAL_queue + + void push_to_NAL_queue(NAL_unit*); + + + // pool of unused NAL memory + + std::vector NAL_free_list; // maximum size: DE265_NAL_FREE_LIST_SIZE + + LIBDE265_CHECK_RESULT NAL_unit* alloc_NAL_unit(int size); +}; + + +#endif diff --git a/libde265/nal.h b/libde265/nal.h new file mode 100644 index 0000000..2bd85db --- /dev/null +++ b/libde265/nal.h
@@ -0,0 +1,129 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#ifndef DE265_NAL_H +#define DE265_NAL_H + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#ifdef HAVE_STDBOOL_H +#include +#endif + +#include "libde265/bitstream.h" +#include "libde265/cabac.h" + +struct nal_header { + nal_header() { + nal_unit_type = 0; + nuh_layer_id = 0; + nuh_temporal_id = 0; + } + + void read(bitreader* reader); + void write(CABAC_encoder& writer) const; + + void set(int unit_type, int layer_id=0, int temporal_id=0) { + nal_unit_type =unit_type; + nuh_layer_id =layer_id; + nuh_temporal_id=temporal_id; + } + + uint8_t nal_unit_type; + uint8_t nuh_layer_id; + uint8_t nuh_temporal_id; +}; + +#define NAL_UNIT_TRAIL_N 0 +#define NAL_UNIT_TRAIL_R 1 +#define NAL_UNIT_TSA_N 2 +#define NAL_UNIT_TSA_R 3 +#define NAL_UNIT_STSA_N 4 +#define NAL_UNIT_STSA_R 5 +#define NAL_UNIT_RADL_N 6 +#define NAL_UNIT_RADL_R 7 +#define NAL_UNIT_RASL_N 8 +#define NAL_UNIT_RASL_R 9 +#define NAL_UNIT_RESERVED_VCL_N10 10 +#define NAL_UNIT_RESERVED_VCL_N12 12 +#define NAL_UNIT_RESERVED_VCL_N14 14 +#define NAL_UNIT_RESERVED_VCL_R11 11 +#define NAL_UNIT_RESERVED_VCL_R13 13 +#define NAL_UNIT_RESERVED_VCL_R15 15 +#define NAL_UNIT_BLA_W_LP 16 // BLA = broken link access +#define NAL_UNIT_BLA_W_RADL 17 
+#define NAL_UNIT_BLA_N_LP 18 +#define NAL_UNIT_IDR_W_RADL 19 +#define NAL_UNIT_IDR_N_LP 20 +#define NAL_UNIT_CRA_NUT 21 // CRA = clean random access +#define NAL_UNIT_RESERVED_IRAP_VCL22 22 +#define NAL_UNIT_RESERVED_IRAP_VCL23 23 +#define NAL_UNIT_RESERVED_VCL24 24 +#define NAL_UNIT_RESERVED_VCL25 25 +#define NAL_UNIT_RESERVED_VCL26 26 +#define NAL_UNIT_RESERVED_VCL27 27 +#define NAL_UNIT_RESERVED_VCL28 28 +#define NAL_UNIT_RESERVED_VCL29 29 +#define NAL_UNIT_RESERVED_VCL30 30 +#define NAL_UNIT_RESERVED_VCL31 31 +#define NAL_UNIT_VPS_NUT 32 +#define NAL_UNIT_SPS_NUT 33 +#define NAL_UNIT_PPS_NUT 34 +#define NAL_UNIT_AUD_NUT 35 +#define NAL_UNIT_EOS_NUT 36 +#define NAL_UNIT_EOB_NUT 37 +#define NAL_UNIT_FD_NUT 38 +#define NAL_UNIT_PREFIX_SEI_NUT 39 +#define NAL_UNIT_SUFFIX_SEI_NUT 40 +#define NAL_UNIT_RESERVED_NVCL41 41 +#define NAL_UNIT_RESERVED_NVCL42 42 +#define NAL_UNIT_RESERVED_NVCL43 43 +#define NAL_UNIT_RESERVED_NVCL44 44 +#define NAL_UNIT_RESERVED_NVCL45 45 +#define NAL_UNIT_RESERVED_NVCL46 46 +#define NAL_UNIT_RESERVED_NVCL47 47 + +#define NAL_UNIT_UNDEFINED 255 + +bool isIDR(uint8_t unit_type); +bool isBLA(uint8_t unit_type); +bool isCRA(uint8_t unit_type); +bool isRAP(uint8_t unit_type); +bool isRASL(uint8_t unit_type); +bool isIRAP(uint8_t unit_type); +bool isRADL(uint8_t unit_type); +bool isReferenceNALU(uint8_t unit_type); +bool isSublayerNonReference(uint8_t unit_type); + +const char* get_NAL_name(uint8_t unit_type); + +inline bool isIdrPic(uint8_t nal_unit_type) { + return (nal_unit_type == NAL_UNIT_IDR_W_RADL || + nal_unit_type == NAL_UNIT_IDR_N_LP); +} + +inline bool isRapPic(uint8_t nal_unit_type) { + return nal_unit_type >= 16 && nal_unit_type <= 23; +} + +#endif diff --git a/libde265/pps.h b/libde265/pps.h new file mode 100644 index 0000000..81ff1f6 --- /dev/null +++ b/libde265/pps.h @@ -0,0 +1,163 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. 
+ * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#ifndef DE265_PPS_H +#define DE265_PPS_H + +#include "libde265/bitstream.h" +#include "libde265/sps.h" // for scaling list only + +#include +#include + +#define DE265_MAX_TILE_COLUMNS 10 +#define DE265_MAX_TILE_ROWS 10 + +class decoder_context; +class pic_parameter_set; + + +class pps_range_extension +{ + public: + pps_range_extension() { reset(); } + + void reset(); + + bool read(bitreader*, decoder_context*, const pic_parameter_set*); + void dump(int fd) const; + + uint8_t log2_max_transform_skip_block_size; + bool cross_component_prediction_enabled_flag; + bool chroma_qp_offset_list_enabled_flag; + uint8_t diff_cu_chroma_qp_offset_depth; + uint8_t chroma_qp_offset_list_len; + int8_t cb_qp_offset_list[6]; + int8_t cr_qp_offset_list[6]; + uint8_t log2_sao_offset_scale_luma; + uint8_t log2_sao_offset_scale_chroma; +}; + + +class pic_parameter_set { +public: + pic_parameter_set(); + ~pic_parameter_set(); + + void reset() { set_defaults(); } + bool read(bitreader*, decoder_context*); + bool write(error_queue*, CABAC_encoder&, + const seq_parameter_set* sps); + + bool is_tile_start_CTB(int ctbX,int ctbY) const; + void dump(int fd) const; + + + void set_defaults(enum PresetSet = Preset_Default); + + bool pps_read; // whether this pps has been read from bitstream + std::shared_ptr sps; + + + char pic_parameter_set_id; + char 
seq_parameter_set_id; + char dependent_slice_segments_enabled_flag; + char sign_data_hiding_flag; + char cabac_init_present_flag; + char num_ref_idx_l0_default_active; // [1;16] + char num_ref_idx_l1_default_active; // [1;16] + + int pic_init_qp; + char constrained_intra_pred_flag; + char transform_skip_enabled_flag; + + // --- QP --- + + char cu_qp_delta_enabled_flag; + int diff_cu_qp_delta_depth; // [ 0 ; log2_diff_max_min_luma_coding_block_size ] + + int pic_cb_qp_offset; + int pic_cr_qp_offset; + char pps_slice_chroma_qp_offsets_present_flag; + + + char weighted_pred_flag; + char weighted_bipred_flag; + char output_flag_present_flag; + char transquant_bypass_enable_flag; + char entropy_coding_sync_enabled_flag; + + + // --- tiles --- + + char tiles_enabled_flag; + int num_tile_columns; // [1;PicWidthInCtbsY] + int num_tile_rows; // [1;PicHeightInCtbsY] + char uniform_spacing_flag; + + + // --- --- + + char loop_filter_across_tiles_enabled_flag; + char pps_loop_filter_across_slices_enabled_flag; + char deblocking_filter_control_present_flag; + + char deblocking_filter_override_enabled_flag; + char pic_disable_deblocking_filter_flag; + + int beta_offset; + int tc_offset; + + char pic_scaling_list_data_present_flag; + struct scaling_list_data scaling_list; // contains valid data if sps->scaling_list_enabled_flag set + + char lists_modification_present_flag; + int log2_parallel_merge_level; // [2 ; log2(max CB size)] + char num_extra_slice_header_bits; + char slice_segment_header_extension_present_flag; + char pps_extension_flag; + char pps_range_extension_flag; + char pps_multilayer_extension_flag; + char pps_extension_6bits; + + pps_range_extension range_extension; + + + // --- derived values --- + + int Log2MinCuQpDeltaSize; + int Log2MinCuChromaQpOffsetSize; + int Log2MaxTransformSkipSize; + + int colWidth [ DE265_MAX_TILE_COLUMNS ]; + int rowHeight[ DE265_MAX_TILE_ROWS ]; + int colBd [ DE265_MAX_TILE_COLUMNS+1 ]; + int rowBd [ DE265_MAX_TILE_ROWS+1 ]; + + 
std::vector CtbAddrRStoTS; // #CTBs + std::vector CtbAddrTStoRS; // #CTBs + std::vector TileId; // #CTBs // index in tile-scan order + std::vector TileIdRS; // #CTBs // index in raster-scan order + std::vector MinTbAddrZS; // #TBs [x + y*PicWidthInTbsY] + + void set_derived_values(const seq_parameter_set* sps); +}; + +#endif diff --git a/libde265/quality.h b/libde265/quality.h new file mode 100644 index 0000000..7073d14 --- /dev/null +++ b/libde265/quality.h @@ -0,0 +1,47 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#ifndef DE265_QUALITY_H +#define DE265_QUALITY_H + +#include +#include +#include + + +LIBDE265_API uint32_t SSD(const uint8_t* img, int imgStride, + const uint8_t* ref, int refStride, + int width, int height); + +LIBDE265_API uint32_t SAD(const uint8_t* img, int imgStride, + const uint8_t* ref, int refStride, + int width, int height); + +LIBDE265_API double MSE(const uint8_t* img, int imgStride, + const uint8_t* ref, int refStride, + int width, int height); + +LIBDE265_API double PSNR(double mse); + + +LIBDE265_API uint32_t compute_distortion_ssd(const de265_image* img1, const de265_image* img2, + int x0, int y0, int log2size, int cIdx); + +#endif diff --git a/libde265/refpic.h b/libde265/refpic.h new file mode 100644 index 0000000..2904197 --- /dev/null +++ b/libde265/refpic.h @@ -0,0 +1,61 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#ifndef DE265_REFPIC_H +#define DE265_REFPIC_H + +#include "libde265/bitstream.h" + +#define MAX_NUM_REF_PICS 16 // maximum defined by standard, may be lower for some Levels + + +class ref_pic_set +{ + public: + // Lists of pictures that have to be kept in the decoded picture buffer for future + // reference and that may optionally be used for prediction in the current frame. 
+ // Lists contain the relative POC positions. + int16_t DeltaPocS0[MAX_NUM_REF_PICS]; // sorted in decreasing order (e.g. -1, -2, -4, -7, ...) + int16_t DeltaPocS1[MAX_NUM_REF_PICS]; // sorted in ascending order (e.g. 1, 2, 4, 7) + + // flag for each reference whether this is actually used for prediction in the current frame + uint8_t UsedByCurrPicS0[MAX_NUM_REF_PICS]; + uint8_t UsedByCurrPicS1[MAX_NUM_REF_PICS]; + + uint8_t NumNegativePics; // number of past reference pictures + uint8_t NumPositivePics; // number of future reference pictures + + // --- derived values --- + + void compute_derived_values(); + + uint8_t NumDeltaPocs; // total number of reference pictures (past + future) + + uint8_t NumPocTotalCurr_shortterm_only; /* Total number of reference pictures that may actually + be used for prediction in the current frame. */ + + void reset(); +}; + + +void dump_short_term_ref_pic_set(const ref_pic_set*, FILE* fh); +void dump_compact_short_term_ref_pic_set(const ref_pic_set* set, int range, FILE* fh); + +#endif diff --git a/libde265/sao.h b/libde265/sao.h new file mode 100644 index 0000000..bb9e08c --- /dev/null +++ b/libde265/sao.h @@ -0,0 +1,36 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#ifndef DE265_SAO_H +#define DE265_SAO_H + +#include "libde265/decctx.h" + +void apply_sample_adaptive_offset(de265_image* img); + +/* requires less memory than the function above */ +void apply_sample_adaptive_offset_sequential(de265_image* img); + +/* saoInputProgress - the CTB progress that SAO will wait for before beginning processing. + Returns 'true' if any tasks have been added. + */ +bool add_sao_tasks(image_unit* imgunit, int saoInputProgress); + +#endif diff --git a/libde265/scan.h b/libde265/scan.h new file mode 100644 index 0000000..7a8b977 --- /dev/null +++ b/libde265/scan.h @@ -0,0 +1,43 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#ifndef DE265_SCAN_H +#define DE265_SCAN_H + +#include + +typedef struct { + uint8_t x,y; +} position; + +typedef struct { + uint8_t subBlock; + uint8_t scanPos; +} scan_position; + +void init_scan_orders(); + +/* scanIdx: 0 - diag, 1 - horiz, 2 - verti + */ +const position* get_scan_order(int log2BlockSize, int scanIdx); + +scan_position get_scan_position(int x,int y, int scanIdx, int log2BlkSize); + +#endif diff --git a/libde265/sei.h b/libde265/sei.h new file mode 100644 index 0000000..fd615d5 --- /dev/null +++ b/libde265/sei.h @@ -0,0 +1,89 @@ +/* + * H.265 video codec. 
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#ifndef DE265_SEI_H +#define DE265_SEI_H + +#include "libde265/bitstream.h" +#include "libde265/de265.h" + + +enum sei_payload_type { + sei_payload_type_buffering_period = 0, + sei_payload_type_pic_timing = 1, + sei_payload_type_pan_scan_rect = 2, + sei_payload_type_filler_payload = 3, + sei_payload_type_user_data_registered_itu_t_t35 = 4, + sei_payload_type_user_data_unregistered = 5, + sei_payload_type_recovery_point = 6, + sei_payload_type_scene_info = 9, + sei_payload_type_picture_snapshot = 15, + sei_payload_type_progressive_refinement_segment_start = 16, + sei_payload_type_progressive_refinement_segment_end = 17, + sei_payload_type_film_grain_characteristics = 19, + sei_payload_type_post_filter_hint = 22, + sei_payload_type_tone_mapping_info = 23, + sei_payload_type_frame_packing_arrangement = 45, + sei_payload_type_display_orientation = 47, + sei_payload_type_structure_of_pictures_info = 128, + sei_payload_type_active_parameter_sets = 129, + sei_payload_type_decoding_unit_info = 130, + sei_payload_type_temporal_sub_layer_zero_index = 131, + sei_payload_type_decoded_picture_hash = 132, + sei_payload_type_scalable_nesting = 133, + sei_payload_type_region_refresh_info = 134, + sei_payload_type_no_display = 135, + 
sei_payload_type_motion_constrained_tile_sets = 136 +}; + + +enum sei_decoded_picture_hash_type { + sei_decoded_picture_hash_type_MD5 = 0, + sei_decoded_picture_hash_type_CRC = 1, + sei_decoded_picture_hash_type_checksum = 2 +}; + + +typedef struct { + enum sei_decoded_picture_hash_type hash_type; + uint8_t md5[3][16]; + uint16_t crc[3]; + uint32_t checksum[3]; +} sei_decoded_picture_hash; + + +typedef struct { + enum sei_payload_type payload_type; + int payload_size; + + union { + sei_decoded_picture_hash decoded_picture_hash; + } data; +} sei_message; + +class seq_parameter_set; + +const char* sei_type_name(enum sei_payload_type type); + +de265_error read_sei(bitreader* reader, sei_message*, bool suffix, const seq_parameter_set* sps); +void dump_sei(const sei_message*, const seq_parameter_set* sps); +de265_error process_sei(const sei_message*, struct de265_image* img); + +#endif diff --git a/libde265/slice.h b/libde265/slice.h new file mode 100644 index 0000000..0f476f2 --- /dev/null +++ b/libde265/slice.h @@ -0,0 +1,313 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * Authors: struktur AG, Dirk Farin + * Min Chen + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#ifndef DE265_SLICE_H +#define DE265_SLICE_H + +#include "libde265/cabac.h" +#include "libde265/de265.h" +#include "libde265/util.h" +#include "libde265/refpic.h" +#include "libde265/threads.h" +#include "contextmodel.h" + +#include +#include +#include + +#define MAX_NUM_REF_PICS 16 + +class decoder_context; +class thread_context; +class error_queue; +class seq_parameter_set; +class pic_parameter_set; + +enum SliceType + { + SLICE_TYPE_B = 0, + SLICE_TYPE_P = 1, + SLICE_TYPE_I = 2 + }; + +/* + 2Nx2N 2NxN Nx2N NxN + +-------+ +-------+ +---+---+ +---+---+ + | | | | | | | | | | + | | |_______| | | | |___|___| + | | | | | | | | | | + | | | | | | | | | | + +-------+ +-------+ +---+---+ +---+---+ + + 2NxnU 2NxnD nLx2N nRx2N + +-------+ +-------+ +-+-----+ +-----+-+ + |_______| | | | | | | | | + | | | | | | | | | | + | | |_______| | | | | | | + | | | | | | | | | | + +-------+ +-------+ +-+-----+ +-----+-+ + + - AMP only if CU size > min CU size -> minimum PU size = CUsize/2 + - NxN only if size >= 16x16 (-> minimum block size = 8x8) + - minimum block size for Bi-Pred is 8x8 (wikipedia: Coding_tree_unit) +*/ +enum PartMode + { + PART_2Nx2N = 0, + PART_2NxN = 1, + PART_Nx2N = 2, + PART_NxN = 3, + PART_2NxnU = 4, + PART_2NxnD = 5, + PART_nLx2N = 6, + PART_nRx2N = 7 + }; + +const char* part_mode_name(enum PartMode); + + +enum PredMode + { + MODE_INTRA, MODE_INTER, MODE_SKIP + }; + +enum IntraPredMode + { + INTRA_PLANAR = 0, + INTRA_DC = 1, + INTRA_ANGULAR_2 = 2, INTRA_ANGULAR_3 = 3, INTRA_ANGULAR_4 = 4, INTRA_ANGULAR_5 = 5, + INTRA_ANGULAR_6 = 6, INTRA_ANGULAR_7 = 7, INTRA_ANGULAR_8 = 8, INTRA_ANGULAR_9 = 9, + INTRA_ANGULAR_10 = 10, INTRA_ANGULAR_11 = 11, INTRA_ANGULAR_12 = 12, INTRA_ANGULAR_13 = 13, + INTRA_ANGULAR_14 = 14, INTRA_ANGULAR_15 = 15, INTRA_ANGULAR_16 = 16, INTRA_ANGULAR_17 = 17, + INTRA_ANGULAR_18 = 18, INTRA_ANGULAR_19 = 19, INTRA_ANGULAR_20 = 20, INTRA_ANGULAR_21 = 21, + INTRA_ANGULAR_22 = 22, INTRA_ANGULAR_23 = 23, INTRA_ANGULAR_24 = 24, 
INTRA_ANGULAR_25 = 25, + INTRA_ANGULAR_26 = 26, INTRA_ANGULAR_27 = 27, INTRA_ANGULAR_28 = 28, INTRA_ANGULAR_29 = 29, + INTRA_ANGULAR_30 = 30, INTRA_ANGULAR_31 = 31, INTRA_ANGULAR_32 = 32, INTRA_ANGULAR_33 = 33, + INTRA_ANGULAR_34 = 34 + }; + + +enum IntraChromaPredMode + { + INTRA_CHROMA_PLANAR_OR_34 = 0, + INTRA_CHROMA_ANGULAR_26_OR_34 = 1, + INTRA_CHROMA_ANGULAR_10_OR_34 = 2, + INTRA_CHROMA_DC_OR_34 = 3, + INTRA_CHROMA_LIKE_LUMA = 4 + }; + + +enum InterPredIdc + { + // note: values have to match the decoding function decode_inter_pred_idc() + PRED_L0=1, + PRED_L1=2, + PRED_BI=3 + }; + + + +class slice_segment_header { +public: + slice_segment_header() { + reset(); + } + + de265_error read(bitreader* br, decoder_context*, bool* continueDecoding); + de265_error write(error_queue*, CABAC_encoder&, + const seq_parameter_set* sps, + const pic_parameter_set* pps, + uint8_t nal_unit_type); + + void dump_slice_segment_header(const decoder_context*, int fd) const; + + void set_defaults(); + void reset(); + + + int slice_index; // index through all slices in a picture (internal only) + std::shared_ptr pps; + + + char first_slice_segment_in_pic_flag; + char no_output_of_prior_pics_flag; + int slice_pic_parameter_set_id; + char dependent_slice_segment_flag; + int slice_segment_address; + + int slice_type; + char pic_output_flag; + char colour_plane_id; + int slice_pic_order_cnt_lsb; + char short_term_ref_pic_set_sps_flag; + ref_pic_set slice_ref_pic_set; + + int short_term_ref_pic_set_idx; + int num_long_term_sps; + int num_long_term_pics; + + uint8_t lt_idx_sps[MAX_NUM_REF_PICS]; + int poc_lsb_lt[MAX_NUM_REF_PICS]; + char used_by_curr_pic_lt_flag[MAX_NUM_REF_PICS]; + + char delta_poc_msb_present_flag[MAX_NUM_REF_PICS]; + int delta_poc_msb_cycle_lt[MAX_NUM_REF_PICS]; + + char slice_temporal_mvp_enabled_flag; + char slice_sao_luma_flag; + char slice_sao_chroma_flag; + + char num_ref_idx_active_override_flag; + int num_ref_idx_l0_active; // [1;16] + int num_ref_idx_l1_active; 
// [1;16] + + char ref_pic_list_modification_flag_l0; + char ref_pic_list_modification_flag_l1; + uint8_t list_entry_l0[16]; + uint8_t list_entry_l1[16]; + + char mvd_l1_zero_flag; + char cabac_init_flag; + char collocated_from_l0_flag; + int collocated_ref_idx; + + // --- pred_weight_table --- + + uint8_t luma_log2_weight_denom; // [0;7] + uint8_t ChromaLog2WeightDenom; // [0;7] + + // first index is L0/L1 + uint8_t luma_weight_flag[2][16]; // bool + uint8_t chroma_weight_flag[2][16]; // bool + int16_t LumaWeight[2][16]; + int8_t luma_offset[2][16]; + int16_t ChromaWeight[2][16][2]; + int8_t ChromaOffset[2][16][2]; + + + int five_minus_max_num_merge_cand; + int slice_qp_delta; + + int slice_cb_qp_offset; + int slice_cr_qp_offset; + + char cu_chroma_qp_offset_enabled_flag; + + char deblocking_filter_override_flag; + char slice_deblocking_filter_disabled_flag; + int slice_beta_offset; // = pps->beta_offset if undefined + int slice_tc_offset; // = pps->tc_offset if undefined + + char slice_loop_filter_across_slices_enabled_flag; + + int num_entry_point_offsets; + int offset_len; + std::vector entry_point_offset; + + int slice_segment_header_extension_length; + + + // --- derived data --- + + int SliceQPY; + int initType; + + void compute_derived_values(const pic_parameter_set* pps); + + + // --- data for external modules --- + + int SliceAddrRS; // slice_segment_address of last independent slice + + int MaxNumMergeCand; // directly derived from 'five_minus_max_num_merge_cand' + int CurrRpsIdx; + ref_pic_set CurrRps; // the active reference-picture set + int NumPocTotalCurr; + + // number of entries: num_ref_idx_l0_active / num_ref_idx_l1_active + int RefPicList[2][MAX_NUM_REF_PICS]; // contains buffer IDs (D:indices into DPB/E:frame number) + int RefPicList_POC[2][MAX_NUM_REF_PICS]; + int RefPicList_PicState[2][MAX_NUM_REF_PICS]; /* We have to save the PicState because the decoding + of an image may be delayed and the PicState can + change in the mean-time (e.g. 
from ShortTerm to + LongTerm). PicState is used in motion.cc */ + + char LongTermRefPic[2][MAX_NUM_REF_PICS]; /* Flag whether the picture at this ref-pic-list + is a long-term picture. */ + + // context storage for dependent slices (stores CABAC model at end of slice segment) + context_model_table ctx_model_storage; + bool ctx_model_storage_defined; // whether there is valid data in ctx_model_storage + + std::vector RemoveReferencesList; // images that can be removed from the DPB before decoding this slice + +}; + + + +typedef struct { + // TODO: we could combine SaoTypeIdx and SaoEoClass into one byte to make the struct 16 bytes only + + unsigned char SaoTypeIdx; // use with (SaoTypeIdx>>(2*cIdx)) & 0x3 + unsigned char SaoEoClass; // use with (SaoEoClass>>(2*cIdx)) & 0x3 + + uint8_t sao_band_position[3]; + int8_t saoOffsetVal[3][4]; // index with [][idx-1] as saoOffsetVal[][0]==0 always +} sao_info; + + + + +de265_error read_slice_segment_data(thread_context* tctx); + +bool alloc_and_init_significant_coeff_ctxIdx_lookupTable(); +void free_significant_coeff_ctxIdx_lookupTable(); + + +class thread_task_ctb_row : public thread_task +{ +public: + bool firstSliceSubstream; + int debug_startCtbRow; + thread_context* tctx; + + virtual void work(); + virtual std::string name() const; +}; + +class thread_task_slice_segment : public thread_task +{ +public: + bool firstSliceSubstream; + int debug_startCtbX, debug_startCtbY; + thread_context* tctx; + + virtual void work(); + virtual std::string name() const; +}; + + +int check_CTB_available(const de265_image* img, + int xC,int yC, int xN,int yN); + +#endif diff --git a/libde265/sps.h b/libde265/sps.h new file mode 100644 index 0000000..b06151d --- /dev/null +++ b/libde265/sps.h @@ -0,0 +1,257 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265.
+ * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#ifndef DE265_SPS_H +#define DE265_SPS_H + +#include "libde265/vps.h" +#include "libde265/vui.h" +#include "libde265/bitstream.h" +#include "libde265/refpic.h" +#include "libde265/de265.h" +#include "libde265/cabac.h" + +#include + +class error_queue; + +// #define MAX_REF_PIC_SETS 64 // maximum according to standard +#define MAX_NUM_LT_REF_PICS_SPS 32 + +// this is just a safety range +#define MAX_PICTURE_WIDTH 70000 +#define MAX_PICTURE_HEIGHT 70000 + +enum { + CHROMA_MONO = 0, + CHROMA_420 = 1, + CHROMA_422 = 2, + CHROMA_444 = 3, + CHROMA_444_SEPARATE +}; + + +typedef struct scaling_list_data { + // structure size: approx. 
4 kB + + uint8_t ScalingFactor_Size0[6][4][4]; + uint8_t ScalingFactor_Size1[6][8][8]; + uint8_t ScalingFactor_Size2[6][16][16]; + uint8_t ScalingFactor_Size3[2][32][32]; +} scaling_list_data; + + +enum PresetSet { + Preset_Default +}; + + +class sps_range_extension +{ + public: + sps_range_extension(); + + uint8_t transform_skip_rotation_enabled_flag; + uint8_t transform_skip_context_enabled_flag; + uint8_t implicit_rdpcm_enabled_flag; + uint8_t explicit_rdpcm_enabled_flag; + uint8_t extended_precision_processing_flag; + uint8_t intra_smoothing_disabled_flag; + uint8_t high_precision_offsets_enabled_flag; + uint8_t persistent_rice_adaptation_enabled_flag; + uint8_t cabac_bypass_alignment_enabled_flag; + + de265_error read(error_queue*, bitreader*); + void dump(int fd) const; +}; + + +class seq_parameter_set { +public: + seq_parameter_set(); + ~seq_parameter_set(); + + de265_error read(error_queue*, bitreader*); + de265_error write(error_queue*, CABAC_encoder&); + + void dump(int fd) const; + + void set_defaults(enum PresetSet = Preset_Default); + void set_CB_log2size_range(int mini,int maxi); + void set_TB_log2size_range(int mini,int maxi); + void set_resolution(int w,int h); + + bool sps_read; // whether the sps has been read from the bitstream + + + char video_parameter_set_id; + char sps_max_sub_layers; // [1;7] + char sps_temporal_id_nesting_flag; + + profile_tier_level profile_tier_level_; + + int seq_parameter_set_id; + int chroma_format_idc; + + char separate_colour_plane_flag; + int pic_width_in_luma_samples; + int pic_height_in_luma_samples; + char conformance_window_flag; + + int conf_win_left_offset; + int conf_win_right_offset; + int conf_win_top_offset; + int conf_win_bottom_offset; + + int bit_depth_luma; + int bit_depth_chroma; + + int log2_max_pic_order_cnt_lsb; + char sps_sub_layer_ordering_info_present_flag; + + int sps_max_dec_pic_buffering[7]; // for each temporal layer + int sps_max_num_reorder_pics[7]; + int sps_max_latency_increase_plus1[7]; 
+ + int log2_min_luma_coding_block_size; // smallest CB size [3;6] + int log2_diff_max_min_luma_coding_block_size; // largest CB size + int log2_min_transform_block_size; // smallest TB size [2;5] + int log2_diff_max_min_transform_block_size; // largest TB size + int max_transform_hierarchy_depth_inter; + int max_transform_hierarchy_depth_intra; + + char scaling_list_enable_flag; + char sps_scaling_list_data_present_flag; /* if not set, the default scaling lists will be set + in scaling_list */ + + struct scaling_list_data scaling_list; + + char amp_enabled_flag; + char sample_adaptive_offset_enabled_flag; + char pcm_enabled_flag; + + char pcm_sample_bit_depth_luma; + char pcm_sample_bit_depth_chroma; + int log2_min_pcm_luma_coding_block_size; + int log2_diff_max_min_pcm_luma_coding_block_size; + char pcm_loop_filter_disable_flag; + + int num_short_term_ref_pic_sets() const { return ref_pic_sets.size(); } + std::vector ref_pic_sets; // [0 ; num_short_term_ref_pic_set (<=MAX_REF_PIC_SETS) ) + + char long_term_ref_pics_present_flag; + + int num_long_term_ref_pics_sps; + + int lt_ref_pic_poc_lsb_sps[MAX_NUM_LT_REF_PICS_SPS]; + char used_by_curr_pic_lt_sps_flag[MAX_NUM_LT_REF_PICS_SPS]; + + char sps_temporal_mvp_enabled_flag; + char strong_intra_smoothing_enable_flag; + + char vui_parameters_present_flag; + video_usability_information vui; + + char sps_extension_present_flag; + char sps_range_extension_flag; + char sps_multilayer_extension_flag; + char sps_extension_6bits; + + sps_range_extension range_extension; + + /* + if( sps_extension_flag ) + while( more_rbsp_data() ) + sps_extension_data_flag + u(1) + rbsp_trailing_bits() + */ + + + // --- derived values --- + + de265_error compute_derived_values(bool sanitize_values = false); + + int BitDepth_Y; + int QpBdOffset_Y; + int BitDepth_C; + int QpBdOffset_C; + + int ChromaArrayType; + int SubWidthC, SubHeightC; + int WinUnitX, WinUnitY; + + int MaxPicOrderCntLsb; + + int Log2MinCbSizeY; + int Log2CtbSizeY; + int 
MinCbSizeY; + int CtbSizeY; + int PicWidthInMinCbsY; + int PicWidthInCtbsY; + int PicHeightInMinCbsY; + int PicHeightInCtbsY; + int PicSizeInMinCbsY; + int PicSizeInCtbsY; + int PicSizeInSamplesY; + + int CtbWidthC, CtbHeightC; + + int PicWidthInTbsY; // not in standard + int PicHeightInTbsY; // not in standard + int PicSizeInTbsY; // not in standard + + int Log2MinTrafoSize; + int Log2MaxTrafoSize; + + int Log2MinPUSize; + int PicWidthInMinPUs; // might be rounded up + int PicHeightInMinPUs; // might be rounded up + + int Log2MinIpcmCbSizeY; + int Log2MaxIpcmCbSizeY; + + int SpsMaxLatencyPictures[7]; // [temporal layer] + + uint8_t WpOffsetBdShiftY; + uint8_t WpOffsetBdShiftC; + int32_t WpOffsetHalfRangeY; + int32_t WpOffsetHalfRangeC; + + + int getPUIndexRS(int pixelX,int pixelY) const { + return (pixelX>>Log2MinPUSize) + (pixelY>>Log2MinPUSize)*PicWidthInMinPUs; + } + + int get_bit_depth(int cIdx) const { + if (cIdx==0) return BitDepth_Y; + else return BitDepth_C; + } + + int get_chroma_shift_W(int cIdx) const { return cIdx ? SubWidthC -1 : 0; } + int get_chroma_shift_H(int cIdx) const { return cIdx ? SubHeightC-1 : 0; } +}; + +de265_error read_scaling_list(bitreader*, const seq_parameter_set*, scaling_list_data*, bool inPPS); +de265_error write_scaling_list(CABAC_encoder& out, const seq_parameter_set* sps, + scaling_list_data* sclist, bool inPPS); +void set_default_scaling_lists(scaling_list_data*); + +#endif diff --git a/libde265/threads.h b/libde265/threads.h new file mode 100644 index 0000000..2c743bc --- /dev/null +++ b/libde265/threads.h @@ -0,0 +1,148 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. 
+ * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#ifndef DE265_THREADS_H +#define DE265_THREADS_H + +#include "libde265/de265.h" + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#ifdef HAVE_STDBOOL_H +#include +#endif + +#include +#include +#include + +#ifndef _WIN32 +#include + +typedef pthread_t de265_thread; +typedef pthread_mutex_t de265_mutex; +typedef pthread_cond_t de265_cond; + +#else // _WIN32 +#if !defined(NOMINMAX) +#define NOMINMAX 1 +#endif +#include +#include "../extra/win32cond.h" +#if _MSC_VER > 1310 +#include +#endif + +typedef HANDLE de265_thread; +typedef HANDLE de265_mutex; +typedef win32_cond_t de265_cond; +#endif // _WIN32 + +#ifndef _WIN32 +int de265_thread_create(de265_thread* t, void *(*start_routine) (void *), void *arg); +#else +int de265_thread_create(de265_thread* t, LPTHREAD_START_ROUTINE start_routine, void *arg); +#endif +void de265_thread_join(de265_thread t); +void de265_thread_destroy(de265_thread* t); +void de265_mutex_init(de265_mutex* m); +void de265_mutex_destroy(de265_mutex* m); +void de265_mutex_lock(de265_mutex* m); +void de265_mutex_unlock(de265_mutex* m); +void de265_cond_init(de265_cond* c); +void de265_cond_destroy(de265_cond* c); +void de265_cond_broadcast(de265_cond* c, de265_mutex* m); +void de265_cond_wait(de265_cond* c,de265_mutex* m); +void de265_cond_signal(de265_cond* c); + + +class de265_progress_lock +{ +public: + de265_progress_lock(); + ~de265_progress_lock(); + + void wait_for_progress(int progress); + void set_progress(int progress); + void increase_progress(int progress); + int get_progress() const; + void reset(int value=0) { mProgress=value; } + +private: + int 
mProgress; + + // private data + + de265_mutex mutex; + de265_cond cond; +}; + + + +class thread_task +{ +public: + thread_task() : state(Queued) { } + virtual ~thread_task() { } + + enum { Queued, Running, Blocked, Finished } state; + + virtual void work() = 0; + + virtual std::string name() const { return "noname"; } +}; + + +#define MAX_THREADS 32 + +/* TODO NOTE: When unblocking a task, we have to check first + if there are threads waiting because of the run-count limit. + If there are higher-priority tasks, those should be run instead + of the just unblocked task. + */ + +class thread_pool +{ + public: + bool stopped; + + std::deque tasks; // we are not the owner + + de265_thread thread[MAX_THREADS]; + int num_threads; + + int num_threads_working; + + int ctbx[MAX_THREADS]; // the CTB the thread is working on + int ctby[MAX_THREADS]; + + de265_mutex mutex; + de265_cond cond_var; +}; + + +de265_error start_thread_pool(thread_pool* pool, int num_threads); +void stop_thread_pool(thread_pool* pool); // do not process remaining tasks + +void add_task(thread_pool* pool, thread_task* task); // TODO: can make thread_task const + +#endif diff --git a/libde265/transform.h b/libde265/transform.h new file mode 100644 index 0000000..6f19049 --- /dev/null +++ b/libde265/transform.h @@ -0,0 +1,65 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#ifndef DE265_TRANSFORM_H +#define DE265_TRANSFORM_H + +#include "libde265/de265.h" +#include "libde265/decctx.h" + +extern const int tab8_22[]; /* read by table8_22() at indices 0..12, i.e. for qPi in [30;42] */ + +LIBDE265_INLINE static int table8_22(int qPi) +{ + if (qPi<30) return qPi; + if (qPi>=43) return qPi-6; + return tab8_22[qPi-30]; +} + +// (8.6.1) +void decode_quantization_parameters(thread_context* tctx, int xC,int yC, + int xCUBase, int yCUBase); + +// (8.6.2) +void scale_coefficients(thread_context* tctx, + int xT,int yT, // position of TU in frame (chroma adapted) + int x0,int y0, // position of CU in frame (chroma adapted) + int nT, int cIdx, + bool transform_skip_flag, bool intra, int rdpcmMode); + + +void inv_transform(acceleration_functions* acceleration, + uint8_t* dst, int dstStride, int16_t* coeff, + int log2TbSize, int trType); + +void fwd_transform(acceleration_functions* acceleration, + int16_t* coeff, int coeffStride, int log2TbSize, int trType, + const int16_t* src, int srcStride); + +void quant_coefficients(int16_t* out_coeff, + const int16_t* in_coeff, + int log2TrSize, int qp, + bool intra); + +void dequant_coefficients(int16_t* out_coeff, + const int16_t* in_coeff, + int log2TrSize, int qP); + +#endif diff --git a/libde265/util.h b/libde265/util.h new file mode 100644 index 0000000..84d4d36 --- /dev/null +++ b/libde265/util.h @@ -0,0 +1,229 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version.
+ * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#ifndef DE265_UTIL_H +#define DE265_UTIL_H + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#ifndef _MSC_VER +#include +#endif + +#include +#include + +#include "libde265/de265.h" + +#ifdef __GNUC__ +#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) +#endif + +#ifdef _MSC_VER +#define LIBDE265_DECLARE_ALIGNED( var, n ) __declspec(align(n)) var +#define likely(x) (x) +#define unlikely(x) (x) +#else +#define LIBDE265_DECLARE_ALIGNED( var, n ) var __attribute__((aligned(n))) +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) +#endif + +#if defined(__GNUC__) && (__GNUC__ >= 4) +#define LIBDE265_CHECK_RESULT __attribute__ ((warn_unused_result)) +#elif defined(_MSC_VER) && (_MSC_VER >= 1700) +#define LIBDE265_CHECK_RESULT _Check_return_ +#else +#define LIBDE265_CHECK_RESULT +#endif + +// Be careful with these alignment instructions. They only specify the alignment within +// a struct. But they cannot make sure that the base address of the struct has the same alignment +// when it is dynamically allocated. 
+#define ALIGNED_32( var ) LIBDE265_DECLARE_ALIGNED( var, 32 ) +#define ALIGNED_16( var ) LIBDE265_DECLARE_ALIGNED( var, 16 ) +#define ALIGNED_8( var ) LIBDE265_DECLARE_ALIGNED( var, 8 ) +#define ALIGNED_4( var ) LIBDE265_DECLARE_ALIGNED( var, 4 ) + +// C++11 specific features +#if defined(_MSC_VER) || (!__clang__ && __GNUC__ && GCC_VERSION < 40600) +#define FOR_LOOP(type, var, list) for each (type var in list) +#undef FOR_LOOP_AUTO_SUPPORT +#else +#define FOR_LOOP(type, var, list) for (type var : list) +#define FOR_LOOP_AUTO_SUPPORT 1 +#endif + +#ifdef USE_STD_TR1_NAMESPACE +#include +namespace std { using namespace std::tr1; } +#endif + +#ifdef NEED_STD_MOVE_FALLBACK +// Provide fallback variant of "std::move" for older compilers with +// incomplete/broken C++11 support. +namespace std { + +template +inline typename std::remove_reference<_Tp>::type&& move(_Tp&& __t) { + return static_cast::type&&>(__t); +} + +} // namespace std +#endif + +#ifdef NEED_NULLPTR_FALLBACK +// Compilers with partial/incomplete support for C++11 don't know about +// "nullptr". A simple alias should be fine for our use case. +#define nullptr NULL +#endif + +#ifdef _MSC_VER + #ifdef _CPPRTTI + #define RTTI_ENABLED + #endif +#else + #ifdef __GXX_RTTI + #define RTTI_ENABLED + #endif +#endif + +//inline uint8_t Clip1_8bit(int16_t value) { if (value<=0) return 0; else if (value>=255) return 255; else return value; } +#define Clip1_8bit(value) ((value)<0 ? 0 : (value)>255 ? 255 : (value)) +#define Clip_BitDepth(value, bit_depth) ((value)<0 ? 0 : (value)>((1<(high) ? (high) : (value)) +#define Sign(value) (((value)<0) ? -1 : ((value)>0) ? 1 : 0) +#define abs_value(a) (((a)<0) ? -(a) : (a)) +#define libde265_min(a,b) (((a)<(b)) ? (a) : (b)) +#define libde265_max(a,b) (((a)>(b)) ? 
(a) : (b)) + +LIBDE265_INLINE static int ceil_div(int num,int denom) +{ + num += denom-1; + return num/denom; +} + +LIBDE265_INLINE static int ceil_log2(int val) +{ + int n=0; + while (val > (1<1) { + n++; + v>>=1; + } + + return n; +} + +LIBDE265_INLINE static int Log2SizeToArea(int v) +{ + return (1<<(v<<1)); +} + +void copy_subimage(uint8_t* dst,int dststride, + const uint8_t* src,int srcstride, + int w, int h); + + +// === logging === + +enum LogModule { + LogHighlevel, + LogHeaders, + LogSlice, + LogDPB, + LogMotion, + LogTransform, + LogDeblock, + LogSAO, + LogSEI, + LogIntraPred, + LogPixels, + LogSymbols, + LogCABAC, + LogEncoder, + LogEncoderMetadata, + NUMBER_OF_LogModules +}; + + +#if defined(DE265_LOG_ERROR) || defined(DE265_LOG_INFO) || defined(DE265_LOG_DEBUG) || defined(DE265_LOG_TRACE) +# define DE265_LOGGING 1 +void enable_logging(enum LogModule); +void disable_logging(enum LogModule); +#else +#define enable_logging(x) { } +#define disable_logging(x) { } +#endif + +#ifdef DE265_LOGGING +void log_set_current_POC(int poc); +#else +#define log_set_current_POC(poc) { } +#endif + +#ifdef DE265_LOG_ERROR +void logerror(enum LogModule module, const char* string, ...); +#else +#define logerror(a,b, ...) { } +#endif + +#ifdef DE265_LOG_INFO +void loginfo (enum LogModule module, const char* string, ...); +#else +#define loginfo(a,b, ...) { } +#endif + +#ifdef DE265_LOG_DEBUG +void logdebug(enum LogModule module, const char* string, ...); +bool logdebug_enabled(enum LogModule module); +#else +#define logdebug(a,b, ...) { } +inline bool logdebug_enabled(enum LogModule module) { return false; } +#endif + +#ifdef DE265_LOG_TRACE +void logtrace(enum LogModule module, const char* string, ...); +#else +#define logtrace(a,b, ...) 
{ } +#endif + +void log2fh(FILE* fh, const char* string, ...); + + +void printBlk(const char* title,const int32_t* data, int blksize, int stride, const std::string& prefix=" "); +void printBlk(const char* title,const int16_t* data, int blksize, int stride, const std::string& prefix=" "); +void printBlk(const char* title,const uint8_t* data, int blksize, int stride, const std::string& prefix=" "); + +void debug_set_image_output(void (*)(const struct de265_image*, int slot)); +void debug_show_image(const struct de265_image*, int slot); + +#endif diff --git a/libde265/visualize.h b/libde265/visualize.h new file mode 100644 index 0000000..2cc0a5c --- /dev/null +++ b/libde265/visualize.h @@ -0,0 +1,50 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#ifndef DE265_VISUALIZE_H +#define DE265_VISUALIZE_H + +#include "libde265/de265.h" +#include "libde265/image.h" + + +void write_picture_to_file(const de265_image* img, const char* filename); + +#ifdef __cplusplus +extern "C" { +#endif + +// TODO: these should either move to "sherlock265", or be part of the +// "official" public API +LIBDE265_API void draw_CB_grid(const de265_image* img, uint8_t* dst, int stride, uint32_t value, int pixelSize); +LIBDE265_API void draw_TB_grid(const de265_image* img, uint8_t* dst, int stride, uint32_t value, int pixelSize); +LIBDE265_API void draw_PB_grid(const de265_image* img, uint8_t* dst, int stride, uint32_t value, int pixelSize); +LIBDE265_API void draw_PB_pred_modes(const de265_image* img, uint8_t* dst, int stride, int pixelSize); +LIBDE265_API void draw_intra_pred_modes(const de265_image* img, uint8_t* dst, int stride, uint32_t value, int pixelSize); +LIBDE265_API void draw_QuantPY(const de265_image* img, uint8_t* dst, int stride, int pixelSize); +LIBDE265_API void draw_Motion(const de265_image* img, uint8_t* dst, int stride, int pixelSize); +LIBDE265_API void draw_Slices(const de265_image* img, uint8_t* dst, int stride, int pixelSize); +LIBDE265_API void draw_Tiles(const de265_image* img, uint8_t* dst, int stride, int pixelSize); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/libde265/vps.h b/libde265/vps.h new file mode 100644 index 0000000..04c9c15 --- /dev/null +++ b/libde265/vps.h @@ -0,0 +1,173 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. 
+ * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#ifndef DE265_VPS_H +#define DE265_VPS_H + +#ifdef HAVE_CONFIG_H +#include +#endif + +#ifdef HAVE_STDBOOL_H +#include +#endif + +#include "libde265/bitstream.h" +#include "libde265/de265.h" +#include "libde265/cabac.h" + +#include + +class error_queue; + +#define MAX_TEMPORAL_SUBLAYERS 8 + + +enum profile_idc { + Profile_Main = 1, + Profile_Main10 = 2, + Profile_MainStillPicture = 3, + Profile_FormatRangeExtensions = 4 +}; + + +class profile_data { +public: + void read(bitreader* reader); + void write(CABAC_encoder& writer) const; + void dump(bool general, FILE* fh) const; + + void set_defaults(enum profile_idc, int level_major, int level_minor); + + // --- profile --- + + char profile_present_flag; // always true for general profile + + char profile_space; // currently always 0 + char tier_flag; // main tier or high tier (see Table A-66/A-67) + enum profile_idc profile_idc; // profile + + char profile_compatibility_flag[32]; // to which profile we are compatible + + char progressive_source_flag; + char interlaced_source_flag; + char non_packed_constraint_flag; + char frame_only_constraint_flag; + + + // --- level --- + + char level_present_flag; // always true for general level + int level_idc; // level * 30 +}; + + +class profile_tier_level +{ +public: + void read(bitreader* reader, int max_sub_layers); + void write(CABAC_encoder& writer, int max_sub_layers) const; + void dump(int max_sub_layers, FILE* fh) const; + + profile_data general; + + //bool sub_layer_profile_present[MAX_TEMPORAL_SUBLAYERS]; + //bool sub_layer_level_present[MAX_TEMPORAL_SUBLAYERS]; + + profile_data
sub_layer[MAX_TEMPORAL_SUBLAYERS]; +}; + + +/* +struct bit_rate_pic_rate_info { + char bit_rate_info_present_flag[8]; + char pic_rate_info_present_flag[8]; + + int avg_bit_rate[8]; + int max_bit_rate[8]; + + char constant_pic_rate_idc[8]; + int avg_pic_rate[8]; + +}; + +void read_bit_rate_pic_rate_info(bitreader* reader, + struct bit_rate_pic_rate_info* hdr, + int TempLevelLow, + int TempLevelHigh); + +void dump_bit_rate_pic_rate_info(struct bit_rate_pic_rate_info* hdr, + int TempLevelLow, + int TempLevelHigh); +*/ + + +typedef struct { + int vps_max_dec_pic_buffering; // [1 ; ] + int vps_max_num_reorder_pics; // [0 ; ] + int vps_max_latency_increase; // 0 -> no limit, otherwise value is (x-1) +} layer_data; + + +class video_parameter_set +{ +public: + de265_error read(error_queue* errqueue, bitreader* reader); + de265_error write(error_queue* errqueue, CABAC_encoder& out) const; + void dump(int fd) const; + + void set_defaults(enum profile_idc profile, int level_major, int level_minor); + + int video_parameter_set_id; + int vps_max_layers; // [1;?] 
currently always 1 + int vps_max_sub_layers; // [1;7] number of temporal sub-layers + int vps_temporal_id_nesting_flag; // indicate temporal up-switching always possible + profile_tier_level profile_tier_level_; + + int vps_sub_layer_ordering_info_present_flag; + layer_data layer[MAX_TEMPORAL_SUBLAYERS]; + + uint8_t vps_max_layer_id; // max value for nuh_layer_id in NALs + int vps_num_layer_sets; // [1;1024], currently always 1 + + std::vector > layer_id_included_flag; // max size = [1024][64] + + + // --- timing info --- + + char vps_timing_info_present_flag; + uint32_t vps_num_units_in_tick; + uint32_t vps_time_scale; + char vps_poc_proportional_to_timing_flag; + uint32_t vps_num_ticks_poc_diff_one; + + int vps_num_hrd_parameters; // currently [0;1] + + std::vector hrd_layer_set_idx; // max size = 1024 + std::vector cprms_present_flag; // max size = 1024 + + + // --- vps extension --- + + char vps_extension_flag; +}; + + +#endif diff --git a/libde265/vui.h b/libde265/vui.h new file mode 100644 index 0000000..c412669 --- /dev/null +++ b/libde265/vui.h @@ -0,0 +1,126 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#ifndef DE265_VUI_H +#define DE265_VUI_H + +#include "libde265/de265.h" +#include "libde265/bitstream.h" + +#include + +class error_queue; +class seq_parameter_set; + + +enum VideoFormat { + VideoFormat_Component = 0, + VideoFormat_PAL = 1, + VideoFormat_NTSC = 2, + VideoFormat_SECAM = 3, + VideoFormat_MAC = 4, + VideoFormat_Unspecified = 5 +}; + +const char* get_video_format_name(enum VideoFormat); + + +class video_usability_information +{ + public: + video_usability_information(); + + de265_error read(error_queue*, bitreader*, const seq_parameter_set*); + void dump(int fd) const; + + + // --- sample aspect ratio (SAR) --- + + bool aspect_ratio_info_present_flag; + uint16_t sar_width; // sar_width and sar_height are zero if unspecified + uint16_t sar_height; + + + // --- overscan --- + + bool overscan_info_present_flag; + bool overscan_appropriate_flag; + + + // --- video signal type --- + + bool video_signal_type_present_flag; + enum VideoFormat video_format; + bool video_full_range_flag; + bool colour_description_present_flag; + uint8_t colour_primaries; + uint8_t transfer_characteristics; + uint8_t matrix_coeffs; + + // --- chroma / interlaced --- + + bool chroma_loc_info_present_flag; + uint8_t chroma_sample_loc_type_top_field; + uint8_t chroma_sample_loc_type_bottom_field; + + bool neutral_chroma_indication_flag; + bool field_seq_flag; + bool frame_field_info_present_flag; + + // --- default display window --- + + bool default_display_window_flag; + uint32_t def_disp_win_left_offset; + uint32_t def_disp_win_right_offset; + uint32_t def_disp_win_top_offset; + uint32_t def_disp_win_bottom_offset; + + + // --- timing --- + + bool vui_timing_info_present_flag; + uint32_t vui_num_units_in_tick; + uint32_t vui_time_scale; + + bool vui_poc_proportional_to_timing_flag; + uint32_t vui_num_ticks_poc_diff_one; + + + // --- hrd parameters --- + + bool vui_hrd_parameters_present_flag; + //hrd_parameters vui_hrd_parameters; + + + // --- bitstream restriction --- + 
+ bool bitstream_restriction_flag; + bool tiles_fixed_structure_flag; + bool motion_vectors_over_pic_boundaries_flag; + bool restricted_ref_pic_lists_flag; + uint16_t min_spatial_segmentation_idc; + uint8_t max_bytes_per_pic_denom; + uint8_t max_bits_per_min_cu_denom; + uint8_t log2_max_mv_length_horizontal; + uint8_t log2_max_mv_length_vertical; +}; + + +#endif diff --git a/md5.cc b/md5.cc new file mode 100644 index 0000000..2f01c93 --- /dev/null +++ b/md5.cc @@ -0,0 +1,295 @@ +/* + * This is an OpenSSL-compatible implementation of the RSA Data Security, Inc. + * MD5 Message-Digest Algorithm (RFC 1321). + * + * Homepage: + * http://openwall.info/wiki/people/solar/software/public-domain-source-code/md5 + * + * Author: + * Alexander Peslyak, better known as Solar Designer + * + * This software was written by Alexander Peslyak in 2001. No copyright is + * claimed, and the software is hereby placed in the public domain. + * In case this attempt to disclaim copyright and place the software in the + * public domain is deemed null and void, then the software is + * Copyright (c) 2001 Alexander Peslyak and it is hereby released to the + * general public under the following terms: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted. + * + * There's ABSOLUTELY NO WARRANTY, express or implied. + * + * (This is a heavily cut-down "BSD license".) + * + * This differs from Colin Plumb's older public domain implementation in that + * no exactly 32-bit integer data type is required (any 32-bit or wider + * unsigned integer data type will do), there's no compile-time endianness + * configuration, and the function prototypes match OpenSSL's. No code from + * Colin Plumb's implementation has been reused; this comment merely compares + * the properties of the two independent implementations. + * + * The primary goals of this implementation are portability and ease of use. + * It is meant to be fast, but not as fast as possible. 
Some known + * optimizations are not included to reduce source code size and avoid + * compile-time configuration. + */ + +#ifndef HAVE_OPENSSL + +#include + +#include "md5.h" + +/* + * The basic MD5 functions. + * + * F and G are optimized compared to their RFC 1321 definitions for + * architectures that lack an AND-NOT instruction, just like in Colin Plumb's + * implementation. + */ +#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) +#define G(x, y, z) ((y) ^ ((z) & ((x) ^ (y)))) +#define H(x, y, z) ((x) ^ (y) ^ (z)) +#define I(x, y, z) ((y) ^ ((x) | ~(z))) + +/* + * The MD5 transformation for all four rounds. + */ +#define STEP(f, a, b, c, d, x, t, s) \ + (a) += f((b), (c), (d)) + (x) + (t); \ + (a) = (((a) << (s)) | (((a) & 0xffffffff) >> (32 - (s)))); \ + (a) += (b); + +/* + * SET reads 4 input bytes in little-endian byte order and stores them + * in a properly aligned word in host byte order. + * + * The check for little-endian architectures that tolerate unaligned + * memory accesses is just an optimization. Nothing will break if it + * doesn't work. + */ +#if defined(__i386__) || defined(__x86_64__) || defined(__vax__) +#define SET(n) \ + (*(MD5_u32plus *)&ptr[(n) * 4]) +#define GET(n) \ + SET(n) +#else +#define SET(n) \ + (ctx->block[(n)] = \ + (MD5_u32plus)ptr[(n) * 4] | \ + ((MD5_u32plus)ptr[(n) * 4 + 1] << 8) | \ + ((MD5_u32plus)ptr[(n) * 4 + 2] << 16) | \ + ((MD5_u32plus)ptr[(n) * 4 + 3] << 24)) +#define GET(n) \ + (ctx->block[(n)]) +#endif + +/* + * This processes one or more 64-byte data blocks, but does NOT update + * the bit counters. There are no alignment requirements. 
+ */ +static void *body(MD5_CTX *ctx, void *data, unsigned long size) +{ + unsigned char *ptr; + MD5_u32plus a, b, c, d; + MD5_u32plus saved_a, saved_b, saved_c, saved_d; + + ptr = (unsigned char *)data; + + a = ctx->a; + b = ctx->b; + c = ctx->c; + d = ctx->d; + + do { + saved_a = a; + saved_b = b; + saved_c = c; + saved_d = d; + +/* Round 1 */ + STEP(F, a, b, c, d, SET(0), 0xd76aa478, 7) + STEP(F, d, a, b, c, SET(1), 0xe8c7b756, 12) + STEP(F, c, d, a, b, SET(2), 0x242070db, 17) + STEP(F, b, c, d, a, SET(3), 0xc1bdceee, 22) + STEP(F, a, b, c, d, SET(4), 0xf57c0faf, 7) + STEP(F, d, a, b, c, SET(5), 0x4787c62a, 12) + STEP(F, c, d, a, b, SET(6), 0xa8304613, 17) + STEP(F, b, c, d, a, SET(7), 0xfd469501, 22) + STEP(F, a, b, c, d, SET(8), 0x698098d8, 7) + STEP(F, d, a, b, c, SET(9), 0x8b44f7af, 12) + STEP(F, c, d, a, b, SET(10), 0xffff5bb1, 17) + STEP(F, b, c, d, a, SET(11), 0x895cd7be, 22) + STEP(F, a, b, c, d, SET(12), 0x6b901122, 7) + STEP(F, d, a, b, c, SET(13), 0xfd987193, 12) + STEP(F, c, d, a, b, SET(14), 0xa679438e, 17) + STEP(F, b, c, d, a, SET(15), 0x49b40821, 22) + +/* Round 2 */ + STEP(G, a, b, c, d, GET(1), 0xf61e2562, 5) + STEP(G, d, a, b, c, GET(6), 0xc040b340, 9) + STEP(G, c, d, a, b, GET(11), 0x265e5a51, 14) + STEP(G, b, c, d, a, GET(0), 0xe9b6c7aa, 20) + STEP(G, a, b, c, d, GET(5), 0xd62f105d, 5) + STEP(G, d, a, b, c, GET(10), 0x02441453, 9) + STEP(G, c, d, a, b, GET(15), 0xd8a1e681, 14) + STEP(G, b, c, d, a, GET(4), 0xe7d3fbc8, 20) + STEP(G, a, b, c, d, GET(9), 0x21e1cde6, 5) + STEP(G, d, a, b, c, GET(14), 0xc33707d6, 9) + STEP(G, c, d, a, b, GET(3), 0xf4d50d87, 14) + STEP(G, b, c, d, a, GET(8), 0x455a14ed, 20) + STEP(G, a, b, c, d, GET(13), 0xa9e3e905, 5) + STEP(G, d, a, b, c, GET(2), 0xfcefa3f8, 9) + STEP(G, c, d, a, b, GET(7), 0x676f02d9, 14) + STEP(G, b, c, d, a, GET(12), 0x8d2a4c8a, 20) + +/* Round 3 */ + STEP(H, a, b, c, d, GET(5), 0xfffa3942, 4) + STEP(H, d, a, b, c, GET(8), 0x8771f681, 11) + STEP(H, c, d, a, b, GET(11), 0x6d9d6122, 16) + 
STEP(H, b, c, d, a, GET(14), 0xfde5380c, 23) + STEP(H, a, b, c, d, GET(1), 0xa4beea44, 4) + STEP(H, d, a, b, c, GET(4), 0x4bdecfa9, 11) + STEP(H, c, d, a, b, GET(7), 0xf6bb4b60, 16) + STEP(H, b, c, d, a, GET(10), 0xbebfbc70, 23) + STEP(H, a, b, c, d, GET(13), 0x289b7ec6, 4) + STEP(H, d, a, b, c, GET(0), 0xeaa127fa, 11) + STEP(H, c, d, a, b, GET(3), 0xd4ef3085, 16) + STEP(H, b, c, d, a, GET(6), 0x04881d05, 23) + STEP(H, a, b, c, d, GET(9), 0xd9d4d039, 4) + STEP(H, d, a, b, c, GET(12), 0xe6db99e5, 11) + STEP(H, c, d, a, b, GET(15), 0x1fa27cf8, 16) + STEP(H, b, c, d, a, GET(2), 0xc4ac5665, 23) + +/* Round 4 */ + STEP(I, a, b, c, d, GET(0), 0xf4292244, 6) + STEP(I, d, a, b, c, GET(7), 0x432aff97, 10) + STEP(I, c, d, a, b, GET(14), 0xab9423a7, 15) + STEP(I, b, c, d, a, GET(5), 0xfc93a039, 21) + STEP(I, a, b, c, d, GET(12), 0x655b59c3, 6) + STEP(I, d, a, b, c, GET(3), 0x8f0ccc92, 10) + STEP(I, c, d, a, b, GET(10), 0xffeff47d, 15) + STEP(I, b, c, d, a, GET(1), 0x85845dd1, 21) + STEP(I, a, b, c, d, GET(8), 0x6fa87e4f, 6) + STEP(I, d, a, b, c, GET(15), 0xfe2ce6e0, 10) + STEP(I, c, d, a, b, GET(6), 0xa3014314, 15) + STEP(I, b, c, d, a, GET(13), 0x4e0811a1, 21) + STEP(I, a, b, c, d, GET(4), 0xf7537e82, 6) + STEP(I, d, a, b, c, GET(11), 0xbd3af235, 10) + STEP(I, c, d, a, b, GET(2), 0x2ad7d2bb, 15) + STEP(I, b, c, d, a, GET(9), 0xeb86d391, 21) + + a += saved_a; + b += saved_b; + c += saved_c; + d += saved_d; + + ptr += 64; + } while (size -= 64); + + ctx->a = a; + ctx->b = b; + ctx->c = c; + ctx->d = d; + + return ptr; +} + +void MD5_Init(MD5_CTX *ctx) +{ + ctx->a = 0x67452301; + ctx->b = 0xefcdab89; + ctx->c = 0x98badcfe; + ctx->d = 0x10325476; + + ctx->lo = 0; + ctx->hi = 0; +} + +void MD5_Update(MD5_CTX *ctx, void *data, unsigned long size) +{ + MD5_u32plus saved_lo; + unsigned long used, free; + + saved_lo = ctx->lo; + if ((ctx->lo = (saved_lo + size) & 0x1fffffff) < saved_lo) + ctx->hi++; + ctx->hi += size >> 29; + + used = saved_lo & 0x3f; + + if (used) { + free = 64 - 
used; + + if (size < free) { + memcpy(&ctx->buffer[used], data, size); + return; + } + + memcpy(&ctx->buffer[used], data, free); + data = (unsigned char *)data + free; + size -= free; + body(ctx, ctx->buffer, 64); + } + + if (size >= 64) { + data = body(ctx, data, size & ~(unsigned long)0x3f); + size &= 0x3f; + } + + memcpy(ctx->buffer, data, size); +} + +void MD5_Final(unsigned char *result, MD5_CTX *ctx) +{ + unsigned long used, free; + + used = ctx->lo & 0x3f; + + ctx->buffer[used++] = 0x80; + + free = 64 - used; + + if (free < 8) { + memset(&ctx->buffer[used], 0, free); + body(ctx, ctx->buffer, 64); + used = 0; + free = 64; + } + + memset(&ctx->buffer[used], 0, free - 8); + + ctx->lo <<= 3; + ctx->buffer[56] = ctx->lo; + ctx->buffer[57] = ctx->lo >> 8; + ctx->buffer[58] = ctx->lo >> 16; + ctx->buffer[59] = ctx->lo >> 24; + ctx->buffer[60] = ctx->hi; + ctx->buffer[61] = ctx->hi >> 8; + ctx->buffer[62] = ctx->hi >> 16; + ctx->buffer[63] = ctx->hi >> 24; + + body(ctx, ctx->buffer, 64); + + result[0] = ctx->a; + result[1] = ctx->a >> 8; + result[2] = ctx->a >> 16; + result[3] = ctx->a >> 24; + result[4] = ctx->b; + result[5] = ctx->b >> 8; + result[6] = ctx->b >> 16; + result[7] = ctx->b >> 24; + result[8] = ctx->c; + result[9] = ctx->c >> 8; + result[10] = ctx->c >> 16; + result[11] = ctx->c >> 24; + result[12] = ctx->d; + result[13] = ctx->d >> 8; + result[14] = ctx->d >> 16; + result[15] = ctx->d >> 24; + + memset(ctx, 0, sizeof(*ctx)); +} + +#endif diff --git a/motion.cc b/motion.cc new file mode 100644 index 0000000..deae240 --- /dev/null +++ b/motion.cc @@ -0,0 +1,2111 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. 
+ * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "motion.h" +#include "decctx.h" +#include "util.h" +#include "dpb.h" + +#include + + +#include +#include +#include + +#if defined(_MSC_VER) || defined(__MINGW32__) +# include +#elif defined(HAVE_ALLOCA_H) +# include +#endif + + +#define MAX_CU_SIZE 64 + + +static int extra_before[4] = { 0,3,3,2 }; +static int extra_after [4] = { 0,3,4,4 }; + + + +template +void mc_luma(const base_context* ctx, + const seq_parameter_set* sps, int mv_x, int mv_y, + int xP,int yP, + int16_t* out, int out_stride, + const pixel_t* ref, int ref_stride, + int nPbW, int nPbH, int bitDepth_L) +{ + int xFracL = mv_x & 3; + int yFracL = mv_y & 3; + + int xIntOffsL = xP + (mv_x>>2); + int yIntOffsL = yP + (mv_y>>2); + + // luma sample interpolation process (8.5.3.2.2.1) + + //const int shift1 = sps->BitDepth_Y-8; + //const int shift2 = 6; + const int shift3 = 14 - sps->BitDepth_Y; + + int w = sps->pic_width_in_luma_samples; + int h = sps->pic_height_in_luma_samples; + + ALIGNED_16(int16_t) mcbuffer[MAX_CU_SIZE * (MAX_CU_SIZE+7)]; + + if (xFracL==0 && yFracL==0) { + + if (xIntOffsL >= 0 && yIntOffsL >= 0 && + nPbW+xIntOffsL <= w && nPbH+yIntOffsL <= h) { + + ctx->acceleration.put_hevc_qpel(out, out_stride, + &ref[yIntOffsL*ref_stride + xIntOffsL], + ref_stride /* sizeof(pixel_t)*/, + nPbW,nPbH, mcbuffer, 0,0, bitDepth_L); + } + else { + for (int y=0;y \n"); + + for (int y=0;y> 6); // 6 will be used when summing predictions + } + logtrace(LogMotion,"\n"); + } +#endif + } + else { + int extra_left = extra_before[xFracL]; + int extra_right = extra_after [xFracL]; + int extra_top = extra_before[yFracL]; + int 
extra_bottom = extra_after [yFracL]; + + //int nPbW_extra = extra_left + nPbW + extra_right; + //int nPbH_extra = extra_top + nPbH + extra_bottom; + + + pixel_t padbuf[(MAX_CU_SIZE+16)*(MAX_CU_SIZE+7)]; + + const pixel_t* src_ptr; + int src_stride; + + if (-extra_left + xIntOffsL >= 0 && + -extra_top + yIntOffsL >= 0 && + nPbW+extra_right + xIntOffsL < w && + nPbH+extra_bottom + yIntOffsL < h) { + src_ptr = &ref[xIntOffsL + yIntOffsL*ref_stride]; + src_stride = ref_stride; + } + else { + for (int y=-extra_top;yacceleration.put_hevc_qpel(out, out_stride, + src_ptr, src_stride /* sizeof(pixel_t) */, + nPbW,nPbH, mcbuffer, xFracL,yFracL, bitDepth_L); + + + logtrace(LogMotion,"---V---\n"); + for (int y=0;y +void mc_chroma(const base_context* ctx, + const seq_parameter_set* sps, + int mv_x, int mv_y, + int xP,int yP, + int16_t* out, int out_stride, + const pixel_t* ref, int ref_stride, + int nPbWC, int nPbHC, int bit_depth_C) +{ + // chroma sample interpolation process (8.5.3.2.2.2) + + //const int shift1 = sps->BitDepth_C-8; + //const int shift2 = 6; + const int shift3 = 14 - sps->BitDepth_C; + + int wC = sps->pic_width_in_luma_samples /sps->SubWidthC; + int hC = sps->pic_height_in_luma_samples/sps->SubHeightC; + + mv_x *= 2 / sps->SubWidthC; + mv_y *= 2 / sps->SubHeightC; + + int xFracC = mv_x & 7; + int yFracC = mv_y & 7; + + int xIntOffsC = xP/sps->SubWidthC + (mv_x>>3); + int yIntOffsC = yP/sps->SubHeightC + (mv_y>>3); + + ALIGNED_32(int16_t mcbuffer[MAX_CU_SIZE*(MAX_CU_SIZE+7)]); + + if (xFracC == 0 && yFracC == 0) { + if (xIntOffsC>=0 && nPbWC+xIntOffsC<=wC && + yIntOffsC>=0 && nPbHC+yIntOffsC<=hC) { + ctx->acceleration.put_hevc_epel(out, out_stride, + &ref[xIntOffsC + yIntOffsC*ref_stride], ref_stride, + nPbWC,nPbHC, 0,0, NULL, bit_depth_C); + } + else + { + for (int y=0;y=1 && nPbWC+xIntOffsC<=wC-2 && + yIntOffsC>=1 && nPbHC+yIntOffsC<=hC-2) { + src_ptr = &ref[xIntOffsC + yIntOffsC*ref_stride]; + src_stride = ref_stride; + } + else { + for (int 
y=-extra_top;yacceleration.put_hevc_epel_hv(out, out_stride, + src_ptr, src_stride, + nPbWC,nPbHC, xFracC,yFracC, mcbuffer, bit_depth_C); + } + else if (xFracC) { + ctx->acceleration.put_hevc_epel_h(out, out_stride, + src_ptr, src_stride, + nPbWC,nPbHC, xFracC,yFracC, mcbuffer, bit_depth_C); + } + else if (yFracC) { + ctx->acceleration.put_hevc_epel_v(out, out_stride, + src_ptr, src_stride, + nPbWC,nPbHC, xFracC,yFracC, mcbuffer, bit_depth_C); + } + else { + assert(false); // full-pel shifts are handled above + } + } +} + + + +// 8.5.3.2 +// NOTE: for full-pel shifts, we can introduce a fast path, simply copying without shifts +void generate_inter_prediction_samples(base_context* ctx, + const slice_segment_header* shdr, + de265_image* img, + int xC,int yC, + int xB,int yB, + int nCS, int nPbW,int nPbH, + const PBMotion* vi) +{ + int xP = xC+xB; + int yP = yC+yB; + + void* pixels[3]; + int stride[3]; + + const pic_parameter_set* pps = shdr->pps.get(); + const seq_parameter_set* sps = pps->sps.get(); + + const int SubWidthC = sps->SubWidthC; + const int SubHeightC = sps->SubHeightC; + + pixels[0] = img->get_image_plane_at_pos_any_depth(0,xP,yP); + stride[0] = img->get_image_stride(0); + + pixels[1] = img->get_image_plane_at_pos_any_depth(1,xP/SubWidthC,yP/SubHeightC); + stride[1] = img->get_image_stride(1); + + pixels[2] = img->get_image_plane_at_pos_any_depth(2,xP/SubWidthC,yP/SubHeightC); + stride[2] = img->get_image_stride(2); + + + ALIGNED_16(int16_t) predSamplesL [2 /* LX */][MAX_CU_SIZE* MAX_CU_SIZE]; + ALIGNED_16(int16_t) predSamplesC[2 /* chroma */ ][2 /* LX */][MAX_CU_SIZE* MAX_CU_SIZE]; + + //int xP = xC+xB; + //int yP = yC+yB; + + int predFlag[2]; + predFlag[0] = vi->predFlag[0]; + predFlag[1] = vi->predFlag[1]; + + const int bit_depth_L = sps->BitDepth_Y; + const int bit_depth_C = sps->BitDepth_C; + + // Some encoders use bi-prediction with two similar MVs. + // Identify this case and use only one MV. 
+ + // do this only without weighted prediction, because the weights/offsets may be different + if (pps->weighted_pred_flag==0) { + if (predFlag[0] && predFlag[1]) { + if (vi->mv[0].x == vi->mv[1].x && + vi->mv[0].y == vi->mv[1].y && + shdr->RefPicList[0][vi->refIdx[0]] == + shdr->RefPicList[1][vi->refIdx[1]]) { + predFlag[1] = 0; + } + } + } + + + for (int l=0;l<2;l++) { + if (predFlag[l]) { + // 8.5.3.2.1 + + if (vi->refIdx[l] >= MAX_NUM_REF_PICS) { + img->integrity = INTEGRITY_DECODING_ERRORS; + ctx->add_warning(DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED, false); + return; + } + + const de265_image* refPic = ctx->get_image(shdr->RefPicList[l][vi->refIdx[l]]); + + logtrace(LogMotion, "refIdx: %d -> dpb[%d]\n", vi->refIdx[l], shdr->RefPicList[l][vi->refIdx[l]]); + + if (!refPic || refPic->PicState == UnusedForReference) { + img->integrity = INTEGRITY_DECODING_ERRORS; + ctx->add_warning(DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED, false); + + // TODO: fill predSamplesC with black or grey + } + else { + // 8.5.3.2.2 + + logtrace(LogMotion,"do MC: L%d,MV=%d;%d RefPOC=%d\n", + l,vi->mv[l].x,vi->mv[l].y,refPic->PicOrderCntVal); + + + // TODO: must predSamples stride really be nCS or can it be somthing smaller like nPbW? 
+ + if (img->high_bit_depth(0)) { + mc_luma(ctx, sps, vi->mv[l].x, vi->mv[l].y, xP,yP, + predSamplesL[l],nCS, + (const uint16_t*)refPic->get_image_plane(0), + refPic->get_luma_stride(), nPbW,nPbH, bit_depth_L); + } + else { + mc_luma(ctx, sps, vi->mv[l].x, vi->mv[l].y, xP,yP, + predSamplesL[l],nCS, + (const uint8_t*)refPic->get_image_plane(0), + refPic->get_luma_stride(), nPbW,nPbH, bit_depth_L); + } + + if (img->high_bit_depth(0)) { + mc_chroma(ctx, sps, vi->mv[l].x, vi->mv[l].y, xP,yP, + predSamplesC[0][l],nCS, (const uint16_t*)refPic->get_image_plane(1), + refPic->get_chroma_stride(), nPbW/SubWidthC,nPbH/SubHeightC, bit_depth_C); + mc_chroma(ctx, sps, vi->mv[l].x, vi->mv[l].y, xP,yP, + predSamplesC[1][l],nCS, (const uint16_t*)refPic->get_image_plane(2), + refPic->get_chroma_stride(), nPbW/SubWidthC,nPbH/SubHeightC, bit_depth_C); + } + else { + mc_chroma(ctx, sps, vi->mv[l].x, vi->mv[l].y, xP,yP, + predSamplesC[0][l],nCS, (const uint8_t*)refPic->get_image_plane(1), + refPic->get_chroma_stride(), nPbW/SubWidthC,nPbH/SubHeightC, bit_depth_C); + mc_chroma(ctx, sps, vi->mv[l].x, vi->mv[l].y, xP,yP, + predSamplesC[1][l],nCS, (const uint8_t*)refPic->get_image_plane(2), + refPic->get_chroma_stride(), nPbW/SubWidthC,nPbH/SubHeightC, bit_depth_C); + } + } + } + } + + + // weighted sample prediction (8.5.3.2.3) + + const int shift1_L = libde265_max(2,14-sps->BitDepth_Y); + const int offset_shift1_L = img->get_sps().WpOffsetBdShiftY; + const int shift1_C = libde265_max(2,14-sps->BitDepth_C); + const int offset_shift1_C = img->get_sps().WpOffsetBdShiftC; + + /* + const int shift1_L = 14-img->sps.BitDepth_Y; + const int offset_shift1_L = img->sps.BitDepth_Y-8; + const int shift1_C = 14-img->sps.BitDepth_C; + const int offset_shift1_C = img->sps.BitDepth_C-8; + */ + + /* + if (0) + printf("%d/%d %d/%d %d/%d %d/%d\n", + shift1_L, + Nshift1_L, + offset_shift1_L, + Noffset_shift1_L, + shift1_C, + Nshift1_C, + offset_shift1_C, + Noffset_shift1_C); + + assert(shift1_L== + 
Nshift1_L); + assert(offset_shift1_L== + Noffset_shift1_L); + assert(shift1_C== + Nshift1_C); + assert(offset_shift1_C== + Noffset_shift1_C); + */ + + + logtrace(LogMotion,"predFlags (modified): %d %d\n", predFlag[0], predFlag[1]); + + if (shdr->slice_type == SLICE_TYPE_P) { + if (pps->weighted_pred_flag==0) { + if (predFlag[0]==1 && predFlag[1]==0) { + ctx->acceleration.put_unweighted_pred(pixels[0], stride[0], + predSamplesL[0],nCS, nPbW,nPbH, bit_depth_L); + ctx->acceleration.put_unweighted_pred(pixels[1], stride[1], + predSamplesC[0][0],nCS, + nPbW/SubWidthC,nPbH/SubHeightC, bit_depth_C); + ctx->acceleration.put_unweighted_pred(pixels[2], stride[2], + predSamplesC[1][0],nCS, + nPbW/SubWidthC,nPbH/SubHeightC, bit_depth_C); + } + else { + ctx->add_warning(DE265_WARNING_BOTH_PREDFLAGS_ZERO, false); + img->integrity = INTEGRITY_DECODING_ERRORS; + } + } + else { + // weighted prediction + + if (predFlag[0]==1 && predFlag[1]==0) { + + int refIdx0 = vi->refIdx[0]; + + int luma_log2WD = shdr->luma_log2_weight_denom + shift1_L; + int chroma_log2WD = shdr->ChromaLog2WeightDenom + shift1_C; + + int luma_w0 = shdr->LumaWeight[0][refIdx0]; + int luma_o0 = shdr->luma_offset[0][refIdx0] * (1<<(offset_shift1_L)); + + int chroma0_w0 = shdr->ChromaWeight[0][refIdx0][0]; + int chroma0_o0 = shdr->ChromaOffset[0][refIdx0][0] * (1<<(offset_shift1_C)); + int chroma1_w0 = shdr->ChromaWeight[0][refIdx0][1]; + int chroma1_o0 = shdr->ChromaOffset[0][refIdx0][1] * (1<<(offset_shift1_C)); + + logtrace(LogMotion,"weighted-0 [%d] %d %d %d %dx%d\n", refIdx0, luma_log2WD-6,luma_w0,luma_o0,nPbW,nPbH); + + ctx->acceleration.put_weighted_pred(pixels[0], stride[0], + predSamplesL[0],nCS, nPbW,nPbH, + luma_w0, luma_o0, luma_log2WD, bit_depth_L); + ctx->acceleration.put_weighted_pred(pixels[1], stride[1], + predSamplesC[0][0],nCS, nPbW/SubWidthC,nPbH/SubHeightC, + chroma0_w0, chroma0_o0, chroma_log2WD, bit_depth_C); + ctx->acceleration.put_weighted_pred(pixels[2], stride[2], + 
predSamplesC[1][0],nCS, nPbW/SubWidthC,nPbH/SubHeightC, + chroma1_w0, chroma1_o0, chroma_log2WD, bit_depth_C); + } + else { + ctx->add_warning(DE265_WARNING_BOTH_PREDFLAGS_ZERO, false); + img->integrity = INTEGRITY_DECODING_ERRORS; + } + } + } + else { + assert(shdr->slice_type == SLICE_TYPE_B); + + if (predFlag[0]==1 && predFlag[1]==1) { + if (pps->weighted_bipred_flag==0) { + //const int shift2 = 15-8; // TODO: real bit depth + //const int offset2 = 1<<(shift2-1); + + int16_t* in0 = predSamplesL[0]; + int16_t* in1 = predSamplesL[1]; + + ctx->acceleration.put_weighted_pred_avg(pixels[0], stride[0], + in0,in1, nCS, nPbW, nPbH, bit_depth_L); + + int16_t* in00 = predSamplesC[0][0]; + int16_t* in01 = predSamplesC[0][1]; + int16_t* in10 = predSamplesC[1][0]; + int16_t* in11 = predSamplesC[1][1]; + + ctx->acceleration.put_weighted_pred_avg(pixels[1], stride[1], + in00,in01, nCS, + nPbW/SubWidthC, nPbH/SubHeightC, bit_depth_C); + ctx->acceleration.put_weighted_pred_avg(pixels[2], stride[2], + in10,in11, nCS, + nPbW/SubWidthC, nPbH/SubHeightC, bit_depth_C); + } + else { + // weighted prediction + + int refIdx0 = vi->refIdx[0]; + int refIdx1 = vi->refIdx[1]; + + int luma_log2WD = shdr->luma_log2_weight_denom + shift1_L; + int chroma_log2WD = shdr->ChromaLog2WeightDenom + shift1_C; + + int luma_w0 = shdr->LumaWeight[0][refIdx0]; + int luma_o0 = shdr->luma_offset[0][refIdx0] * (1<<(offset_shift1_L)); + int luma_w1 = shdr->LumaWeight[1][refIdx1]; + int luma_o1 = shdr->luma_offset[1][refIdx1] * (1<<(offset_shift1_L)); + + int chroma0_w0 = shdr->ChromaWeight[0][refIdx0][0]; + int chroma0_o0 = shdr->ChromaOffset[0][refIdx0][0] * (1<<(offset_shift1_C)); + int chroma1_w0 = shdr->ChromaWeight[0][refIdx0][1]; + int chroma1_o0 = shdr->ChromaOffset[0][refIdx0][1] * (1<<(offset_shift1_C)); + int chroma0_w1 = shdr->ChromaWeight[1][refIdx1][0]; + int chroma0_o1 = shdr->ChromaOffset[1][refIdx1][0] * (1<<(offset_shift1_C)); + int chroma1_w1 = shdr->ChromaWeight[1][refIdx1][1]; + int 
chroma1_o1 = shdr->ChromaOffset[1][refIdx1][1] * (1<<(offset_shift1_C)); + + logtrace(LogMotion,"weighted-BI-0 [%d] %d %d %d %dx%d\n", refIdx0, luma_log2WD-6,luma_w0,luma_o0,nPbW,nPbH); + logtrace(LogMotion,"weighted-BI-1 [%d] %d %d %d %dx%d\n", refIdx1, luma_log2WD-6,luma_w1,luma_o1,nPbW,nPbH); + + int16_t* in0 = predSamplesL[0]; + int16_t* in1 = predSamplesL[1]; + + ctx->acceleration.put_weighted_bipred(pixels[0], stride[0], + in0,in1, nCS, nPbW, nPbH, + luma_w0,luma_o0, + luma_w1,luma_o1, + luma_log2WD, bit_depth_L); + + int16_t* in00 = predSamplesC[0][0]; + int16_t* in01 = predSamplesC[0][1]; + int16_t* in10 = predSamplesC[1][0]; + int16_t* in11 = predSamplesC[1][1]; + + ctx->acceleration.put_weighted_bipred(pixels[1], stride[1], + in00,in01, nCS, nPbW/SubWidthC, nPbH/SubHeightC, + chroma0_w0,chroma0_o0, + chroma0_w1,chroma0_o1, + chroma_log2WD, bit_depth_C); + ctx->acceleration.put_weighted_bipred(pixels[2], stride[2], + in10,in11, nCS, nPbW/SubWidthC, nPbH/SubHeightC, + chroma1_w0,chroma1_o0, + chroma1_w1,chroma1_o1, + chroma_log2WD, bit_depth_C); + } + } + else if (predFlag[0]==1 || predFlag[1]==1) { + int l = predFlag[0] ? 
0 : 1; + + if (pps->weighted_bipred_flag==0) { + ctx->acceleration.put_unweighted_pred(pixels[0], stride[0], + predSamplesL[l],nCS, nPbW,nPbH, bit_depth_L); + ctx->acceleration.put_unweighted_pred(pixels[1], stride[1], + predSamplesC[0][l],nCS, + nPbW/SubWidthC,nPbH/SubHeightC, bit_depth_C); + ctx->acceleration.put_unweighted_pred(pixels[2], stride[2], + predSamplesC[1][l],nCS, + nPbW/SubWidthC,nPbH/SubHeightC, bit_depth_C); + } + else { + int refIdx = vi->refIdx[l]; + + int luma_log2WD = shdr->luma_log2_weight_denom + shift1_L; + int chroma_log2WD = shdr->ChromaLog2WeightDenom + shift1_C; + + int luma_w = shdr->LumaWeight[l][refIdx]; + int luma_o = shdr->luma_offset[l][refIdx] * (1<<(offset_shift1_L)); + + int chroma0_w = shdr->ChromaWeight[l][refIdx][0]; + int chroma0_o = shdr->ChromaOffset[l][refIdx][0] * (1<<(offset_shift1_C)); + int chroma1_w = shdr->ChromaWeight[l][refIdx][1]; + int chroma1_o = shdr->ChromaOffset[l][refIdx][1] * (1<<(offset_shift1_C)); + + logtrace(LogMotion,"weighted-B-L%d [%d] %d %d %d %dx%d\n", l, refIdx, luma_log2WD-6,luma_w,luma_o,nPbW,nPbH); + + ctx->acceleration.put_weighted_pred(pixels[0], stride[0], + predSamplesL[l],nCS, nPbW,nPbH, + luma_w, luma_o, luma_log2WD, bit_depth_L); + ctx->acceleration.put_weighted_pred(pixels[1], stride[1], + predSamplesC[0][l],nCS, + nPbW/SubWidthC,nPbH/SubHeightC, + chroma0_w, chroma0_o, chroma_log2WD, bit_depth_C); + ctx->acceleration.put_weighted_pred(pixels[2], stride[2], + predSamplesC[1][l],nCS, + nPbW/SubWidthC,nPbH/SubHeightC, + chroma1_w, chroma1_o, chroma_log2WD, bit_depth_C); + } + } + else { + // TODO: check why it can actually happen that both predFlags[] are false. + // For now, we ignore this and continue decoding. 
+ + ctx->add_warning(DE265_WARNING_BOTH_PREDFLAGS_ZERO, false); + img->integrity = INTEGRITY_DECODING_ERRORS; + } + } + +#if defined(DE265_LOG_TRACE) && 0 + logtrace(LogTransform,"MC pixels (luma), position %d %d:\n", xP,yP); + + for (int y=0;yget_PartMode(x,y); } + const PBMotion& get_mv_info(int x,int y) const override { return img->get_mv_info(x,y); } + +private: + const de265_image* img; +}; + + + +/* + +--+ +--+--+ + |B2| |B1|B0| + +--+----------------+--+--+ + | | + | | + | | + | | + | PB | + | | + | | + +--+ | + |A1| | + +--+-------------------+ + |A0| + +--+ +*/ + + +// 8.5.3.1.2 +// TODO: check: can we fill the candidate list directly in this function and omit to copy later +/* + xC/yC: CB position + nCS: CB size (probably modified because of singleMCLFlag) + xP/yP: PB position (absolute) (probably modified because of singleMCLFlag) + singleMCLFlag + nPbW/nPbH: PB size + partIdx + out_cand: merging candidate vectors + + Add these candidates: + - A1 + - B1 (if != A1) + - B0 (if != B1) + - A0 (if != A1) + - B2 (if != A1 and != B1) + + A maximum of 4 candidates are generated. + + Note 1: For a CB splitted into two PBs, it does not make sense to merge the + second part to the parameters of the first part, since then, we could use 2Nx2N + right away. -> Exclude this candidate. 
+*/ +int derive_spatial_merging_candidates(//const de265_image* img, + const MotionVectorAccess& mvaccess, + const de265_image* img, + int xC, int yC, int nCS, int xP, int yP, + uint8_t singleMCLFlag, + int nPbW, int nPbH, + int partIdx, + PBMotion* out_cand, + int maxCandidates) +{ + const pic_parameter_set* pps = &img->get_pps(); + const int log2_parallel_merge_level = pps->log2_parallel_merge_level; + + enum PartMode PartMode = mvaccess.get_PartMode(xC,yC); + + /* + const int A0 = SpatialMergingCandidates::PRED_A0; + const int A1 = SpatialMergingCandidates::PRED_A1; + const int B0 = SpatialMergingCandidates::PRED_B0; + const int B1 = SpatialMergingCandidates::PRED_B1; + const int B2 = SpatialMergingCandidates::PRED_B2; + */ + + // --- A1 --- + + // a pixel within A1 (bottom right of A1) + int xA1 = xP-1; + int yA1 = yP+nPbH-1; + + bool availableA1; + int idxA1; + + int computed_candidates = 0; + + // check if candidate is in same motion-estimation region (MER) -> discard + if ((xP>>log2_parallel_merge_level) == (xA1>>log2_parallel_merge_level) && + (yP>>log2_parallel_merge_level) == (yA1>>log2_parallel_merge_level)) { + availableA1 = false; + logtrace(LogMotion,"spatial merging candidate A1: below parallel merge level\n"); + } + // redundant candidate? 
(Note 1) -> discard + else if (// !singleMCLFlag && automatically true when partIdx==1 + partIdx==1 && + (PartMode==PART_Nx2N || + PartMode==PART_nLx2N || + PartMode==PART_nRx2N)) { + availableA1 = false; + logtrace(LogMotion,"spatial merging candidate A1: second part ignore\n"); + } + // MV available in A1 + else { + availableA1 = img->available_pred_blk(xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xA1,yA1); + if (!availableA1) logtrace(LogMotion,"spatial merging candidate A1: unavailable\n"); + } + + if (availableA1) { + idxA1 = computed_candidates++; + out_cand[idxA1] = mvaccess.get_mv_info(xA1,yA1); + + logtrace(LogMotion,"spatial merging candidate A1:\n"); + logmvcand(out_cand[idxA1]); + } + + if (computed_candidates>=maxCandidates) return computed_candidates; + + + // --- B1 --- + + int xB1 = xP+nPbW-1; + int yB1 = yP-1; + + bool availableB1; + int idxB1; + + // same MER -> discard + if ((xP>>log2_parallel_merge_level) == (xB1>>log2_parallel_merge_level) && + (yP>>log2_parallel_merge_level) == (yB1>>log2_parallel_merge_level)) { + availableB1 = false; + logtrace(LogMotion,"spatial merging candidate B1: below parallel merge level\n"); + } + // redundant candidate (Note 1) -> discard + else if (// !singleMCLFlag && automatically true when partIdx==1 + partIdx==1 && + (PartMode==PART_2NxN || + PartMode==PART_2NxnU || + PartMode==PART_2NxnD)) { + availableB1 = false; + logtrace(LogMotion,"spatial merging candidate B1: second part ignore\n"); + } + // MV available in B1 + else { + availableB1 = img->available_pred_blk(xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xB1,yB1); + if (!availableB1) logtrace(LogMotion,"spatial merging candidate B1: unavailable\n"); + } + + if (availableB1) { + const PBMotion& b1 = img->get_mv_info(xB1,yB1); + + // B1 == A1 -> discard B1 + if (availableA1 && out_cand[idxA1] == b1) { + idxB1 = idxA1; + logtrace(LogMotion,"spatial merging candidate B1: redundant to A1\n"); + } + else { + idxB1 = computed_candidates++; + out_cand[idxB1] = b1; + + 
logtrace(LogMotion,"spatial merging candidate B1:\n"); + logmvcand(out_cand[idxB1]); + } + } + + if (computed_candidates>=maxCandidates) return computed_candidates; + + + // --- B0 --- + + int xB0 = xP+nPbW; + int yB0 = yP-1; + + bool availableB0; + int idxB0; + + if ((xP>>log2_parallel_merge_level) == (xB0>>log2_parallel_merge_level) && + (yP>>log2_parallel_merge_level) == (yB0>>log2_parallel_merge_level)) { + availableB0 = false; + logtrace(LogMotion,"spatial merging candidate B0: below parallel merge level\n"); + } + else { + availableB0 = img->available_pred_blk(xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xB0,yB0); + if (!availableB0) logtrace(LogMotion,"spatial merging candidate B0: unavailable\n"); + } + + if (availableB0) { + const PBMotion& b0 = img->get_mv_info(xB0,yB0); + + // B0 == B1 -> discard B0 + if (availableB1 && out_cand[idxB1]==b0) { + idxB0 = idxB1; + logtrace(LogMotion,"spatial merging candidate B0: redundant to B1\n"); + } + else { + idxB0 = computed_candidates++; + out_cand[idxB0] = b0; + logtrace(LogMotion,"spatial merging candidate B0:\n"); + logmvcand(out_cand[idxB0]); + } + } + + if (computed_candidates>=maxCandidates) return computed_candidates; + + + // --- A0 --- + + int xA0 = xP-1; + int yA0 = yP+nPbH; + + bool availableA0; + int idxA0; + + if ((xP>>log2_parallel_merge_level) == (xA0>>log2_parallel_merge_level) && + (yP>>log2_parallel_merge_level) == (yA0>>log2_parallel_merge_level)) { + availableA0 = false; + logtrace(LogMotion,"spatial merging candidate A0: below parallel merge level\n"); + } + else { + availableA0 = img->available_pred_blk(xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xA0,yA0); + if (!availableA0) logtrace(LogMotion,"spatial merging candidate A0: unavailable\n"); + } + + if (availableA0) { + const PBMotion& a0 = img->get_mv_info(xA0,yA0); + + // A0 == A1 -> discard A0 + if (availableA1 && out_cand[idxA1]==a0) { + idxA0 = idxA1; + logtrace(LogMotion,"spatial merging candidate A0: redundant to A1\n"); + } + else { + idxA0 = 
computed_candidates++; + out_cand[idxA0] = a0; + logtrace(LogMotion,"spatial merging candidate A0:\n"); + logmvcand(out_cand[idxA0]); + } + } + + if (computed_candidates>=maxCandidates) return computed_candidates; + + + // --- B2 --- + + int xB2 = xP-1; + int yB2 = yP-1; + + bool availableB2; + int idxB2; + + // if we already have four candidates, do not consider B2 anymore + if (computed_candidates==4) { + availableB2 = false; + logtrace(LogMotion,"spatial merging candidate B2: ignore\n"); + } + else if ((xP>>log2_parallel_merge_level) == (xB2>>log2_parallel_merge_level) && + (yP>>log2_parallel_merge_level) == (yB2>>log2_parallel_merge_level)) { + availableB2 = false; + logtrace(LogMotion,"spatial merging candidate B2: below parallel merge level\n"); + } + else { + availableB2 = img->available_pred_blk(xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xB2,yB2); + if (!availableB2) logtrace(LogMotion,"spatial merging candidate B2: unavailable\n"); + } + + if (availableB2) { + const PBMotion& b2 = img->get_mv_info(xB2,yB2); + + // B2 == B1 -> discard B2 + if (availableB1 && out_cand[idxB1]==b2) { + idxB2 = idxB1; + logtrace(LogMotion,"spatial merging candidate B2: redundant to B1\n"); + } + // B2 == A1 -> discard B2 + else if (availableA1 && out_cand[idxA1]==b2) { + idxB2 = idxA1; + logtrace(LogMotion,"spatial merging candidate B2: redundant to A1\n"); + } + else { + idxB2 = computed_candidates++; + out_cand[idxB2] = b2; + logtrace(LogMotion,"spatial merging candidate B2:\n"); + logmvcand(out_cand[idxB2]); + } + } + + return computed_candidates; +} + + +// 8.5.3.1.4 +void derive_zero_motion_vector_candidates(const slice_segment_header* shdr, + PBMotion* out_mergeCandList, + int* inout_numCurrMergeCand, + int maxCandidates) +{ + logtrace(LogMotion,"derive_zero_motion_vector_candidates\n"); + + int numRefIdx; + + if (shdr->slice_type==SLICE_TYPE_P) { + numRefIdx = shdr->num_ref_idx_l0_active; + } + else { + numRefIdx = libde265_min(shdr->num_ref_idx_l0_active, + 
shdr->num_ref_idx_l1_active); + } + + + //int numInputMergeCand = *inout_numMergeCand; + int zeroIdx = 0; + + while (*inout_numCurrMergeCand < maxCandidates) { + // 1. + + logtrace(LogMotion,"zeroIdx:%d numRefIdx:%d\n", zeroIdx, numRefIdx); + + PBMotion* newCand = &out_mergeCandList[*inout_numCurrMergeCand]; + + const int refIdx = (zeroIdx < numRefIdx) ? zeroIdx : 0; + + if (shdr->slice_type==SLICE_TYPE_P) { + newCand->refIdx[0] = refIdx; + newCand->refIdx[1] = -1; + newCand->predFlag[0] = 1; + newCand->predFlag[1] = 0; + } + else { + newCand->refIdx[0] = refIdx; + newCand->refIdx[1] = refIdx; + newCand->predFlag[0] = 1; + newCand->predFlag[1] = 1; + } + + newCand->mv[0].x = 0; + newCand->mv[0].y = 0; + newCand->mv[1].x = 0; + newCand->mv[1].y = 0; + + (*inout_numCurrMergeCand)++; + + // 2. + + zeroIdx++; + } +} + + +bool scale_mv(MotionVector* out_mv, MotionVector mv, int colDist, int currDist) +{ + int td = Clip3(-128,127, colDist); + int tb = Clip3(-128,127, currDist); + + if (td==0) { + *out_mv = mv; + return false; + } + else { + int tx = (16384 + (abs_value(td)>>1)) / td; + int distScaleFactor = Clip3(-4096,4095, (tb*tx+32)>>6); + out_mv->x = Clip3(-32768,32767, + Sign(distScaleFactor*mv.x)*((abs_value(distScaleFactor*mv.x)+127)>>8)); + out_mv->y = Clip3(-32768,32767, + Sign(distScaleFactor*mv.y)*((abs_value(distScaleFactor*mv.y)+127)>>8)); + return true; + } +} + + +// (L1003) 8.5.3.2.8 + +void derive_collocated_motion_vectors(base_context* ctx, + de265_image* img, + const slice_segment_header* shdr, + int xP,int yP, + int colPic, + int xColPb,int yColPb, + int refIdxLX, // (always 0 for merge mode) + int X, + MotionVector* out_mvLXCol, + uint8_t* out_availableFlagLXCol) +{ + logtrace(LogMotion,"derive_collocated_motion_vectors %d;%d\n",xP,yP); + + + // get collocated image and the prediction mode at the collocated position + + assert(ctx->has_image(colPic)); + const de265_image* colImg = ctx->get_image(colPic); + + // check for access outside image area + + 
if (xColPb >= colImg->get_width() || + yColPb >= colImg->get_height()) { + ctx->add_warning(DE265_WARNING_COLLOCATED_MOTION_VECTOR_OUTSIDE_IMAGE_AREA, false); + *out_availableFlagLXCol = 0; + return; + } + + enum PredMode predMode = colImg->get_pred_mode(xColPb,yColPb); + + + // collocated block is Intra -> no collocated MV + + if (predMode == MODE_INTRA) { + out_mvLXCol->x = 0; + out_mvLXCol->y = 0; + *out_availableFlagLXCol = 0; + return; + } + + + logtrace(LogMotion,"colPic:%d (POC=%d) X:%d refIdxLX:%d refpiclist:%d\n", + colPic, + colImg->PicOrderCntVal, + X,refIdxLX,shdr->RefPicList[X][refIdxLX]); + + + // collocated reference image is unavailable -> no collocated MV + + if (colImg->integrity == INTEGRITY_UNAVAILABLE_REFERENCE) { + out_mvLXCol->x = 0; + out_mvLXCol->y = 0; + *out_availableFlagLXCol = 0; + return; + } + + + // get the collocated MV + + const PBMotion& mvi = colImg->get_mv_info(xColPb,yColPb); + int listCol; + int refIdxCol; + MotionVector mvCol; + + logtrace(LogMotion,"read MVI %d;%d:\n",xColPb,yColPb); + logmvcand(mvi); + + + // collocated MV uses only L1 -> use L1 + if (mvi.predFlag[0]==0) { + mvCol = mvi.mv[1]; + refIdxCol = mvi.refIdx[1]; + listCol = 1; + } + // collocated MV uses only L0 -> use L0 + else if (mvi.predFlag[1]==0) { + mvCol = mvi.mv[0]; + refIdxCol = mvi.refIdx[0]; + listCol = 0; + } + // collocated MV uses L0 and L1 + else { + bool allRefFramesBeforeCurrentFrame = true; + + const int currentPOC = img->PicOrderCntVal; + + // all reference POCs earlier than current POC (list 1) + // Test L1 first, because there is a higher change to find a future reference frame. 
+ + for (int rIdx=0; rIdxnum_ref_idx_l1_active && allRefFramesBeforeCurrentFrame; rIdx++) + { + const de265_image* refimg = ctx->get_image(shdr->RefPicList[1][rIdx]); + int refPOC = refimg->PicOrderCntVal; + + if (refPOC > currentPOC) { + allRefFramesBeforeCurrentFrame = false; + } + } + + // all reference POCs earlier than current POC (list 0) + + for (int rIdx=0; rIdxnum_ref_idx_l0_active && allRefFramesBeforeCurrentFrame; rIdx++) + { + const de265_image* refimg = ctx->get_image(shdr->RefPicList[0][rIdx]); + int refPOC = refimg->PicOrderCntVal; + + if (refPOC > currentPOC) { + allRefFramesBeforeCurrentFrame = false; + } + } + + + /* TODO: What is the rationale behind this ??? + + My guess: + when there are images before the current frame (most probably in L0) and images after + the current frame (most probably in L1), we take the reference in the opposite + direction than where the collocated frame is positioned in the hope that the distance + to the current frame will be smaller and thus give a better prediction. + + If all references point into the past, we cannot say much about the temporal order or + L0,L1 and thus take over both parts. 
+ */ + + if (allRefFramesBeforeCurrentFrame) { + mvCol = mvi.mv[X]; + refIdxCol = mvi.refIdx[X]; + listCol = X; + } + else { + int N = shdr->collocated_from_l0_flag; + mvCol = mvi.mv[N]; + refIdxCol = mvi.refIdx[N]; + listCol = N; + } + } + + + + const slice_segment_header* colShdr = colImg->slices[ colImg->get_SliceHeaderIndex(xColPb,yColPb) ]; + + if (shdr->LongTermRefPic[X][refIdxLX] != + colShdr->LongTermRefPic[listCol][refIdxCol]) { + *out_availableFlagLXCol = 0; + out_mvLXCol->x = 0; + out_mvLXCol->y = 0; + } + else { + *out_availableFlagLXCol = 1; + + const bool isLongTerm = shdr->LongTermRefPic[X][refIdxLX]; + + int colDist = colImg->PicOrderCntVal - colShdr->RefPicList_POC[listCol][refIdxCol]; + int currDist = img->PicOrderCntVal - shdr->RefPicList_POC[X][refIdxLX]; + + logtrace(LogMotion,"COLPOCDIFF %d %d [%d %d / %d %d]\n",colDist, currDist, + colImg->PicOrderCntVal, colShdr->RefPicList_POC[listCol][refIdxCol], + img->PicOrderCntVal, shdr->RefPicList_POC[X][refIdxLX] + ); + + if (isLongTerm || colDist == currDist) { + *out_mvLXCol = mvCol; + } + else { + if (!scale_mv(out_mvLXCol, mvCol, colDist, currDist)) { + ctx->add_warning(DE265_WARNING_INCORRECT_MOTION_VECTOR_SCALING, false); + img->integrity = INTEGRITY_DECODING_ERRORS; + } + + logtrace(LogMotion,"scale: %d;%d to %d;%d\n", + mvCol.x,mvCol.y, out_mvLXCol->x,out_mvLXCol->y); + } + } +} + + +// 8.5.3.1.7 +void derive_temporal_luma_vector_prediction(base_context* ctx, + de265_image* img, + const slice_segment_header* shdr, + int xP,int yP, + int nPbW,int nPbH, + int refIdxL, + int X, // which MV (L0/L1) to get + MotionVector* out_mvLXCol, + uint8_t* out_availableFlagLXCol) +{ + // --- no temporal MVP -> exit --- + + if (shdr->slice_temporal_mvp_enabled_flag == 0) { + out_mvLXCol->x = 0; + out_mvLXCol->y = 0; + *out_availableFlagLXCol = 0; + return; + } + + + // --- find collocated reference image --- + + int Log2CtbSizeY = img->get_sps().Log2CtbSizeY; + + int colPic; // TODO: this is the same for the 
whole slice. We can precompute it. + + if (shdr->slice_type == SLICE_TYPE_B && + shdr->collocated_from_l0_flag == 0) + { + logtrace(LogMotion,"collocated L1 ref_idx=%d\n",shdr->collocated_ref_idx); + + colPic = shdr->RefPicList[1][ shdr->collocated_ref_idx ]; + } + else + { + logtrace(LogMotion,"collocated L0 ref_idx=%d\n",shdr->collocated_ref_idx); + + colPic = shdr->RefPicList[0][ shdr->collocated_ref_idx ]; + } + + + // check whether collocated reference picture exists + + if (!ctx->has_image(colPic)) { + out_mvLXCol->x = 0; + out_mvLXCol->y = 0; + *out_availableFlagLXCol = 0; + + ctx->add_warning(DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED, false); + return; + } + + + // --- get collocated MV either at bottom-right corner or from center of PB --- + + int xColPb,yColPb; + int yColBr = yP + nPbH; // bottom right collocated motion vector position + int xColBr = xP + nPbW; + + /* If neighboring pixel at bottom-right corner is in the same CTB-row and inside the image, + use this (reduced down to 16 pixels resolution) as collocated MV position. + + Note: see 2014, Sze, Sect. 5.2.1.2 why candidate C0 is excluded when on another CTB-row. + This is to reduce the memory bandwidth requirements. 
+ */ + if ((yP>>Log2CtbSizeY) == (yColBr>>Log2CtbSizeY) && + xColBr < img->get_sps().pic_width_in_luma_samples && + yColBr < img->get_sps().pic_height_in_luma_samples) + { + xColPb = xColBr & ~0x0F; // reduce resolution of collocated motion-vectors to 16 pixels grid + yColPb = yColBr & ~0x0F; + + derive_collocated_motion_vectors(ctx,img,shdr, xP,yP, colPic, xColPb,yColPb, refIdxL, X, + out_mvLXCol, out_availableFlagLXCol); + } + else + { + out_mvLXCol->x = 0; + out_mvLXCol->y = 0; + *out_availableFlagLXCol = 0; + } + + + if (*out_availableFlagLXCol==0) { + + int xColCtr = xP+(nPbW>>1); + int yColCtr = yP+(nPbH>>1); + + xColPb = xColCtr & ~0x0F; // reduce resolution of collocated motion-vectors to 16 pixels grid + yColPb = yColCtr & ~0x0F; + + derive_collocated_motion_vectors(ctx,img,shdr, xP,yP, colPic, xColPb,yColPb, refIdxL, X, + out_mvLXCol, out_availableFlagLXCol); + } +} + + +static int table_8_19[2][12] = { + { 0,1,0,2,1,2,0,3,1,3,2,3 }, + { 1,0,2,0,2,1,3,0,3,1,3,2 } + }; + +// 8.5.3.1.3 +/* Note (TODO): during decoding, we know which of the candidates we will select. ++ Hence, we do not really have to generate the other ones... 
++ */ +void derive_combined_bipredictive_merging_candidates(const base_context* ctx, + const slice_segment_header* shdr, + PBMotion* inout_mergeCandList, + int* inout_numMergeCand, + int maxCandidates) +{ + if (*inout_numMergeCand>1 && *inout_numMergeCand < maxCandidates) { + int numOrigMergeCand = *inout_numMergeCand; + + int numInputMergeCand = *inout_numMergeCand; + int combIdx = 0; + uint8_t combStop = false; + + while (!combStop) { + int l0CandIdx = table_8_19[0][combIdx]; + int l1CandIdx = table_8_19[1][combIdx]; + + if (l0CandIdx >= numInputMergeCand || + l1CandIdx >= numInputMergeCand) { + assert(false); // bitstream error -> TODO: conceal error + } + + PBMotion& l0Cand = inout_mergeCandList[l0CandIdx]; + PBMotion& l1Cand = inout_mergeCandList[l1CandIdx]; + + logtrace(LogMotion,"add bipredictive merging candidate (combIdx:%d)\n",combIdx); + logtrace(LogMotion,"l0Cand:\n"); logmvcand(l0Cand); + logtrace(LogMotion,"l1Cand:\n"); logmvcand(l1Cand); + + const de265_image* img0 = l0Cand.predFlag[0] ? ctx->get_image(shdr->RefPicList[0][l0Cand.refIdx[0]]) : NULL; + const de265_image* img1 = l1Cand.predFlag[1] ? 
ctx->get_image(shdr->RefPicList[1][l1Cand.refIdx[1]]) : NULL; + + if (l0Cand.predFlag[0] && !img0) { + return; // TODO error + } + + if (l1Cand.predFlag[1] && !img1) { + return; // TODO error + } + + if (l0Cand.predFlag[0] && l1Cand.predFlag[1] && + (img0->PicOrderCntVal != img1->PicOrderCntVal || + l0Cand.mv[0].x != l1Cand.mv[1].x || + l0Cand.mv[0].y != l1Cand.mv[1].y)) { + PBMotion& p = inout_mergeCandList[ *inout_numMergeCand ]; + p.refIdx[0] = l0Cand.refIdx[0]; + p.refIdx[1] = l1Cand.refIdx[1]; + p.predFlag[0] = l0Cand.predFlag[0]; + p.predFlag[1] = l1Cand.predFlag[1]; + p.mv[0] = l0Cand.mv[0]; + p.mv[1] = l1Cand.mv[1]; + (*inout_numMergeCand)++; + + logtrace(LogMotion,"result:\n"); + logmvcand(p); + } + + combIdx++; + if (combIdx == numOrigMergeCand*(numOrigMergeCand-1) || + *inout_numMergeCand == maxCandidates) { + combStop = true; + } + } + } +} + + +// 8.5.3.1.1 + +void get_merge_candidate_list_without_step_9(base_context* ctx, + const slice_segment_header* shdr, + const MotionVectorAccess& mvaccess, + de265_image* img, + int xC,int yC, int xP,int yP, + int nCS, int nPbW,int nPbH, int partIdx, + int max_merge_idx, + PBMotion* mergeCandList) +{ + + //int xOrigP = xP; + //int yOrigP = yP; + int nOrigPbW = nPbW; + int nOrigPbH = nPbH; + + int singleMCLFlag; // single merge-candidate-list (MCL) flag + + /* Use single MCL for CBs of size 8x8, except when parallel-merge-level is at 4x4. + Without this flag, PBs smaller than 8x8 would not receive as much merging candidates. + Having additional candidates might have these advantages: + - coding MVs for these small PBs is expensive, and + - since the PBs are not far away from a proper (neighboring) merging candidate, + the quality of the candidates will still be good. 
+ */ + singleMCLFlag = (img->get_pps().log2_parallel_merge_level > 2 && nCS==8); + + if (singleMCLFlag) { + xP=xC; + yP=yC; + nPbW=nCS; + nPbH=nCS; + partIdx=0; + } + + int maxCandidates = max_merge_idx+1; + //MotionVectorSpec mergeCandList[5]; + int numMergeCand=0; + + // --- spatial merge candidates + + numMergeCand = derive_spatial_merging_candidates(mvaccess, + img, xC,yC, nCS, xP,yP, singleMCLFlag, + nPbW,nPbH,partIdx, mergeCandList, + maxCandidates); + + // --- collocated merge candidate + if (numMergeCand < maxCandidates) { + int refIdxCol[2] = { 0,0 }; + + MotionVector mvCol[2]; + uint8_t predFlagLCol[2]; + derive_temporal_luma_vector_prediction(ctx,img,shdr, xP,yP,nPbW,nPbH, + refIdxCol[0],0, &mvCol[0], + &predFlagLCol[0]); + + uint8_t availableFlagCol = predFlagLCol[0]; + predFlagLCol[1] = 0; + + if (shdr->slice_type == SLICE_TYPE_B) { + derive_temporal_luma_vector_prediction(ctx,img,shdr, + xP,yP,nPbW,nPbH, refIdxCol[1],1, &mvCol[1], + &predFlagLCol[1]); + availableFlagCol |= predFlagLCol[1]; + } + + + if (availableFlagCol) { + PBMotion* colVec = &mergeCandList[numMergeCand++]; + + colVec->mv[0] = mvCol[0]; + colVec->mv[1] = mvCol[1]; + colVec->predFlag[0] = predFlagLCol[0]; + colVec->predFlag[1] = predFlagLCol[1]; + colVec->refIdx[0] = refIdxCol[0]; + colVec->refIdx[1] = refIdxCol[1]; + } + } + + + // --- bipredictive merge candidates --- + + if (shdr->slice_type == SLICE_TYPE_B) { + derive_combined_bipredictive_merging_candidates(ctx, shdr, + mergeCandList, &numMergeCand, maxCandidates); + } + + + // --- zero-vector merge candidates --- + + derive_zero_motion_vector_candidates(shdr, mergeCandList, &numMergeCand, maxCandidates); + + + logtrace(LogMotion,"mergeCandList:\n"); + for (int i=0;iMaxNumMergeCand;i++) + { + //logtrace(LogMotion, " %d:%s\n", i, i==merge_idx ? 
" SELECTED":""); + logmvcand(mergeCandList[i]); + } +} + + + +void get_merge_candidate_list(base_context* ctx, + const slice_segment_header* shdr, + de265_image* img, + int xC,int yC, int xP,int yP, + int nCS, int nPbW,int nPbH, int partIdx, + PBMotion* mergeCandList) +{ + int max_merge_idx = 5-shdr->five_minus_max_num_merge_cand -1; + + get_merge_candidate_list_without_step_9(ctx, shdr, + MotionVectorAccess_de265_image(img), img, + xC,yC,xP,yP,nCS,nPbW,nPbH, partIdx, + max_merge_idx, mergeCandList); + + // 9. for encoder: modify all merge candidates + + for (int i=0;i<=max_merge_idx;i++) { + if (mergeCandList[i].predFlag[0] && + mergeCandList[i].predFlag[1] && + nPbW+nPbH==12) + { + mergeCandList[i].refIdx[1] = -1; + mergeCandList[i].predFlag[1] = 0; + } + } +} + + +void derive_luma_motion_merge_mode(base_context* ctx, + const slice_segment_header* shdr, + de265_image* img, + int xC,int yC, int xP,int yP, + int nCS, int nPbW,int nPbH, int partIdx, + int merge_idx, + PBMotion* out_vi) +{ + PBMotion mergeCandList[5]; + + get_merge_candidate_list_without_step_9(ctx, shdr, + MotionVectorAccess_de265_image(img), img, + xC,yC,xP,yP,nCS,nPbW,nPbH, partIdx, + merge_idx, mergeCandList); + + + *out_vi = mergeCandList[merge_idx]; + + // 8.5.3.1.1 / 9. + + if (out_vi->predFlag[0] && out_vi->predFlag[1] && nPbW+nPbH==12) { + out_vi->refIdx[1] = -1; + out_vi->predFlag[1] = 0; + } +} + + +// 8.5.3.1.6 +void derive_spatial_luma_vector_prediction(base_context* ctx, + de265_image* img, + const slice_segment_header* shdr, + int xC,int yC,int nCS,int xP,int yP, + int nPbW,int nPbH, int X, + int refIdxLX, int partIdx, + uint8_t out_availableFlagLXN[2], + MotionVector out_mvLXN[2]) +{ + int isScaledFlagLX = 0; + + const int A=0; + const int B=1; + + out_availableFlagLXN[A] = 0; + out_availableFlagLXN[B] = 0; + + + // --- A --- + + // 1. + + int xA[2], yA[2]; + xA[0] = xP-1; + yA[0] = yP + nPbH; + xA[1] = xA[0]; + yA[1] = yA[0]-1; + + // 2. 
+ + out_availableFlagLXN[A] = 0; + out_mvLXN[A].x = 0; + out_mvLXN[A].y = 0; + + // 3. / 4. + + bool availableA[2]; + availableA[0] = img->available_pred_blk(xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xA[0],yA[0]); + availableA[1] = img->available_pred_blk(xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xA[1],yA[1]); + + // 5. + + if (availableA[0] || availableA[1]) { + isScaledFlagLX = 1; + } + + // 6. test A0 and A1 (Ak) + + int refIdxA=-1; + + // the POC we want to reference in this PB + const de265_image* tmpimg = ctx->get_image(shdr->RefPicList[X][ refIdxLX ]); + if (tmpimg==NULL) { return; } + const int referenced_POC = tmpimg->PicOrderCntVal; + + for (int k=0;k<=1;k++) { + if (availableA[k] && + out_availableFlagLXN[A]==0 && // no A?-predictor so far + img->get_pred_mode(xA[k],yA[k]) != MODE_INTRA) { + + int Y=1-X; + + const PBMotion& vi = img->get_mv_info(xA[k],yA[k]); + logtrace(LogMotion,"MVP A%d=\n",k); + logmvcand(vi); + + const de265_image* imgX = NULL; + if (vi.predFlag[X]) imgX = ctx->get_image(shdr->RefPicList[X][ vi.refIdx[X] ]); + const de265_image* imgY = NULL; + if (vi.predFlag[Y]) imgY = ctx->get_image(shdr->RefPicList[Y][ vi.refIdx[Y] ]); + + // check whether the predictor X is available and references the same POC + if (vi.predFlag[X] && imgX && imgX->PicOrderCntVal == referenced_POC) { + + logtrace(LogMotion,"take A%d/L%d as A candidate with same POC\n",k,X); + + out_availableFlagLXN[A]=1; + out_mvLXN[A] = vi.mv[X]; + refIdxA = vi.refIdx[X]; + } + // check whether the other predictor (Y) is available and references the same POC + else if (vi.predFlag[Y] && imgY && imgY->PicOrderCntVal == referenced_POC) { + + logtrace(LogMotion,"take A%d/L%d as A candidate with same POC\n",k,Y); + + out_availableFlagLXN[A]=1; + out_mvLXN[A] = vi.mv[Y]; + refIdxA = vi.refIdx[Y]; + } + } + } + + // 7. 
If there is no predictor referencing the same POC, we take any other reference as + // long as it is the same type of reference (long-term / short-term) + + for (int k=0 ; k<=1 && out_availableFlagLXN[A]==0 ; k++) { + int refPicList=-1; + + if (availableA[k] && + // TODO: we could remove this call by storing the result of the similar computation above + img->get_pred_mode(xA[k],yA[k]) != MODE_INTRA) { + + int Y=1-X; + + const PBMotion& vi = img->get_mv_info(xA[k],yA[k]); + if (vi.predFlag[X]==1 && + shdr->LongTermRefPic[X][refIdxLX] == shdr->LongTermRefPic[X][ vi.refIdx[X] ]) { + + logtrace(LogMotion,"take A%D/L%d as A candidate with different POCs\n",k,X); + + out_availableFlagLXN[A]=1; + out_mvLXN[A] = vi.mv[X]; + refIdxA = vi.refIdx[X]; + refPicList = X; + } + else if (vi.predFlag[Y]==1 && + shdr->LongTermRefPic[X][refIdxLX] == shdr->LongTermRefPic[Y][ vi.refIdx[Y] ]) { + + logtrace(LogMotion,"take A%d/L%d as A candidate with different POCs\n",k,Y); + + out_availableFlagLXN[A]=1; + out_mvLXN[A] = vi.mv[Y]; + refIdxA = vi.refIdx[Y]; + refPicList = Y; + } + } + + if (out_availableFlagLXN[A]==1) { + if (refIdxA<0) { + out_availableFlagLXN[0] = out_availableFlagLXN[1] = false; + return; // error + } + + assert(refIdxA>=0); + assert(refPicList>=0); + + const de265_image* refPicA = ctx->get_image(shdr->RefPicList[refPicList][refIdxA ]); + const de265_image* refPicX = ctx->get_image(shdr->RefPicList[X ][refIdxLX]); + + //int picStateA = shdr->RefPicList_PicState[refPicList][refIdxA ]; + //int picStateX = shdr->RefPicList_PicState[X ][refIdxLX]; + + int isLongTermA = shdr->LongTermRefPic[refPicList][refIdxA ]; + int isLongTermX = shdr->LongTermRefPic[X ][refIdxLX]; + + logtrace(LogMotion,"scale MVP A: A-POC:%d X-POC:%d\n", + refPicA->PicOrderCntVal,refPicX->PicOrderCntVal); + + if (!isLongTermA && !isLongTermX) + /* + if (picStateA == UsedForShortTermReference && + picStateX == UsedForShortTermReference) + */ + { + int distA = img->PicOrderCntVal - 
refPicA->PicOrderCntVal; + int distX = img->PicOrderCntVal - referenced_POC; + + if (!scale_mv(&out_mvLXN[A], out_mvLXN[A], distA, distX)) { + ctx->add_warning(DE265_WARNING_INCORRECT_MOTION_VECTOR_SCALING, false); + img->integrity = INTEGRITY_DECODING_ERRORS; + } + } + } + } + + + // --- B --- + + // 1. + + int xB[3], yB[3]; + xB[0] = xP+nPbW; + yB[0] = yP-1; + xB[1] = xB[0]-1; + yB[1] = yP-1; + xB[2] = xP-1; + yB[2] = yP-1; + + // 2. + + out_availableFlagLXN[B] = 0; + out_mvLXN[B].x = 0; + out_mvLXN[B].y = 0; + + // 3. test B0,B1,B2 (Bk) + + int refIdxB=-1; + + bool availableB[3]; + for (int k=0;k<3;k++) { + availableB[k] = img->available_pred_blk(xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xB[k],yB[k]); + + if (availableB[k] && out_availableFlagLXN[B]==0) { + + int Y=1-X; + + const PBMotion& vi = img->get_mv_info(xB[k],yB[k]); + logtrace(LogMotion,"MVP B%d=\n",k); + logmvcand(vi); + + + const de265_image* imgX = NULL; + if (vi.predFlag[X]) imgX = ctx->get_image(shdr->RefPicList[X][ vi.refIdx[X] ]); + const de265_image* imgY = NULL; + if (vi.predFlag[Y]) imgY = ctx->get_image(shdr->RefPicList[Y][ vi.refIdx[Y] ]); + + if (vi.predFlag[X] && imgX && imgX->PicOrderCntVal == referenced_POC) { + logtrace(LogMotion,"a) take B%d/L%d as B candidate with same POC\n",k,X); + + out_availableFlagLXN[B]=1; + out_mvLXN[B] = vi.mv[X]; + refIdxB = vi.refIdx[X]; + } + else if (vi.predFlag[Y] && imgY && imgY->PicOrderCntVal == referenced_POC) { + logtrace(LogMotion,"b) take B%d/L%d as B candidate with same POC\n",k,Y); + + out_availableFlagLXN[B]=1; + out_mvLXN[B] = vi.mv[Y]; + refIdxB = vi.refIdx[Y]; + } + } + } + + // 4. + + if (isScaledFlagLX==0 && // no A predictor, + out_availableFlagLXN[B]) // but an unscaled B predictor + { + // use unscaled B predictor as A predictor + + logtrace(LogMotion,"copy the same-POC B candidate as additional A candidate\n"); + + out_availableFlagLXN[A]=1; + out_mvLXN[A] = out_mvLXN[B]; + refIdxA = refIdxB; + } + + // 5. 
+ + // If no A predictor, we output the unscaled B as the A predictor (above) + // and also add a scaled B predictor here. + // If there is (probably) an A predictor, no differing-POC B predictor is generated. + if (isScaledFlagLX==0) { + out_availableFlagLXN[B]=0; + + for (int k=0 ; k<=2 && out_availableFlagLXN[B]==0 ; k++) { + int refPicList=-1; + + if (availableB[k]) { + int Y=1-X; + + const PBMotion& vi = img->get_mv_info(xB[k],yB[k]); + + if (vi.predFlag[X]==1 && + shdr->LongTermRefPic[X][refIdxLX] == shdr->LongTermRefPic[X][ vi.refIdx[X] ]) { + out_availableFlagLXN[B]=1; + out_mvLXN[B] = vi.mv[X]; + refIdxB = vi.refIdx[X]; + refPicList = X; + } + else if (vi.predFlag[Y]==1 && + shdr->LongTermRefPic[X][refIdxLX] == shdr->LongTermRefPic[Y][ vi.refIdx[Y] ]) { + out_availableFlagLXN[B]=1; + out_mvLXN[B] = vi.mv[Y]; + refIdxB = vi.refIdx[Y]; + refPicList = Y; + } + } + + if (out_availableFlagLXN[B]==1) { + if (refIdxB<0) { + out_availableFlagLXN[0] = out_availableFlagLXN[1] = false; + return; // error + } + + assert(refPicList>=0); + assert(refIdxB>=0); + + const de265_image* refPicB=ctx->get_image(shdr->RefPicList[refPicList][refIdxB ]); + const de265_image* refPicX=ctx->get_image(shdr->RefPicList[X ][refIdxLX]); + + int isLongTermB = shdr->LongTermRefPic[refPicList][refIdxB ]; + int isLongTermX = shdr->LongTermRefPic[X ][refIdxLX]; + + if (refPicB==NULL || refPicX==NULL) { + img->decctx->add_warning(DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED,false); + img->integrity = INTEGRITY_DECODING_ERRORS; + } + else if (refPicB->PicOrderCntVal != refPicX->PicOrderCntVal && + !isLongTermB && !isLongTermX) { + int distB = img->PicOrderCntVal - refPicB->PicOrderCntVal; + int distX = img->PicOrderCntVal - referenced_POC; + + logtrace(LogMotion,"scale MVP B: B-POC:%d X-POC:%d\n",refPicB->PicOrderCntVal,refPicX->PicOrderCntVal); + + if (!scale_mv(&out_mvLXN[B], out_mvLXN[B], distB, distX)) { + ctx->add_warning(DE265_WARNING_INCORRECT_MOTION_VECTOR_SCALING, false); + 
img->integrity = INTEGRITY_DECODING_ERRORS; + } + } + } + } + } +} + + +// 8.5.3.1.5 +void fill_luma_motion_vector_predictors(base_context* ctx, + const slice_segment_header* shdr, + de265_image* img, + int xC,int yC,int nCS,int xP,int yP, + int nPbW,int nPbH, int l, + int refIdx, int partIdx, + MotionVector out_mvpList[2]) +{ + // 8.5.3.1.6: derive two spatial vector predictors A (0) and B (1) + + uint8_t availableFlagLXN[2]; + MotionVector mvLXN[2]; + + derive_spatial_luma_vector_prediction(ctx, img, shdr, xC,yC, nCS, xP,yP, + nPbW,nPbH, l, refIdx, partIdx, + availableFlagLXN, mvLXN); + + // 8.5.3.1.7: if we only have one spatial vector or both spatial vectors are the same, + // derive a temporal predictor + + uint8_t availableFlagLXCol; + MotionVector mvLXCol; + + + if (availableFlagLXN[0] && + availableFlagLXN[1] && + (mvLXN[0].x != mvLXN[1].x || mvLXN[0].y != mvLXN[1].y)) { + availableFlagLXCol = 0; + } + else { + derive_temporal_luma_vector_prediction(ctx, img, shdr, + xP,yP, nPbW,nPbH, refIdx,l, + &mvLXCol, &availableFlagLXCol); + } + + + // --- build candidate vector list with exactly two entries --- + + int numMVPCandLX=0; + + // spatial predictor A + + if (availableFlagLXN[0]) + { + out_mvpList[numMVPCandLX++] = mvLXN[0]; + } + + // spatial predictor B (if not same as A) + + if (availableFlagLXN[1] && + (!availableFlagLXN[0] || // in case A in not available, but mvLXA initialized to same as mvLXB + (mvLXN[0].x != mvLXN[1].x || mvLXN[0].y != mvLXN[1].y))) + { + out_mvpList[numMVPCandLX++] = mvLXN[1]; + } + + // temporal predictor + + if (availableFlagLXCol) + { + out_mvpList[numMVPCandLX++] = mvLXCol; + } + + // fill with zero predictors + + while (numMVPCandLX<2) { + out_mvpList[numMVPCandLX].x = 0; + out_mvpList[numMVPCandLX].y = 0; + numMVPCandLX++; + } + + + assert(numMVPCandLX==2); +} + + +MotionVector luma_motion_vector_prediction(base_context* ctx, + const slice_segment_header* shdr, + de265_image* img, + const PBMotionCoding& motion, + int xC,int 
yC,int nCS,int xP,int yP, + int nPbW,int nPbH, int l, + int refIdx, int partIdx) +{ + MotionVector mvpList[2]; + + fill_luma_motion_vector_predictors(ctx, shdr, img, + xC,yC,nCS,xP,yP, + nPbW, nPbH, l, refIdx, partIdx, + mvpList); + + // select predictor according to mvp_lX_flag + + return mvpList[ l ? motion.mvp_l1_flag : motion.mvp_l0_flag ]; +} + + +#if DE265_LOG_TRACE +void logMV(int x0,int y0,int nPbW,int nPbH, const char* mode,const PBMotion* mv) +{ + int pred0 = mv->predFlag[0]; + int pred1 = mv->predFlag[1]; + + logtrace(LogMotion, + "*MV %d;%d [%d;%d] %s: (%d) %d;%d @%d (%d) %d;%d @%d\n", x0,y0,nPbW,nPbH,mode, + pred0, + pred0 ? mv->mv[0].x : 0,pred0 ? mv->mv[0].y : 0, pred0 ? mv->refIdx[0] : 0, + pred1, + pred1 ? mv->mv[1].x : 0,pred1 ? mv->mv[1].y : 0, pred1 ? mv->refIdx[1] : 0); +} +#else +#define logMV(x0,y0,nPbW,nPbH,mode,mv) +#endif + + + +// 8.5.3.1 +void motion_vectors_and_ref_indices(base_context* ctx, + const slice_segment_header* shdr, + de265_image* img, + const PBMotionCoding& motion, + int xC,int yC, int xB,int yB, int nCS, int nPbW,int nPbH, + int partIdx, + PBMotion* out_vi) +{ + //slice_segment_header* shdr = tctx->shdr; + + int xP = xC+xB; + int yP = yC+yB; + + enum PredMode predMode = img->get_pred_mode(xC,yC); + + if (predMode == MODE_SKIP || + (predMode == MODE_INTER && motion.merge_flag)) + { + derive_luma_motion_merge_mode(ctx,shdr,img, + xC,yC, xP,yP, nCS,nPbW,nPbH, partIdx, + motion.merge_idx, out_vi); + + logMV(xP,yP,nPbW,nPbH, "merge_mode", out_vi); + } + else { + int mvdL[2][2]; + MotionVector mvpL[2]; + + for (int l=0;l<2;l++) { + // 1. + + enum InterPredIdc inter_pred_idc = (enum InterPredIdc)motion.inter_pred_idc; + + if (inter_pred_idc == PRED_BI || + (inter_pred_idc == PRED_L0 && l==0) || + (inter_pred_idc == PRED_L1 && l==1)) { + out_vi->refIdx[l] = motion.refIdx[l]; + out_vi->predFlag[l] = 1; + } + else { + out_vi->refIdx[l] = -1; + out_vi->predFlag[l] = 0; + } + + // 2. 
+ + mvdL[l][0] = motion.mvd[l][0]; + mvdL[l][1] = motion.mvd[l][1]; + + + if (out_vi->predFlag[l]) { + // 3. + + mvpL[l] = luma_motion_vector_prediction(ctx,shdr,img,motion, + xC,yC,nCS,xP,yP, nPbW,nPbH, l, + out_vi->refIdx[l], partIdx); + + // 4. + + int32_t x = (mvpL[l].x + mvdL[l][0] + 0x10000) & 0xFFFF; + int32_t y = (mvpL[l].y + mvdL[l][1] + 0x10000) & 0xFFFF; + + out_vi->mv[l].x = (x>=0x8000) ? x-0x10000 : x; + out_vi->mv[l].y = (y>=0x8000) ? y-0x10000 : y; + } + } + + logMV(xP,yP,nPbW,nPbH, "mvp", out_vi); + } +} + + +// 8.5.3 + +/* xC/yC : CB position + xB/yB : position offset of the PB + nPbW/nPbH : size of PB + nCS : CB size + */ +void decode_prediction_unit(base_context* ctx, + const slice_segment_header* shdr, + de265_image* img, + const PBMotionCoding& motion, + int xC,int yC, int xB,int yB, int nCS, int nPbW,int nPbH, int partIdx) +{ + logtrace(LogMotion,"decode_prediction_unit POC=%d %d;%d %dx%d\n", + img->PicOrderCntVal, xC+xB,yC+yB, nPbW,nPbH); + + //slice_segment_header* shdr = tctx->shdr; + + // 1. + + PBMotion vi; + motion_vectors_and_ref_indices(ctx, shdr, img, motion, + xC,yC, xB,yB, nCS, nPbW,nPbH, partIdx, &vi); + + // 2. + + generate_inter_prediction_samples(ctx,shdr, img, xC,yC, xB,yB, nCS, nPbW,nPbH, &vi); + + + img->set_mv_info(xC+xB,yC+yB,nPbW,nPbH, vi); +} diff --git a/nal-parser.cc b/nal-parser.cc new file mode 100644 index 0000000..ea95ed1 --- /dev/null +++ b/nal-parser.cc @@ -0,0 +1,446 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. 
+ * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "nal-parser.h" + +#include +#include +#include +#include + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + + +NAL_unit::NAL_unit() + : skipped_bytes(DE265_SKIPPED_BYTES_INITIAL_SIZE) +{ + pts=0; + user_data = NULL; + + nal_data = NULL; + data_size = 0; + capacity = 0; +} + +NAL_unit::~NAL_unit() +{ + free(nal_data); +} + +void NAL_unit::clear() +{ + header = nal_header(); + pts = 0; + user_data = NULL; + + // set size to zero but keep memory + data_size = 0; + + skipped_bytes.clear(); +} + +LIBDE265_CHECK_RESULT bool NAL_unit::resize(int new_size) +{ + if (capacity < new_size) { + unsigned char* newbuffer = (unsigned char*)malloc(new_size); + if (newbuffer == NULL) { + return false; + } + + if (nal_data != NULL) { + memcpy(newbuffer, nal_data, data_size); + free(nal_data); + } + + nal_data = newbuffer; + capacity = new_size; + } + return true; +} + +LIBDE265_CHECK_RESULT bool NAL_unit::append(const unsigned char* in_data, int n) +{ + if (!resize(data_size + n)) { + return false; + } + memcpy(nal_data + data_size, in_data, n); + data_size += n; + return true; +} + +bool LIBDE265_CHECK_RESULT NAL_unit::set_data(const unsigned char* in_data, int n) +{ + if (!resize(n)) { + return false; + } + memcpy(nal_data, in_data, n); + data_size = n; + return true; +} + +void NAL_unit::insert_skipped_byte(int pos) +{ + skipped_bytes.push_back(pos); +} + +int NAL_unit::num_skipped_bytes_before(int byte_position, int headerLength) const +{ + for (int k=skipped_bytes.size()-1;k>=0;k--) + if (skipped_bytes[k]-headerLength <= byte_position) { + return k+1; + } + + return 0; +} + +void 
NAL_unit::remove_stuffing_bytes() +{ + uint8_t* p = data(); + + for (int i=0;i 0) { + nal = NAL_free_list.back(); + NAL_free_list.pop_back(); + } + else { + nal = new NAL_unit; + } + + nal->clear(); + if (!nal->resize(size)) { + free_NAL_unit(nal); + return NULL; + } + + return nal; +} + +void NAL_Parser::free_NAL_unit(NAL_unit* nal) +{ + if (nal == NULL) { + // Allow calling with NULL just like regular "free()" + return; + } + if (NAL_free_list.size() < DE265_NAL_FREE_LIST_SIZE) { + NAL_free_list.push_back(nal); + } + else { + delete nal; + } +} + +NAL_unit* NAL_Parser::pop_from_NAL_queue() +{ + if (NAL_queue.empty()) { + return NULL; + } + else { + NAL_unit* nal = NAL_queue.front(); + NAL_queue.pop(); + + nBytes_in_NAL_queue -= nal->size(); + + return nal; + } +} + +void NAL_Parser::push_to_NAL_queue(NAL_unit* nal) +{ + NAL_queue.push(nal); + nBytes_in_NAL_queue += nal->size(); +} + +de265_error NAL_Parser::push_data(const unsigned char* data, int len, + de265_PTS pts, void* user_data) +{ + end_of_frame = false; + + if (pending_input_NAL == NULL) { + pending_input_NAL = alloc_NAL_unit(len+3); + if (pending_input_NAL == NULL) { + return DE265_ERROR_OUT_OF_MEMORY; + } + pending_input_NAL->pts = pts; + pending_input_NAL->user_data = user_data; + } + + NAL_unit* nal = pending_input_NAL; // shortcut + + // Resize output buffer so that complete input would fit. + // We add 3, because in the worst case 3 extra bytes are created for an input byte. 
+ if (!nal->resize(nal->size() + len + 3)) { + return DE265_ERROR_OUT_OF_MEMORY; + } + + unsigned char* out = nal->data() + nal->size(); + + for (int i=0;iinput_push_state, *data, data, + out - ctx->nal_data.data); + */ + + switch (input_push_state) { + case 0: + case 1: + if (*data == 0) { input_push_state++; } + else { input_push_state=0; } + break; + case 2: + if (*data == 1) { input_push_state=3; } // nal->clear_skipped_bytes(); } + else if (*data == 0) { } // *out++ = 0; } + else { input_push_state=0; } + break; + case 3: + *out++ = *data; + input_push_state = 4; + break; + case 4: + *out++ = *data; + input_push_state = 5; + break; + + case 5: + if (*data==0) { input_push_state=6; } + else { *out++ = *data; } + break; + + case 6: + if (*data==0) { input_push_state=7; } + else { + *out++ = 0; + *out++ = *data; + input_push_state=5; + } + break; + + case 7: + if (*data==0) { *out++ = 0; } + else if (*data==3) { + *out++ = 0; *out++ = 0; input_push_state=5; + + // remember which byte we removed + nal->insert_skipped_byte((out - nal->data()) + nal->num_skipped_bytes()); + } + else if (*data==1) { + +#if DEBUG_INSERT_STREAM_ERRORS + if ((rand()%100)<90 && nal_data.size>0) { + int pos = rand()%nal_data.size; + int bit = rand()%8; + nal->nal_data.data[pos] ^= 1<set_size(out - nal->data());; + + // push this NAL decoder queue + push_to_NAL_queue(nal); + + + // initialize new, empty NAL unit + + pending_input_NAL = alloc_NAL_unit(len+3); + if (pending_input_NAL == NULL) { + return DE265_ERROR_OUT_OF_MEMORY; + } + pending_input_NAL->pts = pts; + pending_input_NAL->user_data = user_data; + nal = pending_input_NAL; + out = nal->data(); + + input_push_state=3; + //nal->clear_skipped_bytes(); + } + else { + *out++ = 0; + *out++ = 0; + *out++ = *data; + + input_push_state=5; + } + break; + } + + data++; + } + + nal->set_size(out - nal->data()); + return DE265_OK; +} + + +de265_error NAL_Parser::push_NAL(const unsigned char* data, int len, + de265_PTS pts, void* user_data) +{ 
+ + // Cannot use byte-stream input and NAL input at the same time. + assert(pending_input_NAL == NULL); + + end_of_frame = false; + + NAL_unit* nal = alloc_NAL_unit(len); + if (nal == NULL || !nal->set_data(data, len)) { + free_NAL_unit(nal); + return DE265_ERROR_OUT_OF_MEMORY; + } + nal->pts = pts; + nal->user_data = user_data; + + nal->remove_stuffing_bytes(); + + push_to_NAL_queue(nal); + + return DE265_OK; +} + + +de265_error NAL_Parser::flush_data() +{ + if (pending_input_NAL) { + NAL_unit* nal = pending_input_NAL; + uint8_t null[2] = { 0,0 }; + + // append bytes that are implied by the push state + + if (input_push_state==6) { + if (!nal->append(null,1)) { + return DE265_ERROR_OUT_OF_MEMORY; + } + } + if (input_push_state==7) { + if (!nal->append(null,2)) { + return DE265_ERROR_OUT_OF_MEMORY; + } + } + + + // only push the NAL if it contains at least the NAL header + + if (input_push_state>=5) { + push_to_NAL_queue(nal); + pending_input_NAL = NULL; + } + + input_push_state = 0; + } + + return DE265_OK; +} + + +void NAL_Parser::remove_pending_input_data() +{ + // --- remove pending input data --- + + if (pending_input_NAL) { + free_NAL_unit(pending_input_NAL); + pending_input_NAL = NULL; + } + + for (;;) { + NAL_unit* nal = pop_from_NAL_queue(); + if (nal) { free_NAL_unit(nal); } + else break; + } + + input_push_state = 0; + nBytes_in_NAL_queue = 0; +} diff --git a/nal.cc b/nal.cc new file mode 100644 index 0000000..380f04d --- /dev/null +++ b/nal.cc @@ -0,0 +1,166 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. 
+ * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "nal.h" +#include "cabac.h" +#include + + +void nal_header::read(bitreader* reader) +{ + skip_bits(reader,1); + nal_unit_type = get_bits(reader,6); + nuh_layer_id = get_bits(reader,6); + nuh_temporal_id = get_bits(reader,3) -1; +} + + +void nal_header::write(CABAC_encoder& out) const +{ + out.skip_bits(1); + out.write_bits(nal_unit_type,6); + out.write_bits(nuh_layer_id ,6); + out.write_bits(nuh_temporal_id+1,3); +} + + +bool isIDR(uint8_t unit_type) +{ + return (unit_type == NAL_UNIT_IDR_W_RADL || + unit_type == NAL_UNIT_IDR_N_LP); +} + +bool isBLA(uint8_t unit_type) +{ + return (unit_type == NAL_UNIT_BLA_W_LP || + unit_type == NAL_UNIT_BLA_W_RADL || + unit_type == NAL_UNIT_BLA_N_LP); +} + +bool isCRA(uint8_t unit_type) +{ + return unit_type == NAL_UNIT_CRA_NUT; +} + +bool isRAP(uint8_t unit_type) +{ + return isIDR(unit_type) || isBLA(unit_type) || isCRA(unit_type); +} + +bool isRASL(uint8_t unit_type) +{ + return (unit_type == NAL_UNIT_RASL_N || + unit_type == NAL_UNIT_RASL_R); +} + +bool isIRAP(uint8_t unit_type) +{ + return (unit_type >= NAL_UNIT_BLA_W_LP && + unit_type <= NAL_UNIT_RESERVED_IRAP_VCL23); +} + +bool isRADL(uint8_t unit_type) +{ + return (unit_type == NAL_UNIT_RADL_N || + unit_type == NAL_UNIT_RADL_R); +} + + +bool isReferenceNALU(uint8_t unit_type) +{ + return ( ((unit_type <= NAL_UNIT_RESERVED_VCL_R15) && (unit_type%2 != 0)) || + ((unit_type >= NAL_UNIT_BLA_W_LP) && + (unit_type <= NAL_UNIT_RESERVED_IRAP_VCL23)) ); +} + +bool isSublayerNonReference(uint8_t unit_type) +{ + switch (unit_type) { + case NAL_UNIT_TRAIL_N: + case NAL_UNIT_TSA_N: + case 
NAL_UNIT_STSA_N: + case NAL_UNIT_RADL_N: + case NAL_UNIT_RASL_N: + case NAL_UNIT_RESERVED_VCL_N10: + case NAL_UNIT_RESERVED_VCL_N12: + case NAL_UNIT_RESERVED_VCL_N14: + return true; + + default: + return false; + } +} + +static const char* NAL_unit_name[] = { + "TRAIL_N", // 0 + "TRAIL_R", + "TSA_N", + "TSA_R", + "STSA_N", + "STSA_R", // 5 + "RADL_N", + "RADL_R", + "RASL_N", + "RASL_R", + "RESERVED_VCL_N10", // 10 + "RESERVED_VCL_R11", + "RESERVED_VCL_N12", + "RESERVED_VCL_R13", + "RESERVED_VCL_N14", + "RESERVED_VCL_R15", // 15 + "BLA_W_LP", + "BLA_W_RADL", + "BLA_N_LP", + "IDR_W_RADL", + "IDR_N_LP", // 20 + "CRA_NUT", + "RESERVED_IRAP_VCL22", + "RESERVED_IRAP_VCL23", + "RESERVED_VCL24", + "RESERVED_VCL25", // 25 + "RESERVED_VCL26", + "RESERVED_VCL27", + "RESERVED_VCL28", + "RESERVED_VCL29", + "RESERVED_VCL30", // 30 + "RESERVED_VCL31", + "VPS", + "SPS", + "PPS", + "AUD", // 35 + "EOS", + "EOB", + "FD", + "PREFIX_SEI", + "SUFFIX_SEI", // 40 + "RESERVED_NVCL41", + "RESERVED_NVCL42", + "RESERVED_NVCL43", + "RESERVED_NVCL44", + "RESERVED_NVCL45", // 45 + "RESERVED_NVCL46", + "RESERVED_NVCL47" +}; + +const char* get_NAL_name(uint8_t unit_type) +{ + if (unit_type >= 48) { return "INVALID NAL >= 48"; } + return NAL_unit_name[unit_type]; +} diff --git a/pps.cc b/pps.cc new file mode 100644 index 0000000..de3bcda --- /dev/null +++ b/pps.cc @@ -0,0 +1,992 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "pps.h" +#include "decctx.h" +#include "util.h" + +#include +#include +#include +#if defined(_MSC_VER) || defined(__MINGW32__) +# include +#elif defined(HAVE_ALLOCA_H) +# include +#endif + + +void pps_range_extension::reset() +{ + log2_max_transform_skip_block_size = 2; + cross_component_prediction_enabled_flag = false; + chroma_qp_offset_list_enabled_flag = false; + diff_cu_chroma_qp_offset_depth = 0; + chroma_qp_offset_list_len = 0; + log2_sao_offset_scale_luma = 0; + log2_sao_offset_scale_chroma = 0; +} + + +bool pps_range_extension::read(bitreader* br, decoder_context* ctx, const pic_parameter_set* pps) +{ + const seq_parameter_set* sps = ctx->get_sps(pps->seq_parameter_set_id); + + int uvlc; + + if (pps->transform_skip_enabled_flag) { + uvlc = get_uvlc(br); + if (uvlc == UVLC_ERROR || + uvlc+2 > sps->Log2MaxTrafoSize) { + + // Note: this is out of spec, but the conformance stream + // PERSIST_RPARAM_A_RExt_Sony_2 codes a too large value. 
+ + //ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + //return false; + } + + log2_max_transform_skip_block_size = uvlc+2; + } + + cross_component_prediction_enabled_flag = get_bits(br,1); + if (sps->ChromaArrayType != CHROMA_444 && + cross_component_prediction_enabled_flag) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + } + + chroma_qp_offset_list_enabled_flag = get_bits(br,1); + if (sps->ChromaArrayType == CHROMA_MONO && + chroma_qp_offset_list_enabled_flag) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + } + + if (chroma_qp_offset_list_enabled_flag) { + uvlc = get_uvlc(br); + if (uvlc == UVLC_ERROR || + uvlc > sps->log2_diff_max_min_luma_coding_block_size) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + + diff_cu_chroma_qp_offset_depth = uvlc; + + + uvlc = get_uvlc(br); + if (uvlc == UVLC_ERROR || + uvlc > 5) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + + chroma_qp_offset_list_len = uvlc+1; + + for (int i=0;i 12) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + + cb_qp_offset_list[i] = svlc; + + svlc = get_svlc(br); + if (svlc == UVLC_ERROR || + svlc < -12 || svlc > 12) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + + cr_qp_offset_list[i] = svlc; + } + } + + + uvlc = get_uvlc(br); + if (uvlc == UVLC_ERROR || + uvlc > libde265_max(0, sps->BitDepth_Y-10)) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + + log2_sao_offset_scale_luma = uvlc; + + uvlc = get_uvlc(br); + if (uvlc == UVLC_ERROR || + uvlc > libde265_max(0, sps->BitDepth_C-10)) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + + log2_sao_offset_scale_chroma = uvlc; + + return true; +} + + +void pps_range_extension::dump(int fd) const +{ + FILE* fh; + if (fd==1) fh=stdout; + else if (fd==2) fh=stderr; + else { return; } + +#define LOG0(t) 
log2fh(fh, t) +#define LOG1(t,d) log2fh(fh, t,d) +#define LOG2(t,d,e) log2fh(fh, t,d,e) + + LOG0("---------- PPS range-extension ----------\n"); + LOG1("log2_max_transform_skip_block_size : %d\n", log2_max_transform_skip_block_size); + LOG1("cross_component_prediction_enabled_flag : %d\n", cross_component_prediction_enabled_flag); + LOG1("chroma_qp_offset_list_enabled_flag : %d\n", chroma_qp_offset_list_enabled_flag); + if (chroma_qp_offset_list_enabled_flag) { + LOG1("diff_cu_chroma_qp_offset_depth : %d\n", diff_cu_chroma_qp_offset_depth); + LOG1("chroma_qp_offset_list_len : %d\n", chroma_qp_offset_list_len); + for (int i=0;i= DE265_MAX_PPS_SETS || + uvlc == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_NONEXISTING_PPS_REFERENCED, false); + return false; + } + + seq_parameter_set_id = uvlc = get_uvlc(br); + if (uvlc >= DE265_MAX_SPS_SETS || + uvlc == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_NONEXISTING_SPS_REFERENCED, false); + return false; + } + + dependent_slice_segments_enabled_flag = get_bits(br,1); + output_flag_present_flag = get_bits(br,1); + num_extra_slice_header_bits = get_bits(br,3); + sign_data_hiding_flag = get_bits(br,1); + cabac_init_present_flag = get_bits(br,1); + num_ref_idx_l0_default_active = uvlc = get_uvlc(br); + if (uvlc == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + num_ref_idx_l0_default_active++; + + num_ref_idx_l1_default_active = uvlc = get_uvlc(br); + if (uvlc == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + num_ref_idx_l1_default_active++; + + + if (!ctx->has_sps(seq_parameter_set_id)) { + ctx->add_warning(DE265_WARNING_NONEXISTING_SPS_REFERENCED, false); + return false; + } + + sps = ctx->get_shared_sps(seq_parameter_set_id); + + if ((pic_init_qp = get_svlc(br)) == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + pic_init_qp += 26; + + constrained_intra_pred_flag = 
get_bits(br,1); + transform_skip_enabled_flag = get_bits(br,1); + cu_qp_delta_enabled_flag = get_bits(br,1); + + if (cu_qp_delta_enabled_flag) { + if ((diff_cu_qp_delta_depth = get_uvlc(br)) == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + } else { + diff_cu_qp_delta_depth = 0; + } + + if ((pic_cb_qp_offset = get_svlc(br)) == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + + if ((pic_cr_qp_offset = get_svlc(br)) == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + + pps_slice_chroma_qp_offsets_present_flag = get_bits(br,1); + weighted_pred_flag = get_bits(br,1); + weighted_bipred_flag = get_bits(br,1); + transquant_bypass_enable_flag = get_bits(br,1); + tiles_enabled_flag = get_bits(br,1); + entropy_coding_sync_enabled_flag = get_bits(br,1); + + + // --- tiles --- + + if (tiles_enabled_flag) { + num_tile_columns = get_uvlc(br); + if (num_tile_columns == UVLC_ERROR || + num_tile_columns+1 > DE265_MAX_TILE_COLUMNS) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + num_tile_columns++; + + num_tile_rows = get_uvlc(br); + if (num_tile_rows == UVLC_ERROR || + num_tile_rows+1 > DE265_MAX_TILE_ROWS) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + num_tile_rows++; + + uniform_spacing_flag = get_bits(br,1); + + if (uniform_spacing_flag==false) { + int lastColumnWidth = sps->PicWidthInCtbsY; + int lastRowHeight = sps->PicHeightInCtbsY; + + for (int i=0; iadd_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + colWidth[i]++; + + lastColumnWidth -= colWidth[i]; + } + + if (lastColumnWidth <= 0) { + return false; + } + + colWidth[num_tile_columns-1] = lastColumnWidth; + + for (int i=0; iadd_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + rowHeight[i]++; + lastRowHeight -= rowHeight[i]; + } + + if (lastRowHeight <= 0) { + 
return false; + } + + + rowHeight[num_tile_rows-1] = lastRowHeight; + } + + loop_filter_across_tiles_enabled_flag = get_bits(br,1); + + } else { + num_tile_columns = 1; + num_tile_rows = 1; + uniform_spacing_flag = 1; + loop_filter_across_tiles_enabled_flag = 0; + } + + + + // END tiles + + + + beta_offset = 0; // default value + tc_offset = 0; // default value + + pps_loop_filter_across_slices_enabled_flag = get_bits(br,1); + deblocking_filter_control_present_flag = get_bits(br,1); + if (deblocking_filter_control_present_flag) { + deblocking_filter_override_enabled_flag = get_bits(br,1); + pic_disable_deblocking_filter_flag = get_bits(br,1); + if (!pic_disable_deblocking_filter_flag) { + beta_offset = get_svlc(br); + if (beta_offset == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + beta_offset *= 2; + + tc_offset = get_svlc(br); + if (tc_offset == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + tc_offset *= 2; + } + } + else { + deblocking_filter_override_enabled_flag = 0; + pic_disable_deblocking_filter_flag = 0; + } + + + // --- scaling list --- + + pic_scaling_list_data_present_flag = get_bits(br,1); + + // check consistency: if scaling-lists are not enabled, pic_scalign_list_data_present_flag + // must be FALSE + if (sps->scaling_list_enable_flag==0 && + pic_scaling_list_data_present_flag != 0) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + + if (pic_scaling_list_data_present_flag) { + de265_error err = read_scaling_list(br, sps.get(), &scaling_list, true); + if (err != DE265_OK) { + ctx->add_warning(err, false); + return false; + } + } + else { + memcpy(&scaling_list, &sps->scaling_list, sizeof(scaling_list_data)); + } + + + + + lists_modification_present_flag = get_bits(br,1); + log2_parallel_merge_level = get_uvlc(br); + if (log2_parallel_merge_level == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, 
false); + return false; + } + log2_parallel_merge_level += 2; + + if (log2_parallel_merge_level-2 > sps->log2_min_luma_coding_block_size-3 +1 + + sps->log2_diff_max_min_luma_coding_block_size) { + return false; + } + + slice_segment_header_extension_present_flag = get_bits(br,1); + pps_extension_flag = get_bits(br,1); + + if (pps_extension_flag) { + pps_range_extension_flag = get_bits(br,1); + pps_multilayer_extension_flag = get_bits(br,1); + pps_extension_6bits = get_bits(br,6); + + if (pps_range_extension_flag) { + bool success = range_extension.read(br, ctx, this); + if (!success) { + return false; + } + } + + //assert(false); + /* + while( more_rbsp_data() ) + + pps_extension_data_flag + u(1) + rbsp_trailing_bits() + + } + */ + } + + + set_derived_values(sps.get()); + + pps_read = true; + + return true; +} + + +void pic_parameter_set::set_derived_values(const seq_parameter_set* sps) +{ + Log2MinCuQpDeltaSize = sps->Log2CtbSizeY - diff_cu_qp_delta_depth; + + Log2MinCuChromaQpOffsetSize = sps->Log2CtbSizeY - range_extension.diff_cu_chroma_qp_offset_depth; + Log2MaxTransformSkipSize = range_extension.log2_max_transform_skip_block_size; + + if (uniform_spacing_flag) { + + // set columns widths + + int *const colPos = (int *)alloca((num_tile_columns+1) * sizeof(int)); + + for (int i=0;i<=num_tile_columns;i++) { + colPos[i] = i*sps->PicWidthInCtbsY / num_tile_columns; + } + for (int i=0;iPicHeightInCtbsY / num_tile_rows; + } + for (int i=0;iPicSizeInCtbsY); + CtbAddrTStoRS.resize(sps->PicSizeInCtbsY); + TileId .resize(sps->PicSizeInCtbsY); + TileIdRS .resize(sps->PicSizeInCtbsY); + MinTbAddrZS .resize(sps->PicSizeInTbsY ); + + + // raster scan (RS) <-> tile scan (TS) conversion + + for (int ctbAddrRS=0 ; ctbAddrRS < sps->PicSizeInCtbsY ; ctbAddrRS++) + { + int tbX = ctbAddrRS % sps->PicWidthInCtbsY; + int tbY = ctbAddrRS / sps->PicWidthInCtbsY; + int tileX=-1,tileY=-1; + + for (int i=0;i= colBd[i]) + tileX=i; + + for (int j=0;j= rowBd[j]) + tileY=j; + + 
CtbAddrRStoTS[ctbAddrRS] = 0; + for (int i=0;iCtbAddrRStoTS[ctbAddrRS] += (tbY - pps->rowBd[tileY])*pps->colWidth[tileX]; + //pps->CtbAddrRStoTS[ctbAddrRS] += tbX - pps->colBd[tileX]; + + CtbAddrRStoTS[ctbAddrRS] += sps->PicWidthInCtbsY * rowHeight[j]; + } + + assert(tileX>=0 && tileY>=0); + + CtbAddrRStoTS[ctbAddrRS] += (tbY-rowBd[tileY])*colWidth[tileX]; + CtbAddrRStoTS[ctbAddrRS] += tbX - colBd[tileX]; + + + // inverse mapping + + CtbAddrTStoRS[ CtbAddrRStoTS[ctbAddrRS] ] = ctbAddrRS; + } + + +#if 0 + logtrace(LogHeaders,"6.5.1 CtbAddrRSToTS\n"); + for (int y=0;yPicHeightInCtbsY;y++) + { + for (int x=0;xPicWidthInCtbsY;x++) + { + logtrace(LogHeaders,"%3d ", CtbAddrRStoTS[x + y*sps->PicWidthInCtbsY]); + } + + logtrace(LogHeaders,"\n"); + } +#endif + + // tile id + + for (int j=0, tIdx=0 ; jPicWidthInCtbsY + x] ] = tIdx; + TileIdRS[ y*sps->PicWidthInCtbsY + x ] = tIdx; + + //logtrace(LogHeaders,"tileID[%d,%d] = %d\n",x,y,pps->TileIdRS[ y*sps->PicWidthInCtbsY + x ]); + } + + tIdx++; + } + +#if 0 + logtrace(LogHeaders,"Tile IDs RS:\n"); + for (int y=0;yPicHeightInCtbsY;y++) { + for (int x=0;xPicWidthInCtbsY;x++) { + logtrace(LogHeaders,"%2d ",TileIdRS[y*sps->PicWidthInCtbsY+x]); + } + logtrace(LogHeaders,"\n"); + } +#endif + + // 6.5.2 Z-scan order array initialization process + + for (int y=0;yPicHeightInTbsY;y++) + for (int x=0;xPicWidthInTbsY;x++) + { + int tbX = (x<Log2MinTrafoSize)>>sps->Log2CtbSizeY; + int tbY = (y<Log2MinTrafoSize)>>sps->Log2CtbSizeY; + int ctbAddrRS = sps->PicWidthInCtbsY*tbY + tbX; + + MinTbAddrZS[x + y*sps->PicWidthInTbsY] = CtbAddrRStoTS[ctbAddrRS] + << ((sps->Log2CtbSizeY-sps->Log2MinTrafoSize)*2); + + int p=0; + for (int i=0 ; i<(sps->Log2CtbSizeY - sps->Log2MinTrafoSize) ; i++) { + int m=1<PicWidthInTbsY] += p; + } + + + // --- debug logging --- + + /* + logtrace(LogHeaders,"6.5.2 Z-scan order array\n"); + for (int y=0;yPicHeightInTbsY;y++) + { + for (int x=0;xPicWidthInTbsY;x++) + { + logtrace(LogHeaders,"%4d ", pps->MinTbAddrZS[x + 
y*sps->PicWidthInTbsY]); + } + + logtrace(LogHeaders,"\n"); + } + + for (int i=0;iPicSizeInTbsY;i++) + { + for (int y=0;yPicHeightInTbsY;y++) + { + for (int x=0;xPicWidthInTbsY;x++) + { + if (pps->MinTbAddrZS[x + y*sps->PicWidthInTbsY] == i) { + logtrace(LogHeaders,"%d %d\n",x,y); + } + } + } + } + */ +} + + +bool pic_parameter_set::write(error_queue* errqueue, CABAC_encoder& out, + const seq_parameter_set* sps) +{ + if (pic_parameter_set_id >= DE265_MAX_PPS_SETS) { + errqueue->add_warning(DE265_WARNING_NONEXISTING_PPS_REFERENCED, false); + return false; + } + out.write_uvlc(pic_parameter_set_id); + + if (seq_parameter_set_id >= DE265_MAX_PPS_SETS) { + errqueue->add_warning(DE265_WARNING_NONEXISTING_SPS_REFERENCED, false); + return false; + } + out.write_uvlc(seq_parameter_set_id); + + out.write_bit(dependent_slice_segments_enabled_flag); + out.write_bit(output_flag_present_flag); + out.write_bits(num_extra_slice_header_bits,3); + out.write_bit(sign_data_hiding_flag); + out.write_bit(cabac_init_present_flag); + out.write_uvlc(num_ref_idx_l0_default_active-1); + out.write_uvlc(num_ref_idx_l1_default_active-1); + + out.write_svlc(pic_init_qp-26); + + out.write_bit(constrained_intra_pred_flag); + out.write_bit(transform_skip_enabled_flag); + out.write_bit(cu_qp_delta_enabled_flag); + + if (cu_qp_delta_enabled_flag) { + out.write_uvlc(diff_cu_qp_delta_depth); + } + + out.write_svlc(pic_cb_qp_offset); + out.write_svlc(pic_cr_qp_offset); + + out.write_bit(pps_slice_chroma_qp_offsets_present_flag); + out.write_bit(weighted_pred_flag); + out.write_bit(weighted_bipred_flag); + out.write_bit(transquant_bypass_enable_flag); + out.write_bit(tiles_enabled_flag); + out.write_bit(entropy_coding_sync_enabled_flag); + + + // --- tiles --- + + if (tiles_enabled_flag) { + if (num_tile_columns > DE265_MAX_TILE_COLUMNS) { + errqueue->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + out.write_uvlc(num_tile_columns-1); + + if (num_tile_rows > 
DE265_MAX_TILE_ROWS) { + errqueue->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + out.write_uvlc(num_tile_rows-1); + + out.write_bit(uniform_spacing_flag); + + if (uniform_spacing_flag==false) { + for (int i=0; iscaling_list_enable_flag==0 && + pic_scaling_list_data_present_flag != 0) { + errqueue->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false); + return false; + } + + if (pic_scaling_list_data_present_flag) { + de265_error err = write_scaling_list(out, sps, &scaling_list, true); + if (err != DE265_OK) { + errqueue->add_warning(err, false); + return false; + } + } + + + + out.write_bit(lists_modification_present_flag); + out.write_uvlc(log2_parallel_merge_level-2); + + out.write_bit(slice_segment_header_extension_present_flag); + out.write_bit(pps_extension_flag); + + if (pps_extension_flag) { + //assert(false); + /* + while( more_rbsp_data() ) + + pps_extension_data_flag + u(1) + rbsp_trailing_bits() + + } + */ + } + + + pps_read = true; + + return true; +} + + +void pic_parameter_set::dump(int fd) const +{ + FILE* fh; + if (fd==1) fh=stdout; + else if (fd==2) fh=stderr; + else { return; } + +#define LOG0(t) log2fh(fh, t) +#define LOG1(t,d) log2fh(fh, t,d) + + LOG0("----------------- PPS -----------------\n"); + LOG1("pic_parameter_set_id : %d\n", pic_parameter_set_id); + LOG1("seq_parameter_set_id : %d\n", seq_parameter_set_id); + LOG1("dependent_slice_segments_enabled_flag : %d\n", dependent_slice_segments_enabled_flag); + LOG1("sign_data_hiding_flag : %d\n", sign_data_hiding_flag); + LOG1("cabac_init_present_flag : %d\n", cabac_init_present_flag); + LOG1("num_ref_idx_l0_default_active : %d\n", num_ref_idx_l0_default_active); + LOG1("num_ref_idx_l1_default_active : %d\n", num_ref_idx_l1_default_active); + + LOG1("pic_init_qp : %d\n", pic_init_qp); + LOG1("constrained_intra_pred_flag: %d\n", constrained_intra_pred_flag); + LOG1("transform_skip_enabled_flag: %d\n", transform_skip_enabled_flag); + LOG1("cu_qp_delta_enabled_flag : 
%d\n", cu_qp_delta_enabled_flag); + + if (cu_qp_delta_enabled_flag) { + LOG1("diff_cu_qp_delta_depth : %d\n", diff_cu_qp_delta_depth); + } + + LOG1("pic_cb_qp_offset : %d\n", pic_cb_qp_offset); + LOG1("pic_cr_qp_offset : %d\n", pic_cr_qp_offset); + LOG1("pps_slice_chroma_qp_offsets_present_flag : %d\n", pps_slice_chroma_qp_offsets_present_flag); + LOG1("weighted_pred_flag : %d\n", weighted_pred_flag); + LOG1("weighted_bipred_flag : %d\n", weighted_bipred_flag); + LOG1("output_flag_present_flag : %d\n", output_flag_present_flag); + LOG1("transquant_bypass_enable_flag: %d\n", transquant_bypass_enable_flag); + LOG1("tiles_enabled_flag : %d\n", tiles_enabled_flag); + LOG1("entropy_coding_sync_enabled_flag: %d\n", entropy_coding_sync_enabled_flag); + + if (tiles_enabled_flag) { + LOG1("num_tile_columns : %d\n", num_tile_columns); + LOG1("num_tile_rows : %d\n", num_tile_rows); + LOG1("uniform_spacing_flag: %d\n", uniform_spacing_flag); + + LOG0("tile column boundaries: "); + for (int i=0;i<=num_tile_columns;i++) { + LOG1("*%d ",colBd[i]); + } + LOG0("*\n"); + + LOG0("tile row boundaries: "); + for (int i=0;i<=num_tile_rows;i++) { + LOG1("*%d ",rowBd[i]); + } + LOG0("*\n"); + + //if( !uniform_spacing_flag ) { + /* + for( i = 0; i < num_tile_columns_minus1; i++ ) + + column_width_minus1[i] + ue(v) + for( i = 0; i < num_tile_rows_minus1; i++ ) + + row_height_minus1[i] + ue(v) + } + */ + + LOG1("loop_filter_across_tiles_enabled_flag : %d\n", loop_filter_across_tiles_enabled_flag); + } + + LOG1("pps_loop_filter_across_slices_enabled_flag: %d\n", pps_loop_filter_across_slices_enabled_flag); + LOG1("deblocking_filter_control_present_flag: %d\n", deblocking_filter_control_present_flag); + + if (deblocking_filter_control_present_flag) { + LOG1("deblocking_filter_override_enabled_flag: %d\n", deblocking_filter_override_enabled_flag); + LOG1("pic_disable_deblocking_filter_flag: %d\n", pic_disable_deblocking_filter_flag); + + LOG1("beta_offset: %d\n", beta_offset); + 
LOG1("tc_offset: %d\n", tc_offset); + } + + LOG1("pic_scaling_list_data_present_flag: %d\n", pic_scaling_list_data_present_flag); + if (pic_scaling_list_data_present_flag) { + //scaling_list_data() + } + + LOG1("lists_modification_present_flag: %d\n", lists_modification_present_flag); + LOG1("log2_parallel_merge_level : %d\n", log2_parallel_merge_level); + LOG1("num_extra_slice_header_bits : %d\n", num_extra_slice_header_bits); + LOG1("slice_segment_header_extension_present_flag : %d\n", slice_segment_header_extension_present_flag); + LOG1("pps_extension_flag : %d\n", pps_extension_flag); + LOG1("pps_range_extension_flag : %d\n", pps_range_extension_flag); + LOG1("pps_multilayer_extension_flag : %d\n", pps_multilayer_extension_flag); + LOG1("pps_extension_6bits : %d\n", pps_extension_6bits); + + LOG1("Log2MinCuQpDeltaSize : %d\n", Log2MinCuQpDeltaSize); + LOG1("Log2MinCuChromaQpOffsetSize (RExt) : %d\n", Log2MinCuChromaQpOffsetSize); + LOG1("Log2MaxTransformSkipSize (RExt) : %d\n", Log2MaxTransformSkipSize); + +#undef LOG0 +#undef LOG1 + + + if (pps_range_extension_flag) { + range_extension.dump(fd); + } +} + + +bool pic_parameter_set::is_tile_start_CTB(int ctbX,int ctbY) const +{ + // fast check + if (tiles_enabled_flag==0) { + return ctbX == 0 && ctbY == 0; + } + + for (int i=0;i + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#include "quality.h" +#include + + +uint32_t SSD(const uint8_t* img, int imgStride, + const uint8_t* ref, int refStride, + int width, int height) +{ + uint32_t sum=0; + + const uint8_t* iPtr = img; + const uint8_t* rPtr = ref; + + for (int y=0;yget_image_plane_at_pos(cIdx,x0,y0), img1->get_image_stride(cIdx), + img2->get_image_plane_at_pos(cIdx,x0,y0), img2->get_image_stride(cIdx), + 1< + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "refpic.h" +#include "decctx.h" +#include "util.h" + +#include +#include +#if defined(_MSC_VER) || defined(__MINGW32__) +# include +#elif defined(HAVE_ALLOCA_H) +# include +#endif + + +void ref_pic_set::reset() +{ + NumNegativePics = 0; + NumPositivePics = 0; + NumDeltaPocs = 0; + NumPocTotalCurr_shortterm_only = 0; + + for (int i=0;i& sets, // previously read sets + bool sliceRefPicSet) // is this in the slice header? +{ + // --- is this set coded in prediction mode (not possible for the first set) + + char inter_ref_pic_set_prediction_flag; + + if (idxRps != 0) { + inter_ref_pic_set_prediction_flag = get_bits(br,1); + } + else { + inter_ref_pic_set_prediction_flag = 0; + } + + + + if (inter_ref_pic_set_prediction_flag) { + int vlc; + + /* Only for the last ref_pic_set (that's the one coded in the slice header), + we can specify relative to which reference set we code the set. 
*/ + + int delta_idx; + if (sliceRefPicSet) { // idxRps == num_short_term_ref_pic_sets) { + delta_idx = vlc = get_uvlc(br); + if (delta_idx==UVLC_ERROR) { + return false; + } + + if (delta_idx>=idxRps) { + return false; + } + + delta_idx++; + } else { + delta_idx = 1; + } + + int RIdx = idxRps - delta_idx; // this is our source set, which we will modify + assert(RIdx>=0); + + int delta_rps_sign = get_bits(br,1); + int abs_delta_rps = vlc = get_uvlc(br); + if (vlc==UVLC_ERROR) { return false; } + abs_delta_rps++; + int DeltaRPS = (delta_rps_sign ? -abs_delta_rps : abs_delta_rps); + + // bits are stored in this order: + // - all bits for negative Pocs (forward), + // - then all bits for positive Pocs (forward), + // - then bits for '0', shifting of the current picture + // in total, these are 'nDeltaPocsRIdx'+1 bits + + logtrace(LogHeaders,"predicted from %d with delta %d\n",RIdx,DeltaRPS); + + int nDeltaPocsRIdx= sets[RIdx].NumDeltaPocs; // size of source set + char *const used_by_curr_pic_flag = (char *)alloca((nDeltaPocsRIdx+1) * sizeof(char)); + char *const use_delta_flag = (char *)alloca((nDeltaPocsRIdx+1) * sizeof(char)); + + for (int j=0;j<=nDeltaPocsRIdx;j++) { + used_by_curr_pic_flag[j] = get_bits(br,1); + if (used_by_curr_pic_flag[j]) { + use_delta_flag[j] = 1; // if this frame is used, we also have to apply the delta + } else { + use_delta_flag[j] = get_bits(br,1); // otherwise, it is only optionally included + } + } + + logtrace(LogHeaders,"flags: "); + for (int j=0;j<=nDeltaPocsRIdx;j++) { + logtrace(LogHeaders,"%d ", use_delta_flag[j]); + } + logtrace(LogHeaders,"\n"); + + int nNegativeRIdx = sets[RIdx].NumNegativePics; + int nPositiveRIdx = sets[RIdx].NumPositivePics; + + // --- update list 0 (negative Poc) --- + // Iterate through all Pocs in decreasing value order (positive reverse, 0, negative forward). 
+ + int i=0; // target index + + // positive list + for (int j=nPositiveRIdx-1;j>=0;j--) { + assert(RIdx >= 0 && RIdx < sets.size()); + assert(j>=0 && j < MAX_NUM_REF_PICS); + + int dPoc = sets[RIdx].DeltaPocS1[j] + DeltaRPS; // new delta + if (dPoc<0 && use_delta_flag[nNegativeRIdx+j]) { + if (i>= MAX_NUM_REF_PICS) { return false; } + + out_set->DeltaPocS0[i] = dPoc; + out_set->UsedByCurrPicS0[i] = used_by_curr_pic_flag[nNegativeRIdx+j]; + i++; + } + } + + // frame 0 + if (DeltaRPS<0 && use_delta_flag[nDeltaPocsRIdx]) { + if (i>= MAX_NUM_REF_PICS) { return false; } + + out_set->DeltaPocS0[i] = DeltaRPS; + out_set->UsedByCurrPicS0[i] = used_by_curr_pic_flag[nDeltaPocsRIdx]; + i++; + } + + // negative list + for (int j=0;j= MAX_NUM_REF_PICS) { return false; } + + out_set->DeltaPocS0[i] = dPoc; + out_set->UsedByCurrPicS0[i] = used_by_curr_pic_flag[j]; + i++; + } + } + + out_set->NumNegativePics = i; + + + // --- update list 1 (positive Poc) --- + // Iterate through all Pocs in increasing value order (negative reverse, 0, positive forward) + + i=0; // target index + + // negative list + for (int j=nNegativeRIdx-1;j>=0;j--) { + int dPoc = sets[RIdx].DeltaPocS0[j] + DeltaRPS; + if (dPoc>0 && use_delta_flag[j]) { + if (i>= MAX_NUM_REF_PICS) { return false; } + + out_set->DeltaPocS1[i] = dPoc; + out_set->UsedByCurrPicS1[i] = used_by_curr_pic_flag[j]; + i++; + } + } + + // frame 0 + if (DeltaRPS>0 && use_delta_flag[nDeltaPocsRIdx]) { + if (i>= MAX_NUM_REF_PICS) { return false; } + + out_set->DeltaPocS1[i] = DeltaRPS; + out_set->UsedByCurrPicS1[i] = used_by_curr_pic_flag[nDeltaPocsRIdx]; + i++; + } + + // positive list + for (int j=0;j0 && use_delta_flag[nNegativeRIdx+j]) { + if (i>= MAX_NUM_REF_PICS) { return false; } + + out_set->DeltaPocS1[i] = dPoc; + out_set->UsedByCurrPicS1[i] = used_by_curr_pic_flag[nNegativeRIdx+j]; + i++; + } + } + + out_set->NumPositivePics = i; + + } else { + + // --- first, read the number of past and future frames in this set --- + + int 
num_negative_pics = get_uvlc(br); + int num_positive_pics = get_uvlc(br); + + if (num_negative_pics == UVLC_ERROR || + num_positive_pics == UVLC_ERROR) { + // invalid num-ref-pics value + errqueue->add_warning(DE265_WARNING_MAX_NUM_REF_PICS_EXCEEDED, false); + return false; + } + + // total number of reference pictures may not exceed buffer capacity + if (num_negative_pics + num_positive_pics > + sps->sps_max_dec_pic_buffering[ sps->sps_max_sub_layers-1 ]) { + + out_set->NumNegativePics = 0; + out_set->NumPositivePics = 0; + out_set->NumDeltaPocs = 0; + out_set->NumPocTotalCurr_shortterm_only = 0; + + errqueue->add_warning(DE265_WARNING_MAX_NUM_REF_PICS_EXCEEDED, false); + return false; + } + + if (num_negative_pics > MAX_NUM_REF_PICS || + num_positive_pics > MAX_NUM_REF_PICS) { + errqueue->add_warning(DE265_WARNING_MAX_NUM_REF_PICS_EXCEEDED, false); + return false; + } + + out_set->NumNegativePics = num_negative_pics; + out_set->NumPositivePics = num_positive_pics; + + // --- now, read the deltas between the reference frames to fill the lists --- + + // past frames + + int lastPocS=0; + for (int i=0;iDeltaPocS0[i] = lastPocS - delta_poc_s0; + out_set->UsedByCurrPicS0[i] = used_by_curr_pic_s0_flag; + lastPocS = out_set->DeltaPocS0[i]; + } + + // future frames + + lastPocS=0; + for (int i=0;iDeltaPocS1[i] = lastPocS + delta_poc_s1; + out_set->UsedByCurrPicS1[i] = used_by_curr_pic_s1_flag; + lastPocS = out_set->DeltaPocS1[i]; + } + } + + + out_set->compute_derived_values(); + + return true; +} + + +bool write_short_term_ref_pic_set_nopred(error_queue* errqueue, + const seq_parameter_set* sps, + CABAC_encoder& out, + const ref_pic_set* in_set, // which set to write + int idxRps, // index of the set to be written + const std::vector& sets, // previously read sets + bool sliceRefPicSet) // is this in the slice header? 
+{ + if (idxRps != 0) { + // inter_ref_pic_set_prediction_flag + out.write_bit(0); + } + + + // --- first, write the number of past and future frames in this set --- + + out.write_uvlc(in_set->NumNegativePics); + out.write_uvlc(in_set->NumPositivePics); + + // --- now, write the deltas between the reference frames to fill the lists --- + + // past frames + + int lastPocS=0; + for (int i=0;iNumNegativePics;i++) { + int delta_poc_s0 = lastPocS - in_set->DeltaPocS0[i]; + char used_by_curr_pic_s0_flag = in_set->UsedByCurrPicS0[i]; + + assert(delta_poc_s0 >= 1); + out.write_uvlc(delta_poc_s0-1); + out.write_bit(used_by_curr_pic_s0_flag); + lastPocS = in_set->DeltaPocS0[i]; + } + + // future frames + + lastPocS=0; + for (int i=0;iNumPositivePics;i++) { + int delta_poc_s1 = in_set->DeltaPocS1[i] - lastPocS; + char used_by_curr_pic_s1_flag = in_set->UsedByCurrPicS1[i]; + + assert(delta_poc_s1 >= 1); + out.write_uvlc(delta_poc_s1-1); + out.write_bit(used_by_curr_pic_s1_flag); + lastPocS = in_set->DeltaPocS1[i]; + } + + return true; +} + + +bool write_short_term_ref_pic_set(error_queue* errqueue, + const seq_parameter_set* sps, + CABAC_encoder& out, + const ref_pic_set* in_set, // which set to write + int idxRps, // index of the set to be read + const std::vector& sets, // previously read sets + bool sliceRefPicSet) // is this in the slice header? 
+{ + return write_short_term_ref_pic_set_nopred(errqueue, sps, out, in_set, idxRps, sets, + sliceRefPicSet); +} + + +void dump_short_term_ref_pic_set(const ref_pic_set* set, FILE* fh) +{ + log2fh(fh,"NumDeltaPocs: %d [-:%d +:%d]\n", set->NumDeltaPocs, + set->NumNegativePics, set->NumPositivePics); + + log2fh(fh,"DeltaPocS0:"); + for (int i=0;iNumNegativePics;i++) { + if (i) { log2fh(fh,","); } + log2fh(fh," %d/%d",set->DeltaPocS0[i],set->UsedByCurrPicS0[i]); + } + log2fh(fh,"\n"); + + log2fh(fh,"DeltaPocS1:"); + for (int i=0;iNumPositivePics;i++) { + if (i) { log2fh(fh,","); } + log2fh(fh," %d/%d",set->DeltaPocS1[i],set->UsedByCurrPicS1[i]); + } + log2fh(fh,"\n"); +} + + +void dump_compact_short_term_ref_pic_set(const ref_pic_set* set, int range, FILE* fh) +{ + char *const log = (char *)alloca((range+1+range+1) * sizeof(char)); + log[2*range+1] = 0; + for (int i=0;i<2*range+1;i++) log[i]='.'; + log[range]='|'; + + for (int i=set->NumNegativePics-1;i>=0;i--) { + int n = set->DeltaPocS0[i]; + if (n>=-range) { + if (set->UsedByCurrPicS0[i]) log[n+range] = 'X'; + else log[n+range] = 'o'; + } else { log2fh(fh,"*%d%c ",n, set->UsedByCurrPicS0[i] ? 'X':'o'); } + } + + for (int i=set->NumPositivePics-1;i>=0;i--) { + int n = set->DeltaPocS1[i]; + if (n<=range) { + if (set->UsedByCurrPicS1[i]) log[n+range] = 'X'; + else log[n+range] = 'o'; + } else { log2fh(fh,"*%d%c ",n, set->UsedByCurrPicS1[i] ? 'X':'o'); } + } + + log2fh(fh,"*%s\n",log); +} diff --git a/sao.cc b/sao.cc new file mode 100644 index 0000000..f93fc02 --- /dev/null +++ b/sao.cc @@ -0,0 +1,524 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. 
+ * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "sao.h" +#include "util.h" + +#include +#include + + +template +void apply_sao_internal(de265_image* img, int xCtb,int yCtb, + const slice_segment_header* shdr, int cIdx, int nSW,int nSH, + const pixel_t* in_img, int in_stride, + /* */ pixel_t* out_img, int out_stride) +{ + const sao_info* saoinfo = img->get_sao_info(xCtb,yCtb); + + int SaoTypeIdx = (saoinfo->SaoTypeIdx >> (2*cIdx)) & 0x3; + + logtrace(LogSAO,"apply_sao CTB %d;%d cIdx:%d type=%d (%dx%d)\n",xCtb,yCtb,cIdx, SaoTypeIdx, nSW,nSH); + + if (SaoTypeIdx==0) { + return; + } + + const seq_parameter_set* sps = &img->get_sps(); + const pic_parameter_set* pps = &img->get_pps(); + const int bitDepth = (cIdx==0 ? sps->BitDepth_Y : sps->BitDepth_C); + const int maxPixelValue = (1<get_width(cIdx); + const int height = img->get_height(cIdx); + + const int ctbSliceAddrRS = img->get_SliceHeader(xC,yC)->SliceAddrRS; + + const int picWidthInCtbs = sps->PicWidthInCtbsY; + const int chromashiftW = sps->get_chroma_shift_W(cIdx); + const int chromashiftH = sps->get_chroma_shift_H(cIdx); + const int ctbshiftW = sps->Log2CtbSizeY - chromashiftW; + const int ctbshiftH = sps->Log2CtbSizeY - chromashiftH; + + + for (int i=0;i<5;i++) + { + logtrace(LogSAO,"offset[%d] = %d\n", i, i==0 ? 0 : saoinfo->saoOffsetVal[cIdx][i-1]); + } + + + // actual size of CTB to be processed (can be smaller when partially outside of image) + const int ctbW = (xC+nSW>width) ? width -xC : nSW; + const int ctbH = (yC+nSH>height) ? 
height-yC : nSH; + + + const bool extendedTests = img->get_CTB_has_pcm_or_cu_transquant_bypass(xCtb,yCtb); + + if (SaoTypeIdx==2) { + int hPos[2], vPos[2]; + int vPosStride[2]; // vPos[] multiplied by image stride + int SaoEoClass = (saoinfo->SaoEoClass >> (2*cIdx)) & 0x3; + + switch (SaoEoClass) { + case 0: hPos[0]=-1; hPos[1]= 1; vPos[0]= 0; vPos[1]=0; break; + case 1: hPos[0]= 0; hPos[1]= 0; vPos[0]=-1; vPos[1]=1; break; + case 2: hPos[0]=-1; hPos[1]= 1; vPos[0]=-1; vPos[1]=1; break; + case 3: hPos[0]= 1; hPos[1]=-1; vPos[0]=-1; vPos[1]=1; break; + } + + vPosStride[0] = vPos[0] * in_stride; + vPosStride[1] = vPos[1] * in_stride; + + /* Reorder sao_info.saoOffsetVal[] array, so that we can index it + directly with the sum of the two pixel-difference signs. */ + int8_t saoOffsetVal[5]; // [2] unused + saoOffsetVal[0] = saoinfo->saoOffsetVal[cIdx][1-1]; + saoOffsetVal[1] = saoinfo->saoOffsetVal[cIdx][2-1]; + saoOffsetVal[2] = 0; + saoOffsetVal[3] = saoinfo->saoOffsetVal[cIdx][3-1]; + saoOffsetVal[4] = saoinfo->saoOffsetVal[cIdx][4-1]; + + + for (int j=0;jpcm_loop_filter_disable_flag && + img->get_pcm_flag((xC+i)<get_cu_transquant_bypass((xC+i)<=width || yS>=height) { + edgeIdx=0; + break; + } + + + // This part seems inefficient with all the get_SliceHeaderIndex() calls, + // but removing this part (because the input was known to have only a single + // slice anyway) reduced computation time only by 1.3%. + // TODO: however, this may still be a big part of SAO itself. 
+ + slice_segment_header* sliceHeader = img->get_SliceHeader(xS<SliceAddrRS; + if (sliceAddrRS < ctbSliceAddrRS && + img->get_SliceHeader((xC+i)<slice_loop_filter_across_slices_enabled_flag==0) { + edgeIdx=0; + break; + } + + if (sliceAddrRS > ctbSliceAddrRS && + img->get_SliceHeader(xS<slice_loop_filter_across_slices_enabled_flag==0) { + edgeIdx=0; + break; + } + + + if (pps->loop_filter_across_tiles_enabled_flag==0 && + pps->TileIdRS[(xS>>ctbshiftW) + (yS>>ctbshiftH)*picWidthInCtbs] != + pps->TileIdRS[(xC>>ctbshiftW) + (yC>>ctbshiftH)*picWidthInCtbs]) { + edgeIdx=0; + break; + } + } + + if (edgeIdx != 0) { + + edgeIdx = ( Sign(in_ptr[i] - in_ptr[i+hPos[0]+vPosStride[0]]) + + Sign(in_ptr[i] - in_ptr[i+hPos[1]+vPosStride[1]]) ); + + if (1) { // edgeIdx != 0) { // seems to be faster without this check (zero in offset table) + int offset = saoOffsetVal[edgeIdx+2]; + + out_ptr[i] = Clip3(0,maxPixelValue, + in_ptr[i] + offset); + } + } + } + } + } + else { + int bandShift = bitDepth-5; + int saoLeftClass = saoinfo->sao_band_position[cIdx]; + logtrace(LogSAO,"saoLeftClass: %d\n",saoLeftClass); + + int bandTable[32]; + memset(bandTable, 0, sizeof(int)*32); + + for (int k=0;k<4;k++) { + bandTable[ (k+saoLeftClass)&31 ] = k+1; + } + + + /* If PCM or transquant_bypass is used in this CTB, we have to + run all checks (A). + Otherwise, we run a simplified version of the code (B). + + NOTE: this whole part of SAO does not seem to be a significant part of the time spent + */ + + if (extendedTests) { + + // (A) full version with all checks + + for (int j=0;jpcm_loop_filter_disable_flag && + img->get_pcm_flag((xC+i)<get_cu_transquant_bypass((xC+i)<>x actually computes >>(x%64). + // So we have to take care of large bandShifts. 
+ int bandIdx; + if (bandShift >= 8) { + bandIdx = 0; + } else { + bandIdx = bandTable[ in_img[xC+i+(yC+j)*in_stride]>>bandShift ]; + } + + if (bandIdx>0) { + int offset = saoinfo->saoOffsetVal[cIdx][bandIdx-1]; + + logtrace(LogSAO,"%d %d (%d) offset %d %x -> %x\n",xC+i,yC+j,bandIdx, + offset, + in_img[xC+i+(yC+j)*in_stride], + in_img[xC+i+(yC+j)*in_stride]+offset); + + out_img[xC+i+(yC+j)*out_stride] = Clip3(0,maxPixelValue, + in_img[xC+i+(yC+j)*in_stride] + offset); + } + } + } + else + { + // (B) simplified version (only works if no PCM and transquant_bypass is active) + + for (int j=0;j= 8) { + bandIdx = 0; + } else { + bandIdx = bandTable[ in_img[xC+i+(yC+j)*in_stride]>>bandShift ]; + } + + if (bandIdx>0) { + int offset = saoinfo->saoOffsetVal[cIdx][bandIdx-1]; + + out_img[xC+i+(yC+j)*out_stride] = Clip3(0,maxPixelValue, + in_img[xC+i+(yC+j)*in_stride] + offset); + } + } + } + } +} + + +template +void apply_sao(de265_image* img, int xCtb,int yCtb, + const slice_segment_header* shdr, int cIdx, int nSW,int nSH, + const pixel_t* in_img, int in_stride, + /* */ pixel_t* out_img, int out_stride) +{ + if (img->high_bit_depth(cIdx)) { + apply_sao_internal(img,xCtb,yCtb, shdr,cIdx,nSW,nSH, + (uint16_t*)in_img, in_stride, + (uint16_t*)out_img,out_stride); + } + else { + apply_sao_internal(img,xCtb,yCtb, shdr,cIdx,nSW,nSH, + in_img, in_stride, + out_img,out_stride); + } +} + + +void apply_sample_adaptive_offset(de265_image* img) +{ + const seq_parameter_set& sps = img->get_sps(); + + if (sps.sample_adaptive_offset_enabled_flag==0) { + return; + } + + de265_image inputCopy; + de265_error err = inputCopy.copy_image(img); + if (err != DE265_OK) { + img->decctx->add_warning(DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY,false); + return; + } + + for (int yCtb=0; yCtbget_SliceHeaderCtb(xCtb,yCtb); + + if (shdr->slice_sao_luma_flag) { + apply_sao(img, xCtb,yCtb, shdr, 0, 1<get_image_plane(0), img->get_image_stride(0)); + } + + if (shdr->slice_sao_chroma_flag) { + int nSW = 
(1<get_image_plane(1), img->get_image_stride(1)); + + apply_sao(img, xCtb,yCtb, shdr, 2, nSW,nSH, + inputCopy.get_image_plane(2), inputCopy.get_image_stride(2), + img->get_image_plane(2), img->get_image_stride(2)); + } + } +} + + +void apply_sample_adaptive_offset_sequential(de265_image* img) +{ + const seq_parameter_set& sps = img->get_sps(); + + if (sps.sample_adaptive_offset_enabled_flag==0) { + return; + } + + int lumaImageSize = img->get_image_stride(0) * img->get_height(0) * img->get_bytes_per_pixel(0); + int chromaImageSize = img->get_image_stride(1) * img->get_height(1) * img->get_bytes_per_pixel(1); + + uint8_t* inputCopy = new uint8_t[ libde265_max(lumaImageSize, chromaImageSize) ]; + if (inputCopy == NULL) { + img->decctx->add_warning(DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY,false); + return; + } + + + int nChannels = 3; + if (sps.ChromaArrayType == CHROMA_MONO) { nChannels=1; } + + for (int cIdx=0;cIdxget_image_stride(cIdx); + int height = img->get_height(cIdx); + + memcpy(inputCopy, img->get_image_plane(cIdx), stride * height * img->get_bytes_per_pixel(cIdx)); + + for (int yCtb=0; yCtbget_SliceHeaderCtb(xCtb,yCtb); + if (shdr==NULL) { return; } + + if (cIdx==0 && shdr->slice_sao_luma_flag) { + apply_sao(img, xCtb,yCtb, shdr, 0, 1<get_image_plane(0), img->get_image_stride(0)); + } + + if (cIdx!=0 && shdr->slice_sao_chroma_flag) { + int nSW = (1<get_image_plane(cIdx), img->get_image_stride(cIdx)); + } + } + } + + delete[] inputCopy; +} + + + + +class thread_task_sao : public thread_task +{ +public: + int ctb_y; + de265_image* img; /* this is where we get the SPS from + (either inputImg or outputImg can be a dummy image) + */ + + de265_image* inputImg; + de265_image* outputImg; + int inputProgress; + + virtual void work(); + virtual std::string name() const { + char buf[100]; + sprintf(buf,"sao-%d",ctb_y); + return buf; + } +}; + + +void thread_task_sao::work() +{ + state = Running; + img->thread_run(this); + + const seq_parameter_set& sps = 
img->get_sps(); + + const int rightCtb = sps.PicWidthInCtbsY-1; + const int ctbSize = (1<wait_for_progress(this, rightCtb,ctb_y, inputProgress); + + if (ctb_y>0) { + img->wait_for_progress(this, rightCtb,ctb_y-1, inputProgress); + } + + if (ctb_y+1wait_for_progress(this, rightCtb,ctb_y+1, inputProgress); + } + + + // copy input image to output for this CTB-row + + outputImg->copy_lines_from(inputImg, ctb_y * ctbSize, (ctb_y+1) * ctbSize); + + + // process SAO in the CTB-row + + for (int xCtb=0; xCtbget_SliceHeaderCtb(xCtb,ctb_y); + if (shdr==NULL) { + break; + } + + if (shdr->slice_sao_luma_flag) { + apply_sao(img, xCtb,ctb_y, shdr, 0, ctbSize, ctbSize, + inputImg ->get_image_plane(0), inputImg ->get_image_stride(0), + outputImg->get_image_plane(0), outputImg->get_image_stride(0)); + } + + if (shdr->slice_sao_chroma_flag) { + int nSW = ctbSize / sps.SubWidthC; + int nSH = ctbSize / sps.SubHeightC; + + apply_sao(img, xCtb,ctb_y, shdr, 1, nSW,nSH, + inputImg ->get_image_plane(1), inputImg ->get_image_stride(1), + outputImg->get_image_plane(1), outputImg->get_image_stride(1)); + + apply_sao(img, xCtb,ctb_y, shdr, 2, nSW,nSH, + inputImg ->get_image_plane(2), inputImg ->get_image_stride(2), + outputImg->get_image_plane(2), outputImg->get_image_stride(2)); + } + } + + + // mark SAO progress + + for (int x=0;x<=rightCtb;x++) { + const int CtbWidth = sps.PicWidthInCtbsY; + img->ctb_progress[x+ctb_y*CtbWidth].set_progress(CTB_PROGRESS_SAO); + } + + + state = Finished; + img->thread_finishes(this); +} + + +bool add_sao_tasks(image_unit* imgunit, int saoInputProgress) +{ + de265_image* img = imgunit->img; + const seq_parameter_set& sps = img->get_sps(); + + if (sps.sample_adaptive_offset_enabled_flag==0) { + return false; + } + + + decoder_context* ctx = img->decctx; + + de265_error err = imgunit->sao_output.alloc_image(img->get_width(), img->get_height(), + img->get_chroma_format(), + img->get_shared_sps(), + false, + img->decctx, //img->encctx, + img->pts, img->user_data, 
true); + if (err != DE265_OK) { + img->decctx->add_warning(DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY,false); + return false; + } + + int nRows = sps.PicHeightInCtbsY; + + int n=0; + img->thread_start(nRows); + + for (int y=0;yinputImg = img; + task->outputImg = &imgunit->sao_output; + task->img = img; + task->ctb_y = y; + task->inputProgress = saoInputProgress; + + imgunit->tasks.push_back(task); + add_task(&ctx->thread_pool_, task); + n++; + } + + /* Currently need barrier here because when are finished, we have to swap the pixel + data back into the main image. */ + img->wait_for_completion(); + + img->exchange_pixel_data_with(imgunit->sao_output); + + return true; +} diff --git a/scan.cc b/scan.cc new file mode 100644 index 0000000..b29e283 --- /dev/null +++ b/scan.cc @@ -0,0 +1,152 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#include "scan.h" + +static position scan0 = { 0,0 }; +static position scan_h_1[ 2* 2], scan_v_1[ 2* 2], scan_d_1[ 2* 2]; +static position scan_h_2[ 4* 4], scan_v_2[ 4* 4], scan_d_2[ 4* 4]; +static position scan_h_3[ 8* 8], scan_v_3[ 8* 8], scan_d_3[ 8* 8]; +static position scan_h_4[16*16], scan_v_4[16*16], scan_d_4[16*16]; +static position scan_h_5[32*32], scan_v_5[32*32], scan_d_5[32*32]; + +static position* scan_h[7] = { &scan0,scan_h_1,scan_h_2,scan_h_3,scan_h_4,scan_h_5 }; +static position* scan_v[7] = { &scan0,scan_v_1,scan_v_2,scan_v_3,scan_v_4,scan_v_5 }; +static position* scan_d[7] = { &scan0,scan_d_1,scan_d_2,scan_d_3,scan_d_4,scan_d_5 }; + +static void init_scan_h(position* scan, int blkSize) +{ + int i=0; + for (int y=0;y=0) { + if (xsubBlock = lastSubBlock; + pos->scanPos = lastScanPos; +} + + +void init_scan_orders() +{ + for (int log2size=1;log2size<=5;log2size++) + { + init_scan_h(scan_h[log2size], 1< + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#include "sei.h" +#include "util.h" +#include "md5.h" + +#include "libde265/sps.h" +#include "libde265/image.h" +#include "libde265/decctx.h" + +#include + + +static de265_error read_sei_decoded_picture_hash(bitreader* reader, sei_message* sei, + const seq_parameter_set* sps) +{ + sei_decoded_picture_hash* seihash = &sei->data.decoded_picture_hash; + + seihash->hash_type = (enum sei_decoded_picture_hash_type)get_bits(reader,8); + + if (sps==NULL) { + return DE265_WARNING_SPS_MISSING_CANNOT_DECODE_SEI; + } + + int nHashes = sps->chroma_format_idc==0 ? 1 : 3; + for (int i=0;ihash_type) { + case sei_decoded_picture_hash_type_MD5: + for (int b=0;b<16;b++) { seihash->md5[i][b] = get_bits(reader,8); } + break; + + case sei_decoded_picture_hash_type_CRC: + seihash->crc[i] = get_bits(reader,16); + break; + + case sei_decoded_picture_hash_type_checksum: + seihash->checksum[i] = get_bits(reader,32); + break; + } + } + + return DE265_OK; +} + + +static void dump_sei_decoded_picture_hash(const sei_message* sei, + const seq_parameter_set* sps) +{ + const sei_decoded_picture_hash* seihash = &sei->data.decoded_picture_hash; + + loginfo(LogSEI," hash_type: "); + switch (seihash->hash_type) { + case sei_decoded_picture_hash_type_MD5: loginfo(LogSEI,"MD5\n"); break; + case sei_decoded_picture_hash_type_CRC: loginfo(LogSEI,"CRC\n"); break; + case sei_decoded_picture_hash_type_checksum: loginfo(LogSEI,"checksum\n"); break; + } + + int nHashes = sps->chroma_format_idc==0 ? 
1 : 3; + for (int i=0;ihash_type) { + case sei_decoded_picture_hash_type_MD5: + loginfo(LogSEI," MD5[%d]: %02x", i,seihash->md5[i][0]); + for (int b=1;b<16;b++) { + loginfo(LogSEI,"*:%02x", seihash->md5[i][b]); + } + loginfo(LogSEI,"*\n"); + break; + + case sei_decoded_picture_hash_type_CRC: + loginfo(LogSEI," CRC[%d]: %02x\n", i,seihash->crc[i]); + break; + + case sei_decoded_picture_hash_type_checksum: + loginfo(LogSEI," checksum[%d]: %04x\n", i,seihash->checksum[i]); + break; + } + } +} + + +class raw_hash_data +{ +public: + raw_hash_data(int w, int stride); + ~raw_hash_data(); + + struct data_chunk { + const uint8_t* data; + int len; + }; + + data_chunk prepare_8bit(const uint8_t* data,int y); + data_chunk prepare_16bit(const uint8_t* data,int y); + +private: + int mWidth, mStride; + + uint8_t* mMem; +}; + + +raw_hash_data::raw_hash_data(int w, int stride) +{ + mWidth=w; + mStride=stride; + mMem = NULL; +} + +raw_hash_data::~raw_hash_data() +{ + delete[] mMem; +} + +raw_hash_data::data_chunk raw_hash_data::prepare_8bit(const uint8_t* data,int y) +{ + data_chunk chunk; + chunk.data = data+y*mStride; + chunk.len = mWidth; + return chunk; +} + +raw_hash_data::data_chunk raw_hash_data::prepare_16bit(const uint8_t* data,int y) +{ + if (mMem == NULL) { + mMem = new uint8_t[2*mWidth]; + } + + const uint16_t* data16 = (uint16_t*)data; + + for (int x=0; x> 8; + } + + data_chunk chunk; + chunk.data = mMem; + chunk.len = 2*mWidth; + return chunk; +} + + +static uint32_t compute_checksum_8bit(uint8_t* data,int w,int h,int stride, int bit_depth) +{ + uint32_t sum = 0; + + if (bit_depth<=8) { + for (int y=0; y> 8 ) ^ ( y >> 8 ); + sum += data[y*stride + x] ^ xorMask; + } + } + else { + for (int y=0; y> 8 ) ^ ( y >> 8 ); + sum += (data[y*stride + x] & 0xFF) ^ xorMask; + sum += (data[y*stride + x] >> 8) ^ xorMask; + } + } + + return sum & 0xFFFFFFFF; +} + +static inline uint16_t crc_process_byte(uint16_t crc, uint8_t byte) +{ + for (int bit=0;bit<8;bit++) { + int bitVal = 
(byte >> (7-bit)) & 1; + + int crcMsb = (crc>>15) & 1; + crc = (((crc<<1) + bitVal) & 0xFFFF); + + if (crcMsb) { crc ^= 0x1021; } + } + + return crc; +} + +/* +static uint16_t compute_CRC_8bit_old(const uint8_t* data,int w,int h,int stride) +{ + uint16_t crc = 0xFFFF; + + for (int y=0; y> 8); + uint16_t t = s ^ (s >> 4); + + return ((crc << 8) ^ + t ^ + (t << 5) ^ + (t << 12)) & 0xFFFF; +} + +static uint32_t compute_CRC_8bit_fast(const uint8_t* data,int w,int h,int stride, int bit_depth) +{ + raw_hash_data raw_data(w,stride); + + uint16_t crc = 0xFFFF; + + crc = crc_process_byte_parallel(crc, 0); + crc = crc_process_byte_parallel(crc, 0); + + for (int y=0; y8) + chunk = raw_data.prepare_16bit(data, y); + else + chunk = raw_data.prepare_8bit(data, y); + + for(int x=0; x8) + chunk = raw_data.prepare_16bit(data, y); + else + chunk = raw_data.prepare_8bit(data, y); + + MD5_Update(&md5, (void*)chunk.data, chunk.len); + } + + MD5_Final(result, &md5); +} + + +static de265_error process_sei_decoded_picture_hash(const sei_message* sei, de265_image* img) +{ + const sei_decoded_picture_hash* seihash = &sei->data.decoded_picture_hash; + + /* Do not check SEI on pictures that are not output. + Hash may be wrong, because of a broken link (BLA). + This happens, for example in conformance stream RAP_B, where a EOS-NAL + appears before a CRA (POC=32). */ + if (img->PicOutputFlag == false) { + return DE265_OK; + } + + //write_picture(img); + + int nHashes = img->get_sps().chroma_format_idc==0 ? 
1 : 3; + for (int i=0;iget_width(i); + h = img->get_height(i); + + data = img->get_image_plane(i); + stride = img->get_image_stride(i); + + switch (seihash->hash_type) { + case sei_decoded_picture_hash_type_MD5: + { + uint8_t md5[16]; + compute_MD5(data,w,h,stride,md5, img->get_bit_depth(i)); + +/* + fprintf(stderr,"computed MD5: "); + for (int b=0;b<16;b++) { + fprintf(stderr,"%02x", md5[b]); + } + fprintf(stderr,"\n"); +*/ + + for (int b=0;b<16;b++) { + if (md5[b] != seihash->md5[i][b]) { +/* + fprintf(stderr,"SEI decoded picture MD5 mismatch (POC=%d)\n", img->PicOrderCntVal); +*/ + return DE265_ERROR_CHECKSUM_MISMATCH; + } + } + } + break; + + case sei_decoded_picture_hash_type_CRC: + { + uint16_t crc = compute_CRC_8bit_fast(data,w,h,stride, img->get_bit_depth(i)); + + logtrace(LogSEI,"SEI decoded picture hash: %04x <-[%d]-> decoded picture: %04x\n", + seihash->crc[i], i, crc); + + if (crc != seihash->crc[i]) { +/* + fprintf(stderr,"SEI decoded picture hash: %04x, decoded picture: %04x (POC=%d)\n", + seihash->crc[i], crc, img->PicOrderCntVal); +*/ + return DE265_ERROR_CHECKSUM_MISMATCH; + } + } + break; + + case sei_decoded_picture_hash_type_checksum: + { + uint32_t chksum = compute_checksum_8bit(data,w,h,stride, img->get_bit_depth(i)); + + if (chksum != seihash->checksum[i]) { +/* + fprintf(stderr,"SEI decoded picture hash: %04x, decoded picture: %04x (POC=%d)\n", + seihash->checksum[i], chksum, img->PicOrderCntVal); +*/ + return DE265_ERROR_CHECKSUM_MISMATCH; + } + } + break; + } + } + + loginfo(LogSEI,"decoded picture hash checked: OK\n"); + //printf("checked picture %d SEI: OK\n", img->PicOrderCntVal); + + return DE265_OK; +} + + +de265_error read_sei(bitreader* reader, sei_message* sei, bool suffix, const seq_parameter_set* sps) +{ + int payload_type = 0; + for (;;) + { + int byte = get_bits(reader,8); + payload_type += byte; + if (byte != 0xFF) { break; } + } + + //printf("SEI payload: %d\n",payload_type); + + int payload_size = 0; + for (;;) + { + int 
byte = get_bits(reader,8); + payload_size += byte; + if (byte != 0xFF) { break; } + } + + sei->payload_type = (enum sei_payload_type)payload_type; + sei->payload_size = payload_size; + + + // --- sei message dispatch + + de265_error err = DE265_OK; + + switch (sei->payload_type) { + case sei_payload_type_decoded_picture_hash: + err = read_sei_decoded_picture_hash(reader,sei,sps); + break; + + default: + // TODO: unknown SEI messages are ignored + break; + } + + return err; +} + +void dump_sei(const sei_message* sei, const seq_parameter_set* sps) +{ + loginfo(LogHeaders,"SEI message: %s\n", sei_type_name(sei->payload_type)); + + switch (sei->payload_type) { + case sei_payload_type_decoded_picture_hash: + dump_sei_decoded_picture_hash(sei, sps); + break; + + default: + // TODO: unknown SEI messages are ignored + break; + } +} + + +de265_error process_sei(const sei_message* sei, de265_image* img) +{ + de265_error err = DE265_OK; + + switch (sei->payload_type) { + case sei_payload_type_decoded_picture_hash: + if (img->decctx->param_sei_check_hash) { + err = process_sei_decoded_picture_hash(sei, img); + if (err==DE265_OK) { + //printf("SEI check ok\n"); + } + } + + break; + + default: + // TODO: unknown SEI messages are ignored + break; + } + + return err; +} + + +const char* sei_type_name(enum sei_payload_type type) +{ + switch (type) { + case sei_payload_type_buffering_period: + return "buffering_period"; + case sei_payload_type_pic_timing: + return "pic_timing"; + case sei_payload_type_pan_scan_rect: + return "pan_scan_rect"; + case sei_payload_type_filler_payload: + return "filler_payload"; + case sei_payload_type_user_data_registered_itu_t_t35: + return "user_data_registered_itu_t_t35"; + case sei_payload_type_user_data_unregistered: + return "user_data_unregistered"; + case sei_payload_type_recovery_point: + return "recovery_point"; + case sei_payload_type_scene_info: + return "scene_info"; + case sei_payload_type_picture_snapshot: + return "picture_snapshot"; + 
case sei_payload_type_progressive_refinement_segment_start: + return "progressive_refinement_segment_start"; + case sei_payload_type_progressive_refinement_segment_end: + return "progressive_refinement_segment_end"; + case sei_payload_type_film_grain_characteristics: + return "film_grain_characteristics"; + case sei_payload_type_post_filter_hint: + return "post_filter_hint"; + case sei_payload_type_tone_mapping_info: + return "tone_mapping_info"; + case sei_payload_type_frame_packing_arrangement: + return "frame_packing_arrangement"; + case sei_payload_type_display_orientation: + return "display_orientation"; + case sei_payload_type_structure_of_pictures_info: + return "structure_of_pictures_info"; + case sei_payload_type_active_parameter_sets: + return "active_parameter_sets"; + case sei_payload_type_decoding_unit_info: + return "decoding_unit_info"; + case sei_payload_type_temporal_sub_layer_zero_index: + return "temporal_sub_layer_zero_index"; + case sei_payload_type_decoded_picture_hash: + return "decoded_picture_hash"; + case sei_payload_type_scalable_nesting: + return "scalable_nesting"; + case sei_payload_type_region_refresh_info: + return "region_refresh_info"; + case sei_payload_type_no_display: + return "no_display"; + case sei_payload_type_motion_constrained_tile_sets: + return "motion_constrained_tile_sets"; + + default: + return "unknown SEI message"; + } +} diff --git a/slice.cc b/slice.cc new file mode 100644 index 0000000..e85ecc6 --- /dev/null +++ b/slice.cc @@ -0,0 +1,5072 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * Authors: struktur AG, Dirk Farin + * Min Chen + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. 
+ * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "slice.h" +#include "motion.h" +#include "util.h" +#include "scan.h" +#include "intrapred.h" +#include "transform.h" +#include "threads.h" +#include "image.h" + +#include +#include +#include + + +#define LOCK de265_mutex_lock(&ctx->thread_pool.mutex) +#define UNLOCK de265_mutex_unlock(&ctx->thread_pool.mutex) + +extern bool read_short_term_ref_pic_set(error_queue* errqueue, + const seq_parameter_set* sps, + bitreader* br, + ref_pic_set* out_set, + int idxRps, // index of the set to be read + const std::vector& sets, + bool sliceRefPicSet); + + +void read_coding_tree_unit(thread_context* tctx); +void read_coding_quadtree(thread_context* tctx, + int xCtb, int yCtb, + int Log2CtbSizeY, + int ctDepth); +/* +void decode_inter_block(decoder_context* ctx,thread_context* tctx, + int xC, int yC, int log2CbSize); +*/ + +void slice_segment_header::set_defaults() +{ + slice_index = 0; + + first_slice_segment_in_pic_flag = 1; + no_output_of_prior_pics_flag = 0; + slice_pic_parameter_set_id = 0; + dependent_slice_segment_flag = 0; + slice_segment_address = 0; + + slice_type = SLICE_TYPE_I; + pic_output_flag = 1; + colour_plane_id = 0; + slice_pic_order_cnt_lsb = 0; + short_term_ref_pic_set_sps_flag = 1; + // ref_pic_set slice_ref_pic_set; + + short_term_ref_pic_set_idx = 0; + num_long_term_sps = 0; + num_long_term_pics = 0; + + //uint8_t lt_idx_sps[MAX_NUM_REF_PICS]; + //int poc_lsb_lt[MAX_NUM_REF_PICS]; + //char used_by_curr_pic_lt_flag[MAX_NUM_REF_PICS]; + + //char delta_poc_msb_present_flag[MAX_NUM_REF_PICS]; + //int delta_poc_msb_cycle_lt[MAX_NUM_REF_PICS]; + + 
slice_temporal_mvp_enabled_flag = 0; + slice_sao_luma_flag = 0; + slice_sao_chroma_flag = 0; + + num_ref_idx_active_override_flag = 0; + num_ref_idx_l0_active=1; // [1;16] + num_ref_idx_l1_active=1; // [1;16] + + ref_pic_list_modification_flag_l0 = 0; + ref_pic_list_modification_flag_l1 = 0; + //uint8_t list_entry_l0[16]; + //uint8_t list_entry_l1[16]; + + mvd_l1_zero_flag = 0; + cabac_init_flag = 0; + collocated_from_l0_flag = 0; + collocated_ref_idx = 0; + + // --- pred_weight_table --- + + luma_log2_weight_denom=0; // [0;7] + ChromaLog2WeightDenom=0; // [0;7] + + // first index is L0/L1 + /* + uint8_t luma_weight_flag[2][16]; // bool + uint8_t chroma_weight_flag[2][16]; // bool + int16_t LumaWeight[2][16]; + int8_t luma_offset[2][16]; + int16_t ChromaWeight[2][16][2]; + int8_t ChromaOffset[2][16][2]; + */ + + + five_minus_max_num_merge_cand = 0; + slice_qp_delta = 0; + + slice_cb_qp_offset = 0; + slice_cr_qp_offset = 0; + + cu_chroma_qp_offset_enabled_flag = 0; + + deblocking_filter_override_flag = 0; + slice_deblocking_filter_disabled_flag = 0; + slice_beta_offset=0; // = pps->beta_offset if undefined + slice_tc_offset=0; // = pps->tc_offset if undefined + + slice_loop_filter_across_slices_enabled_flag = 0; + + num_entry_point_offsets = 0; + //int offset_len; + //std::vector entry_point_offset; + + slice_segment_header_extension_length = 0; + + SliceAddrRS = slice_segment_address; +} + + +bool read_pred_weight_table(bitreader* br, slice_segment_header* shdr, decoder_context* ctx) +{ + int vlc; + + pic_parameter_set* pps = ctx->get_pps((int)shdr->slice_pic_parameter_set_id); + assert(pps); + seq_parameter_set* sps = ctx->get_sps((int)pps->seq_parameter_set_id); + assert(sps); + + shdr->luma_log2_weight_denom = vlc = get_uvlc(br); + if (vlc<0 || vlc>7) return false; + + if (sps->chroma_format_idc != 0) { + vlc = get_svlc(br); + vlc += shdr->luma_log2_weight_denom; + if (vlc<0 || vlc>7) return false; + shdr->ChromaLog2WeightDenom = vlc; + } + + int sumWeightFlags 
= 0; + + for (int l=0;l<=1;l++) + if (l==0 || (l==1 && shdr->slice_type == SLICE_TYPE_B)) + { + int num_ref = (l==0 ? shdr->num_ref_idx_l0_active-1 : shdr->num_ref_idx_l1_active-1); + + for (int i=0;i<=num_ref;i++) { + shdr->luma_weight_flag[l][i] = get_bits(br,1); + if (shdr->luma_weight_flag[l][i]) sumWeightFlags++; + } + + if (sps->chroma_format_idc != 0) { + for (int i=0;i<=num_ref;i++) { + shdr->chroma_weight_flag[l][i] = get_bits(br,1); + if (shdr->chroma_weight_flag[l][i]) sumWeightFlags+=2; + } + } + + for (int i=0;i<=num_ref;i++) { + if (shdr->luma_weight_flag[l][i]) { + + // delta_luma_weight + + vlc = get_svlc(br); + if (vlc < -128 || vlc > 127) return false; + + shdr->LumaWeight[l][i] = (1<luma_log2_weight_denom) + vlc; + + // luma_offset + + vlc = get_svlc(br); + if (vlc < -sps->WpOffsetHalfRangeY || vlc > sps->WpOffsetHalfRangeY-1) return false; + shdr->luma_offset[l][i] = vlc; + } + else { + shdr->LumaWeight[l][i] = 1<luma_log2_weight_denom; + shdr->luma_offset[l][i] = 0; + } + + if (shdr->chroma_weight_flag[l][i]) + for (int j=0;j<2;j++) { + // delta_chroma_weight + + vlc = get_svlc(br); + if (vlc < -128 || vlc > 127) return false; + + shdr->ChromaWeight[l][i][j] = (1<ChromaLog2WeightDenom) + vlc; + + // delta_chroma_offset + + vlc = get_svlc(br); + if (vlc < -4*sps->WpOffsetHalfRangeC || + vlc > 4*sps->WpOffsetHalfRangeC-1) return false; + + vlc = Clip3(-sps->WpOffsetHalfRangeC, + sps->WpOffsetHalfRangeC-1, + (sps->WpOffsetHalfRangeC + +vlc + -((sps->WpOffsetHalfRangeC*shdr->ChromaWeight[l][i][j]) + >> shdr->ChromaLog2WeightDenom))); + + shdr->ChromaOffset[l][i][j] = vlc; + } + else { + for (int j=0;j<2;j++) { + shdr->ChromaWeight[l][i][j] = 1<ChromaLog2WeightDenom; + shdr->ChromaOffset[l][i][j] = 0; + } + } + } + } + + // TODO: bitstream conformance requires that 'sumWeightFlags<=24' + + return true; +} + + +void slice_segment_header::reset() +{ + pps = NULL; + + slice_index = 0; + + first_slice_segment_in_pic_flag = 0; + 
no_output_of_prior_pics_flag = 0; + slice_pic_parameter_set_id = 0; + dependent_slice_segment_flag = 0; + slice_segment_address = 0; + + slice_type = 0; + pic_output_flag = 0; + colour_plane_id = 0; + slice_pic_order_cnt_lsb = 0; + short_term_ref_pic_set_sps_flag = 0; + slice_ref_pic_set.reset(); + + short_term_ref_pic_set_idx = 0; + num_long_term_sps = 0; + num_long_term_pics= 0; + + for (int i=0;iget_RapPicFlag()) { // TODO: is this still correct ? Should we drop RapPicFlag ? + no_output_of_prior_pics_flag = get_bits(br,1); + } + + slice_pic_parameter_set_id = get_uvlc(br); + if (slice_pic_parameter_set_id > DE265_MAX_PPS_SETS || + slice_pic_parameter_set_id == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_NONEXISTING_PPS_REFERENCED, false); + return DE265_OK; + } + + if (!ctx->has_pps(slice_pic_parameter_set_id)) { + ctx->add_warning(DE265_WARNING_NONEXISTING_PPS_REFERENCED, false); + return DE265_OK; + } + + pps = ctx->get_shared_pps(slice_pic_parameter_set_id); + + const seq_parameter_set* sps = pps->sps.get(); + if (!sps->sps_read) { + ctx->add_warning(DE265_WARNING_NONEXISTING_SPS_REFERENCED, false); + *continueDecoding = false; + return DE265_OK; + } + + if (!first_slice_segment_in_pic_flag) { + if (pps->dependent_slice_segments_enabled_flag) { + dependent_slice_segment_flag = get_bits(br,1); + } else { + dependent_slice_segment_flag = 0; + } + + int slice_segment_address = get_bits(br, ceil_log2(sps->PicSizeInCtbsY)); + + if (dependent_slice_segment_flag) { + if (slice_segment_address == 0) { + *continueDecoding = false; + ctx->add_warning(DE265_WARNING_DEPENDENT_SLICE_WITH_ADDRESS_ZERO, false); + return DE265_OK; + } + + if (ctx->previous_slice_header == NULL) { + return DE265_ERROR_NO_INITIAL_SLICE_HEADER; + } + + *this = *ctx->previous_slice_header; + + first_slice_segment_in_pic_flag = 0; + dependent_slice_segment_flag = 1; + } + + this->slice_segment_address = slice_segment_address; + } else { + dependent_slice_segment_flag = 0; + 
slice_segment_address = 0; + } + + if (slice_segment_address < 0 || + slice_segment_address >= sps->PicSizeInCtbsY) { + ctx->add_warning(DE265_WARNING_SLICE_SEGMENT_ADDRESS_INVALID, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + //printf("SLICE %d (%d)\n",slice_segment_address, sps->PicSizeInCtbsY); + + + if (!dependent_slice_segment_flag) { + for (int i=0; inum_extra_slice_header_bits; i++) { + //slice_reserved_undetermined_flag[i] + skip_bits(br,1); + } + + slice_type = get_uvlc(br); + if (slice_type > 2 || + slice_type == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); + *continueDecoding = false; + return DE265_OK; + } + + if (pps->output_flag_present_flag) { + pic_output_flag = get_bits(br,1); + } + else { + pic_output_flag = 1; + } + + if (sps->separate_colour_plane_flag == 1) { + colour_plane_id = get_bits(br,2); + } + + + slice_pic_order_cnt_lsb = 0; + short_term_ref_pic_set_sps_flag = 0; + + int NumLtPics = 0; + + if (ctx->get_nal_unit_type() != NAL_UNIT_IDR_W_RADL && + ctx->get_nal_unit_type() != NAL_UNIT_IDR_N_LP) { + slice_pic_order_cnt_lsb = get_bits(br, sps->log2_max_pic_order_cnt_lsb); + short_term_ref_pic_set_sps_flag = get_bits(br,1); + + if (!short_term_ref_pic_set_sps_flag) { + read_short_term_ref_pic_set(ctx, sps, + br, &slice_ref_pic_set, + sps->num_short_term_ref_pic_sets(), + sps->ref_pic_sets, + true); + + CurrRpsIdx = sps->num_short_term_ref_pic_sets(); + CurrRps = slice_ref_pic_set; + } + else { + int nBits = ceil_log2(sps->num_short_term_ref_pic_sets()); + if (nBits>0) short_term_ref_pic_set_idx = get_bits(br,nBits); + else short_term_ref_pic_set_idx = 0; + + if (short_term_ref_pic_set_idx >= sps->num_short_term_ref_pic_sets()) { + ctx->add_warning(DE265_WARNING_SHORT_TERM_REF_PIC_SET_OUT_OF_RANGE, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + CurrRpsIdx = short_term_ref_pic_set_idx; + CurrRps = sps->ref_pic_sets[CurrRpsIdx]; + } + + + // --- long-term MC --- + + if 
(sps->long_term_ref_pics_present_flag) { + if (sps->num_long_term_ref_pics_sps > 0) { + num_long_term_sps = get_uvlc(br); + if (num_long_term_sps == UVLC_ERROR) { + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + } + else { + num_long_term_sps = 0; + } + + num_long_term_pics= get_uvlc(br); + if (num_long_term_pics == UVLC_ERROR) { + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + // check maximum number of reference frames + + if (num_long_term_sps + + num_long_term_pics + + CurrRps.NumNegativePics + + CurrRps.NumPositivePics + > sps->sps_max_dec_pic_buffering[sps->sps_max_sub_layers-1]) + { + ctx->add_warning(DE265_WARNING_MAX_NUM_REF_PICS_EXCEEDED, false); + *continueDecoding = false; + return DE265_OK; + } + + for (int i=0; inum_long_term_ref_pics_sps); + lt_idx_sps[i] = get_bits(br, nBits); + + // check that the referenced lt-reference really exists + + if (lt_idx_sps[i] >= sps->num_long_term_ref_pics_sps) { + ctx->add_warning(DE265_NON_EXISTING_LT_REFERENCE_CANDIDATE_IN_SLICE_HEADER, false); + *continueDecoding = false; + return DE265_OK; + } + + // delta_poc_msb_present_flag[i] = 0; // TODO ? 
+ + ctx->PocLsbLt[i] = sps->lt_ref_pic_poc_lsb_sps[ lt_idx_sps[i] ]; + ctx->UsedByCurrPicLt[i] = sps->used_by_curr_pic_lt_sps_flag[ lt_idx_sps[i] ]; + } + else { + int nBits = sps->log2_max_pic_order_cnt_lsb; + poc_lsb_lt[i] = get_bits(br, nBits); + used_by_curr_pic_lt_flag[i] = get_bits(br,1); + + ctx->PocLsbLt[i] = poc_lsb_lt[i]; + ctx->UsedByCurrPicLt[i] = used_by_curr_pic_lt_flag[i]; + } + + if (ctx->UsedByCurrPicLt[i]) { + NumLtPics++; + } + + delta_poc_msb_present_flag[i] = get_bits(br,1); + if (delta_poc_msb_present_flag[i]) { + delta_poc_msb_cycle_lt[i] = get_uvlc(br); + if (delta_poc_msb_cycle_lt[i]==UVLC_ERROR) { + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + } + else { + delta_poc_msb_cycle_lt[i] = 0; + } + + if (i==0 || i==num_long_term_sps) { + ctx->DeltaPocMsbCycleLt[i] = delta_poc_msb_cycle_lt[i]; + } + else { + ctx->DeltaPocMsbCycleLt[i] = (delta_poc_msb_cycle_lt[i] + + ctx->DeltaPocMsbCycleLt[i-1]); + } + } + } + else { + num_long_term_sps = 0; + num_long_term_pics= 0; + } + + if (sps->sps_temporal_mvp_enabled_flag) { + slice_temporal_mvp_enabled_flag = get_bits(br,1); + } + else { + slice_temporal_mvp_enabled_flag = 0; + } + } + else { + slice_pic_order_cnt_lsb = 0; + num_long_term_sps = 0; + num_long_term_pics= 0; + } + + + // --- SAO --- + + if (sps->sample_adaptive_offset_enabled_flag) { + slice_sao_luma_flag = get_bits(br,1); + + if (sps->ChromaArrayType != CHROMA_MONO) { + slice_sao_chroma_flag = get_bits(br,1); + } + else { + slice_sao_chroma_flag = 0; + } + } + else { + slice_sao_luma_flag = 0; + slice_sao_chroma_flag = 0; + } + + num_ref_idx_l0_active = 0; + num_ref_idx_l1_active = 0; + + if (slice_type == SLICE_TYPE_P || + slice_type == SLICE_TYPE_B) { + num_ref_idx_active_override_flag = get_bits(br,1); + if (num_ref_idx_active_override_flag) { + num_ref_idx_l0_active = get_uvlc(br); + if (num_ref_idx_l0_active == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); + return 
DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + num_ref_idx_l0_active++;; + + if (slice_type == SLICE_TYPE_B) { + num_ref_idx_l1_active = get_uvlc(br); + if (num_ref_idx_l1_active == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + num_ref_idx_l1_active++; + } + } + else { + num_ref_idx_l0_active = pps->num_ref_idx_l0_default_active; + num_ref_idx_l1_active = pps->num_ref_idx_l1_default_active; + } + + if (num_ref_idx_l0_active > 16) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } + if (num_ref_idx_l1_active > 16) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } + + NumPocTotalCurr = CurrRps.NumPocTotalCurr_shortterm_only + NumLtPics; + + if (pps->lists_modification_present_flag && NumPocTotalCurr > 1) { + + int nBits = ceil_log2(NumPocTotalCurr); + + ref_pic_list_modification_flag_l0 = get_bits(br,1); + if (ref_pic_list_modification_flag_l0) { + for (int i=0;icabac_init_present_flag) { + cabac_init_flag = get_bits(br,1); + } + else { + cabac_init_flag = 0; + } + + if (slice_temporal_mvp_enabled_flag) { + if (slice_type == SLICE_TYPE_B) + collocated_from_l0_flag = get_bits(br,1); + else + collocated_from_l0_flag = 1; + + if (( collocated_from_l0_flag && num_ref_idx_l0_active > 1) || + (!collocated_from_l0_flag && num_ref_idx_l1_active > 1)) { + collocated_ref_idx = get_uvlc(br); + if (collocated_ref_idx == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + } + else { + collocated_ref_idx = 0; + } + + // check whether collocated_ref_idx points to a valid index + + if (( collocated_from_l0_flag && collocated_ref_idx >= num_ref_idx_l0_active) || + (!collocated_from_l0_flag && collocated_ref_idx >= num_ref_idx_l1_active)) { + ctx->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + } + + + if ((pps->weighted_pred_flag && slice_type 
== SLICE_TYPE_P) || + (pps->weighted_bipred_flag && slice_type == SLICE_TYPE_B)) { + + if (!read_pred_weight_table(br,this,ctx)) + { + ctx->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + } + + five_minus_max_num_merge_cand = get_uvlc(br); + if (five_minus_max_num_merge_cand == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + MaxNumMergeCand = 5-five_minus_max_num_merge_cand; + } + + slice_qp_delta = get_svlc(br); + if (slice_qp_delta == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + //logtrace(LogSlice,"slice_qp_delta: %d\n",shdr->slice_qp_delta); + + if (pps->pps_slice_chroma_qp_offsets_present_flag) { + slice_cb_qp_offset = get_svlc(br); + if (slice_cb_qp_offset == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + slice_cr_qp_offset = get_svlc(br); + if (slice_cr_qp_offset == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + } + else { + slice_cb_qp_offset = 0; + slice_cr_qp_offset = 0; + } + + if (pps->range_extension.chroma_qp_offset_list_enabled_flag) { + cu_chroma_qp_offset_enabled_flag = get_bits(br,1); + } + + if (pps->deblocking_filter_override_enabled_flag) { + deblocking_filter_override_flag = get_bits(br,1); + } + else { + deblocking_filter_override_flag = 0; + } + + slice_beta_offset = pps->beta_offset; + slice_tc_offset = pps->tc_offset; + + if (deblocking_filter_override_flag) { + slice_deblocking_filter_disabled_flag = get_bits(br,1); + if (!slice_deblocking_filter_disabled_flag) { + slice_beta_offset = get_svlc(br); + if (slice_beta_offset == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); + return 
DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + slice_beta_offset *= 2; + + slice_tc_offset = get_svlc(br); + if (slice_tc_offset == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + slice_tc_offset *= 2; + } + } + else { + slice_deblocking_filter_disabled_flag = pps->pic_disable_deblocking_filter_flag; + } + + if (pps->pps_loop_filter_across_slices_enabled_flag && + (slice_sao_luma_flag || slice_sao_chroma_flag || + !slice_deblocking_filter_disabled_flag )) { + slice_loop_filter_across_slices_enabled_flag = get_bits(br,1); + } + else { + slice_loop_filter_across_slices_enabled_flag = + pps->pps_loop_filter_across_slices_enabled_flag; + } + } + + if (pps->tiles_enabled_flag || pps->entropy_coding_sync_enabled_flag ) { + num_entry_point_offsets = get_uvlc(br); + if (num_entry_point_offsets == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + if (pps->entropy_coding_sync_enabled_flag) { + // check num_entry_points for valid range + + int firstCTBRow = slice_segment_address / sps->PicWidthInCtbsY; + int lastCTBRow = firstCTBRow + num_entry_point_offsets; + if (lastCTBRow >= sps->PicHeightInCtbsY) { + ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + } + + if (pps->tiles_enabled_flag) { + if (num_entry_point_offsets > pps->num_tile_columns * pps->num_tile_rows) { + ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + } + + entry_point_offset.resize( num_entry_point_offsets ); + + if (num_entry_point_offsets > 0) { + offset_len = get_uvlc(br); + if (offset_len == UVLC_ERROR) { + ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + offset_len++; + + if (offset_len > 32) { + return 
DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + for (int i=0; i0) { + entry_point_offset[i] += entry_point_offset[i-1]; + } + } + } + } + else { + num_entry_point_offsets = 0; + } + + if (pps->slice_segment_header_extension_present_flag) { + slice_segment_header_extension_length = get_uvlc(br); + if (slice_segment_header_extension_length == UVLC_ERROR || + slice_segment_header_extension_length > 1000) { // TODO: safety check against too large values + ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + for (int i=0; i DE265_MAX_PPS_SETS) { + errqueue->add_warning(DE265_WARNING_NONEXISTING_PPS_REFERENCED, false); + return DE265_OK; + } + out.write_uvlc(slice_pic_parameter_set_id); + + if (!first_slice_segment_in_pic_flag) { + if (pps->dependent_slice_segments_enabled_flag) { + out.write_bit(dependent_slice_segment_flag); + } + + out.write_bits(slice_segment_address, ceil_log2(sps->PicSizeInCtbsY)); + + if (dependent_slice_segment_flag) { + if (slice_segment_address == 0) { + errqueue->add_warning(DE265_WARNING_DEPENDENT_SLICE_WITH_ADDRESS_ZERO, false); + return DE265_OK; + } + } + } + + if (slice_segment_address < 0 || + slice_segment_address > sps->PicSizeInCtbsY) { + errqueue->add_warning(DE265_WARNING_SLICE_SEGMENT_ADDRESS_INVALID, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + + + if (!dependent_slice_segment_flag) { + for (int i=0; inum_extra_slice_header_bits; i++) { + //slice_reserved_undetermined_flag[i] + out.skip_bits(1); + } + + if (slice_type > 2) { + errqueue->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); + return DE265_OK; + } + out.write_uvlc(slice_type); + + if (pps->output_flag_present_flag) { + out.write_bit(pic_output_flag); + } + + if (sps->separate_colour_plane_flag == 1) { + out.write_bits(colour_plane_id,2); + } + + + int NumLtPics = 0; + + if (nal_unit_type != NAL_UNIT_IDR_W_RADL && + nal_unit_type != NAL_UNIT_IDR_N_LP) { + 
out.write_bits(slice_pic_order_cnt_lsb, sps->log2_max_pic_order_cnt_lsb); + out.write_bit(short_term_ref_pic_set_sps_flag); + + if (!short_term_ref_pic_set_sps_flag) { + /* TODO + read_short_term_ref_pic_set(ctx, sps, + br, &slice_ref_pic_set, + sps->num_short_term_ref_pic_sets, + sps->ref_pic_sets, + true); + */ + //CurrRpsIdx = sps->num_short_term_ref_pic_sets; + //CurrRps = slice_ref_pic_set; + } + else { + int nBits = ceil_log2(sps->num_short_term_ref_pic_sets()); + if (nBits>0) out.write_bits(short_term_ref_pic_set_idx,nBits); + else { assert(short_term_ref_pic_set_idx==0); } + + if (short_term_ref_pic_set_idx > sps->num_short_term_ref_pic_sets()) { + errqueue->add_warning(DE265_WARNING_SHORT_TERM_REF_PIC_SET_OUT_OF_RANGE, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + //CurrRpsIdx = short_term_ref_pic_set_idx; + //CurrRps = sps->ref_pic_sets[CurrRpsIdx]; + } + + + // --- long-term MC --- + + if (sps->long_term_ref_pics_present_flag) { + if (sps->num_long_term_ref_pics_sps > 0) { + out.write_uvlc(num_long_term_sps); + } + else { + assert(num_long_term_sps == 0); + } + + out.write_uvlc(num_long_term_pics); + + + // check maximum number of reference frames + + if (num_long_term_sps + + num_long_term_pics + + CurrRps.NumNegativePics + + CurrRps.NumPositivePics + > sps->sps_max_dec_pic_buffering[sps->sps_max_sub_layers-1]) + { + errqueue->add_warning(DE265_WARNING_MAX_NUM_REF_PICS_EXCEEDED, false); + return DE265_OK; + } + + for (int i=0; inum_long_term_ref_pics_sps); + out.write_bits(lt_idx_sps[i], nBits); + + // check that the referenced lt-reference really exists + + if (lt_idx_sps[i] >= sps->num_long_term_ref_pics_sps) { + errqueue->add_warning(DE265_NON_EXISTING_LT_REFERENCE_CANDIDATE_IN_SLICE_HEADER, false); + return DE265_OK; + } + + //ctx->PocLsbLt[i] = sps->lt_ref_pic_poc_lsb_sps[ lt_idx_sps[i] ]; + //ctx->UsedByCurrPicLt[i] = sps->used_by_curr_pic_lt_sps_flag[ lt_idx_sps[i] ]; + } + else { + int nBits = 
sps->log2_max_pic_order_cnt_lsb; + out.write_bits(poc_lsb_lt[i], nBits); + out.write_bit(used_by_curr_pic_lt_flag[i]); + + //ctx->PocLsbLt[i] = poc_lsb_lt[i]; + //ctx->UsedByCurrPicLt[i] = used_by_curr_pic_lt_flag[i]; + } + + //if (ctx->UsedByCurrPicLt[i]) { + //NumLtPics++; + //} + + out.write_bit(delta_poc_msb_present_flag[i]); + if (delta_poc_msb_present_flag[i]) { + out.write_uvlc(delta_poc_msb_cycle_lt[i]); + } + else { + assert(delta_poc_msb_cycle_lt[i] == 0); + } + + /* + if (i==0 || i==num_long_term_sps) { + ctx->DeltaPocMsbCycleLt[i] = delta_poc_msb_cycle_lt[i]; + } + else { + ctx->DeltaPocMsbCycleLt[i] = (delta_poc_msb_cycle_lt[i] + + ctx->DeltaPocMsbCycleLt[i-1]); + } + */ + } + } + else { + assert(num_long_term_sps == 0); + assert(num_long_term_pics== 0); + } + + if (sps->sps_temporal_mvp_enabled_flag) { + out.write_bit(slice_temporal_mvp_enabled_flag); + } + else { + assert(slice_temporal_mvp_enabled_flag == 0); + } + } + else { + assert(slice_pic_order_cnt_lsb == 0); + assert(num_long_term_sps == 0); + assert(num_long_term_pics== 0); + } + + + // --- SAO --- + + if (sps->sample_adaptive_offset_enabled_flag) { + out.write_bit(slice_sao_luma_flag); + out.write_bit(slice_sao_chroma_flag); + } + else { + assert(slice_sao_luma_flag == 0); + assert(slice_sao_chroma_flag== 0); + } + + if (slice_type == SLICE_TYPE_P || + slice_type == SLICE_TYPE_B) { + out.write_bit(num_ref_idx_active_override_flag); + + if (num_ref_idx_active_override_flag) { + out.write_uvlc(num_ref_idx_l0_active); + num_ref_idx_l0_active++;; + + if (slice_type == SLICE_TYPE_B) { + out.write_uvlc(num_ref_idx_l1_active); + num_ref_idx_l1_active++; + } + } + else { + assert(num_ref_idx_l0_active == pps->num_ref_idx_l0_default_active); + assert(num_ref_idx_l1_active == pps->num_ref_idx_l1_default_active); + } + + NumPocTotalCurr = CurrRps.NumPocTotalCurr_shortterm_only + NumLtPics; + + if (pps->lists_modification_present_flag && NumPocTotalCurr > 1) { + + int nBits = 
ceil_log2(NumPocTotalCurr); + + out.write_bit(ref_pic_list_modification_flag_l0); + if (ref_pic_list_modification_flag_l0) { + for (int i=0;icabac_init_present_flag) { + out.write_bit(cabac_init_flag); + } + else { + assert(cabac_init_flag == 0); + } + + if (slice_temporal_mvp_enabled_flag) { + if (slice_type == SLICE_TYPE_B) + out.write_bit(collocated_from_l0_flag); + else + { assert(collocated_from_l0_flag == 1); } + + if (( collocated_from_l0_flag && num_ref_idx_l0_active > 1) || + (!collocated_from_l0_flag && num_ref_idx_l1_active > 1)) { + out.write_uvlc(collocated_ref_idx); + } + else { + assert(collocated_ref_idx == 0); + } + } + + if ((pps->weighted_pred_flag && slice_type == SLICE_TYPE_P) || + (pps->weighted_bipred_flag && slice_type == SLICE_TYPE_B)) { + + assert(0); + /* TODO + if (!read_pred_weight_table(br,this,ctx)) + { + ctx->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + */ + } + + out.write_uvlc(five_minus_max_num_merge_cand); + //MaxNumMergeCand = 5-five_minus_max_num_merge_cand; + } + + out.write_svlc(slice_qp_delta); + + if (pps->pps_slice_chroma_qp_offsets_present_flag) { + out.write_svlc(slice_cb_qp_offset); + out.write_svlc(slice_cr_qp_offset); + } + else { + assert(slice_cb_qp_offset == 0); + assert(slice_cr_qp_offset == 0); + } + + if (pps->deblocking_filter_override_enabled_flag) { + out.write_bit(deblocking_filter_override_flag); + } + else { + assert(deblocking_filter_override_flag == 0); + } + + //slice_beta_offset = pps->beta_offset; + //slice_tc_offset = pps->tc_offset; + + if (deblocking_filter_override_flag) { + out.write_bit(slice_deblocking_filter_disabled_flag); + if (!slice_deblocking_filter_disabled_flag) { + out.write_svlc(slice_beta_offset/2); + out.write_svlc(slice_tc_offset /2); + } + } + else { + assert(slice_deblocking_filter_disabled_flag == pps->pic_disable_deblocking_filter_flag); + } + + if (pps->pps_loop_filter_across_slices_enabled_flag && + 
(slice_sao_luma_flag || slice_sao_chroma_flag || + !slice_deblocking_filter_disabled_flag )) { + out.write_bit(slice_loop_filter_across_slices_enabled_flag); + } + else { + assert(slice_loop_filter_across_slices_enabled_flag == + pps->pps_loop_filter_across_slices_enabled_flag); + } + } + + if (pps->tiles_enabled_flag || pps->entropy_coding_sync_enabled_flag ) { + out.write_uvlc(num_entry_point_offsets); + + if (num_entry_point_offsets > 0) { + out.write_uvlc(offset_len-1); + + for (int i=0; i0) prev = entry_point_offset[i-1]; + out.write_bits(entry_point_offset[i]-prev-1, offset_len); + } + } + } + } + else { + assert(num_entry_point_offsets == 0); + } + + if (pps->slice_segment_header_extension_present_flag) { + out.write_uvlc(slice_segment_header_extension_length); + if (slice_segment_header_extension_length > 1000) { // TODO: safety check against too large values + errqueue->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + for (int i=0; ipic_init_qp + slice_qp_delta; + + switch (slice_type) + { + case SLICE_TYPE_I: initType = 0; break; + case SLICE_TYPE_P: initType = cabac_init_flag + 1; break; + case SLICE_TYPE_B: initType = 2 - cabac_init_flag; break; + } + + MaxNumMergeCand = 5-five_minus_max_num_merge_cand; +} + + +//----------------------------------------------------------------------- + + +void slice_segment_header::dump_slice_segment_header(const decoder_context* ctx, int fd) const +{ + FILE* fh; + if (fd==1) fh=stdout; + else if (fd==2) fh=stderr; + else { return; } + +#define LOG0(t) log2fh(fh, t) +#define LOG1(t,d) log2fh(fh, t,d) +#define LOG2(t,d1,d2) log2fh(fh, t,d1,d2) +#define LOG3(t,d1,d2,d3) log2fh(fh, t,d1,d2,d3) +#define LOG4(t,d1,d2,d3,d4) log2fh(fh, t,d1,d2,d3,d4) + + const pic_parameter_set* pps = ctx->get_pps(slice_pic_parameter_set_id); + assert(pps->pps_read); // TODO: error handling + + const seq_parameter_set* sps = ctx->get_sps((int)pps->seq_parameter_set_id); + 
assert(sps->sps_read); // TODO: error handling + + + LOG0("----------------- SLICE -----------------\n"); + LOG1("first_slice_segment_in_pic_flag : %d\n", first_slice_segment_in_pic_flag); + if (ctx->get_nal_unit_type() >= NAL_UNIT_BLA_W_LP && + ctx->get_nal_unit_type() <= NAL_UNIT_RESERVED_IRAP_VCL23) { + LOG1("no_output_of_prior_pics_flag : %d\n", no_output_of_prior_pics_flag); + } + + LOG1("slice_pic_parameter_set_id : %d\n", slice_pic_parameter_set_id); + + if (!first_slice_segment_in_pic_flag) { + //if (pps->dependent_slice_segments_enabled_flag) { + LOG1("dependent_slice_segment_flag : %d\n", dependent_slice_segment_flag); + //} + LOG1("slice_segment_address : %d\n", slice_segment_address); + } + + //if (!dependent_slice_segment_flag) + { + //for (int i=0; inum_extra_slice_header_bits; i++) { + //slice_reserved_flag[i] + + LOG1("slice_type : %c\n", + slice_type == 0 ? 'B' : + slice_type == 1 ? 'P' : 'I'); + + if (pps->output_flag_present_flag) { + LOG1("pic_output_flag : %d\n", pic_output_flag); + } + + if (sps->separate_colour_plane_flag == 1) { + LOG1("colour_plane_id : %d\n", colour_plane_id); + } + + LOG1("slice_pic_order_cnt_lsb : %d\n", slice_pic_order_cnt_lsb); + + if (ctx->get_nal_unit_type() != NAL_UNIT_IDR_W_RADL && + ctx->get_nal_unit_type() != NAL_UNIT_IDR_N_LP) { + LOG1("short_term_ref_pic_set_sps_flag : %d\n", short_term_ref_pic_set_sps_flag); + + if (!short_term_ref_pic_set_sps_flag) { + LOG1("ref_pic_set[ %2d ]: ",sps->num_short_term_ref_pic_sets()); + dump_compact_short_term_ref_pic_set(&slice_ref_pic_set, 16, fh); + } + else if (sps->num_short_term_ref_pic_sets() > 1) { + LOG1("short_term_ref_pic_set_idx : %d\n", short_term_ref_pic_set_idx); + dump_compact_short_term_ref_pic_set(&sps->ref_pic_sets[short_term_ref_pic_set_idx], 16, fh); + } + + if (sps->long_term_ref_pics_present_flag) { + if (sps->num_long_term_ref_pics_sps > 0) { + LOG1("num_long_term_sps : %d\n", num_long_term_sps); + } + + LOG1("num_long_term_pics : %d\n", 
num_long_term_pics); + +#if 0 + for (int i=0; iPocLsbLt[i]); + LOG2("UsedByCurrPicLt[%d] : %d\n", i, ctx->UsedByCurrPicLt[i]); + LOG2("DeltaPocMsbCycleLt[%d] : %d\n", i, ctx->DeltaPocMsbCycleLt[i]); + } +#endif + } + + if (sps->sps_temporal_mvp_enabled_flag) { + LOG1("slice_temporal_mvp_enabled_flag : %d\n", slice_temporal_mvp_enabled_flag); + } + } + + + if (sps->sample_adaptive_offset_enabled_flag) { + LOG1("slice_sao_luma_flag : %d\n", slice_sao_luma_flag); + LOG1("slice_sao_chroma_flag : %d\n", slice_sao_chroma_flag); + } + + + if (slice_type == SLICE_TYPE_P || slice_type == SLICE_TYPE_B) { + LOG1("num_ref_idx_active_override_flag : %d\n", num_ref_idx_active_override_flag); + + LOG2("num_ref_idx_l0_active : %d %s\n", num_ref_idx_l0_active, + num_ref_idx_active_override_flag ? "" : "(from PPS)"); + + if (slice_type == SLICE_TYPE_B) { + LOG2("num_ref_idx_l1_active : %d %s\n", num_ref_idx_l1_active, + num_ref_idx_active_override_flag ? "" : "(from PPS)"); + } + + if (pps->lists_modification_present_flag && NumPocTotalCurr > 1) + { + LOG1("ref_pic_list_modification_flag_l0 : %d\n", ref_pic_list_modification_flag_l0); + if (ref_pic_list_modification_flag_l0) { + for (int i=0;iweighted_pred_flag && slice_type == SLICE_TYPE_P) || + (pps->weighted_bipred_flag && slice_type == SLICE_TYPE_B)) + { + LOG1("luma_log2_weight_denom : %d\n", luma_log2_weight_denom); + if (sps->chroma_format_idc != 0) { + LOG1("ChromaLog2WeightDenom : %d\n", ChromaLog2WeightDenom); + } + + for (int l=0;l<=1;l++) + if (l==0 || (l==1 && slice_type == SLICE_TYPE_B)) + { + int num_ref = (l==0 ? 
+ num_ref_idx_l0_active-1 : + num_ref_idx_l1_active-1); + + if (false) { // do not show these flags + for (int i=0;i<=num_ref;i++) { + LOG3("luma_weight_flag_l%d[%d] : %d\n",l,i,luma_weight_flag[l][i]); + } + + if (sps->chroma_format_idc != 0) { + for (int i=0;i<=num_ref;i++) { + LOG3("chroma_weight_flag_l%d[%d] : %d\n",l,i,chroma_weight_flag[l][i]); + } + } + } + + for (int i=0;i<=num_ref;i++) { + LOG3("LumaWeight_L%d[%d] : %d\n",l,i,LumaWeight[l][i]); + LOG3("luma_offset_l%d[%d] : %d\n",l,i,luma_offset[l][i]); + + for (int j=0;j<2;j++) { + LOG4("ChromaWeight_L%d[%d][%d] : %d\n",l,i,j,ChromaWeight[l][i][j]); + LOG4("ChromaOffset_L%d[%d][%d] : %d\n",l,i,j,ChromaOffset[l][i][j]); + } + } + } + } + + LOG1("five_minus_max_num_merge_cand : %d\n", five_minus_max_num_merge_cand); + } + + + LOG1("slice_qp_delta : %d\n", slice_qp_delta); + if (pps->pps_slice_chroma_qp_offsets_present_flag) { + LOG1("slice_cb_qp_offset : %d\n", slice_cb_qp_offset); + LOG1("slice_cr_qp_offset : %d\n", slice_cr_qp_offset); + } + + if (pps->deblocking_filter_override_enabled_flag) { + LOG1("deblocking_filter_override_flag : %d\n", deblocking_filter_override_flag); + } + + LOG2("slice_deblocking_filter_disabled_flag : %d %s\n", + slice_deblocking_filter_disabled_flag, + (deblocking_filter_override_flag ? 
"(override)" : "(from pps)")); + + if (deblocking_filter_override_flag) { + + if (!slice_deblocking_filter_disabled_flag) { + LOG1("slice_beta_offset : %d\n", slice_beta_offset); + LOG1("slice_tc_offset : %d\n", slice_tc_offset); + } + } + + if (pps->pps_loop_filter_across_slices_enabled_flag && + (slice_sao_luma_flag || slice_sao_chroma_flag || + !slice_deblocking_filter_disabled_flag)) { + LOG1("slice_loop_filter_across_slices_enabled_flag : %d\n", + slice_loop_filter_across_slices_enabled_flag); + } + } + + if (pps->tiles_enabled_flag || pps->entropy_coding_sync_enabled_flag) { + LOG1("num_entry_point_offsets : %d\n", num_entry_point_offsets); + + if (num_entry_point_offsets > 0) { + LOG1("offset_len : %d\n", offset_len); + + for (int i=0; ishdr->SliceQPY; + const int initType = tctx->shdr->initType; + assert(initType >= 0 && initType <= 2); + + tctx->ctx_model.init(initType, QPY); + + for (int i=0;i<4;i++) { + tctx->StatCoeff[i] = 0; + } +} + + + +static int decode_transform_skip_flag(thread_context* tctx, int cIdx) +{ + const int context = (cIdx==0) ? 
0 : 1; + + logtrace(LogSlice,"# transform_skip_flag (context=%d)\n",context); + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_TRANSFORM_SKIP_FLAG+context]); + + logtrace(LogSymbols,"$1 transform_skip_flag=%d\n",bit); + + return bit; +} + + +static int decode_sao_merge_flag(thread_context* tctx) +{ + logtrace(LogSlice,"# sao_merge_left/up_flag\n"); + int bit = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_SAO_MERGE_FLAG]); + + logtrace(LogSymbols,"$1 sao_merge_flag=%d\n",bit); + + return bit; +} + + + +static int decode_sao_type_idx(thread_context* tctx) +{ + logtrace(LogSlice,"# sao_type_idx_luma/chroma\n"); + + int bit0 = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_SAO_TYPE_IDX]); + + if (bit0==0) { + logtrace(LogSymbols,"$1 sao_type_idx=%d\n",0); + return 0; + } + else { + int bit1 = decode_CABAC_bypass(&tctx->cabac_decoder); + if (bit1==0) { + logtrace(LogSymbols,"$1 sao_type_idx=%d\n",1); + return 1; + } + else { + logtrace(LogSymbols,"$1 sao_type_idx=%d\n",2); + return 2; + } + } +} + + +static int decode_sao_offset_abs(thread_context* tctx, int bitDepth) +{ + logtrace(LogSlice,"# sao_offset_abs\n"); + int cMax = (1<<(libde265_min(bitDepth,10)-5))-1; + int value = decode_CABAC_TU_bypass(&tctx->cabac_decoder, cMax); + logtrace(LogSymbols,"$1 sao_offset_abs=%d\n",value); + return value; +} + + +static int decode_sao_class(thread_context* tctx) +{ + logtrace(LogSlice,"# sao_class\n"); + int value = decode_CABAC_FL_bypass(&tctx->cabac_decoder, 2); + logtrace(LogSymbols,"$1 sao_class=%d\n",value); + return value; +} + + +static int decode_sao_offset_sign(thread_context* tctx) +{ + logtrace(LogSlice,"# sao_offset_sign\n"); + int value = decode_CABAC_bypass(&tctx->cabac_decoder); + logtrace(LogSymbols,"$1 sao_offset_sign=%d\n",value); + return value; +} + + +static int decode_sao_band_position(thread_context* tctx) +{ + logtrace(LogSlice,"# sao_band_position\n"); + int value = 
decode_CABAC_FL_bypass(&tctx->cabac_decoder,5); + logtrace(LogSymbols,"$1 sao_band_position=%d\n",value); + return value; +} + + +static int decode_transquant_bypass_flag(thread_context* tctx) +{ + logtrace(LogSlice,"# cu_transquant_bypass_enable_flag\n"); + int value = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_CU_TRANSQUANT_BYPASS_FLAG]); + logtrace(LogSymbols,"$1 transquant_bypass_flag=%d\n",value); + return value; +} + + +#include +#include + +static int decode_split_cu_flag(thread_context* tctx, + int x0, int y0, int ctDepth) +{ + // check if neighbors are available + + int availableL = check_CTB_available(tctx->img, x0,y0, x0-1,y0); + int availableA = check_CTB_available(tctx->img, x0,y0, x0,y0-1); + + int condL = 0; + int condA = 0; + + if (availableL && tctx->img->get_ctDepth(x0-1,y0) > ctDepth) condL=1; + if (availableA && tctx->img->get_ctDepth(x0,y0-1) > ctDepth) condA=1; + + int contextOffset = condL + condA; + int context = contextOffset; + + // decode bit + + logtrace(LogSlice,"# split_cu_flag context=%d R=%x\n", context, tctx->cabac_decoder.range); + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_SPLIT_CU_FLAG + context]); + + logtrace(LogSlice,"> split_cu_flag R=%x, ctx=%d, bit=%d\n", tctx->cabac_decoder.range,context,bit); + + logtrace(LogSymbols,"$1 split_cu_flag=%d\n",bit); + + return bit; +} + + +static int decode_cu_skip_flag(thread_context* tctx, + int x0, int y0, int ctDepth) +{ + decoder_context* ctx = tctx->decctx; + + // check if neighbors are available + + int availableL = check_CTB_available(tctx->img, x0,y0, x0-1,y0); + int availableA = check_CTB_available(tctx->img, x0,y0, x0,y0-1); + + int condL = 0; + int condA = 0; + + if (availableL && tctx->img->get_cu_skip_flag(x0-1,y0)) condL=1; + if (availableA && tctx->img->get_cu_skip_flag(x0,y0-1)) condA=1; + + int contextOffset = condL + condA; + int context = contextOffset; + + // decode bit + + logtrace(LogSlice,"# 
cu_skip_flag context=%d R=%x\n", context, tctx->cabac_decoder.range); + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_CU_SKIP_FLAG + context]); + + logtrace(LogSlice,"> cu_skip_flag R=%x, ctx=%d, bit=%d\n", tctx->cabac_decoder.range,context,bit); + + logtrace(LogSymbols,"$1 cu_skip_flag=%d\n",bit); + + return bit; +} + + +static enum PartMode decode_part_mode(thread_context* tctx, + enum PredMode pred_mode, int cLog2CbSize) +{ + de265_image* img = tctx->img; + + if (pred_mode == MODE_INTRA) { + logtrace(LogSlice,"# part_mode (INTRA)\n"); + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_PART_MODE]); + + logtrace(LogSlice,"> %s\n",bit ? "2Nx2N" : "NxN"); + + logtrace(LogSymbols,"$1 part_mode=%d\n",bit ? PART_2Nx2N : PART_NxN); + + return bit ? PART_2Nx2N : PART_NxN; + } + else { + const seq_parameter_set& sps = img->get_sps(); + + int bit0 = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_PART_MODE+0]); + if (bit0) { logtrace(LogSymbols,"$1 part_mode=%d\n",PART_2Nx2N); return PART_2Nx2N; } + + // CHECK_ME: I optimize code and fix bug here, need more VERIFY! + int bit1 = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_PART_MODE+1]); + if (cLog2CbSize > sps.Log2MinCbSizeY) { + if (!sps.amp_enabled_flag) { + logtrace(LogSymbols,"$1 part_mode=%d\n",bit1 ? PART_2NxN : PART_Nx2N); + return bit1 ? PART_2NxN : PART_Nx2N; + } + else { + int bit3 = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_PART_MODE+3]); + if (bit3) { + logtrace(LogSymbols,"$1 part_mode=%d\n",bit1 ? PART_2NxN : PART_Nx2N); + return bit1 ? 
PART_2NxN : PART_Nx2N; + } + + int bit4 = decode_CABAC_bypass(&tctx->cabac_decoder); + if ( bit1 && bit4) { + logtrace(LogSymbols,"$1 part_mode=%d\n",PART_2NxnD); + return PART_2NxnD; + } + if ( bit1 && !bit4) { + logtrace(LogSymbols,"$1 part_mode=%d\n",PART_2NxnU); + return PART_2NxnU; + } + if (!bit1 && !bit4) { + logtrace(LogSymbols,"$1 part_mode=%d\n",PART_nLx2N); + return PART_nLx2N; + } + if (!bit1 && bit4) { + logtrace(LogSymbols,"$1 part_mode=%d\n",PART_nRx2N); + return PART_nRx2N; + } + } + } + else { + // TODO, we could save one if here when first decoding the next bin and then + // checkcLog2CbSize==3 when it is '0' + + if (bit1) { + logtrace(LogSymbols,"$1 part_mode=%d\n",PART_2NxN); + return PART_2NxN; + } + + if (cLog2CbSize==3) { + logtrace(LogSymbols,"$1 part_mode=%d\n",PART_Nx2N); + return PART_Nx2N; + } + else { + int bit2 = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_PART_MODE+2]); + logtrace(LogSymbols,"$1 part_mode=%d\n",PART_NxN-bit2); + return (enum PartMode)((int)PART_NxN - bit2)/*bit2 ? 
PART_Nx2N : PART_NxN*/; + } + } + } + + assert(false); // should never be reached + return PART_2Nx2N; +} + + +static inline int decode_prev_intra_luma_pred_flag(thread_context* tctx) +{ + logtrace(LogSlice,"# prev_intra_luma_pred_flag\n"); + int bit = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_PREV_INTRA_LUMA_PRED_FLAG]); + logtrace(LogSymbols,"$1 prev_intra_luma_pred_flag=%d\n",bit); + return bit; +} + + +static inline int decode_mpm_idx(thread_context* tctx) +{ + logtrace(LogSlice,"# mpm_idx (TU:2)\n"); + int mpm = decode_CABAC_TU_bypass(&tctx->cabac_decoder, 2); + logtrace(LogSlice,"> mpm_idx = %d\n",mpm); + logtrace(LogSymbols,"$1 mpm_idx=%d\n",mpm); + return mpm; +} + + +static inline int decode_rem_intra_luma_pred_mode(thread_context* tctx) +{ + logtrace(LogSlice,"# rem_intra_luma_pred_mode (5 bits)\n"); + int value = decode_CABAC_FL_bypass(&tctx->cabac_decoder, 5); + logtrace(LogSymbols,"$1 rem_intra_luma_pred_mode=%d\n",value); + return value; +} + + +static int decode_intra_chroma_pred_mode(thread_context* tctx) +{ + logtrace(LogSlice,"# intra_chroma_pred_mode\n"); + + int prefix = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_INTRA_CHROMA_PRED_MODE]); + + int mode; + if (prefix==0) { + mode=4; + } + else { + mode = decode_CABAC_FL_bypass(&tctx->cabac_decoder, 2); + } + + logtrace(LogSlice,"> intra_chroma_pred_mode = %d\n",mode); + logtrace(LogSymbols,"$1 intra_chroma_pred_mode=%d\n",mode); + + return mode; +} + + +static int decode_split_transform_flag(thread_context* tctx, + int log2TrafoSize) +{ + logtrace(LogSlice,"# split_transform_flag (log2TrafoSize=%d)\n",log2TrafoSize); + + int context = 5-log2TrafoSize; + assert(context >= 0 && context <= 2); + + logtrace(LogSlice,"# context: %d\n",context); + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_SPLIT_TRANSFORM_FLAG + context]); + logtrace(LogSymbols,"$1 split_transform_flag=%d\n",bit); + return bit; +} + + +static 
int decode_cbf_chroma(thread_context* tctx, + int trafoDepth) +{ + logtrace(LogSlice,"# cbf_chroma\n"); + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_CBF_CHROMA + trafoDepth]); + + logtrace(LogSymbols,"$1 cbf_chroma=%d\n",bit); + return bit; +} + + +static int decode_cbf_luma(thread_context* tctx, + int trafoDepth) +{ + logtrace(LogSlice,"# cbf_luma\n"); + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_CBF_LUMA + (trafoDepth==0)]); + + logtrace(LogSlice,"> cbf_luma = %d\n",bit); + + logtrace(LogSymbols,"$1 cbf_luma=%d\n",bit); + return bit; +} + + +static inline int decode_coded_sub_block_flag(thread_context* tctx, + int cIdx, + uint8_t coded_sub_block_neighbors) +{ + logtrace(LogSlice,"# coded_sub_block_flag\n"); + + // tricky computation of csbfCtx + int csbfCtx = ((coded_sub_block_neighbors & 1) | // right neighbor set or + (coded_sub_block_neighbors >> 1)); // bottom neighbor set -> csbfCtx=1 + + int ctxIdxInc = csbfCtx; + if (cIdx!=0) { + ctxIdxInc += 2; + } + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_CODED_SUB_BLOCK_FLAG + ctxIdxInc]); + + logtrace(LogSymbols,"$1 coded_sub_block_flag=%d\n",bit); + return bit; +} + + +static int decode_cu_qp_delta_abs(thread_context* tctx) +{ + logtrace(LogSlice,"# cu_qp_delta_abs\n"); + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_CU_QP_DELTA_ABS + 0]); + if (bit==0) { + logtrace(LogSymbols,"$1 cu_qp_delta_abs=%d\n",0); + return 0; + } + + int prefix=1; + for (int i=0;i<4;i++) { + bit = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_CU_QP_DELTA_ABS + 1]); + if (bit==0) { break; } + else { prefix++; } + } + + if (prefix==5) { + int value = decode_CABAC_EGk_bypass(&tctx->cabac_decoder, 0); + logtrace(LogSymbols,"$1 cu_qp_delta_abs=%d\n",value+5); + return value + 5; + } + else { + logtrace(LogSymbols,"$1 cu_qp_delta_abs=%d\n",prefix); + return prefix; 
+ } +} + + +static int decode_last_significant_coeff_prefix(thread_context* tctx, + int log2TrafoSize, + int cIdx, + context_model* model) +{ + logtrace(LogSlice,"# last_significant_coeff_prefix log2TrafoSize:%d cIdx:%d\n",log2TrafoSize,cIdx); + + int cMax = (log2TrafoSize<<1)-1; + + int ctxOffset, ctxShift; + if (cIdx==0) { + ctxOffset = 3*(log2TrafoSize-2) + ((log2TrafoSize-1)>>2); + ctxShift = (log2TrafoSize+1)>>2; + } + else { + ctxOffset = 15; + ctxShift = log2TrafoSize-2; + } + + int binIdx; + int value = cMax; + for (binIdx=0;binIdx> ctxShift); + + logtrace(LogSlice,"context: %d+%d\n",ctxOffset,ctxIdxInc); + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, &model[ctxOffset + ctxIdxInc]); + if (bit==0) { + value=binIdx; + break; + } + } + + logtrace(LogSlice,"> last_significant_coeff_prefix: %d\n", value); + + return value; +} + + +static const uint8_t ctxIdxMap[16] = { + 0,1,4,5, + 2,3,4,5, + 6,6,8,8, + 7,7,8,99 +}; + +uint8_t* ctxIdxLookup[4 /* 4-log2-32 */][2 /* !!cIdx */][2 /* !!scanIdx */][4 /* prevCsbf */]; + +bool alloc_and_init_significant_coeff_ctxIdx_lookupTable() +{ + int tableSize = 4*4*(2) + 8*8*(2*2*4) + 16*16*(2*4) + 32*32*(2*4); + + uint8_t* p = (uint8_t*)malloc(tableSize); + if (p==NULL) { + return false; + } + + memset(p,0xFF,tableSize); // just for debugging + + + // --- Set pointers to memory areas. Note that some parameters share the same memory. 
--- + + // 4x4 + + for (int cIdx=0;cIdx<2;cIdx++) { + for (int scanIdx=0;scanIdx<2;scanIdx++) + for (int prevCsbf=0;prevCsbf<4;prevCsbf++) + ctxIdxLookup[0][cIdx][scanIdx][prevCsbf] = p; + + p += 4*4; + } + + // 8x8 + + for (int cIdx=0;cIdx<2;cIdx++) + for (int scanIdx=0;scanIdx<2;scanIdx++) + for (int prevCsbf=0;prevCsbf<4;prevCsbf++) { + ctxIdxLookup[1][cIdx][scanIdx][prevCsbf] = p; + p += 8*8; + } + + // 16x16 + + for (int cIdx=0;cIdx<2;cIdx++) + for (int prevCsbf=0;prevCsbf<4;prevCsbf++) { + for (int scanIdx=0;scanIdx<2;scanIdx++) { + ctxIdxLookup[2][cIdx][scanIdx][prevCsbf] = p; + } + + p += 16*16; + } + + // 32x32 + + for (int cIdx=0;cIdx<2;cIdx++) + for (int prevCsbf=0;prevCsbf<4;prevCsbf++) { + for (int scanIdx=0;scanIdx<2;scanIdx++) { + ctxIdxLookup[3][cIdx][scanIdx][prevCsbf] = p; + } + + p += 32*32; + } + + + // --- precompute ctxIdx tables --- + + for (int log2w=2; log2w<=5 ; log2w++) + for (int cIdx=0;cIdx<2;cIdx++) + for (int scanIdx=0;scanIdx<2;scanIdx++) + for (int prevCsbf=0;prevCsbf<4;prevCsbf++) + { + for (int yC=0;yC<(1<>2; + + int sigCtx; + + // if log2TrafoSize==2 + if (sbWidth==1) { + sigCtx = ctxIdxMap[(yC<<2) + xC]; + } + else if (xC+yC==0) { + sigCtx = 0; + } + else { + int xS = xC>>2; + int yS = yC>>2; + /* + int prevCsbf = 0; + + if (xS < sbWidth-1) { prevCsbf += coded_sub_block_flag[xS+1 +yS*sbWidth]; } + if (yS < sbWidth-1) { prevCsbf += coded_sub_block_flag[xS+(1+yS)*sbWidth]<<1; } + */ + int xP = xC & 3; + int yP = yC & 3; + + //logtrace(LogSlice,"posInSubset: %d,%d\n",xP,yP); + //logtrace(LogSlice,"prevCsbf: %d\n",prevCsbf); + + switch (prevCsbf) { + case 0: + sigCtx = (xP+yP>=3) ? 0 : (xP+yP>0) ? 1 : 2; + break; + case 1: + sigCtx = (yP==0) ? 2 : (yP==1) ? 1 : 0; + break; + case 2: + sigCtx = (xP==0) ? 2 : (xP==1) ? 
1 : 0; + break; + default: + sigCtx = 2; + break; + } + + //logtrace(LogSlice,"a) sigCtx=%d\n",sigCtx); + + if (cIdx==0) { + if (xS+yS > 0) sigCtx+=3; + + //logtrace(LogSlice,"b) sigCtx=%d\n",sigCtx); + + // if log2TrafoSize==3 + if (sbWidth==2) { // 8x8 block + sigCtx += (scanIdx==0) ? 9 : 15; + } else { + sigCtx += 21; + } + + //logtrace(LogSlice,"c) sigCtx=%d\n",sigCtx); + } + else { + // if log2TrafoSize==3 + if (sbWidth==2) { // 8x8 block + sigCtx+=9; + } + else { + sigCtx+=12; + } + } + + } + + int ctxIdxInc; + if (cIdx==0) { ctxIdxInc=sigCtx; } + else { ctxIdxInc=27+sigCtx; } + + if (ctxIdxLookup[log2w-2][cIdx][scanIdx][prevCsbf][xC+(yC<>4]; + int x0 = S.x<<2; + int y0 = S.y<<2; + + int subX = ScanOrderPos[s & 0xF].x; + int subY = ScanOrderPos[s & 0xF].y; + int xC = x0 + subX; + int yC = y0 + subY; + + + int w = 1<>2; + + int sigCtx; + + // if log2TrafoSize==2 + if (sbWidth==1) { + sigCtx = ctxIdxMap[(yC<<2) + xC]; + } + else if (xC+yC==0) { + sigCtx = 0; + } + else { + int xS = xC>>2; + int yS = yC>>2; + /* + int prevCsbf = 0; + + if (xS < sbWidth-1) { prevCsbf += coded_sub_block_flag[xS+1 +yS*sbWidth]; } + if (yS < sbWidth-1) { prevCsbf += coded_sub_block_flag[xS+(1+yS)*sbWidth]<<1; } + */ + int xP = xC & 3; + int yP = yC & 3; + + logtrace(LogSlice,"posInSubset: %d,%d\n",xP,yP); + logtrace(LogSlice,"prevCsbf: %d\n",prevCsbf); + + //printf("%d | %d %d\n",prevCsbf,xP,yP); + + switch (prevCsbf) { + case 0: + //sigCtx = (xP+yP==0) ? 2 : (xP+yP<3) ? 1 : 0; + sigCtx = (xP+yP>=3) ? 0 : (xP+yP>0) ? 1 : 2; + break; + case 1: + sigCtx = (yP==0) ? 2 : (yP==1) ? 1 : 0; + break; + case 2: + sigCtx = (xP==0) ? 2 : (xP==1) ? 1 : 0; + break; + default: + sigCtx = 2; + break; + } + + logtrace(LogSlice,"a) sigCtx=%d\n",sigCtx); + + if (cIdx==0) { + if (xS+yS > 0) sigCtx+=3; + + logtrace(LogSlice,"b) sigCtx=%d\n",sigCtx); + + // if log2TrafoSize==3 + if (sbWidth==2) { // 8x8 block + sigCtx += (scanIdx==0) ? 
9 : 15; + } else { + sigCtx += 21; + } + + logtrace(LogSlice,"c) sigCtx=%d\n",sigCtx); + } + else { + // if log2TrafoSize==3 + if (sbWidth==2) { // 8x8 block + sigCtx+=9; + } + else { + sigCtx+=12; + } + } + } + + int ctxIdxInc; + if (cIdx==0) { ctxIdxInc=sigCtx; } + else { ctxIdxInc=27+sigCtx; } + + + ctxIdxLookup[log2w-2][cIdx][scanIdx][prevCsbf][xC+(yC<>2; + int yS = yC>>2; + int prevCsbf = 0; + if (xS < sbWidth-1) { prevCsbf += coded_sub_block_flag[xS+1 +yS*sbWidth]; } + if (yS < sbWidth-1) { prevCsbf += coded_sub_block_flag[xS+(1+yS)*sbWidth]<<1; } + + int xP = xC & 3; + int yP = yC & 3; + + logtrace(LogSlice,"posInSubset: %d,%d\n",xP,yP); + logtrace(LogSlice,"prevCsbf: %d\n",prevCsbf); + + //printf("%d | %d %d\n",prevCsbf,xP,yP); + + switch (prevCsbf) { + case 0: + //sigCtx = (xP+yP==0) ? 2 : (xP+yP<3) ? 1 : 0; + sigCtx = (xP+yP>=3) ? 0 : (xP+yP>0) ? 1 : 2; + break; + case 1: + sigCtx = (yP==0) ? 2 : (yP==1) ? 1 : 0; + break; + case 2: + sigCtx = (xP==0) ? 2 : (xP==1) ? 1 : 0; + break; + default: + sigCtx = 2; + break; + } + + logtrace(LogSlice,"a) sigCtx=%d\n",sigCtx); + + if (cIdx==0) { + if (xS+yS > 0) sigCtx+=3; + + logtrace(LogSlice,"b) sigCtx=%d\n",sigCtx); + + // if log2TrafoSize==3 + if (sbWidth==2) { + sigCtx += (scanIdx==0) ? 
9 : 15; + } else { + sigCtx += 21; + } + + logtrace(LogSlice,"c) sigCtx=%d\n",sigCtx); + } + else { + // if log2TrafoSize==3 + if (sbWidth==2) { + sigCtx+=9; + } + else { + sigCtx+=12; + } + } + } + + int ctxIdxInc; + if (cIdx==0) { ctxIdxInc=sigCtx; } + else { ctxIdxInc=27+sigCtx; } + + int context = tctx->shdr->initType*42 + ctxIdxInc; + logtrace(LogSlice,"context: %d\n",context); + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_SIGNIFICANT_COEFF_FLAG + context]); + return bit; +} +#endif + + + +static inline int decode_significant_coeff_flag_lookup(thread_context* tctx, + uint8_t ctxIdxInc) +{ + logtrace(LogSlice,"# significant_coeff_flag\n"); + logtrace(LogSlice,"context: %d\n",ctxIdxInc); + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_SIGNIFICANT_COEFF_FLAG + ctxIdxInc]); + + logtrace(LogSymbols,"$1 significant_coeff_flag=%d\n",bit); + + return bit; +} + + + + + +static inline int decode_coeff_abs_level_greater1(thread_context* tctx, + int cIdx, int i, + bool firstCoeffInSubblock, + bool firstSubblock, + int lastSubblock_greater1Ctx, + int* lastInvocation_greater1Ctx, + int* lastInvocation_coeff_abs_level_greater1_flag, + int* lastInvocation_ctxSet, int c1) +{ + logtrace(LogSlice,"# coeff_abs_level_greater1\n"); + + logtrace(LogSlice," cIdx:%d i:%d firstCoeffInSB:%d firstSB:%d lastSB>1:%d last>1Ctx:%d lastLev>1:%d lastCtxSet:%d\n", cIdx,i,firstCoeffInSubblock,firstSubblock,lastSubblock_greater1Ctx, + *lastInvocation_greater1Ctx, + *lastInvocation_coeff_abs_level_greater1_flag, + *lastInvocation_ctxSet); + + int lastGreater1Ctx; + int greater1Ctx; + int ctxSet; + + logtrace(LogSlice,"c1: %d\n",c1); + + if (firstCoeffInSubblock) { + // block with real DC -> ctx 0 + if (i==0 || cIdx>0) { ctxSet=0; } + else { ctxSet=2; } + + if (firstSubblock) { lastGreater1Ctx=1; } + else { lastGreater1Ctx = lastSubblock_greater1Ctx; } + + if (lastGreater1Ctx==0) { ctxSet++; } + + 
logtrace(LogSlice,"ctxSet: %d\n",ctxSet); + + greater1Ctx=1; + } + else { // !firstCoeffInSubblock + ctxSet = *lastInvocation_ctxSet; + logtrace(LogSlice,"ctxSet (old): %d\n",ctxSet); + + greater1Ctx = *lastInvocation_greater1Ctx; + if (greater1Ctx>0) { + int lastGreater1Flag=*lastInvocation_coeff_abs_level_greater1_flag; + if (lastGreater1Flag==1) greater1Ctx=0; + else { /*if (greater1Ctx>0)*/ greater1Ctx++; } + } + } + + ctxSet = c1; // use HM algo + + int ctxIdxInc = (ctxSet*4) + (greater1Ctx>=3 ? 3 : greater1Ctx); + + if (cIdx>0) { ctxIdxInc+=16; } + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER1_FLAG + ctxIdxInc]); + + *lastInvocation_greater1Ctx = greater1Ctx; + *lastInvocation_coeff_abs_level_greater1_flag = bit; + *lastInvocation_ctxSet = ctxSet; + + //logtrace(LogSymbols,"$1 coeff_abs_level_greater1=%d\n",bit); + + return bit; +} + + +static int decode_coeff_abs_level_greater2(thread_context* tctx, + int cIdx, // int i,int n, + int ctxSet) +{ + logtrace(LogSlice,"# coeff_abs_level_greater2\n"); + + int ctxIdxInc = ctxSet; + + if (cIdx>0) ctxIdxInc+=4; + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER2_FLAG + ctxIdxInc]); + + logtrace(LogSymbols,"$1 coeff_abs_level_greater2=%d\n",bit); + + return bit; +} + + +#define MAX_PREFIX 64 + +static int decode_coeff_abs_level_remaining(thread_context* tctx, + int cRiceParam) +{ + logtrace(LogSlice,"# decode_coeff_abs_level_remaining\n"); + + int prefix=-1; + int codeword=0; + do { + prefix++; + codeword = decode_CABAC_bypass(&tctx->cabac_decoder); + + if (prefix>MAX_PREFIX) { + return 0; // TODO: error + } + } + while (codeword); + + // prefix = nb. 
1 bits + + int value; + + if (prefix <= 3) { + // when code only TR part (level < TRMax) + + codeword = decode_CABAC_FL_bypass(&tctx->cabac_decoder, cRiceParam); + value = (prefix<cabac_decoder, prefix-3+cRiceParam); + value = (((1<<(prefix-3))+3-1)<cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_MERGE_FLAG]); + + logtrace(LogSymbols,"$1 merge_flag=%d\n",bit); + + return bit; +} + + +static int decode_merge_idx(thread_context* tctx) +{ + logtrace(LogSlice,"# merge_idx\n"); + + if (tctx->shdr->MaxNumMergeCand <= 1) { + logtrace(LogSymbols,"$1 merge_idx=%d\n",0); + return 0; + } + + // TU coding, first bin is CABAC, remaining are bypass. + // cMax = MaxNumMergeCand-1 + + int idx = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_MERGE_IDX]); + + if (idx==0) { + // nothing + } + else { + idx=1; + + while (idxshdr->MaxNumMergeCand-1) { + if (decode_CABAC_bypass(&tctx->cabac_decoder)) { + idx++; + } + else { + break; + } + } + } + + logtrace(LogSlice,"> merge_idx = %d\n",idx); + logtrace(LogSymbols,"$1 merge_idx=%d\n",idx); + + return idx; +} + + +static int decode_pred_mode_flag(thread_context* tctx) +{ + logtrace(LogSlice,"# pred_mode_flag\n"); + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_PRED_MODE_FLAG]); + + logtrace(LogSymbols,"$1 pred_mode=%d\n",bit); + return bit; +} + +static int decode_mvp_lx_flag(thread_context* tctx) +{ + logtrace(LogSlice,"# mvp_lx_flag\n"); + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_MVP_LX_FLAG]); + + logtrace(LogSymbols,"$1 mvp_lx_flag=%d\n",bit); + return bit; +} + +static int decode_rqt_root_cbf(thread_context* tctx) +{ + logtrace(LogSlice,"# rqt_root_cbf\n"); + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_RQT_ROOT_CBF]); + + logtrace(LogSymbols,"$1 rqt_root_cbf=%d\n",bit); + return bit; +} + +static int decode_ref_idx_lX(thread_context* tctx, int numRefIdxLXActive) +{ + logtrace(LogSlice,"# 
ref_idx_lX\n"); + + int cMax = numRefIdxLXActive-1; + + if (cMax==0) { + logtrace(LogSlice,"> ref_idx = 0 (cMax==0)\n"); + return 0; + } // do check for single reference frame here + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_REF_IDX_LX + 0]); + + int idx=0; + + while (bit) { + idx++; + if (idx==cMax) { break; } + + if (idx==1) { + bit = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_REF_IDX_LX + 1]); + } + else { + bit = decode_CABAC_bypass(&tctx->cabac_decoder); + } + } + + logtrace(LogSlice,"> ref_idx = %d\n",idx); + + logtrace(LogSymbols,"$1 ref_idx_lX=%d\n",idx); + return idx; +} + + +static enum InterPredIdc decode_inter_pred_idc(thread_context* tctx, + int x0, int y0, + int nPbW, int nPbH, + int ctDepth) +{ + logtrace(LogSlice,"# inter_pred_idc\n"); + + int value; + + context_model* model = &tctx->ctx_model[CONTEXT_MODEL_INTER_PRED_IDC]; + + if (nPbW+nPbH==12) { + value = decode_CABAC_bit(&tctx->cabac_decoder, + &model[4]); + } + else { + int bit0 = decode_CABAC_bit(&tctx->cabac_decoder, + &model[ctDepth]); + if (bit0==0) { + value = decode_CABAC_bit(&tctx->cabac_decoder, + &model[4]); + } + else { + value = 2; + } + } + + logtrace(LogSlice,"> inter_pred_idc = %d (%s)\n",value, + value==0 ? "L0" : (value==1 ? "L1" : "BI")); + + logtrace(LogSymbols,"$1 decode_inter_pred_idx=%d\n",value+1); + + return (enum InterPredIdc) (value+1); +} + + +static int decode_explicit_rdpcm_flag(thread_context* tctx,int cIdx) +{ + context_model* model = &tctx->ctx_model[CONTEXT_MODEL_RDPCM_FLAG]; + int value = decode_CABAC_bit(&tctx->cabac_decoder, &model[cIdx ? 1 : 0]); + return value; +} + + +static int decode_explicit_rdpcm_dir(thread_context* tctx,int cIdx) +{ + context_model* model = &tctx->ctx_model[CONTEXT_MODEL_RDPCM_DIR]; + int value = decode_CABAC_bit(&tctx->cabac_decoder, &model[cIdx ? 
1 : 0]); + return value; +} + + + +/* Take CtbAddrInTS and compute + -> CtbAddrInRS, CtbX, CtbY + */ +bool setCtbAddrFromTS(thread_context* tctx) +{ + const seq_parameter_set& sps = tctx->img->get_sps(); + + if (tctx->CtbAddrInTS < sps.PicSizeInCtbsY) { + tctx->CtbAddrInRS = tctx->img->get_pps().CtbAddrTStoRS[tctx->CtbAddrInTS]; + + tctx->CtbX = tctx->CtbAddrInRS % sps.PicWidthInCtbsY; + tctx->CtbY = tctx->CtbAddrInRS / sps.PicWidthInCtbsY; + return false; + } + else { + tctx->CtbAddrInRS = sps.PicSizeInCtbsY; + + tctx->CtbX = tctx->CtbAddrInRS % sps.PicWidthInCtbsY; + tctx->CtbY = tctx->CtbAddrInRS / sps.PicWidthInCtbsY; + return true; + } +} + +// returns true when we reached the end of the image (ctbAddr==picSizeInCtbsY) +bool advanceCtbAddr(thread_context* tctx) +{ + tctx->CtbAddrInTS++; + + return setCtbAddrFromTS(tctx); +} + + +void read_sao(thread_context* tctx, int xCtb,int yCtb, + int CtbAddrInSliceSeg) +{ + slice_segment_header* shdr = tctx->shdr; + de265_image* img = tctx->img; + const seq_parameter_set& sps = img->get_sps(); + const pic_parameter_set& pps = img->get_pps(); + + logtrace(LogSlice,"# read_sao(%d,%d)\n",xCtb,yCtb); + + sao_info saoinfo; + memset(&saoinfo,0,sizeof(sao_info)); + logtrace(LogSlice,"sizeof saoinfo: %d\n",sizeof(sao_info)); + + + char sao_merge_left_flag = 0; + char sao_merge_up_flag = 0; + + if (xCtb>0) { + //char leftCtbInSliceSeg = (CtbAddrInSliceSeg>0); + char leftCtbInSliceSeg = (tctx->CtbAddrInRS > shdr->SliceAddrRS); + char leftCtbInTile = (pps.TileIdRS[xCtb + yCtb * sps.PicWidthInCtbsY] == + pps.TileIdRS[xCtb-1 + yCtb * sps.PicWidthInCtbsY]); + + if (leftCtbInSliceSeg && leftCtbInTile) { + sao_merge_left_flag = decode_sao_merge_flag(tctx); + logtrace(LogSlice,"sao_merge_left_flag: %d\n",sao_merge_left_flag); + } + } + + if (yCtb>0 && sao_merge_left_flag==0) { + logtrace(LogSlice,"CtbAddrInRS:%d PicWidthInCtbsY:%d slice_segment_address:%d\n", + tctx->CtbAddrInRS, + sps.PicWidthInCtbsY, + shdr->slice_segment_address); + 
char upCtbInSliceSeg = (tctx->CtbAddrInRS - sps.PicWidthInCtbsY) >= shdr->SliceAddrRS; + char upCtbInTile = (pps.TileIdRS[xCtb + yCtb * sps.PicWidthInCtbsY] == + pps.TileIdRS[xCtb + (yCtb-1) * sps.PicWidthInCtbsY]); + + if (upCtbInSliceSeg && upCtbInTile) { + sao_merge_up_flag = decode_sao_merge_flag(tctx); + logtrace(LogSlice,"sao_merge_up_flag: %d\n",sao_merge_up_flag); + } + } + + if (!sao_merge_up_flag && !sao_merge_left_flag) { + int nChroma = 3; + if (sps.ChromaArrayType == CHROMA_MONO) nChroma=1; + + for (int cIdx=0; cIdxslice_sao_luma_flag && cIdx==0) || + (shdr->slice_sao_chroma_flag && cIdx>0)) { + + uint8_t SaoTypeIdx = 0; + + if (cIdx==0) { + char sao_type_idx_luma = decode_sao_type_idx(tctx); + logtrace(LogSlice,"sao_type_idx_luma: %d\n", sao_type_idx_luma); + saoinfo.SaoTypeIdx = SaoTypeIdx = sao_type_idx_luma; + } + else if (cIdx==1) { + char sao_type_idx_chroma = decode_sao_type_idx(tctx); + logtrace(LogSlice,"sao_type_idx_chroma: %d\n", sao_type_idx_chroma); + SaoTypeIdx = sao_type_idx_chroma; + saoinfo.SaoTypeIdx |= SaoTypeIdx<<(2*1); + saoinfo.SaoTypeIdx |= SaoTypeIdx<<(2*2); // set for both chroma components + } + else { + // SaoTypeIdx = 0 + + SaoTypeIdx = (saoinfo.SaoTypeIdx >> (2*cIdx)) & 0x3; + } + + if (SaoTypeIdx != 0) { + for (int i=0;i<4;i++) { + saoinfo.saoOffsetVal[cIdx][i] = decode_sao_offset_abs(tctx, img->get_bit_depth(cIdx)); + logtrace(LogSlice,"saoOffsetVal[%d][%d] = %d\n",cIdx,i, saoinfo.saoOffsetVal[cIdx][i]); + } + + int sign[4]; + if (SaoTypeIdx==1) { + for (int i=0;i<4;i++) { + if (saoinfo.saoOffsetVal[cIdx][i] != 0) { + sign[i] = decode_sao_offset_sign(tctx) ? 
-1 : 1; + } + else { + sign[i] = 0; // not really required, but compiler warns about uninitialized values + } + } + + saoinfo.sao_band_position[cIdx] = decode_sao_band_position(tctx); + } + else { + uint8_t SaoEoClass = 0; + + sign[0] = sign[1] = 1; + sign[2] = sign[3] = -1; + + if (cIdx==0) { + saoinfo.SaoEoClass = SaoEoClass = decode_sao_class(tctx); + } + else if (cIdx==1) { + SaoEoClass = decode_sao_class(tctx); + saoinfo.SaoEoClass |= SaoEoClass << (2*1); + saoinfo.SaoEoClass |= SaoEoClass << (2*2); + } + + logtrace(LogSlice,"SaoEoClass[%d] = %d\n",cIdx,SaoEoClass); + } + + int log2OffsetScale; + + if (cIdx==0) { + log2OffsetScale = pps.range_extension.log2_sao_offset_scale_luma; + } + else { + log2OffsetScale = pps.range_extension.log2_sao_offset_scale_chroma; + } + + for (int i=0;i<4;i++) { + saoinfo.saoOffsetVal[cIdx][i] = sign[i]*(saoinfo.saoOffsetVal[cIdx][i] << log2OffsetScale); + } + } + } + } + + img->set_sao_info(xCtb,yCtb, &saoinfo); + } + + + if (sao_merge_left_flag) { + img->set_sao_info(xCtb,yCtb, img->get_sao_info(xCtb-1,yCtb)); + } + + if (sao_merge_up_flag) { + img->set_sao_info(xCtb,yCtb, img->get_sao_info(xCtb,yCtb-1)); + } +} + + +void read_coding_tree_unit(thread_context* tctx) +{ + slice_segment_header* shdr = tctx->shdr; + de265_image* img = tctx->img; + const seq_parameter_set& sps = img->get_sps(); + + int xCtb = (tctx->CtbAddrInRS % sps.PicWidthInCtbsY); + int yCtb = (tctx->CtbAddrInRS / sps.PicWidthInCtbsY); + int xCtbPixels = xCtb << sps.Log2CtbSizeY; + int yCtbPixels = yCtb << sps.Log2CtbSizeY; + + logtrace(LogSlice,"----- decode CTB %d;%d (%d;%d) POC=%d, SliceAddrRS=%d\n", + xCtbPixels,yCtbPixels, xCtb,yCtb, + tctx->img->PicOrderCntVal, tctx->shdr->SliceAddrRS); + + img->set_SliceAddrRS(xCtb, yCtb, tctx->shdr->SliceAddrRS); + + img->set_SliceHeaderIndex(xCtbPixels,yCtbPixels, shdr->slice_index); + + int CtbAddrInSliceSeg = tctx->CtbAddrInRS - shdr->slice_segment_address; + + if (shdr->slice_sao_luma_flag || 
shdr->slice_sao_chroma_flag) + { + read_sao(tctx, xCtb,yCtb, CtbAddrInSliceSeg); + } + + read_coding_quadtree(tctx, xCtbPixels, yCtbPixels, sps.Log2CtbSizeY, 0); +} + + +LIBDE265_INLINE static int luma_pos_to_ctbAddrRS(const seq_parameter_set* sps, int x,int y) +{ + int ctbX = x >> sps->Log2CtbSizeY; + int ctbY = y >> sps->Log2CtbSizeY; + + return ctbY * sps->PicWidthInCtbsY + ctbX; +} + + +int check_CTB_available(const de265_image* img, + int xC,int yC, int xN,int yN) +{ + // check whether neighbor is outside of frame + + if (xN < 0 || yN < 0) { return 0; } + if (xN >= img->get_sps().pic_width_in_luma_samples) { return 0; } + if (yN >= img->get_sps().pic_height_in_luma_samples) { return 0; } + + + int current_ctbAddrRS = luma_pos_to_ctbAddrRS(&img->get_sps(), xC,yC); + int neighbor_ctbAddrRS = luma_pos_to_ctbAddrRS(&img->get_sps(), xN,yN); + + // TODO: check if this is correct (6.4.1) + + if (img->get_SliceAddrRS_atCtbRS(current_ctbAddrRS) != + img->get_SliceAddrRS_atCtbRS(neighbor_ctbAddrRS)) { + return 0; + } + + // check if both CTBs are in the same tile. 
+ + if (img->get_pps().TileIdRS[current_ctbAddrRS] != + img->get_pps().TileIdRS[neighbor_ctbAddrRS]) { + return 0; + } + + return 1; +} + + +int residual_coding(thread_context* tctx, + int x0, int y0, // position of TU in frame + int log2TrafoSize, + int cIdx) +{ + logtrace(LogSlice,"- residual_coding x0:%d y0:%d log2TrafoSize:%d cIdx:%d\n",x0,y0,log2TrafoSize,cIdx); + + //slice_segment_header* shdr = tctx->shdr; + + de265_image* img = tctx->img; + const seq_parameter_set& sps = img->get_sps(); + const pic_parameter_set& pps = img->get_pps(); + + enum PredMode PredMode = img->get_pred_mode(x0,y0); + + if (cIdx==0) { + img->set_nonzero_coefficient(x0,y0,log2TrafoSize); + } + + + if (pps.transform_skip_enabled_flag && + !tctx->cu_transquant_bypass_flag && + (log2TrafoSize <= pps.Log2MaxTransformSkipSize)) + { + tctx->transform_skip_flag[cIdx] = decode_transform_skip_flag(tctx,cIdx); + } + else + { + tctx->transform_skip_flag[cIdx] = 0; + } + + + tctx->explicit_rdpcm_flag = false; + + if (PredMode == MODE_INTER && sps.range_extension.explicit_rdpcm_enabled_flag && + ( tctx->transform_skip_flag[cIdx] || tctx->cu_transquant_bypass_flag)) + { + tctx->explicit_rdpcm_flag = decode_explicit_rdpcm_flag(tctx,cIdx); + if (tctx->explicit_rdpcm_flag) { + tctx->explicit_rdpcm_dir = decode_explicit_rdpcm_dir(tctx,cIdx); + } + + //printf("EXPLICIT RDPCM %d;%d\n",x0,y0); + } + else + { + tctx->explicit_rdpcm_flag = false; + } + + + + // sbType for persistent_rice_adaptation_enabled_flag + + int sbType = (cIdx==0) ? 
2 : 0; + if (tctx->transform_skip_flag[cIdx] || tctx->cu_transquant_bypass_flag) { + sbType++; + } + + + // --- decode position of last coded coefficient --- + + int last_significant_coeff_x_prefix = + decode_last_significant_coeff_prefix(tctx,log2TrafoSize,cIdx, + &tctx->ctx_model[CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_X_PREFIX]); + + int last_significant_coeff_y_prefix = + decode_last_significant_coeff_prefix(tctx,log2TrafoSize,cIdx, + &tctx->ctx_model[CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_Y_PREFIX]); + + + // TODO: we can combine both FL-bypass calls into one, but the gain may be limited... + + int LastSignificantCoeffX; + if (last_significant_coeff_x_prefix > 3) { + int nBits = (last_significant_coeff_x_prefix>>1)-1; + int last_significant_coeff_x_suffix = decode_CABAC_FL_bypass(&tctx->cabac_decoder,nBits); + + LastSignificantCoeffX = + ((2+(last_significant_coeff_x_prefix & 1)) << nBits) + last_significant_coeff_x_suffix; + } + else { + LastSignificantCoeffX = last_significant_coeff_x_prefix; + } + + int LastSignificantCoeffY; + if (last_significant_coeff_y_prefix > 3) { + int nBits = (last_significant_coeff_y_prefix>>1)-1; + int last_significant_coeff_y_suffix = decode_CABAC_FL_bypass(&tctx->cabac_decoder,nBits); + + LastSignificantCoeffY = + ((2+(last_significant_coeff_y_prefix & 1)) << nBits) + last_significant_coeff_y_suffix; + } + else { + LastSignificantCoeffY = last_significant_coeff_y_prefix; + } + + + + // --- determine scanIdx --- + + int scanIdx; + + if (PredMode == MODE_INTRA) { + if (cIdx==0) { + scanIdx = get_intra_scan_idx(log2TrafoSize, img->get_IntraPredMode(x0,y0), cIdx, &sps); + //printf("luma scan idx=%d <- intra mode=%d\n",scanIdx, img->get_IntraPredMode(x0,y0)); + } + else { + scanIdx = get_intra_scan_idx(log2TrafoSize, img->get_IntraPredModeC(x0,y0), cIdx, &sps); + //printf("chroma scan idx=%d <- intra mode=%d chroma:%d trsize:%d\n",scanIdx, + // img->get_IntraPredModeC(x0,y0), sps->chroma_format_idc, 1<nCoeff[cIdx] = 0; + + + 
// i - subblock index + // n - coefficient index in subblock + + for (int i=lastSubBlock;i>=0;i--) { + position S = ScanOrderSub[i]; + int inferSbDcSigCoeffFlag=0; + + logtrace(LogSlice,"sub block scan idx: %d\n",i); + + + // --- check whether this sub-block is coded --- + + int sub_block_is_coded = 0; + + if ((i0)) { + sub_block_is_coded = decode_coded_sub_block_flag(tctx, cIdx, + coded_sub_block_neighbors[S.x+S.y*sbWidth]); + inferSbDcSigCoeffFlag=1; + } + else if (i==0 || i==lastSubBlock) { + // first (DC) and last sub-block are always coded + // - the first will most probably contain coefficients + // - the last obviously contains the last coded coefficient + + sub_block_is_coded = 1; + } + + if (sub_block_is_coded) { + if (S.x > 0) coded_sub_block_neighbors[S.x-1 + S.y *sbWidth] |= 1; + if (S.y > 0) coded_sub_block_neighbors[S.x + (S.y-1)*sbWidth] |= 2; + } + + + // ----- find significant coefficients in this sub-block ----- + + int16_t coeff_value[16]; + int8_t coeff_scan_pos[16]; + int8_t coeff_sign[16]; + int8_t coeff_has_max_base_level[16]; + int nCoefficients=0; + + + if (sub_block_is_coded) { + int x0 = S.x<<2; + int y0 = S.y<<2; + + int log2w = log2TrafoSize-2; + int prevCsbf = coded_sub_block_neighbors[S.x+S.y*sbWidth]; + uint8_t* ctxIdxMap = ctxIdxLookup[log2w][!!cIdx][!!scanIdx][prevCsbf]; + + logdebug(LogSlice,"log2w:%d cIdx:%d scanIdx:%d prevCsbf:%d\n", + log2w,cIdx,scanIdx,prevCsbf); + + + // set the last coded coefficient in the last subblock + + int last_coeff = (i==lastSubBlock) ? 
lastScanPos-1 : 15; + + if (i==lastSubBlock) { + coeff_value[nCoefficients] = 1; + coeff_has_max_base_level[nCoefficients] = 1; + coeff_scan_pos[nCoefficients] = lastScanPos; + nCoefficients++; + } + + + // --- decode all coefficients' significant_coeff flags except for the DC coefficient --- + + for (int n= last_coeff ; n>0 ; n--) { + int subX = ScanOrderPos[n].x; + int subY = ScanOrderPos[n].y; + xC = x0 + subX; + yC = y0 + subY; + + + // for all AC coefficients in sub-block, a significant_coeff flag is coded + + int ctxInc; + if (sps.range_extension.transform_skip_context_enabled_flag && + (tctx->cu_transquant_bypass_flag || tctx->transform_skip_flag[cIdx])) { + ctxInc = ( cIdx == 0 ) ? 42 : (16+27); + } + else { + ctxInc = ctxIdxMap[xC+(yC<=0) // last coded coefficient (always set to 1) is not the DC coefficient + { + if (inferSbDcSigCoeffFlag==0) { + // if we cannot infert the DC coefficient, it is coded + + int ctxInc; + if (sps.range_extension.transform_skip_context_enabled_flag && + (tctx->cu_transquant_bypass_flag || tctx->transform_skip_flag[cIdx])) { + ctxInc = ( cIdx == 0 ) ? 
42 : (16+27); + } + else { + ctxInc = ctxIdxMap[x0+(y0<0) { ctxSet=0; } + else { ctxSet=2; } + + if (c1==0) { ctxSet++; } + c1=1; + + + // --- decode greater-1 flags --- + + int newLastGreater1ScanPos=-1; + + int lastGreater1Coefficient = libde265_min(8,nCoefficients); + for (int c=0;c0) { + c1++; + } + } + } + + firstSubblock = false; + lastSubblock_greater1Ctx = lastInvocation_greater1Ctx; + + + // --- decode greater-2 flag --- + + if (newLastGreater1ScanPos != -1) { + int flag = decode_coeff_abs_level_greater2(tctx,cIdx, lastInvocation_ctxSet); + coeff_value[newLastGreater1ScanPos] += flag; + coeff_has_max_base_level[newLastGreater1ScanPos] = flag; + } + + + // --- decode coefficient signs --- + + int signHidden; + + + IntraPredMode predModeIntra; + if (cIdx==0) predModeIntra = img->get_IntraPredMode(x0,y0); + else predModeIntra = img->get_IntraPredModeC(x0,y0); + + + if (tctx->cu_transquant_bypass_flag || + (PredMode == MODE_INTRA && + sps.range_extension.implicit_rdpcm_enabled_flag && + tctx->transform_skip_flag[cIdx] && + ( predModeIntra == 10 || predModeIntra == 26 )) || + tctx->explicit_rdpcm_flag) + { + signHidden = 0; + } + else + { + signHidden = (coeff_scan_pos[0]-coeff_scan_pos[nCoefficients-1] > 3); + } + + + for (int n=0;ncabac_decoder); + logtrace(LogSlice,"sign[%d] = %d\n", n, coeff_sign[n]); + } + + // n==nCoefficients-1 + if (!pps.sign_data_hiding_flag || !signHidden) { + coeff_sign[nCoefficients-1] = decode_CABAC_bypass(&tctx->cabac_decoder); + logtrace(LogSlice,"sign[%d] = %d\n", nCoefficients-1, coeff_sign[nCoefficients-1]); + } + else { + coeff_sign[nCoefficients-1] = 0; + } + + + // --- decode coefficient value --- + + int sumAbsLevel=0; + int uiGoRiceParam; + + if (sps.range_extension.persistent_rice_adaptation_enabled_flag==0) { + uiGoRiceParam = 0; + } + else { + uiGoRiceParam = tctx->StatCoeff[sbType]/4; + } + + // printf("initial uiGoRiceParam=%d\n",uiGoRiceParam); + bool firstCoeffWithAbsLevelRemaining = true; + + for (int n=0;n 
3*(1<4) uiGoRiceParam=4; + } + } + else { + if (baseLevel + coeff_abs_level_remaining > 3*(1<= (3 << (tctx->StatCoeff[sbType]/4 ))) { + tctx->StatCoeff[sbType]++; + } + else if (2*coeff_abs_level_remaining < (1 << (tctx->StatCoeff[sbType]/4 )) && + tctx->StatCoeff[sbType] > 0) { + tctx->StatCoeff[sbType]--; + } + } + + firstCoeffWithAbsLevelRemaining=false; + } + else { + coeff_abs_level_remaining = 0; + } + + logtrace(LogSlice, "coeff_abs_level_remaining=%d\n",coeff_abs_level_remaining); + + + int16_t currCoeff = baseLevel + coeff_abs_level_remaining; + if (coeff_sign[n]) { + currCoeff = -currCoeff; + } + + if (pps.sign_data_hiding_flag && signHidden) { + sumAbsLevel += baseLevel + coeff_abs_level_remaining; + + if (n==nCoefficients-1 && (sumAbsLevel & 1)) { + currCoeff = -currCoeff; + } + } + + logtrace(LogSlice, "quantized coefficient=%d\n",currCoeff); + +#ifdef DE265_LOG_TRACE + //TransCoeffLevel[yC*CoeffStride + xC] = currCoeff; +#endif + + // put coefficient in list + int p = coeff_scan_pos[n]; + xC = (S.x<<2) + ScanOrderPos[p].x; + yC = (S.y<<2) + ScanOrderPos[p].y; + + tctx->coeffList[cIdx][ tctx->nCoeff[cIdx] ] = currCoeff; + tctx->coeffPos [cIdx][ tctx->nCoeff[cIdx] ] = xC + yC*CoeffStride; + tctx->nCoeff[cIdx]++; + + //printf("%d ",currCoeff); + } // iterate through coefficients in sub-block + + //printf(" (%d;%d)\n",x0,y0); + + } // if nonZero + } // next sub-block + + return DE265_OK; +} + + +static void decode_TU(thread_context* tctx, + int x0,int y0, + int xCUBase,int yCUBase, + int nT, int cIdx, enum PredMode cuPredMode, bool cbf) +{ + de265_image* img = tctx->img; + const seq_parameter_set& sps = img->get_sps(); + + int residualDpcm = 0; + + if (cuPredMode == MODE_INTRA) // if intra mode + { + enum IntraPredMode intraPredMode; + + if (cIdx==0) { + intraPredMode = img->get_IntraPredMode(x0,y0); + } + else { + const int SubWidthC = sps.SubWidthC; + const int SubHeightC = sps.SubHeightC; + + intraPredMode = 
img->get_IntraPredModeC(x0*SubWidthC,y0*SubHeightC); + } + + if (intraPredMode<0 || intraPredMode>=35) { + // TODO: ERROR + intraPredMode = INTRA_DC; + } + + decode_intra_prediction(img, x0,y0, intraPredMode, nT, cIdx); + + + residualDpcm = sps.range_extension.implicit_rdpcm_enabled_flag && + (tctx->cu_transquant_bypass_flag || tctx->transform_skip_flag[cIdx]) && + (intraPredMode == 10 || intraPredMode == 26); + + if (residualDpcm && intraPredMode == 26) + residualDpcm = 2; + } + else // INTER + { + if (tctx->explicit_rdpcm_flag) { + residualDpcm = (tctx->explicit_rdpcm_dir ? 2 : 1); + } + } + + if (cbf) { + scale_coefficients(tctx, x0,y0, xCUBase,yCUBase, nT, cIdx, + tctx->transform_skip_flag[cIdx], cuPredMode==MODE_INTRA, residualDpcm); + } + /* + else if (!cbf && cIdx==0) { + memset(tctx->residual_luma,0,32*32*sizeof(int32_t)); + } + */ + else if (!cbf && cIdx!=0 && tctx->ResScaleVal) { + // --- cross-component-prediction when CBF==0 --- + + tctx->nCoeff[cIdx] = 0; + residualDpcm=0; + + scale_coefficients(tctx, x0,y0, xCUBase,yCUBase, nT, cIdx, + tctx->transform_skip_flag[cIdx], cuPredMode==MODE_INTRA, residualDpcm); + } +} + + +static int decode_log2_res_scale_abs_plus1(thread_context* tctx, int cIdxMinus1) +{ + //const int context = (cIdx==0) ? 0 : 1; + + logtrace(LogSlice,"# log2_res_scale_abs_plus1 (c=%d)\n",cIdxMinus1); + + int value = 0; + int cMax = 4; + for (int binIdx=0;binIdxcabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_LOG2_RES_SCALE_ABS_PLUS1+ctxIdxInc]); + if (!bit) break; + value++; + } + + logtrace(LogSymbols,"$1 log2_res_scale_abs_plus1=%d\n",value); + + return value; +} + + +static int decode_res_scale_sign_flag(thread_context* tctx, int cIdxMinus1) +{ + //const int context = (cIdx==0) ? 
0 : 1; + + logtrace(LogSlice,"# res_scale_sign_flag (c=%d)\n",cIdxMinus1); + + int bit = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_RES_SCALE_SIGN_FLAG+cIdxMinus1]); + + logtrace(LogSymbols,"$1 res_scale_sign_flag=%d\n",bit); + + return bit; +} + + +static void read_cross_comp_pred(thread_context* tctx, int cIdxMinus1) +{ + int log2_res_scale_abs_plus1 = decode_log2_res_scale_abs_plus1(tctx,cIdxMinus1); + int ResScaleVal; + + if (log2_res_scale_abs_plus1 != 0) { + int res_scale_sign_flag = decode_res_scale_sign_flag(tctx,cIdxMinus1); + + ResScaleVal = 1 << (log2_res_scale_abs_plus1 - 1); + ResScaleVal *= 1 - 2 * res_scale_sign_flag; + } + else { + ResScaleVal = 0; + } + + tctx->ResScaleVal = ResScaleVal; +} + + +int read_transform_unit(thread_context* tctx, + int x0, int y0, // position of TU in frame + int xBase, int yBase, // position of parent TU in frame + int xCUBase,int yCUBase, // position of CU in frame + int log2TrafoSize, + int trafoDepth, + int blkIdx, + int cbf_luma, int cbf_cb, int cbf_cr) +{ + logtrace(LogSlice,"- read_transform_unit x0:%d y0:%d xBase:%d yBase:%d nT:%d cbf:%d:%d:%d\n", + x0,y0,xBase,yBase, 1<img->get_sps(); + + const int ChromaArrayType = sps.ChromaArrayType; + + int log2TrafoSizeC = (ChromaArrayType==CHROMA_444 ? 
log2TrafoSize : log2TrafoSize-1); + log2TrafoSizeC = libde265_max(2, log2TrafoSizeC); + + const int cbfLuma = cbf_luma; + const int cbfChroma = cbf_cb | cbf_cr; + + tctx->transform_skip_flag[0]=0; + tctx->transform_skip_flag[1]=0; + tctx->transform_skip_flag[2]=0; + + tctx->explicit_rdpcm_flag = false; + + + enum PredMode cuPredMode = tctx->img->get_pred_mode(x0,y0); + + if (cbfLuma || cbfChroma) + { + bool doDecodeQuantParameters = false; + + if (tctx->img->get_pps().cu_qp_delta_enabled_flag && + !tctx->IsCuQpDeltaCoded) { + + int cu_qp_delta_abs = decode_cu_qp_delta_abs(tctx); + int cu_qp_delta_sign=0; + if (cu_qp_delta_abs) { + cu_qp_delta_sign = decode_CABAC_bypass(&tctx->cabac_decoder); + } + + tctx->IsCuQpDeltaCoded = 1; + tctx->CuQpDelta = cu_qp_delta_abs*(1-2*cu_qp_delta_sign); + + //printf("read cu_qp_delta (%d;%d) = %d\n",x0,y0,tctx->CuQpDelta); + + logtrace(LogSlice,"cu_qp_delta_abs = %d\n",cu_qp_delta_abs); + logtrace(LogSlice,"cu_qp_delta_sign = %d\n",cu_qp_delta_sign); + logtrace(LogSlice,"CuQpDelta = %d\n",tctx->CuQpDelta); + + doDecodeQuantParameters = true; + //decode_quantization_parameters(tctx, x0,y0, xCUBase, yCUBase); + } + + if (tctx->shdr->cu_chroma_qp_offset_enabled_flag && cbfChroma && + !tctx->cu_transquant_bypass_flag && !tctx->IsCuChromaQpOffsetCoded ) { + logtrace(LogSlice,"# cu_chroma_qp_offset_flag\n"); + + int cu_chroma_qp_offset_flag = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_CU_CHROMA_QP_OFFSET_FLAG]); + + + const pic_parameter_set& pps = tctx->img->get_pps(); + + int cu_chroma_qp_offset_idx = 0; + if (cu_chroma_qp_offset_flag && pps.range_extension.chroma_qp_offset_list_len > 1) { + cu_chroma_qp_offset_idx = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_CU_CHROMA_QP_OFFSET_IDX]); + } + + tctx->IsCuChromaQpOffsetCoded = 1; + + if (cu_chroma_qp_offset_flag) { + tctx->CuQpOffsetCb = pps.range_extension.cb_qp_offset_list[ cu_chroma_qp_offset_idx ]; + tctx->CuQpOffsetCr = 
pps.range_extension.cr_qp_offset_list[ cu_chroma_qp_offset_idx ]; + } + else { + tctx->CuQpOffsetCb = 0; + tctx->CuQpOffsetCr = 0; + } + + doDecodeQuantParameters = true; + //decode_quantization_parameters(tctx, x0,y0, xCUBase, yCUBase); + } + + + if (doDecodeQuantParameters) { + decode_quantization_parameters(tctx, x0,y0, xCUBase, yCUBase); + } + } + + // position of TU in local CU + int xL = x0 - xCUBase; + int yL = y0 - yCUBase; + int nT = 1<ResScaleVal = 0; + + int err; + if (cbf_luma) { + if ((err=residual_coding(tctx,x0,y0, log2TrafoSize,0)) != DE265_OK) return err; + } + + decode_TU(tctx, x0,y0, xCUBase,yCUBase, nT, 0, cuPredMode, cbf_luma); + + + // --- chroma --- + + const int yOffset422 = 1<2 || ChromaArrayType == CHROMA_444) { + // TODO: cross-component prediction + + const bool do_cross_component_prediction = + (tctx->img->get_pps().range_extension.cross_component_prediction_enabled_flag && + cbf_luma && + (cuPredMode == MODE_INTER || tctx->img->is_IntraPredModeC_Mode4(x0,y0))); + + if (do_cross_component_prediction) { + read_cross_comp_pred(tctx, 0); + } + else { + tctx->ResScaleVal = 0; + } + + { + if (cbf_cb & 1) { + if ((err=residual_coding(tctx,x0,y0,log2TrafoSizeC,1)) != DE265_OK) return err; + } + + if (sps.ChromaArrayType != CHROMA_MONO) { + decode_TU(tctx, + x0/SubWidthC,y0/SubHeightC, + xCUBase/SubWidthC,yCUBase/SubHeightC, nTC, 1, cuPredMode, cbf_cb & 1); + } + } + + // 4:2:2 + if (ChromaArrayType == CHROMA_422) { + const int yOffset = 1<ResScaleVal = 0; + } + + { + if (cbf_cr & 1) { + if ((err=residual_coding(tctx,x0,y0,log2TrafoSizeC,2)) != DE265_OK) return err; + } + + if (sps.ChromaArrayType != CHROMA_MONO) { + decode_TU(tctx, + x0/SubWidthC,y0/SubHeightC, + xCUBase/SubWidthC,yCUBase/SubHeightC, + nTC, 2, cuPredMode, cbf_cr & 1); + } + } + + // 4:2:2 + if (ChromaArrayType == CHROMA_422) { + const int yOffset = 1<get_width(0); + int h = img->get_height(0); + + for (int y=0;yget_log2CbSize(x,y)); + } + printf("\n"); + } +} + + +void 
read_transform_tree(thread_context* tctx, + int x0, int y0, // position of TU in frame + int xBase, int yBase, // position of parent TU in frame + int xCUBase, int yCUBase, // position of CU in frame + int log2TrafoSize, + int trafoDepth, + int blkIdx, + int MaxTrafoDepth, + int IntraSplitFlag, + enum PredMode cuPredMode, + uint8_t parent_cbf_cb,uint8_t parent_cbf_cr) +{ + logtrace(LogSlice,"- read_transform_tree (interleaved) x0:%d y0:%d xBase:%d yBase:%d " + "log2TrafoSize:%d trafoDepth:%d MaxTrafoDepth:%d parent-cbf-cb:%d parent-cbf-cr:%d\n", + x0,y0,xBase,yBase,log2TrafoSize,trafoDepth,MaxTrafoDepth,parent_cbf_cb,parent_cbf_cr); + + de265_image* img = tctx->img; + const seq_parameter_set& sps = img->get_sps(); + + int split_transform_flag; + + enum PredMode PredMode = img->get_pred_mode(x0,y0); + assert(PredMode == cuPredMode); + + /* If TrafoSize is larger than maximum size -> split automatically + If TrafoSize is at minimum size -> do not split + If maximum transformation depth is reached -> do not split + If intra-prediction is NxN mode -> split automatically (only at level 0) + Otherwise -> read split flag + */ + if (log2TrafoSize <= sps.Log2MaxTrafoSize && + log2TrafoSize > sps.Log2MinTrafoSize && + trafoDepth < MaxTrafoDepth && + !(IntraSplitFlag && trafoDepth==0)) + { + split_transform_flag = decode_split_transform_flag(tctx, log2TrafoSize); + } + else + { + enum PartMode PartMode = img->get_PartMode(x0,y0); + + int interSplitFlag= (sps.max_transform_hierarchy_depth_inter==0 && + trafoDepth == 0 && + PredMode == MODE_INTER && + PartMode != PART_2Nx2N); + + split_transform_flag = (log2TrafoSize > sps.Log2MaxTrafoSize || + (IntraSplitFlag==1 && trafoDepth==0) || + interSplitFlag==1) ? 
1:0; + } + + if (split_transform_flag) { + logtrace(LogSlice,"set_split_transform_flag(%d,%d, %d)\n",x0,y0,trafoDepth); + img->set_split_transform_flag(x0,y0,trafoDepth); + } + + int cbf_cb=-1; + int cbf_cr=-1; + + // CBF_CB/CR flags are encoded like this: + // 4:2:0 and 4:4:4 modes: binary flag in bit 0 + // 4:2:2 mode: bit 0: top block, bit 1: bottom block + + if ((log2TrafoSize>2 && sps.ChromaArrayType != CHROMA_MONO) || + sps.ChromaArrayType == CHROMA_444) { + // we do not have to test for trafoDepth==0, because parent_cbf_cb is 1 at depth 0 + if (/*trafoDepth==0 ||*/ parent_cbf_cb) { + cbf_cb = decode_cbf_chroma(tctx,trafoDepth); + + if (sps.ChromaArrayType == CHROMA_422 && (!split_transform_flag || log2TrafoSize==3)) { + cbf_cb |= (decode_cbf_chroma(tctx,trafoDepth) << 1); + } + } + + // we do not have to test for trafoDepth==0, because parent_cbf_cb is 1 at depth 0 + if (/*trafoDepth==0 ||*/ parent_cbf_cr) { + cbf_cr = decode_cbf_chroma(tctx,trafoDepth); + + if (sps.ChromaArrayType == CHROMA_422 && (!split_transform_flag || log2TrafoSize==3)) { + cbf_cr |= (decode_cbf_chroma(tctx,trafoDepth) << 1); + } + } + } + + //printf("CBF: cb:%d cr:%d\n",cbf_cb,cbf_cr); + + // cbf_cr/cbf_cb not present in bitstream -> induce values + + if (cbf_cb<0) { + assert(!(trafoDepth==0 && log2TrafoSize==2)); + + /* The standard specifies to check trafoDepth>0 AND log2TrafoSize==2. + However, I think that trafoDepth>0 is redundant as a CB is always + at least 8x8 and hence trafoDepth>0. 
+ */ + + if (trafoDepth>0 && log2TrafoSize==2) { + cbf_cb = parent_cbf_cb; + } else { + cbf_cb=0; + } + } + + if (cbf_cr<0) { + if (trafoDepth>0 && log2TrafoSize==2) { + cbf_cr = parent_cbf_cr; + } else { + cbf_cr=0; + } + } + + if (split_transform_flag) { + int x1 = x0 + (1<<(log2TrafoSize-1)); + int y1 = y0 + (1<<(log2TrafoSize-1)); + + logtrace(LogSlice,"transform split.\n"); + + read_transform_tree(tctx, x0,y0, x0,y0, xCUBase,yCUBase, log2TrafoSize-1, trafoDepth+1, 0, + MaxTrafoDepth,IntraSplitFlag, cuPredMode, cbf_cb,cbf_cr); + read_transform_tree(tctx, x1,y0, x0,y0, xCUBase,yCUBase, log2TrafoSize-1, trafoDepth+1, 1, + MaxTrafoDepth,IntraSplitFlag, cuPredMode, cbf_cb,cbf_cr); + read_transform_tree(tctx, x0,y1, x0,y0, xCUBase,yCUBase, log2TrafoSize-1, trafoDepth+1, 2, + MaxTrafoDepth,IntraSplitFlag, cuPredMode, cbf_cb,cbf_cr); + read_transform_tree(tctx, x1,y1, x0,y0, xCUBase,yCUBase, log2TrafoSize-1, trafoDepth+1, 3, + MaxTrafoDepth,IntraSplitFlag, cuPredMode, cbf_cb,cbf_cr); + } + else { + int cbf_luma; + + if (PredMode==MODE_INTRA || trafoDepth!=0 || cbf_cb || cbf_cr) { + cbf_luma = decode_cbf_luma(tctx,trafoDepth); + } + else { + /* There cannot be INTER blocks with no residual data. + That case is already handled with rqt_root_cbf. 
+ */ + + cbf_luma = 1; + } + + logtrace(LogSlice,"call read_transform_unit %d/%d\n",x0,y0); + + read_transform_unit(tctx, x0,y0,xBase,yBase, xCUBase,yCUBase, log2TrafoSize,trafoDepth, blkIdx, + cbf_luma, cbf_cb, cbf_cr); + } +} + + +const char* part_mode_name(enum PartMode pm) +{ + switch (pm) { + case PART_2Nx2N: return "2Nx2N"; + case PART_2NxN: return "2NxN"; + case PART_Nx2N: return "Nx2N"; + case PART_NxN: return "NxN"; + case PART_2NxnU: return "2NxnU"; + case PART_2NxnD: return "2NxnD"; + case PART_nLx2N: return "nLx2N"; + case PART_nRx2N: return "nRx2N"; + } + + return "undefined part mode"; +} + + +void read_mvd_coding(thread_context* tctx, + int x0,int y0, int refList) +{ + int abs_mvd_greater0_flag[2]; + abs_mvd_greater0_flag[0] = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG+0]); + abs_mvd_greater0_flag[1] = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG+0]); + + int abs_mvd_greater1_flag[2]; + if (abs_mvd_greater0_flag[0]) { + abs_mvd_greater1_flag[0] = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG+1]); + } + else { + abs_mvd_greater1_flag[0]=0; + } + + if (abs_mvd_greater0_flag[1]) { + abs_mvd_greater1_flag[1] = decode_CABAC_bit(&tctx->cabac_decoder, + &tctx->ctx_model[CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG+1]); + } + else { + abs_mvd_greater1_flag[1]=0; + } + + + int abs_mvd_minus2[2]; + int mvd_sign_flag[2]; + int value[2]; + + for (int c=0;c<2;c++) { + if (abs_mvd_greater0_flag[c]) { + if (abs_mvd_greater1_flag[c]) { + abs_mvd_minus2[c] = decode_CABAC_EGk_bypass(&tctx->cabac_decoder, 1); + } + else { + abs_mvd_minus2[c] = abs_mvd_greater1_flag[c] -1; + } + + mvd_sign_flag[c] = decode_CABAC_bypass(&tctx->cabac_decoder); + + value[c] = abs_mvd_minus2[c]+2; + if (mvd_sign_flag[c]) { value[c] = -value[c]; } + } + else { + value[c] = 0; + } + } + + //set_mvd(tctx->decctx, x0,y0, refList, value[0],value[1]); + 
tctx->motion.mvd[refList][0] = value[0]; + tctx->motion.mvd[refList][1] = value[1]; + + logtrace(LogSlice, "MVD[%d;%d|%d] = %d;%d\n",x0,y0,refList, value[0],value[1]); +} + + +void read_prediction_unit_SKIP(thread_context* tctx, + int x0, int y0, + int nPbW, int nPbH) +{ + int merge_idx = decode_merge_idx(tctx); + + tctx->motion.merge_idx = merge_idx; + tctx->motion.merge_flag = true; + + logtrace(LogSlice,"prediction skip 2Nx2N, merge_idx: %d\n",merge_idx); +} + + +/* xC/yC : CB position + xB/yB : position offset of the PB + nPbW/nPbH : size of PB + nCS : CB size + */ +void read_prediction_unit(thread_context* tctx, + int xC,int yC, int xB,int yB, + int nPbW, int nPbH, + int ctDepth, int nCS,int partIdx) +{ + logtrace(LogSlice,"read_prediction_unit %d;%d %dx%d\n",xC+xB,yC+xB,nPbW,nPbH); + + int x0 = xC+xB; + int y0 = yC+yB; + + slice_segment_header* shdr = tctx->shdr; + + int merge_flag = decode_merge_flag(tctx); + tctx->motion.merge_flag = merge_flag; + + if (merge_flag) { + int merge_idx = decode_merge_idx(tctx); + + logtrace(LogSlice,"prediction unit %d,%d, merge mode, index: %d\n",x0,y0,merge_idx); + + tctx->motion.merge_idx = merge_idx; + } + else { // no merge flag + enum InterPredIdc inter_pred_idc; + + if (shdr->slice_type == SLICE_TYPE_B) { + inter_pred_idc = decode_inter_pred_idc(tctx,x0,y0,nPbW,nPbH,ctDepth); + } + else { + inter_pred_idc = PRED_L0; + } + + tctx->motion.inter_pred_idc = inter_pred_idc; // set_inter_pred_idc(ctx,x0,y0, inter_pred_idc); + + if (inter_pred_idc != PRED_L1) { + int ref_idx_l0 = decode_ref_idx_lX(tctx, shdr->num_ref_idx_l0_active); + + // NOTE: case for only one reference frame is handles in decode_ref_idx_lX() + tctx->motion.refIdx[0] = ref_idx_l0; + + read_mvd_coding(tctx,x0,y0, 0); + + int mvp_l0_flag = decode_mvp_lx_flag(tctx); // l0 + tctx->motion.mvp_l0_flag = mvp_l0_flag; + + logtrace(LogSlice,"prediction unit %d,%d, L0, refIdx=%d mvp_l0_flag:%d\n", + x0,y0, tctx->motion.refIdx[0], mvp_l0_flag); + } + + if 
(inter_pred_idc != PRED_L0) { + int ref_idx_l1 = decode_ref_idx_lX(tctx, shdr->num_ref_idx_l1_active); + + // NOTE: case for only one reference frame is handles in decode_ref_idx_lX() + tctx->motion.refIdx[1] = ref_idx_l1; + + if (shdr->mvd_l1_zero_flag && + inter_pred_idc == PRED_BI) { + tctx->motion.mvd[1][0] = 0; + tctx->motion.mvd[1][1] = 0; + } + else { + read_mvd_coding(tctx,x0,y0, 1); + } + + int mvp_l1_flag = decode_mvp_lx_flag(tctx); // l1 + tctx->motion.mvp_l1_flag = mvp_l1_flag; + + logtrace(LogSlice,"prediction unit %d,%d, L1, refIdx=%d mvp_l1_flag:%d\n", + x0,y0, tctx->motion.refIdx[1], mvp_l1_flag); + } + } + + + + decode_prediction_unit(tctx->decctx, tctx->shdr, tctx->img, tctx->motion, + xC,yC,xB,yB, nCS, nPbW,nPbH, partIdx); +} + + + + +template +void read_pcm_samples_internal(thread_context* tctx, int x0, int y0, int log2CbSize, + int cIdx, bitreader& br) +{ + const seq_parameter_set& sps = tctx->img->get_sps(); + + int nPcmBits; + int bitDepth; + + int w = 1<0) { + w /= sps.SubWidthC; + h /= sps.SubHeightC; + + x0 /= sps.SubWidthC; + y0 /= sps.SubHeightC; + + nPcmBits = sps.pcm_sample_bit_depth_chroma; + bitDepth = sps.BitDepth_C; + } + else { + nPcmBits = sps.pcm_sample_bit_depth_luma; + bitDepth = sps.BitDepth_Y; + } + + pixel_t* ptr; + int stride; + ptr = tctx->img->get_image_plane_at_pos_NEW(cIdx,x0,y0); + stride = tctx->img->get_image_stride(cIdx); + + int shift = bitDepth - nPcmBits; + + for (int y=0;ycabac_decoder.bitstream_curr; + br.bytes_remaining = tctx->cabac_decoder.bitstream_end - tctx->cabac_decoder.bitstream_curr; + br.nextbits = 0; + br.nextbits_cnt = 0; + + + if (tctx->img->high_bit_depth(0)) { + read_pcm_samples_internal(tctx,x0,y0,log2CbSize,0,br); + } else { + read_pcm_samples_internal(tctx,x0,y0,log2CbSize,0,br); + } + + if (tctx->img->get_sps().ChromaArrayType != CHROMA_MONO) { + if (tctx->img->high_bit_depth(1)) { + read_pcm_samples_internal(tctx,x0,y0,log2CbSize,1,br); + 
read_pcm_samples_internal(tctx,x0,y0,log2CbSize,2,br); + } else { + read_pcm_samples_internal(tctx,x0,y0,log2CbSize,1,br); + read_pcm_samples_internal(tctx,x0,y0,log2CbSize,2,br); + } + } + + prepare_for_CABAC(&br); + tctx->cabac_decoder.bitstream_curr = br.data; + init_CABAC_decoder_2(&tctx->cabac_decoder); +} + + +int map_chroma_pred_mode(int intra_chroma_pred_mode, int IntraPredMode) +{ + if (intra_chroma_pred_mode==4) { + return IntraPredMode; + } + else { + static const enum IntraPredMode IntraPredModeCCand[4] = { + INTRA_PLANAR, + INTRA_ANGULAR_26, // vertical + INTRA_ANGULAR_10, // horizontal + INTRA_DC + }; + + int IntraPredModeC = IntraPredModeCCand[intra_chroma_pred_mode]; + if (IntraPredModeC == IntraPredMode) { + return INTRA_ANGULAR_34; + } + else { + return IntraPredModeC; + } + } +} + +// h.265-V2 Table 8-3 +static const uint8_t map_chroma_422[35] = { + 0,1,2, 2, 2, 2, 3, 5, 7, 8,10,12,13,15,17,18,19,20, + 21,22,23,23,24,24,25,25,26,27,27,28,28,29,29,30,31 +}; + +void read_coding_unit(thread_context* tctx, + int x0, int y0, // position of coding unit in frame + int log2CbSize, + int ctDepth) +{ + de265_image* img = tctx->img; + const seq_parameter_set& sps = img->get_sps(); + const pic_parameter_set& pps = img->get_pps(); + slice_segment_header* shdr = tctx->shdr; + + logtrace(LogSlice,"- read_coding_unit %d;%d cbsize:%d\n",x0,y0,1<set_log2CbSize(x0,y0, log2CbSize, true); + + /* This is only required on corrupted input streams. + It may happen that there are several slices in the image that overlap. + In this case, flags would accumulate from both slices. 
+ */ + img->clear_split_transform_flags(x0,y0, log2CbSize); + + int nCbS = 1<cu_transquant_bypass_flag = transquant_bypass; + + if (transquant_bypass) { + img->set_cu_transquant_bypass(x0,y0,log2CbSize); + } + } + else { + tctx->cu_transquant_bypass_flag = 0; + } + + uint8_t cu_skip_flag = 0; + if (shdr->slice_type != SLICE_TYPE_I) { + cu_skip_flag = decode_cu_skip_flag(tctx,x0,y0,ctDepth); + } + + int IntraSplitFlag = 0; + + enum PredMode cuPredMode; + + if (cu_skip_flag) { + read_prediction_unit_SKIP(tctx,x0,y0,nCbS,nCbS); + + img->set_PartMode(x0,y0, PART_2Nx2N); // need this for deblocking filter + img->set_pred_mode(x0,y0,log2CbSize, MODE_SKIP); + cuPredMode = MODE_SKIP; + + logtrace(LogSlice,"CU pred mode: SKIP\n"); + + + // DECODE + + int nCS_L = 1<decctx,tctx->shdr,tctx->img,tctx->motion, + x0,y0, 0,0, nCS_L, nCS_L,nCS_L, 0); + } + else /* not skipped */ { + if (shdr->slice_type != SLICE_TYPE_I) { + int pred_mode_flag = decode_pred_mode_flag(tctx); + cuPredMode = pred_mode_flag ? MODE_INTRA : MODE_INTER; + } + else { + cuPredMode = MODE_INTRA; + } + + img->set_pred_mode(x0,y0,log2CbSize, cuPredMode); + + logtrace(LogSlice,"CU pred mode: %s\n", cuPredMode==MODE_INTRA ? "INTRA" : "INTER"); + + + enum PartMode PartMode; + + if (cuPredMode != MODE_INTRA || + log2CbSize == sps.Log2MinCbSizeY) { + PartMode = decode_part_mode(tctx, cuPredMode, log2CbSize); + + if (PartMode==PART_NxN && cuPredMode==MODE_INTRA) { + IntraSplitFlag=1; + } + } else { + PartMode = PART_2Nx2N; + } + + img->set_PartMode(x0,y0, PartMode); // needed for deblocking ? 
+ + logtrace(LogSlice, "PartMode: %s\n", part_mode_name(PartMode)); + + + bool pcm_flag = false; + + if (cuPredMode == MODE_INTRA) { + if (PartMode == PART_2Nx2N && sps.pcm_enabled_flag && + log2CbSize >= sps.Log2MinIpcmCbSizeY && + log2CbSize <= sps.Log2MaxIpcmCbSizeY) { + pcm_flag = decode_CABAC_term_bit(&tctx->cabac_decoder); + } + + if (pcm_flag) { + img->set_pcm_flag(x0,y0,log2CbSize); + + read_pcm_samples(tctx, x0,y0, log2CbSize); + } + else { + int pbOffset = (PartMode == PART_NxN) ? (nCbS/2) : nCbS; + int log2IntraPredSize = (PartMode == PART_NxN) ? (log2CbSize-1) : log2CbSize; + + logtrace(LogSlice,"nCbS:%d pbOffset:%d\n",nCbS,pbOffset); + + int prev_intra_luma_pred_flag[4]; + + int idx=0; + for (int j=0;j0); // left candidate always available for right blk + int availableB = availableB0 || (j>0); // top candidate always available for bottom blk + + + + int PUidx = (x>>sps.Log2MinPUSize) + (y>>sps.Log2MinPUSize)*sps.PicWidthInMinPUs; + + enum IntraPredMode candModeList[3]; + + fillIntraPredModeCandidates(candModeList,x,y,PUidx, + availableA, availableB, img); + + for (int i=0;i<3;i++) + logtrace(LogSlice,"candModeList[%d] = %d\n", i, candModeList[i]); + + if (prev_intra_luma_pred_flag[idx]==1) { + IntraPredMode = candModeList[ mpm_idx[idx] ]; + } + else { + // sort candModeList + + if (candModeList[0] > candModeList[1]) { + std::swap(candModeList[0],candModeList[1]); + } + if (candModeList[0] > candModeList[2]) { + std::swap(candModeList[0],candModeList[2]); + } + if (candModeList[1] > candModeList[2]) { + std::swap(candModeList[1],candModeList[2]); + } + + // skip modes in the list + // (we have 35 modes. 
skipping the 3 in the list gives us 32, which can be selected by 5 bits) + IntraPredMode = rem_intra_luma_pred_mode[idx]; + for (int n=0;n<=2;n++) { + if (IntraPredMode >= candModeList[n]) { IntraPredMode++; } + } + } + + logtrace(LogSlice,"IntraPredMode[%d][%d] = %d (log2blk:%d)\n",x,y,IntraPredMode, log2IntraPredSize); + + img->set_IntraPredMode(PUidx, log2IntraPredSize, + (enum IntraPredMode)IntraPredMode); + + idx++; + } + + + // set chroma intra prediction mode + + if (sps.ChromaArrayType == CHROMA_444) { + // chroma 4:4:4 + + idx = 0; + for (int j=0;jget_IntraPredMode(x,y); + + int IntraPredModeC = map_chroma_pred_mode(intra_chroma_pred_mode, IntraPredMode); + + logtrace(LogSlice,"IntraPredModeC[%d][%d]: %d (blksize:%d)\n",x,y,IntraPredModeC, + 1<set_IntraPredModeC(x,y, log2IntraPredSize, + (enum IntraPredMode)IntraPredModeC, + intra_chroma_pred_mode == 4); + idx++; + } + } + else if (sps.ChromaArrayType != CHROMA_MONO) { + // chroma 4:2:0 and 4:2:2 + + int intra_chroma_pred_mode = decode_intra_chroma_pred_mode(tctx); + int IntraPredMode = img->get_IntraPredMode(x0,y0); + logtrace(LogSlice,"IntraPredMode: %d\n",IntraPredMode); + int IntraPredModeC = map_chroma_pred_mode(intra_chroma_pred_mode, IntraPredMode); + + if (sps.ChromaArrayType == CHROMA_422) { + IntraPredModeC = map_chroma_422[ IntraPredModeC ]; + } + + img->set_IntraPredModeC(x0,y0, log2CbSize, + (enum IntraPredMode)IntraPredModeC, + intra_chroma_pred_mode == 4); + } + } + } + else { // INTER + int nCS = 1<motion.merge_flag; // !!get_merge_flag(ctx,x0,y0); + + if (cuPredMode != MODE_INTRA && + !(PartMode == PART_2Nx2N && merge_flag)) { + + rqt_root_cbf = !!decode_rqt_root_cbf(tctx); + } + else { + /* rqt_root_cbf=1 is inferred for Inter blocks with 2Nx2N, merge mode. + These must be some residual data, because otherwise, the CB could + also be coded in SKIP mode. 
+ */ + + rqt_root_cbf = true; + } + + //set_rqt_root_cbf(ctx,x0,y0, log2CbSize, rqt_root_cbf); + + if (rqt_root_cbf) { + int MaxTrafoDepth; + + if (cuPredMode==MODE_INTRA) { + MaxTrafoDepth = sps.max_transform_hierarchy_depth_intra + IntraSplitFlag; + } + else { + MaxTrafoDepth = sps.max_transform_hierarchy_depth_inter; + } + + logtrace(LogSlice,"MaxTrafoDepth: %d\n",MaxTrafoDepth); + + uint8_t initial_chroma_cbf = 1; + if (sps.ChromaArrayType == CHROMA_MONO) { + initial_chroma_cbf = 0; + } + + read_transform_tree(tctx, x0,y0, x0,y0, x0,y0, log2CbSize, 0,0, + MaxTrafoDepth, IntraSplitFlag, cuPredMode, + initial_chroma_cbf, initial_chroma_cbf); + } + } // !pcm + } +} + + +// ------------------------------------------------------------------------------------------ + + +void read_coding_quadtree(thread_context* tctx, + int x0, int y0, + int log2CbSize, + int ctDepth) +{ + logtrace(LogSlice,"- read_coding_quadtree %d;%d cbsize:%d depth:%d POC:%d\n",x0,y0,1<img->PicOrderCntVal); + + de265_image* img = tctx->img; + const seq_parameter_set& sps = img->get_sps(); + const pic_parameter_set& pps = img->get_pps(); + + int split_flag; + + // We only send a split flag if CU is larger than minimum size and + // completely contained within the image area. + // If it is partly outside the image area and not at minimum size, + // it is split. If already at minimum size, it is not split further. + if (x0+(1< sps.Log2MinCbSizeY) { + split_flag = decode_split_cu_flag(tctx, x0,y0, ctDepth); + } else { + if (log2CbSize > sps.Log2MinCbSizeY) { split_flag=1; } + else { split_flag=0; } + } + + + if (pps.cu_qp_delta_enabled_flag && + log2CbSize >= pps.Log2MinCuQpDeltaSize) + { + tctx->IsCuQpDeltaCoded = 0; + tctx->CuQpDelta = 0; + } + else + { + // shdr->CuQpDelta = 0; // TODO check: is this the right place to set to default value ? 
+ } + + + if (tctx->shdr->cu_chroma_qp_offset_enabled_flag && + log2CbSize >= pps.Log2MinCuChromaQpOffsetSize) { + tctx->IsCuChromaQpOffsetCoded = 0; + } + + if (split_flag) { + int x1 = x0 + (1<<(log2CbSize-1)); + int y1 = y0 + (1<<(log2CbSize-1)); + + read_coding_quadtree(tctx,x0,y0, log2CbSize-1, ctDepth+1); + + if (x1set_ctDepth(x0,y0, log2CbSize, ctDepth); + + read_coding_unit(tctx, x0,y0, log2CbSize, ctDepth); + } + + logtrace(LogSlice,"-\n"); +} + + +// --------------------------------------------------------------------------- + +enum DecodeResult { + Decode_EndOfSliceSegment, + Decode_EndOfSubstream, + Decode_Error +}; + +/* Decode CTBs until the end of sub-stream, the end-of-slice, or some error occurs. + */ +enum DecodeResult decode_substream(thread_context* tctx, + bool block_wpp, // block on WPP dependencies + bool first_independent_substream) +{ + const pic_parameter_set& pps = tctx->img->get_pps(); + const seq_parameter_set& sps = tctx->img->get_sps(); + + const int ctbW = sps.PicWidthInCtbsY; + + + const int startCtbY = tctx->CtbY; + + //printf("start decoding substream at %d;%d\n",tctx->CtbX,tctx->CtbY); + + // in WPP mode: initialize CABAC model with stored model from row above + + if ((!first_independent_substream || tctx->CtbY != startCtbY) && + pps.entropy_coding_sync_enabled_flag && + tctx->CtbY>=1 && tctx->CtbX==0) + { + if (sps.PicWidthInCtbsY>1) { + if ((tctx->CtbY-1) >= tctx->imgunit->ctx_models.size()) { + return Decode_Error; + } + + //printf("CTX wait on %d/%d\n",1,tctx->CtbY-1); + + // we have to wait until the context model data is there + tctx->img->wait_for_progress(tctx->task, 1,tctx->CtbY-1,CTB_PROGRESS_PREFILTER); + + // copy CABAC model from previous CTB row + tctx->ctx_model = tctx->imgunit->ctx_models[(tctx->CtbY-1)]; + tctx->imgunit->ctx_models[(tctx->CtbY-1)].release(); // not used anymore + } + else { + tctx->img->wait_for_progress(tctx->task, 0,tctx->CtbY-1,CTB_PROGRESS_PREFILTER); + initialize_CABAC_models(tctx); + } + } 
+ + + do { + const int ctbx = tctx->CtbX; + const int ctby = tctx->CtbY; + + if (ctbx+ctby*ctbW >= pps.CtbAddrRStoTS.size()) { + return Decode_Error; + } + + if (ctbx >= sps.PicWidthInCtbsY || + ctby >= sps.PicHeightInCtbsY) { + return Decode_Error; + } + + if (block_wpp && ctby>0 && ctbx < ctbW-1) { + + // TODO: if we are in tiles mode and at the right border, do not wait for x+1,y-1 + + //printf("wait on %d/%d (%d)\n",ctbx+1,ctby-1, ctbx+1+(ctby-1)*sps->PicWidthInCtbsY); + + tctx->img->wait_for_progress(tctx->task, ctbx+1,ctby-1, CTB_PROGRESS_PREFILTER); + } + + //printf("%p: decode %d;%d\n", tctx, tctx->CtbX,tctx->CtbY); + + + // read and decode CTB + + if (tctx->ctx_model.empty() == false) { + return Decode_Error; + } + + read_coding_tree_unit(tctx); + + + // save CABAC-model for WPP (except in last CTB row) + + if (pps.entropy_coding_sync_enabled_flag && + ctbx == 1 && + ctby < sps.PicHeightInCtbsY-1) + { + // no storage for context table has been allocated + if (tctx->imgunit->ctx_models.size() <= ctby) { + return Decode_Error; + } + + tctx->imgunit->ctx_models[ctby] = tctx->ctx_model; + tctx->imgunit->ctx_models[ctby].decouple(); // store an independent copy + } + + + // end of slice segment ? 
+ + int end_of_slice_segment_flag = decode_CABAC_term_bit(&tctx->cabac_decoder); + //printf("end-of-slice flag: %d\n", end_of_slice_segment_flag); + + if (end_of_slice_segment_flag) { + // at the end of the slice segment, we store the CABAC model if we need it + // because a dependent slice may follow + + if (pps.dependent_slice_segments_enabled_flag) { + tctx->shdr->ctx_model_storage = tctx->ctx_model; + tctx->shdr->ctx_model_storage.decouple(); // store an independent copy + + tctx->shdr->ctx_model_storage_defined = true; + } + } + + tctx->img->ctb_progress[ctbx+ctby*ctbW].set_progress(CTB_PROGRESS_PREFILTER); + + //printf("%p: decoded %d|%d\n",tctx, ctby,ctbx); + + + logtrace(LogSlice,"read CTB %d -> end=%d\n", tctx->CtbAddrInRS, end_of_slice_segment_flag); + //printf("read CTB %d -> end=%d\n", tctx->CtbAddrInRS, end_of_slice_segment_flag); + + const int lastCtbY = tctx->CtbY; + + bool endOfPicture = advanceCtbAddr(tctx); // true if we read past the end of the image + + if (endOfPicture && + end_of_slice_segment_flag == false) + { + tctx->decctx->add_warning(DE265_WARNING_CTB_OUTSIDE_IMAGE_AREA, false); + tctx->img->integrity = INTEGRITY_DECODING_ERRORS; + return Decode_Error; + } + + + if (end_of_slice_segment_flag) { + /* corrupted inputs may send the end_of_slice_segment_flag even if not all + CTBs in a row have been coded. Hence, we mark all of them as finished. 
+ */ + + /* + for (int x = ctbx+1 ; xPicWidthInCtbsY; x++) { + printf("mark skipped %d;%d\n",ctbx,ctby); + tctx->img->ctb_progress[ctbx+ctby*ctbW].set_progress(CTB_PROGRESS_PREFILTER); + } + */ + + return Decode_EndOfSliceSegment; + } + + + if (!end_of_slice_segment_flag) { + bool end_of_sub_stream = false; + end_of_sub_stream |= (pps.tiles_enabled_flag && + pps.TileId[tctx->CtbAddrInTS] != pps.TileId[tctx->CtbAddrInTS-1]); + end_of_sub_stream |= (pps.entropy_coding_sync_enabled_flag && + lastCtbY != tctx->CtbY); + + if (end_of_sub_stream) { + int end_of_sub_stream_one_bit = decode_CABAC_term_bit(&tctx->cabac_decoder); + if (!end_of_sub_stream_one_bit) { + tctx->decctx->add_warning(DE265_WARNING_EOSS_BIT_NOT_SET, false); + tctx->img->integrity = INTEGRITY_DECODING_ERRORS; + return Decode_Error; + } + + init_CABAC_decoder_2(&tctx->cabac_decoder); // byte alignment + return Decode_EndOfSubstream; + } + } + + } while (true); +} + + + +bool initialize_CABAC_at_slice_segment_start(thread_context* tctx) +{ + de265_image* img = tctx->img; + const pic_parameter_set& pps = img->get_pps(); + const seq_parameter_set& sps = img->get_sps(); + slice_segment_header* shdr = tctx->shdr; + + if (shdr->dependent_slice_segment_flag) { + int prevCtb = pps.CtbAddrTStoRS[ pps.CtbAddrRStoTS[shdr->slice_segment_address] -1 ]; + + int sliceIdx = img->get_SliceHeaderIndex_atIndex(prevCtb); + if (sliceIdx >= img->slices.size()) { + return false; + } + slice_segment_header* prevCtbHdr = img->slices[ sliceIdx ]; + + if (pps.is_tile_start_CTB(shdr->slice_segment_address % sps.PicWidthInCtbsY, + shdr->slice_segment_address / sps.PicWidthInCtbsY + )) { + initialize_CABAC_models(tctx); + } + else { + // wait for previous slice to finish decoding + + //printf("wait for previous slice to finish decoding\n"); + + + slice_unit* prevSliceSegment = tctx->imgunit->get_prev_slice_segment(tctx->sliceunit); + //assert(prevSliceSegment); + if (prevSliceSegment==NULL) { + return false; + } + + 
prevSliceSegment->finished_threads.wait_for_progress(prevSliceSegment->nThreads); + + + /* + printf("wait for %d,%d (init)\n", + prevCtb / sps->PicWidthInCtbsY, + prevCtb % sps->PicWidthInCtbsY); + tctx->img->wait_for_progress(tctx->task, prevCtb, CTB_PROGRESS_PREFILTER); + */ + + if (!prevCtbHdr->ctx_model_storage_defined) { + return false; + } + + tctx->ctx_model = prevCtbHdr->ctx_model_storage; + prevCtbHdr->ctx_model_storage.release(); + } + } + else { + initialize_CABAC_models(tctx); + } + + return true; +} + + +std::string thread_task_ctb_row::name() const { + char buf[100]; + sprintf(buf,"ctb-row-%d",debug_startCtbRow); + return buf; +} + + +std::string thread_task_slice_segment::name() const { + char buf[100]; + sprintf(buf,"slice-segment-%d;%d",debug_startCtbX,debug_startCtbY); + return buf; +} + + +void thread_task_slice_segment::work() +{ + thread_task_slice_segment* data = this; + thread_context* tctx = data->tctx; + de265_image* img = tctx->img; + + state = Running; + img->thread_run(this); + + setCtbAddrFromTS(tctx); + + //printf("%p: A start decoding at %d/%d\n", tctx, tctx->CtbX,tctx->CtbY); + + if (data->firstSliceSubstream) { + bool success = initialize_CABAC_at_slice_segment_start(tctx); + if (!success) { + state = Finished; + tctx->sliceunit->finished_threads.increase_progress(1); + img->thread_finishes(this); + return; + } + } + else { + initialize_CABAC_models(tctx); + } + + init_CABAC_decoder_2(&tctx->cabac_decoder); + + /*enum DecodeResult result =*/ decode_substream(tctx, false, data->firstSliceSubstream); + + state = Finished; + tctx->sliceunit->finished_threads.increase_progress(1); + img->thread_finishes(this); + + return; // DE265_OK; +} + + +void thread_task_ctb_row::work() +{ + thread_task_ctb_row* data = this; + thread_context* tctx = data->tctx; + de265_image* img = tctx->img; + + const seq_parameter_set& sps = img->get_sps(); + int ctbW = sps.PicWidthInCtbsY; + + state = Running; + img->thread_run(this); + + setCtbAddrFromTS(tctx); 
+ + int ctby = tctx->CtbAddrInRS / ctbW; + int myCtbRow = ctby; + + //printf("start CTB-row decoding at row %d\n", ctby); + + if (data->firstSliceSubstream) { + bool success = initialize_CABAC_at_slice_segment_start(tctx); + if (!success) { + // could not decode this row, mark whole row as finished + for (int x=0;xctb_progress[myCtbRow*ctbW + x].set_progress(CTB_PROGRESS_PREFILTER); + } + + state = Finished; + tctx->sliceunit->finished_threads.increase_progress(1); + img->thread_finishes(this); + return; + } + //initialize_CABAC(tctx); + } + + init_CABAC_decoder_2(&tctx->cabac_decoder); + + bool firstIndependentSubstream = + data->firstSliceSubstream && !tctx->shdr->dependent_slice_segment_flag; + + /*enum DecodeResult result =*/ + decode_substream(tctx, true, firstIndependentSubstream); + + // mark progress on remaining CTBs in row (in case of decoder error and early termination) + + // TODO: what about slices that end properly in the middle of a CTB row? + + if (tctx->CtbY == myCtbRow) { + int lastCtbX = sps.PicWidthInCtbsY; // assume no tiles when WPP is on + for (int x = tctx->CtbX; xctb_progress[myCtbRow*ctbW + x].set_progress(CTB_PROGRESS_PREFILTER); + } + } + } + + state = Finished; + tctx->sliceunit->finished_threads.increase_progress(1); + img->thread_finishes(this); +} + + +de265_error read_slice_segment_data(thread_context* tctx) +{ + setCtbAddrFromTS(tctx); + + de265_image* img = tctx->img; + const pic_parameter_set& pps = img->get_pps(); + const seq_parameter_set& sps = img->get_sps(); + slice_segment_header* shdr = tctx->shdr; + + bool success = initialize_CABAC_at_slice_segment_start(tctx); + if (!success) { + return DE265_ERROR_UNSPECIFIED_DECODING_ERROR; + } + + init_CABAC_decoder_2(&tctx->cabac_decoder); + + //printf("-----\n"); + + bool first_slice_substream = !shdr->dependent_slice_segment_flag; + + int substream=0; + + enum DecodeResult result; + do { + int ctby = tctx->CtbY; + + + // check whether entry_points[] are correct in the bitstream + 
+ if (substream>0) { + if (substream-1 >= tctx->shdr->entry_point_offset.size() || + tctx->cabac_decoder.bitstream_curr - tctx->cabac_decoder.bitstream_start -2 /* -2 because of CABAC init */ + != tctx->shdr->entry_point_offset[substream-1]) { + tctx->decctx->add_warning(DE265_WARNING_INCORRECT_ENTRY_POINT_OFFSET, true); + } + } + + substream++; + + + result = decode_substream(tctx, false, first_slice_substream); + + + if (result == Decode_EndOfSliceSegment || + result == Decode_Error) { + break; + } + + first_slice_substream = false; + + if (pps.tiles_enabled_flag) { + initialize_CABAC_models(tctx); + } + } while (true); + + return DE265_OK; +} + + +/* TODO: + When a task wants to block, but is the first in the list of pending tasks, + do some error concealment instead of blocking, since it will never be deblocked. + This will only happen in the case of input error. + */ diff --git a/sps.cc b/sps.cc new file mode 100644 index 0000000..476cdbb --- /dev/null +++ b/sps.cc @@ -0,0 +1,1298 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#include "sps.h" +#include "util.h" +#include "scan.h" +#include "decctx.h" + +#include +#include +#include + +#define READ_VLC_OFFSET(variable, vlctype, offset) \ + if ((vlc = get_ ## vlctype(br)) == UVLC_ERROR) { \ + errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); \ + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; \ + } \ + variable = vlc + offset; + +#define READ_VLC(variable, vlctype) READ_VLC_OFFSET(variable,vlctype,0) + + +static int SubWidthC_tab[] = { 1,2,2,1 }; +static int SubHeightC_tab[] = { 1,2,1,1 }; + + +// TODO if (!check_high(ctx, vlc, 15)) return false; +// TODO if (!check_ulvc(ctx, vlc)) return false; + + +// TODO: should be in some header-file of refpic.c +extern bool read_short_term_ref_pic_set(error_queue* errqueue, + const seq_parameter_set* sps, + bitreader* br, + ref_pic_set* out_set, + int idxRps, // index of the set to be read + const std::vector& sets, + bool sliceRefPicSet); + +extern bool write_short_term_ref_pic_set(error_queue* errqueue, + const seq_parameter_set* sps, + CABAC_encoder& out, + const ref_pic_set* in_set, // which set to write + int idxRps, // index of the set to be read + const std::vector& sets, // previously read sets + bool sliceRefPicSet); // is this in the slice header? 
+ + +sps_range_extension::sps_range_extension() +{ + transform_skip_rotation_enabled_flag = 0; + transform_skip_context_enabled_flag = 0; + implicit_rdpcm_enabled_flag = 0; + explicit_rdpcm_enabled_flag = 0; + extended_precision_processing_flag = 0; + intra_smoothing_disabled_flag = 0; + high_precision_offsets_enabled_flag = 0; + persistent_rice_adaptation_enabled_flag = 0; + cabac_bypass_alignment_enabled_flag = 0; +} + + +seq_parameter_set::seq_parameter_set() +{ + // TODO: this is dangerous + //memset(this,0,sizeof(seq_parameter_set)); + + sps_read = false; + //ref_pic_sets = NULL; +} + + +seq_parameter_set::~seq_parameter_set() +{ + //free(ref_pic_sets); +} + + +void seq_parameter_set::set_defaults(enum PresetSet) +{ + video_parameter_set_id = 0; + sps_max_sub_layers = 1; + sps_temporal_id_nesting_flag = 1; + + profile_tier_level_.general.set_defaults(Profile_Main, 6,2); // TODO + + seq_parameter_set_id = 0; + chroma_format_idc = 1; + ChromaArrayType = chroma_format_idc; + + separate_colour_plane_flag = 0; + pic_width_in_luma_samples = 0; + pic_height_in_luma_samples = 0; + conformance_window_flag = 0; + + conf_win_left_offset = 0; + conf_win_right_offset = 0; + conf_win_top_offset = 0; + conf_win_bottom_offset = 0; + + bit_depth_luma =8; + bit_depth_chroma=8; + + log2_max_pic_order_cnt_lsb = 8; + sps_sub_layer_ordering_info_present_flag = 0; + + sps_max_dec_pic_buffering[0] = 1; + sps_max_num_reorder_pics[0] = 0; + sps_max_latency_increase_plus1[0] = 0; + + set_CB_log2size_range(4,4); + set_TB_log2size_range(3,4); + max_transform_hierarchy_depth_inter = 1; + max_transform_hierarchy_depth_intra = 1; + + scaling_list_enable_flag = 0; + sps_scaling_list_data_present_flag = 0; + + // TODO struct scaling_list_data scaling_list; + + amp_enabled_flag = 0; + sample_adaptive_offset_enabled_flag = 0; + pcm_enabled_flag = 0; + + pcm_sample_bit_depth_luma = 8; + pcm_sample_bit_depth_chroma = 8; + // TODO log2_min_pcm_luma_coding_block_size; + // TODO 
log2_diff_max_min_pcm_luma_coding_block_size; + pcm_loop_filter_disable_flag = 1; + + // num_short_term_ref_pic_sets = 0; + // std::vector ref_pic_sets; // [0 ; num_short_term_ref_pic_set (<=MAX_REF_PIC_SETS) ) + ref_pic_sets.clear(); + + long_term_ref_pics_present_flag = 0; + + num_long_term_ref_pics_sps = 0; + + /* TODO + int lt_ref_pic_poc_lsb_sps[MAX_NUM_LT_REF_PICS_SPS]; + char used_by_curr_pic_lt_sps_flag[MAX_NUM_LT_REF_PICS_SPS]; + */ + + sps_temporal_mvp_enabled_flag = 0; + strong_intra_smoothing_enable_flag = 0; + vui_parameters_present_flag = 0; + + /* + if( vui_parameters_present_flag ) + vui_parameters() + */ + + sps_extension_present_flag = 0; + sps_range_extension_flag = 0; + sps_multilayer_extension_flag = 0; + sps_extension_6bits = 0; +} + + +void seq_parameter_set::set_CB_log2size_range(int mini,int maxi) +{ + log2_min_luma_coding_block_size = mini; + log2_diff_max_min_luma_coding_block_size = maxi-mini; +} + + +void seq_parameter_set::set_TB_log2size_range(int mini,int maxi) +{ + log2_min_transform_block_size = mini; + log2_diff_max_min_transform_block_size = maxi-mini; +} + + +void seq_parameter_set::set_resolution(int w,int h) +{ + pic_width_in_luma_samples = w; + pic_height_in_luma_samples = h; +} + + +de265_error seq_parameter_set::read(error_queue* errqueue, bitreader* br) +{ + int vlc; + + video_parameter_set_id = get_bits(br,4); + sps_max_sub_layers = get_bits(br,3) +1; + if (sps_max_sub_layers>7) { + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + sps_temporal_id_nesting_flag = get_bits(br,1); + + profile_tier_level_.read(br, sps_max_sub_layers); + + READ_VLC(seq_parameter_set_id, uvlc); + if (seq_parameter_set_id >= DE265_MAX_SPS_SETS) { + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + + // --- decode chroma type --- + + READ_VLC(chroma_format_idc, uvlc); + + if (chroma_format_idc == 3) { + separate_colour_plane_flag = get_bits(br,1); + } + else { + separate_colour_plane_flag = 0; + } + + if (chroma_format_idc<0 || + 
chroma_format_idc>3) { + errqueue->add_warning(DE265_WARNING_INVALID_CHROMA_FORMAT, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + + // --- picture size --- + + READ_VLC(pic_width_in_luma_samples, uvlc); + READ_VLC(pic_height_in_luma_samples, uvlc); + + if (pic_width_in_luma_samples == 0 || + pic_height_in_luma_samples == 0) { + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + if (pic_width_in_luma_samples > MAX_PICTURE_WIDTH || + pic_height_in_luma_samples> MAX_PICTURE_HEIGHT) { + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + conformance_window_flag = get_bits(br,1); + + if (conformance_window_flag) { + READ_VLC(conf_win_left_offset, uvlc); + READ_VLC(conf_win_right_offset, uvlc); + READ_VLC(conf_win_top_offset, uvlc); + READ_VLC(conf_win_bottom_offset,uvlc); + } + else { + conf_win_left_offset = 0; + conf_win_right_offset = 0; + conf_win_top_offset = 0; + conf_win_bottom_offset= 0; + } + + READ_VLC_OFFSET(bit_depth_luma, uvlc, 8); + READ_VLC_OFFSET(bit_depth_chroma,uvlc, 8); + if (bit_depth_luma > 16 || + bit_depth_chroma > 16) { + errqueue->add_warning(DE265_WARNING_SPS_HEADER_INVALID, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + READ_VLC_OFFSET(log2_max_pic_order_cnt_lsb, uvlc, 4); + if (log2_max_pic_order_cnt_lsb<4 || + log2_max_pic_order_cnt_lsb>16) { + errqueue->add_warning(DE265_WARNING_SPS_HEADER_INVALID, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + MaxPicOrderCntLsb = 1<<(log2_max_pic_order_cnt_lsb); + + + // --- sub_layer_ordering_info --- + + sps_sub_layer_ordering_info_present_flag = get_bits(br,1); + + int firstLayer = (sps_sub_layer_ordering_info_present_flag ? 
+ 0 : sps_max_sub_layers-1 ); + + for (int i=firstLayer ; i <= sps_max_sub_layers-1; i++ ) { + + // sps_max_dec_pic_buffering[i] + + vlc=get_uvlc(br); + if (vlc == UVLC_ERROR || + vlc+1 > MAX_NUM_REF_PICS) { + errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + sps_max_dec_pic_buffering[i] = vlc+1; + + // sps_max_num_reorder_pics[i] + + READ_VLC(sps_max_num_reorder_pics[i], uvlc); + + + // sps_max_latency_increase[i] + + READ_VLC(sps_max_latency_increase_plus1[i], uvlc); + + SpsMaxLatencyPictures[i] = (sps_max_num_reorder_pics[i] + + sps_max_latency_increase_plus1[i]-1); + } + + // copy info to all layers if only specified once + + if (sps_sub_layer_ordering_info_present_flag) { + int ref = sps_max_sub_layers-1; + assert(ref<7); + + for (int i=0 ; i < sps_max_sub_layers-1; i++ ) { + sps_max_dec_pic_buffering[i] = sps_max_dec_pic_buffering[ref]; + sps_max_num_reorder_pics[i] = sps_max_num_reorder_pics[ref]; + sps_max_latency_increase_plus1[i] = sps_max_latency_increase_plus1[ref]; + } + } + + + READ_VLC_OFFSET(log2_min_luma_coding_block_size, uvlc, 3); + READ_VLC (log2_diff_max_min_luma_coding_block_size, uvlc); + READ_VLC_OFFSET(log2_min_transform_block_size, uvlc, 2); + READ_VLC(log2_diff_max_min_transform_block_size, uvlc); + READ_VLC(max_transform_hierarchy_depth_inter, uvlc); + READ_VLC(max_transform_hierarchy_depth_intra, uvlc); + + if (log2_min_luma_coding_block_size > 6) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } + if (log2_min_luma_coding_block_size + log2_diff_max_min_luma_coding_block_size > 6) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } + if (log2_min_transform_block_size > 5) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } + if (log2_min_transform_block_size + log2_diff_max_min_transform_block_size > 5) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; } + + scaling_list_enable_flag = get_bits(br,1); + + if (scaling_list_enable_flag) { + + 
sps_scaling_list_data_present_flag = get_bits(br,1); + if (sps_scaling_list_data_present_flag) { + + de265_error err; + if ((err=read_scaling_list(br,this, &scaling_list, false)) != DE265_OK) { + return err; + } + } + else { + set_default_scaling_lists(&scaling_list); + } + } + + amp_enabled_flag = get_bits(br,1); + sample_adaptive_offset_enabled_flag = get_bits(br,1); + pcm_enabled_flag = get_bits(br,1); + if (pcm_enabled_flag) { + pcm_sample_bit_depth_luma = get_bits(br,4)+1; + pcm_sample_bit_depth_chroma = get_bits(br,4)+1; + READ_VLC_OFFSET(log2_min_pcm_luma_coding_block_size, uvlc, 3); + READ_VLC(log2_diff_max_min_pcm_luma_coding_block_size, uvlc); + pcm_loop_filter_disable_flag = get_bits(br,1); + } + else { + pcm_sample_bit_depth_luma = 0; + pcm_sample_bit_depth_chroma = 0; + log2_min_pcm_luma_coding_block_size = 0; + log2_diff_max_min_pcm_luma_coding_block_size = 0; + pcm_loop_filter_disable_flag = 0; + } + + int num_short_term_ref_pic_sets; + READ_VLC(num_short_term_ref_pic_sets, uvlc); + if (num_short_term_ref_pic_sets < 0 || + num_short_term_ref_pic_sets > 64) { + errqueue->add_warning(DE265_WARNING_NUMBER_OF_SHORT_TERM_REF_PIC_SETS_OUT_OF_RANGE, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + // --- allocate reference pic set --- + + // we do not allocate the ref-pic-set for the slice header here, but in the slice header itself + + ref_pic_sets.resize(num_short_term_ref_pic_sets); + + for (int i = 0; i < num_short_term_ref_pic_sets; i++) { + + bool success = read_short_term_ref_pic_set(errqueue,this,br, + &ref_pic_sets[i], i, + ref_pic_sets, + false); + + if (!success) { + return DE265_WARNING_SPS_HEADER_INVALID; + } + + // dump_short_term_ref_pic_set(&(*ref_pic_sets)[i], fh); + } + + long_term_ref_pics_present_flag = get_bits(br,1); + + if (long_term_ref_pics_present_flag) { + + READ_VLC(num_long_term_ref_pics_sps, uvlc); + if (num_long_term_ref_pics_sps > MAX_NUM_LT_REF_PICS_SPS) { + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; 
+ } + + for (int i = 0; i < num_long_term_ref_pics_sps; i++ ) { + lt_ref_pic_poc_lsb_sps[i] = get_bits(br, log2_max_pic_order_cnt_lsb); + used_by_curr_pic_lt_sps_flag[i] = get_bits(br,1); + } + } + else { + num_long_term_ref_pics_sps = 0; // NOTE: missing definition in standard ! + } + + sps_temporal_mvp_enabled_flag = get_bits(br,1); + strong_intra_smoothing_enable_flag = get_bits(br,1); + + vui_parameters_present_flag = get_bits(br,1); + if (vui_parameters_present_flag) { + vui.read(errqueue, br, this); + } + + + sps_extension_present_flag = get_bits(br,1); + if (sps_extension_present_flag) { + sps_range_extension_flag = get_bits(br,1); + sps_multilayer_extension_flag = get_bits(br,1); + sps_extension_6bits = get_bits(br,6); + } + else { + sps_range_extension_flag = 0; + } + + if (sps_range_extension_flag) { + de265_error err = range_extension.read(errqueue, br); + if (err != DE265_OK) { return err; } + } + + /* + sps_extension_flag = get_bits(br,1); + if (sps_extension_flag) { + assert(false); + } + */ + + + de265_error err = compute_derived_values(); + if (err != DE265_OK) { return err; } + + sps_read = true; + + return DE265_OK; +} + + +de265_error seq_parameter_set::compute_derived_values(bool sanitize_values) +{ + // --- compute derived values --- + + SubWidthC = SubWidthC_tab [chroma_format_idc]; + SubHeightC = SubHeightC_tab[chroma_format_idc]; + + if (separate_colour_plane_flag) { + ChromaArrayType = 0; + } + else { + ChromaArrayType = chroma_format_idc; + } + + if (ChromaArrayType==0) { + WinUnitX = 1; + WinUnitY = 1; + } + else { + WinUnitX = SubWidthC_tab [chroma_format_idc]; + WinUnitY = SubHeightC_tab[chroma_format_idc]; + } + + + + BitDepth_Y = bit_depth_luma; + QpBdOffset_Y = 6*(bit_depth_luma-8); + BitDepth_C = bit_depth_chroma; + QpBdOffset_C = 6*(bit_depth_chroma-8); + + Log2MinCbSizeY = log2_min_luma_coding_block_size; + Log2CtbSizeY = Log2MinCbSizeY + log2_diff_max_min_luma_coding_block_size; + MinCbSizeY = 1 << Log2MinCbSizeY; + CtbSizeY = 1 
<< Log2CtbSizeY; + + PicWidthInMinCbsY = ceil_div(pic_width_in_luma_samples, MinCbSizeY); + PicWidthInCtbsY = ceil_div(pic_width_in_luma_samples, CtbSizeY); + PicHeightInMinCbsY = ceil_div(pic_height_in_luma_samples, MinCbSizeY); + PicHeightInCtbsY = ceil_div(pic_height_in_luma_samples,CtbSizeY); + PicSizeInMinCbsY = PicWidthInMinCbsY * PicHeightInMinCbsY; + PicSizeInCtbsY = PicWidthInCtbsY * PicHeightInCtbsY; + PicSizeInSamplesY = pic_width_in_luma_samples * pic_height_in_luma_samples; + + if (chroma_format_idc==0 || separate_colour_plane_flag) { + CtbWidthC = 0; + CtbHeightC = 0; + } + else { + CtbWidthC = CtbSizeY / SubWidthC; + CtbHeightC = CtbSizeY / SubHeightC; + } + + Log2MinTrafoSize = log2_min_transform_block_size; + Log2MaxTrafoSize = log2_min_transform_block_size + log2_diff_max_min_transform_block_size; + + if (max_transform_hierarchy_depth_inter > Log2CtbSizeY - Log2MinTrafoSize) { + if (sanitize_values) { + max_transform_hierarchy_depth_inter = Log2CtbSizeY - Log2MinTrafoSize; + } else { + fprintf(stderr,"SPS error: transform hierarchy depth (inter) > CTB size - min TB size\n"); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + } + + if (max_transform_hierarchy_depth_intra > Log2CtbSizeY - Log2MinTrafoSize) { + if (sanitize_values) { + max_transform_hierarchy_depth_intra = Log2CtbSizeY - Log2MinTrafoSize; + } else { + fprintf(stderr,"SPS error: transform hierarchy depth (intra) > CTB size - min TB size\n"); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + } + + + if (sanitize_values) { + if (max_transform_hierarchy_depth_inter < Log2CtbSizeY - Log2MaxTrafoSize) { + max_transform_hierarchy_depth_inter = Log2CtbSizeY - Log2MaxTrafoSize; + } + + if (max_transform_hierarchy_depth_intra < Log2CtbSizeY - Log2MaxTrafoSize) { + max_transform_hierarchy_depth_intra = Log2CtbSizeY - Log2MaxTrafoSize; + } + } + + + Log2MinPUSize = Log2MinCbSizeY-1; + PicWidthInMinPUs = PicWidthInCtbsY << (Log2CtbSizeY - Log2MinPUSize); + PicHeightInMinPUs = 
PicHeightInCtbsY << (Log2CtbSizeY - Log2MinPUSize); + + Log2MinIpcmCbSizeY = log2_min_pcm_luma_coding_block_size; + Log2MaxIpcmCbSizeY = (log2_min_pcm_luma_coding_block_size + + log2_diff_max_min_pcm_luma_coding_block_size); + + // the following are not in the standard + PicWidthInTbsY = PicWidthInCtbsY << (Log2CtbSizeY - Log2MinTrafoSize); + PicHeightInTbsY = PicHeightInCtbsY << (Log2CtbSizeY - Log2MinTrafoSize); + PicSizeInTbsY = PicWidthInTbsY * PicHeightInTbsY; + + + if (range_extension.high_precision_offsets_enabled_flag) { + WpOffsetBdShiftY = 0; + WpOffsetBdShiftC = 0; + WpOffsetHalfRangeY = 1 << (BitDepth_Y - 1); + WpOffsetHalfRangeC = 1 << (BitDepth_C - 1); + } + else { + WpOffsetBdShiftY = ( BitDepth_Y - 8 ); + WpOffsetBdShiftC = ( BitDepth_C - 8 ); + WpOffsetHalfRangeY = 1 << 7; + WpOffsetHalfRangeC = 1 << 7; + } + + + // --- check SPS sanity --- + + if (pic_width_in_luma_samples % MinCbSizeY != 0 || + pic_height_in_luma_samples % MinCbSizeY != 0) { + // TODO: warn that image size is coded wrong in bitstream (must be multiple of MinCbSizeY) + fprintf(stderr,"SPS error: CB alignment\n"); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + if (Log2MinTrafoSize > Log2MinCbSizeY) { + fprintf(stderr,"SPS error: TB > CB\n"); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + if (Log2MaxTrafoSize > libde265_min(Log2CtbSizeY,5)) { + fprintf(stderr,"SPS error: TB_max > 32 or CTB\n"); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + + if (BitDepth_Y < 8 || BitDepth_Y > 16) { + fprintf(stderr,"SPS error: bitdepth Y not in [8;16]\n"); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + if (BitDepth_C < 8 || BitDepth_C > 16) { + fprintf(stderr,"SPS error: bitdepth C not in [8;16]\n"); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + + sps_read = true; + + return DE265_OK; +} + + + +void seq_parameter_set::dump(int fd) const +{ + //#if (_MSC_VER >= 1500) + //#define LOG0(t) loginfo(LogHeaders, t) + //#define LOG1(t,d) 
loginfo(LogHeaders, t,d) + //#define LOG2(t,d1,d2) loginfo(LogHeaders, t,d1,d2) + //#define LOG3(t,d1,d2,d3) loginfo(LogHeaders, t,d1,d2,d3) + + FILE* fh; + if (fd==1) fh=stdout; + else if (fd==2) fh=stderr; + else { return; } + +#define LOG0(t) log2fh(fh, t) +#define LOG1(t,d) log2fh(fh, t,d) +#define LOG2(t,d1,d2) log2fh(fh, t,d1,d2) +#define LOG3(t,d1,d2,d3) log2fh(fh, t,d1,d2,d3) + + + LOG0("----------------- SPS -----------------\n"); + LOG1("video_parameter_set_id : %d\n", video_parameter_set_id); + LOG1("sps_max_sub_layers : %d\n", sps_max_sub_layers); + LOG1("sps_temporal_id_nesting_flag : %d\n", sps_temporal_id_nesting_flag); + + profile_tier_level_.dump(sps_max_sub_layers, fh); + + LOG1("seq_parameter_set_id : %d\n", seq_parameter_set_id); + LOG2("chroma_format_idc : %d (%s)\n", chroma_format_idc, + chroma_format_idc == 0 ? "monochrome" : + chroma_format_idc == 1 ? "4:2:0" : + chroma_format_idc == 2 ? "4:2:2" : + chroma_format_idc == 3 ? "4:4:4" : "unknown"); + + if (chroma_format_idc == 3) { + LOG1("separate_colour_plane_flag : %d\n", separate_colour_plane_flag); + } + + LOG1("pic_width_in_luma_samples : %d\n", pic_width_in_luma_samples); + LOG1("pic_height_in_luma_samples : %d\n", pic_height_in_luma_samples); + LOG1("conformance_window_flag : %d\n", conformance_window_flag); + + if (conformance_window_flag) { + LOG1("conf_win_left_offset : %d\n", conf_win_left_offset); + LOG1("conf_win_right_offset : %d\n", conf_win_right_offset); + LOG1("conf_win_top_offset : %d\n", conf_win_top_offset); + LOG1("conf_win_bottom_offset: %d\n", conf_win_bottom_offset); + } + + LOG1("bit_depth_luma : %d\n", bit_depth_luma); + LOG1("bit_depth_chroma : %d\n", bit_depth_chroma); + + LOG1("log2_max_pic_order_cnt_lsb : %d\n", log2_max_pic_order_cnt_lsb); + LOG1("sps_sub_layer_ordering_info_present_flag : %d\n", sps_sub_layer_ordering_info_present_flag); + + int firstLayer = (sps_sub_layer_ordering_info_present_flag ? 
+ 0 : sps_max_sub_layers-1 ); + + for (int i=firstLayer ; i <= sps_max_sub_layers-1; i++ ) { + LOG1("Layer %d\n",i); + LOG1(" sps_max_dec_pic_buffering : %d\n", sps_max_dec_pic_buffering[i]); + LOG1(" sps_max_num_reorder_pics : %d\n", sps_max_num_reorder_pics[i]); + LOG1(" sps_max_latency_increase_plus1 : %d\n", sps_max_latency_increase_plus1[i]); + } + + LOG1("log2_min_luma_coding_block_size : %d\n", log2_min_luma_coding_block_size); + LOG1("log2_diff_max_min_luma_coding_block_size : %d\n",log2_diff_max_min_luma_coding_block_size); + LOG1("log2_min_transform_block_size : %d\n", log2_min_transform_block_size); + LOG1("log2_diff_max_min_transform_block_size : %d\n", log2_diff_max_min_transform_block_size); + LOG1("max_transform_hierarchy_depth_inter : %d\n", max_transform_hierarchy_depth_inter); + LOG1("max_transform_hierarchy_depth_intra : %d\n", max_transform_hierarchy_depth_intra); + LOG1("scaling_list_enable_flag : %d\n", scaling_list_enable_flag); + + if (scaling_list_enable_flag) { + + LOG1("sps_scaling_list_data_present_flag : %d\n", sps_scaling_list_data_present_flag); + if (sps_scaling_list_data_present_flag) { + + LOG0("scaling list logging output not implemented"); + //assert(0); + //scaling_list_data() + } + } + + LOG1("amp_enabled_flag : %d\n", amp_enabled_flag); + LOG1("sample_adaptive_offset_enabled_flag : %d\n", sample_adaptive_offset_enabled_flag); + LOG1("pcm_enabled_flag : %d\n", pcm_enabled_flag); + + if (pcm_enabled_flag) { + LOG1("pcm_sample_bit_depth_luma : %d\n", pcm_sample_bit_depth_luma); + LOG1("pcm_sample_bit_depth_chroma : %d\n", pcm_sample_bit_depth_chroma); + LOG1("log2_min_pcm_luma_coding_block_size : %d\n", log2_min_pcm_luma_coding_block_size); + LOG1("log2_diff_max_min_pcm_luma_coding_block_size : %d\n", log2_diff_max_min_pcm_luma_coding_block_size); + LOG1("pcm_loop_filter_disable_flag : %d\n", pcm_loop_filter_disable_flag); + } + + LOG1("num_short_term_ref_pic_sets : %d\n", ref_pic_sets.size()); + + for (int i = 0; i < 
ref_pic_sets.size(); i++) { + LOG1("ref_pic_set[ %2d ]: ",i); + dump_compact_short_term_ref_pic_set(&ref_pic_sets[i], 16, fh); + } + + LOG1("long_term_ref_pics_present_flag : %d\n", long_term_ref_pics_present_flag); + + if (long_term_ref_pics_present_flag) { + + LOG1("num_long_term_ref_pics_sps : %d\n", num_long_term_ref_pics_sps); + + for (int i = 0; i < num_long_term_ref_pics_sps; i++ ) { + LOG3("lt_ref_pic_poc_lsb_sps[%d] : %d (used_by_curr_pic_lt_sps_flag=%d)\n", + i, lt_ref_pic_poc_lsb_sps[i], used_by_curr_pic_lt_sps_flag[i]); + } + } + + LOG1("sps_temporal_mvp_enabled_flag : %d\n", sps_temporal_mvp_enabled_flag); + LOG1("strong_intra_smoothing_enable_flag : %d\n", strong_intra_smoothing_enable_flag); + LOG1("vui_parameters_present_flag : %d\n", vui_parameters_present_flag); + + LOG1("sps_extension_present_flag : %d\n", sps_extension_present_flag); + LOG1("sps_range_extension_flag : %d\n", sps_range_extension_flag); + LOG1("sps_multilayer_extension_flag : %d\n", sps_multilayer_extension_flag); + LOG1("sps_extension_6bits : %d\n", sps_extension_6bits); + + LOG1("CtbSizeY : %d\n", CtbSizeY); + LOG1("MinCbSizeY : %d\n", MinCbSizeY); + LOG1("MaxCbSizeY : %d\n", 1<<(log2_min_luma_coding_block_size + log2_diff_max_min_luma_coding_block_size)); + LOG1("MinTBSizeY : %d\n", 1< matrixId) { + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + //printf("scaling_list_pred_matrix_id_delta=%d\n", scaling_list_pred_matrix_id_delta); + + dc_coeff[sizeId][matrixId] = 16; + scaling_list_dc_coef = 16; + + if (scaling_list_pred_matrix_id_delta==0) { + if (sizeId==0) { + memcpy(curr_scaling_list, default_ScalingList_4x4, 16); + } + else { + if (canonicalMatrixId<3) + { memcpy(curr_scaling_list, default_ScalingList_8x8_intra,64); } + else + { memcpy(curr_scaling_list, default_ScalingList_8x8_inter,64); } + } + } + else { + // TODO: CHECK: for sizeID=3 and the second matrix, should we have delta=1 or delta=3 ? 
+ if (sizeId==3) { assert(scaling_list_pred_matrix_id_delta==1); } + + int mID = matrixId - scaling_list_pred_matrix_id_delta; + + int len = (sizeId == 0 ? 16 : 64); + memcpy(curr_scaling_list, scaling_list[mID], len); + + scaling_list_dc_coef = dc_coeff[sizeId][mID]; + dc_coeff[sizeId][matrixId] = dc_coeff[sizeId][mID]; + } + } + else { + int nextCoef=8; + int coefNum = (sizeId==0 ? 16 : 64); + if (sizeId>1) { + scaling_list_dc_coef = get_svlc(br); + if (scaling_list_dc_coef < -7 || + scaling_list_dc_coef > 247) { + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + scaling_list_dc_coef += 8; + nextCoef=scaling_list_dc_coef; + dc_coeff[sizeId][matrixId] = scaling_list_dc_coef; + } + else { + scaling_list_dc_coef = 16; + } + //printf("DC = %d\n",scaling_list_dc_coef); + + for (int i=0;i 127) { + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + nextCoef = (nextCoef + scaling_list_delta_coef + 256) % 256; + curr_scaling_list[i] = nextCoef; + //printf("curr %d = %d\n",i,nextCoef); + } + } + + + // --- generate ScalingFactor arrays --- + + switch (sizeId) { + case 0: + fill_scaling_factor(&sclist->ScalingFactor_Size0[matrixId][0][0], curr_scaling_list, 0); + break; + + case 1: + fill_scaling_factor(&sclist->ScalingFactor_Size1[matrixId][0][0], curr_scaling_list, 1); + break; + + case 2: + fill_scaling_factor(&sclist->ScalingFactor_Size2[matrixId][0][0], curr_scaling_list, 2); + sclist->ScalingFactor_Size2[matrixId][0][0] = scaling_list_dc_coef; + //printf("DC coeff: %d\n", scaling_list_dc_coef); + break; + + case 3: + fill_scaling_factor(&sclist->ScalingFactor_Size3[matrixId][0][0], curr_scaling_list, 3); + sclist->ScalingFactor_Size3[matrixId][0][0] = scaling_list_dc_coef; + //printf("DC coeff: %d\n", scaling_list_dc_coef); + break; + } + } + } + + return DE265_OK; +} + + +de265_error write_scaling_list(CABAC_encoder& out, const seq_parameter_set* sps, + scaling_list_data* sclist, bool inPPS) +{ + assert(false); + // TODO + + return DE265_OK; +} + + 
+void set_default_scaling_lists(scaling_list_data* sclist) +{ + // 4x4 + + for (int matrixId=0;matrixId<6;matrixId++) { + fill_scaling_factor(&sclist->ScalingFactor_Size0[matrixId][0][0], + default_ScalingList_4x4, 0); + } + + // 8x8 + + for (int matrixId=0;matrixId<3;matrixId++) { + fill_scaling_factor(&sclist->ScalingFactor_Size1[matrixId+0][0][0], + default_ScalingList_8x8_intra, 1); + fill_scaling_factor(&sclist->ScalingFactor_Size1[matrixId+3][0][0], + default_ScalingList_8x8_inter, 1); + } + + // 16x16 + + for (int matrixId=0;matrixId<3;matrixId++) { + fill_scaling_factor(&sclist->ScalingFactor_Size2[matrixId+0][0][0], + default_ScalingList_8x8_intra, 2); + fill_scaling_factor(&sclist->ScalingFactor_Size2[matrixId+3][0][0], + default_ScalingList_8x8_inter, 2); + } + + // 32x32 + + fill_scaling_factor(&sclist->ScalingFactor_Size3[0][0][0], + default_ScalingList_8x8_intra, 3); + fill_scaling_factor(&sclist->ScalingFactor_Size3[1][0][0], + default_ScalingList_8x8_inter, 3); +} + + +de265_error seq_parameter_set::write(error_queue* errqueue, CABAC_encoder& out) +{ + out.write_bits(video_parameter_set_id, 4); + if (sps_max_sub_layers>7) { + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + out.write_bits(sps_max_sub_layers-1, 3); + + out.write_bit(sps_temporal_id_nesting_flag); + + profile_tier_level_.write(out, sps_max_sub_layers); + + out.write_uvlc(seq_parameter_set_id); + + + // --- encode chroma type --- + + out.write_uvlc(chroma_format_idc); + + if (chroma_format_idc<0 || + chroma_format_idc>3) { + errqueue->add_warning(DE265_WARNING_INVALID_CHROMA_FORMAT, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + if (chroma_format_idc == 3) { + out.write_bit(separate_colour_plane_flag); + } + + + // --- picture size --- + + out.write_uvlc(pic_width_in_luma_samples); + out.write_uvlc(pic_height_in_luma_samples); + + out.write_bit(conformance_window_flag); + + if (conformance_window_flag) { + out.write_uvlc(conf_win_left_offset); + 
out.write_uvlc(conf_win_right_offset); + out.write_uvlc(conf_win_top_offset); + out.write_uvlc(conf_win_bottom_offset); + } + + + out.write_uvlc(bit_depth_luma-8); + out.write_uvlc(bit_depth_chroma-8); + + out.write_uvlc(log2_max_pic_order_cnt_lsb-4); + + + // --- sub_layer_ordering_info --- + + out.write_bit(sps_sub_layer_ordering_info_present_flag); + + int firstLayer = (sps_sub_layer_ordering_info_present_flag ? + 0 : sps_max_sub_layers-1 ); + + for (int i=firstLayer ; i <= sps_max_sub_layers-1; i++ ) { + + // sps_max_dec_pic_buffering[i] + + if (sps_max_dec_pic_buffering[i] > MAX_NUM_REF_PICS) { + errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + out.write_uvlc(sps_max_dec_pic_buffering[i]-1); + + // sps_max_num_reorder_pics[i] + + out.write_uvlc(sps_max_num_reorder_pics[i]); + + + // sps_max_latency_increase[i] + + out.write_uvlc(sps_max_latency_increase_plus1[i]); + } + + + out.write_uvlc(log2_min_luma_coding_block_size-3); + out.write_uvlc(log2_diff_max_min_luma_coding_block_size); + out.write_uvlc(log2_min_transform_block_size-2); + out.write_uvlc(log2_diff_max_min_transform_block_size); + out.write_uvlc(max_transform_hierarchy_depth_inter); + out.write_uvlc(max_transform_hierarchy_depth_intra); + out.write_bit(scaling_list_enable_flag); + + if (scaling_list_enable_flag) { + + out.write_bit(sps_scaling_list_data_present_flag); + if (sps_scaling_list_data_present_flag) { + + de265_error err; + if ((err=write_scaling_list(out,this, &scaling_list, false)) != DE265_OK) { + return err; + } + } + } + + out.write_bit(amp_enabled_flag); + out.write_bit(sample_adaptive_offset_enabled_flag); + out.write_bit(pcm_enabled_flag); + if (pcm_enabled_flag) { + out.write_bits(pcm_sample_bit_depth_luma -1,4); + out.write_bits(pcm_sample_bit_depth_chroma-1,4); + out.write_uvlc(log2_min_pcm_luma_coding_block_size-3); + out.write_uvlc(log2_diff_max_min_pcm_luma_coding_block_size); + 
out.write_bit(pcm_loop_filter_disable_flag); + } + + int num_short_term_ref_pic_sets = ref_pic_sets.size(); + if (num_short_term_ref_pic_sets < 0 || + num_short_term_ref_pic_sets > 64) { + errqueue->add_warning(DE265_WARNING_NUMBER_OF_SHORT_TERM_REF_PIC_SETS_OUT_OF_RANGE, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + out.write_uvlc(num_short_term_ref_pic_sets); + + // --- allocate reference pic set --- + + // we do not allocate the ref-pic-set for the slice header here, but in the slice header itself + + for (int i = 0; i < num_short_term_ref_pic_sets; i++) { + + bool success = write_short_term_ref_pic_set(errqueue,this,out, + &ref_pic_sets[i], i, + ref_pic_sets, + false); + + if (!success) { + return DE265_WARNING_SPS_HEADER_INVALID; + } + + // dump_short_term_ref_pic_set(&(*ref_pic_sets)[i], fh); + } + + out.write_bit(long_term_ref_pics_present_flag); + + if (long_term_ref_pics_present_flag) { + + if (num_long_term_ref_pics_sps > MAX_NUM_LT_REF_PICS_SPS) { + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + out.write_uvlc(num_long_term_ref_pics_sps); + + for (int i = 0; i < num_long_term_ref_pics_sps; i++ ) { + out.write_bits(lt_ref_pic_poc_lsb_sps[i], log2_max_pic_order_cnt_lsb); + out.write_bit (used_by_curr_pic_lt_sps_flag[i]); + } + } + + out.write_bit(sps_temporal_mvp_enabled_flag); + out.write_bit(strong_intra_smoothing_enable_flag); + out.write_bit(vui_parameters_present_flag); + +#if 0 + if (vui_parameters_present_flag) { + assert(false); + /* + vui_parameters() + sps_extension_flag + u(1) + if( sps_extension_flag ) + while( more_rbsp_data() ) + sps_extension_data_flag + u(1) + rbsp_trailing_bits() + */ + } +#endif + + out.write_bit(sps_extension_present_flag); + +#if 0 + if (sps_extension_flag) { + assert(false); + } + check_rbsp_trailing_bits(br); +#endif + + // --- compute derived values --- + +#if 0 + BitDepth_Y = bit_depth_luma; + QpBdOffset_Y = 6*(bit_depth_luma-8); + BitDepth_C = bit_depth_chroma; + QpBdOffset_C = 
6*(bit_depth_chroma-8); + Log2MinCbSizeY = log2_min_luma_coding_block_size; + Log2CtbSizeY = Log2MinCbSizeY + log2_diff_max_min_luma_coding_block_size; + MinCbSizeY = 1 << Log2MinCbSizeY; + CtbSizeY = 1 << Log2CtbSizeY; + PicWidthInMinCbsY = pic_width_in_luma_samples / MinCbSizeY; + PicWidthInCtbsY = ceil_div(pic_width_in_luma_samples, CtbSizeY); + PicHeightInMinCbsY = pic_height_in_luma_samples / MinCbSizeY; + PicHeightInCtbsY = ceil_div(pic_height_in_luma_samples,CtbSizeY); + PicSizeInMinCbsY = PicWidthInMinCbsY * PicHeightInMinCbsY; + PicSizeInCtbsY = PicWidthInCtbsY * PicHeightInCtbsY; + PicSizeInSamplesY = pic_width_in_luma_samples * pic_height_in_luma_samples; + if (chroma_format_idc==0 || separate_colour_plane_flag) { + CtbWidthC = 0; + CtbHeightC = 0; + } + else { + CtbWidthC = CtbSizeY / SubWidthC; + CtbHeightC = CtbSizeY / SubHeightC; + } + Log2MinTrafoSize = log2_min_transform_block_size; + Log2MaxTrafoSize = log2_min_transform_block_size + log2_diff_max_min_transform_block_size; + Log2MinPUSize = Log2MinCbSizeY-1; + PicWidthInMinPUs = PicWidthInCtbsY << (Log2CtbSizeY - Log2MinPUSize); + PicHeightInMinPUs = PicHeightInCtbsY << (Log2CtbSizeY - Log2MinPUSize); + Log2MinIpcmCbSizeY = log2_min_pcm_luma_coding_block_size; + Log2MaxIpcmCbSizeY = (log2_min_pcm_luma_coding_block_size + + log2_diff_max_min_pcm_luma_coding_block_size); + // the following are not in the standard + PicWidthInTbsY = PicWidthInCtbsY << (Log2CtbSizeY - Log2MinTrafoSize); + PicHeightInTbsY = PicHeightInCtbsY << (Log2CtbSizeY - Log2MinTrafoSize); + PicSizeInTbsY = PicWidthInTbsY * PicHeightInTbsY; + sps_read = true; +#endif + + return DE265_OK; +} + + +de265_error sps_range_extension::read(error_queue* errqueue, bitreader* br) +{ + transform_skip_rotation_enabled_flag = get_bits(br,1); + transform_skip_context_enabled_flag = get_bits(br,1); + implicit_rdpcm_enabled_flag = get_bits(br,1); + explicit_rdpcm_enabled_flag = get_bits(br,1); + extended_precision_processing_flag = 
get_bits(br,1); + intra_smoothing_disabled_flag = get_bits(br,1); + high_precision_offsets_enabled_flag = get_bits(br,1); + persistent_rice_adaptation_enabled_flag = get_bits(br,1); + cabac_bypass_alignment_enabled_flag = get_bits(br,1); + + return DE265_OK; +} + + +#define LOG0(t) log2fh(fh, t) +#define LOG1(t,d) log2fh(fh, t,d) +void sps_range_extension::dump(int fd) const +{ + FILE* fh; + if (fd==1) fh=stdout; + else if (fd==2) fh=stderr; + else { return; } + + LOG0("----------------- SPS-range-extension -----------------\n"); + LOG1("transform_skip_rotation_enabled_flag : %d\n", transform_skip_rotation_enabled_flag); + LOG1("transform_skip_context_enabled_flag : %d\n", transform_skip_context_enabled_flag); + LOG1("implicit_rdpcm_enabled_flag : %d\n", implicit_rdpcm_enabled_flag); + LOG1("explicit_rdpcm_enabled_flag : %d\n", explicit_rdpcm_enabled_flag); + LOG1("extended_precision_processing_flag : %d\n", extended_precision_processing_flag); + LOG1("intra_smoothing_disabled_flag : %d\n", intra_smoothing_disabled_flag); + LOG1("high_precision_offsets_enabled_flag : %d\n", high_precision_offsets_enabled_flag); + LOG1("persistent_rice_adaptation_enabled_flag : %d\n", persistent_rice_adaptation_enabled_flag); + LOG1("cabac_bypass_alignment_enabled_flag : %d\n", cabac_bypass_alignment_enabled_flag); +} +#undef LOG1 +#undef LOG0 diff --git a/threads.cc b/threads.cc new file mode 100644 index 0000000..d21193d --- /dev/null +++ b/threads.cc @@ -0,0 +1,312 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. 
+ * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "threads.h" +#include +#include + +#if defined(_MSC_VER) || defined(__MINGW32__) +# include +#elif defined(HAVE_ALLOCA_H) +# include +#endif + + +#ifndef _WIN32 +// #include + +#define THREAD_RESULT void* +#define THREAD_PARAM void* + +#include + +int de265_thread_create(de265_thread* t, void *(*start_routine) (void *), void *arg) { return pthread_create(t,NULL,start_routine,arg); } +void de265_thread_join(de265_thread t) { pthread_join(t,NULL); } +void de265_thread_destroy(de265_thread* t) { } +void de265_mutex_init(de265_mutex* m) { pthread_mutex_init(m,NULL); } +void de265_mutex_destroy(de265_mutex* m) { pthread_mutex_destroy(m); } +void de265_mutex_lock(de265_mutex* m) { pthread_mutex_lock(m); } +void de265_mutex_unlock(de265_mutex* m) { pthread_mutex_unlock(m); } +void de265_cond_init(de265_cond* c) { pthread_cond_init(c,NULL); } +void de265_cond_destroy(de265_cond* c) { pthread_cond_destroy(c); } +void de265_cond_broadcast(de265_cond* c,de265_mutex* m) { pthread_cond_broadcast(c); } +void de265_cond_wait(de265_cond* c,de265_mutex* m) { pthread_cond_wait(c,m); } +void de265_cond_signal(de265_cond* c) { pthread_cond_signal(c); } +#else // _WIN32 + +#define THREAD_RESULT DWORD WINAPI +#define THREAD_PARAM LPVOID + +int de265_thread_create(de265_thread* t, LPTHREAD_START_ROUTINE start_routine, void *arg) { + HANDLE handle = CreateThread(NULL, 0, start_routine, arg, 0, NULL); + if (handle == NULL) { + return -1; + } + *t = handle; + return 0; +} +void de265_thread_join(de265_thread t) { WaitForSingleObject(t, INFINITE); } +void de265_thread_destroy(de265_thread* t) { 
CloseHandle(*t); *t = NULL; } +void de265_mutex_init(de265_mutex* m) { *m = CreateMutex(NULL, FALSE, NULL); } +void de265_mutex_destroy(de265_mutex* m) { CloseHandle(*m); } +void de265_mutex_lock(de265_mutex* m) { WaitForSingleObject(*m, INFINITE); } +void de265_mutex_unlock(de265_mutex* m) { ReleaseMutex(*m); } +void de265_cond_init(de265_cond* c) { win32_cond_init(c); } +void de265_cond_destroy(de265_cond* c) { win32_cond_destroy(c); } +void de265_cond_broadcast(de265_cond* c,de265_mutex* m) +{ + de265_mutex_lock(m); + win32_cond_broadcast(c); + de265_mutex_unlock(m); +} +void de265_cond_wait(de265_cond* c,de265_mutex* m) { win32_cond_wait(c,m); } +void de265_cond_signal(de265_cond* c) { win32_cond_signal(c); } +#endif // _WIN32 + + + + +de265_progress_lock::de265_progress_lock() +{ + mProgress = 0; + + de265_mutex_init(&mutex); + de265_cond_init(&cond); +} + +de265_progress_lock::~de265_progress_lock() +{ + de265_mutex_destroy(&mutex); + de265_cond_destroy(&cond); +} + +void de265_progress_lock::wait_for_progress(int progress) +{ + if (mProgress >= progress) { + return; + } + + de265_mutex_lock(&mutex); + while (mProgress < progress) { + de265_cond_wait(&cond, &mutex); + } + de265_mutex_unlock(&mutex); +} + +void de265_progress_lock::set_progress(int progress) +{ + de265_mutex_lock(&mutex); + + if (progress>mProgress) { + mProgress = progress; + + de265_cond_broadcast(&cond, &mutex); + } + + de265_mutex_unlock(&mutex); +} + +void de265_progress_lock::increase_progress(int progress) +{ + de265_mutex_lock(&mutex); + + mProgress += progress; + de265_cond_broadcast(&cond, &mutex); + + de265_mutex_unlock(&mutex); +} + +int de265_progress_lock::get_progress() const +{ + return mProgress; +} + + + + +#include "libde265/decctx.h" + +#if 0 +const char* line="--------------------------------------------------"; +void printblks(const thread_pool* pool) +{ + int w = pool->tasks[0].data.task_ctb.ctx->current_sps->PicWidthInCtbsY; + int h = 
pool->tasks[0].data.task_ctb.ctx->current_sps->PicHeightInCtbsY; + + printf("active threads: %d queue len: %d\n",pool->num_threads_working,pool->num_tasks); + + char *const p = (char *)alloca(w * h * sizeof(char)); + assert(p != NULL); + memset(p,' ',w*h); + + for (int i=0;inum_tasks;i++) { + int b = 0; //pool->tasks[i].num_blockers; + int x = pool->tasks[i].data.task_ctb.ctb_x; + int y = pool->tasks[i].data.task_ctb.ctb_y; + p[y*w+x] = b+'0'; + } + + for (int i=0;inum_threads_working;i++) { + int x = pool->ctbx[i]; + int y = pool->ctby[i]; + p[y*w+x] = '*'; + } + + printf("+%s+\n",line+50-w); + for (int y=0;ymutex); + + while(true) { + + // wait until we can pick a task or until the pool has been stopped + + for (;;) { + // end waiting if thread-pool has been stopped or we have a task to execute + + if (pool->stopped || pool->tasks.size()>0) { + break; + } + + //printf("going idle\n"); + de265_cond_wait(&pool->cond_var, &pool->mutex); + } + + // if the pool was shut down, end the execution + + if (pool->stopped) { + de265_mutex_unlock(&pool->mutex); + return NULL; + } + + + // get a task + + thread_task* task = pool->tasks.front(); + pool->tasks.pop_front(); + + pool->num_threads_working++; + + //printblks(pool); + + de265_mutex_unlock(&pool->mutex); + + + // execute the task + + task->work(); + + // end processing and check if this was the last task to be processed + + de265_mutex_lock(&pool->mutex); + + pool->num_threads_working--; + } + de265_mutex_unlock(&pool->mutex); + + return NULL; +} + + +de265_error start_thread_pool(thread_pool* pool, int num_threads) +{ + de265_error err = DE265_OK; + + // limit number of threads to maximum + + if (num_threads > MAX_THREADS) { + num_threads = MAX_THREADS; + err = DE265_WARNING_NUMBER_OF_THREADS_LIMITED_TO_MAXIMUM; + } + + pool->num_threads = 0; // will be increased below + + de265_mutex_init(&pool->mutex); + de265_cond_init(&pool->cond_var); + + de265_mutex_lock(&pool->mutex); + pool->num_threads_working = 0; + 
pool->stopped = false; + de265_mutex_unlock(&pool->mutex); + + // start worker threads + + for (int i=0; ithread[i], worker_thread, pool); + if (ret != 0) { + // cerr << "pthread_create() failed: " << ret << endl; + return DE265_ERROR_CANNOT_START_THREADPOOL; + } + + pool->num_threads++; + } + + return err; +} + + +void stop_thread_pool(thread_pool* pool) +{ + de265_mutex_lock(&pool->mutex); + pool->stopped = true; + de265_mutex_unlock(&pool->mutex); + + de265_cond_broadcast(&pool->cond_var, &pool->mutex); + + for (int i=0;inum_threads;i++) { + de265_thread_join(pool->thread[i]); + de265_thread_destroy(&pool->thread[i]); + } + + de265_mutex_destroy(&pool->mutex); + de265_cond_destroy(&pool->cond_var); +} + + +void add_task(thread_pool* pool, thread_task* task) +{ + de265_mutex_lock(&pool->mutex); + if (!pool->stopped) { + + pool->tasks.push_back(task); + + // wake up one thread + + de265_cond_signal(&pool->cond_var); + } + de265_mutex_unlock(&pool->mutex); +} diff --git a/transform.cc b/transform.cc new file mode 100644 index 0000000..ef404f8 --- /dev/null +++ b/transform.cc @@ -0,0 +1,739 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#include "transform.h" +#include "util.h" + +#include + + +const int tab8_22[] = { 29,30,31,32,33,33,34,34,35,35,36,36,37 /*,37*/ }; + + +// (8.6.1) +void decode_quantization_parameters(thread_context* tctx, int xC,int yC, + int xCUBase, int yCUBase) +{ + logtrace(LogTransform,">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> decode_quantization_parameters(int xC,int yC)=(%d,%d)\n", xC,yC); + + const pic_parameter_set& pps = tctx->img->get_pps(); + const seq_parameter_set& sps = tctx->img->get_sps(); + slice_segment_header* shdr = tctx->shdr; + + // top left pixel position of current quantization group + int xQG = xCUBase - (xCUBase & ((1<currentQG_x && + yQG == tctx->currentQG_y) + { + return; + } + */ + + // if first QG in CU, remember last QPY of last CU previous QG + + if (xQG != tctx->currentQG_x || + yQG != tctx->currentQG_y) + { + tctx->lastQPYinPreviousQG = tctx->currentQPY; + tctx->currentQG_x = xQG; + tctx->currentQG_y = yQG; + } + + int qPY_PRED; + + // first QG in CTB row ? + + int ctbLSBMask = ((1<shdr->SliceAddrRS; + + int SliceStartX = (first_ctb_in_slice_RS % sps.PicWidthInCtbsY) * sps.CtbSizeY; + int SliceStartY = (first_ctb_in_slice_RS / sps.PicWidthInCtbsY) * sps.CtbSizeY; + + bool firstQGInSlice = (SliceStartX == xQG && SliceStartY == yQG); + + // first QG in tile ? 
+ + bool firstQGInTile = false; + if (pps.tiles_enabled_flag) { + if ((xQG & ((1 << sps.Log2CtbSizeY)-1)) == 0 && + (yQG & ((1 << sps.Log2CtbSizeY)-1)) == 0) + { + int ctbX = xQG >> sps.Log2CtbSizeY; + int ctbY = yQG >> sps.Log2CtbSizeY; + + firstQGInTile = pps.is_tile_start_CTB(ctbX,ctbY); // TODO: this is slow + } + } + + + if (firstQGInSlice || firstQGInTile || + (firstInCTBRow && pps.entropy_coding_sync_enabled_flag)) { + qPY_PRED = tctx->shdr->SliceQPY; + } + else { + qPY_PRED = tctx->lastQPYinPreviousQG; + } + + + int qPYA,qPYB; + + if (tctx->img->available_zscan(xQG,yQG, xQG-1,yQG)) { + int xTmp = (xQG-1) >> sps.Log2MinTrafoSize; + int yTmp = (yQG ) >> sps.Log2MinTrafoSize; + int minTbAddrA = pps.MinTbAddrZS[xTmp + yTmp*sps.PicWidthInTbsY]; + int ctbAddrA = minTbAddrA >> (2 * (sps.Log2CtbSizeY-sps.Log2MinTrafoSize)); + if (ctbAddrA == tctx->CtbAddrInTS) { + qPYA = tctx->img->get_QPY(xQG-1,yQG); + } + else { + qPYA = qPY_PRED; + } + } + else { + qPYA = qPY_PRED; + } + + if (tctx->img->available_zscan(xQG,yQG, xQG,yQG-1)) { + int xTmp = (xQG ) >> sps.Log2MinTrafoSize; + int yTmp = (yQG-1) >> sps.Log2MinTrafoSize; + int minTbAddrB = pps.MinTbAddrZS[xTmp + yTmp*sps.PicWidthInTbsY]; + int ctbAddrB = minTbAddrB >> (2 * (sps.Log2CtbSizeY-sps.Log2MinTrafoSize)); + if (ctbAddrB == tctx->CtbAddrInTS) { + qPYB = tctx->img->get_QPY(xQG,yQG-1); + } + else { + qPYB = qPY_PRED; + } + } + else { + qPYB = qPY_PRED; + } + + qPY_PRED = (qPYA + qPYB + 1)>>1; + + logtrace(LogTransform,"qPY_PRED = %d (%d, %d)\n",qPY_PRED, qPYA, qPYB); + + int QPY = ((qPY_PRED + tctx->CuQpDelta + 52+2*sps.QpBdOffset_Y) % + (52 + sps.QpBdOffset_Y)) - sps.QpBdOffset_Y; + + tctx->qPYPrime = QPY + sps.QpBdOffset_Y; + if (tctx->qPYPrime<0) { + tctx->qPYPrime=0; + } + + int qPiCb = Clip3(-sps.QpBdOffset_C,57, QPY+pps.pic_cb_qp_offset + shdr->slice_cb_qp_offset + tctx->CuQpOffsetCb); + int qPiCr = Clip3(-sps.QpBdOffset_C,57, QPY+pps.pic_cr_qp_offset + shdr->slice_cr_qp_offset + tctx->CuQpOffsetCr); + + 
logtrace(LogTransform,"qPiCb:%d (%d %d), qPiCr:%d (%d %d)\n", + qPiCb, pps.pic_cb_qp_offset, shdr->slice_cb_qp_offset, + qPiCr, pps.pic_cr_qp_offset, shdr->slice_cr_qp_offset); + + int qPCb,qPCr; + + if (sps.ChromaArrayType == CHROMA_420) { + qPCb = table8_22(qPiCb); + qPCr = table8_22(qPiCr); + } + else { + qPCb = qPiCb; + qPCr = qPiCr; + } + + //printf("q: %d %d\n",qPiCb, qPCb); + + tctx->qPCbPrime = qPCb + sps.QpBdOffset_C; + if (tctx->qPCbPrime<0) { + tctx->qPCbPrime = 0; + } + + tctx->qPCrPrime = qPCr + sps.QpBdOffset_C; + if (tctx->qPCrPrime<0) { + tctx->qPCrPrime = 0; + } + + /* + printf("Q: %d (%d %d %d / %d %d) %d %d %d\n",QPY, + sps->QpBdOffset_Y, + pps->pic_cb_qp_offset + shdr->slice_cb_qp_offset, + pps->pic_cr_qp_offset + shdr->slice_cr_qp_offset, + sps->QpBdOffset_C, sps->QpBdOffset_C, + tctx->qPYPrime, tctx->qPCbPrime, tctx->qPCrPrime); + */ + + int log2CbSize = tctx->img->get_log2CbSize(xCUBase, yCUBase); + + // TODO: On broken input, log2CbSize may be zero (multithreaded only). Not sure yet why. + // Maybe another decoding thread is overwriting the value set in slice.cc:read_coding_unit. 
+ // id:000163,sig:06,src:002041,op:havoc,rep:16.bin + if (log2CbSize<3) { log2CbSize=3; } + + tctx->img->set_QPY(xCUBase, yCUBase, log2CbSize, QPY); + tctx->currentQPY = QPY; + + /* + printf("SET QPY POC=%d %d;%d-%d;%d = %d\n",ctx->img->PicOrderCntVal,xCUBase,yCUBase, + xCUBase+(1<qPYPrime); +} + + + +template +void transform_coefficients(acceleration_functions* acceleration, + int16_t* coeff, int coeffStride, int nT, int trType, + pixel_t* dst, int dstStride, int bit_depth) +{ + logtrace(LogTransform,"transform --- trType: %d nT: %d\n",trType,nT); + + + if (trType==1) { + + acceleration->transform_4x4_dst_add(dst, coeff, dstStride, bit_depth); + + } else { + + /**/ if (nT==4) { acceleration->transform_add(0,dst,coeff,dstStride, bit_depth); } + else if (nT==8) { acceleration->transform_add(1,dst,coeff,dstStride, bit_depth); } + else if (nT==16) { acceleration->transform_add(2,dst,coeff,dstStride, bit_depth); } + else { acceleration->transform_add(3,dst,coeff,dstStride, bit_depth); } + } + +#if 0 + printf("decoded pixels:\n"); + for (int y=0;yimg->get_sps().BitDepth_C; + const int BitDepthY = tctx->img->get_sps().BitDepth_Y; + + for (int y=0;yBitDepthC, for which we could also eliminate one shift. The remaining + case is also one shift only. 
+ */ + + residual[y*nT+x] += (tctx->ResScaleVal * + ((tctx->residual_luma[y*nT+x] << BitDepthC ) >> BitDepthY ) ) >> 3; + } +} + + +template +void transform_coefficients_explicit(thread_context* tctx, + int16_t* coeff, int coeffStride, int nT, int trType, + pixel_t* dst, int dstStride, int bit_depth, int cIdx) +{ + logtrace(LogTransform,"transform --- trType: %d nT: %d\n",trType,nT); + + const acceleration_functions* acceleration = &tctx->decctx->acceleration; + + int32_t residual_buffer[32*32]; + int32_t* residual; + if (cIdx==0) { + residual = tctx->residual_luma; + } + else { + residual = residual_buffer; + } + + + // TODO + int bdShift = 20 - bit_depth; + int max_coeff_bits = 15; + + if (trType==1) { + + acceleration->transform_idst_4x4(residual, coeff, bdShift, max_coeff_bits); + + } else { + + /**/ if (nT==4) { acceleration->transform_idct_4x4(residual,coeff,bdShift,max_coeff_bits); } + else if (nT==8) { acceleration->transform_idct_8x8(residual,coeff,bdShift,max_coeff_bits); } + else if (nT==16) { acceleration->transform_idct_16x16(residual,coeff,bdShift,max_coeff_bits); } + else { acceleration->transform_idct_32x32(residual,coeff,bdShift,max_coeff_bits); } + } + + + //printBlk("prediction",(uint8_t*)dst,nT,dstStride); + //printBlk("residual",residual,nT,nT); + + if (cIdx != 0) { + if (tctx->ResScaleVal != 0) { + cross_comp_pred(tctx, residual, nT); + } + + //printBlk("cross-comp-pred modified residual",residual,nT,nT); + } + + acceleration->add_residual(dst,dstStride, residual,nT, bit_depth); +} + + +void inv_transform(acceleration_functions* acceleration, + uint8_t* dst, int dstStride, int16_t* coeff, + int log2TbSize, int trType) +{ + if (trType==1) { + assert(log2TbSize==2); + + acceleration->transform_4x4_dst_add_8(dst, coeff, dstStride); + + } else { + acceleration->transform_add_8[log2TbSize-2](dst,coeff,dstStride); + } + + +#if 0 + int nT = 1<fwd_transform_4x4_dst_8(coeff, src, srcStride); + } else { + // DCT 4x4, 8x8, 16x16, 32x32 + + 
acceleration->fwd_transform_8[log2TbSize-2](coeff,src,srcStride); + } +} + + + +static const int levelScale[] = { 40,45,51,57,64,72 }; + +// (8.6.2) and (8.6.3) +template +void scale_coefficients_internal(thread_context* tctx, + int xT,int yT, // position of TU in frame (chroma adapted) + int x0,int y0, // position of CU in frame (chroma adapted) + int nT, int cIdx, + bool transform_skip_flag, bool intra, int rdpcmMode) +{ + const seq_parameter_set& sps = tctx->img->get_sps(); + const pic_parameter_set& pps = tctx->img->get_pps(); + + int qP; + switch (cIdx) { + case 0: qP = tctx->qPYPrime; break; + case 1: qP = tctx->qPCbPrime; break; + case 2: qP = tctx->qPCrPrime; break; + default: qP = 0; assert(0); break; // should never happen + } + + logtrace(LogTransform,"qP: %d\n",qP); + + + int16_t* coeff; + int coeffStride; + + coeff = tctx->coeffBuf; + coeffStride = nT; + + + + + + pixel_t* pred; + int stride; + pred = tctx->img->get_image_plane_at_pos_NEW(cIdx, xT,yT); + stride = tctx->img->get_image_stride(cIdx); + + // We explicitly include the case for sizeof(pixel_t)==1 so that the compiler + // can optimize away a lot of code for 8-bit pixels. + const int bit_depth = ((sizeof(pixel_t)==1) ? 
8 : sps.get_bit_depth(cIdx)); + + //assert(intra == (tctx->img->get_pred_mode(xT,yT)==MODE_INTRA)); + int cuPredModeIntra = (tctx->img->get_pred_mode(xT,yT)==MODE_INTRA); + + bool rotateCoeffs = (sps.range_extension.transform_skip_rotation_enabled_flag && + nT == 4 && + cuPredModeIntra); + + if (tctx->cu_transquant_bypass_flag) { + + int32_t residual_buffer[32*32]; + + int32_t* residual; + if (cIdx==0) residual = tctx->residual_luma; + else residual = residual_buffer; + + + // TODO: we could fold the coefficient rotation into the coefficient expansion here: + for (int i=0;inCoeff[cIdx];i++) { + int32_t currCoeff = tctx->coeffList[cIdx][i]; + tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff; + } + + if (rotateCoeffs) { + tctx->decctx->acceleration.rotate_coefficients(coeff, nT); + } + + if (rdpcmMode) { + if (rdpcmMode==2) + tctx->decctx->acceleration.transform_bypass_rdpcm_v(residual, coeff, nT); + else + tctx->decctx->acceleration.transform_bypass_rdpcm_h(residual, coeff, nT); + } + else { + tctx->decctx->acceleration.transform_bypass(residual, coeff, nT); + } + + if (cIdx != 0) { + if (tctx->ResScaleVal != 0) { + cross_comp_pred(tctx, residual, nT); + } + } + + tctx->decctx->acceleration.add_residual(pred,stride, residual,nT, bit_depth); + + if (rotateCoeffs) { + memset(coeff, 0, nT*nT*sizeof(int16_t)); // delete all, because we moved the coeffs around + } + } + else { + // (8.6.3) + + int bdShift = (cIdx==0 ? 
sps.BitDepth_Y : sps.BitDepth_C) + Log2(nT) - 5; + + logtrace(LogTransform,"bdShift=%d\n",bdShift); + + logtrace(LogTransform,"dequant %d;%d cIdx=%d qp=%d\n",xT*(cIdx?2:1),yT*(cIdx?2:1),cIdx,qP); + + + // --- inverse quantization --- + + if (sps.scaling_list_enable_flag==0) { + + //const int m_x_y = 16; + const int m_x_y = 1; + bdShift -= 4; // this is equivalent to having a m_x_y of 16 and we can use 32bit integers + + const int offset = (1<<(bdShift-1)); + const int fact = m_x_y * levelScale[qP%6] << (qP/6); + + for (int i=0;inCoeff[cIdx];i++) { + + // usually, this needs to be 64bit, but because we modify the shift above, we can use 16 bit + int32_t currCoeff = tctx->coeffList[cIdx][i]; + + //logtrace(LogTransform,"coefficient[%d] = %d\n",tctx->coeffPos[cIdx][i], + //tctx->coeffList[cIdx][i]); + + currCoeff = Clip3(-32768,32767, + ( (currCoeff * fact + offset ) >> bdShift)); + + //logtrace(LogTransform," -> %d\n",currCoeff); + + tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff; + } + } + else { + const int offset = (1<<(bdShift-1)); + + const uint8_t* sclist; + int matrixID = cIdx; + if (!intra) { + if (nT<32) { matrixID += 3; } + else { matrixID++; } + } + + switch (nT) { + case 4: sclist = &pps.scaling_list.ScalingFactor_Size0[matrixID][0][0]; break; + case 8: sclist = &pps.scaling_list.ScalingFactor_Size1[matrixID][0][0]; break; + case 16: sclist = &pps.scaling_list.ScalingFactor_Size2[matrixID][0][0]; break; + case 32: sclist = &pps.scaling_list.ScalingFactor_Size3[matrixID][0][0]; break; + default: assert(0); + } + + for (int i=0;inCoeff[cIdx];i++) { + int pos = tctx->coeffPos[cIdx][i]; + int x = pos%nT; + int y = pos/nT; + + const int m_x_y = sclist[x+y*nT]; + const int fact = m_x_y * levelScale[qP%6] << (qP/6); + + int64_t currCoeff = tctx->coeffList[cIdx][i]; + + currCoeff = Clip3(-32768,32767, + ( (currCoeff * fact + offset ) >> bdShift)); + + tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff; + } + } + + + // --- do transform or skip --- + + 
logtrace(LogTransform,"coefficients OUT:\n"); + for (int y=0;ydecctx->acceleration.rotate_coefficients(coeff, nT); + } + + int32_t residual_buffer[32*32]; + + int32_t* residual; + if (cIdx==0) residual = tctx->residual_luma; + else residual = residual_buffer; + + if (rdpcmMode) { + /* + if (rdpcmMode==2) + tctx->decctx->acceleration.transform_skip_rdpcm_v(pred,coeff, Log2(nT), stride, bit_depth); + else + tctx->decctx->acceleration.transform_skip_rdpcm_h(pred,coeff, Log2(nT), stride, bit_depth); + */ + + if (rdpcmMode==2) + tctx->decctx->acceleration.rdpcm_v(residual, coeff,nT, tsShift,bdShift); + else + tctx->decctx->acceleration.rdpcm_h(residual, coeff,nT, tsShift,bdShift); + } + else { + //tctx->decctx->acceleration.transform_skip(pred, coeff, stride, bit_depth); + + tctx->decctx->acceleration.transform_skip_residual(residual, coeff, nT, tsShift, bdShift); + } + + if (cIdx != 0) { + if (tctx->ResScaleVal != 0) { + cross_comp_pred(tctx, residual, nT); + } + } + + tctx->decctx->acceleration.add_residual(pred,stride, residual,nT, bit_depth); + + if (rotateCoeffs) { + memset(coeff, 0, nT*nT*sizeof(int16_t)); // delete all, because we moved the coeffs around + } + } + else { + int trType; + + //if (nT==4 && cIdx==0 && tctx->img->get_pred_mode(xT,yT)==MODE_INTRA) { + if (nT==4 && cIdx==0 && cuPredModeIntra) { + trType=1; + } + else { + trType=0; + } + + assert(rdpcmMode==0); + + + if (tctx->img->get_pps().range_extension.cross_component_prediction_enabled_flag) { + // cross-component-prediction: transform to residual buffer and add in a separate step + + transform_coefficients_explicit(tctx, coeff, coeffStride, nT, trType, + pred, stride, bit_depth, cIdx); + } + else { + transform_coefficients(&tctx->decctx->acceleration, coeff, coeffStride, nT, trType, + pred, stride, bit_depth); + } + } + } + + + logtrace(LogTransform,"pixels (cIdx:%d), position %d %d:\n",cIdx, xT,yT); + + for (int y=0;ynCoeff[cIdx];i++) { + tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = 0; + } +} + + 
+void scale_coefficients(thread_context* tctx, + int xT,int yT, // position of TU in frame (chroma adapted) + int x0,int y0, // position of CU in frame (chroma adapted) + int nT, int cIdx, + bool transform_skip_flag, bool intra, + int rdpcmMode // 0 - off, 1 - Horizontal, 2 - Vertical + ) +{ + if (tctx->img->high_bit_depth(cIdx)) { + scale_coefficients_internal(tctx, xT,yT, x0,y0, nT,cIdx, transform_skip_flag, intra, + rdpcmMode); + } else { + scale_coefficients_internal (tctx, xT,yT, x0,y0, nT,cIdx, transform_skip_flag, intra, + rdpcmMode); + } +} + + +//#define QUANT_IQUANT_SHIFT 20 // Q(QP%6) * IQ(QP%6) = 2^20 +#define QUANT_SHIFT 14 // Q(4) = 2^14 +//#define SCALE_BITS 15 // Inherited from TMuC, pressumably for fractional bit estimates in RDOQ +#define MAX_TR_DYNAMIC_RANGE 15 // Maximum transform dynamic range (excluding sign bit) + + +const static uint16_t g_quantScales[6] = { + 26214,23302,20560,18396,16384,14564 +}; + +void quant_coefficients(//encoder_context* ectx, + int16_t* out_coeff, + const int16_t* in_coeff, + int log2TrSize, int qp, + bool intra) +{ + const int qpDiv6 = qp / 6; + const int qpMod6 = qp % 6; + + //int uiLog2TrSize = xLog2( iWidth - 1); + + int uiQ = g_quantScales[qpMod6]; + int bitDepth = 8; + int transformShift = MAX_TR_DYNAMIC_RANGE - bitDepth - log2TrSize; // Represents scaling through forward transform + int qBits = QUANT_SHIFT + qpDiv6 + transformShift; + + /* TODO: originally, this was checking for intra slices, why not for intra mode ? + */ + int rnd = (intra ? 171 : 85) << (qBits-9); + + int x, y; + int uiAcSum = 0; + + int nStride = (1< ", x,y,level); + sign = (level < 0 ? 
-1: 1); + + level = (abs_value(level) * uiQ + rnd ) >> qBits; + uiAcSum += level; + level *= sign; + out_coeff[blockPos] = Clip3(-32768, 32767, level); + //logtrace(LogTransform,"%d\n", out_coeff[blockPos]); + } + } +} + + +void dequant_coefficients(int16_t* out_coeff, + const int16_t* in_coeff, + int log2TrSize, int qP) +{ + const int m_x_y = 1; + int bitDepth = 8; + int bdShift = bitDepth + log2TrSize - 5; + bdShift -= 4; // this is equivalent to having a m_x_y of 16 and we can use 32bit integers + + const int offset = (1<<(bdShift-1)); + const int fact = m_x_y * levelScale[qP%6] << (qP/6); + + int blkSize = (1<> bdShift)); + + //logtrace(LogTransform," -> %d\n",currCoeff); + + out_coeff[i] = currCoeff; + } +} diff --git a/util.cc b/util.cc new file mode 100644 index 0000000..61be238 --- /dev/null +++ b/util.cc @@ -0,0 +1,247 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "util.h" +#include "de265.h" + +#include +#include +#include + + +void copy_subimage(uint8_t* dst,int dststride, + const uint8_t* src,int srcstride, + int w, int h) +{ + for (int y=0;y=2; +} +#endif + +#ifdef DE265_LOG_TRACE +void logtrace(enum LogModule module, const char* string, ...) 
+{ + if (verbosity<3) return; + if (current_poc < log_poc_start) { return; } + if (disable_log[module]) return; + + //if (module != LogSymbols /*&& module != LogCABAC*/) { return; } + //if (logcnt<319500) return; + + //if (module != LogCABAC) return; + + va_list va; + + if (string[0]=='$') { + int id = string[1]-'0'; + logcnt[id]++; + fprintf(stdout, "[%ld] ",logcnt[id]); + + string += 3; + } + + int noPrefix = (string[0]=='*'); + if (!noPrefix) { } // fprintf(stdout, "ERR: "); + va_start(va, string); + vfprintf(stdout, string + (noPrefix ? 1 : 0), va); + va_end(va); + fflush(stdout); +} +#endif + +// Write a printf-style message to the stream 'fh'. +// The text is prefixed with "INFO: " unless the format string starts with '*'; +// in that case the marker character is dropped and no prefix is written. +void log2fh(FILE* fh, const char* string, ...) +{ + va_list va; + + int noPrefix = (string[0]=='*'); + // the prefix must go to the same stream as the message, not to stdout + if (!noPrefix) fprintf(fh, "INFO: "); + va_start(va, string); + vfprintf(fh, string + (noPrefix ? 1 : 0), va); + va_end(va); + // flush the stream that was actually written to + fflush(fh); +} + + + +void printBlk(const char* title, const int16_t* data, int blksize, int stride, + const std::string& prefix) +{ + if (title) printf("%s%s:\n",prefix.c_str(),title); + + for (int y=0;y + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see .
+ */ + +#include "visualize.h" +#include "decctx.h" + +#include + +#if 0 +void writeFrame_Y(de265_image* img,const char* filename) +{ + int w = ctx->img->get_width(); + int h = ctx->img->get_height(); + //int c_idx=0; + int ctb_size = 64; // HACK + + int stride = ctx->img->get_luma_stride(); + + for (int ctbY=0;ctbYcurrent_sps->PicHeightInCtbsY;ctbY++) + for (int ctbX=0;ctbXcurrent_sps->PicWidthInCtbsY;ctbX++) + { + int x0 = ctbX*ctb_size; + int y0 = ctbY*ctb_size; + + + uint8_t *src = ctx->img->get_image_plane_at_pos(0,x0,y0); + + printf("%s %d %d\n",filename,x0,y0); + int dx,dy; + for (dy=0;dyget_image_plane_at_pos(c, 0,y), de265_get_image_width(img,c), 1, fh); + + fflush(fh); + fclose(fh); +} + + +void set_pixel(uint8_t* img, int x,int y, int stride, uint32_t color, int pixelSize) +{ + for (int i=0;i>(i*8)) & 0xFF; + img[y*stride + x*pixelSize + i] = col; + } +} + + +void draw_block_boundary(const de265_image* srcimg, + uint8_t* img,int stride, + int x,int y,int hBlkSize, int vBlkSize, uint32_t color, int pixelSize) +{ + for (int i=0;iget_sps().pic_height_in_luma_samples) { + set_pixel(img,x,yi,stride,color,pixelSize); + } + } + + for (int i=0;iget_sps().pic_width_in_luma_samples) { + set_pixel(img,xi,y,stride,color,pixelSize); + } + } +} + + +#include "intrapred.h" + +void draw_intra_pred_mode(const de265_image* srcimg, + uint8_t* img,int stride, + int x0,int y0,int log2BlkSize, + enum IntraPredMode mode, uint32_t color,int pixelSize) +{ + int w = 1< draw square + + for (int i=-w*1/4;i<=w*1/4;i++) + { + set_pixel(img, x0+w*1/4, y0+w/2+i,stride, color, pixelSize); + set_pixel(img, x0+w*3/4, y0+w/2+i,stride, color, pixelSize); + set_pixel(img, x0+w/2+i, y0+w*1/4,stride, color, pixelSize); + set_pixel(img, x0+w/2+i, y0+w*3/4,stride, color, pixelSize); + } + } + else if (mode==1) { + // DC -> draw circle + + for (int i=-w/4;i draw line in prediction direction + + int slope = intraPredAngle_table[mode]; + bool horiz = (mode<18); + + if (horiz) { + for (int 
i=-w/2;i=0 && yget_sps().pic_height_in_luma_samples) { + set_pixel(img, x0+i+w/2, y, stride, color, pixelSize); + } + } + } + else { + for (int i=-w/2;i=0 && xget_sps().pic_width_in_luma_samples) { + set_pixel(img, x, y0+i+w/2, stride, color, pixelSize); + } + } + } + } +} + + +void drawTBgrid(const de265_image* srcimg, uint8_t* img, int stride, + int x0,int y0, uint32_t color, int pixelSize, int log2CbSize, int trafoDepth) +{ + int split_transform_flag = srcimg->get_split_transform_flag(x0,y0,trafoDepth); + if (split_transform_flag) { + int x1 = x0 + ((1<<(log2CbSize-trafoDepth))>>1); + int y1 = y0 + ((1<<(log2CbSize-trafoDepth))>>1); + drawTBgrid(srcimg,img,stride,x0,y0,color,pixelSize,log2CbSize,trafoDepth+1); + drawTBgrid(srcimg,img,stride,x1,y0,color,pixelSize,log2CbSize,trafoDepth+1); + drawTBgrid(srcimg,img,stride,x0,y1,color,pixelSize,log2CbSize,trafoDepth+1); + drawTBgrid(srcimg,img,stride,x1,y1,color,pixelSize,log2CbSize,trafoDepth+1); + } + else { + draw_block_boundary(srcimg,img,stride,x0,y0,1<<(log2CbSize-trafoDepth),1<<(log2CbSize-trafoDepth), color, pixelSize); + } +} + + +enum DrawMode { + Partitioning_CB, + Partitioning_TB, + Partitioning_PB, + IntraPredMode, + PBPredMode, + PBMotionVectors, + QuantP_Y +}; + + +void tint_rect(uint8_t* img, int stride, int x0,int y0,int w,int h, uint32_t color, int pixelSize) +{ + for (int y=0;y>(i*8)) & 0xFF; + img[yp*stride+xp*pixelSize + i] = (img[yp*stride+xp*pixelSize + i] + col)/2; + } + } +} + +void fill_rect(uint8_t* img, int stride, int x0,int y0,int w,int h, uint32_t color, int pixelSize) +{ + for (int y=0;y>(i*8)) & 0xFF; + img[yp*stride+xp*pixelSize + i] = col; + } + } +} + + +void draw_QuantPY_block(const de265_image* srcimg,uint8_t* img,int stride, + int x0,int y0, int w,int h, int pixelSize) +{ + int q = srcimg->get_QPY(x0,y0); + + const int MIN_DRAW_Q = 20; + const int MAX_DRAW_Q = 40; + + if (qMAX_DRAW_Q) q=MAX_DRAW_Q; + + float f = ((float)q-MIN_DRAW_Q)/(MAX_DRAW_Q-MIN_DRAW_Q); + uint32_t col = 
0xFF * f; + col = col | (col<<8) | (col<<16); + + fill_rect(img,stride, x0,y0,w,h, col, pixelSize); +} + + +void draw_line(uint8_t* img,int stride,uint32_t color,int pixelSize, + int width,int height, + int x0,int y0,int x1,int y1) +{ + if (x1==x0 && y1==y0) { + set_pixel(img,x0,y0,stride,color,pixelSize); + } + else if (abs(x1-x0) < abs(y1-y0)) { + for (int y=y0;y<=y1;y += Sign(y1-y0)) + { + int x = (y-y0)*(x1-x0)/(y1-y0) + x0; + + if (x>=0 && x=0 && y=0 && x=0 && yget_pred_mode(x0,y0); + + uint32_t cols[3] = { 0xff0000, 0x0000ff, 0x00ff00 }; + + tint_rect(img,stride, x0,y0,w,h, cols[predMode], pixelSize); + } + else if (what == PBMotionVectors) { + const PBMotion& mvi = srcimg->get_mv_info(x0,y0); + int x = x0+w/2; + int y = y0+h/2; + if (mvi.predFlag[0]) { + draw_line(img,stride,0xFF0000,pixelSize, + srcimg->get_width(), + srcimg->get_height(), + x,y,x+mvi.mv[0].x,y+mvi.mv[0].y); + } + if (mvi.predFlag[1]) { + draw_line(img,stride,0x00FF00,pixelSize, + srcimg->get_width(), + srcimg->get_height(), + x,y,x+mvi.mv[1].x,y+mvi.mv[1].y); + } + } +} + + +void draw_tree_grid(const de265_image* srcimg, uint8_t* img, int stride, + uint32_t color, int pixelSize, enum DrawMode what) +{ + const seq_parameter_set& sps = srcimg->get_sps(); + int minCbSize = sps.MinCbSizeY; + + for (int y0=0;y0get_log2CbSize_cbUnits(x0,y0); + if (log2CbSize==0) { + continue; + } + + int xb = x0*minCbSize; + int yb = y0*minCbSize; + + int CbSize = 1<get_PartMode(xb,yb); + + int HalfCbSize = (1<<(log2CbSize-1)); + + switch (partMode) { + case PART_2Nx2N: + draw_PB_block(srcimg,img,stride,xb,yb,CbSize,CbSize, what,color,pixelSize); + break; + case PART_NxN: + draw_PB_block(srcimg,img,stride,xb, yb, CbSize/2,CbSize/2, what,color,pixelSize); + draw_PB_block(srcimg,img,stride,xb+HalfCbSize,yb, CbSize/2,CbSize/2, what,color,pixelSize); + draw_PB_block(srcimg,img,stride,xb ,yb+HalfCbSize,CbSize/2,CbSize/2, what,color,pixelSize); + 
draw_PB_block(srcimg,img,stride,xb+HalfCbSize,yb+HalfCbSize,CbSize/2,CbSize/2, what,color,pixelSize); + break; + case PART_2NxN: + draw_PB_block(srcimg,img,stride,xb, yb, CbSize ,CbSize/2, what,color,pixelSize); + draw_PB_block(srcimg,img,stride,xb, yb+HalfCbSize,CbSize ,CbSize/2, what,color,pixelSize); + break; + case PART_Nx2N: + draw_PB_block(srcimg,img,stride,xb, yb, CbSize/2,CbSize, what,color,pixelSize); + draw_PB_block(srcimg,img,stride,xb+HalfCbSize,yb, CbSize/2,CbSize, what,color,pixelSize); + break; + case PART_2NxnU: + draw_PB_block(srcimg,img,stride,xb, yb, CbSize ,CbSize/4, what,color,pixelSize); + draw_PB_block(srcimg,img,stride,xb, yb+CbSize/4 ,CbSize ,CbSize*3/4, what,color,pixelSize); + break; + case PART_2NxnD: + draw_PB_block(srcimg,img,stride,xb, yb, CbSize ,CbSize*3/4, what,color,pixelSize); + draw_PB_block(srcimg,img,stride,xb, yb+CbSize*3/4,CbSize ,CbSize/4, what,color,pixelSize); + break; + case PART_nLx2N: + draw_PB_block(srcimg,img,stride,xb, yb, CbSize/4 ,CbSize, what,color,pixelSize); + draw_PB_block(srcimg,img,stride,xb+CbSize/4 ,yb, CbSize*3/4,CbSize, what,color,pixelSize); + break; + case PART_nRx2N: + draw_PB_block(srcimg,img,stride,xb, yb, CbSize*3/4,CbSize, what,color,pixelSize); + draw_PB_block(srcimg,img,stride,xb+CbSize*3/4,yb, CbSize/4 ,CbSize, what,color,pixelSize); + break; + default: + assert(false); + break; + } + } + else if (what==IntraPredMode) { + enum PredMode predMode = srcimg->get_pred_mode(xb,yb); + if (predMode == MODE_INTRA) { + enum PartMode partMode = srcimg->get_PartMode(xb,yb); + + int HalfCbSize = (1<<(log2CbSize-1)); + + switch (partMode) { + case PART_2Nx2N: + draw_intra_pred_mode(srcimg,img,stride,xb,yb,log2CbSize, + srcimg->get_IntraPredMode(xb,yb), color,pixelSize); + break; + case PART_NxN: + draw_intra_pred_mode(srcimg,img,stride,xb, yb, log2CbSize-1, + srcimg->get_IntraPredMode(xb,yb), color,pixelSize); + draw_intra_pred_mode(srcimg,img,stride,xb+HalfCbSize,yb, log2CbSize-1, + 
srcimg->get_IntraPredMode(xb+HalfCbSize,yb), color,pixelSize); + draw_intra_pred_mode(srcimg,img,stride,xb ,yb+HalfCbSize,log2CbSize-1, + srcimg->get_IntraPredMode(xb,yb+HalfCbSize), color,pixelSize); + draw_intra_pred_mode(srcimg,img,stride,xb+HalfCbSize,yb+HalfCbSize,log2CbSize-1, + srcimg->get_IntraPredMode(xb+HalfCbSize,yb+HalfCbSize), color,pixelSize); + break; + default: + assert(false); + break; + } + } + } + } +} + + +LIBDE265_API void draw_CB_grid(const de265_image* img, uint8_t* dst, int stride, uint32_t color,int pixelSize) +{ + draw_tree_grid(img,dst,stride,color,pixelSize, Partitioning_CB); +} + +LIBDE265_API void draw_TB_grid(const de265_image* img, uint8_t* dst, int stride, uint32_t color,int pixelSize) +{ + draw_tree_grid(img,dst,stride,color,pixelSize, Partitioning_TB); +} + +LIBDE265_API void draw_PB_grid(const de265_image* img, uint8_t* dst, int stride, uint32_t color,int pixelSize) +{ + draw_tree_grid(img,dst,stride,color,pixelSize, Partitioning_PB); +} + +LIBDE265_API void draw_intra_pred_modes(const de265_image* img, uint8_t* dst, int stride, uint32_t color,int pixelSize) +{ + draw_tree_grid(img,dst,stride,color,pixelSize, IntraPredMode); +} + +LIBDE265_API void draw_PB_pred_modes(const de265_image* img, uint8_t* dst, int stride, int pixelSize) +{ + draw_tree_grid(img,dst,stride,0,pixelSize, PBPredMode); +} + +LIBDE265_API void draw_QuantPY(const de265_image* img, uint8_t* dst, int stride, int pixelSize) +{ + draw_tree_grid(img,dst,stride,0,pixelSize, QuantP_Y); +} + +LIBDE265_API void draw_Motion(const de265_image* img, uint8_t* dst, int stride, int pixelSize) +{ + draw_tree_grid(img,dst,stride,0,pixelSize, PBMotionVectors); +} + +LIBDE265_API void draw_Slices(const de265_image* img, uint8_t* dst, int stride, int pixelSize) +{ + const seq_parameter_set& sps = img->get_sps(); + + // --- mark first CTB in slice (red - independent / green - dependent) --- + + for (int ctby=0;ctby0 || ctby>0) { prevCtbRS = img->get_pps().CtbAddrTStoRS[ 
img->get_pps().CtbAddrRStoTS[ctbAddrRS] -1 ]; } + + if (prevCtbRS<0 || + img->get_SliceHeaderIndex_atIndex(ctbAddrRS) != + img->get_SliceHeaderIndex_atIndex(prevCtbRS)) { + int step=2; + int fillcolor = 0xFF0000; + + if (img->get_SliceHeaderCtb(ctbx,ctby)->dependent_slice_segment_flag) { + step=2; + fillcolor = 0x00FF00; + } + + for (int x=0;x<1<0 && (img->get_SliceHeaderIndexCtb(ctbx ,ctby) != + img->get_SliceHeaderIndexCtb(ctbx-1,ctby))) { + int x = ctbx << sps.Log2CtbSizeY; + int y0 = ctby << sps.Log2CtbSizeY; + + for (int y=y0; + (y0 && (img->get_SliceHeaderIndexCtb(ctbx,ctby ) != + img->get_SliceHeaderIndexCtb(ctbx,ctby-1))) { + int x0 = ctbx << sps.Log2CtbSizeY; + int y = ctby << sps.Log2CtbSizeY; + + for (int x=x0 ; + (xget_sps(); + const pic_parameter_set& pps = img->get_pps(); + + + for (int tx=1;tx + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#include "vps.h" +#include "util.h" +#include "decctx.h" + +#include + + +void profile_data::set_defaults(enum profile_idc profile, int level_major, int level_minor) +{ + profile_present_flag = 1; + + profile_space = 0; + tier_flag = 0; + profile_idc = profile; + + for (int i=0;i<32;i++) { + profile_compatibility_flag[i]=0; + } + + switch (profile) { + case Profile_Main: + profile_compatibility_flag[Profile_Main]=1; + profile_compatibility_flag[Profile_Main10]=1; + break; + case Profile_Main10: + profile_compatibility_flag[Profile_Main10]=1; + break; + default: + assert(0); + } + + progressive_source_flag = 0; + interlaced_source_flag = 0; + non_packed_constraint_flag = 0; + frame_only_constraint_flag = 0; + + + // --- level --- + + level_present_flag = 1; + level_idc = level_major*30 + level_minor*3; +} + + +void video_parameter_set::set_defaults(enum profile_idc profile, int level_major, int level_minor) +{ + video_parameter_set_id = 0; + vps_max_layers = 1; // always =1 in current version of standard + vps_max_sub_layers = 1; // temporal sub-layers + vps_temporal_id_nesting_flag = 1; + + profile_tier_level_.general.set_defaults(profile,level_major,level_minor); + + vps_sub_layer_ordering_info_present_flag = 0; + layer[0].vps_max_dec_pic_buffering = 1; + layer[0].vps_max_num_reorder_pics = 0; + layer[0].vps_max_latency_increase = 0; + + vps_max_layer_id = 0; + vps_num_layer_sets = 1; + + layer_id_included_flag.resize(vps_num_layer_sets); + + + // --- timing info --- + + vps_timing_info_present_flag = 0; + vps_num_units_in_tick = 0; + vps_time_scale = 0; + vps_poc_proportional_to_timing_flag = 0; + + vps_num_ticks_poc_diff_one = 0; + vps_num_hrd_parameters = 0; + + + // --- vps extension --- + + vps_extension_flag = 0; +} + + +de265_error video_parameter_set::read(error_queue* errqueue, bitreader* reader) +{ + int vlc; + + video_parameter_set_id = vlc = get_bits(reader, 4); + if (vlc >= DE265_MAX_VPS_SETS) return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + 
+ skip_bits(reader, 2); + vps_max_layers = vlc = get_bits(reader,6) +1; + if (vlc > 63) return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; // vps_max_layers_minus1 (range 0...63) + + vps_max_sub_layers = vlc = get_bits(reader,3) +1; + if (vlc >= MAX_TEMPORAL_SUBLAYERS) return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + + vps_temporal_id_nesting_flag = get_bits(reader,1); + skip_bits(reader, 16); + + profile_tier_level_.read(reader, vps_max_sub_layers); + + /* + read_bit_rate_pic_rate_info(reader, &bit_rate_pic_rate_info, + 0, vps_max_sub_layers-1); + */ + + vps_sub_layer_ordering_info_present_flag = get_bits(reader,1); + //assert(vps_max_sub_layers-1 < MAX_TEMPORAL_SUBLAYERS); + + int firstLayerRead = vps_sub_layer_ordering_info_present_flag ? 0 : (vps_max_sub_layers-1); + + for (int i=firstLayerRead;i=1024 || + vps_num_layer_sets == UVLC_ERROR) { + errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + vps_num_layer_sets += 1; + + layer_id_included_flag.resize(vps_num_layer_sets); + + for (int i=1; i <= vps_num_layer_sets-1; i++) + { + layer_id_included_flag[i].resize(vps_max_layer_id+1); + + for (int j=0; j <= vps_max_layer_id; j++) + { + layer_id_included_flag[i][j] = get_bits(reader,1); + } + } + + vps_timing_info_present_flag = get_bits(reader,1); + + if (vps_timing_info_present_flag) { + vps_num_units_in_tick = get_bits(reader,32); + vps_time_scale = get_bits(reader,32); + vps_poc_proportional_to_timing_flag = get_bits(reader,1); + + if (vps_poc_proportional_to_timing_flag) { + vps_num_ticks_poc_diff_one = get_uvlc(reader)+1; + vps_num_hrd_parameters = get_uvlc(reader); + + if (vps_num_hrd_parameters >= 1024 || vps_num_hrd_parameters < 0) { + errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + hrd_layer_set_idx .resize(vps_num_hrd_parameters); + cprms_present_flag.resize(vps_num_hrd_parameters); + + for (int 
i=0; i 0) { + cprms_present_flag[i] = get_bits(reader,1); + } + + //hrd_parameters(cprms_present_flag[i], vps_max_sub_layers_minus1) + + return DE265_OK; // TODO: decode hrd_parameters() + } + } + } + + vps_extension_flag = get_bits(reader,1); + + if (vps_extension_flag) { + /* + while( more_rbsp_data() ) + vps_extension_data_flag u(1) + rbsp_trailing_bits() + */ + } + + return DE265_OK; +} + + +de265_error video_parameter_set::write(error_queue* errqueue, CABAC_encoder& out) const +{ + if (video_parameter_set_id >= DE265_MAX_VPS_SETS) return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + out.write_bits(video_parameter_set_id,4); + + out.write_bits(0x3,2); + out.write_bits(vps_max_layers-1,6); + + if (vps_max_sub_layers >= MAX_TEMPORAL_SUBLAYERS) return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + out.write_bits(vps_max_sub_layers-1,3); + + out.write_bit(vps_temporal_id_nesting_flag); + out.write_bits(0xFFFF, 16); + + profile_tier_level_.write(out, vps_max_sub_layers); + + /* + read_bit_rate_pic_rate_info(reader, &bit_rate_pic_rate_info, + 0, vps_max_sub_layers-1); + */ + + out.write_bit(vps_sub_layer_ordering_info_present_flag); + //assert(vps_max_sub_layers-1 < MAX_TEMPORAL_SUBLAYERS); + + int firstLayerRead = vps_sub_layer_ordering_info_present_flag ? 
0 : (vps_max_sub_layers-1); + + for (int i=firstLayerRead;i=1024) { + errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; + } + + out.write_bits(vps_max_layer_id,6); + out.write_uvlc(vps_num_layer_sets-1); + + for (int i=1; i <= vps_num_layer_sets-1; i++) + for (int j=0; j <= vps_max_layer_id; j++) + { + out.write_bit(layer_id_included_flag[i][j]); + } + + out.write_bit(vps_timing_info_present_flag); + + if (vps_timing_info_present_flag) { + out.write_bits(vps_num_units_in_tick,32); + out.write_bits(vps_time_scale ,32); + out.write_bit (vps_poc_proportional_to_timing_flag); + + if (vps_poc_proportional_to_timing_flag) { + out.write_uvlc(vps_num_ticks_poc_diff_one-1); + out.write_uvlc(vps_num_hrd_parameters); + + for (int i=0; i 0) { + out.write_bit(cprms_present_flag[i]); + } + + //hrd_parameters(cprms_present_flag[i], vps_max_sub_layers_minus1) + + return DE265_OK; // TODO: decode hrd_parameters() + } + } + } + + out.write_bit(vps_extension_flag); + + if (vps_extension_flag) { + /* + while( more_rbsp_data() ) + vps_extension_data_flag u(1) + rbsp_trailing_bits() + */ + } + + return DE265_OK; +} + + +void profile_data::read(bitreader* reader) +{ + if (profile_present_flag) { + profile_space = get_bits(reader,2); + tier_flag = get_bits(reader,1); + profile_idc = (enum profile_idc)get_bits(reader,5); + + for (int i=0; i<32; i++) { + profile_compatibility_flag[i] = get_bits(reader,1); + } + + progressive_source_flag = get_bits(reader,1); + interlaced_source_flag = get_bits(reader,1); + non_packed_constraint_flag = get_bits(reader,1); + frame_only_constraint_flag = get_bits(reader,1); + skip_bits(reader,44); + } + + if (level_present_flag) { + level_idc = get_bits(reader,8); + } +} + + +void profile_tier_level::read(bitreader* reader, + int max_sub_layers) +{ + // --- read the general profile --- + + general.profile_present_flag = true; + general.level_present_flag = true; + general.read(reader); + 
+ + // --- read the profile/levels of the sub-layers --- + + for (int i=0; i 1) + { + for (int i=max_sub_layers-1; i<8; i++) + { + skip_bits(reader,2); + } + } + + for (int i=0; i 1) + { + for (int i=max_sub_layers-1; i<8; i++) + { + out.skip_bits(2); + } + } + + for (int i=0; ibit_rate_info_present_flag[i] = get_bits(reader,1); + hdr->pic_rate_info_present_flag[i] = get_bits(reader,1); + + if (hdr->bit_rate_info_present_flag[i]) { + hdr->avg_bit_rate[i] = get_bits(reader,16); + hdr->max_bit_rate[i] = get_bits(reader,16); + } + + if (hdr->pic_rate_info_present_flag[i]) { + hdr->constant_pic_rate_idc[i] = get_bits(reader,2); + hdr->avg_pic_rate[i] = get_bits(reader,16); + } + } +} +*/ + + + +#define LOG0(t) log2fh(fh, t) +#define LOG1(t,d) log2fh(fh, t,d) +#define LOG2(t,d1,d2) log2fh(fh, t,d1,d2) +#define LOG3(t,d1,d2,d3) log2fh(fh, t,d1,d2,d3) + +void video_parameter_set::dump(int fd) const +{ + FILE* fh; + if (fd==1) fh=stdout; + else if (fd==2) fh=stderr; + else { return; } + + LOG0("----------------- VPS -----------------\n"); + LOG1("video_parameter_set_id : %d\n", video_parameter_set_id); + LOG1("vps_max_layers : %d\n", vps_max_layers); + LOG1("vps_max_sub_layers : %d\n", vps_max_sub_layers); + LOG1("vps_temporal_id_nesting_flag : %d\n", vps_temporal_id_nesting_flag); + + profile_tier_level_.dump(vps_max_sub_layers, fh); + //dump_bit_rate_pic_rate_info(&bit_rate_pic_rate_info, 0, vps_max_sub_layers-1); + + LOG1("vps_sub_layer_ordering_info_present_flag : %d\n", + vps_sub_layer_ordering_info_present_flag); + + if (vps_sub_layer_ordering_info_present_flag) { + for (int i=0;i 0) { + LOG2("cprms_present_flag[%d] = %d\n", i, cprms_present_flag[i]); + } + + //hrd_parameters(cprms_present_flag[i], vps_max_sub_layers_minus1) + + return; // TODO: decode hrd_parameters() + } + } + } + + LOG1("vps_extension_flag = %d\n", vps_extension_flag); +} + + +static const char* profile_name(profile_idc p) +{ + switch (p) { + case Profile_Main: return "Main"; + case 
Profile_Main10: return "Main10"; + case Profile_MainStillPicture: return "MainStillPicture"; + case Profile_FormatRangeExtensions: return "FormatRangeExtensions"; + default: + return "(unknown)"; + } +} + + +void profile_data::dump(bool general, FILE* fh) const +{ + const char* prefix = (general ? "general" : "sub_layer"); + + if (profile_present_flag) { + LOG2(" %s_profile_space : %d\n", prefix,profile_space); + LOG2(" %s_tier_flag : %d\n", prefix,tier_flag); + LOG2(" %s_profile_idc : %s\n", prefix, profile_name(profile_idc)); + + LOG1(" %s_profile_compatibility_flags: ", prefix); + for (int i=0; i<32; i++) { + if (i) LOG0("*,"); + LOG1("*%d",profile_compatibility_flag[i]); + } + LOG0("*\n"); + LOG2(" %s_progressive_source_flag : %d\n",prefix,progressive_source_flag); + LOG2(" %s_interlaced_source_flag : %d\n",prefix,interlaced_source_flag); + LOG2(" %s_non_packed_constraint_flag : %d\n",prefix,non_packed_constraint_flag); + LOG2(" %s_frame_only_constraint_flag : %d\n",prefix,frame_only_constraint_flag); + } + + if (level_present_flag) { + LOG3(" %s_level_idc : %d (%4.2f)\n", prefix,level_idc, level_idc/30.0f); + } +} + + +void profile_tier_level::dump(int max_sub_layers, FILE* fh) const +{ + general.dump(true, fh); + + for (int i=0; ibit_rate_info_present_flag[i]) { + LOG(" avg_bit_rate : %d\n", hdr->avg_bit_rate[i]); + LOG(" max_bit_rate : %d\n", hdr->max_bit_rate[i]); + } + + if (hdr->pic_rate_info_present_flag[i]) { + LOG(" constant_pic_rate_idc : %d\n", hdr->constant_pic_rate_idc[i]); + LOG(" avg_pic_rate[i] : %d\n", hdr->avg_pic_rate[i]); + } + } +} +*/ diff --git a/vui.cc b/vui.cc new file mode 100644 index 0000000..5524fa8 --- /dev/null +++ b/vui.cc @@ -0,0 +1,425 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. 
+ * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#include "vui.h" +#include "decctx.h" + +#include +#include +#include + +#define READ_VLC_OFFSET(variable, vlctype, offset) \ + if ((vlc = get_ ## vlctype(br)) == UVLC_ERROR) { \ + errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); \ + return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; \ + } \ + variable = vlc + offset; + +#define READ_VLC(variable, vlctype) READ_VLC_OFFSET(variable,vlctype,0) + + +#define NUM_SAR_PRESETS 17 + +static uint16_t sar_presets[NUM_SAR_PRESETS+1][2] = { + { 0,0 }, + { 1,1 }, + { 12,11 }, + { 10,11 }, + { 16,11 }, + { 40,33 }, + { 24,11 }, + { 20,11 }, + { 32,11 }, + { 80,33 }, + { 18,11 }, + { 15,11 }, + { 64,33 }, + { 160,99 }, + { 4,3 }, + { 3,2 }, + { 2,1 } +}; + +#define EXTENDED_SAR 255 + + +const char* get_video_format_name(enum VideoFormat format) +{ + switch (format) { + case VideoFormat_Component: return "component"; + case VideoFormat_PAL: return "PAL"; + case VideoFormat_NTSC: return "NTSC"; + case VideoFormat_SECAM: return "SECAM"; + case VideoFormat_MAC: return "MAC"; + default: return "unspecified"; + } +} + + +video_usability_information::video_usability_information() +{ + aspect_ratio_info_present_flag = false; + sar_width = 0; + sar_height = 0; + + + // --- overscan --- + + overscan_info_present_flag = false; + overscan_appropriate_flag = false; + + + // --- video 
signal type --- + + video_signal_type_present_flag = false; + video_format = VideoFormat_Unspecified; + video_full_range_flag = false; + colour_description_present_flag = false; + colour_primaries = 2; + transfer_characteristics = 2; + matrix_coeffs = 2; + + // --- chroma / interlaced --- + + chroma_loc_info_present_flag = false; + chroma_sample_loc_type_top_field = 0; + chroma_sample_loc_type_bottom_field = 0; + + neutral_chroma_indication_flag = false; + field_seq_flag = false; + frame_field_info_present_flag = false; + + // --- default display window --- + + default_display_window_flag = false; + def_disp_win_left_offset = 0; + def_disp_win_right_offset = 0; + def_disp_win_top_offset = 0; + def_disp_win_bottom_offset = 0; + + + // --- timing --- + + vui_timing_info_present_flag = false; + vui_num_units_in_tick = 0; + vui_time_scale = 0; + + vui_poc_proportional_to_timing_flag = false; + vui_num_ticks_poc_diff_one = 1; + + + // --- hrd parameters --- + + vui_hrd_parameters_present_flag = false; + //hrd_parameters vui_hrd_parameters; + + + // --- bitstream restriction --- + + bitstream_restriction_flag = false; + tiles_fixed_structure_flag = false; + motion_vectors_over_pic_boundaries_flag = true; + restricted_ref_pic_lists_flag = false; + min_spatial_segmentation_idc = 0; + max_bytes_per_pic_denom = 2; + max_bits_per_min_cu_denom = 1; + log2_max_mv_length_horizontal = 15; + log2_max_mv_length_vertical = 15; +} + + +de265_error video_usability_information::read(error_queue* errqueue, bitreader* br, + const seq_parameter_set* sps) +{ + int vlc; + + + // --- sample aspect ratio (SAR) --- + + aspect_ratio_info_present_flag = get_bits(br,1); + if (aspect_ratio_info_present_flag) { + int aspect_ratio_idc = get_bits(br,8); + if (aspect_ratio_idc <= NUM_SAR_PRESETS) { + sar_width = sar_presets[aspect_ratio_idc][0]; + sar_height = sar_presets[aspect_ratio_idc][1]; + } + else if (aspect_ratio_idc == EXTENDED_SAR) { + sar_width = get_bits(br,16); + sar_height = 
get_bits(br,16); + } + else { + sar_width = 0; + sar_height = 0; + } + } + else { + sar_width = 0; + sar_height = 0; + } + + + // --- overscan --- + + overscan_info_present_flag = get_bits(br,1); + if (overscan_info_present_flag) { + overscan_appropriate_flag = get_bits(br,1); + } + + + // --- video signal type --- + + { // defaults + video_format = VideoFormat_Unspecified; + video_full_range_flag = false; + colour_primaries = 2; + transfer_characteristics = 2; + matrix_coeffs = 2; + } + + video_signal_type_present_flag = get_bits(br,1); + if (video_signal_type_present_flag) { + int video_format_idc = get_bits(br,3); + if (video_format_idc > 5) { + video_format_idc = VideoFormat_Unspecified; + } + video_format = (VideoFormat)video_format_idc; + + video_full_range_flag = get_bits(br,1); + + colour_description_present_flag = get_bits(br,1); + if (colour_description_present_flag) { + colour_primaries = get_bits(br,8); + if (colour_primaries == 0 || + colour_primaries == 3 || + colour_primaries >= 11) { + colour_primaries = 2; + } + + transfer_characteristics = get_bits(br,8); + if (transfer_characteristics == 0 || + transfer_characteristics == 3 || + transfer_characteristics >= 18) { + transfer_characteristics = 2; + } + + matrix_coeffs = get_bits(br,8); + if (matrix_coeffs == 0 || + matrix_coeffs >= 11) { + matrix_coeffs = 2; + } + } + } + + + // --- chroma / interlaced --- + + chroma_loc_info_present_flag = get_bits(br,1); + if (chroma_loc_info_present_flag) { + READ_VLC(chroma_sample_loc_type_top_field, uvlc); + READ_VLC(chroma_sample_loc_type_bottom_field, uvlc); + } + else { + chroma_sample_loc_type_top_field = 0; + chroma_sample_loc_type_bottom_field = 0; + } + + neutral_chroma_indication_flag = get_bits(br,1); + field_seq_flag = get_bits(br,1); + frame_field_info_present_flag = get_bits(br,1); + + + // --- default display window --- + + default_display_window_flag = get_bits(br,1); + if (default_display_window_flag) { + READ_VLC(def_disp_win_left_offset ,uvlc); 
+ READ_VLC(def_disp_win_right_offset ,uvlc); + READ_VLC(def_disp_win_top_offset ,uvlc); + READ_VLC(def_disp_win_bottom_offset,uvlc); + } + else { + def_disp_win_left_offset =0; + def_disp_win_right_offset =0; + def_disp_win_top_offset =0; + def_disp_win_bottom_offset=0; + } + + + // --- timing --- + + vui_timing_info_present_flag = get_bits(br,1); + if (vui_timing_info_present_flag) { + vui_num_units_in_tick = get_bits(br,32); + vui_time_scale = get_bits(br,32); + } + + vui_poc_proportional_to_timing_flag = get_bits(br,1); + READ_VLC_OFFSET(vui_num_ticks_poc_diff_one, uvlc, 1); + + + // --- hrd parameters --- + + vui_hrd_parameters_present_flag = get_bits(br,1); + if (vui_hrd_parameters_present_flag) { + return DE265_ERROR_NOT_IMPLEMENTED_YET; + //hrd_parameters vui_hrd_parameters; + } + + + // --- bitstream restriction --- + + bitstream_restriction_flag = get_bits(br,1); + if (bitstream_restriction_flag) { + tiles_fixed_structure_flag = get_bits(br,1); + motion_vectors_over_pic_boundaries_flag = get_bits(br,1); + restricted_ref_pic_lists_flag = get_bits(br,1); + + READ_VLC(min_spatial_segmentation_idc, uvlc); + if (min_spatial_segmentation_idc > 4095) { + errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); + min_spatial_segmentation_idc = 0; + } + + READ_VLC(max_bytes_per_pic_denom, uvlc); + if (max_bytes_per_pic_denom > 16) { + errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); + max_bytes_per_pic_denom = 2; + } + + READ_VLC(max_bits_per_min_cu_denom, uvlc); + if (max_bits_per_min_cu_denom > 16) { + errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); + max_bits_per_min_cu_denom = 1; + } + + READ_VLC(log2_max_mv_length_horizontal, uvlc); + if (log2_max_mv_length_horizontal > 15) { + errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); + log2_max_mv_length_horizontal = 15; + } + + READ_VLC(log2_max_mv_length_vertical, uvlc); + if (log2_max_mv_length_vertical > 15) { + 
errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); + log2_max_mv_length_vertical = 15; + } + } + else { + tiles_fixed_structure_flag = false; + motion_vectors_over_pic_boundaries_flag = true; + restricted_ref_pic_lists_flag = false; // NOTE: default not specified in standard 2014/10 + + min_spatial_segmentation_idc = 0; + max_bytes_per_pic_denom = 2; + max_bits_per_min_cu_denom = 1; + log2_max_mv_length_horizontal = 15; + log2_max_mv_length_vertical = 15; + } + + //vui_read = true; + + return DE265_OK; +} + + +void video_usability_information::dump(int fd) const +{ + //#if (_MSC_VER >= 1500) + //#define LOG0(t) loginfo(LogHeaders, t) + //#define LOG1(t,d) loginfo(LogHeaders, t,d) + //#define LOG2(t,d1,d2) loginfo(LogHeaders, t,d1,d2) + //#define LOG3(t,d1,d2,d3) loginfo(LogHeaders, t,d1,d2,d3) + + FILE* fh; + if (fd==1) fh=stdout; + else if (fd==2) fh=stderr; + else { return; } + +#define LOG0(t) log2fh(fh, t) +#define LOG1(t,d) log2fh(fh, t,d) +#define LOG2(t,d1,d2) log2fh(fh, t,d1,d2) +#define LOG3(t,d1,d2,d3) log2fh(fh, t,d1,d2,d3) + + LOG0("----------------- VUI -----------------\n"); + LOG2("sample aspect ratio : %d:%d\n", sar_width,sar_height); + LOG1("overscan_info_present_flag : %d\n", overscan_info_present_flag); + LOG1("overscan_appropriate_flag : %d\n", overscan_appropriate_flag); + + LOG1("video_signal_type_present_flag: %d\n", video_signal_type_present_flag); + if (video_signal_type_present_flag) { + LOG1(" video_format : %s\n", get_video_format_name(video_format)); + LOG1(" video_full_range_flag : %d\n", video_full_range_flag); + LOG1(" colour_description_present_flag : %d\n", colour_description_present_flag); + LOG1(" colour_primaries : %d\n", colour_primaries); + LOG1(" transfer_characteristics : %d\n", transfer_characteristics); + LOG1(" matrix_coeffs : %d\n", matrix_coeffs); + } + + LOG1("chroma_loc_info_present_flag: %d\n", chroma_loc_info_present_flag); + if (chroma_loc_info_present_flag) { + LOG1(" 
chroma_sample_loc_type_top_field : %d\n", chroma_sample_loc_type_top_field); + LOG1(" chroma_sample_loc_type_bottom_field: %d\n", chroma_sample_loc_type_bottom_field); + } + + LOG1("neutral_chroma_indication_flag: %d\n", neutral_chroma_indication_flag); + LOG1("field_seq_flag : %d\n", field_seq_flag); + LOG1("frame_field_info_present_flag : %d\n", frame_field_info_present_flag); + + LOG1("default_display_window_flag : %d\n", default_display_window_flag); + LOG1(" def_disp_win_left_offset : %d\n", def_disp_win_left_offset); + LOG1(" def_disp_win_right_offset : %d\n", def_disp_win_right_offset); + LOG1(" def_disp_win_top_offset : %d\n", def_disp_win_top_offset); + LOG1(" def_disp_win_bottom_offset : %d\n", def_disp_win_bottom_offset); + + LOG1("vui_timing_info_present_flag : %d\n", vui_timing_info_present_flag); + if (vui_timing_info_present_flag) { + LOG1(" vui_num_units_in_tick : %d\n", vui_num_units_in_tick); + LOG1(" vui_time_scale : %d\n", vui_time_scale); + } + + LOG1("vui_poc_proportional_to_timing_flag : %d\n", vui_poc_proportional_to_timing_flag); + LOG1("vui_num_ticks_poc_diff_one : %d\n", vui_num_ticks_poc_diff_one); + + LOG1("vui_hrd_parameters_present_flag : %d\n", vui_hrd_parameters_present_flag); + if (vui_hrd_parameters_present_flag) { + //hrd_parameters vui_hrd_parameters; + } + + + // --- bitstream restriction --- + + LOG1("bitstream_restriction_flag : %d\n", bitstream_restriction_flag); + if (bitstream_restriction_flag) { + LOG1(" tiles_fixed_structure_flag : %d\n", tiles_fixed_structure_flag); + LOG1(" motion_vectors_over_pic_boundaries_flag : %d\n", motion_vectors_over_pic_boundaries_flag); + LOG1(" restricted_ref_pic_lists_flag : %d\n", restricted_ref_pic_lists_flag); + LOG1(" min_spatial_segmentation_idc : %d\n", min_spatial_segmentation_idc); + LOG1(" max_bytes_per_pic_denom : %d\n", max_bytes_per_pic_denom); + LOG1(" max_bits_per_min_cu_denom : %d\n", max_bits_per_min_cu_denom); + LOG1(" log2_max_mv_length_horizontal : %d\n", 
log2_max_mv_length_horizontal); + LOG1(" log2_max_mv_length_vertical : %d\n", log2_max_mv_length_vertical); + } + +#undef LOG0 +#undef LOG1 +#undef LOG2 +#undef LOG3 + //#endif +} diff --git a/x86/.deps/libde265_x86_la-sse.Plo b/x86/.deps/libde265_x86_la-sse.Plo new file mode 100644 index 0000000..d9c963b --- /dev/null +++ b/x86/.deps/libde265_x86_la-sse.Plo @@ -0,0 +1,64 @@ +libde265_x86_la-sse.lo: sse.cc /usr/include/stdc-predef.h \ + ../../libde265/x86/sse.h ../../libde265/acceleration.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/stddef.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/stdint.h \ + /usr/include/stdint.h /usr/include/bits/libc-header-start.h \ + /usr/include/features.h /usr/include/sys/cdefs.h \ + /usr/include/bits/wordsize.h /usr/include/bits/long-double.h \ + /usr/include/gnu/stubs.h /usr/include/gnu/stubs-64.h \ + /usr/include/bits/types.h /usr/include/bits/timesize.h \ + /usr/include/bits/typesizes.h /usr/include/bits/time64.h \ + /usr/include/bits/wchar.h /usr/include/bits/stdint-intn.h \ + /usr/include/bits/stdint-uintn.h /usr/include/assert.h \ + ../../libde265/x86/sse-motion.h ../../libde265/x86/sse-dct.h \ + ../../config.h /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/cpuid.h + +/usr/include/stdc-predef.h: + +../../libde265/x86/sse.h: + +../../libde265/acceleration.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/stddef.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/stdint.h: + +/usr/include/stdint.h: + +/usr/include/bits/libc-header-start.h: + +/usr/include/features.h: + +/usr/include/sys/cdefs.h: + +/usr/include/bits/wordsize.h: + +/usr/include/bits/long-double.h: + +/usr/include/gnu/stubs.h: + +/usr/include/gnu/stubs-64.h: + +/usr/include/bits/types.h: + +/usr/include/bits/timesize.h: + +/usr/include/bits/typesizes.h: + +/usr/include/bits/time64.h: + +/usr/include/bits/wchar.h: + +/usr/include/bits/stdint-intn.h: + +/usr/include/bits/stdint-uintn.h: + +/usr/include/assert.h: + +../../libde265/x86/sse-motion.h: + 
+../../libde265/x86/sse-dct.h: + +../../config.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/cpuid.h: diff --git a/x86/.deps/libde265_x86_sse_la-sse-dct.Plo b/x86/.deps/libde265_x86_sse_la-sse-dct.Plo new file mode 100644 index 0000000..418d8f2 --- /dev/null +++ b/x86/.deps/libde265_x86_sse_la-sse-dct.Plo @@ -0,0 +1,431 @@ +libde265_x86_sse_la-sse-dct.lo: sse-dct.cc /usr/include/stdc-predef.h \ + ../../libde265/x86/sse-dct.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/stddef.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/stdint.h \ + /usr/include/stdint.h /usr/include/bits/libc-header-start.h \ + /usr/include/features.h /usr/include/sys/cdefs.h \ + /usr/include/bits/wordsize.h /usr/include/bits/long-double.h \ + /usr/include/gnu/stubs.h /usr/include/gnu/stubs-64.h \ + /usr/include/bits/types.h /usr/include/bits/timesize.h \ + /usr/include/bits/typesizes.h /usr/include/bits/time64.h \ + /usr/include/bits/wchar.h /usr/include/bits/stdint-intn.h \ + /usr/include/bits/stdint-uintn.h ../../libde265/util.h ../../config.h \ + /usr/include/inttypes.h /usr/include/stdio.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/stdarg.h \ + /usr/include/bits/types/__fpos_t.h /usr/include/bits/types/__mbstate_t.h \ + /usr/include/bits/types/__fpos64_t.h /usr/include/bits/types/__FILE.h \ + /usr/include/bits/types/FILE.h /usr/include/bits/types/struct_FILE.h \ + /usr/include/bits/types/cookie_io_functions_t.h \ + /usr/include/bits/stdio_lim.h /usr/include/bits/sys_errlist.h \ + /usr/include/bits/stdio.h /usr/include/c++/9.2.0/string \ + /usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/c++config.h \ + /usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/os_defines.h \ + /usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/cpu_defines.h \ + /usr/include/c++/9.2.0/bits/stringfwd.h \ + /usr/include/c++/9.2.0/bits/memoryfwd.h \ + /usr/include/c++/9.2.0/bits/char_traits.h \ + /usr/include/c++/9.2.0/bits/stl_algobase.h \ + /usr/include/c++/9.2.0/bits/functexcept.h \ + 
/usr/include/c++/9.2.0/bits/exception_defines.h \ + /usr/include/c++/9.2.0/bits/cpp_type_traits.h \ + /usr/include/c++/9.2.0/ext/type_traits.h \ + /usr/include/c++/9.2.0/ext/numeric_traits.h \ + /usr/include/c++/9.2.0/bits/stl_pair.h \ + /usr/include/c++/9.2.0/bits/move.h \ + /usr/include/c++/9.2.0/bits/concept_check.h \ + /usr/include/c++/9.2.0/type_traits \ + /usr/include/c++/9.2.0/bits/stl_iterator_base_types.h \ + /usr/include/c++/9.2.0/bits/stl_iterator_base_funcs.h \ + /usr/include/c++/9.2.0/debug/assertions.h \ + /usr/include/c++/9.2.0/bits/stl_iterator.h \ + /usr/include/c++/9.2.0/bits/ptr_traits.h \ + /usr/include/c++/9.2.0/debug/debug.h \ + /usr/include/c++/9.2.0/bits/predefined_ops.h \ + /usr/include/c++/9.2.0/bits/postypes.h /usr/include/c++/9.2.0/cwchar \ + /usr/include/wchar.h /usr/include/bits/floatn.h \ + /usr/include/bits/floatn-common.h /usr/include/bits/types/wint_t.h \ + /usr/include/bits/types/mbstate_t.h /usr/include/bits/types/locale_t.h \ + /usr/include/bits/types/__locale_t.h /usr/include/c++/9.2.0/cstdint \ + /usr/include/c++/9.2.0/bits/allocator.h \ + /usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/c++allocator.h \ + /usr/include/c++/9.2.0/ext/new_allocator.h /usr/include/c++/9.2.0/new \ + /usr/include/c++/9.2.0/exception /usr/include/c++/9.2.0/bits/exception.h \ + /usr/include/c++/9.2.0/bits/exception_ptr.h \ + /usr/include/c++/9.2.0/bits/cxxabi_init_exception.h \ + /usr/include/c++/9.2.0/typeinfo /usr/include/c++/9.2.0/bits/hash_bytes.h \ + /usr/include/c++/9.2.0/bits/nested_exception.h \ + /usr/include/c++/9.2.0/bits/localefwd.h \ + /usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/c++locale.h \ + /usr/include/c++/9.2.0/clocale /usr/include/locale.h \ + /usr/include/bits/locale.h /usr/include/c++/9.2.0/iosfwd \ + /usr/include/c++/9.2.0/cctype /usr/include/ctype.h /usr/include/endian.h \ + /usr/include/bits/endian.h /usr/include/bits/byteswap.h \ + /usr/include/bits/uintn-identity.h \ + /usr/include/c++/9.2.0/bits/ostream_insert.h \ + 
/usr/include/c++/9.2.0/bits/cxxabi_forced.h \ + /usr/include/c++/9.2.0/bits/stl_function.h \ + /usr/include/c++/9.2.0/backward/binders.h \ + /usr/include/c++/9.2.0/bits/range_access.h \ + /usr/include/c++/9.2.0/initializer_list \ + /usr/include/c++/9.2.0/bits/basic_string.h \ + /usr/include/c++/9.2.0/ext/atomicity.h \ + /usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/gthr.h \ + /usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/gthr-default.h \ + /usr/include/pthread.h /usr/include/sched.h \ + /usr/include/bits/types/time_t.h \ + /usr/include/bits/types/struct_timespec.h /usr/include/bits/sched.h \ + /usr/include/bits/types/struct_sched_param.h /usr/include/bits/cpu-set.h \ + /usr/include/time.h /usr/include/bits/time.h /usr/include/bits/timex.h \ + /usr/include/bits/types/struct_timeval.h \ + /usr/include/bits/types/clock_t.h /usr/include/bits/types/struct_tm.h \ + /usr/include/bits/types/clockid_t.h /usr/include/bits/types/timer_t.h \ + /usr/include/bits/types/struct_itimerspec.h \ + /usr/include/bits/pthreadtypes.h /usr/include/bits/thread-shared-types.h \ + /usr/include/bits/pthreadtypes-arch.h /usr/include/bits/setjmp.h \ + /usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/atomic_word.h \ + /usr/include/c++/9.2.0/ext/alloc_traits.h \ + /usr/include/c++/9.2.0/bits/alloc_traits.h \ + /usr/include/c++/9.2.0/ext/string_conversions.h \ + /usr/include/c++/9.2.0/cstdlib /usr/include/stdlib.h \ + /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \ + /usr/include/sys/types.h /usr/include/sys/select.h \ + /usr/include/bits/select.h /usr/include/bits/types/sigset_t.h \ + /usr/include/bits/types/__sigset_t.h /usr/include/alloca.h \ + /usr/include/bits/stdlib-bsearch.h /usr/include/bits/stdlib-float.h \ + /usr/include/c++/9.2.0/bits/std_abs.h /usr/include/c++/9.2.0/cstdio \ + /usr/include/c++/9.2.0/cerrno /usr/include/errno.h \ + /usr/include/bits/errno.h /usr/include/linux/errno.h \ + /usr/include/asm/errno.h /usr/include/asm-generic/errno.h \ + 
/usr/include/asm-generic/errno-base.h /usr/include/bits/types/error_t.h \ + /usr/include/c++/9.2.0/bits/functional_hash.h \ + /usr/include/c++/9.2.0/bits/basic_string.tcc ../../libde265/de265.h \ + ../../libde265/de265-version.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/emmintrin.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/xmmintrin.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/mmintrin.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/mm_malloc.h \ + /usr/include/c++/9.2.0/stdlib.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/tmmintrin.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/pmmintrin.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/smmintrin.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/popcntintrin.h + +/usr/include/stdc-predef.h: + +../../libde265/x86/sse-dct.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/stddef.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/stdint.h: + +/usr/include/stdint.h: + +/usr/include/bits/libc-header-start.h: + +/usr/include/features.h: + +/usr/include/sys/cdefs.h: + +/usr/include/bits/wordsize.h: + +/usr/include/bits/long-double.h: + +/usr/include/gnu/stubs.h: + +/usr/include/gnu/stubs-64.h: + +/usr/include/bits/types.h: + +/usr/include/bits/timesize.h: + +/usr/include/bits/typesizes.h: + +/usr/include/bits/time64.h: + +/usr/include/bits/wchar.h: + +/usr/include/bits/stdint-intn.h: + +/usr/include/bits/stdint-uintn.h: + +../../libde265/util.h: + +../../config.h: + +/usr/include/inttypes.h: + +/usr/include/stdio.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/stdarg.h: + +/usr/include/bits/types/__fpos_t.h: + +/usr/include/bits/types/__mbstate_t.h: + +/usr/include/bits/types/__fpos64_t.h: + +/usr/include/bits/types/__FILE.h: + +/usr/include/bits/types/FILE.h: + +/usr/include/bits/types/struct_FILE.h: + +/usr/include/bits/types/cookie_io_functions_t.h: + +/usr/include/bits/stdio_lim.h: + +/usr/include/bits/sys_errlist.h: + +/usr/include/bits/stdio.h: + 
+/usr/include/c++/9.2.0/string: + +/usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/c++config.h: + +/usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/os_defines.h: + +/usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/cpu_defines.h: + +/usr/include/c++/9.2.0/bits/stringfwd.h: + +/usr/include/c++/9.2.0/bits/memoryfwd.h: + +/usr/include/c++/9.2.0/bits/char_traits.h: + +/usr/include/c++/9.2.0/bits/stl_algobase.h: + +/usr/include/c++/9.2.0/bits/functexcept.h: + +/usr/include/c++/9.2.0/bits/exception_defines.h: + +/usr/include/c++/9.2.0/bits/cpp_type_traits.h: + +/usr/include/c++/9.2.0/ext/type_traits.h: + +/usr/include/c++/9.2.0/ext/numeric_traits.h: + +/usr/include/c++/9.2.0/bits/stl_pair.h: + +/usr/include/c++/9.2.0/bits/move.h: + +/usr/include/c++/9.2.0/bits/concept_check.h: + +/usr/include/c++/9.2.0/type_traits: + +/usr/include/c++/9.2.0/bits/stl_iterator_base_types.h: + +/usr/include/c++/9.2.0/bits/stl_iterator_base_funcs.h: + +/usr/include/c++/9.2.0/debug/assertions.h: + +/usr/include/c++/9.2.0/bits/stl_iterator.h: + +/usr/include/c++/9.2.0/bits/ptr_traits.h: + +/usr/include/c++/9.2.0/debug/debug.h: + +/usr/include/c++/9.2.0/bits/predefined_ops.h: + +/usr/include/c++/9.2.0/bits/postypes.h: + +/usr/include/c++/9.2.0/cwchar: + +/usr/include/wchar.h: + +/usr/include/bits/floatn.h: + +/usr/include/bits/floatn-common.h: + +/usr/include/bits/types/wint_t.h: + +/usr/include/bits/types/mbstate_t.h: + +/usr/include/bits/types/locale_t.h: + +/usr/include/bits/types/__locale_t.h: + +/usr/include/c++/9.2.0/cstdint: + +/usr/include/c++/9.2.0/bits/allocator.h: + +/usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/c++allocator.h: + +/usr/include/c++/9.2.0/ext/new_allocator.h: + +/usr/include/c++/9.2.0/new: + +/usr/include/c++/9.2.0/exception: + +/usr/include/c++/9.2.0/bits/exception.h: + +/usr/include/c++/9.2.0/bits/exception_ptr.h: + +/usr/include/c++/9.2.0/bits/cxxabi_init_exception.h: + +/usr/include/c++/9.2.0/typeinfo: + +/usr/include/c++/9.2.0/bits/hash_bytes.h: + 
+/usr/include/c++/9.2.0/bits/nested_exception.h: + +/usr/include/c++/9.2.0/bits/localefwd.h: + +/usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/c++locale.h: + +/usr/include/c++/9.2.0/clocale: + +/usr/include/locale.h: + +/usr/include/bits/locale.h: + +/usr/include/c++/9.2.0/iosfwd: + +/usr/include/c++/9.2.0/cctype: + +/usr/include/ctype.h: + +/usr/include/endian.h: + +/usr/include/bits/endian.h: + +/usr/include/bits/byteswap.h: + +/usr/include/bits/uintn-identity.h: + +/usr/include/c++/9.2.0/bits/ostream_insert.h: + +/usr/include/c++/9.2.0/bits/cxxabi_forced.h: + +/usr/include/c++/9.2.0/bits/stl_function.h: + +/usr/include/c++/9.2.0/backward/binders.h: + +/usr/include/c++/9.2.0/bits/range_access.h: + +/usr/include/c++/9.2.0/initializer_list: + +/usr/include/c++/9.2.0/bits/basic_string.h: + +/usr/include/c++/9.2.0/ext/atomicity.h: + +/usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/gthr.h: + +/usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/gthr-default.h: + +/usr/include/pthread.h: + +/usr/include/sched.h: + +/usr/include/bits/types/time_t.h: + +/usr/include/bits/types/struct_timespec.h: + +/usr/include/bits/sched.h: + +/usr/include/bits/types/struct_sched_param.h: + +/usr/include/bits/cpu-set.h: + +/usr/include/time.h: + +/usr/include/bits/time.h: + +/usr/include/bits/timex.h: + +/usr/include/bits/types/struct_timeval.h: + +/usr/include/bits/types/clock_t.h: + +/usr/include/bits/types/struct_tm.h: + +/usr/include/bits/types/clockid_t.h: + +/usr/include/bits/types/timer_t.h: + +/usr/include/bits/types/struct_itimerspec.h: + +/usr/include/bits/pthreadtypes.h: + +/usr/include/bits/thread-shared-types.h: + +/usr/include/bits/pthreadtypes-arch.h: + +/usr/include/bits/setjmp.h: + +/usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/atomic_word.h: + +/usr/include/c++/9.2.0/ext/alloc_traits.h: + +/usr/include/c++/9.2.0/bits/alloc_traits.h: + +/usr/include/c++/9.2.0/ext/string_conversions.h: + +/usr/include/c++/9.2.0/cstdlib: + +/usr/include/stdlib.h: + 
+/usr/include/bits/waitflags.h: + +/usr/include/bits/waitstatus.h: + +/usr/include/sys/types.h: + +/usr/include/sys/select.h: + +/usr/include/bits/select.h: + +/usr/include/bits/types/sigset_t.h: + +/usr/include/bits/types/__sigset_t.h: + +/usr/include/alloca.h: + +/usr/include/bits/stdlib-bsearch.h: + +/usr/include/bits/stdlib-float.h: + +/usr/include/c++/9.2.0/bits/std_abs.h: + +/usr/include/c++/9.2.0/cstdio: + +/usr/include/c++/9.2.0/cerrno: + +/usr/include/errno.h: + +/usr/include/bits/errno.h: + +/usr/include/linux/errno.h: + +/usr/include/asm/errno.h: + +/usr/include/asm-generic/errno.h: + +/usr/include/asm-generic/errno-base.h: + +/usr/include/bits/types/error_t.h: + +/usr/include/c++/9.2.0/bits/functional_hash.h: + +/usr/include/c++/9.2.0/bits/basic_string.tcc: + +../../libde265/de265.h: + +../../libde265/de265-version.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/emmintrin.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/xmmintrin.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/mmintrin.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/mm_malloc.h: + +/usr/include/c++/9.2.0/stdlib.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/tmmintrin.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/pmmintrin.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/smmintrin.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/popcntintrin.h: diff --git a/x86/.deps/libde265_x86_sse_la-sse-motion.Plo b/x86/.deps/libde265_x86_sse_la-sse-motion.Plo new file mode 100644 index 0000000..492d1fd --- /dev/null +++ b/x86/.deps/libde265_x86_sse_la-sse-motion.Plo @@ -0,0 +1,432 @@ +libde265_x86_sse_la-sse-motion.lo: sse-motion.cc \ + /usr/include/stdc-predef.h ../../config.h /usr/include/stdio.h \ + /usr/include/bits/libc-header-start.h /usr/include/features.h \ + /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \ + /usr/include/bits/long-double.h /usr/include/gnu/stubs.h \ + /usr/include/gnu/stubs-64.h \ + 
/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/stddef.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/stdarg.h \ + /usr/include/bits/types.h /usr/include/bits/timesize.h \ + /usr/include/bits/typesizes.h /usr/include/bits/time64.h \ + /usr/include/bits/types/__fpos_t.h /usr/include/bits/types/__mbstate_t.h \ + /usr/include/bits/types/__fpos64_t.h /usr/include/bits/types/__FILE.h \ + /usr/include/bits/types/FILE.h /usr/include/bits/types/struct_FILE.h \ + /usr/include/bits/types/cookie_io_functions_t.h \ + /usr/include/bits/stdio_lim.h /usr/include/bits/sys_errlist.h \ + /usr/include/bits/stdio.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/emmintrin.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/xmmintrin.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/mmintrin.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/mm_malloc.h \ + /usr/include/c++/9.2.0/stdlib.h /usr/include/c++/9.2.0/cstdlib \ + /usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/c++config.h \ + /usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/os_defines.h \ + /usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/cpu_defines.h \ + /usr/include/stdlib.h /usr/include/bits/waitflags.h \ + /usr/include/bits/waitstatus.h /usr/include/bits/floatn.h \ + /usr/include/bits/floatn-common.h /usr/include/bits/types/locale_t.h \ + /usr/include/bits/types/__locale_t.h /usr/include/sys/types.h \ + /usr/include/bits/types/clock_t.h /usr/include/bits/types/clockid_t.h \ + /usr/include/bits/types/time_t.h /usr/include/bits/types/timer_t.h \ + /usr/include/bits/stdint-intn.h /usr/include/endian.h \ + /usr/include/bits/endian.h /usr/include/bits/byteswap.h \ + /usr/include/bits/uintn-identity.h /usr/include/sys/select.h \ + /usr/include/bits/select.h /usr/include/bits/types/sigset_t.h \ + /usr/include/bits/types/__sigset_t.h \ + /usr/include/bits/types/struct_timeval.h \ + /usr/include/bits/types/struct_timespec.h \ + /usr/include/bits/pthreadtypes.h /usr/include/bits/thread-shared-types.h \ + 
/usr/include/bits/pthreadtypes-arch.h /usr/include/alloca.h \ + /usr/include/bits/stdlib-bsearch.h /usr/include/bits/stdlib-float.h \ + /usr/include/c++/9.2.0/bits/std_abs.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/tmmintrin.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/pmmintrin.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/smmintrin.h \ + /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/popcntintrin.h \ + sse-motion.h /usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/stdint.h \ + /usr/include/stdint.h /usr/include/bits/wchar.h \ + /usr/include/bits/stdint-uintn.h ../../libde265/util.h \ + /usr/include/inttypes.h /usr/include/c++/9.2.0/string \ + /usr/include/c++/9.2.0/bits/stringfwd.h \ + /usr/include/c++/9.2.0/bits/memoryfwd.h \ + /usr/include/c++/9.2.0/bits/char_traits.h \ + /usr/include/c++/9.2.0/bits/stl_algobase.h \ + /usr/include/c++/9.2.0/bits/functexcept.h \ + /usr/include/c++/9.2.0/bits/exception_defines.h \ + /usr/include/c++/9.2.0/bits/cpp_type_traits.h \ + /usr/include/c++/9.2.0/ext/type_traits.h \ + /usr/include/c++/9.2.0/ext/numeric_traits.h \ + /usr/include/c++/9.2.0/bits/stl_pair.h \ + /usr/include/c++/9.2.0/bits/move.h \ + /usr/include/c++/9.2.0/bits/concept_check.h \ + /usr/include/c++/9.2.0/type_traits \ + /usr/include/c++/9.2.0/bits/stl_iterator_base_types.h \ + /usr/include/c++/9.2.0/bits/stl_iterator_base_funcs.h \ + /usr/include/c++/9.2.0/debug/assertions.h \ + /usr/include/c++/9.2.0/bits/stl_iterator.h \ + /usr/include/c++/9.2.0/bits/ptr_traits.h \ + /usr/include/c++/9.2.0/debug/debug.h \ + /usr/include/c++/9.2.0/bits/predefined_ops.h \ + /usr/include/c++/9.2.0/bits/postypes.h /usr/include/c++/9.2.0/cwchar \ + /usr/include/wchar.h /usr/include/bits/types/wint_t.h \ + /usr/include/bits/types/mbstate_t.h /usr/include/c++/9.2.0/cstdint \ + /usr/include/c++/9.2.0/bits/allocator.h \ + /usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/c++allocator.h \ + /usr/include/c++/9.2.0/ext/new_allocator.h /usr/include/c++/9.2.0/new \ + 
/usr/include/c++/9.2.0/exception /usr/include/c++/9.2.0/bits/exception.h \ + /usr/include/c++/9.2.0/bits/exception_ptr.h \ + /usr/include/c++/9.2.0/bits/cxxabi_init_exception.h \ + /usr/include/c++/9.2.0/typeinfo /usr/include/c++/9.2.0/bits/hash_bytes.h \ + /usr/include/c++/9.2.0/bits/nested_exception.h \ + /usr/include/c++/9.2.0/bits/localefwd.h \ + /usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/c++locale.h \ + /usr/include/c++/9.2.0/clocale /usr/include/locale.h \ + /usr/include/bits/locale.h /usr/include/c++/9.2.0/iosfwd \ + /usr/include/c++/9.2.0/cctype /usr/include/ctype.h \ + /usr/include/c++/9.2.0/bits/ostream_insert.h \ + /usr/include/c++/9.2.0/bits/cxxabi_forced.h \ + /usr/include/c++/9.2.0/bits/stl_function.h \ + /usr/include/c++/9.2.0/backward/binders.h \ + /usr/include/c++/9.2.0/bits/range_access.h \ + /usr/include/c++/9.2.0/initializer_list \ + /usr/include/c++/9.2.0/bits/basic_string.h \ + /usr/include/c++/9.2.0/ext/atomicity.h \ + /usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/gthr.h \ + /usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/gthr-default.h \ + /usr/include/pthread.h /usr/include/sched.h /usr/include/bits/sched.h \ + /usr/include/bits/types/struct_sched_param.h /usr/include/bits/cpu-set.h \ + /usr/include/time.h /usr/include/bits/time.h /usr/include/bits/timex.h \ + /usr/include/bits/types/struct_tm.h \ + /usr/include/bits/types/struct_itimerspec.h /usr/include/bits/setjmp.h \ + /usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/atomic_word.h \ + /usr/include/c++/9.2.0/ext/alloc_traits.h \ + /usr/include/c++/9.2.0/bits/alloc_traits.h \ + /usr/include/c++/9.2.0/ext/string_conversions.h \ + /usr/include/c++/9.2.0/cstdio /usr/include/c++/9.2.0/cerrno \ + /usr/include/errno.h /usr/include/bits/errno.h \ + /usr/include/linux/errno.h /usr/include/asm/errno.h \ + /usr/include/asm-generic/errno.h /usr/include/asm-generic/errno-base.h \ + /usr/include/bits/types/error_t.h \ + /usr/include/c++/9.2.0/bits/functional_hash.h \ + 
/usr/include/c++/9.2.0/bits/basic_string.tcc ../../libde265/de265.h \ + ../../libde265/de265-version.h + +/usr/include/stdc-predef.h: + +../../config.h: + +/usr/include/stdio.h: + +/usr/include/bits/libc-header-start.h: + +/usr/include/features.h: + +/usr/include/sys/cdefs.h: + +/usr/include/bits/wordsize.h: + +/usr/include/bits/long-double.h: + +/usr/include/gnu/stubs.h: + +/usr/include/gnu/stubs-64.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/stddef.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/stdarg.h: + +/usr/include/bits/types.h: + +/usr/include/bits/timesize.h: + +/usr/include/bits/typesizes.h: + +/usr/include/bits/time64.h: + +/usr/include/bits/types/__fpos_t.h: + +/usr/include/bits/types/__mbstate_t.h: + +/usr/include/bits/types/__fpos64_t.h: + +/usr/include/bits/types/__FILE.h: + +/usr/include/bits/types/FILE.h: + +/usr/include/bits/types/struct_FILE.h: + +/usr/include/bits/types/cookie_io_functions_t.h: + +/usr/include/bits/stdio_lim.h: + +/usr/include/bits/sys_errlist.h: + +/usr/include/bits/stdio.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/emmintrin.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/xmmintrin.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/mmintrin.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/mm_malloc.h: + +/usr/include/c++/9.2.0/stdlib.h: + +/usr/include/c++/9.2.0/cstdlib: + +/usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/c++config.h: + +/usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/os_defines.h: + +/usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/cpu_defines.h: + +/usr/include/stdlib.h: + +/usr/include/bits/waitflags.h: + +/usr/include/bits/waitstatus.h: + +/usr/include/bits/floatn.h: + +/usr/include/bits/floatn-common.h: + +/usr/include/bits/types/locale_t.h: + +/usr/include/bits/types/__locale_t.h: + +/usr/include/sys/types.h: + +/usr/include/bits/types/clock_t.h: + +/usr/include/bits/types/clockid_t.h: + +/usr/include/bits/types/time_t.h: + +/usr/include/bits/types/timer_t.h: + 
+/usr/include/bits/stdint-intn.h: + +/usr/include/endian.h: + +/usr/include/bits/endian.h: + +/usr/include/bits/byteswap.h: + +/usr/include/bits/uintn-identity.h: + +/usr/include/sys/select.h: + +/usr/include/bits/select.h: + +/usr/include/bits/types/sigset_t.h: + +/usr/include/bits/types/__sigset_t.h: + +/usr/include/bits/types/struct_timeval.h: + +/usr/include/bits/types/struct_timespec.h: + +/usr/include/bits/pthreadtypes.h: + +/usr/include/bits/thread-shared-types.h: + +/usr/include/bits/pthreadtypes-arch.h: + +/usr/include/alloca.h: + +/usr/include/bits/stdlib-bsearch.h: + +/usr/include/bits/stdlib-float.h: + +/usr/include/c++/9.2.0/bits/std_abs.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/tmmintrin.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/pmmintrin.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/smmintrin.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/popcntintrin.h: + +sse-motion.h: + +/usr/lib/gcc/x86_64-pc-linux-gnu/9.2.0/include/stdint.h: + +/usr/include/stdint.h: + +/usr/include/bits/wchar.h: + +/usr/include/bits/stdint-uintn.h: + +../../libde265/util.h: + +/usr/include/inttypes.h: + +/usr/include/c++/9.2.0/string: + +/usr/include/c++/9.2.0/bits/stringfwd.h: + +/usr/include/c++/9.2.0/bits/memoryfwd.h: + +/usr/include/c++/9.2.0/bits/char_traits.h: + +/usr/include/c++/9.2.0/bits/stl_algobase.h: + +/usr/include/c++/9.2.0/bits/functexcept.h: + +/usr/include/c++/9.2.0/bits/exception_defines.h: + +/usr/include/c++/9.2.0/bits/cpp_type_traits.h: + +/usr/include/c++/9.2.0/ext/type_traits.h: + +/usr/include/c++/9.2.0/ext/numeric_traits.h: + +/usr/include/c++/9.2.0/bits/stl_pair.h: + +/usr/include/c++/9.2.0/bits/move.h: + +/usr/include/c++/9.2.0/bits/concept_check.h: + +/usr/include/c++/9.2.0/type_traits: + +/usr/include/c++/9.2.0/bits/stl_iterator_base_types.h: + +/usr/include/c++/9.2.0/bits/stl_iterator_base_funcs.h: + +/usr/include/c++/9.2.0/debug/assertions.h: + +/usr/include/c++/9.2.0/bits/stl_iterator.h: + 
+/usr/include/c++/9.2.0/bits/ptr_traits.h: + +/usr/include/c++/9.2.0/debug/debug.h: + +/usr/include/c++/9.2.0/bits/predefined_ops.h: + +/usr/include/c++/9.2.0/bits/postypes.h: + +/usr/include/c++/9.2.0/cwchar: + +/usr/include/wchar.h: + +/usr/include/bits/types/wint_t.h: + +/usr/include/bits/types/mbstate_t.h: + +/usr/include/c++/9.2.0/cstdint: + +/usr/include/c++/9.2.0/bits/allocator.h: + +/usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/c++allocator.h: + +/usr/include/c++/9.2.0/ext/new_allocator.h: + +/usr/include/c++/9.2.0/new: + +/usr/include/c++/9.2.0/exception: + +/usr/include/c++/9.2.0/bits/exception.h: + +/usr/include/c++/9.2.0/bits/exception_ptr.h: + +/usr/include/c++/9.2.0/bits/cxxabi_init_exception.h: + +/usr/include/c++/9.2.0/typeinfo: + +/usr/include/c++/9.2.0/bits/hash_bytes.h: + +/usr/include/c++/9.2.0/bits/nested_exception.h: + +/usr/include/c++/9.2.0/bits/localefwd.h: + +/usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/c++locale.h: + +/usr/include/c++/9.2.0/clocale: + +/usr/include/locale.h: + +/usr/include/bits/locale.h: + +/usr/include/c++/9.2.0/iosfwd: + +/usr/include/c++/9.2.0/cctype: + +/usr/include/ctype.h: + +/usr/include/c++/9.2.0/bits/ostream_insert.h: + +/usr/include/c++/9.2.0/bits/cxxabi_forced.h: + +/usr/include/c++/9.2.0/bits/stl_function.h: + +/usr/include/c++/9.2.0/backward/binders.h: + +/usr/include/c++/9.2.0/bits/range_access.h: + +/usr/include/c++/9.2.0/initializer_list: + +/usr/include/c++/9.2.0/bits/basic_string.h: + +/usr/include/c++/9.2.0/ext/atomicity.h: + +/usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/gthr.h: + +/usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/gthr-default.h: + +/usr/include/pthread.h: + +/usr/include/sched.h: + +/usr/include/bits/sched.h: + +/usr/include/bits/types/struct_sched_param.h: + +/usr/include/bits/cpu-set.h: + +/usr/include/time.h: + +/usr/include/bits/time.h: + +/usr/include/bits/timex.h: + +/usr/include/bits/types/struct_tm.h: + +/usr/include/bits/types/struct_itimerspec.h: + 
+/usr/include/bits/setjmp.h: + +/usr/include/c++/9.2.0/x86_64-pc-linux-gnu/bits/atomic_word.h: + +/usr/include/c++/9.2.0/ext/alloc_traits.h: + +/usr/include/c++/9.2.0/bits/alloc_traits.h: + +/usr/include/c++/9.2.0/ext/string_conversions.h: + +/usr/include/c++/9.2.0/cstdio: + +/usr/include/c++/9.2.0/cerrno: + +/usr/include/errno.h: + +/usr/include/bits/errno.h: + +/usr/include/linux/errno.h: + +/usr/include/asm/errno.h: + +/usr/include/asm-generic/errno.h: + +/usr/include/asm-generic/errno-base.h: + +/usr/include/bits/types/error_t.h: + +/usr/include/c++/9.2.0/bits/functional_hash.h: + +/usr/include/c++/9.2.0/bits/basic_string.tcc: + +../../libde265/de265.h: + +../../libde265/de265-version.h: diff --git a/x86/CMakeLists.txt b/x86/CMakeLists.txt new file mode 100644 index 0000000..0fd6fcf --- /dev/null +++ b/x86/CMakeLists.txt @@ -0,0 +1,23 @@ +set (x86_sources + sse.cc sse.h +) + +set (x86_sse_sources + sse-motion.cc sse-motion.h sse-dct.h sse-dct.cc +) + +add_library(x86 OBJECT ${x86_sources}) + +add_library(x86_sse OBJECT ${x86_sse_sources}) + +set(sse_flags "") + +if(NOT MSVC) + set(sse_flags "${sse_flags} -msse4.1") +endif() + +set(X86_OBJECTS $ $ PARENT_SCOPE) + +if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + SET_TARGET_PROPERTIES(x86_sse PROPERTIES COMPILE_FLAGS "${sse_flags}") +endif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") diff --git a/x86/Makefile b/x86/Makefile new file mode 100644 index 0000000..003f863 --- /dev/null +++ b/x86/Makefile @@ -0,0 +1,703 @@ +# Makefile.in generated by automake 1.16.1 from Makefile.am. +# libde265/x86/Makefile. Generated from Makefile.in by configure. + +# Copyright (C) 1994-2018 Free Software Foundation, Inc. + +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. 
+ +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + + + + +am__is_gnu_make = { \ + if test -z '$(MAKELEVEL)'; then \ + false; \ + elif test -n '$(MAKE_HOST)'; then \ + true; \ + elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \ + true; \ + else \ + false; \ + fi; \ +} +am__make_running_with_option = \ + case $${target_option-} in \ + ?) ;; \ + *) echo "am__make_running_with_option: internal error: invalid" \ + "target option '$${target_option-}' specified" >&2; \ + exit 1;; \ + esac; \ + has_opt=no; \ + sane_makeflags=$$MAKEFLAGS; \ + if $(am__is_gnu_make); then \ + sane_makeflags=$$MFLAGS; \ + else \ + case $$MAKEFLAGS in \ + *\\[\ \ ]*) \ + bs=\\; \ + sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \ + | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \ + esac; \ + fi; \ + skip_next=no; \ + strip_trailopt () \ + { \ + flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \ + }; \ + for flg in $$sane_makeflags; do \ + test $$skip_next = yes && { skip_next=no; continue; }; \ + case $$flg in \ + *=*|--*) continue;; \ + -*I) strip_trailopt 'I'; skip_next=yes;; \ + -*I?*) strip_trailopt 'I';; \ + -*O) strip_trailopt 'O'; skip_next=yes;; \ + -*O?*) strip_trailopt 'O';; \ + -*l) strip_trailopt 'l'; skip_next=yes;; \ + -*l?*) strip_trailopt 'l';; \ + -[dEDm]) skip_next=yes;; \ + -[JT]) skip_next=yes;; \ + esac; \ + case $$flg in \ + *$$target_option*) has_opt=yes; break;; \ + esac; \ + done; \ + test $$has_opt = yes +am__make_dryrun = (target_option=n; $(am__make_running_with_option)) +am__make_keepgoing = (target_option=k; $(am__make_running_with_option)) +pkgdatadir = $(datadir)/libde265 +pkgincludedir = $(includedir)/libde265 +pkglibdir = $(libdir)/libde265 +pkglibexecdir = $(libexecdir)/libde265 +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +install_sh_DATA = $(install_sh) -c -m 644 
+install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +build_triplet = x86_64-pc-linux-gnu +host_triplet = x86_64-pc-linux-gnu +target_triplet = x86_64-pc-linux-gnu +#am__append_1 = -DHAVE_VISIBILITY +#am__append_2 = -DHAVE_VISIBILITY +subdir = libde265/x86 +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/m4/ax_compare_version.m4 \ + $(top_srcdir)/m4/ax_cxx_compile_stdcxx_11.m4 \ + $(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \ + $(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \ + $(top_srcdir)/m4/lt~obsolete.m4 \ + $(top_srcdir)/m4/m4_ax_check_compile_flag.m4 \ + $(top_srcdir)/configure.ac +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON) +mkinstalldirs = $(install_sh) -d +CONFIG_HEADER = $(top_builddir)/config.h +CONFIG_CLEAN_FILES = +CONFIG_CLEAN_VPATH_FILES = +LTLIBRARIES = $(noinst_LTLIBRARIES) +libde265_x86_la_DEPENDENCIES = libde265_x86_sse.la +am_libde265_x86_la_OBJECTS = libde265_x86_la-sse.lo +libde265_x86_la_OBJECTS = $(am_libde265_x86_la_OBJECTS) +AM_V_lt = $(am__v_lt_$(V)) +am__v_lt_ = $(am__v_lt_$(AM_DEFAULT_VERBOSITY)) +am__v_lt_0 = --silent +am__v_lt_1 = +libde265_x86_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \ + $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \ + $(libde265_x86_la_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \ + $(LDFLAGS) -o $@ +libde265_x86_sse_la_LIBADD = +am_libde265_x86_sse_la_OBJECTS = libde265_x86_sse_la-sse-motion.lo \ + libde265_x86_sse_la-sse-dct.lo +libde265_x86_sse_la_OBJECTS = $(am_libde265_x86_sse_la_OBJECTS) +libde265_x86_sse_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \ + $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \ + $(libde265_x86_sse_la_CXXFLAGS) 
$(CXXFLAGS) $(AM_LDFLAGS) \ + $(LDFLAGS) -o $@ +AM_V_P = $(am__v_P_$(V)) +am__v_P_ = $(am__v_P_$(AM_DEFAULT_VERBOSITY)) +am__v_P_0 = false +am__v_P_1 = : +AM_V_GEN = $(am__v_GEN_$(V)) +am__v_GEN_ = $(am__v_GEN_$(AM_DEFAULT_VERBOSITY)) +am__v_GEN_0 = @echo " GEN " $@; +am__v_GEN_1 = +AM_V_at = $(am__v_at_$(V)) +am__v_at_ = $(am__v_at_$(AM_DEFAULT_VERBOSITY)) +am__v_at_0 = @ +am__v_at_1 = +DEFAULT_INCLUDES = -I. -I$(top_builddir) +depcomp = $(SHELL) $(top_srcdir)/depcomp +am__maybe_remake_depfiles = depfiles +am__depfiles_remade = ./$(DEPDIR)/libde265_x86_la-sse.Plo \ + ./$(DEPDIR)/libde265_x86_sse_la-sse-dct.Plo \ + ./$(DEPDIR)/libde265_x86_sse_la-sse-motion.Plo +am__mv = mv -f +CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ + $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) +LTCXXCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CXXFLAGS) $(CXXFLAGS) +AM_V_CXX = $(am__v_CXX_$(V)) +am__v_CXX_ = $(am__v_CXX_$(AM_DEFAULT_VERBOSITY)) +am__v_CXX_0 = @echo " CXX " $@; +am__v_CXX_1 = +CXXLD = $(CXX) +CXXLINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CXXLD) $(AM_CXXFLAGS) \ + $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +AM_V_CXXLD = $(am__v_CXXLD_$(V)) +am__v_CXXLD_ = $(am__v_CXXLD_$(AM_DEFAULT_VERBOSITY)) +am__v_CXXLD_0 = @echo " CXXLD " $@; +am__v_CXXLD_1 = +COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CFLAGS) $(CFLAGS) +AM_V_CC = $(am__v_CC_$(V)) +am__v_CC_ = $(am__v_CC_$(AM_DEFAULT_VERBOSITY)) +am__v_CC_0 = @echo " CC " $@; +am__v_CC_1 = +CCLD = $(CC) +LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) 
--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ +AM_V_CCLD = $(am__v_CCLD_$(V)) +am__v_CCLD_ = $(am__v_CCLD_$(AM_DEFAULT_VERBOSITY)) +am__v_CCLD_0 = @echo " CCLD " $@; +am__v_CCLD_1 = +SOURCES = $(libde265_x86_la_SOURCES) $(libde265_x86_sse_la_SOURCES) +DIST_SOURCES = $(libde265_x86_la_SOURCES) \ + $(libde265_x86_sse_la_SOURCES) +am__can_run_installinfo = \ + case $$AM_UPDATE_INFO_DIR in \ + n|no|NO) false;; \ + *) (install-info --version) >/dev/null 2>&1;; \ + esac +am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) +# Read a list of newline-separated strings from the standard input, +# and print each of them once, without duplicates. Input order is +# *not* preserved. +am__uniquify_input = $(AWK) '\ + BEGIN { nonempty = 0; } \ + { items[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in items) print i; }; } \ +' +# Make sure the list of sources is unique. This is necessary because, +# e.g., the same source file might be shared among _SOURCES variables +# for different programs/libraries. 
+am__define_uniq_tagged_files = \ + list='$(am__tagged_files)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | $(am__uniquify_input)` +ETAGS = etags +CTAGS = ctags +am__DIST_COMMON = $(srcdir)/Makefile.in $(top_srcdir)/depcomp +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +ACLOCAL = ${SHELL} /home/dima/git/libde265/missing aclocal-1.16 +ALLOCA = +AMTAR = $${TAR-tar} +AM_DEFAULT_VERBOSITY = 1 +AR = ar +AUTOCONF = ${SHELL} /home/dima/git/libde265/missing autoconf +AUTOHEADER = ${SHELL} /home/dima/git/libde265/missing autoheader +AUTOMAKE = ${SHELL} /home/dima/git/libde265/missing automake-1.16 +AWK = gawk +CC = gcc +CCAS = gcc +CCASDEPMODE = depmode=gcc3 +CCASFLAGS = -g -O2 +CCDEPMODE = depmode=gcc3 +CFLAGS = -g -O2 -std=c99 -Wall +CPP = gcc -E +CPPFLAGS = +CXX = g++ +CXXCPP = g++ -E +CXXDEPMODE = depmode=gcc3 +CXXFLAGS = -g -O2 -Werror=return-type -Werror=unused-result -Werror=reorder -DDE265_LOG_ERROR +CYGPATH_W = echo +DEFS = -DHAVE_CONFIG_H +DEPDIR = .deps +DLLTOOL = false +DSYMUTIL = +DUMPBIN = +ECHO_C = +ECHO_N = -n +ECHO_T = +EGREP = /usr/bin/grep -E +EXEEXT = +FGREP = /usr/bin/grep -F +GREP = /usr/bin/grep +HAVE_CXX11 = +INSTALL = /usr/bin/install -c +INSTALL_DATA = ${INSTALL} -m 644 +INSTALL_PROGRAM = ${INSTALL} +INSTALL_SCRIPT = ${INSTALL} +INSTALL_STRIP_PROGRAM = $(install_sh) -c -s +LD = /usr/bin/ld -m elf_x86_64 +LDFLAGS = +LIBDE265_AGE = 0 +LIBDE265_CURRENT = 0 +LIBDE265_REVISION = 12 +LIBOBJS = +LIBS = -lpthread -lm +LIBTOOL = $(SHELL) $(top_builddir)/libtool +LIPO = +LN_S = ln -s +LTLIBOBJS = +LT_SYS_LIBRARY_PATH = +MAKEINFO = ${SHELL} /home/dima/git/libde265/missing makeinfo +MANIFEST_TOOL = : +MKDIR_P = /usr/bin/mkdir -p +NM = /usr/bin/nm -B +NMEDIT = +NUMERIC_VERSION = 0x01000500 +OBJDUMP = objdump +OBJEXT = o +OTOOL = +OTOOL64 = +PACKAGE = libde265 +PACKAGE_BUGREPORT = farin@struktur.de +PACKAGE_NAME = libde265 +PACKAGE_STRING = libde265 1.0.5 +PACKAGE_TARNAME = 
libde265 +PACKAGE_URL = +PACKAGE_VERSION = 1.0.5 +PATH_SEPARATOR = : +PKG_CONFIG = /usr/bin/pkg-config +PKG_CONFIG_LIBDIR = +PKG_CONFIG_PATH = +QTCHOOSER = +QTMOC = /usr/bin/moc-qt5 +QT_CFLAGS = -I/usr/include/qt/QtCore -I/usr/include/qt -I/usr/include/qt/QtGui -DQT_WIDGETS_LIB -I/usr/include/qt/QtWidgets -DQT_GUI_LIB -DQT_CORE_LIB +QT_LIBS = -lQt5Widgets -lQt5Gui -lQt5Core +RANLIB = ranlib +SDL_CFLAGS = -I/usr/include/SDL -D_GNU_SOURCE=1 -D_REENTRANT +SDL_LIBS = -lSDL -lpthread +SED = /usr/bin/sed +SET_MAKE = +SHELL = /bin/sh +STRIP = strip +SWSCALE_CFLAGS = +SWSCALE_LIBS = -lswscale +VERSION = 1.0.5 +VIDEOGFX_CFLAGS = +VIDEOGFX_LIBS = +abs_builddir = /home/dima/git/libde265/libde265/x86 +abs_srcdir = /home/dima/git/libde265/libde265/x86 +abs_top_builddir = /home/dima/git/libde265 +abs_top_srcdir = /home/dima/git/libde265 +ac_ct_AR = ar +ac_ct_CC = gcc +ac_ct_CXX = g++ +ac_ct_DUMPBIN = +am__include = include +am__leading_dot = . +am__quote = +am__tar = $${TAR-tar} chof - "$$tardir" +am__untar = $${TAR-tar} xf - +bindir = ${exec_prefix}/bin +build = x86_64-pc-linux-gnu +build_alias = +build_cpu = x86_64 +build_os = linux-gnu +build_vendor = pc +builddir = . +datadir = ${datarootdir} +datarootdir = ${prefix}/share +docdir = ${datarootdir}/doc/${PACKAGE_TARNAME} +dvidir = ${docdir} +exec_prefix = ${prefix} +host = x86_64-pc-linux-gnu +host_alias = +host_cpu = x86_64 +host_os = linux-gnu +host_vendor = pc +htmldir = ${docdir} +includedir = ${prefix}/include +infodir = ${datarootdir}/info +install_sh = ${SHELL} /home/dima/git/libde265/install-sh +libdir = ${exec_prefix}/lib +libexecdir = ${exec_prefix}/libexec +localedir = ${datarootdir}/locale +localstatedir = ${prefix}/var +mandir = ${datarootdir}/man +mkdir_p = $(MKDIR_P) +oldincludedir = /usr/include +pdfdir = ${docdir} +prefix = /usr/local +program_transform_name = s,x,x, +psdir = ${docdir} +sbindir = ${exec_prefix}/sbin +sharedstatedir = ${prefix}/com +srcdir = . 
+sysconfdir = ${prefix}/etc +target = x86_64-pc-linux-gnu +target_alias = +target_cpu = x86_64 +target_os = linux-gnu +target_vendor = pc +top_build_prefix = ../../ +top_builddir = ../.. +top_srcdir = ../.. +noinst_LTLIBRARIES = libde265_x86.la libde265_x86_sse.la +libde265_x86_la_CXXFLAGS = -I$(top_srcdir)/libde265 \ + $(CFLAG_VISIBILITY) $(am__append_1) +libde265_x86_la_SOURCES = sse.cc sse.h +libde265_x86_la_LIBADD = libde265_x86_sse.la + +# SSE4 specific functions +libde265_x86_sse_la_CXXFLAGS = -msse4.1 -I$(top_srcdir) \ + -I$(top_srcdir)/libde265 $(CFLAG_VISIBILITY) $(am__append_2) +libde265_x86_sse_la_SOURCES = sse-motion.cc sse-motion.h sse-dct.h sse-dct.cc +EXTRA_DIST = \ + CMakeLists.txt + +all: all-am + +.SUFFIXES: +.SUFFIXES: .cc .lo .o .obj +$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ + && { if test -f $@; then exit 0; else break; fi; }; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu libde265/x86/Makefile'; \ + $(am__cd) $(top_srcdir) && \ + $(AUTOMAKE) --gnu libde265/x86/Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' 
in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \ + esac; + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(am__aclocal_m4_deps): + +clean-noinstLTLIBRARIES: + -test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES) + @list='$(noinst_LTLIBRARIES)'; \ + locs=`for p in $$list; do echo $$p; done | \ + sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \ + sort -u`; \ + test -z "$$locs" || { \ + echo rm -f $${locs}; \ + rm -f $${locs}; \ + } + +libde265_x86.la: $(libde265_x86_la_OBJECTS) $(libde265_x86_la_DEPENDENCIES) $(EXTRA_libde265_x86_la_DEPENDENCIES) + $(AM_V_CXXLD)$(libde265_x86_la_LINK) $(libde265_x86_la_OBJECTS) $(libde265_x86_la_LIBADD) $(LIBS) + +libde265_x86_sse.la: $(libde265_x86_sse_la_OBJECTS) $(libde265_x86_sse_la_DEPENDENCIES) $(EXTRA_libde265_x86_sse_la_DEPENDENCIES) + $(AM_V_CXXLD)$(libde265_x86_sse_la_LINK) $(libde265_x86_sse_la_OBJECTS) $(libde265_x86_sse_la_LIBADD) $(LIBS) + +mostlyclean-compile: + -rm -f *.$(OBJEXT) + +distclean-compile: + -rm -f *.tab.c + +include ./$(DEPDIR)/libde265_x86_la-sse.Plo # am--include-marker +include ./$(DEPDIR)/libde265_x86_sse_la-sse-dct.Plo # am--include-marker +include ./$(DEPDIR)/libde265_x86_sse_la-sse-motion.Plo # am--include-marker + +$(am__depfiles_remade): + @$(MKDIR_P) $(@D) + @echo '# dummy' >$@-t && $(am__mv) $@-t $@ + +am--depfiles: $(am__depfiles_remade) + +.cc.o: + $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< + 
$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +# $(AM_V_CXX)source='$<' object='$@' libtool=no \ +# DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) \ +# $(AM_V_CXX_no)$(CXXCOMPILE) -c -o $@ $< + +.cc.obj: + $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` + $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +# $(AM_V_CXX)source='$<' object='$@' libtool=no \ +# DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) \ +# $(AM_V_CXX_no)$(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` + +.cc.lo: + $(AM_V_CXX)$(LTCXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< + $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo +# $(AM_V_CXX)source='$<' object='$@' libtool=yes \ +# DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) \ +# $(AM_V_CXX_no)$(LTCXXCOMPILE) -c -o $@ $< + +libde265_x86_la-sse.lo: sse.cc + $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_x86_la-sse.lo -MD -MP -MF $(DEPDIR)/libde265_x86_la-sse.Tpo -c -o libde265_x86_la-sse.lo `test -f 'sse.cc' || echo '$(srcdir)/'`sse.cc + $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_x86_la-sse.Tpo $(DEPDIR)/libde265_x86_la-sse.Plo +# $(AM_V_CXX)source='sse.cc' object='libde265_x86_la-sse.lo' libtool=yes \ +# DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) \ +# $(AM_V_CXX_no)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_x86_la-sse.lo `test -f 'sse.cc' || echo '$(srcdir)/'`sse.cc + +libde265_x86_sse_la-sse-motion.lo: sse-motion.cc + $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_x86_sse_la-sse-motion.lo -MD 
-MP -MF $(DEPDIR)/libde265_x86_sse_la-sse-motion.Tpo -c -o libde265_x86_sse_la-sse-motion.lo `test -f 'sse-motion.cc' || echo '$(srcdir)/'`sse-motion.cc + $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_x86_sse_la-sse-motion.Tpo $(DEPDIR)/libde265_x86_sse_la-sse-motion.Plo +# $(AM_V_CXX)source='sse-motion.cc' object='libde265_x86_sse_la-sse-motion.lo' libtool=yes \ +# DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) \ +# $(AM_V_CXX_no)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_x86_sse_la-sse-motion.lo `test -f 'sse-motion.cc' || echo '$(srcdir)/'`sse-motion.cc + +libde265_x86_sse_la-sse-dct.lo: sse-dct.cc + $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_x86_sse_la-sse-dct.lo -MD -MP -MF $(DEPDIR)/libde265_x86_sse_la-sse-dct.Tpo -c -o libde265_x86_sse_la-sse-dct.lo `test -f 'sse-dct.cc' || echo '$(srcdir)/'`sse-dct.cc + $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_x86_sse_la-sse-dct.Tpo $(DEPDIR)/libde265_x86_sse_la-sse-dct.Plo +# $(AM_V_CXX)source='sse-dct.cc' object='libde265_x86_sse_la-sse-dct.lo' libtool=yes \ +# DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) \ +# $(AM_V_CXX_no)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_x86_sse_la-sse-dct.lo `test -f 'sse-dct.cc' || echo '$(srcdir)/'`sse-dct.cc + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs + +ID: $(am__tagged_files) + $(am__define_uniq_tagged_files); mkid -fID $$unique +tags: tags-am +TAGS: tags + +tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + set x; \ + here=`pwd`; \ + 
$(am__define_uniq_tagged_files); \ + shift; \ + if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ + test -n "$$unique" || unique=$$empty_fix; \ + if test $$# -gt 0; then \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + "$$@" $$unique; \ + else \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$unique; \ + fi; \ + fi +ctags: ctags-am + +CTAGS: ctags +ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + $(am__define_uniq_tagged_files); \ + test -z "$(CTAGS_ARGS)$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && $(am__cd) $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) "$$here" +cscopelist: cscopelist-am + +cscopelist-am: $(am__tagged_files) + list='$(am__tagged_files)'; \ + case "$(srcdir)" in \ + [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \ + *) sdir=$(subdir)/$(srcdir) ;; \ + esac; \ + for i in $$list; do \ + if test -f "$$i"; then \ + echo "$(subdir)/$$i"; \ + else \ + echo "$$sdir/$$i"; \ + fi; \ + done >> $(top_builddir)/cscope.files + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags + +distdir: $(BUILT_SOURCES) + $(MAKE) $(AM_MAKEFLAGS) distdir-am + +distdir-am: $(DISTFILES) + @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + list='$(DISTFILES)'; \ + dist_files=`for file in $$list; do echo $$file; done | \ + sed -e "s|^$$srcdirstrip/||;t" \ + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ + case $$dist_files in \ + */*) $(MKDIR_P) `echo "$$dist_files" | \ + sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ + sort -u` ;; \ + esac; \ + for file in $$dist_files; do \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + if test -d $$d/$$file; then \ + dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test -d "$(distdir)/$$file"; then \ + find "$(distdir)/$$file" -type d ! 
-perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ + else \ + test -f "$(distdir)/$$file" \ + || cp -p $$d/$$file "$(distdir)/$$file" \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +all-am: Makefile $(LTLIBRARIES) +installdirs: +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + if test -z '$(STRIP)'; then \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + install; \ + else \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ + fi +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." 
+clean: clean-am + +clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \ + mostlyclean-am + +distclean: distclean-am + -rm -f ./$(DEPDIR)/libde265_x86_la-sse.Plo + -rm -f ./$(DEPDIR)/libde265_x86_sse_la-sse-dct.Plo + -rm -f ./$(DEPDIR)/libde265_x86_sse_la-sse-motion.Plo + -rm -f Makefile +distclean-am: clean-am distclean-compile distclean-generic \ + distclean-tags + +dvi: dvi-am + +dvi-am: + +html: html-am + +html-am: + +info: info-am + +info-am: + +install-data-am: + +install-dvi: install-dvi-am + +install-dvi-am: + +install-exec-am: + +install-html: install-html-am + +install-html-am: + +install-info: install-info-am + +install-info-am: + +install-man: + +install-pdf: install-pdf-am + +install-pdf-am: + +install-ps: install-ps-am + +install-ps-am: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -f ./$(DEPDIR)/libde265_x86_la-sse.Plo + -rm -f ./$(DEPDIR)/libde265_x86_sse_la-sse-dct.Plo + -rm -f ./$(DEPDIR)/libde265_x86_sse_la-sse-motion.Plo + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-compile mostlyclean-generic \ + mostlyclean-libtool + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: + +.MAKE: install-am install-strip + +.PHONY: CTAGS GTAGS TAGS all all-am am--depfiles check check-am clean \ + clean-generic clean-libtool clean-noinstLTLIBRARIES \ + cscopelist-am ctags ctags-am distclean distclean-compile \ + distclean-generic distclean-libtool distclean-tags distdir dvi \ + dvi-am html html-am info info-am install install-am \ + install-data install-data-am install-dvi install-dvi-am \ + install-exec install-exec-am install-html install-html-am \ + install-info install-info-am install-man install-pdf \ + install-pdf-am install-ps install-ps-am install-strip \ + installcheck installcheck-am installdirs maintainer-clean \ + maintainer-clean-generic mostlyclean mostlyclean-compile \ + mostlyclean-generic mostlyclean-libtool 
pdf pdf-am ps ps-am \ + tags tags-am uninstall uninstall-am + +.PRECIOUS: Makefile + + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/x86/Makefile.am b/x86/Makefile.am new file mode 100644 index 0000000..1f0a33a --- /dev/null +++ b/x86/Makefile.am @@ -0,0 +1,22 @@ +noinst_LTLIBRARIES = libde265_x86.la libde265_x86_sse.la + +libde265_x86_la_CXXFLAGS = -I$(top_srcdir)/libde265 $(CFLAG_VISIBILITY) +libde265_x86_la_SOURCES = sse.cc sse.h +libde265_x86_la_LIBADD = libde265_x86_sse.la + +if HAVE_VISIBILITY + libde265_x86_la_CXXFLAGS += -DHAVE_VISIBILITY +endif + + +# SSE4 specific functions + +libde265_x86_sse_la_CXXFLAGS = -msse4.1 -I$(top_srcdir) -I$(top_srcdir)/libde265 $(CFLAG_VISIBILITY) +libde265_x86_sse_la_SOURCES = sse-motion.cc sse-motion.h sse-dct.h sse-dct.cc + +if HAVE_VISIBILITY + libde265_x86_sse_la_CXXFLAGS += -DHAVE_VISIBILITY +endif + +EXTRA_DIST = \ + CMakeLists.txt diff --git a/x86/Makefile.in b/x86/Makefile.in new file mode 100644 index 0000000..eb494fe --- /dev/null +++ b/x86/Makefile.in @@ -0,0 +1,703 @@ +# Makefile.in generated by automake 1.16.1 from Makefile.am. +# @configure_input@ + +# Copyright (C) 1994-2018 Free Software Foundation, Inc. + +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. 
+ +@SET_MAKE@ + +VPATH = @srcdir@ +am__is_gnu_make = { \ + if test -z '$(MAKELEVEL)'; then \ + false; \ + elif test -n '$(MAKE_HOST)'; then \ + true; \ + elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \ + true; \ + else \ + false; \ + fi; \ +} +am__make_running_with_option = \ + case $${target_option-} in \ + ?) ;; \ + *) echo "am__make_running_with_option: internal error: invalid" \ + "target option '$${target_option-}' specified" >&2; \ + exit 1;; \ + esac; \ + has_opt=no; \ + sane_makeflags=$$MAKEFLAGS; \ + if $(am__is_gnu_make); then \ + sane_makeflags=$$MFLAGS; \ + else \ + case $$MAKEFLAGS in \ + *\\[\ \ ]*) \ + bs=\\; \ + sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \ + | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \ + esac; \ + fi; \ + skip_next=no; \ + strip_trailopt () \ + { \ + flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \ + }; \ + for flg in $$sane_makeflags; do \ + test $$skip_next = yes && { skip_next=no; continue; }; \ + case $$flg in \ + *=*|--*) continue;; \ + -*I) strip_trailopt 'I'; skip_next=yes;; \ + -*I?*) strip_trailopt 'I';; \ + -*O) strip_trailopt 'O'; skip_next=yes;; \ + -*O?*) strip_trailopt 'O';; \ + -*l) strip_trailopt 'l'; skip_next=yes;; \ + -*l?*) strip_trailopt 'l';; \ + -[dEDm]) skip_next=yes;; \ + -[JT]) skip_next=yes;; \ + esac; \ + case $$flg in \ + *$$target_option*) has_opt=yes; break;; \ + esac; \ + done; \ + test $$has_opt = yes +am__make_dryrun = (target_option=n; $(am__make_running_with_option)) +am__make_keepgoing = (target_option=k; $(am__make_running_with_option)) +pkgdatadir = $(datadir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkglibexecdir = $(libexecdir)/@PACKAGE@ +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = 
: +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +build_triplet = @build@ +host_triplet = @host@ +target_triplet = @target@ +@HAVE_VISIBILITY_TRUE@am__append_1 = -DHAVE_VISIBILITY +@HAVE_VISIBILITY_TRUE@am__append_2 = -DHAVE_VISIBILITY +subdir = libde265/x86 +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/m4/ax_compare_version.m4 \ + $(top_srcdir)/m4/ax_cxx_compile_stdcxx_11.m4 \ + $(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \ + $(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \ + $(top_srcdir)/m4/lt~obsolete.m4 \ + $(top_srcdir)/m4/m4_ax_check_compile_flag.m4 \ + $(top_srcdir)/configure.ac +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON) +mkinstalldirs = $(install_sh) -d +CONFIG_HEADER = $(top_builddir)/config.h +CONFIG_CLEAN_FILES = +CONFIG_CLEAN_VPATH_FILES = +LTLIBRARIES = $(noinst_LTLIBRARIES) +libde265_x86_la_DEPENDENCIES = libde265_x86_sse.la +am_libde265_x86_la_OBJECTS = libde265_x86_la-sse.lo +libde265_x86_la_OBJECTS = $(am_libde265_x86_la_OBJECTS) +AM_V_lt = $(am__v_lt_@AM_V@) +am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@) +am__v_lt_0 = --silent +am__v_lt_1 = +libde265_x86_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \ + $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \ + $(libde265_x86_la_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \ + $(LDFLAGS) -o $@ +libde265_x86_sse_la_LIBADD = +am_libde265_x86_sse_la_OBJECTS = libde265_x86_sse_la-sse-motion.lo \ + libde265_x86_sse_la-sse-dct.lo +libde265_x86_sse_la_OBJECTS = $(am_libde265_x86_sse_la_OBJECTS) +libde265_x86_sse_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \ + $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \ + $(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \ + $(LDFLAGS) -o $@ +AM_V_P = $(am__v_P_@AM_V@) +am__v_P_ = $(am__v_P_@AM_DEFAULT_V@) +am__v_P_0 = false +am__v_P_1 = : +AM_V_GEN = $(am__v_GEN_@AM_V@) +am__v_GEN_ = 
$(am__v_GEN_@AM_DEFAULT_V@) +am__v_GEN_0 = @echo " GEN " $@; +am__v_GEN_1 = +AM_V_at = $(am__v_at_@AM_V@) +am__v_at_ = $(am__v_at_@AM_DEFAULT_V@) +am__v_at_0 = @ +am__v_at_1 = +DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir) +depcomp = $(SHELL) $(top_srcdir)/depcomp +am__maybe_remake_depfiles = depfiles +am__depfiles_remade = ./$(DEPDIR)/libde265_x86_la-sse.Plo \ + ./$(DEPDIR)/libde265_x86_sse_la-sse-dct.Plo \ + ./$(DEPDIR)/libde265_x86_sse_la-sse-motion.Plo +am__mv = mv -f +CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ + $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) +LTCXXCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CXXFLAGS) $(CXXFLAGS) +AM_V_CXX = $(am__v_CXX_@AM_V@) +am__v_CXX_ = $(am__v_CXX_@AM_DEFAULT_V@) +am__v_CXX_0 = @echo " CXX " $@; +am__v_CXX_1 = +CXXLD = $(CXX) +CXXLINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CXXLD) $(AM_CXXFLAGS) \ + $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +AM_V_CXXLD = $(am__v_CXXLD_@AM_V@) +am__v_CXXLD_ = $(am__v_CXXLD_@AM_DEFAULT_V@) +am__v_CXXLD_0 = @echo " CXXLD " $@; +am__v_CXXLD_1 = +COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CFLAGS) $(CFLAGS) +AM_V_CC = $(am__v_CC_@AM_V@) +am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@) +am__v_CC_0 = @echo " CC " $@; +am__v_CC_1 = +CCLD = $(CC) +LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ +AM_V_CCLD = $(am__v_CCLD_@AM_V@) +am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@) +am__v_CCLD_0 = @echo " CCLD " $@; +am__v_CCLD_1 = +SOURCES = 
$(libde265_x86_la_SOURCES) $(libde265_x86_sse_la_SOURCES) +DIST_SOURCES = $(libde265_x86_la_SOURCES) \ + $(libde265_x86_sse_la_SOURCES) +am__can_run_installinfo = \ + case $$AM_UPDATE_INFO_DIR in \ + n|no|NO) false;; \ + *) (install-info --version) >/dev/null 2>&1;; \ + esac +am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) +# Read a list of newline-separated strings from the standard input, +# and print each of them once, without duplicates. Input order is +# *not* preserved. +am__uniquify_input = $(AWK) '\ + BEGIN { nonempty = 0; } \ + { items[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in items) print i; }; } \ +' +# Make sure the list of sources is unique. This is necessary because, +# e.g., the same source file might be shared among _SOURCES variables +# for different programs/libraries. +am__define_uniq_tagged_files = \ + list='$(am__tagged_files)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | $(am__uniquify_input)` +ETAGS = etags +CTAGS = ctags +am__DIST_COMMON = $(srcdir)/Makefile.in $(top_srcdir)/depcomp +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +ACLOCAL = @ACLOCAL@ +ALLOCA = @ALLOCA@ +AMTAR = @AMTAR@ +AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ +AR = @AR@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +CC = @CC@ +CCAS = @CCAS@ +CCASDEPMODE = @CCASDEPMODE@ +CCASFLAGS = @CCASFLAGS@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CXX = @CXX@ +CXXCPP = @CXXCPP@ +CXXDEPMODE = @CXXDEPMODE@ +CXXFLAGS = @CXXFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +DLLTOOL = @DLLTOOL@ +DSYMUTIL = @DSYMUTIL@ +DUMPBIN = @DUMPBIN@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ +EXEEXT = @EXEEXT@ +FGREP = @FGREP@ +GREP = @GREP@ +HAVE_CXX11 = @HAVE_CXX11@ +INSTALL = @INSTALL@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ 
+INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +LD = @LD@ +LDFLAGS = @LDFLAGS@ +LIBDE265_AGE = @LIBDE265_AGE@ +LIBDE265_CURRENT = @LIBDE265_CURRENT@ +LIBDE265_REVISION = @LIBDE265_REVISION@ +LIBOBJS = @LIBOBJS@ +LIBS = @LIBS@ +LIBTOOL = @LIBTOOL@ +LIPO = @LIPO@ +LN_S = @LN_S@ +LTLIBOBJS = @LTLIBOBJS@ +LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@ +MAKEINFO = @MAKEINFO@ +MANIFEST_TOOL = @MANIFEST_TOOL@ +MKDIR_P = @MKDIR_P@ +NM = @NM@ +NMEDIT = @NMEDIT@ +NUMERIC_VERSION = @NUMERIC_VERSION@ +OBJDUMP = @OBJDUMP@ +OBJEXT = @OBJEXT@ +OTOOL = @OTOOL@ +OTOOL64 = @OTOOL64@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_URL = @PACKAGE_URL@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +PKG_CONFIG = @PKG_CONFIG@ +PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@ +PKG_CONFIG_PATH = @PKG_CONFIG_PATH@ +QTCHOOSER = @QTCHOOSER@ +QTMOC = @QTMOC@ +QT_CFLAGS = @QT_CFLAGS@ +QT_LIBS = @QT_LIBS@ +RANLIB = @RANLIB@ +SDL_CFLAGS = @SDL_CFLAGS@ +SDL_LIBS = @SDL_LIBS@ +SED = @SED@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +STRIP = @STRIP@ +SWSCALE_CFLAGS = @SWSCALE_CFLAGS@ +SWSCALE_LIBS = @SWSCALE_LIBS@ +VERSION = @VERSION@ +VIDEOGFX_CFLAGS = @VIDEOGFX_CFLAGS@ +VIDEOGFX_LIBS = @VIDEOGFX_LIBS@ +abs_builddir = @abs_builddir@ +abs_srcdir = @abs_srcdir@ +abs_top_builddir = @abs_top_builddir@ +abs_top_srcdir = @abs_top_srcdir@ +ac_ct_AR = @ac_ct_AR@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_CXX = @ac_ct_CXX@ +ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ +am__tar = @am__tar@ +am__untar = @am__untar@ +bindir = @bindir@ +build = @build@ +build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ +build_vendor = @build_vendor@ +builddir = @builddir@ +datadir = @datadir@ +datarootdir = @datarootdir@ +docdir = @docdir@ +dvidir = @dvidir@ 
+exec_prefix = @exec_prefix@ +host = @host@ +host_alias = @host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ +htmldir = @htmldir@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = @libexecdir@ +localedir = @localedir@ +localstatedir = @localstatedir@ +mandir = @mandir@ +mkdir_p = @mkdir_p@ +oldincludedir = @oldincludedir@ +pdfdir = @pdfdir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +psdir = @psdir@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +srcdir = @srcdir@ +sysconfdir = @sysconfdir@ +target = @target@ +target_alias = @target_alias@ +target_cpu = @target_cpu@ +target_os = @target_os@ +target_vendor = @target_vendor@ +top_build_prefix = @top_build_prefix@ +top_builddir = @top_builddir@ +top_srcdir = @top_srcdir@ +noinst_LTLIBRARIES = libde265_x86.la libde265_x86_sse.la +libde265_x86_la_CXXFLAGS = -I$(top_srcdir)/libde265 \ + $(CFLAG_VISIBILITY) $(am__append_1) +libde265_x86_la_SOURCES = sse.cc sse.h +libde265_x86_la_LIBADD = libde265_x86_sse.la + +# SSE4 specific functions +libde265_x86_sse_la_CXXFLAGS = -msse4.1 -I$(top_srcdir) \ + -I$(top_srcdir)/libde265 $(CFLAG_VISIBILITY) $(am__append_2) +libde265_x86_sse_la_SOURCES = sse-motion.cc sse-motion.h sse-dct.h sse-dct.cc +EXTRA_DIST = \ + CMakeLists.txt + +all: all-am + +.SUFFIXES: +.SUFFIXES: .cc .lo .o .obj +$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ + && { if test -f $@; then exit 0; else break; fi; }; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu libde265/x86/Makefile'; \ + $(am__cd) $(top_srcdir) && \ + $(AUTOMAKE) --gnu libde265/x86/Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' 
in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \ + esac; + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(am__aclocal_m4_deps): + +clean-noinstLTLIBRARIES: + -test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES) + @list='$(noinst_LTLIBRARIES)'; \ + locs=`for p in $$list; do echo $$p; done | \ + sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \ + sort -u`; \ + test -z "$$locs" || { \ + echo rm -f $${locs}; \ + rm -f $${locs}; \ + } + +libde265_x86.la: $(libde265_x86_la_OBJECTS) $(libde265_x86_la_DEPENDENCIES) $(EXTRA_libde265_x86_la_DEPENDENCIES) + $(AM_V_CXXLD)$(libde265_x86_la_LINK) $(libde265_x86_la_OBJECTS) $(libde265_x86_la_LIBADD) $(LIBS) + +libde265_x86_sse.la: $(libde265_x86_sse_la_OBJECTS) $(libde265_x86_sse_la_DEPENDENCIES) $(EXTRA_libde265_x86_sse_la_DEPENDENCIES) + $(AM_V_CXXLD)$(libde265_x86_sse_la_LINK) $(libde265_x86_sse_la_OBJECTS) $(libde265_x86_sse_la_LIBADD) $(LIBS) + +mostlyclean-compile: + -rm -f *.$(OBJEXT) + +distclean-compile: + -rm -f *.tab.c + +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_x86_la-sse.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_x86_sse_la-sse-dct.Plo@am__quote@ # am--include-marker +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_x86_sse_la-sse-motion.Plo@am__quote@ # am--include-marker + +$(am__depfiles_remade): + @$(MKDIR_P) $(@D) + @echo '# dummy' >$@-t && $(am__mv) $@-t $@ + 
+am--depfiles: $(am__depfiles_remade) + +.cc.o: +@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ $< + +.cc.obj: +@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` +@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` + +.cc.lo: +@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LTCXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LTCXXCOMPILE) -c -o $@ $< + +libde265_x86_la-sse.lo: sse.cc +@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_x86_la-sse.lo -MD -MP -MF $(DEPDIR)/libde265_x86_la-sse.Tpo -c -o libde265_x86_la-sse.lo `test -f 'sse.cc' || echo '$(srcdir)/'`sse.cc +@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_x86_la-sse.Tpo $(DEPDIR)/libde265_x86_la-sse.Plo +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ 
$(AM_V_CXX)source='sse.cc' object='libde265_x86_la-sse.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_x86_la-sse.lo `test -f 'sse.cc' || echo '$(srcdir)/'`sse.cc + +libde265_x86_sse_la-sse-motion.lo: sse-motion.cc +@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_x86_sse_la-sse-motion.lo -MD -MP -MF $(DEPDIR)/libde265_x86_sse_la-sse-motion.Tpo -c -o libde265_x86_sse_la-sse-motion.lo `test -f 'sse-motion.cc' || echo '$(srcdir)/'`sse-motion.cc +@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_x86_sse_la-sse-motion.Tpo $(DEPDIR)/libde265_x86_sse_la-sse-motion.Plo +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='sse-motion.cc' object='libde265_x86_sse_la-sse-motion.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_x86_sse_la-sse-motion.lo `test -f 'sse-motion.cc' || echo '$(srcdir)/'`sse-motion.cc + +libde265_x86_sse_la-sse-dct.lo: sse-dct.cc +@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) -MT 
libde265_x86_sse_la-sse-dct.lo -MD -MP -MF $(DEPDIR)/libde265_x86_sse_la-sse-dct.Tpo -c -o libde265_x86_sse_la-sse-dct.lo `test -f 'sse-dct.cc' || echo '$(srcdir)/'`sse-dct.cc +@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libde265_x86_sse_la-sse-dct.Tpo $(DEPDIR)/libde265_x86_sse_la-sse-dct.Plo +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='sse-dct.cc' object='libde265_x86_sse_la-sse-dct.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_x86_sse_la-sse-dct.lo `test -f 'sse-dct.cc' || echo '$(srcdir)/'`sse-dct.cc + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs + +ID: $(am__tagged_files) + $(am__define_uniq_tagged_files); mkid -fID $$unique +tags: tags-am +TAGS: tags + +tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + set x; \ + here=`pwd`; \ + $(am__define_uniq_tagged_files); \ + shift; \ + if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ + test -n "$$unique" || unique=$$empty_fix; \ + if test $$# -gt 0; then \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + "$$@" $$unique; \ + else \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$unique; \ + fi; \ + fi +ctags: ctags-am + +CTAGS: ctags +ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + $(am__define_uniq_tagged_files); \ + test -z "$(CTAGS_ARGS)$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && $(am__cd) $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) "$$here" +cscopelist: cscopelist-am + +cscopelist-am: $(am__tagged_files) + list='$(am__tagged_files)'; \ + case "$(srcdir)" in \ + [\\/]* | ?:[\\/]*) sdir="$(srcdir)" 
;; \ + *) sdir=$(subdir)/$(srcdir) ;; \ + esac; \ + for i in $$list; do \ + if test -f "$$i"; then \ + echo "$(subdir)/$$i"; \ + else \ + echo "$$sdir/$$i"; \ + fi; \ + done >> $(top_builddir)/cscope.files + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags + +distdir: $(BUILT_SOURCES) + $(MAKE) $(AM_MAKEFLAGS) distdir-am + +distdir-am: $(DISTFILES) + @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + list='$(DISTFILES)'; \ + dist_files=`for file in $$list; do echo $$file; done | \ + sed -e "s|^$$srcdirstrip/||;t" \ + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ + case $$dist_files in \ + */*) $(MKDIR_P) `echo "$$dist_files" | \ + sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ + sort -u` ;; \ + esac; \ + for file in $$dist_files; do \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + if test -d $$d/$$file; then \ + dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test -d "$(distdir)/$$file"; then \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ + find "$(distdir)/$$file" -type d ! 
-perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ + else \ + test -f "$(distdir)/$$file" \ + || cp -p $$d/$$file "$(distdir)/$$file" \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +all-am: Makefile $(LTLIBRARIES) +installdirs: +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + if test -z '$(STRIP)'; then \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + install; \ + else \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ + fi +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." 
+clean: clean-am + +clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \ + mostlyclean-am + +distclean: distclean-am + -rm -f ./$(DEPDIR)/libde265_x86_la-sse.Plo + -rm -f ./$(DEPDIR)/libde265_x86_sse_la-sse-dct.Plo + -rm -f ./$(DEPDIR)/libde265_x86_sse_la-sse-motion.Plo + -rm -f Makefile +distclean-am: clean-am distclean-compile distclean-generic \ + distclean-tags + +dvi: dvi-am + +dvi-am: + +html: html-am + +html-am: + +info: info-am + +info-am: + +install-data-am: + +install-dvi: install-dvi-am + +install-dvi-am: + +install-exec-am: + +install-html: install-html-am + +install-html-am: + +install-info: install-info-am + +install-info-am: + +install-man: + +install-pdf: install-pdf-am + +install-pdf-am: + +install-ps: install-ps-am + +install-ps-am: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -f ./$(DEPDIR)/libde265_x86_la-sse.Plo + -rm -f ./$(DEPDIR)/libde265_x86_sse_la-sse-dct.Plo + -rm -f ./$(DEPDIR)/libde265_x86_sse_la-sse-motion.Plo + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-compile mostlyclean-generic \ + mostlyclean-libtool + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: + +.MAKE: install-am install-strip + +.PHONY: CTAGS GTAGS TAGS all all-am am--depfiles check check-am clean \ + clean-generic clean-libtool clean-noinstLTLIBRARIES \ + cscopelist-am ctags ctags-am distclean distclean-compile \ + distclean-generic distclean-libtool distclean-tags distdir dvi \ + dvi-am html html-am info info-am install install-am \ + install-data install-data-am install-dvi install-dvi-am \ + install-exec install-exec-am install-html install-html-am \ + install-info install-info-am install-man install-pdf \ + install-pdf-am install-ps install-ps-am install-strip \ + installcheck installcheck-am installdirs maintainer-clean \ + maintainer-clean-generic mostlyclean mostlyclean-compile \ + mostlyclean-generic mostlyclean-libtool 
pdf pdf-am ps ps-am \ + tags tags-am uninstall uninstall-am + +.PRECIOUS: Makefile + + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/x86/sse-dct.cc b/x86/sse-dct.cc new file mode 100644 index 0000000..3a9b7ba --- /dev/null +++ b/x86/sse-dct.cc @@ -0,0 +1,7094 @@ +/* + * H.265 video codec. + * Copyright (c) 2013 openHEVC contributors + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "x86/sse-dct.h" +#include "libde265/util.h" + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include // SSE2 +#include // SSSE3 + +#if HAVE_SSE4_1 +#include // SSE4.1 +#endif + + +ALIGNED_16(static const int16_t) transform4x4_luma[8][8] = +{ + { 29, +84, 29, +84, 29, +84, 29, +84 }, + { +74, +55, +74, +55, +74, +55, +74, +55 }, + { 55, -29, 55, -29, 55, -29, 55, -29 }, + { +74, -84, +74, -84, +74, -84, +74, -84 }, + { 74, -74, 74, -74, 74, -74, 74, -74 }, + { 0, +74, 0, +74, 0, +74, 0, +74 }, + { 84, +55, 84, +55, 84, +55, 84, +55 }, + { -74, -29, -74, -29, -74, -29, -74, -29 } +}; + +ALIGNED_16(static const int16_t) transform4x4[4][8] = { + { 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, -64, 64, -64, 64, -64, 64, -64 }, + { 83, 36, 83, 36, 83, 36, 83, 36 }, + { 36, -83, 36, -83, 36, -83, 36, -83 } +}; + +ALIGNED_16(static const int16_t) transform8x8[12][8] = +{ + { 89, 75, 89, 75, 89, 75, 89, 75 }, + { 50, 18, 50, 18, 50, 18, 50, 18 }, + { 75, -18, 75, -18, 75, -18, 75, -18 }, + { -89, -50, -89, -50,-89, -50,-89, -50 }, + { 50, -89, 50, -89, 50, -89, 50, -89 }, + { 18, 75, 18, 75, 18, 75, 18, 75 }, + { 18, -50, 18, -50, 18, -50, 18, -50 }, + { 75, -89, 75, -89, 75, -89, 75, -89 }, + { 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, -64, 64, -64, 64, -64, 64, -64 }, + { 83, 36, 83, 36, 83, 36, 83, 36 }, + { 36, -83, 36, -83, 36, -83, 36, -83 } +}; + +ALIGNED_16(static const int16_t) transform16x16_1[4][8][8] = +{ + {/*1-3*/ /*2-6*/ + { 90, 87, 90, 87, 90, 87, 90, 87 }, + { 87, 57, 87, 57, 87, 57, 87, 57 }, + { 80, 9, 80, 9, 80, 9, 80, 9 }, + { 70, -43, 70, -43, 70, -43, 70, -43 }, + { 57, -80, 57, -80, 57, -80, 57, -80 }, + { 43, -90, 43, -90, 43, -90, 43, -90 }, + { 25, -70, 25, -70, 25, -70, 25, -70 }, + { 9, -25, 9, -25, 9, -25, 9, -25 }, + },{ /*5-7*/ /*10-14*/ + { 80, 70, 80, 70, 80, 70, 80, 70 }, + { 9, -43, 9, -43, 9, -43, 9, -43 }, + { -70, -87, -70, -87, -70, -87, -70, -87 }, + { -87, 9, -87, 9, -87, 9, -87, 9 }, + { -25, 90, -25, 90, -25, 
90, -25, 90 }, + { 57, 25, 57, 25, 57, 25, 57, 25 }, + { 90, -80, 90, -80, 90, -80, 90, -80 }, + { 43, -57, 43, -57, 43, -57, 43, -57 }, + },{ /*9-11*/ /*18-22*/ + { 57, 43, 57, 43, 57, 43, 57, 43 }, + { -80, -90, -80, -90, -80, -90, -80, -90 }, + { -25, 57, -25, 57, -25, 57, -25, 57 }, + { 90, 25, 90, 25, 90, 25, 90, 25 }, + { -9, -87, -9, -87, -9, -87, -9, -87 }, + { -87, 70, -87, 70, -87, 70, -87, 70 }, + { 43, 9, 43, 9, 43, 9, 43, 9 }, + { 70, -80, 70, -80, 70, -80, 70, -80 }, + },{/*13-15*/ /* 26-30 */ + { 25, 9, 25, 9, 25, 9, 25, 9 }, + { -70, -25, -70, -25, -70, -25, -70, -25 }, + { 90, 43, 90, 43, 90, 43, 90, 43 }, + { -80, -57, -80, -57, -80, -57, -80, -57 }, + { 43, 70, 43, 70, 43, 70, 43, 70 }, + { 9, -80, 9, -80, 9, -80, 9, -80 }, + { -57, 87, -57, 87, -57, 87, -57, 87 }, + { 87, -90, 87, -90, 87, -90, 87, -90 }, + } +}; + +ALIGNED_16(static const int16_t) transform16x16_2[2][4][8] = +{ + { /*2-6*/ /*4-12*/ + { 89, 75, 89, 75, 89, 75, 89, 75 }, + { 75, -18, 75, -18, 75, -18, 75, -18 }, + { 50, -89, 50, -89, 50, -89, 50, -89 }, + { 18, -50, 18, -50, 18, -50, 18, -50 }, + },{ /*10-14*/ /*20-28*/ + { 50, 18, 50, 18, 50, 18, 50, 18 }, + { -89, -50, -89, -50, -89, -50, -89, -50 }, + { 18, 75, 18, 75, 18, 75, 18, 75 }, + { 75, -89, 75, -89, 75, -89, 75, -89 }, + } +}; + +ALIGNED_16(static const int16_t) transform16x16_3[2][2][8] = +{ + {/*4-12*/ /*8-24*/ + { 83, 36, 83, 36, 83, 36, 83, 36 }, + { 36, -83, 36, -83, 36, -83, 36, -83 }, + },{ /*0-8*/ /*0-16*/ + { 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, -64, 64, -64, 64, -64, 64, -64 }, + } +}; + + +ALIGNED_16(static const int16_t) transform32x32[8][16][8] = +{ + { /* 1-3 */ + { 90, 90, 90, 90, 90, 90, 90, 90 }, + { 90, 82, 90, 82, 90, 82, 90, 82 }, + { 88, 67, 88, 67, 88, 67, 88, 67 }, + { 85, 46, 85, 46, 85, 46, 85, 46 }, + { 82, 22, 82, 22, 82, 22, 82, 22 }, + { 78, -4, 78, -4, 78, -4, 78, -4 }, + { 73, -31, 73, -31, 73, -31, 73, -31 }, + { 67, -54, 67, -54, 67, -54, 67, -54 }, + { 61, -73, 61, -73, 61, -73, 
61, -73 }, + { 54, -85, 54, -85, 54, -85, 54, -85 }, + { 46, -90, 46, -90, 46, -90, 46, -90 }, + { 38, -88, 38, -88, 38, -88, 38, -88 }, + { 31, -78, 31, -78, 31, -78, 31, -78 }, + { 22, -61, 22, -61, 22, -61, 22, -61 }, + { 13, -38, 13, -38, 13, -38, 13, -38 }, + { 4, -13, 4, -13, 4, -13, 4, -13 }, + },{/* 5-7 */ + { 88, 85, 88, 85, 88, 85, 88, 85 }, + { 67, 46, 67, 46, 67, 46, 67, 46 }, + { 31, -13, 31, -13, 31, -13, 31, -13 }, + { -13, -67, -13, -67, -13, -67, -13, -67 }, + { -54, -90, -54, -90, -54, -90, -54, -90 }, + { -82, -73, -82, -73, -82, -73, -82, -73 }, + { -90, -22, -90, -22, -90, -22, -90, -22 }, + { -78, 38, -78, 38, -78, 38, -78, 38 }, + { -46, 82, -46, 82, -46, 82, -46, 82 }, + { -4, 88, -4, 88, -4, 88, -4, 88 }, + { 38, 54, 38, 54, 38, 54, 38, 54 }, + { 73, -4, 73, -4, 73, -4, 73, -4 }, + { 90, -61, 90, -61, 90, -61, 90, -61 }, + { 85, -90, 85, -90, 85, -90, 85, -90 }, + { 61, -78, 61, -78, 61, -78, 61, -78 }, + { 22, -31, 22, -31, 22, -31, 22, -31 }, + },{/* 9-11 */ + { 82, 78, 82, 78, 82, 78, 82, 78 }, + { 22, -4, 22, -4, 22, -4, 22, -4 }, + { -54, -82, -54, -82, -54, -82, -54, -82 }, + { -90, -73, -90, -73, -90, -73, -90, -73 }, + { -61, 13, -61, 13, -61, 13, -61, 13 }, + { 13, 85, 13, 85, 13, 85, 13, 85 }, + { 78, 67, 78, 67, 78, 67, 78, 67 }, + { 85, -22, 85, -22, 85, -22, 85, -22 }, + { 31, -88, 31, -88, 31, -88, 31, -88 }, + { -46, -61, -46, -61, -46, -61, -46, -61 }, + { -90, 31, -90, 31, -90, 31, -90, 31 }, + { -67, 90, -67, 90, -67, 90, -67, 90 }, + { 4, 54, 4, 54, 4, 54, 4, 54 }, + { 73, -38, 73, -38, 73, -38, 73, -38 }, + { 88, -90, 88, -90, 88, -90, 88, -90 }, + { 38, -46, 38, -46, 38, -46, 38, -46 }, + },{/* 13-15 */ + { 73, 67, 73, 67, 73, 67, 73, 67 }, + { -31, -54, -31, -54, -31, -54, -31, -54 }, + { -90, -78, -90, -78, -90, -78, -90, -78 }, + { -22, 38, -22, 38, -22, 38, -22, 38 }, + { 78, 85, 78, 85, 78, 85, 78, 85 }, + { 67, -22, 67, -22, 67, -22, 67, -22 }, + { -38, -90, -38, -90, -38, -90, -38, -90 }, + { -90, 4, -90, 4, -90, 
4, -90, 4 }, + { -13, 90, -13, 90, -13, 90, -13, 90 }, + { 82, 13, 82, 13, 82, 13, 82, 13 }, + { 61, -88, 61, -88, 61, -88, 61, -88 }, + { -46, -31, -46, -31, -46, -31, -46, -31 }, + { -88, 82, -88, 82, -88, 82, -88, 82 }, + { -4, 46, -4, 46, -4, 46, -4, 46 }, + { 85, -73, 85, -73, 85, -73, 85, -73 }, + { 54, -61, 54, -61, 54, -61, 54, -61 }, + },{/* 17-19 */ + { 61, 54, 61, 54, 61, 54, 61, 54 }, + { -73, -85, -73, -85, -73, -85, -73, -85 }, + { -46, -4, -46, -4, -46, -4, -46, -4 }, + { 82, 88, 82, 88, 82, 88, 82, 88 }, + { 31, -46, 31, -46, 31, -46, 31, -46 }, + { -88, -61, -88, -61, -88, -61, -88, -61 }, + { -13, 82, -13, 82, -13, 82, -13, 82 }, + { 90, 13, 90, 13, 90, 13, 90, 13 }, + { -4, -90, -4, -90, -4, -90, -4, -90 }, + { -90, 38, -90, 38, -90, 38, -90, 38 }, + { 22, 67, 22, 67, 22, 67, 22, 67 }, + { 85, -78, 85, -78, 85, -78, 85, -78 }, + { -38, -22, -38, -22, -38, -22, -38, -22 }, + { -78, 90, -78, 90, -78, 90, -78, 90 }, + { 54, -31, 54, -31, 54, -31, 54, -31 }, + { 67, -73, 67, -73, 67, -73, 67, -73 }, + },{ /* 21-23 */ + { 46, 38, 46, 38, 46, 38, 46, 38 }, + { -90, -88, -90, -88, -90, -88, -90, -88 }, + { 38, 73, 38, 73, 38, 73, 38, 73 }, + { 54, -4, 54, -4, 54, -4, 54, -4 }, + { -90, -67, -90, -67, -90, -67, -90, -67 }, + { 31, 90, 31, 90, 31, 90, 31, 90 }, + { 61, -46, 61, -46, 61, -46, 61, -46 }, + { -88, -31, -88, -31, -88, -31, -88, -31 }, + { 22, 85, 22, 85, 22, 85, 22, 85 }, + { 67, -78, 67, -78, 67, -78, 67, -78 }, + { -85, 13, -85, 13, -85, 13, -85, 13 }, + { 13, 61, 13, 61, 13, 61, 13, 61 }, + { 73, -90, 73, -90, 73, -90, 73, -90 }, + { -82, 54, -82, 54, -82, 54, -82, 54 }, + { 4, 22, 4, 22, 4, 22, 4, 22 }, + { 78, -82, 78, -82, 78, -82, 78, -82 }, + },{ /* 25-27 */ + { 31, 22, 31, 22, 31, 22, 31, 22 }, + { -78, -61, -78, -61, -78, -61, -78, -61 }, + { 90, 85, 90, 85, 90, 85, 90, 85 }, + { -61, -90, -61, -90, -61, -90, -61, -90 }, + { 4, 73, 4, 73, 4, 73, 4, 73 }, + { 54, -38, 54, -38, 54, -38, 54, -38 }, + { -88, -4, -88, -4, -88, -4, -88, 
-4 }, + { 82, 46, 82, 46, 82, 46, 82, 46 }, + { -38, -78, -38, -78, -38, -78, -38, -78 }, + { -22, 90, -22, 90, -22, 90, -22, 90 }, + { 73, -82, 73, -82, 73, -82, 73, -82 }, + { -90, 54, -90, 54, -90, 54, -90, 54 }, + { 67, -13, 67, -13, 67, -13, 67, -13 }, + { -13, -31, -13, -31, -13, -31, -13, -31 }, + { -46, 67, -46, 67, -46, 67, -46, 67 }, + { 85, -88, 85, -88, 85, -88, 85, -88 }, + },{/* 29-31 */ + { 13, 4, 13, 4, 13, 4, 13, 4 }, + { -38, -13, -38, -13, -38, -13, -38, -13 }, + { 61, 22, 61, 22, 61, 22, 61, 22 }, + { -78, -31, -78, -31, -78, -31, -78, -31 }, + { 88, 38, 88, 38, 88, 38, 88, 38 }, + { -90, -46, -90, -46, -90, -46, -90, -46 }, + { 85, 54, 85, 54, 85, 54, 85, 54 }, + { -73, -61, -73, -61, -73, -61, -73, -61 }, + { 54, 67, 54, 67, 54, 67, 54, 67 }, + { -31, -73, -31, -73, -31, -73, -31, -73 }, + { 4, 78, 4, 78, 4, 78, 4, 78 }, + { 22, -82, 22, -82, 22, -82, 22, -82 }, + { -46, 85, -46, 85, -46, 85, -46, 85 }, + { 67, -88, 67, -88, 67, -88, 67, -88 }, + { -82, 90, -82, 90, -82, 90, -82, 90 }, + { 90, -90, 90, -90, 90, -90, 90, -90 }, + } +}; + +#define shift_1st 7 +#define add_1st (1 << (shift_1st - 1)) + + +void ff_hevc_transform_skip_8_sse(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t _stride) +{ + uint8_t *dst = (uint8_t*)_dst; + ptrdiff_t stride = _stride; + int shift = 5; + int offset = 16; + __m128i r0,r1,r2,r3,r4,r5,r6,r9; + + r9= _mm_setzero_si128(); + //r8= _mm_set_epi32(0,0,0,-1); + r2= _mm_set1_epi16(offset); + + r0= _mm_load_si128((__m128i*)(coeffs)); + r1= _mm_load_si128((__m128i*)(coeffs+8)); + + + r0= _mm_adds_epi16(r0,r2); + r1= _mm_adds_epi16(r1,r2); + + r0= _mm_srai_epi16(r0,shift); + r1= _mm_srai_epi16(r1,shift); + + r3= _mm_loadl_epi64((__m128i*)(dst)); + r4= _mm_loadl_epi64((__m128i*)(dst + stride)); + r5= _mm_loadl_epi64((__m128i*)(dst + 2*stride)); + r6= _mm_loadl_epi64((__m128i*)(dst + 3*stride)); + + r3= _mm_unpacklo_epi8(r3,r9); + r4= _mm_unpacklo_epi8(r4,r9); + r5= _mm_unpacklo_epi8(r5,r9); + r6= _mm_unpacklo_epi8(r6,r9); 
+ r3= _mm_unpacklo_epi64(r3,r4); + r4= _mm_unpacklo_epi64(r5,r6); + + + r3= _mm_adds_epi16(r3,r0); + r4= _mm_adds_epi16(r4,r1); + + r3= _mm_packus_epi16(r3,r4); + //r8= _mm_set_epi32(0,0,0,-1); + + //_mm_maskmoveu_si128(r3,r8,(char *) (dst)); + *((uint32_t*)(dst)) = _mm_cvtsi128_si32(r3); + + r3= _mm_srli_si128(r3,4); + //_mm_maskmoveu_si128(r3,r8,(char *) (dst+stride)); + *((uint32_t*)(dst+stride)) = _mm_cvtsi128_si32(r3); + + r3= _mm_srli_si128(r3,4); + //_mm_maskmoveu_si128(r3,r8,(char *) (dst+2*stride)); + *((uint32_t*)(dst+2*stride)) = _mm_cvtsi128_si32(r3); + + r3= _mm_srli_si128(r3,4); + //_mm_maskmoveu_si128(r3,r8,(char *) (dst+3*stride)); + *((uint32_t*)(dst+3*stride)) = _mm_cvtsi128_si32(r3); +} + + + +#if HAVE_SSE4_1 +void ff_hevc_transform_4x4_luma_add_8_sse4(uint8_t *_dst, const int16_t *coeffs, + ptrdiff_t _stride) { + + uint8_t shift_2nd = 12; // 20 - Bit depth + uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1)) + + uint8_t *dst = (uint8_t*) _dst; + ptrdiff_t stride = _stride; + const int16_t *src = coeffs; + __m128i m128iAdd, S0, S8, m128iTmp1, m128iTmp2, m128iAC, m128iBD, m128iA, + m128iD; + m128iAdd = _mm_set1_epi32(64); + + S0 = _mm_load_si128((__m128i *) (src)); + S8 = _mm_load_si128((__m128i *) (src + 8)); + + m128iAC = _mm_unpacklo_epi16(S0, S8); + m128iBD = _mm_unpackhi_epi16(S0, S8); + + m128iTmp1 = _mm_madd_epi16(m128iAC, + _mm_load_si128((__m128i *) (transform4x4_luma[0]))); + m128iTmp2 = _mm_madd_epi16(m128iBD, + _mm_load_si128((__m128i *) (transform4x4_luma[1]))); + S0 = _mm_add_epi32(m128iTmp1, m128iTmp2); + S0 = _mm_add_epi32(S0, m128iAdd); + S0 = _mm_srai_epi32(S0, shift_1st); + + m128iTmp1 = _mm_madd_epi16(m128iAC, + _mm_load_si128((__m128i *) (transform4x4_luma[2]))); + m128iTmp2 = _mm_madd_epi16(m128iBD, + _mm_load_si128((__m128i *) (transform4x4_luma[3]))); + S8 = _mm_add_epi32(m128iTmp1, m128iTmp2); + S8 = _mm_add_epi32(S8, m128iAdd); + S8 = _mm_srai_epi32(S8, shift_1st); + + m128iA = _mm_packs_epi32(S0, S8); + + m128iTmp1 = 
_mm_madd_epi16(m128iAC, + _mm_load_si128((__m128i *) (transform4x4_luma[4]))); + m128iTmp2 = _mm_madd_epi16(m128iBD, + _mm_load_si128((__m128i *) (transform4x4_luma[5]))); + S0 = _mm_add_epi32(m128iTmp1, m128iTmp2); + S0 = _mm_add_epi32(S0, m128iAdd); + S0 = _mm_srai_epi32(S0, shift_1st); + + m128iTmp1 = _mm_madd_epi16(m128iAC, + _mm_load_si128((__m128i *) (transform4x4_luma[6]))); + m128iTmp2 = _mm_madd_epi16(m128iBD, + _mm_load_si128((__m128i *) (transform4x4_luma[7]))); + S8 = _mm_add_epi32(m128iTmp1, m128iTmp2); + S8 = _mm_add_epi32(S8, m128iAdd); + S8 = _mm_srai_epi32(S8, shift_1st); + + m128iD = _mm_packs_epi32(S0, S8); + + S0 = _mm_unpacklo_epi16(m128iA, m128iD); + S8 = _mm_unpackhi_epi16(m128iA, m128iD); + + m128iA = _mm_unpacklo_epi16(S0, S8); + m128iD = _mm_unpackhi_epi16(S0, S8); + + /* ################### */ + m128iAdd = _mm_set1_epi32(add_2nd); + + m128iAC = _mm_unpacklo_epi16(m128iA, m128iD); + m128iBD = _mm_unpackhi_epi16(m128iA, m128iD); + + m128iTmp1 = _mm_madd_epi16(m128iAC, + _mm_load_si128((__m128i *) (transform4x4_luma[0]))); + m128iTmp2 = _mm_madd_epi16(m128iBD, + _mm_load_si128((__m128i *) (transform4x4_luma[1]))); + S0 = _mm_add_epi32(m128iTmp1, m128iTmp2); + S0 = _mm_add_epi32(S0, m128iAdd); + S0 = _mm_srai_epi32(S0, shift_2nd); + + m128iTmp1 = _mm_madd_epi16(m128iAC, + _mm_load_si128((__m128i *) (transform4x4_luma[2]))); + m128iTmp2 = _mm_madd_epi16(m128iBD, + _mm_load_si128((__m128i *) (transform4x4_luma[3]))); + S8 = _mm_add_epi32(m128iTmp1, m128iTmp2); + S8 = _mm_add_epi32(S8, m128iAdd); + S8 = _mm_srai_epi32(S8, shift_2nd); + + m128iA = _mm_packs_epi32(S0, S8); + + m128iTmp1 = _mm_madd_epi16(m128iAC, + _mm_load_si128((__m128i *) (transform4x4_luma[4]))); + m128iTmp2 = _mm_madd_epi16(m128iBD, + _mm_load_si128((__m128i *) (transform4x4_luma[5]))); + S0 = _mm_add_epi32(m128iTmp1, m128iTmp2); + S0 = _mm_add_epi32(S0, m128iAdd); + S0 = _mm_srai_epi32(S0, shift_2nd); + + m128iTmp1 = _mm_madd_epi16(m128iAC, + _mm_load_si128((__m128i *) 
(transform4x4_luma[6]))); + m128iTmp2 = _mm_madd_epi16(m128iBD, + _mm_load_si128((__m128i *) (transform4x4_luma[7]))); + S8 = _mm_add_epi32(m128iTmp1, m128iTmp2); + S8 = _mm_add_epi32(S8, m128iAdd); + S8 = _mm_srai_epi32(S8, shift_2nd); + + m128iD = _mm_packs_epi32(S0, S8); + +// _mm_storeu_si128((__m128i *) (src), m128iA); +// _mm_storeu_si128((__m128i *) (src + 8), m128iD); + + S0 = _mm_move_epi64(m128iA); //contains row 0 + S8 = _mm_move_epi64(m128iD); //row 2 + m128iA = _mm_srli_si128(m128iA, 8); // row 1 + m128iD = _mm_srli_si128(m128iD, 8); // row 3 + m128iTmp1 = _mm_unpacklo_epi16(S0, m128iA); + m128iTmp2 = _mm_unpacklo_epi16(S8, m128iD); + S0 = _mm_unpacklo_epi32(m128iTmp1, m128iTmp2); + S8 = _mm_unpackhi_epi32(m128iTmp1, m128iTmp2); + + //m128iTmp2 = _mm_set_epi32(0, 0, 0, -1); //mask to store 4 * 8bit data + + m128iA = _mm_loadl_epi64((__m128i *) dst); + m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128()); + m128iTmp1 = _mm_adds_epi16(S0, m128iA); //contains first 4 values + m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128()); + //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst); + *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1); + + dst += stride; + + m128iA = _mm_loadl_epi64((__m128i *) dst); + m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128()); + m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S0, 8), m128iA); + m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128()); + //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst); + *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1); + + dst += stride; + + m128iA = _mm_loadl_epi64((__m128i *) dst); + m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128()); + m128iTmp1 = _mm_adds_epi16(S8, m128iA); + m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128()); + //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst); + *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1); + + dst += stride; + + m128iA = _mm_loadl_epi64((__m128i *) dst); + m128iA = _mm_unpacklo_epi8(m128iA, 
_mm_setzero_si128()); + m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S8, 8), m128iA); + m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128()); + //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst); + *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1); +} +#endif // SSE4.1 + +#if 0 +void ff_hevc_transform_4x4_luma_add_10_sse4(uint8_t *_dst, const int16_t *coeffs, + ptrdiff_t _stride) { + int i,j; + uint8_t shift_2nd = 10; // 20 - Bit depth + uint16_t add_2nd = 1 << 9; //(1 << (shift_2nd - 1)) + + uint16_t *dst = (uint16_t*) _dst; + ptrdiff_t stride = _stride/(sizeof(uint16_t)); + int16_t *src = coeffs; + __m128i m128iAdd, S0, S8, m128iTmp1, m128iTmp2, m128iAC, m128iBD, m128iA, + m128iD; + + m128iAdd = _mm_set1_epi32(64); + + S0 = _mm_loadu_si128((__m128i *) (src)); + S8 = _mm_loadu_si128((__m128i *) (src + 8)); + + m128iAC = _mm_unpacklo_epi16(S0, S8); + m128iBD = _mm_unpackhi_epi16(S0, S8); + + m128iTmp1 = _mm_madd_epi16(m128iAC, + _mm_loadu_si128((__m128i *) (transform4x4_luma[0]))); + m128iTmp2 = _mm_madd_epi16(m128iBD, + _mm_loadu_si128((__m128i *) (transform4x4_luma[1]))); + S0 = _mm_add_epi32(m128iTmp1, m128iTmp2); + S0 = _mm_add_epi32(S0, m128iAdd); + S0 = _mm_srai_epi32(S0, shift_1st); + + m128iTmp1 = _mm_madd_epi16(m128iAC, + _mm_loadu_si128((__m128i *) (transform4x4_luma[2]))); + m128iTmp2 = _mm_madd_epi16(m128iBD, + _mm_loadu_si128((__m128i *) (transform4x4_luma[3]))); + S8 = _mm_add_epi32(m128iTmp1, m128iTmp2); + S8 = _mm_add_epi32(S8, m128iAdd); + S8 = _mm_srai_epi32(S8, shift_1st); + + m128iA = _mm_packs_epi32(S0, S8); + + m128iTmp1 = _mm_madd_epi16(m128iAC, + _mm_loadu_si128((__m128i *) (transform4x4_luma[4]))); + m128iTmp2 = _mm_madd_epi16(m128iBD, + _mm_loadu_si128((__m128i *) (transform4x4_luma[5]))); + S0 = _mm_add_epi32(m128iTmp1, m128iTmp2); + S0 = _mm_add_epi32(S0, m128iAdd); + S0 = _mm_srai_epi32(S0, shift_1st); + + m128iTmp1 = _mm_madd_epi16(m128iAC, + _mm_loadu_si128((__m128i *) (transform4x4_luma[6]))); + m128iTmp2 = 
_mm_madd_epi16(m128iBD, + _mm_loadu_si128((__m128i *) (transform4x4_luma[7]))); + S8 = _mm_add_epi32(m128iTmp1, m128iTmp2); + S8 = _mm_add_epi32(S8, m128iAdd); + S8 = _mm_srai_epi32(S8, shift_1st); + + m128iD = _mm_packs_epi32(S0, S8); + + S0 = _mm_unpacklo_epi16(m128iA, m128iD); + S8 = _mm_unpackhi_epi16(m128iA, m128iD); + + m128iA = _mm_unpacklo_epi16(S0, S8); + m128iD = _mm_unpackhi_epi16(S0, S8); + + /* ################### */ + m128iAdd = _mm_set1_epi32(add_2nd); + + m128iAC = _mm_unpacklo_epi16(m128iA, m128iD); + m128iBD = _mm_unpackhi_epi16(m128iA, m128iD); + + m128iTmp1 = _mm_madd_epi16(m128iAC, + _mm_load_si128((__m128i *) (transform4x4_luma[0]))); + m128iTmp2 = _mm_madd_epi16(m128iBD, + _mm_load_si128((__m128i *) (transform4x4_luma[1]))); + S0 = _mm_add_epi32(m128iTmp1, m128iTmp2); + S0 = _mm_add_epi32(S0, m128iAdd); + S0 = _mm_srai_epi32(S0, shift_2nd); + + m128iTmp1 = _mm_madd_epi16(m128iAC, + _mm_load_si128((__m128i *) (transform4x4_luma[2]))); + m128iTmp2 = _mm_madd_epi16(m128iBD, + _mm_load_si128((__m128i *) (transform4x4_luma[3]))); + S8 = _mm_add_epi32(m128iTmp1, m128iTmp2); + S8 = _mm_add_epi32(S8, m128iAdd); + S8 = _mm_srai_epi32(S8, shift_2nd); + + m128iA = _mm_packs_epi32(S0, S8); + + m128iTmp1 = _mm_madd_epi16(m128iAC, + _mm_load_si128((__m128i *) (transform4x4_luma[4]))); + m128iTmp2 = _mm_madd_epi16(m128iBD, + _mm_load_si128((__m128i *) (transform4x4_luma[5]))); + S0 = _mm_add_epi32(m128iTmp1, m128iTmp2); + S0 = _mm_add_epi32(S0, m128iAdd); + S0 = _mm_srai_epi32(S0, shift_2nd); + + m128iTmp1 = _mm_madd_epi16(m128iAC, + _mm_load_si128((__m128i *) (transform4x4_luma[6]))); + m128iTmp2 = _mm_madd_epi16(m128iBD, + _mm_load_si128((__m128i *) (transform4x4_luma[7]))); + S8 = _mm_add_epi32(m128iTmp1, m128iTmp2); + S8 = _mm_add_epi32(S8, m128iAdd); + S8 = _mm_srai_epi32(S8, shift_2nd); + + m128iD = _mm_packs_epi32(S0, S8); + + _mm_storeu_si128((__m128i *) (src), m128iA); + _mm_storeu_si128((__m128i *) (src + 8), m128iD); + j = 0; + for (i = 0; i < 2; 
i++) { + dst[0] = av_clip_uintp2(dst[0] + src[j],10); + dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10); + dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10); + dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10); + j += 1; + dst += stride; + dst[0] = av_clip_uintp2(dst[0] + src[j],10); + dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10); + dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10); + dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10); + j += 1; + dst += stride; + } + +} +#endif + + +#if HAVE_SSE4_1 +void ff_hevc_transform_4x4_add_8_sse4(uint8_t *_dst, const int16_t *coeffs, + ptrdiff_t _stride) { + uint8_t shift_2nd = 12; // 20 - Bit depth + uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1)) + + uint8_t *dst = (uint8_t*) _dst; + ptrdiff_t stride = _stride; + const int16_t *src = coeffs; + + __m128i S0, S8, m128iAdd, m128Tmp, E1, E2, O1, O2, m128iA, m128iD, m128iTmp1,m128iTmp2; + S0 = _mm_load_si128((__m128i *) (src)); + S8 = _mm_load_si128((__m128i *) (src + 8)); + m128iAdd = _mm_set1_epi32(add_1st); + + m128Tmp = _mm_unpacklo_epi16(S0, S8); + E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0]))); + E1 = _mm_add_epi32(E1, m128iAdd); + + E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1]))); + E2 = _mm_add_epi32(E2, m128iAdd); + + m128Tmp = _mm_unpackhi_epi16(S0, S8); + O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2]))); + O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3]))); + + m128iA = _mm_add_epi32(E1, O1); + m128iA = _mm_srai_epi32(m128iA, shift_1st); // Sum = Sum >> iShiftNum + m128Tmp = _mm_add_epi32(E2, O2); + m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st); // Sum = Sum >> iShiftNum + m128iA = _mm_packs_epi32(m128iA, m128Tmp); + + m128iD = _mm_sub_epi32(E2, O2); + m128iD = _mm_srai_epi32(m128iD, shift_1st); // Sum = Sum >> iShiftNum + + m128Tmp = _mm_sub_epi32(E1, O1); + m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st); // Sum = Sum >> iShiftNum + + m128iD = 
_mm_packs_epi32(m128iD, m128Tmp); + + S0 = _mm_unpacklo_epi16(m128iA, m128iD); + S8 = _mm_unpackhi_epi16(m128iA, m128iD); + + m128iA = _mm_unpacklo_epi16(S0, S8); + m128iD = _mm_unpackhi_epi16(S0, S8); + + /* ########################## */ + + m128iAdd = _mm_set1_epi32(add_2nd); + m128Tmp = _mm_unpacklo_epi16(m128iA, m128iD); + E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0]))); + E1 = _mm_add_epi32(E1, m128iAdd); + + E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1]))); + E2 = _mm_add_epi32(E2, m128iAdd); + + m128Tmp = _mm_unpackhi_epi16(m128iA, m128iD); + O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2]))); + O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3]))); + + m128iA = _mm_add_epi32(E1, O1); + m128iA = _mm_srai_epi32(m128iA, shift_2nd); + m128Tmp = _mm_add_epi32(E2, O2); + m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd); + m128iA = _mm_packs_epi32(m128iA, m128Tmp); + + m128iD = _mm_sub_epi32(E2, O2); + m128iD = _mm_srai_epi32(m128iD, shift_2nd); + + m128Tmp = _mm_sub_epi32(E1, O1); + m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd); + + m128iD = _mm_packs_epi32(m128iD, m128Tmp); + + S0 = _mm_move_epi64(m128iA); //contains row 0 + S8 = _mm_move_epi64(m128iD); //row 2 + m128iA = _mm_srli_si128(m128iA, 8); // row 1 + m128iD = _mm_srli_si128(m128iD, 8); // row 3 + m128iTmp1 = _mm_unpacklo_epi16(S0, m128iA); + m128iTmp2 = _mm_unpacklo_epi16(S8, m128iD); + S0 = _mm_unpacklo_epi32(m128iTmp1, m128iTmp2); + S8 = _mm_unpackhi_epi32(m128iTmp1, m128iTmp2); + + //m128iTmp2 = _mm_set_epi32(0, 0, 0, -1); //mask to store 4 * 8bit data + + m128iA = _mm_loadl_epi64((__m128i *) dst); + m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128()); + m128iTmp1 = _mm_adds_epi16(S0, m128iA); //contains first 4 values + m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128()); + //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst); + *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1); + + dst 
+= stride; + + m128iA = _mm_loadl_epi64((__m128i *) dst); + m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128()); + m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S0, 8), m128iA); + m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128()); + //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst); + *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1); + + dst += stride; + + m128iA = _mm_loadl_epi64((__m128i *) dst); + m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128()); + m128iTmp1 = _mm_adds_epi16(S8, m128iA); + m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128()); + //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst); + *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1); + + dst += stride; + + m128iA = _mm_loadl_epi64((__m128i *) dst); + m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128()); + m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S8, 8), m128iA); + m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128()); + //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst); + *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1); +} +#endif + +#if 0 +void ff_hevc_transform_4x4_add_10_sse4(uint8_t *_dst, const int16_t *coeffs, + ptrdiff_t _stride) { + int i; + uint8_t shift_2nd = 10; // 20 - Bit depth + uint16_t add_2nd = 1 << 9; //(1 << (shift_2nd - 1)) + + uint16_t *dst = (uint16_t*) _dst; + ptrdiff_t stride = _stride/2; + int16_t *src = coeffs; + + int j; + __m128i S0, S8, m128iAdd, m128Tmp, E1, E2, O1, O2, m128iA, m128iD; + S0 = _mm_load_si128((__m128i *) (src)); + S8 = _mm_load_si128((__m128i *) (src + 8)); + m128iAdd = _mm_set1_epi32(add_1st); + + m128Tmp = _mm_unpacklo_epi16(S0, S8); + E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0]))); + E1 = _mm_add_epi32(E1, m128iAdd); + + E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1]))); + E2 = _mm_add_epi32(E2, m128iAdd); + + m128Tmp = _mm_unpackhi_epi16(S0, S8); + O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2]))); + O2 = 
_mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3]))); + + m128iA = _mm_add_epi32(E1, O1); + m128iA = _mm_srai_epi32(m128iA, shift_1st); // Sum = Sum >> iShiftNum + m128Tmp = _mm_add_epi32(E2, O2); + m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st); // Sum = Sum >> iShiftNum + m128iA = _mm_packs_epi32(m128iA, m128Tmp); + + m128iD = _mm_sub_epi32(E2, O2); + m128iD = _mm_srai_epi32(m128iD, shift_1st); // Sum = Sum >> iShiftNum + + m128Tmp = _mm_sub_epi32(E1, O1); + m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st); // Sum = Sum >> iShiftNum + + m128iD = _mm_packs_epi32(m128iD, m128Tmp); + + S0 = _mm_unpacklo_epi16(m128iA, m128iD); + S8 = _mm_unpackhi_epi16(m128iA, m128iD); + + m128iA = _mm_unpacklo_epi16(S0, S8); + m128iD = _mm_unpackhi_epi16(S0, S8); + + /* ########################## */ + + m128iAdd = _mm_set1_epi32(add_2nd); + m128Tmp = _mm_unpacklo_epi16(m128iA, m128iD); + E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0]))); + E1 = _mm_add_epi32(E1, m128iAdd); + + E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1]))); + E2 = _mm_add_epi32(E2, m128iAdd); + + m128Tmp = _mm_unpackhi_epi16(m128iA, m128iD); + O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2]))); + O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3]))); + + m128iA = _mm_add_epi32(E1, O1); + m128iA = _mm_srai_epi32(m128iA, shift_2nd); + m128Tmp = _mm_add_epi32(E2, O2); + m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd); + m128iA = _mm_packs_epi32(m128iA, m128Tmp); + + m128iD = _mm_sub_epi32(E2, O2); + m128iD = _mm_srai_epi32(m128iD, shift_2nd); + + m128Tmp = _mm_sub_epi32(E1, O1); + m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd); + + m128iD = _mm_packs_epi32(m128iD, m128Tmp); + _mm_storeu_si128((__m128i *) (src), m128iA); + _mm_storeu_si128((__m128i *) (src + 8), m128iD); + j = 0; + for (i = 0; i < 2; i++) { + dst[0] = av_clip_uintp2(dst[0] + src[j],10); + dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10); + dst[2] 
= av_clip_uintp2(dst[2] + src[j + 8],10); + dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10); + j += 1; + dst += stride; + dst[0] = av_clip_uintp2(dst[0] + src[j],10); + dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10); + dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10); + dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10); + j += 1; + dst += stride; + } +} +#endif + +#if HAVE_SSE4_1 +void ff_hevc_transform_8x8_add_8_sse4(uint8_t *_dst, const int16_t *coeffs, + ptrdiff_t _stride) { + uint8_t shift_2nd = 12; // 20 - Bit depth + uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1)) + + uint8_t *dst = (uint8_t*) _dst; + ptrdiff_t stride = _stride / sizeof(uint8_t); + const int16_t *src = coeffs; + __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, + m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h, + E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l, + + O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h, + T0,T1,T2,T3,T4,T5,T6,T7,T8,T9,T10,T11; + T0= _mm_load_si128((__m128i *) (transform8x8[0])); + T1= _mm_load_si128((__m128i *) (transform8x8[1])); + T2= _mm_load_si128((__m128i *) (transform8x8[2])); + T3= _mm_load_si128((__m128i *) (transform8x8[3])); + T4= _mm_load_si128((__m128i *) (transform8x8[4])); + T5= _mm_load_si128((__m128i *) (transform8x8[5])); + T6= _mm_load_si128((__m128i *) (transform8x8[6])); + T7= _mm_load_si128((__m128i *) (transform8x8[7])); + T8= _mm_load_si128((__m128i *) (transform8x8[8])); + T9= _mm_load_si128((__m128i *) (transform8x8[9])); + T10= _mm_load_si128((__m128i *) (transform8x8[10])); + T11= _mm_load_si128((__m128i *) (transform8x8[11])); + + m128iAdd = _mm_set1_epi32(add_1st); + + m128iS1 = _mm_load_si128((__m128i *) (src + 8)); + m128iS3 = _mm_load_si128((__m128i *) (src + 24)); + m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3); + E1l = _mm_madd_epi16(m128Tmp0, T0); + m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3); + E1h = _mm_madd_epi16(m128Tmp1, T0); + m128iS5 = 
_mm_load_si128((__m128i *) (src + 40)); + m128iS7 = _mm_load_si128((__m128i *) (src + 56)); + m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7); + E2l = _mm_madd_epi16(m128Tmp2, T1); + m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7); + E2h = _mm_madd_epi16(m128Tmp3, T1); + O0l = _mm_add_epi32(E1l, E2l); + O0h = _mm_add_epi32(E1h, E2h); + + E1l = _mm_madd_epi16(m128Tmp0, T2); + E1h = _mm_madd_epi16(m128Tmp1, T2); + E2l = _mm_madd_epi16(m128Tmp2, T3); + E2h = _mm_madd_epi16(m128Tmp3, T3); + + O1l = _mm_add_epi32(E1l, E2l); + O1h = _mm_add_epi32(E1h, E2h); + + E1l = _mm_madd_epi16(m128Tmp0, T4); + E1h = _mm_madd_epi16(m128Tmp1, T4); + E2l = _mm_madd_epi16(m128Tmp2, T5); + E2h = _mm_madd_epi16(m128Tmp3, T5); + O2l = _mm_add_epi32(E1l, E2l); + O2h = _mm_add_epi32(E1h, E2h); + + E1l = _mm_madd_epi16(m128Tmp0, T6); + E1h = _mm_madd_epi16(m128Tmp1, T6); + E2l = _mm_madd_epi16(m128Tmp2, T7); + E2h = _mm_madd_epi16(m128Tmp3, T7); + O3h = _mm_add_epi32(E1h, E2h); + O3l = _mm_add_epi32(E1l, E2l); + + /* ------- */ + + m128iS0 = _mm_load_si128((__m128i *) (src + 0)); + m128iS4 = _mm_load_si128((__m128i *) (src + 32)); + m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4); + EE0l = _mm_madd_epi16(m128Tmp0, T8); + m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4); + EE0h = _mm_madd_epi16(m128Tmp1, T8); + + EE1l = _mm_madd_epi16(m128Tmp0, T9); + EE1h = _mm_madd_epi16(m128Tmp1, T9); + + /* ------- */ + + m128iS2 = _mm_load_si128((__m128i *) (src + 16)); + m128iS6 = _mm_load_si128((__m128i *) (src + 48)); + m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6); + E00l = _mm_madd_epi16(m128Tmp0, T10); + m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6); + E00h = _mm_madd_epi16(m128Tmp1, T10); + E01l = _mm_madd_epi16(m128Tmp0, T11); + E01h = _mm_madd_epi16(m128Tmp1, T11); + E0l = _mm_add_epi32(EE0l, E00l); + E0l = _mm_add_epi32(E0l, m128iAdd); + E0h = _mm_add_epi32(EE0h, E00h); + E0h = _mm_add_epi32(E0h, m128iAdd); + E3l = _mm_sub_epi32(EE0l, E00l); + E3l = _mm_add_epi32(E3l, m128iAdd); + E3h = 
_mm_sub_epi32(EE0h, E00h); + E3h = _mm_add_epi32(E3h, m128iAdd); + + E1l = _mm_add_epi32(EE1l, E01l); + E1l = _mm_add_epi32(E1l, m128iAdd); + E1h = _mm_add_epi32(EE1h, E01h); + E1h = _mm_add_epi32(E1h, m128iAdd); + E2l = _mm_sub_epi32(EE1l, E01l); + E2l = _mm_add_epi32(E2l, m128iAdd); + E2h = _mm_sub_epi32(EE1h, E01h); + E2h = _mm_add_epi32(E2h, m128iAdd); + m128iS0 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_1st), + _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_1st)); + m128iS1 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_1st), + _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_1st)); + m128iS2 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_1st), + _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_1st)); + m128iS3 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_1st), + _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_1st)); + m128iS4 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_1st), + _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_1st)); + m128iS5 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_1st), + _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_1st)); + m128iS6 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_1st), + _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_1st)); + m128iS7 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_1st), + _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_1st)); + /* Invers matrix */ + + E0l = _mm_unpacklo_epi16(m128iS0, m128iS4); + E1l = _mm_unpacklo_epi16(m128iS1, m128iS5); + E2l = _mm_unpacklo_epi16(m128iS2, m128iS6); + E3l = _mm_unpacklo_epi16(m128iS3, m128iS7); + O0l = _mm_unpackhi_epi16(m128iS0, m128iS4); + O1l = _mm_unpackhi_epi16(m128iS1, m128iS5); + O2l = _mm_unpackhi_epi16(m128iS2, m128iS6); + O3l = _mm_unpackhi_epi16(m128iS3, m128iS7); + m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l); + m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l); + m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1); + 
m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1); + m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l); + m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l); + m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3); + m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3); + m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l); + m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l); + m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1); + m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1); + m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l); + m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l); + m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3); + m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3); + + m128iAdd = _mm_set1_epi32(add_2nd); + + m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3); + E1l = _mm_madd_epi16(m128Tmp0, T0); + m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3); + E1h = _mm_madd_epi16(m128Tmp1, T0); + m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7); + E2l = _mm_madd_epi16(m128Tmp2, T1); + m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7); + E2h = _mm_madd_epi16(m128Tmp3, T1); + O0l = _mm_add_epi32(E1l, E2l); + O0h = _mm_add_epi32(E1h, E2h); + E1l = _mm_madd_epi16(m128Tmp0, T2); + E1h = _mm_madd_epi16(m128Tmp1, T2); + E2l = _mm_madd_epi16(m128Tmp2, T3); + E2h = _mm_madd_epi16(m128Tmp3, T3); + O1l = _mm_add_epi32(E1l, E2l); + O1h = _mm_add_epi32(E1h, E2h); + E1l = _mm_madd_epi16(m128Tmp0, T4); + E1h = _mm_madd_epi16(m128Tmp1, T4); + E2l = _mm_madd_epi16(m128Tmp2, T5); + E2h = _mm_madd_epi16(m128Tmp3, T5); + O2l = _mm_add_epi32(E1l, E2l); + O2h = _mm_add_epi32(E1h, E2h); + E1l = _mm_madd_epi16(m128Tmp0, T6); + E1h = _mm_madd_epi16(m128Tmp1, T6); + E2l = _mm_madd_epi16(m128Tmp2, T7); + E2h = _mm_madd_epi16(m128Tmp3, T7); + O3h = _mm_add_epi32(E1h, E2h); + O3l = _mm_add_epi32(E1l, E2l); + + m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4); + EE0l = _mm_madd_epi16(m128Tmp0, T8); + m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4); + EE0h = _mm_madd_epi16(m128Tmp1, T8); + EE1l = _mm_madd_epi16(m128Tmp0, T9); + EE1h = _mm_madd_epi16(m128Tmp1, T9); + 
+ m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6); + E00l = _mm_madd_epi16(m128Tmp0, T10); + m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6); + E00h = _mm_madd_epi16(m128Tmp1, T10); + E01l = _mm_madd_epi16(m128Tmp0, T11); + E01h = _mm_madd_epi16(m128Tmp1, T11); + E0l = _mm_add_epi32(EE0l, E00l); + E0l = _mm_add_epi32(E0l, m128iAdd); + E0h = _mm_add_epi32(EE0h, E00h); + E0h = _mm_add_epi32(E0h, m128iAdd); + E3l = _mm_sub_epi32(EE0l, E00l); + E3l = _mm_add_epi32(E3l, m128iAdd); + E3h = _mm_sub_epi32(EE0h, E00h); + E3h = _mm_add_epi32(E3h, m128iAdd); + E1l = _mm_add_epi32(EE1l, E01l); + E1l = _mm_add_epi32(E1l, m128iAdd); + E1h = _mm_add_epi32(EE1h, E01h); + E1h = _mm_add_epi32(E1h, m128iAdd); + E2l = _mm_sub_epi32(EE1l, E01l); + E2l = _mm_add_epi32(E2l, m128iAdd); + E2h = _mm_sub_epi32(EE1h, E01h); + E2h = _mm_add_epi32(E2h, m128iAdd); + + m128iS0 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_2nd), + _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_2nd)); + m128iS1 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_2nd), + _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_2nd)); + m128iS2 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_2nd), + _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_2nd)); + m128iS3 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_2nd), + _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_2nd)); + m128iS4 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_2nd), + _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_2nd)); + m128iS5 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_2nd), + _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_2nd)); + m128iS6 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_2nd), + _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_2nd)); + m128iS7 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_2nd), + _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_2nd)); + + E0l = _mm_unpacklo_epi16(m128iS0, m128iS4); 
+ E1l = _mm_unpacklo_epi16(m128iS1, m128iS5); + E2l = _mm_unpacklo_epi16(m128iS2, m128iS6); + E3l = _mm_unpacklo_epi16(m128iS3, m128iS7); + O0l = _mm_unpackhi_epi16(m128iS0, m128iS4); + O1l = _mm_unpackhi_epi16(m128iS1, m128iS5); + O2l = _mm_unpackhi_epi16(m128iS2, m128iS6); + O3l = _mm_unpackhi_epi16(m128iS3, m128iS7); + m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l); + m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l); + m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1); + m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1); + m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l); + m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l); + m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3); + m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3); + m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l); + m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l); + m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1); + m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1); + m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l); + m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l); + m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3); + m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3); + + E0l = _mm_loadl_epi64((__m128i *) dst); + E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128()); + + E0l = _mm_adds_epi16(E0l, m128iS0); + E0l = _mm_packus_epi16(E0l, _mm_setzero_si128()); + _mm_storel_epi64((__m128i *) dst, E0l); + dst += stride; + + E0l = _mm_loadl_epi64((__m128i *) dst); + E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128()); + + E0l = _mm_adds_epi16(E0l, m128iS1); + E0l = _mm_packus_epi16(E0l, _mm_setzero_si128()); + _mm_storel_epi64((__m128i *) dst, E0l); + dst += stride; + + E0l = _mm_loadl_epi64((__m128i *) dst); + E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128()); + + E0l = _mm_adds_epi16(E0l, m128iS2); + E0l = _mm_packus_epi16(E0l, _mm_setzero_si128()); + _mm_storel_epi64((__m128i *) dst, E0l); + dst += stride; + + E0l = _mm_loadl_epi64((__m128i *) dst); + E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128()); + + E0l = _mm_adds_epi16(E0l, m128iS3); + E0l = _mm_packus_epi16(E0l, 
_mm_setzero_si128()); + _mm_storel_epi64((__m128i *) dst, E0l); + dst += stride; + + E0l = _mm_loadl_epi64((__m128i *) dst); + E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128()); + + E0l = _mm_adds_epi16(E0l, m128iS4); + E0l = _mm_packus_epi16(E0l, _mm_setzero_si128()); + _mm_storel_epi64((__m128i *) dst, E0l); + dst += stride; + + E0l = _mm_loadl_epi64((__m128i *) dst); + E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128()); + + E0l = _mm_adds_epi16(E0l, m128iS5); + E0l = _mm_packus_epi16(E0l, _mm_setzero_si128()); + _mm_storel_epi64((__m128i *) dst, E0l); + dst += stride; + + E0l = _mm_loadl_epi64((__m128i *) dst); + E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128()); + + E0l = _mm_adds_epi16(E0l, m128iS6); + E0l = _mm_packus_epi16(E0l, _mm_setzero_si128()); + _mm_storel_epi64((__m128i *) dst, E0l); + dst += stride; + + E0l = _mm_loadl_epi64((__m128i *) dst); + E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128()); + + E0l = _mm_adds_epi16(E0l, m128iS7); + E0l = _mm_packus_epi16(E0l, _mm_setzero_si128()); + _mm_storel_epi64((__m128i *) dst, E0l); + dst += stride; + +} +#endif +/* Disabled (#if 0) 10-bit variant of the 8x8 transform-and-add routine: two passes over the 64 coefficients using the transform8x8 table (rounding with add_1st/shift_1st, then add_2nd/shift_2nd), after which the result is added to the 16-bit samples at _dst and passed through av_clip_uintp2(..., 10) (presumably a clip to the unsigned 10-bit range - confirm). _stride is divided by sizeof(uint16_t) before it is used to step dst. NOTE(review): src is initialised from the const coeffs pointer without a cast and is later stored through; add_1st, shift_1st, transform8x8 and av_clip_uintp2 are not defined inside this block - presumably related to why it is disabled, confirm before enabling. */ +#if 0 +void ff_hevc_transform_8x8_add_10_sse4(uint8_t *_dst, const int16_t *coeffs, + ptrdiff_t _stride) { + int i; + uint16_t *dst = (uint16_t*) _dst; + ptrdiff_t stride = _stride / sizeof(uint16_t); + int16_t *src = coeffs; + uint8_t shift_2nd = 10; // 20 - Bit depth + uint16_t add_2nd = 1 << 9; //(1 << (shift_2nd - 1)) + + __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, + m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h, + E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l, + O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h; + int j; + m128iAdd = _mm_set1_epi32(add_1st); + /* First pass: the vectors at src+8/24/40/56 are multiplied with transform8x8[0..7] and summed into O0..O3 */ + m128iS1 = _mm_load_si128((__m128i *) (src + 8)); + m128iS3 = _mm_load_si128((__m128i *) (src + 24)); + m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3); + E1l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform8x8[0]))); + m128Tmp1 = 
_mm_unpackhi_epi16(m128iS1, m128iS3); + E1h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform8x8[0]))); + m128iS5 = _mm_load_si128((__m128i *) (src + 40)); + m128iS7 = _mm_load_si128((__m128i *) (src + 56)); + m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7); + E2l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform8x8[1]))); + m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7); + E2h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform8x8[1]))); + O0l = _mm_add_epi32(E1l, E2l); + O0h = _mm_add_epi32(E1h, E2h); + + E1l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform8x8[2]))); + E1h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform8x8[2]))); + E2l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform8x8[3]))); + E2h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform8x8[3]))); + + O1l = _mm_add_epi32(E1l, E2l); + O1h = _mm_add_epi32(E1h, E2h); + + E1l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform8x8[4]))); + E1h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform8x8[4]))); + E2l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform8x8[5]))); + E2h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform8x8[5]))); + O2l = _mm_add_epi32(E1l, E2l); + O2h = _mm_add_epi32(E1h, E2h); + + E1l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform8x8[6]))); + E1h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform8x8[6]))); + E2l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform8x8[7]))); + E2h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform8x8[7]))); + O3h = _mm_add_epi32(E1h, E2h); + O3l = _mm_add_epi32(E1l, E2l); + + /* ------- vectors at src+0 and src+32 with transform8x8[8..9] give EE0, EE1 ------- */ + + m128iS0 = _mm_load_si128((__m128i *) (src + 0)); + m128iS4 = _mm_load_si128((__m128i *) (src + 32)); + m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4); + EE0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) 
(transform8x8[8]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4); + EE0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform8x8[8]))); + + EE1l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform8x8[9]))); + EE1h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform8x8[9]))); + + /* ------- vectors at src+16 and src+48 with transform8x8[10..11] give E00, E01; E0..E3 are their sums/differences with EE0/EE1 plus the rounding constant ------- */ + + m128iS2 = _mm_load_si128((__m128i *) (src + 16)); + m128iS6 = _mm_load_si128((__m128i *) (src + 48)); + m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6); + E00l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform8x8[10]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6); + E00h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform8x8[10]))); + E01l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform8x8[11]))); + E01h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform8x8[11]))); + E0l = _mm_add_epi32(EE0l, E00l); + E0l = _mm_add_epi32(E0l, m128iAdd); + E0h = _mm_add_epi32(EE0h, E00h); + E0h = _mm_add_epi32(E0h, m128iAdd); + E3l = _mm_sub_epi32(EE0l, E00l); + E3l = _mm_add_epi32(E3l, m128iAdd); + E3h = _mm_sub_epi32(EE0h, E00h); + E3h = _mm_add_epi32(E3h, m128iAdd); + + E1l = _mm_add_epi32(EE1l, E01l); + E1l = _mm_add_epi32(E1l, m128iAdd); + E1h = _mm_add_epi32(EE1h, E01h); + E1h = _mm_add_epi32(E1h, m128iAdd); + E2l = _mm_sub_epi32(EE1l, E01l); + E2l = _mm_add_epi32(E2l, m128iAdd); + E2h = _mm_sub_epi32(EE1h, E01h); + E2h = _mm_add_epi32(E2h, m128iAdd); + m128iS0 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_1st), + _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_1st)); + m128iS1 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_1st), + _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_1st)); + m128iS2 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_1st), + _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_1st)); + m128iS3 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_1st), + 
_mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_1st)); + m128iS4 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_1st), + _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_1st)); + m128iS5 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_1st), + _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_1st)); + m128iS6 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_1st), + _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_1st)); + m128iS7 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_1st), + _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_1st)); + /* Transpose the 8x8 block of first-pass results (unpack lo/hi interleaves) */ + + E0l = _mm_unpacklo_epi16(m128iS0, m128iS4); + E1l = _mm_unpacklo_epi16(m128iS1, m128iS5); + E2l = _mm_unpacklo_epi16(m128iS2, m128iS6); + E3l = _mm_unpacklo_epi16(m128iS3, m128iS7); + O0l = _mm_unpackhi_epi16(m128iS0, m128iS4); + O1l = _mm_unpackhi_epi16(m128iS1, m128iS5); + O2l = _mm_unpackhi_epi16(m128iS2, m128iS6); + O3l = _mm_unpackhi_epi16(m128iS3, m128iS7); + m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l); + m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l); + m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1); + m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1); + m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l); + m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l); + m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3); + m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3); + m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l); + m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l); + m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1); + m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1); + m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l); + m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l); + m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3); + m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3); + /* Second pass: same computation on the transposed vectors, now rounding with add_2nd and shifting by shift_2nd */ + m128iAdd = _mm_set1_epi32(add_2nd); + + m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3); + E1l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform8x8[0]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3); + E1h = 
_mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform8x8[0]))); + m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7); + E2l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform8x8[1]))); + m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7); + E2h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform8x8[1]))); + O0l = _mm_add_epi32(E1l, E2l); + O0h = _mm_add_epi32(E1h, E2h); + E1l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform8x8[2]))); + E1h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform8x8[2]))); + E2l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform8x8[3]))); + E2h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform8x8[3]))); + O1l = _mm_add_epi32(E1l, E2l); + O1h = _mm_add_epi32(E1h, E2h); + E1l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform8x8[4]))); + E1h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform8x8[4]))); + E2l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform8x8[5]))); + E2h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform8x8[5]))); + O2l = _mm_add_epi32(E1l, E2l); + O2h = _mm_add_epi32(E1h, E2h); + E1l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform8x8[6]))); + E1h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform8x8[6]))); + E2l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform8x8[7]))); + E2h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform8x8[7]))); + O3h = _mm_add_epi32(E1h, E2h); + O3l = _mm_add_epi32(E1l, E2l); + + m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4); + EE0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform8x8[8]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4); + EE0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform8x8[8]))); + EE1l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform8x8[9]))); + EE1h = _mm_madd_epi16(m128Tmp1, + 
_mm_load_si128((__m128i *) (transform8x8[9]))); + + m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6); + E00l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform8x8[10]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6); + E00h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform8x8[10]))); + E01l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform8x8[11]))); + E01h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform8x8[11]))); + E0l = _mm_add_epi32(EE0l, E00l); + E0l = _mm_add_epi32(E0l, m128iAdd); + E0h = _mm_add_epi32(EE0h, E00h); + E0h = _mm_add_epi32(E0h, m128iAdd); + E3l = _mm_sub_epi32(EE0l, E00l); + E3l = _mm_add_epi32(E3l, m128iAdd); + E3h = _mm_sub_epi32(EE0h, E00h); + E3h = _mm_add_epi32(E3h, m128iAdd); + E1l = _mm_add_epi32(EE1l, E01l); + E1l = _mm_add_epi32(E1l, m128iAdd); + E1h = _mm_add_epi32(EE1h, E01h); + E1h = _mm_add_epi32(E1h, m128iAdd); + E2l = _mm_sub_epi32(EE1l, E01l); + E2l = _mm_add_epi32(E2l, m128iAdd); + E2h = _mm_sub_epi32(EE1h, E01h); + E2h = _mm_add_epi32(E2h, m128iAdd); + + m128iS0 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_2nd), + _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_2nd)); + m128iS1 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_2nd), + _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_2nd)); + m128iS2 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_2nd), + _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_2nd)); + m128iS3 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_2nd), + _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_2nd)); + m128iS4 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_2nd), + _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_2nd)); + m128iS5 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_2nd), + _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_2nd)); + m128iS6 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_2nd), + 
_mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_2nd)); + m128iS7 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_2nd), + _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_2nd)); + /* Store the eight second-pass vectors back through src without transposing them. NOTE(review): src was initialised from the const coeffs pointer, so these stores write into the caller's coefficient buffer - confirm that is intended */ + _mm_store_si128((__m128i *) (src), m128iS0); + _mm_store_si128((__m128i *) (src + 8), m128iS1); + _mm_store_si128((__m128i *) (src + 16), m128iS2); + _mm_store_si128((__m128i *) (src + 24), m128iS3); + _mm_store_si128((__m128i *) (src + 32), m128iS4); + _mm_store_si128((__m128i *) (src + 40), m128iS5); + _mm_store_si128((__m128i *) (src + 48), m128iS6); + _mm_store_si128((__m128i *) (src + 56), m128iS7); + /* Add the stored values to dst: src[j + 8*k] is element j of the vector stored at src + 8*k, so output row j takes one element from each stored vector; each loop iteration handles two rows (4 iterations, 8 rows) */ + j = 0; + for (i = 0; i < 4; i++) { + dst[0] = av_clip_uintp2(dst[0] + src[j],10); + dst[1] = av_clip_uintp2(dst[1] + src[j + 8],10); + dst[2] = av_clip_uintp2(dst[2] + src[j + 16],10); + dst[3] = av_clip_uintp2(dst[3] + src[j + 24],10); + dst[4] = av_clip_uintp2(dst[4] + src[j + 32],10); + dst[5] = av_clip_uintp2(dst[5] + src[j + 40],10); + dst[6] = av_clip_uintp2(dst[6] + src[j + 48],10); + dst[7] = av_clip_uintp2(dst[7] + src[j + 56],10); + j += 1; + dst += stride; + dst[0] = av_clip_uintp2(dst[0] + src[j],10); + dst[1] = av_clip_uintp2(dst[1] + src[j + 8],10); + dst[2] = av_clip_uintp2(dst[2] + src[j + 16],10); + dst[3] = av_clip_uintp2(dst[3] + src[j + 24],10); + dst[4] = av_clip_uintp2(dst[4] + src[j + 32],10); + dst[5] = av_clip_uintp2(dst[5] + src[j + 40],10); + dst[6] = av_clip_uintp2(dst[6] + src[j + 48],10); + dst[7] = av_clip_uintp2(dst[7] + src[j + 56],10); + j += 1; + dst += stride; + } + +} +#endif + + +#if HAVE_SSE4_1 +void ff_hevc_transform_16x16_add_8_sse4(uint8_t *_dst, const int16_t *coeffs, + ptrdiff_t _stride) { + uint8_t shift_2nd = 12; // 20 - Bit depth + uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1)) + int i; + uint8_t *dst = (uint8_t*) _dst; + ptrdiff_t stride = _stride / sizeof(uint8_t); + const int16_t *src = coeffs; + int32_t shift; + __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, + m128iS7, m128iS8, m128iS9, 
m128iS10, m128iS11, m128iS12, m128iS13, + m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, + m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h, + E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h, + O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l, + E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h; + __m128i E4l, E5l, E6l, E7l; + __m128i E4h, E5h, E6h, E7h; + __m128i r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15; + __m128i r16,r17,r18,r19,r20,r21,r22,r23,r24,r25,r26,r27,r28,r29,r30,r31; + + + /*__m128i T00,T01, T02, T03, T04, T05, T06, T07; + __m128i T10,T11, T12, T13, T14, T15, T16, T17; + __m128i T20,T21, T22, T23, T24, T25, T26, T27; + __m128i T30,T31, T32, T33, T34, T35, T36, T37; + + __m128i U00,U01, U02, U03, U10, U11, U12, U13; + + __m128i V00,V01, V10, V11;*/ + + + const __m128i T00 = _mm_load_si128((__m128i *) (transform16x16_1[0][0])); + const __m128i T01 = _mm_load_si128((__m128i *) (transform16x16_1[0][1])); + const __m128i T02 = _mm_load_si128((__m128i *) (transform16x16_1[0][2])); + const __m128i T03 = _mm_load_si128((__m128i *) (transform16x16_1[0][3])); + const __m128i T04 = _mm_load_si128((__m128i *) (transform16x16_1[0][4])); + const __m128i T05 = _mm_load_si128((__m128i *) (transform16x16_1[0][5])); + const __m128i T06 = _mm_load_si128((__m128i *) (transform16x16_1[0][6])); + const __m128i T07 = _mm_load_si128((__m128i *) (transform16x16_1[0][7])); + const __m128i T10 = _mm_load_si128((__m128i *) (transform16x16_1[1][0])); + const __m128i T11 = _mm_load_si128((__m128i *) (transform16x16_1[1][1])); + const __m128i T12 = _mm_load_si128((__m128i *) (transform16x16_1[1][2])); + const __m128i T13 = _mm_load_si128((__m128i *) (transform16x16_1[1][3])); + const __m128i T14 = _mm_load_si128((__m128i *) (transform16x16_1[1][4])); + const __m128i T15 = _mm_load_si128((__m128i *) (transform16x16_1[1][5])); + const __m128i T16 = _mm_load_si128((__m128i *) (transform16x16_1[1][6])); + const __m128i 
T17 = _mm_load_si128((__m128i *) (transform16x16_1[1][7])); + const __m128i T20 = _mm_load_si128((__m128i *) (transform16x16_1[2][0])); + const __m128i T21 = _mm_load_si128((__m128i *) (transform16x16_1[2][1])); + const __m128i T22 = _mm_load_si128((__m128i *) (transform16x16_1[2][2])); + const __m128i T23 = _mm_load_si128((__m128i *) (transform16x16_1[2][3])); + const __m128i T24 = _mm_load_si128((__m128i *) (transform16x16_1[2][4])); + const __m128i T25 = _mm_load_si128((__m128i *) (transform16x16_1[2][5])); + const __m128i T26 = _mm_load_si128((__m128i *) (transform16x16_1[2][6])); + const __m128i T27 = _mm_load_si128((__m128i *) (transform16x16_1[2][7])); + const __m128i T30 = _mm_load_si128((__m128i *) (transform16x16_1[3][0])); + const __m128i T31 = _mm_load_si128((__m128i *) (transform16x16_1[3][1])); + const __m128i T32 = _mm_load_si128((__m128i *) (transform16x16_1[3][2])); + const __m128i T33 = _mm_load_si128((__m128i *) (transform16x16_1[3][3])); + const __m128i T34 = _mm_load_si128((__m128i *) (transform16x16_1[3][4])); + const __m128i T35 = _mm_load_si128((__m128i *) (transform16x16_1[3][5])); + const __m128i T36 = _mm_load_si128((__m128i *) (transform16x16_1[3][6])); + const __m128i T37 = _mm_load_si128((__m128i *) (transform16x16_1[3][7])); + + const __m128i U00 = _mm_load_si128((__m128i *) (transform16x16_2[0][0])); + const __m128i U01 = _mm_load_si128((__m128i *) (transform16x16_2[0][1])); + const __m128i U02 = _mm_load_si128((__m128i *) (transform16x16_2[0][2])); + const __m128i U03 = _mm_load_si128((__m128i *) (transform16x16_2[0][3])); + const __m128i U10 = _mm_load_si128((__m128i *) (transform16x16_2[1][0])); + const __m128i U11 = _mm_load_si128((__m128i *) (transform16x16_2[1][1])); + const __m128i U12 = _mm_load_si128((__m128i *) (transform16x16_2[1][2])); + const __m128i U13 = _mm_load_si128((__m128i *) (transform16x16_2[1][3])); + + const __m128i V00 = _mm_load_si128((__m128i *) (transform16x16_3[0][0])); + const __m128i V01 = 
_mm_load_si128((__m128i *) (transform16x16_3[0][1])); + const __m128i V10 = _mm_load_si128((__m128i *) (transform16x16_3[1][0])); + const __m128i V11 = _mm_load_si128((__m128i *) (transform16x16_3[1][1])); + + + + int j; + m128iS0 = _mm_load_si128((__m128i *) (src)); + m128iS1 = _mm_load_si128((__m128i *) (src + 16)); + m128iS2 = _mm_load_si128((__m128i *) (src + 32)); + m128iS3 = _mm_load_si128((__m128i *) (src + 48)); + m128iS4 = _mm_loadu_si128((__m128i *) (src + 64)); + m128iS5 = _mm_load_si128((__m128i *) (src + 80)); + m128iS6 = _mm_load_si128((__m128i *) (src + 96)); + m128iS7 = _mm_load_si128((__m128i *) (src + 112)); + m128iS8 = _mm_load_si128((__m128i *) (src + 128)); + m128iS9 = _mm_load_si128((__m128i *) (src + 144)); + m128iS10 = _mm_load_si128((__m128i *) (src + 160)); + m128iS11 = _mm_load_si128((__m128i *) (src + 176)); + m128iS12 = _mm_load_si128((__m128i *) (src + 192)); + m128iS13 = _mm_load_si128((__m128i *) (src + 208)); + m128iS14 = _mm_load_si128((__m128i *) (src + 224)); + m128iS15 = _mm_load_si128((__m128i *) (src + 240)); + shift = shift_1st; + m128iAdd = _mm_set1_epi32(add_1st); + + for (j = 0; j < 2; j++) { + for (i = 0; i < 16; i += 8) { + + m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3); + E0l = _mm_madd_epi16(m128Tmp0,T00); + m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3); + E0h = _mm_madd_epi16(m128Tmp1,T00); + + m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7); + E1l = _mm_madd_epi16(m128Tmp2,T10); + m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7); + E1h = _mm_madd_epi16(m128Tmp3,T10); + + m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11); + E2l = _mm_madd_epi16(m128Tmp4,T20); + m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11); + E2h = _mm_madd_epi16(m128Tmp5,T20); + + m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15); + E3l = _mm_madd_epi16(m128Tmp6,T30); + m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15); + E3h = _mm_madd_epi16(m128Tmp7,T30); + + O0l = _mm_add_epi32(E0l, E1l); + O0l = _mm_add_epi32(O0l, E2l); + O0l = 
_mm_add_epi32(O0l, E3l); + + O0h = _mm_add_epi32(E0h, E1h); + O0h = _mm_add_epi32(O0h, E2h); + O0h = _mm_add_epi32(O0h, E3h); + + /* Compute O1*/ + E0l = _mm_madd_epi16(m128Tmp0,T01); + E0h = _mm_madd_epi16(m128Tmp1,T01); + E1l = _mm_madd_epi16(m128Tmp2,T11); + E1h = _mm_madd_epi16(m128Tmp3,T11); + E2l = _mm_madd_epi16(m128Tmp4,T21); + E2h = _mm_madd_epi16(m128Tmp5,T21); + E3l = _mm_madd_epi16(m128Tmp6,T31); + E3h = _mm_madd_epi16(m128Tmp7,T31); + O1l = _mm_add_epi32(E0l, E1l); + O1l = _mm_add_epi32(O1l, E2l); + O1l = _mm_add_epi32(O1l, E3l); + O1h = _mm_add_epi32(E0h, E1h); + O1h = _mm_add_epi32(O1h, E2h); + O1h = _mm_add_epi32(O1h, E3h); + + /* Compute O2*/ + E0l = _mm_madd_epi16(m128Tmp0,T02); + E0h = _mm_madd_epi16(m128Tmp1,T02); + E1l = _mm_madd_epi16(m128Tmp2,T12); + E1h = _mm_madd_epi16(m128Tmp3,T12); + E2l = _mm_madd_epi16(m128Tmp4,T22); + E2h = _mm_madd_epi16(m128Tmp5,T22); + E3l = _mm_madd_epi16(m128Tmp6,T32); + E3h = _mm_madd_epi16(m128Tmp7,T32); + O2l = _mm_add_epi32(E0l, E1l); + O2l = _mm_add_epi32(O2l, E2l); + O2l = _mm_add_epi32(O2l, E3l); + + O2h = _mm_add_epi32(E0h, E1h); + O2h = _mm_add_epi32(O2h, E2h); + O2h = _mm_add_epi32(O2h, E3h); + + /* Compute O3*/ + E0l = _mm_madd_epi16(m128Tmp0,T03); + E0h = _mm_madd_epi16(m128Tmp1,T03); + E1l = _mm_madd_epi16(m128Tmp2,T13); + E1h = _mm_madd_epi16(m128Tmp3,T13); + E2l = _mm_madd_epi16(m128Tmp4,T23); + E2h = _mm_madd_epi16(m128Tmp5,T23); + E3l = _mm_madd_epi16(m128Tmp6,T33); + E3h = _mm_madd_epi16(m128Tmp7,T33); + + O3l = _mm_add_epi32(E0l, E1l); + O3l = _mm_add_epi32(O3l, E2l); + O3l = _mm_add_epi32(O3l, E3l); + + O3h = _mm_add_epi32(E0h, E1h); + O3h = _mm_add_epi32(O3h, E2h); + O3h = _mm_add_epi32(O3h, E3h); + + /* Compute O4*/ + + E0l = _mm_madd_epi16(m128Tmp0,T04); + E0h = _mm_madd_epi16(m128Tmp1,T04); + E1l = _mm_madd_epi16(m128Tmp2,T14); + E1h = _mm_madd_epi16(m128Tmp3,T14); + E2l = _mm_madd_epi16(m128Tmp4,T24); + E2h = _mm_madd_epi16(m128Tmp5,T24); + E3l = _mm_madd_epi16(m128Tmp6,T34); + E3h = 
_mm_madd_epi16(m128Tmp7,T34); + + O4l = _mm_add_epi32(E0l, E1l); + O4l = _mm_add_epi32(O4l, E2l); + O4l = _mm_add_epi32(O4l, E3l); + + O4h = _mm_add_epi32(E0h, E1h); + O4h = _mm_add_epi32(O4h, E2h); + O4h = _mm_add_epi32(O4h, E3h); + + /* Compute O5*/ + E0l = _mm_madd_epi16(m128Tmp0,T05); + E0h = _mm_madd_epi16(m128Tmp1,T05); + E1l = _mm_madd_epi16(m128Tmp2,T15); + E1h = _mm_madd_epi16(m128Tmp3,T15); + E2l = _mm_madd_epi16(m128Tmp4,T25); + E2h = _mm_madd_epi16(m128Tmp5,T25); + E3l = _mm_madd_epi16(m128Tmp6,T35); + E3h = _mm_madd_epi16(m128Tmp7,T35); + + O5l = _mm_add_epi32(E0l, E1l); + O5l = _mm_add_epi32(O5l, E2l); + O5l = _mm_add_epi32(O5l, E3l); + + O5h = _mm_add_epi32(E0h, E1h); + O5h = _mm_add_epi32(O5h, E2h); + O5h = _mm_add_epi32(O5h, E3h); + + /* Compute O6*/ + + E0l = _mm_madd_epi16(m128Tmp0,T06); + E0h = _mm_madd_epi16(m128Tmp1,T06); + E1l = _mm_madd_epi16(m128Tmp2,T16); + E1h = _mm_madd_epi16(m128Tmp3,T16); + E2l = _mm_madd_epi16(m128Tmp4,T26); + E2h = _mm_madd_epi16(m128Tmp5,T26); + E3l = _mm_madd_epi16(m128Tmp6,T36); + E3h = _mm_madd_epi16(m128Tmp7,T36); + + O6l = _mm_add_epi32(E0l, E1l); + O6l = _mm_add_epi32(O6l, E2l); + O6l = _mm_add_epi32(O6l, E3l); + + O6h = _mm_add_epi32(E0h, E1h); + O6h = _mm_add_epi32(O6h, E2h); + O6h = _mm_add_epi32(O6h, E3h); + + /* Compute O7*/ + + E0l = _mm_madd_epi16(m128Tmp0,T07); + E0h = _mm_madd_epi16(m128Tmp1,T07); + E1l = _mm_madd_epi16(m128Tmp2,T17); + E1h = _mm_madd_epi16(m128Tmp3,T17); + E2l = _mm_madd_epi16(m128Tmp4,T27); + E2h = _mm_madd_epi16(m128Tmp5,T27); + E3l = _mm_madd_epi16(m128Tmp6,T37); + E3h = _mm_madd_epi16(m128Tmp7,T37); + + O7l = _mm_add_epi32(E0l, E1l); + O7l = _mm_add_epi32(O7l, E2l); + O7l = _mm_add_epi32(O7l, E3l); + + O7h = _mm_add_epi32(E0h, E1h); + O7h = _mm_add_epi32(O7h, E2h); + O7h = _mm_add_epi32(O7h, E3h); + + /* Compute E0 */ + + + + m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6); + E0l = _mm_madd_epi16(m128Tmp0,U00); + m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6); + E0h = 
_mm_madd_epi16(m128Tmp1,U00); + + m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14); + E0l = _mm_add_epi32(E0l, + _mm_madd_epi16(m128Tmp2,U10)); + m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14); + E0h = _mm_add_epi32(E0h, + _mm_madd_epi16(m128Tmp3,U10)); + + /* Compute E1 */ + E1l = _mm_madd_epi16(m128Tmp0,U01); + E1h = _mm_madd_epi16(m128Tmp1,U01); + E1l = _mm_add_epi32(E1l, + _mm_madd_epi16(m128Tmp2,U11)); + E1h = _mm_add_epi32(E1h, + _mm_madd_epi16(m128Tmp3,U11)); + + /* Compute E2 */ + E2l = _mm_madd_epi16(m128Tmp0,U02); + E2h = _mm_madd_epi16(m128Tmp1,U02); + E2l = _mm_add_epi32(E2l, + _mm_madd_epi16(m128Tmp2,U12)); + E2h = _mm_add_epi32(E2h, + _mm_madd_epi16(m128Tmp3,U12)); + /* Compute E3 */ + E3l = _mm_madd_epi16(m128Tmp0,U03); + E3h = _mm_madd_epi16(m128Tmp1,U03); + E3l = _mm_add_epi32(E3l, + _mm_madd_epi16(m128Tmp2,U13)); + E3h = _mm_add_epi32(E3h, + _mm_madd_epi16(m128Tmp3,U13)); + + /* Compute EE0 and EEE */ + + m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12); + E00l = _mm_madd_epi16(m128Tmp0,V00); + m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12); + E00h = _mm_madd_epi16(m128Tmp1,V00); + + m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS8); + EE0l = _mm_madd_epi16(m128Tmp2,V10); + m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS8); + EE0h = _mm_madd_epi16(m128Tmp3,V10); + + E01l = _mm_madd_epi16(m128Tmp0,V01); + E01h = _mm_madd_epi16(m128Tmp1,V01); + + EE1l = _mm_madd_epi16(m128Tmp2,V11); + EE1h = _mm_madd_epi16(m128Tmp3,V11); + + /* Compute EE */ + EE2l = _mm_sub_epi32(EE1l, E01l); + EE3l = _mm_sub_epi32(EE0l, E00l); + EE2h = _mm_sub_epi32(EE1h, E01h); + EE3h = _mm_sub_epi32(EE0h, E00h); + + EE0l = _mm_add_epi32(EE0l, E00l); + EE1l = _mm_add_epi32(EE1l, E01l); + EE0h = _mm_add_epi32(EE0h, E00h); + EE1h = _mm_add_epi32(EE1h, E01h); + + /* Compute E */ + + E4l = _mm_sub_epi32(EE3l, E3l); + E4l = _mm_add_epi32(E4l, m128iAdd); + + E5l = _mm_sub_epi32(EE2l, E2l); + E5l = _mm_add_epi32(E5l, m128iAdd); + + E6l = _mm_sub_epi32(EE1l, E1l); + E6l = 
_mm_add_epi32(E6l, m128iAdd); + + E7l = _mm_sub_epi32(EE0l, E0l); + E7l = _mm_add_epi32(E7l, m128iAdd); + + E4h = _mm_sub_epi32(EE3h, E3h); + E4h = _mm_add_epi32(E4h, m128iAdd); + + E5h = _mm_sub_epi32(EE2h, E2h); + E5h = _mm_add_epi32(E5h, m128iAdd); + + E6h = _mm_sub_epi32(EE1h, E1h); + E6h = _mm_add_epi32(E6h, m128iAdd); + + E7h = _mm_sub_epi32(EE0h, E0h); + E7h = _mm_add_epi32(E7h, m128iAdd); + + E0l = _mm_add_epi32(EE0l, E0l); + E0l = _mm_add_epi32(E0l, m128iAdd); + + E1l = _mm_add_epi32(EE1l, E1l); + E1l = _mm_add_epi32(E1l, m128iAdd); + + E2l = _mm_add_epi32(EE2l, E2l); + E2l = _mm_add_epi32(E2l, m128iAdd); + + E3l = _mm_add_epi32(EE3l, E3l); + E3l = _mm_add_epi32(E3l, m128iAdd); + + E0h = _mm_add_epi32(EE0h, E0h); + E0h = _mm_add_epi32(E0h, m128iAdd); + + E1h = _mm_add_epi32(EE1h, E1h); + E1h = _mm_add_epi32(E1h, m128iAdd); + + E2h = _mm_add_epi32(EE2h, E2h); + E2h = _mm_add_epi32(E2h, m128iAdd); + + E3h = _mm_add_epi32(EE3h, E3h); + E3h = _mm_add_epi32(E3h, m128iAdd); + + m128iS0 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift), + _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift)); + m128iS1 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift), + _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift)); + m128iS2 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift), + _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift)); + m128iS3 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift), + _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift)); + + m128iS4 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift), + _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift)); + m128iS5 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift), + _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift)); + m128iS6 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift), + _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift)); + m128iS7 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift), + 
_mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift)); + + m128iS15 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift), + _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift)); + m128iS14 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift), + _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift)); + m128iS13 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift), + _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift)); + m128iS12 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift), + _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift)); + + m128iS11 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift), + _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift)); + m128iS10 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift), + _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift)); + m128iS9 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift), + _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift)); + m128iS8 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift), + _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift)); + + + + if (!j) { //first pass + + /* Inverse the matrix */ + E0l = _mm_unpacklo_epi16(m128iS0, m128iS8); + E1l = _mm_unpacklo_epi16(m128iS1, m128iS9); + E2l = _mm_unpacklo_epi16(m128iS2, m128iS10); + E3l = _mm_unpacklo_epi16(m128iS3, m128iS11); + E4l = _mm_unpacklo_epi16(m128iS4, m128iS12); + E5l = _mm_unpacklo_epi16(m128iS5, m128iS13); + E6l = _mm_unpacklo_epi16(m128iS6, m128iS14); + E7l = _mm_unpacklo_epi16(m128iS7, m128iS15); + + E0h = _mm_unpackhi_epi16(m128iS0, m128iS8); + E1h = _mm_unpackhi_epi16(m128iS1, m128iS9); + E2h = _mm_unpackhi_epi16(m128iS2, m128iS10); + E3h = _mm_unpackhi_epi16(m128iS3, m128iS11); + E4h = _mm_unpackhi_epi16(m128iS4, m128iS12); + E5h = _mm_unpackhi_epi16(m128iS5, m128iS13); + E6h = _mm_unpackhi_epi16(m128iS6, m128iS14); + E7h = _mm_unpackhi_epi16(m128iS7, m128iS15); + + m128Tmp0 = _mm_unpacklo_epi16(E0l, E4l); + m128Tmp1 = _mm_unpacklo_epi16(E1l, E5l); + 
m128Tmp2 = _mm_unpacklo_epi16(E2l, E6l); + m128Tmp3 = _mm_unpacklo_epi16(E3l, E7l); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp0 = _mm_unpackhi_epi16(E0l, E4l); + m128Tmp1 = _mm_unpackhi_epi16(E1l, E5l); + m128Tmp2 = _mm_unpackhi_epi16(E2l, E6l); + m128Tmp3 = _mm_unpackhi_epi16(E3l, E7l); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h); + m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h); + m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h); + m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h); + m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h); + m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h); + m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS12 
= _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + if (!i) { + + r0= m128iS0; //0 + r1= m128iS1; //16 + r2= m128iS2; //32 + r3= m128iS3; //48 + r4= m128iS4; //64 + r5= m128iS5; //80 + r6= m128iS6; //96 + r7= m128iS7; //112 + r8= m128iS8; //128 + r9= m128iS9; //144 + r10= m128iS10; //160 + r11= m128iS11; //176 + r12= m128iS12; //192 + r13= m128iS13; //208 + r14= m128iS14; //224 + r15= m128iS15; //240 + + + + m128iS0 = _mm_load_si128((__m128i *) (src + 8)); + m128iS1 = _mm_load_si128((__m128i *) (src + 24)); + m128iS2 = _mm_load_si128((__m128i *) (src + 40)); + m128iS3 = _mm_load_si128((__m128i *) (src + 56)); + m128iS4 = _mm_loadu_si128((__m128i *) (src + 72)); + m128iS5 = _mm_load_si128((__m128i *) (src + 88)); + m128iS6 = _mm_load_si128((__m128i *) (src + 104)); + m128iS7 = _mm_load_si128((__m128i *) (src + 120)); + m128iS8 = _mm_load_si128((__m128i *) (src + 136)); + m128iS9 = _mm_load_si128((__m128i *) (src + 152)); + m128iS10 = _mm_load_si128((__m128i *) (src + 168)); + m128iS11 = _mm_load_si128((__m128i *) (src + 184)); + m128iS12 = _mm_load_si128((__m128i *) (src + 200)); + m128iS13 = _mm_load_si128((__m128i *) (src + 216)); + m128iS14 = _mm_load_si128((__m128i *) (src + 232)); + m128iS15 = _mm_load_si128((__m128i *) (src + 248)); + } else { + + r16= m128iS0; //8 + r17= m128iS1; //24 + r18= m128iS2; //40 + r19= m128iS3; //56 + r20= m128iS4; //72 + r21= m128iS5; //88 + r22= m128iS6; //104 + r23= m128iS7; //120 + r24= m128iS8; //136 + r25= m128iS9; //152 + r26= m128iS10; //168 + r27= m128iS11; //184 + r28= m128iS12; //200 + r29= m128iS13; //216 + r30= m128iS14; //232 + r31= m128iS15; //248 + + //prepare next iteration : + + m128iS0= r0; + m128iS1= r2; + m128iS2= r4; + m128iS3= r6; + m128iS4= r8; + 
m128iS5= r10; + m128iS6= r12; + m128iS7= r14; + m128iS8= r16; + m128iS9= r18; + m128iS10=r20; + m128iS11=r22; + m128iS12=r24; + m128iS13=r26; + m128iS14=r28; + m128iS15=r30; + + shift = shift_2nd; + m128iAdd = _mm_set1_epi32(add_2nd); + } + + } else { + + //transpose half matrix : + //instead of having 1 register = 1 half-column, + //1 register = 1 half-row. + E0l = _mm_unpacklo_epi16(m128iS0, m128iS1); + E1l = _mm_unpacklo_epi16(m128iS2, m128iS3); + E2l = _mm_unpacklo_epi16(m128iS4, m128iS5); + E3l = _mm_unpacklo_epi16(m128iS6, m128iS7); + E4l = _mm_unpacklo_epi16(m128iS8, m128iS9); + E5l = _mm_unpacklo_epi16(m128iS10, m128iS11); + E6l = _mm_unpacklo_epi16(m128iS12, m128iS13); + E7l = _mm_unpacklo_epi16(m128iS14, m128iS15); + + O0l = _mm_unpackhi_epi16(m128iS0, m128iS1); + O1l = _mm_unpackhi_epi16(m128iS2, m128iS3); + O2l = _mm_unpackhi_epi16(m128iS4, m128iS5); + O3l = _mm_unpackhi_epi16(m128iS6, m128iS7); + O4l = _mm_unpackhi_epi16(m128iS8, m128iS9); + O5l = _mm_unpackhi_epi16(m128iS10, m128iS11); + O6l = _mm_unpackhi_epi16(m128iS12, m128iS13); + O7l = _mm_unpackhi_epi16(m128iS14, m128iS15); + + + m128Tmp0 = _mm_unpacklo_epi32(E0l, E1l); + m128Tmp1 = _mm_unpacklo_epi32(E2l, E3l); + + m128Tmp2 = _mm_unpacklo_epi32(E4l, E5l); + m128Tmp3 = _mm_unpacklo_epi32(E6l, E7l); + + r0 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1); //1st half 1st row + r2 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3); //2nd half 1st row + + + r4 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1); //1st half 2nd row + r6 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3); //2nd hald 2nd row + + m128Tmp0 = _mm_unpackhi_epi32(E0l, E1l); + m128Tmp1 = _mm_unpackhi_epi32(E2l, E3l); + m128Tmp2 = _mm_unpackhi_epi32(E4l, E5l); + m128Tmp3 = _mm_unpackhi_epi32(E6l, E7l); + + + r8 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1); + r10 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3); + + r12 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1); + r14 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3); + + m128Tmp0 = _mm_unpacklo_epi32(O0l, O1l); + m128Tmp1 = 
_mm_unpacklo_epi32(O2l, O3l); + m128Tmp2 = _mm_unpacklo_epi32(O4l, O5l); + m128Tmp3 = _mm_unpacklo_epi32(O6l, O7l); + + r16 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1); + r18 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3); + + + r20 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1); + r22 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3); + + m128Tmp0 = _mm_unpackhi_epi32(O0l, O1l); + m128Tmp1 = _mm_unpackhi_epi32(O2l, O3l); + m128Tmp2 = _mm_unpackhi_epi32(O4l, O5l); + m128Tmp3 = _mm_unpackhi_epi32(O6l, O7l); + + r24 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1); + r26 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3); + + + r28 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1); + r30 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3); + + dst = (uint8_t*) (_dst + (i*stride)); + m128Tmp0= _mm_setzero_si128(); + m128Tmp1= _mm_load_si128((__m128i*)dst); + m128Tmp2= _mm_load_si128((__m128i*)(dst+stride)); + m128Tmp3= _mm_load_si128((__m128i*)(dst+2*stride)); + m128Tmp4= _mm_load_si128((__m128i*)(dst+3*stride)); + m128Tmp5= _mm_load_si128((__m128i*)(dst+4*stride)); + m128Tmp6= _mm_load_si128((__m128i*)(dst+5*stride)); + m128Tmp7= _mm_load_si128((__m128i*)(dst+6*stride)); + E0l= _mm_load_si128((__m128i*)(dst+7*stride)); + + + r0= _mm_adds_epi16(r0,_mm_unpacklo_epi8(m128Tmp1,m128Tmp0)); + r2= _mm_adds_epi16(r2,_mm_unpackhi_epi8(m128Tmp1,m128Tmp0)); + r0= _mm_packus_epi16(r0,r2); + + + + + r4= _mm_adds_epi16(r4,_mm_unpacklo_epi8(m128Tmp2,m128Tmp0)); + r6= _mm_adds_epi16(r6,_mm_unpackhi_epi8(m128Tmp2,m128Tmp0)); + r4= _mm_packus_epi16(r4,r6); + + + r8= _mm_adds_epi16(r8,_mm_unpacklo_epi8(m128Tmp3,m128Tmp0)); + r10= _mm_adds_epi16(r10,_mm_unpackhi_epi8(m128Tmp3,m128Tmp0)); + r8= _mm_packus_epi16(r8,r10); + + + r12= _mm_adds_epi16(r12,_mm_unpacklo_epi8(m128Tmp4,m128Tmp0)); + r14= _mm_adds_epi16(r14,_mm_unpackhi_epi8(m128Tmp4,m128Tmp0)); + r12= _mm_packus_epi16(r12,r14); + + + r16= _mm_adds_epi16(r16,_mm_unpacklo_epi8(m128Tmp5,m128Tmp0)); + r18= _mm_adds_epi16(r18,_mm_unpackhi_epi8(m128Tmp5,m128Tmp0)); + r16= 
_mm_packus_epi16(r16,r18); + + + r20= _mm_adds_epi16(r20,_mm_unpacklo_epi8(m128Tmp6,m128Tmp0)); + r22= _mm_adds_epi16(r22,_mm_unpackhi_epi8(m128Tmp6,m128Tmp0)); + r20= _mm_packus_epi16(r20,r22); + + + r24= _mm_adds_epi16(r24,_mm_unpacklo_epi8(m128Tmp7,m128Tmp0)); + r26= _mm_adds_epi16(r26,_mm_unpackhi_epi8(m128Tmp7,m128Tmp0)); + r24= _mm_packus_epi16(r24,r26); + + + + r28= _mm_adds_epi16(r28,_mm_unpacklo_epi8(E0l,m128Tmp0)); + r30= _mm_adds_epi16(r30,_mm_unpackhi_epi8(E0l,m128Tmp0)); + r28= _mm_packus_epi16(r28,r30); + + _mm_store_si128((__m128i*)dst,r0); + _mm_store_si128((__m128i*)(dst+stride),r4); + _mm_store_si128((__m128i*)(dst+2*stride),r8); + _mm_store_si128((__m128i*)(dst+3*stride),r12); + _mm_store_si128((__m128i*)(dst+4*stride),r16); + _mm_store_si128((__m128i*)(dst+5*stride),r20); + _mm_store_si128((__m128i*)(dst+6*stride),r24); + _mm_store_si128((__m128i*)(dst+7*stride),r28); + + + + if (!i) { + //first half done, can store ! + + + m128iS0= r1; + m128iS1= r3; + m128iS2= r5; + m128iS3= r7; + m128iS4= r9; + m128iS5= r11; + m128iS6= r13; + m128iS7= r15; + m128iS8= r17; + m128iS9= r19; + m128iS10=r21; + m128iS11=r23; + m128iS12=r25; + m128iS13=r27; + m128iS14=r29; + m128iS15=r31; + } + } + } + } +} +#endif + + +#if 0 /* Disabled (compiled out by this if 0). Judging by the name and by the final av_clip_uintp2(..., 10) calls this is the 16x16 transform-and-add for 10-bit samples. _dst is accessed as uint16_t and _stride is halved to index it, so _stride is presumably a byte stride (confirm against callers). coeffs is read as 16 rows of 16 int16 values, mostly through aligned loads. NOTE(review): src is a non-const alias of the const coeffs pointer and the function stores intermediate results through it, so the coefficient buffer is overwritten. shift_1st, add_1st, transform16x16_1, transform16x16_2, transform16x16_3 and av_clip_uintp2 are not defined inside this function. */ +void ff_hevc_transform_16x16_add_10_sse4(uint8_t *_dst, const int16_t *coeffs, + ptrdiff_t _stride) { + int i; + uint16_t *dst = (uint16_t*) _dst; + ptrdiff_t stride = _stride / 2; + int16_t *src = coeffs; + int32_t shift; + uint8_t shift_2nd = 10; //20 - bit depth + uint16_t add_2nd = 1 << 9; //shift - 1; + __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, + m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13, + m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, + m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h, + E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h, + O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l, + E00l, E01l, EE0h, EE1h, EE2h, EE3h,
E00h, E01h; + __m128i E4l, E5l, E6l, E7l; + __m128i E4h, E5h, E6h, E7h; + int j; + m128iS0 = _mm_load_si128((__m128i *) (src)); + m128iS1 = _mm_load_si128((__m128i *) (src + 16)); + m128iS2 = _mm_load_si128((__m128i *) (src + 32)); + m128iS3 = _mm_load_si128((__m128i *) (src + 48)); + m128iS4 = _mm_loadu_si128((__m128i *) (src + 64)); + m128iS5 = _mm_load_si128((__m128i *) (src + 80)); + m128iS6 = _mm_load_si128((__m128i *) (src + 96)); + m128iS7 = _mm_load_si128((__m128i *) (src + 112)); + m128iS8 = _mm_load_si128((__m128i *) (src + 128)); + m128iS9 = _mm_load_si128((__m128i *) (src + 144)); + m128iS10 = _mm_load_si128((__m128i *) (src + 160)); + m128iS11 = _mm_load_si128((__m128i *) (src + 176)); + m128iS12 = _mm_loadu_si128((__m128i *) (src + 192)); + m128iS13 = _mm_load_si128((__m128i *) (src + 208)); + m128iS14 = _mm_load_si128((__m128i *) (src + 224)); + m128iS15 = _mm_load_si128((__m128i *) (src + 240)); + shift = shift_1st; + m128iAdd = _mm_set1_epi32(add_1st); + + for (j = 0; j < 2; j++) { /* two passes: j == 0 runs with shift_1st and add_1st, j == 1 with shift_2nd and add_2nd */ + for (i = 0; i < 16; i += 8) { /* each pass is done in two halves of 8 int16 lanes; the odd-numbered inputs S1..S15 feed the O terms below */ + + m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3); + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][0]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][0]))); + + m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform16x16_1[1][0]))); + m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform16x16_1[1][0]))); + + m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform16x16_1[2][0]))); + m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform16x16_1[2][0]))); + + m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15); + E3l = _mm_madd_epi16(m128Tmp6, +
_mm_load_si128((__m128i *) (transform16x16_1[3][0]))); + m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform16x16_1[3][0]))); + + O0l = _mm_add_epi32(E0l, E1l); + O0l = _mm_add_epi32(O0l, E2l); + O0l = _mm_add_epi32(O0l, E3l); + + O0h = _mm_add_epi32(E0h, E1h); + O0h = _mm_add_epi32(O0h, E2h); + O0h = _mm_add_epi32(O0h, E3h); + + /* Compute O1*/ + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][1]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][1]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform16x16_1[1][1]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform16x16_1[1][1]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform16x16_1[2][1]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform16x16_1[2][1]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform16x16_1[3][1]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform16x16_1[3][1]))); + O1l = _mm_add_epi32(E0l, E1l); + O1l = _mm_add_epi32(O1l, E2l); + O1l = _mm_add_epi32(O1l, E3l); + O1h = _mm_add_epi32(E0h, E1h); + O1h = _mm_add_epi32(O1h, E2h); + O1h = _mm_add_epi32(O1h, E3h); + + /* Compute O2*/ + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][2]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][2]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform16x16_1[1][2]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform16x16_1[1][2]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform16x16_1[2][2]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform16x16_1[2][2]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform16x16_1[3][2]))); + E3h =
_mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform16x16_1[3][2]))); + O2l = _mm_add_epi32(E0l, E1l); + O2l = _mm_add_epi32(O2l, E2l); + O2l = _mm_add_epi32(O2l, E3l); + + O2h = _mm_add_epi32(E0h, E1h); + O2h = _mm_add_epi32(O2h, E2h); + O2h = _mm_add_epi32(O2h, E3h); + + /* Compute O3*/ + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][3]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][3]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform16x16_1[1][3]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform16x16_1[1][3]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform16x16_1[2][3]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform16x16_1[2][3]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform16x16_1[3][3]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform16x16_1[3][3]))); + + O3l = _mm_add_epi32(E0l, E1l); + O3l = _mm_add_epi32(O3l, E2l); + O3l = _mm_add_epi32(O3l, E3l); + + O3h = _mm_add_epi32(E0h, E1h); + O3h = _mm_add_epi32(O3h, E2h); + O3h = _mm_add_epi32(O3h, E3h); + + /* Compute O4*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][4]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][4]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform16x16_1[1][4]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform16x16_1[1][4]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform16x16_1[2][4]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform16x16_1[2][4]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform16x16_1[3][4]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform16x16_1[3][4]))); + + O4l = _mm_add_epi32(E0l,
E1l); + O4l = _mm_add_epi32(O4l, E2l); + O4l = _mm_add_epi32(O4l, E3l); + + O4h = _mm_add_epi32(E0h, E1h); + O4h = _mm_add_epi32(O4h, E2h); + O4h = _mm_add_epi32(O4h, E3h); + + /* Compute O5*/ + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][5]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][5]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform16x16_1[1][5]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform16x16_1[1][5]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform16x16_1[2][5]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform16x16_1[2][5]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform16x16_1[3][5]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform16x16_1[3][5]))); + + O5l = _mm_add_epi32(E0l, E1l); + O5l = _mm_add_epi32(O5l, E2l); + O5l = _mm_add_epi32(O5l, E3l); + + O5h = _mm_add_epi32(E0h, E1h); + O5h = _mm_add_epi32(O5h, E2h); + O5h = _mm_add_epi32(O5h, E3h); + + /* Compute O6*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][6]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][6]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform16x16_1[1][6]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform16x16_1[1][6]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform16x16_1[2][6]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform16x16_1[2][6]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform16x16_1[3][6]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform16x16_1[3][6]))); + + O6l = _mm_add_epi32(E0l, E1l); + O6l = _mm_add_epi32(O6l, E2l); + O6l = _mm_add_epi32(O6l, E3l); + + O6h = _mm_add_epi32(E0h, E1h); + O6h
= _mm_add_epi32(O6h, E2h); + O6h = _mm_add_epi32(O6h, E3h); + + /* Compute O7*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][7]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][7]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform16x16_1[1][7]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform16x16_1[1][7]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform16x16_1[2][7]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform16x16_1[2][7]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform16x16_1[3][7]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform16x16_1[3][7]))); + + O7l = _mm_add_epi32(E0l, E1l); + O7l = _mm_add_epi32(O7l, E2l); + O7l = _mm_add_epi32(O7l, E3l); + + O7h = _mm_add_epi32(E0h, E1h); + O7h = _mm_add_epi32(O7h, E2h); + O7h = _mm_add_epi32(O7h, E3h); + + /* Compute E0 */ + + m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6); + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_2[0][0]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_2[0][0]))); + + m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14); + E0l = _mm_add_epi32(E0l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][0])))); + m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14); + E0h = _mm_add_epi32(E0h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][0])))); + + /* Compute E1 */ + E1l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_2[0][1]))); + E1h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_2[0][1]))); + E1l = _mm_add_epi32(E1l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][1])))); + E1h = _mm_add_epi32(E1h, +
_mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][1])))); + + /* Compute E2 */ + E2l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_2[0][2]))); + E2h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_2[0][2]))); + E2l = _mm_add_epi32(E2l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][2])))); + E2h = _mm_add_epi32(E2h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][2])))); + /* Compute E3 */ + E3l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_2[0][3]))); + E3h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_2[0][3]))); + E3l = _mm_add_epi32(E3l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][3])))); + E3h = _mm_add_epi32(E3h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][3])))); + + /* Compute EE0 and EEE */ + + m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12); + E00l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_3[0][0]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12); + E00h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_3[0][0]))); + + m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS8); + EE0l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform16x16_3[1][0]))); + m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS8); + EE0h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform16x16_3[1][0]))); + + E01l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_3[0][1]))); + E01h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_3[0][1]))); + + EE1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform16x16_3[1][1]))); + EE1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform16x16_3[1][1]))); + + /* Compute EE */ + EE2l = _mm_sub_epi32(EE1l, E01l); + EE3l =
_mm_sub_epi32(EE0l, E00l); + EE2h = _mm_sub_epi32(EE1h, E01h); + EE3h = _mm_sub_epi32(EE0h, E00h); + + EE0l = _mm_add_epi32(EE0l, E00l); + EE1l = _mm_add_epi32(EE1l, E01l); + EE0h = _mm_add_epi32(EE0h, E00h); + EE1h = _mm_add_epi32(EE1h, E01h); + + /* Compute E */ + + E4l = _mm_sub_epi32(EE3l, E3l); + E4l = _mm_add_epi32(E4l, m128iAdd); + + E5l = _mm_sub_epi32(EE2l, E2l); + E5l = _mm_add_epi32(E5l, m128iAdd); + + E6l = _mm_sub_epi32(EE1l, E1l); + E6l = _mm_add_epi32(E6l, m128iAdd); + + E7l = _mm_sub_epi32(EE0l, E0l); + E7l = _mm_add_epi32(E7l, m128iAdd); + + E4h = _mm_sub_epi32(EE3h, E3h); + E4h = _mm_add_epi32(E4h, m128iAdd); + + E5h = _mm_sub_epi32(EE2h, E2h); + E5h = _mm_add_epi32(E5h, m128iAdd); + + E6h = _mm_sub_epi32(EE1h, E1h); + E6h = _mm_add_epi32(E6h, m128iAdd); + + E7h = _mm_sub_epi32(EE0h, E0h); + E7h = _mm_add_epi32(E7h, m128iAdd); + + E0l = _mm_add_epi32(EE0l, E0l); + E0l = _mm_add_epi32(E0l, m128iAdd); + + E1l = _mm_add_epi32(EE1l, E1l); + E1l = _mm_add_epi32(E1l, m128iAdd); + + E2l = _mm_add_epi32(EE2l, E2l); + E2l = _mm_add_epi32(E2l, m128iAdd); + + E3l = _mm_add_epi32(EE3l, E3l); + E3l = _mm_add_epi32(E3l, m128iAdd); + + E0h = _mm_add_epi32(EE0h, E0h); + E0h = _mm_add_epi32(E0h, m128iAdd); + + E1h = _mm_add_epi32(EE1h, E1h); + E1h = _mm_add_epi32(E1h, m128iAdd); + + E2h = _mm_add_epi32(EE2h, E2h); + E2h = _mm_add_epi32(E2h, m128iAdd); + + E3h = _mm_add_epi32(EE3h, E3h); + E3h = _mm_add_epi32(E3h, m128iAdd); + + m128iS0 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift), + _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift)); + m128iS1 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift), + _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift)); + m128iS2 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift), + _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift)); + m128iS3 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift), + _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift)); + + m128iS4 =
_mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift), + _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift)); + m128iS5 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift), + _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift)); + m128iS6 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift), + _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift)); + m128iS7 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift), + _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift)); + + m128iS15 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift), + _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift)); + m128iS14 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift), + _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift)); + m128iS13 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift), + _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift)); + m128iS12 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift), + _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift)); + + m128iS11 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift), + _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift)); + m128iS10 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift), + _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift)); + m128iS9 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift), + _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift)); + m128iS8 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift), + _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift)); + + if (!j) { /* first pass: transpose the results, store them back into src at column offset i, then load the inputs of the next step */ + /* Inverse the matrix */ + E0l = _mm_unpacklo_epi16(m128iS0, m128iS8); + E1l = _mm_unpacklo_epi16(m128iS1, m128iS9); + E2l = _mm_unpacklo_epi16(m128iS2, m128iS10); + E3l = _mm_unpacklo_epi16(m128iS3, m128iS11); + E4l = _mm_unpacklo_epi16(m128iS4, m128iS12); + E5l = _mm_unpacklo_epi16(m128iS5, m128iS13); + E6l = _mm_unpacklo_epi16(m128iS6, m128iS14); + E7l = _mm_unpacklo_epi16(m128iS7, m128iS15); + + O0l =
_mm_unpackhi_epi16(m128iS0, m128iS8); + O1l = _mm_unpackhi_epi16(m128iS1, m128iS9); + O2l = _mm_unpackhi_epi16(m128iS2, m128iS10); + O3l = _mm_unpackhi_epi16(m128iS3, m128iS11); + O4l = _mm_unpackhi_epi16(m128iS4, m128iS12); + O5l = _mm_unpackhi_epi16(m128iS5, m128iS13); + O6l = _mm_unpackhi_epi16(m128iS6, m128iS14); + O7l = _mm_unpackhi_epi16(m128iS7, m128iS15); + + m128Tmp0 = _mm_unpacklo_epi16(E0l, E4l); + m128Tmp1 = _mm_unpacklo_epi16(E1l, E5l); + m128Tmp2 = _mm_unpacklo_epi16(E2l, E6l); + m128Tmp3 = _mm_unpacklo_epi16(E3l, E7l); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp0 = _mm_unpackhi_epi16(E0l, E4l); + m128Tmp1 = _mm_unpackhi_epi16(E1l, E5l); + m128Tmp2 = _mm_unpackhi_epi16(E2l, E6l); + m128Tmp3 = _mm_unpackhi_epi16(E3l, E7l); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp0 = _mm_unpacklo_epi16(O0l, O4l); + m128Tmp1 = _mm_unpacklo_epi16(O1l, O5l); + m128Tmp2 = _mm_unpacklo_epi16(O2l, O6l); + m128Tmp3 = _mm_unpacklo_epi16(O3l, O7l); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0,
m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp0 = _mm_unpackhi_epi16(O0l, O4l); + m128Tmp1 = _mm_unpackhi_epi16(O1l, O5l); + m128Tmp2 = _mm_unpackhi_epi16(O2l, O6l); + m128Tmp3 = _mm_unpackhi_epi16(O3l, O7l); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + /* */ + _mm_store_si128((__m128i *) (src + i), m128iS0); + _mm_store_si128((__m128i *) (src + 16 + i), m128iS1); + _mm_store_si128((__m128i *) (src + 32 + i), m128iS2); + _mm_store_si128((__m128i *) (src + 48 + i), m128iS3); + _mm_store_si128((__m128i *) (src + 64 + i), m128iS4); + _mm_store_si128((__m128i *) (src + 80 + i), m128iS5); + _mm_store_si128((__m128i *) (src + 96 + i), m128iS6); + _mm_store_si128((__m128i *) (src + 112 + i), m128iS7); + _mm_store_si128((__m128i *) (src + 128 + i), m128iS8); + _mm_store_si128((__m128i *) (src + 144 + i), m128iS9); + _mm_store_si128((__m128i *) (src + 160 + i), m128iS10); + _mm_store_si128((__m128i *) (src + 176 + i), m128iS11); + _mm_store_si128((__m128i *) (src + 192 + i), m128iS12); + _mm_store_si128((__m128i *) (src + 208 + i), m128iS13); + _mm_store_si128((__m128i *) (src + 224 + i), m128iS14); + _mm_store_si128((__m128i *) (src + 240 + i), m128iS15); + + if (!i) { + m128iS0 = _mm_load_si128((__m128i *) (src + 8)); + m128iS1 = _mm_load_si128((__m128i *) (src + 24)); + m128iS2 = _mm_load_si128((__m128i *) (src + 40)); + m128iS3 = _mm_load_si128((__m128i *) (src + 56)); + m128iS4 = _mm_loadu_si128((__m128i *) (src + 72)); + m128iS5 = _mm_load_si128((__m128i
*) (src + 88)); + m128iS6 = _mm_load_si128((__m128i *) (src + 104)); + m128iS7 = _mm_load_si128((__m128i *) (src + 120)); + m128iS8 = _mm_load_si128((__m128i *) (src + 136)); + m128iS9 = _mm_load_si128((__m128i *) (src + 152)); + m128iS10 = _mm_load_si128((__m128i *) (src + 168)); + m128iS11 = _mm_load_si128((__m128i *) (src + 184)); + m128iS12 = _mm_loadu_si128((__m128i *) (src + 200)); + m128iS13 = _mm_load_si128((__m128i *) (src + 216)); + m128iS14 = _mm_load_si128((__m128i *) (src + 232)); + m128iS15 = _mm_load_si128((__m128i *) (src + 248)); + } else { /* both halves of pass 1 are stored: reload src for pass 2 and switch to the second-stage rounding */ + m128iS0 = _mm_load_si128((__m128i *) (src)); + m128iS1 = _mm_load_si128((__m128i *) (src + 32)); + m128iS2 = _mm_load_si128((__m128i *) (src + 64)); + m128iS3 = _mm_load_si128((__m128i *) (src + 96)); + m128iS4 = _mm_loadu_si128((__m128i *) (src + 128)); + m128iS5 = _mm_load_si128((__m128i *) (src + 160)); + m128iS6 = _mm_load_si128((__m128i *) (src + 192)); + m128iS7 = _mm_load_si128((__m128i *) (src + 224)); + m128iS8 = _mm_load_si128((__m128i *) (src + 8)); + m128iS9 = _mm_load_si128((__m128i *) (src + 32 + 8)); + m128iS10 = _mm_load_si128((__m128i *) (src + 64 + 8)); + m128iS11 = _mm_load_si128((__m128i *) (src + 96 + 8)); + m128iS12 = _mm_loadu_si128((__m128i *) (src + 128 + 8)); + m128iS13 = _mm_load_si128((__m128i *) (src + 160 + 8)); + m128iS14 = _mm_load_si128((__m128i *) (src + 192 + 8)); + m128iS15 = _mm_load_si128((__m128i *) (src + 224 + 8)); + shift = shift_2nd; + m128iAdd = _mm_set1_epi32(add_2nd); + } + + } else { + int k, m = 0; /* second pass: spill the 16 result registers to src, then add them to dst with scalar code */ + _mm_storeu_si128((__m128i *) (src), m128iS0); + _mm_storeu_si128((__m128i *) (src + 8), m128iS1); + _mm_storeu_si128((__m128i *) (src + 32), m128iS2); + _mm_storeu_si128((__m128i *) (src + 40), m128iS3); + _mm_storeu_si128((__m128i *) (src + 64), m128iS4); + _mm_storeu_si128((__m128i *) (src + 72), m128iS5); + _mm_storeu_si128((__m128i *) (src + 96), m128iS6); + _mm_storeu_si128((__m128i *) (src + 104), m128iS7); + _mm_storeu_si128((__m128i *) (src + 128), m128iS8);
+ _mm_storeu_si128((__m128i *) (src + 136), m128iS9); + _mm_storeu_si128((__m128i *) (src + 160), m128iS10); + _mm_storeu_si128((__m128i *) (src + 168), m128iS11); + _mm_storeu_si128((__m128i *) (src + 192), m128iS12); + _mm_storeu_si128((__m128i *) (src + 200), m128iS13); + _mm_storeu_si128((__m128i *) (src + 224), m128iS14); + _mm_storeu_si128((__m128i *) (src + 232), m128iS15); + dst = (uint16_t*) _dst + (i * stride); + + for (k = 0; k < 8; k++) { /* one destination row of 16 samples per iteration, each sum passed through av_clip_uintp2(..., 10) */ + dst[0] = av_clip_uintp2(dst[0] + src[m],10); + dst[1] = av_clip_uintp2(dst[1] + src[m + 8],10); + dst[2] = av_clip_uintp2(dst[2] + src[m + 32],10); + dst[3] = av_clip_uintp2(dst[3] + src[m + 40],10); + dst[4] = av_clip_uintp2(dst[4] + src[m + 64],10); + dst[5] = av_clip_uintp2(dst[5] + src[m + 72],10); + dst[6] = av_clip_uintp2(dst[6] + src[m + 96],10); + dst[7] = av_clip_uintp2(dst[7] + src[m + 104],10); + + dst[8] = av_clip_uintp2(dst[8] + src[m + 128],10); + dst[9] = av_clip_uintp2(dst[9] + src[m + 136],10); + dst[10] = av_clip_uintp2(dst[10] + src[m + 160],10); + dst[11] = av_clip_uintp2(dst[11] + src[m + 168],10); + dst[12] = av_clip_uintp2(dst[12] + src[m + 192],10); + dst[13] = av_clip_uintp2(dst[13] + src[m + 200],10); + dst[14] = av_clip_uintp2(dst[14] + src[m + 224],10); + dst[15] = av_clip_uintp2(dst[15] + src[m + 232],10); + m += 1; + dst += stride; + } + if (!i) { + m128iS0 = _mm_load_si128((__m128i *) (src + 16)); + m128iS1 = _mm_load_si128((__m128i *) (src + 48)); + m128iS2 = _mm_load_si128((__m128i *) (src + 80)); + m128iS3 = _mm_loadu_si128((__m128i *) (src + 112)); + m128iS4 = _mm_load_si128((__m128i *) (src + 144)); + m128iS5 = _mm_load_si128((__m128i *) (src + 176)); + m128iS6 = _mm_load_si128((__m128i *) (src + 208)); + m128iS7 = _mm_load_si128((__m128i *) (src + 240)); + m128iS8 = _mm_load_si128((__m128i *) (src + 24)); + m128iS9 = _mm_load_si128((__m128i *) (src + 56)); + m128iS10 = _mm_load_si128((__m128i *) (src + 88)); + m128iS11 = _mm_loadu_si128((__m128i *) (src + 120)); + m128iS12 =
_mm_load_si128((__m128i *) (src + 152)); + m128iS13 = _mm_load_si128((__m128i *) (src + 184)); + m128iS14 = _mm_load_si128((__m128i *) (src + 216)); + m128iS15 = _mm_load_si128((__m128i *) (src + 248)); + } + } + } + } + +} +#endif + + +#if HAVE_SSE4_1 +void ff_hevc_transform_32x32_add_8_sse4(uint8_t *_dst, const int16_t *coeffs, + ptrdiff_t _stride) { + uint8_t shift_2nd = 12; // 20 - Bit depth + uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1)) + int i, j; + uint8_t *dst = (uint8_t*) _dst; + ptrdiff_t stride = _stride / sizeof(uint8_t); + int shift; + const int16_t *src = coeffs; + + __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, + m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13, + m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, + m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h, + E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h, + O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l, + E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h; + __m128i E4l, E5l, E6l, E7l, E8l, E9l, E10l, E11l, E12l, E13l, E14l, E15l; + __m128i E4h, E5h, E6h, E7h, E8h, E9h, E10h, E11h, E12h, E13h, E14h, E15h, + EEE0l, EEE1l, EEE0h, EEE1h; + __m128i m128iS16, m128iS17, m128iS18, m128iS19, m128iS20, m128iS21, + m128iS22, m128iS23, m128iS24, m128iS25, m128iS26, m128iS27, + m128iS28, m128iS29, m128iS30, m128iS31, m128Tmp8, m128Tmp9, + m128Tmp10, m128Tmp11, m128Tmp12, m128Tmp13, m128Tmp14, m128Tmp15, + O8h, O9h, O10h, O11h, O12h, O13h, O14h, O15h, O8l, O9l, O10l, O11l, + O12l, O13l, O14l, O15l, E02l, E02h, E03l, E03h, EE7l, EE6l, EE5l, + EE4l, EE7h, EE6h, EE5h, EE4h; + + __m128i r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15,r16,r17,r18,r19,r20,r21,r22,r23,r24,r25,r26,r27,r28,r29,r30,r31; + __m128i r32,r33,r34,r35,r36,r37,r38,r39,r40,r41,r42,r43,r44,r45,r46,r47,r48,r49,r50,r51,r52,r53,r54,r55,r56,r57,r58,r59,r60,r61,r62,r63; + __m128i
r64,r65,r66,r67,r68,r69,r70,r71,r72,r73,r74,r75,r76,r77,r78,r79,r80,r81,r82,r83,r84,r85,r86,r87,r88,r89,r90,r91,r92,r93,r94,r95; + __m128i r96,r97,r98,r99,r100,r101,r102,r103,r104,r105,r106,r107,r108,r109,r110,r111,r112,r113,r114,r115,r116,r117,r118,r119,r120,r121,r122,r123,r124,r125,r126,r127; + + + m128iS0 = _mm_load_si128((__m128i *) (src)); + m128iS1 = _mm_load_si128((__m128i *) (src + 32)); + m128iS2 = _mm_load_si128((__m128i *) (src + 64)); + m128iS3 = _mm_load_si128((__m128i *) (src + 96)); + m128iS4 = _mm_loadu_si128((__m128i *) (src + 128)); + m128iS5 = _mm_load_si128((__m128i *) (src + 160)); + m128iS6 = _mm_load_si128((__m128i *) (src + 192)); + m128iS7 = _mm_load_si128((__m128i *) (src + 224)); + m128iS8 = _mm_load_si128((__m128i *) (src + 256)); + m128iS9 = _mm_load_si128((__m128i *) (src + 288)); + m128iS10 = _mm_load_si128((__m128i *) (src + 320)); + m128iS11 = _mm_load_si128((__m128i *) (src + 352)); + m128iS12 = _mm_load_si128((__m128i *) (src + 384)); + m128iS13 = _mm_load_si128((__m128i *) (src + 416)); + m128iS14 = _mm_load_si128((__m128i *) (src + 448)); + m128iS15 = _mm_load_si128((__m128i *) (src + 480)); + m128iS16 = _mm_load_si128((__m128i *) (src + 512)); + m128iS17 = _mm_load_si128((__m128i *) (src + 544)); + m128iS18 = _mm_load_si128((__m128i *) (src + 576)); + m128iS19 = _mm_load_si128((__m128i *) (src + 608)); + m128iS20 = _mm_load_si128((__m128i *) (src + 640)); + m128iS21 = _mm_load_si128((__m128i *) (src + 672)); + m128iS22 = _mm_load_si128((__m128i *) (src + 704)); + m128iS23 = _mm_load_si128((__m128i *) (src + 736)); + m128iS24 = _mm_load_si128((__m128i *) (src + 768)); + m128iS25 = _mm_load_si128((__m128i *) (src + 800)); + m128iS26 = _mm_load_si128((__m128i *) (src + 832)); + m128iS27 = _mm_load_si128((__m128i *) (src + 864)); + m128iS28 = _mm_load_si128((__m128i *) (src + 896)); + m128iS29 = _mm_load_si128((__m128i *) (src + 928)); + m128iS30 = _mm_load_si128((__m128i *) (src + 960)); + m128iS31 = _mm_load_si128((__m128i *) 
(src + 992)); + + shift = shift_1st; + m128iAdd = _mm_set1_epi32(add_1st); + + for (j = 0; j < 2; j++) { + for (i = 0; i < 32; i += 8) { + m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3); + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][0]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][0]))); + + m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][0]))); + m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][0]))); + + m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][0]))); + m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][0]))); + + m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][0]))); + m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][0]))); + + m128Tmp8 = _mm_unpacklo_epi16(m128iS17, m128iS19); + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][0]))); + m128Tmp9 = _mm_unpackhi_epi16(m128iS17, m128iS19); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][0]))); + + m128Tmp10 = _mm_unpacklo_epi16(m128iS21, m128iS23); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][0]))); + m128Tmp11 = _mm_unpackhi_epi16(m128iS21, m128iS23); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][0]))); + + m128Tmp12 = _mm_unpacklo_epi16(m128iS25, m128iS27); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][0]))); + m128Tmp13 = 
_mm_unpackhi_epi16(m128iS25, m128iS27); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][0]))); + + m128Tmp14 = _mm_unpacklo_epi16(m128iS29, m128iS31); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][0]))); + m128Tmp15 = _mm_unpackhi_epi16(m128iS29, m128iS31); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][0]))); + + O0l = _mm_add_epi32(E0l, E1l); + O0l = _mm_add_epi32(O0l, E2l); + O0l = _mm_add_epi32(O0l, E3l); + O0l = _mm_add_epi32(O0l, E4l); + O0l = _mm_add_epi32(O0l, E5l); + O0l = _mm_add_epi32(O0l, E6l); + O0l = _mm_add_epi32(O0l, E7l); + + O0h = _mm_add_epi32(E0h, E1h); + O0h = _mm_add_epi32(O0h, E2h); + O0h = _mm_add_epi32(O0h, E3h); + O0h = _mm_add_epi32(O0h, E4h); + O0h = _mm_add_epi32(O0h, E5h); + O0h = _mm_add_epi32(O0h, E6h); + O0h = _mm_add_epi32(O0h, E7h); + + /* Compute O1*/ + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][1]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][1]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][1]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][1]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][1]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][1]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][1]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][1]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][1]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][1]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][1]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][1]))); + E6l = _mm_madd_epi16(m128Tmp12, + 
_mm_load_si128((__m128i *) (transform32x32[6][1]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][1]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][1]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][1]))); + + O1l = _mm_add_epi32(E0l, E1l); + O1l = _mm_add_epi32(O1l, E2l); + O1l = _mm_add_epi32(O1l, E3l); + O1l = _mm_add_epi32(O1l, E4l); + O1l = _mm_add_epi32(O1l, E5l); + O1l = _mm_add_epi32(O1l, E6l); + O1l = _mm_add_epi32(O1l, E7l); + + O1h = _mm_add_epi32(E0h, E1h); + O1h = _mm_add_epi32(O1h, E2h); + O1h = _mm_add_epi32(O1h, E3h); + O1h = _mm_add_epi32(O1h, E4h); + O1h = _mm_add_epi32(O1h, E5h); + O1h = _mm_add_epi32(O1h, E6h); + O1h = _mm_add_epi32(O1h, E7h); + /* Compute O2*/ + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][2]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][2]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][2]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][2]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][2]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][2]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][2]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][2]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][2]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][2]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][2]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][2]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][2]))); + E6h = _mm_madd_epi16(m128Tmp13, + 
_mm_load_si128((__m128i *) (transform32x32[6][2]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][2]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][2]))); + + O2l = _mm_add_epi32(E0l, E1l); + O2l = _mm_add_epi32(O2l, E2l); + O2l = _mm_add_epi32(O2l, E3l); + O2l = _mm_add_epi32(O2l, E4l); + O2l = _mm_add_epi32(O2l, E5l); + O2l = _mm_add_epi32(O2l, E6l); + O2l = _mm_add_epi32(O2l, E7l); + + O2h = _mm_add_epi32(E0h, E1h); + O2h = _mm_add_epi32(O2h, E2h); + O2h = _mm_add_epi32(O2h, E3h); + O2h = _mm_add_epi32(O2h, E4h); + O2h = _mm_add_epi32(O2h, E5h); + O2h = _mm_add_epi32(O2h, E6h); + O2h = _mm_add_epi32(O2h, E7h); + /* Compute O3*/ + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][3]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][3]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][3]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][3]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][3]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][3]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][3]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][3]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][3]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][3]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][3]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][3]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][3]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][3]))); + E7l = _mm_madd_epi16(m128Tmp14, + 
_mm_load_si128((__m128i *) (transform32x32[7][3]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][3]))); + + O3l = _mm_add_epi32(E0l, E1l); + O3l = _mm_add_epi32(O3l, E2l); + O3l = _mm_add_epi32(O3l, E3l); + O3l = _mm_add_epi32(O3l, E4l); + O3l = _mm_add_epi32(O3l, E5l); + O3l = _mm_add_epi32(O3l, E6l); + O3l = _mm_add_epi32(O3l, E7l); + + O3h = _mm_add_epi32(E0h, E1h); + O3h = _mm_add_epi32(O3h, E2h); + O3h = _mm_add_epi32(O3h, E3h); + O3h = _mm_add_epi32(O3h, E4h); + O3h = _mm_add_epi32(O3h, E5h); + O3h = _mm_add_epi32(O3h, E6h); + O3h = _mm_add_epi32(O3h, E7h); + /* Compute O4*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][4]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][4]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][4]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][4]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][4]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][4]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][4]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][4]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][4]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][4]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][4]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][4]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][4]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][4]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][4]))); + E7h = _mm_madd_epi16(m128Tmp15, + 
_mm_load_si128((__m128i *) (transform32x32[7][4]))); + + O4l = _mm_add_epi32(E0l, E1l); + O4l = _mm_add_epi32(O4l, E2l); + O4l = _mm_add_epi32(O4l, E3l); + O4l = _mm_add_epi32(O4l, E4l); + O4l = _mm_add_epi32(O4l, E5l); + O4l = _mm_add_epi32(O4l, E6l); + O4l = _mm_add_epi32(O4l, E7l); + + O4h = _mm_add_epi32(E0h, E1h); + O4h = _mm_add_epi32(O4h, E2h); + O4h = _mm_add_epi32(O4h, E3h); + O4h = _mm_add_epi32(O4h, E4h); + O4h = _mm_add_epi32(O4h, E5h); + O4h = _mm_add_epi32(O4h, E6h); + O4h = _mm_add_epi32(O4h, E7h); + + /* Compute O5*/ + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][5]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][5]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][5]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][5]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][5]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][5]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][5]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][5]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][5]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][5]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][5]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][5]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][5]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][5]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][5]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][5]))); + + O5l = _mm_add_epi32(E0l, E1l); + O5l = 
_mm_add_epi32(O5l, E2l); + O5l = _mm_add_epi32(O5l, E3l); + O5l = _mm_add_epi32(O5l, E4l); + O5l = _mm_add_epi32(O5l, E5l); + O5l = _mm_add_epi32(O5l, E6l); + O5l = _mm_add_epi32(O5l, E7l); + + O5h = _mm_add_epi32(E0h, E1h); + O5h = _mm_add_epi32(O5h, E2h); + O5h = _mm_add_epi32(O5h, E3h); + O5h = _mm_add_epi32(O5h, E4h); + O5h = _mm_add_epi32(O5h, E5h); + O5h = _mm_add_epi32(O5h, E6h); + O5h = _mm_add_epi32(O5h, E7h); + + /* Compute O6*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][6]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][6]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][6]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][6]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][6]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][6]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][6]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][6]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][6]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][6]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][6]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][6]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][6]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][6]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][6]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][6]))); + + O6l = _mm_add_epi32(E0l, E1l); + O6l = _mm_add_epi32(O6l, E2l); + O6l = _mm_add_epi32(O6l, E3l); + O6l = _mm_add_epi32(O6l, E4l); + 
O6l = _mm_add_epi32(O6l, E5l); + O6l = _mm_add_epi32(O6l, E6l); + O6l = _mm_add_epi32(O6l, E7l); + + O6h = _mm_add_epi32(E0h, E1h); + O6h = _mm_add_epi32(O6h, E2h); + O6h = _mm_add_epi32(O6h, E3h); + O6h = _mm_add_epi32(O6h, E4h); + O6h = _mm_add_epi32(O6h, E5h); + O6h = _mm_add_epi32(O6h, E6h); + O6h = _mm_add_epi32(O6h, E7h); + + /* Compute O7*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][7]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][7]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][7]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][7]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][7]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][7]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][7]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][7]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][7]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][7]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][7]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][7]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][7]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][7]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][7]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][7]))); + + O7l = _mm_add_epi32(E0l, E1l); + O7l = _mm_add_epi32(O7l, E2l); + O7l = _mm_add_epi32(O7l, E3l); + O7l = _mm_add_epi32(O7l, E4l); + O7l = _mm_add_epi32(O7l, E5l); + O7l = _mm_add_epi32(O7l, E6l); + O7l = _mm_add_epi32(O7l, 
E7l); + + O7h = _mm_add_epi32(E0h, E1h); + O7h = _mm_add_epi32(O7h, E2h); + O7h = _mm_add_epi32(O7h, E3h); + O7h = _mm_add_epi32(O7h, E4h); + O7h = _mm_add_epi32(O7h, E5h); + O7h = _mm_add_epi32(O7h, E6h); + O7h = _mm_add_epi32(O7h, E7h); + + /* Compute O8*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][8]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][8]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][8]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][8]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][8]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][8]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][8]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][8]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][8]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][8]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][8]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][8]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][8]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][8]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][8]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][8]))); + + O8l = _mm_add_epi32(E0l, E1l); + O8l = _mm_add_epi32(O8l, E2l); + O8l = _mm_add_epi32(O8l, E3l); + O8l = _mm_add_epi32(O8l, E4l); + O8l = _mm_add_epi32(O8l, E5l); + O8l = _mm_add_epi32(O8l, E6l); + O8l = _mm_add_epi32(O8l, E7l); + + O8h = _mm_add_epi32(E0h, E1h); + O8h = _mm_add_epi32(O8h, E2h); + O8h = 
_mm_add_epi32(O8h, E3h); + O8h = _mm_add_epi32(O8h, E4h); + O8h = _mm_add_epi32(O8h, E5h); + O8h = _mm_add_epi32(O8h, E6h); + O8h = _mm_add_epi32(O8h, E7h); + + /* Compute O9*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][9]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][9]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][9]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][9]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][9]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][9]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][9]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][9]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][9]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][9]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][9]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][9]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][9]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][9]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][9]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][9]))); + + O9l = _mm_add_epi32(E0l, E1l); + O9l = _mm_add_epi32(O9l, E2l); + O9l = _mm_add_epi32(O9l, E3l); + O9l = _mm_add_epi32(O9l, E4l); + O9l = _mm_add_epi32(O9l, E5l); + O9l = _mm_add_epi32(O9l, E6l); + O9l = _mm_add_epi32(O9l, E7l); + + O9h = _mm_add_epi32(E0h, E1h); + O9h = _mm_add_epi32(O9h, E2h); + O9h = _mm_add_epi32(O9h, E3h); + O9h = _mm_add_epi32(O9h, E4h); + O9h = _mm_add_epi32(O9h, E5h); + 
O9h = _mm_add_epi32(O9h, E6h); + O9h = _mm_add_epi32(O9h, E7h); + + /* Compute 10*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][10]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][10]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][10]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][10]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][10]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][10]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][10]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][10]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][10]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][10]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][10]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][10]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][10]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][10]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][10]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][10]))); + + O10l = _mm_add_epi32(E0l, E1l); + O10l = _mm_add_epi32(O10l, E2l); + O10l = _mm_add_epi32(O10l, E3l); + O10l = _mm_add_epi32(O10l, E4l); + O10l = _mm_add_epi32(O10l, E5l); + O10l = _mm_add_epi32(O10l, E6l); + O10l = _mm_add_epi32(O10l, E7l); + + O10h = _mm_add_epi32(E0h, E1h); + O10h = _mm_add_epi32(O10h, E2h); + O10h = _mm_add_epi32(O10h, E3h); + O10h = _mm_add_epi32(O10h, E4h); + O10h = _mm_add_epi32(O10h, E5h); + O10h = _mm_add_epi32(O10h, E6h); + O10h = 
_mm_add_epi32(O10h, E7h); + + /* Compute 11*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][11]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][11]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][11]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][11]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][11]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][11]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][11]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][11]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][11]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][11]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][11]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][11]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][11]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][11]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][11]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][11]))); + + O11l = _mm_add_epi32(E0l, E1l); + O11l = _mm_add_epi32(O11l, E2l); + O11l = _mm_add_epi32(O11l, E3l); + O11l = _mm_add_epi32(O11l, E4l); + O11l = _mm_add_epi32(O11l, E5l); + O11l = _mm_add_epi32(O11l, E6l); + O11l = _mm_add_epi32(O11l, E7l); + + O11h = _mm_add_epi32(E0h, E1h); + O11h = _mm_add_epi32(O11h, E2h); + O11h = _mm_add_epi32(O11h, E3h); + O11h = _mm_add_epi32(O11h, E4h); + O11h = _mm_add_epi32(O11h, E5h); + O11h = _mm_add_epi32(O11h, E6h); + O11h = _mm_add_epi32(O11h, E7h); + + /* Compute 12*/ + + E0l 
= _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][12]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][12]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][12]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][12]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][12]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][12]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][12]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][12]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][12]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][12]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][12]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][12]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][12]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][12]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][12]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][12]))); + + O12l = _mm_add_epi32(E0l, E1l); + O12l = _mm_add_epi32(O12l, E2l); + O12l = _mm_add_epi32(O12l, E3l); + O12l = _mm_add_epi32(O12l, E4l); + O12l = _mm_add_epi32(O12l, E5l); + O12l = _mm_add_epi32(O12l, E6l); + O12l = _mm_add_epi32(O12l, E7l); + + O12h = _mm_add_epi32(E0h, E1h); + O12h = _mm_add_epi32(O12h, E2h); + O12h = _mm_add_epi32(O12h, E3h); + O12h = _mm_add_epi32(O12h, E4h); + O12h = _mm_add_epi32(O12h, E5h); + O12h = _mm_add_epi32(O12h, E6h); + O12h = _mm_add_epi32(O12h, E7h); + + /* Compute 13*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i 
*) (transform32x32[0][13]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][13]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][13]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][13]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][13]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][13]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][13]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][13]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][13]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][13]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][13]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][13]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][13]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][13]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][13]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][13]))); + + O13l = _mm_add_epi32(E0l, E1l); + O13l = _mm_add_epi32(O13l, E2l); + O13l = _mm_add_epi32(O13l, E3l); + O13l = _mm_add_epi32(O13l, E4l); + O13l = _mm_add_epi32(O13l, E5l); + O13l = _mm_add_epi32(O13l, E6l); + O13l = _mm_add_epi32(O13l, E7l); + + O13h = _mm_add_epi32(E0h, E1h); + O13h = _mm_add_epi32(O13h, E2h); + O13h = _mm_add_epi32(O13h, E3h); + O13h = _mm_add_epi32(O13h, E4h); + O13h = _mm_add_epi32(O13h, E5h); + O13h = _mm_add_epi32(O13h, E6h); + O13h = _mm_add_epi32(O13h, E7h); + + /* Compute O14 */ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][14]))); + E0h = 
_mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][14]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][14]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][14]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][14]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][14]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][14]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][14]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][14]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][14]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][14]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][14]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][14]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][14]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][14]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][14]))); + + O14l = _mm_add_epi32(E0l, E1l); + O14l = _mm_add_epi32(O14l, E2l); + O14l = _mm_add_epi32(O14l, E3l); + O14l = _mm_add_epi32(O14l, E4l); + O14l = _mm_add_epi32(O14l, E5l); + O14l = _mm_add_epi32(O14l, E6l); + O14l = _mm_add_epi32(O14l, E7l); + + O14h = _mm_add_epi32(E0h, E1h); + O14h = _mm_add_epi32(O14h, E2h); + O14h = _mm_add_epi32(O14h, E3h); + O14h = _mm_add_epi32(O14h, E4h); + O14h = _mm_add_epi32(O14h, E5h); + O14h = _mm_add_epi32(O14h, E6h); + O14h = _mm_add_epi32(O14h, E7h); + + /* Compute O15*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][15]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i 
*) (transform32x32[0][15]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][15]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][15]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][15]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][15]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][15]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][15]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][15]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][15]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][15]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][15]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][15]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][15]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][15]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][15]))); + + O15l = _mm_add_epi32(E0l, E1l); + O15l = _mm_add_epi32(O15l, E2l); + O15l = _mm_add_epi32(O15l, E3l); + O15l = _mm_add_epi32(O15l, E4l); + O15l = _mm_add_epi32(O15l, E5l); + O15l = _mm_add_epi32(O15l, E6l); + O15l = _mm_add_epi32(O15l, E7l); + + O15h = _mm_add_epi32(E0h, E1h); + O15h = _mm_add_epi32(O15h, E2h); + O15h = _mm_add_epi32(O15h, E3h); + O15h = _mm_add_epi32(O15h, E4h); + O15h = _mm_add_epi32(O15h, E5h); + O15h = _mm_add_epi32(O15h, E6h); + O15h = _mm_add_epi32(O15h, E7h); + /* Compute E0 */ + + m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6); + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][0]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6); + E0h = 
_mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][0]))); + + m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14); + E0l = _mm_add_epi32(E0l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][0])))); + m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14); + E0h = _mm_add_epi32(E0h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][0])))); + + m128Tmp4 = _mm_unpacklo_epi16(m128iS18, m128iS22); + E0l = _mm_add_epi32(E0l, + _mm_madd_epi16(m128Tmp4, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][0])))); + m128Tmp5 = _mm_unpackhi_epi16(m128iS18, m128iS22); + E0h = _mm_add_epi32(E0h, + _mm_madd_epi16(m128Tmp5, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][0])))); + + m128Tmp6 = _mm_unpacklo_epi16(m128iS26, m128iS30); + E0l = _mm_add_epi32(E0l, + _mm_madd_epi16(m128Tmp6, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][0])))); + m128Tmp7 = _mm_unpackhi_epi16(m128iS26, m128iS30); + E0h = _mm_add_epi32(E0h, + _mm_madd_epi16(m128Tmp7, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][0])))); + + /* Compute E1 */ + E1l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][1]))); + E1h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][1]))); + E1l = _mm_add_epi32(E1l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][1])))); + E1h = _mm_add_epi32(E1h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][1])))); + E1l = _mm_add_epi32(E1l, + _mm_madd_epi16(m128Tmp4, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][1])))); + E1h = _mm_add_epi32(E1h, + _mm_madd_epi16(m128Tmp5, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][1])))); + E1l = _mm_add_epi32(E1l, + _mm_madd_epi16(m128Tmp6, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][1])))); + E1h = _mm_add_epi32(E1h, + _mm_madd_epi16(m128Tmp7, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][1])))); + + /* 
Compute E2 */ + E2l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][2]))); + E2h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][2]))); + E2l = _mm_add_epi32(E2l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][2])))); + E2h = _mm_add_epi32(E2h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][2])))); + E2l = _mm_add_epi32(E2l, + _mm_madd_epi16(m128Tmp4, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][2])))); + E2h = _mm_add_epi32(E2h, + _mm_madd_epi16(m128Tmp5, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][2])))); + E2l = _mm_add_epi32(E2l, + _mm_madd_epi16(m128Tmp6, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][2])))); + E2h = _mm_add_epi32(E2h, + _mm_madd_epi16(m128Tmp7, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][2])))); + + /* Compute E3 */ + E3l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][3]))); + E3h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][3]))); + E3l = _mm_add_epi32(E3l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][3])))); + E3h = _mm_add_epi32(E3h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][3])))); + E3l = _mm_add_epi32(E3l, + _mm_madd_epi16(m128Tmp4, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][3])))); + E3h = _mm_add_epi32(E3h, + _mm_madd_epi16(m128Tmp5, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][3])))); + E3l = _mm_add_epi32(E3l, + _mm_madd_epi16(m128Tmp6, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][3])))); + E3h = _mm_add_epi32(E3h, + _mm_madd_epi16(m128Tmp7, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][3])))); + + /* Compute E4 */ + E4l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][4]))); + E4h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][4]))); + E4l = 
_mm_add_epi32(E4l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][4])))); + E4h = _mm_add_epi32(E4h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][4])))); + E4l = _mm_add_epi32(E4l, + _mm_madd_epi16(m128Tmp4, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][4])))); + E4h = _mm_add_epi32(E4h, + _mm_madd_epi16(m128Tmp5, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][4])))); + E4l = _mm_add_epi32(E4l, + _mm_madd_epi16(m128Tmp6, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][4])))); + E4h = _mm_add_epi32(E4h, + _mm_madd_epi16(m128Tmp7, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][4])))); + + /* Compute E3 */ + E5l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][5]))); + E5h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][5]))); + E5l = _mm_add_epi32(E5l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][5])))); + E5h = _mm_add_epi32(E5h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][5])))); + E5l = _mm_add_epi32(E5l, + _mm_madd_epi16(m128Tmp4, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][5])))); + E5h = _mm_add_epi32(E5h, + _mm_madd_epi16(m128Tmp5, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][5])))); + E5l = _mm_add_epi32(E5l, + _mm_madd_epi16(m128Tmp6, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][5])))); + E5h = _mm_add_epi32(E5h, + _mm_madd_epi16(m128Tmp7, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][5])))); + + /* Compute E6 */ + E6l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][6]))); + E6h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][6]))); + E6l = _mm_add_epi32(E6l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][6])))); + E6h = _mm_add_epi32(E6h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) 
(transform16x16_1[1][6])))); + E6l = _mm_add_epi32(E6l, + _mm_madd_epi16(m128Tmp4, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][6])))); + E6h = _mm_add_epi32(E6h, + _mm_madd_epi16(m128Tmp5, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][6])))); + E6l = _mm_add_epi32(E6l, + _mm_madd_epi16(m128Tmp6, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][6])))); + E6h = _mm_add_epi32(E6h, + _mm_madd_epi16(m128Tmp7, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][6])))); + + /* Compute E7 */ + E7l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][7]))); + E7h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][7]))); + E7l = _mm_add_epi32(E7l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][7])))); + E7h = _mm_add_epi32(E7h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][7])))); + E7l = _mm_add_epi32(E7l, + _mm_madd_epi16(m128Tmp4, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][7])))); + E7h = _mm_add_epi32(E7h, + _mm_madd_epi16(m128Tmp5, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][7])))); + E7l = _mm_add_epi32(E7l, + _mm_madd_epi16(m128Tmp6, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][7])))); + E7h = _mm_add_epi32(E7h, + _mm_madd_epi16(m128Tmp7, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][7])))); + + /* Compute EE0 and EEE */ + + m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12); + E00l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_2[0][0]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12); + E00h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_2[0][0]))); + + m128Tmp2 = _mm_unpacklo_epi16(m128iS20, m128iS28); + E00l = _mm_add_epi32(E00l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][0])))); + m128Tmp3 = _mm_unpackhi_epi16(m128iS20, m128iS28); + E00h = _mm_add_epi32(E00h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + 
(__m128i *) (transform16x16_2[1][0])))); + + E01l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_2[0][1]))); + E01h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_2[0][1]))); + E01l = _mm_add_epi32(E01l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][1])))); + E01h = _mm_add_epi32(E01h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][1])))); + + E02l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_2[0][2]))); + E02h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_2[0][2]))); + E02l = _mm_add_epi32(E02l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][2])))); + E02h = _mm_add_epi32(E02h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][2])))); + + E03l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_2[0][3]))); + E03h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_2[0][3]))); + E03l = _mm_add_epi32(E03l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][3])))); + E03h = _mm_add_epi32(E03h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][3])))); + + /* Compute EE0 and EEE */ + + m128Tmp0 = _mm_unpacklo_epi16(m128iS8, m128iS24); + EE0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_3[0][0]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS8, m128iS24); + EE0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_3[0][0]))); + + m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS16); + EEE0l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform16x16_3[1][0]))); + m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS16); + EEE0h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform16x16_3[1][0]))); + + EE1l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_3[0][1]))); + 
EE1h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_3[0][1]))); + + EEE1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform16x16_3[1][1]))); + EEE1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform16x16_3[1][1]))); + + /* Compute EE */ + + EE2l = _mm_sub_epi32(EEE1l, EE1l); + EE3l = _mm_sub_epi32(EEE0l, EE0l); + EE2h = _mm_sub_epi32(EEE1h, EE1h); + EE3h = _mm_sub_epi32(EEE0h, EE0h); + + EE0l = _mm_add_epi32(EEE0l, EE0l); + EE1l = _mm_add_epi32(EEE1l, EE1l); + EE0h = _mm_add_epi32(EEE0h, EE0h); + EE1h = _mm_add_epi32(EEE1h, EE1h); + /**/ + + EE7l = _mm_sub_epi32(EE0l, E00l); + EE6l = _mm_sub_epi32(EE1l, E01l); + EE5l = _mm_sub_epi32(EE2l, E02l); + EE4l = _mm_sub_epi32(EE3l, E03l); + + EE7h = _mm_sub_epi32(EE0h, E00h); + EE6h = _mm_sub_epi32(EE1h, E01h); + EE5h = _mm_sub_epi32(EE2h, E02h); + EE4h = _mm_sub_epi32(EE3h, E03h); + + EE0l = _mm_add_epi32(EE0l, E00l); + EE1l = _mm_add_epi32(EE1l, E01l); + EE2l = _mm_add_epi32(EE2l, E02l); + EE3l = _mm_add_epi32(EE3l, E03l); + + EE0h = _mm_add_epi32(EE0h, E00h); + EE1h = _mm_add_epi32(EE1h, E01h); + EE2h = _mm_add_epi32(EE2h, E02h); + EE3h = _mm_add_epi32(EE3h, E03h); + /* Compute E */ + + E15l = _mm_sub_epi32(EE0l, E0l); + E15l = _mm_add_epi32(E15l, m128iAdd); + E14l = _mm_sub_epi32(EE1l, E1l); + E14l = _mm_add_epi32(E14l, m128iAdd); + E13l = _mm_sub_epi32(EE2l, E2l); + E13l = _mm_add_epi32(E13l, m128iAdd); + E12l = _mm_sub_epi32(EE3l, E3l); + E12l = _mm_add_epi32(E12l, m128iAdd); + E11l = _mm_sub_epi32(EE4l, E4l); + E11l = _mm_add_epi32(E11l, m128iAdd); + E10l = _mm_sub_epi32(EE5l, E5l); + E10l = _mm_add_epi32(E10l, m128iAdd); + E9l = _mm_sub_epi32(EE6l, E6l); + E9l = _mm_add_epi32(E9l, m128iAdd); + E8l = _mm_sub_epi32(EE7l, E7l); + E8l = _mm_add_epi32(E8l, m128iAdd); + + E0l = _mm_add_epi32(EE0l, E0l); + E0l = _mm_add_epi32(E0l, m128iAdd); + E1l = _mm_add_epi32(EE1l, E1l); + E1l = _mm_add_epi32(E1l, m128iAdd); + E2l = _mm_add_epi32(EE2l, E2l); + E2l = 
_mm_add_epi32(E2l, m128iAdd); + E3l = _mm_add_epi32(EE3l, E3l); + E3l = _mm_add_epi32(E3l, m128iAdd); + E4l = _mm_add_epi32(EE4l, E4l); + E4l = _mm_add_epi32(E4l, m128iAdd); + E5l = _mm_add_epi32(EE5l, E5l); + E5l = _mm_add_epi32(E5l, m128iAdd); + E6l = _mm_add_epi32(EE6l, E6l); + E6l = _mm_add_epi32(E6l, m128iAdd); + E7l = _mm_add_epi32(EE7l, E7l); + E7l = _mm_add_epi32(E7l, m128iAdd); + + E15h = _mm_sub_epi32(EE0h, E0h); + E15h = _mm_add_epi32(E15h, m128iAdd); + E14h = _mm_sub_epi32(EE1h, E1h); + E14h = _mm_add_epi32(E14h, m128iAdd); + E13h = _mm_sub_epi32(EE2h, E2h); + E13h = _mm_add_epi32(E13h, m128iAdd); + E12h = _mm_sub_epi32(EE3h, E3h); + E12h = _mm_add_epi32(E12h, m128iAdd); + E11h = _mm_sub_epi32(EE4h, E4h); + E11h = _mm_add_epi32(E11h, m128iAdd); + E10h = _mm_sub_epi32(EE5h, E5h); + E10h = _mm_add_epi32(E10h, m128iAdd); + E9h = _mm_sub_epi32(EE6h, E6h); + E9h = _mm_add_epi32(E9h, m128iAdd); + E8h = _mm_sub_epi32(EE7h, E7h); + E8h = _mm_add_epi32(E8h, m128iAdd); + + E0h = _mm_add_epi32(EE0h, E0h); + E0h = _mm_add_epi32(E0h, m128iAdd); + E1h = _mm_add_epi32(EE1h, E1h); + E1h = _mm_add_epi32(E1h, m128iAdd); + E2h = _mm_add_epi32(EE2h, E2h); + E2h = _mm_add_epi32(E2h, m128iAdd); + E3h = _mm_add_epi32(EE3h, E3h); + E3h = _mm_add_epi32(E3h, m128iAdd); + E4h = _mm_add_epi32(EE4h, E4h); + E4h = _mm_add_epi32(E4h, m128iAdd); + E5h = _mm_add_epi32(EE5h, E5h); + E5h = _mm_add_epi32(E5h, m128iAdd); + E6h = _mm_add_epi32(EE6h, E6h); + E6h = _mm_add_epi32(E6h, m128iAdd); + E7h = _mm_add_epi32(EE7h, E7h); + E7h = _mm_add_epi32(E7h, m128iAdd); + + m128iS0 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift), + _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift)); + m128iS1 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift), + _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift)); + m128iS2 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift), + _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift)); + m128iS3 = _mm_packs_epi32( + 
_mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift), + _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift)); + m128iS4 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift), + _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift)); + m128iS5 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift), + _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift)); + m128iS6 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift), + _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift)); + m128iS7 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift), + _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift)); + m128iS8 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E8l, O8l), shift), + _mm_srai_epi32(_mm_add_epi32(E8h, O8h), shift)); + m128iS9 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E9l, O9l), shift), + _mm_srai_epi32(_mm_add_epi32(E9h, O9h), shift)); + m128iS10 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E10l, O10l), shift), + _mm_srai_epi32(_mm_add_epi32(E10h, O10h), shift)); + m128iS11 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E11l, O11l), shift), + _mm_srai_epi32(_mm_add_epi32(E11h, O11h), shift)); + m128iS12 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E12l, O12l), shift), + _mm_srai_epi32(_mm_add_epi32(E12h, O12h), shift)); + m128iS13 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E13l, O13l), shift), + _mm_srai_epi32(_mm_add_epi32(E13h, O13h), shift)); + m128iS14 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E14l, O14l), shift), + _mm_srai_epi32(_mm_add_epi32(E14h, O14h), shift)); + m128iS15 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E15l, O15l), shift), + _mm_srai_epi32(_mm_add_epi32(E15h, O15h), shift)); + + m128iS31 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift), + _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift)); + m128iS30 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift), + _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift)); + m128iS29 = _mm_packs_epi32( + 
_mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift), + _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift)); + m128iS28 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift), + _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift)); + m128iS27 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift), + _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift)); + m128iS26 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift), + _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift)); + m128iS25 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift), + _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift)); + m128iS24 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift), + _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift)); + m128iS23 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E8l, O8l), shift), + _mm_srai_epi32(_mm_sub_epi32(E8h, O8h), shift)); + m128iS22 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E9l, O9l), shift), + _mm_srai_epi32(_mm_sub_epi32(E9h, O9h), shift)); + m128iS21 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E10l, O10l), shift), + _mm_srai_epi32(_mm_sub_epi32(E10h, O10h), shift)); + m128iS20 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E11l, O11l), shift), + _mm_srai_epi32(_mm_sub_epi32(E11h, O11h), shift)); + m128iS19 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E12l, O12l), shift), + _mm_srai_epi32(_mm_sub_epi32(E12h, O12h), shift)); + m128iS18 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E13l, O13l), shift), + _mm_srai_epi32(_mm_sub_epi32(E13h, O13h), shift)); + m128iS17 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E14l, O14l), shift), + _mm_srai_epi32(_mm_sub_epi32(E14h, O14h), shift)); + m128iS16 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E15l, O15l), shift), + _mm_srai_epi32(_mm_sub_epi32(E15h, O15h), shift)); + + if (!j) { + /* Inverse the matrix */ + E0l = _mm_unpacklo_epi16(m128iS0, m128iS16); + E1l = _mm_unpacklo_epi16(m128iS1, m128iS17); + E2l = _mm_unpacklo_epi16(m128iS2, 
m128iS18); + E3l = _mm_unpacklo_epi16(m128iS3, m128iS19); + E4l = _mm_unpacklo_epi16(m128iS4, m128iS20); + E5l = _mm_unpacklo_epi16(m128iS5, m128iS21); + E6l = _mm_unpacklo_epi16(m128iS6, m128iS22); + E7l = _mm_unpacklo_epi16(m128iS7, m128iS23); + E8l = _mm_unpacklo_epi16(m128iS8, m128iS24); + E9l = _mm_unpacklo_epi16(m128iS9, m128iS25); + E10l = _mm_unpacklo_epi16(m128iS10, m128iS26); + E11l = _mm_unpacklo_epi16(m128iS11, m128iS27); + E12l = _mm_unpacklo_epi16(m128iS12, m128iS28); + E13l = _mm_unpacklo_epi16(m128iS13, m128iS29); + E14l = _mm_unpacklo_epi16(m128iS14, m128iS30); + E15l = _mm_unpacklo_epi16(m128iS15, m128iS31); + + O0l = _mm_unpackhi_epi16(m128iS0, m128iS16); + O1l = _mm_unpackhi_epi16(m128iS1, m128iS17); + O2l = _mm_unpackhi_epi16(m128iS2, m128iS18); + O3l = _mm_unpackhi_epi16(m128iS3, m128iS19); + O4l = _mm_unpackhi_epi16(m128iS4, m128iS20); + O5l = _mm_unpackhi_epi16(m128iS5, m128iS21); + O6l = _mm_unpackhi_epi16(m128iS6, m128iS22); + O7l = _mm_unpackhi_epi16(m128iS7, m128iS23); + O8l = _mm_unpackhi_epi16(m128iS8, m128iS24); + O9l = _mm_unpackhi_epi16(m128iS9, m128iS25); + O10l = _mm_unpackhi_epi16(m128iS10, m128iS26); + O11l = _mm_unpackhi_epi16(m128iS11, m128iS27); + O12l = _mm_unpackhi_epi16(m128iS12, m128iS28); + O13l = _mm_unpackhi_epi16(m128iS13, m128iS29); + O14l = _mm_unpackhi_epi16(m128iS14, m128iS30); + O15l = _mm_unpackhi_epi16(m128iS15, m128iS31); + + E0h = _mm_unpacklo_epi16(E0l, E8l); + E1h = _mm_unpacklo_epi16(E1l, E9l); + E2h = _mm_unpacklo_epi16(E2l, E10l); + E3h = _mm_unpacklo_epi16(E3l, E11l); + E4h = _mm_unpacklo_epi16(E4l, E12l); + E5h = _mm_unpacklo_epi16(E5l, E13l); + E6h = _mm_unpacklo_epi16(E6l, E14l); + E7h = _mm_unpacklo_epi16(E7l, E15l); + + E8h = _mm_unpackhi_epi16(E0l, E8l); + E9h = _mm_unpackhi_epi16(E1l, E9l); + E10h = _mm_unpackhi_epi16(E2l, E10l); + E11h = _mm_unpackhi_epi16(E3l, E11l); + E12h = _mm_unpackhi_epi16(E4l, E12l); + E13h = _mm_unpackhi_epi16(E5l, E13l); + E14h = _mm_unpackhi_epi16(E6l, E14l); + E15h = 
_mm_unpackhi_epi16(E7l, E15l); + + m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h); + m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h); + m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h); + m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h); + m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h); + m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h); + m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h); + m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h); + m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h); + m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h); + m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h); + m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h); + m128Tmp3 = 
_mm_unpackhi_epi16(E11h, E15h); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + /* */ + E0h = _mm_unpacklo_epi16(O0l, O8l); + E1h = _mm_unpacklo_epi16(O1l, O9l); + E2h = _mm_unpacklo_epi16(O2l, O10l); + E3h = _mm_unpacklo_epi16(O3l, O11l); + E4h = _mm_unpacklo_epi16(O4l, O12l); + E5h = _mm_unpacklo_epi16(O5l, O13l); + E6h = _mm_unpacklo_epi16(O6l, O14l); + E7h = _mm_unpacklo_epi16(O7l, O15l); + + E8h = _mm_unpackhi_epi16(O0l, O8l); + E9h = _mm_unpackhi_epi16(O1l, O9l); + E10h = _mm_unpackhi_epi16(O2l, O10l); + E11h = _mm_unpackhi_epi16(O3l, O11l); + E12h = _mm_unpackhi_epi16(O4l, O12l); + E13h = _mm_unpackhi_epi16(O5l, O13l); + E14h = _mm_unpackhi_epi16(O6l, O14l); + E15h = _mm_unpackhi_epi16(O7l, O15l); + + m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h); + m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h); + m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h); + m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS16 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS17 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS18 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS19 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h); + m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h); + m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h); + m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS20 = 
_mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS21 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS22 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS23 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h); + m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h); + m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h); + m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS24 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS25 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS26 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS27 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h); + m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h); + m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h); + m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS28 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS29 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS30 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS31 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + if(i==0){ + int k = 8; + r0=m128iS0; + r1=m128iS1; + r2=m128iS2; + r3=m128iS3; + r4=m128iS4; + r5=m128iS5; + r6=m128iS6; + r7=m128iS7; + r8=m128iS8; + r9=m128iS9; + r10=m128iS10; + r11=m128iS11; + r12=m128iS12; + r13=m128iS13; + r14=m128iS14; + r15=m128iS15; + r16=m128iS16; + r17=m128iS17; + r18=m128iS18; + r19=m128iS19; + r20=m128iS20; + r21=m128iS21; + r22=m128iS22; + r23=m128iS23; + r24=m128iS24; + r25=m128iS25; + r26=m128iS26; + r27=m128iS27; + r28=m128iS28; + 
r29=m128iS29; + r30=m128iS30; + r31=m128iS31; + m128iS0 = _mm_load_si128((__m128i *) (src + k)); + m128iS1 = _mm_load_si128((__m128i *) (src + 32 + k)); + m128iS2 = _mm_load_si128((__m128i *) (src + 64 + k)); + m128iS3 = _mm_load_si128((__m128i *) (src + 96 + k)); + m128iS4 = _mm_load_si128((__m128i *) (src + 128 + k)); + m128iS5 = _mm_load_si128((__m128i *) (src + 160 + k)); + m128iS6 = _mm_load_si128((__m128i *) (src + 192 + k)); + m128iS7 = _mm_load_si128((__m128i *) (src + 224 + k)); + m128iS8 = _mm_load_si128((__m128i *) (src + 256 + k)); + m128iS9 = _mm_load_si128((__m128i *) (src + 288 + k)); + m128iS10 = _mm_load_si128((__m128i *) (src + 320 + k)); + m128iS11 = _mm_load_si128((__m128i *) (src + 352 + k)); + m128iS12 = _mm_load_si128((__m128i *) (src + 384 + k)); + m128iS13 = _mm_load_si128((__m128i *) (src + 416 + k)); + m128iS14 = _mm_load_si128((__m128i *) (src + 448 + k)); + m128iS15 = _mm_load_si128((__m128i *) (src + 480 + k)); + + m128iS16 = _mm_load_si128((__m128i *) (src + 512 + k)); + m128iS17 = _mm_load_si128((__m128i *) (src + 544 + k)); + m128iS18 = _mm_load_si128((__m128i *) (src + 576 + k)); + m128iS19 = _mm_load_si128((__m128i *) (src + 608 + k)); + m128iS20 = _mm_load_si128((__m128i *) (src + 640 + k)); + m128iS21 = _mm_load_si128((__m128i *) (src + 672 + k)); + m128iS22 = _mm_load_si128((__m128i *) (src + 704 + k)); + m128iS23 = _mm_load_si128((__m128i *) (src + 736 + k)); + m128iS24 = _mm_load_si128((__m128i *) (src + 768 + k)); + m128iS25 = _mm_load_si128((__m128i *) (src + 800 + k)); + m128iS26 = _mm_load_si128((__m128i *) (src + 832 + k)); + m128iS27 = _mm_load_si128((__m128i *) (src + 864 + k)); + m128iS28 = _mm_load_si128((__m128i *) (src + 896 + k)); + m128iS29 = _mm_load_si128((__m128i *) (src + 928 + k)); + m128iS30 = _mm_load_si128((__m128i *) (src + 960 + k)); + m128iS31 = _mm_load_si128((__m128i *) (src + 992 + k)); + + }else if(i ==8){ + + r32=m128iS0; + r33=m128iS1; + r34=m128iS2; + r35=m128iS3; + r36=m128iS4; + r37=m128iS5; + 
r38=m128iS6; + r39=m128iS7; + r40=m128iS8; + r41=m128iS9; + r42=m128iS10; + r43=m128iS11; + r44=m128iS12; + r45=m128iS13; + r46=m128iS14; + r47=m128iS15; + r48=m128iS16; + r49=m128iS17; + r50=m128iS18; + r51=m128iS19; + r52=m128iS20; + r53=m128iS21; + r54=m128iS22; + r55=m128iS23; + r56=m128iS24; + r57=m128iS25; + r58=m128iS26; + r59=m128iS27; + r60=m128iS28; + r61=m128iS29; + r62=m128iS30; + r63=m128iS31; + + m128iS0 = _mm_load_si128((__m128i *) (src + 16)); + m128iS1 = _mm_load_si128((__m128i *) (src + 48)); + m128iS2 = _mm_load_si128((__m128i *) (src + 80)); + m128iS3 = _mm_load_si128((__m128i *) (src + 112)); + m128iS4 = _mm_load_si128((__m128i *) (src + 144)); + m128iS5 = _mm_load_si128((__m128i *) (src + 176)); + m128iS6 = _mm_load_si128((__m128i *) (src + 192 + 16)); + m128iS7 = _mm_load_si128((__m128i *) (src + 224 + 16)); + m128iS8 = _mm_load_si128((__m128i *) (src + 256 + 16)); + m128iS9 = _mm_load_si128((__m128i *) (src + 288 + 16)); + m128iS10 = _mm_load_si128((__m128i *) (src + 320 + 16)); + m128iS11 = _mm_load_si128((__m128i *) (src + 352 + 16)); + m128iS12 = _mm_load_si128((__m128i *) (src + 384 + 16)); + m128iS13 = _mm_load_si128((__m128i *) (src + 416 + 16)); + m128iS14 = _mm_load_si128((__m128i *) (src + 448 + 16)); + m128iS15 = _mm_load_si128((__m128i *) (src + 480 + 16)); + + m128iS16 = _mm_load_si128((__m128i *) (src + 512 + 16)); + m128iS17 = _mm_load_si128((__m128i *) (src + 544 + 16)); + m128iS18 = _mm_load_si128((__m128i *) (src + 576 + 16)); + m128iS19 = _mm_load_si128((__m128i *) (src + 608 + 16)); + m128iS20 = _mm_load_si128((__m128i *) (src + 640 + 16)); + m128iS21 = _mm_load_si128((__m128i *) (src + 672 + 16)); + m128iS22 = _mm_load_si128((__m128i *) (src + 704 + 16)); + m128iS23 = _mm_load_si128((__m128i *) (src + 736 + 16)); + m128iS24 = _mm_load_si128((__m128i *) (src + 768 + 16)); + m128iS25 = _mm_load_si128((__m128i *) (src + 800 + 16)); + m128iS26 = _mm_load_si128((__m128i *) (src + 832 + 16)); + m128iS27 = 
_mm_load_si128((__m128i *) (src + 864 + 16)); + m128iS28 = _mm_load_si128((__m128i *) (src + 896 + 16)); + m128iS29 = _mm_load_si128((__m128i *) (src + 928 + 16)); + m128iS30 = _mm_load_si128((__m128i *) (src + 960 + 16)); + m128iS31 = _mm_load_si128((__m128i *) (src + 992 + 16)); + + + }else if(i ==16){ + + r64=m128iS0; + r65=m128iS1; + r66=m128iS2; + r67=m128iS3; + r68=m128iS4; + r69=m128iS5; + r70=m128iS6; + r71=m128iS7; + r72=m128iS8; + r73=m128iS9; + r74=m128iS10; + r75=m128iS11; + r76=m128iS12; + r77=m128iS13; + r78=m128iS14; + r79=m128iS15; + r80=m128iS16; + r81=m128iS17; + r82=m128iS18; + r83=m128iS19; + r84=m128iS20; + r85=m128iS21; + r86=m128iS22; + r87=m128iS23; + r88=m128iS24; + r89=m128iS25; + r90=m128iS26; + r91=m128iS27; + r92=m128iS28; + r93=m128iS29; + r94=m128iS30; + r95=m128iS31; + + m128iS0 = _mm_load_si128((__m128i *) (src + 24)); + m128iS1 = _mm_load_si128((__m128i *) (src + 56)); + m128iS2 = _mm_load_si128((__m128i *) (src + 64 + 24)); + m128iS3 = _mm_load_si128((__m128i *) (src + 96 + 24)); + m128iS4 = _mm_load_si128((__m128i *) (src + 128 + 24)); + m128iS5 = _mm_load_si128((__m128i *) (src + 160 + 24)); + m128iS6 = _mm_load_si128((__m128i *) (src + 192 + 24)); + m128iS7 = _mm_load_si128((__m128i *) (src + 224 + 24)); + m128iS8 = _mm_load_si128((__m128i *) (src + 256 + 24)); + m128iS9 = _mm_load_si128((__m128i *) (src + 288 + 24)); + m128iS10 = _mm_load_si128((__m128i *) (src + 320 + 24)); + m128iS11 = _mm_load_si128((__m128i *) (src + 352 + 24)); + m128iS12 = _mm_load_si128((__m128i *) (src + 384 + 24)); + m128iS13 = _mm_load_si128((__m128i *) (src + 416 + 24)); + m128iS14 = _mm_load_si128((__m128i *) (src + 448 + 24)); + m128iS15 = _mm_load_si128((__m128i *) (src + 480 + 24)); + + m128iS16 = _mm_load_si128((__m128i *) (src + 512 + 24)); + m128iS17 = _mm_load_si128((__m128i *) (src + 544 + 24)); + m128iS18 = _mm_load_si128((__m128i *) (src + 576 + 24)); + m128iS19 = _mm_load_si128((__m128i *) (src + 608 + 24)); + m128iS20 = 
_mm_load_si128((__m128i *) (src + 640 + 24)); + m128iS21 = _mm_load_si128((__m128i *) (src + 672 + 24)); + m128iS22 = _mm_load_si128((__m128i *) (src + 704 + 24)); + m128iS23 = _mm_load_si128((__m128i *) (src + 736 + 24)); + m128iS24 = _mm_load_si128((__m128i *) (src + 768 + 24)); + m128iS25 = _mm_load_si128((__m128i *) (src + 800 + 24)); + m128iS26 = _mm_load_si128((__m128i *) (src + 832 + 24)); + m128iS27 = _mm_load_si128((__m128i *) (src + 864 + 24)); + m128iS28 = _mm_load_si128((__m128i *) (src + 896 + 24)); + m128iS29 = _mm_load_si128((__m128i *) (src + 928 + 24)); + m128iS30 = _mm_load_si128((__m128i *) (src + 960 + 24)); + m128iS31 = _mm_load_si128((__m128i *) (src + 992 + 24)); + + }else{ + r96=m128iS0; + r97=m128iS1; + r98=m128iS2; + r99=m128iS3; + r100=m128iS4; + r101=m128iS5; + r102=m128iS6; + r103=m128iS7; + r104=m128iS8; + r105=m128iS9; + r106=m128iS10; + r107=m128iS11; + r108=m128iS12; + r109=m128iS13; + r110=m128iS14; + r111=m128iS15; + r112=m128iS16; + r113=m128iS17; + r114=m128iS18; + r115=m128iS19; + r116=m128iS20; + r117=m128iS21; + r118=m128iS22; + r119=m128iS23; + r120=m128iS24; + r121=m128iS25; + r122=m128iS26; + r123=m128iS27; + r124=m128iS28; + r125=m128iS29; + r126=m128iS30; + r127=m128iS31; + + //load data for next j : + m128iS0 = r0; + m128iS1 = r4; + m128iS2 = r8; + m128iS3 = r12; + m128iS4 = r16; + m128iS5 = r20; + m128iS6 = r24; + m128iS7 = r28; + m128iS8 = r32; + m128iS9 = r36; + m128iS10 = r40; + m128iS11 = r44; + m128iS12 = r48; + m128iS13 = r52; + m128iS14 = r56; + m128iS15 = r60; + m128iS16 = r64; + m128iS17 = r68; + m128iS18 = r72; + m128iS19 = r76; + m128iS20 = r80; + m128iS21 = r84; + m128iS22 = r88; + m128iS23 = r92; + m128iS24 = r96; + m128iS25 = r100; + m128iS26 = r104; + m128iS27 = r108; + m128iS28 = r112; + m128iS29 = r116; + m128iS30 = r120; + m128iS31 =r124; + shift = shift_2nd; + m128iAdd = _mm_set1_epi32(add_2nd); + + + } + + } else { + + //Transpose Matrix + + E0l= _mm_unpacklo_epi16(m128iS0,m128iS1); + E1l= 
_mm_unpacklo_epi16(m128iS2,m128iS3); + E2l= _mm_unpacklo_epi16(m128iS4,m128iS5); + E3l= _mm_unpacklo_epi16(m128iS6,m128iS7); + E4l= _mm_unpacklo_epi16(m128iS8,m128iS9); + E5l= _mm_unpacklo_epi16(m128iS10,m128iS11); + E6l= _mm_unpacklo_epi16(m128iS12,m128iS13); + E7l= _mm_unpacklo_epi16(m128iS14,m128iS15); + E8l= _mm_unpacklo_epi16(m128iS16,m128iS17); + E9l= _mm_unpacklo_epi16(m128iS18,m128iS19); + E10l= _mm_unpacklo_epi16(m128iS20,m128iS21); + E11l= _mm_unpacklo_epi16(m128iS22,m128iS23); + E12l= _mm_unpacklo_epi16(m128iS24,m128iS25); + E13l= _mm_unpacklo_epi16(m128iS26,m128iS27); + E14l= _mm_unpacklo_epi16(m128iS28,m128iS29); + E15l= _mm_unpacklo_epi16(m128iS30,m128iS31); + + + E0h= _mm_unpackhi_epi16(m128iS0,m128iS1); + E1h= _mm_unpackhi_epi16(m128iS2,m128iS3); + E2h= _mm_unpackhi_epi16(m128iS4,m128iS5); + E3h= _mm_unpackhi_epi16(m128iS6,m128iS7); + E4h= _mm_unpackhi_epi16(m128iS8,m128iS9); + E5h= _mm_unpackhi_epi16(m128iS10,m128iS11); + E6h= _mm_unpackhi_epi16(m128iS12,m128iS13); + E7h= _mm_unpackhi_epi16(m128iS14,m128iS15); + E8h= _mm_unpackhi_epi16(m128iS16,m128iS17); + E9h= _mm_unpackhi_epi16(m128iS18,m128iS19); + E10h= _mm_unpackhi_epi16(m128iS20,m128iS21); + E11h= _mm_unpackhi_epi16(m128iS22,m128iS23); + E12h= _mm_unpackhi_epi16(m128iS24,m128iS25); + E13h= _mm_unpackhi_epi16(m128iS26,m128iS27); + E14h= _mm_unpackhi_epi16(m128iS28,m128iS29); + E15h= _mm_unpackhi_epi16(m128iS30,m128iS31); + + m128Tmp0= _mm_unpacklo_epi32(E0l,E1l); + m128Tmp1= _mm_unpacklo_epi32(E2l,E3l); + m128Tmp2= _mm_unpacklo_epi32(E4l,E5l); + m128Tmp3= _mm_unpacklo_epi32(E6l,E7l); + m128Tmp4= _mm_unpacklo_epi32(E8l,E9l); + m128Tmp5= _mm_unpacklo_epi32(E10l,E11l); + m128Tmp6= _mm_unpacklo_epi32(E12l,E13l); + m128Tmp7= _mm_unpacklo_epi32(E14l,E15l); + + m128iS0= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter 1st row + m128iS1= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter 1st row + + + m128iS2= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter 1st row + m128iS3= 
_mm_unpacklo_epi64(m128Tmp6,m128Tmp7); //last quarter 1st row + + //second row + + m128iS4= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter + m128iS5= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter + + m128iS6= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter + m128iS7= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter + + //third row + + m128Tmp0= _mm_unpackhi_epi32(E0l,E1l); + m128Tmp1= _mm_unpackhi_epi32(E2l,E3l); + m128Tmp2= _mm_unpackhi_epi32(E4l,E5l); + m128Tmp3= _mm_unpackhi_epi32(E6l,E7l); + m128Tmp4= _mm_unpackhi_epi32(E8l,E9l); + m128Tmp5= _mm_unpackhi_epi32(E10l,E11l); + m128Tmp6= _mm_unpackhi_epi32(E12l,E13l); + m128Tmp7= _mm_unpackhi_epi32(E14l,E15l); + + + m128iS8= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter + m128iS9= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter + + m128iS10= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter + m128iS11= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7); //last quarter + + //fourth row + + m128iS12= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter + m128iS13= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter + + m128iS14= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter + m128iS15= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter + + //fith row + + m128Tmp0= _mm_unpacklo_epi32(E0h,E1h); + m128Tmp1= _mm_unpacklo_epi32(E2h,E3h); + m128Tmp2= _mm_unpacklo_epi32(E4h,E5h); + m128Tmp3= _mm_unpacklo_epi32(E6h,E7h); + m128Tmp4= _mm_unpacklo_epi32(E8h,E9h); + m128Tmp5= _mm_unpacklo_epi32(E10h,E11h); + m128Tmp6= _mm_unpacklo_epi32(E12h,E13h); + m128Tmp7= _mm_unpacklo_epi32(E14h,E15h); + + m128iS16= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter + m128iS17= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter + + + m128iS18= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter + m128iS19= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7); + + //sixth row + + m128iS20= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter + m128iS21= 
_mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter + + + m128iS22= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter + m128iS23= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter + + //seventh row + + m128Tmp0= _mm_unpackhi_epi32(E0h,E1h); + m128Tmp1= _mm_unpackhi_epi32(E2h,E3h); + m128Tmp2= _mm_unpackhi_epi32(E4h,E5h); + m128Tmp3= _mm_unpackhi_epi32(E6h,E7h); + m128Tmp4= _mm_unpackhi_epi32(E8h,E9h); + m128Tmp5= _mm_unpackhi_epi32(E10h,E11h); + m128Tmp6= _mm_unpackhi_epi32(E12h,E13h); + m128Tmp7= _mm_unpackhi_epi32(E14h,E15h); + + + m128iS24= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter + m128iS25= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter + + + m128iS26= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter + m128iS27= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7); //last quarter + + //last row + + + m128iS28= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter + m128iS29= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter + + m128iS30= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter + m128iS31= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter + + + m128Tmp0=_mm_setzero_si128(); + + + //store + dst = (uint8_t*) _dst + i*stride; + + + E0l= _mm_load_si128((__m128i*)dst); //16 values + E1l= _mm_load_si128((__m128i*)(dst+16)); + E2l= _mm_load_si128((__m128i*)(dst+stride)); + E3l= _mm_load_si128((__m128i*)(dst+stride+16)); + E4l= _mm_load_si128((__m128i*)(dst+2*stride)); + E5l= _mm_load_si128((__m128i*)(dst+2*stride+16)); + E6l= _mm_load_si128((__m128i*)(dst+3*stride)); + E7l= _mm_load_si128((__m128i*)(dst+3*stride+16)); + E8l= _mm_load_si128((__m128i*)(dst+4*stride)); + E9l= _mm_load_si128((__m128i*)(dst+4*stride+16)); + E10l= _mm_load_si128((__m128i*)(dst+5*stride)); + E11l= _mm_load_si128((__m128i*)(dst+5*stride+16)); + E12l= _mm_load_si128((__m128i*)(dst+6*stride)); + E13l= _mm_load_si128((__m128i*)(dst+6*stride+16)); + E14l= _mm_load_si128((__m128i*)(dst+7*stride)); + E15l= 
_mm_load_si128((__m128i*)(dst+7*stride+16)); + + m128iS0= _mm_adds_epi16(m128iS0,_mm_unpacklo_epi8(E0l,m128Tmp0)); + m128iS1= _mm_adds_epi16(m128iS1,_mm_unpackhi_epi8(E0l,m128Tmp0)); + m128iS0= _mm_packus_epi16(m128iS0,m128iS1); + + m128iS2= _mm_adds_epi16(m128iS2,_mm_unpacklo_epi8(E1l,m128Tmp0)); + m128iS3= _mm_adds_epi16(m128iS3,_mm_unpackhi_epi8(E1l,m128Tmp0)); + m128iS2= _mm_packus_epi16(m128iS2,m128iS3); + + m128iS4= _mm_adds_epi16(m128iS4,_mm_unpacklo_epi8(E2l,m128Tmp0)); + m128iS5= _mm_adds_epi16(m128iS5,_mm_unpackhi_epi8(E2l,m128Tmp0)); + m128iS4= _mm_packus_epi16(m128iS4,m128iS5); + + m128iS6= _mm_adds_epi16(m128iS6,_mm_unpacklo_epi8(E3l,m128Tmp0)); + m128iS7= _mm_adds_epi16(m128iS7,_mm_unpackhi_epi8(E3l,m128Tmp0)); + m128iS6= _mm_packus_epi16(m128iS6,m128iS7); + + m128iS8= _mm_adds_epi16(m128iS8,_mm_unpacklo_epi8(E4l,m128Tmp0)); + m128iS9= _mm_adds_epi16(m128iS9,_mm_unpackhi_epi8(E4l,m128Tmp0)); + m128iS8= _mm_packus_epi16(m128iS8,m128iS9); + + m128iS10= _mm_adds_epi16(m128iS10,_mm_unpacklo_epi8(E5l,m128Tmp0)); + m128iS11= _mm_adds_epi16(m128iS11,_mm_unpackhi_epi8(E5l,m128Tmp0)); + m128iS10= _mm_packus_epi16(m128iS10,m128iS11); + + m128iS12= _mm_adds_epi16(m128iS12,_mm_unpacklo_epi8(E6l,m128Tmp0)); + m128iS13= _mm_adds_epi16(m128iS13,_mm_unpackhi_epi8(E6l,m128Tmp0)); + m128iS12= _mm_packus_epi16(m128iS12,m128iS13); + + m128iS14= _mm_adds_epi16(m128iS14,_mm_unpacklo_epi8(E7l,m128Tmp0)); + m128iS15= _mm_adds_epi16(m128iS15,_mm_unpackhi_epi8(E7l,m128Tmp0)); + m128iS14= _mm_packus_epi16(m128iS14,m128iS15); + + m128iS16= _mm_adds_epi16(m128iS16,_mm_unpacklo_epi8(E8l,m128Tmp0)); + m128iS17= _mm_adds_epi16(m128iS17,_mm_unpackhi_epi8(E8l,m128Tmp0)); + m128iS16= _mm_packus_epi16(m128iS16,m128iS17); + + m128iS18= _mm_adds_epi16(m128iS18,_mm_unpacklo_epi8(E9l,m128Tmp0)); + m128iS19= _mm_adds_epi16(m128iS19,_mm_unpackhi_epi8(E9l,m128Tmp0)); + m128iS18= _mm_packus_epi16(m128iS18,m128iS19); + + m128iS20= _mm_adds_epi16(m128iS20,_mm_unpacklo_epi8(E10l,m128Tmp0)); + 
m128iS21= _mm_adds_epi16(m128iS21,_mm_unpackhi_epi8(E10l,m128Tmp0)); + m128iS20= _mm_packus_epi16(m128iS20,m128iS21); + + m128iS22= _mm_adds_epi16(m128iS22,_mm_unpacklo_epi8(E11l,m128Tmp0)); + m128iS23= _mm_adds_epi16(m128iS23,_mm_unpackhi_epi8(E11l,m128Tmp0)); + m128iS22= _mm_packus_epi16(m128iS22,m128iS23); + + m128iS24= _mm_adds_epi16(m128iS24,_mm_unpacklo_epi8(E12l,m128Tmp0)); + m128iS25= _mm_adds_epi16(m128iS25,_mm_unpackhi_epi8(E12l,m128Tmp0)); + m128iS24= _mm_packus_epi16(m128iS24,m128iS25); + + m128iS26= _mm_adds_epi16(m128iS26,_mm_unpacklo_epi8(E13l,m128Tmp0)); + m128iS27= _mm_adds_epi16(m128iS27,_mm_unpackhi_epi8(E13l,m128Tmp0)); + m128iS26= _mm_packus_epi16(m128iS26,m128iS27); + + m128iS28= _mm_adds_epi16(m128iS28,_mm_unpacklo_epi8(E14l,m128Tmp0)); + m128iS29= _mm_adds_epi16(m128iS29,_mm_unpackhi_epi8(E14l,m128Tmp0)); + m128iS28= _mm_packus_epi16(m128iS28,m128iS29); + + m128iS30= _mm_adds_epi16(m128iS30,_mm_unpacklo_epi8(E15l,m128Tmp0)); + m128iS31= _mm_adds_epi16(m128iS31,_mm_unpackhi_epi8(E15l,m128Tmp0)); + m128iS30= _mm_packus_epi16(m128iS30,m128iS31); + + + _mm_store_si128((__m128i*)dst,m128iS0); + _mm_store_si128((__m128i*)(dst+16),m128iS2); + _mm_store_si128((__m128i*)(dst+stride),m128iS4); + _mm_store_si128((__m128i*)(dst+stride+16),m128iS6); + _mm_store_si128((__m128i*)(dst+2*stride),m128iS8); + _mm_store_si128((__m128i*)(dst+2*stride+16),m128iS10); + _mm_store_si128((__m128i*)(dst+3*stride),m128iS12); + _mm_store_si128((__m128i*)(dst+3*stride+16),m128iS14); + _mm_store_si128((__m128i*)(dst+4*stride),m128iS16); + _mm_store_si128((__m128i*)(dst+4*stride+16),m128iS18); + _mm_store_si128((__m128i*)(dst+5*stride),m128iS20); + _mm_store_si128((__m128i*)(dst+5*stride+16),m128iS22); + _mm_store_si128((__m128i*)(dst+6*stride),m128iS24); + _mm_store_si128((__m128i*)(dst+6*stride+16),m128iS26); + _mm_store_si128((__m128i*)(dst+7*stride),m128iS28); + _mm_store_si128((__m128i*)(dst+7*stride+16),m128iS30); + + + if(i==0){ + //load next values : + m128iS0 = 
r1; + m128iS1 = r5; + m128iS2 = r9; + m128iS3 = r13; + m128iS4 = r17; + m128iS5 = r21; + m128iS6 = r25; + m128iS7 = r29; + m128iS8 = r33; + m128iS9 = r37; + m128iS10 = r41; + m128iS11 = r45; + m128iS12 = r49; + m128iS13 = r53; + m128iS14 = r57; + m128iS15 = r61; + m128iS16 = r65; + m128iS17 = r69; + m128iS18 = r73; + m128iS19 = r77; + m128iS20 = r81; + m128iS21 = r85; + m128iS22 = r89; + m128iS23 = r93; + m128iS24 = r97; + m128iS25 = r101; + m128iS26 = r105; + m128iS27 = r109; + m128iS28 = r113; + m128iS29 = r117; + m128iS30 = r121; + m128iS31 =r125; + + }else if(i ==8){ + //load next values : + m128iS0 = r2; + m128iS1 = r6; + m128iS2 = r10; + m128iS3 = r14; + m128iS4 = r18; + m128iS5 = r22; + m128iS6 = r26; + m128iS7 = r30; + m128iS8 = r34; + m128iS9 = r38; + m128iS10 = r42; + m128iS11 = r46; + m128iS12 = r50; + m128iS13 = r54; + m128iS14 = r58; + m128iS15 = r62; + m128iS16 = r66; + m128iS17 = r70; + m128iS18 = r74; + m128iS19 = r78; + m128iS20 = r82; + m128iS21 = r86; + m128iS22 = r90; + m128iS23 = r94; + m128iS24 = r98; + m128iS25 = r102; + m128iS26 = r106; + m128iS27 = r110; + m128iS28 = r114; + m128iS29 = r118; + m128iS30 = r122; + m128iS31 =r126; + + }else if(i==16) + { + //load next values : + m128iS0 = r3; + m128iS1 = r7; + m128iS2 = r11; + m128iS3 = r15; + m128iS4 = r19; + m128iS5 = r23; + m128iS6 = r27; + m128iS7 = r31; + m128iS8 = r35; + m128iS9 = r39; + m128iS10 = r43; + m128iS11 = r47; + m128iS12 = r51; + m128iS13 = r55; + m128iS14 = r59; + m128iS15 = r63; + m128iS16 = r67; + m128iS17 = r71; + m128iS18 = r75; + m128iS19 = r79; + m128iS20 = r83; + m128iS21 = r87; + m128iS22 = r91; + m128iS23 = r95; + m128iS24 = r99; + m128iS25 = r103; + m128iS26 = r107; + m128iS27 = r111; + m128iS28 = r115; + m128iS29 = r119; + m128iS30 = r123; + m128iS31 =r127; + } + } + } + } +} +#endif + + +#if 0 +void ff_hevc_transform_32x32_add_10_sse4(uint8_t *_dst, const int16_t *coeffs, + ptrdiff_t _stride) { + int i, j; + uint16_t *dst = (uint16_t*) _dst; + ptrdiff_t stride = 
_stride / 2; + int shift; + uint8_t shift_2nd = 10; //20 - bit depth + uint16_t add_2nd = 1<<9; //shift2 - 1 + int16_t *src = coeffs; + + __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, + m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13, + m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, + m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h, + E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h, + O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l, + E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h; + __m128i E4l, E5l, E6l, E7l, E8l, E9l, E10l, E11l, E12l, E13l, E14l, E15l; + __m128i E4h, E5h, E6h, E7h, E8h, E9h, E10h, E11h, E12h, E13h, E14h, E15h, + EEE0l, EEE1l, EEE0h, EEE1h; + __m128i m128iS16, m128iS17, m128iS18, m128iS19, m128iS20, m128iS21, + m128iS22, m128iS23, m128iS24, m128iS25, m128iS26, m128iS27, + m128iS28, m128iS29, m128iS30, m128iS31, m128Tmp8, m128Tmp9, + m128Tmp10, m128Tmp11, m128Tmp12, m128Tmp13, m128Tmp14, m128Tmp15, + O8h, O9h, O10h, O11h, O12h, O13h, O14h, O15h, O8l, O9l, O10l, O11l, + O12l, O13l, O14l, O15l, E02l, E02h, E03l, E03h, EE7l, EE6l, EE5l, + EE4l, EE7h, EE6h, EE5h, EE4h; + m128iS0 = _mm_load_si128((__m128i *) (src)); + m128iS1 = _mm_load_si128((__m128i *) (src + 32)); + m128iS2 = _mm_load_si128((__m128i *) (src + 64)); + m128iS3 = _mm_load_si128((__m128i *) (src + 96)); + m128iS4 = _mm_loadu_si128((__m128i *) (src + 128)); + m128iS5 = _mm_load_si128((__m128i *) (src + 160)); + m128iS6 = _mm_load_si128((__m128i *) (src + 192)); + m128iS7 = _mm_load_si128((__m128i *) (src + 224)); + m128iS8 = _mm_load_si128((__m128i *) (src + 256)); + m128iS9 = _mm_load_si128((__m128i *) (src + 288)); + m128iS10 = _mm_load_si128((__m128i *) (src + 320)); + m128iS11 = _mm_load_si128((__m128i *) (src + 352)); + m128iS12 = _mm_loadu_si128((__m128i *) (src + 384)); + m128iS13 = _mm_load_si128((__m128i *) (src + 416)); + m128iS14 = _mm_load_si128((__m128i *) (src + 448)); + 
m128iS15 = _mm_load_si128((__m128i *) (src + 480)); + m128iS16 = _mm_load_si128((__m128i *) (src + 512)); + m128iS17 = _mm_load_si128((__m128i *) (src + 544)); + m128iS18 = _mm_load_si128((__m128i *) (src + 576)); + m128iS19 = _mm_load_si128((__m128i *) (src + 608)); + m128iS20 = _mm_load_si128((__m128i *) (src + 640)); + m128iS21 = _mm_load_si128((__m128i *) (src + 672)); + m128iS22 = _mm_load_si128((__m128i *) (src + 704)); + m128iS23 = _mm_load_si128((__m128i *) (src + 736)); + m128iS24 = _mm_load_si128((__m128i *) (src + 768)); + m128iS25 = _mm_load_si128((__m128i *) (src + 800)); + m128iS26 = _mm_load_si128((__m128i *) (src + 832)); + m128iS27 = _mm_load_si128((__m128i *) (src + 864)); + m128iS28 = _mm_load_si128((__m128i *) (src + 896)); + m128iS29 = _mm_load_si128((__m128i *) (src + 928)); + m128iS30 = _mm_load_si128((__m128i *) (src + 960)); + m128iS31 = _mm_load_si128((__m128i *) (src + 992)); + + shift = shift_1st; + m128iAdd = _mm_set1_epi32(add_1st); + + for (j = 0; j < 2; j++) { + for (i = 0; i < 32; i += 8) { + m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3); + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][0]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][0]))); + + m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][0]))); + m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][0]))); + + m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][0]))); + m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][0]))); + + m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) 
(transform32x32[3][0]))); + m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][0]))); + + m128Tmp8 = _mm_unpacklo_epi16(m128iS17, m128iS19); + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][0]))); + m128Tmp9 = _mm_unpackhi_epi16(m128iS17, m128iS19); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][0]))); + + m128Tmp10 = _mm_unpacklo_epi16(m128iS21, m128iS23); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][0]))); + m128Tmp11 = _mm_unpackhi_epi16(m128iS21, m128iS23); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][0]))); + + m128Tmp12 = _mm_unpacklo_epi16(m128iS25, m128iS27); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][0]))); + m128Tmp13 = _mm_unpackhi_epi16(m128iS25, m128iS27); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][0]))); + + m128Tmp14 = _mm_unpacklo_epi16(m128iS29, m128iS31); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][0]))); + m128Tmp15 = _mm_unpackhi_epi16(m128iS29, m128iS31); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][0]))); + + O0l = _mm_add_epi32(E0l, E1l); + O0l = _mm_add_epi32(O0l, E2l); + O0l = _mm_add_epi32(O0l, E3l); + O0l = _mm_add_epi32(O0l, E4l); + O0l = _mm_add_epi32(O0l, E5l); + O0l = _mm_add_epi32(O0l, E6l); + O0l = _mm_add_epi32(O0l, E7l); + + O0h = _mm_add_epi32(E0h, E1h); + O0h = _mm_add_epi32(O0h, E2h); + O0h = _mm_add_epi32(O0h, E3h); + O0h = _mm_add_epi32(O0h, E4h); + O0h = _mm_add_epi32(O0h, E5h); + O0h = _mm_add_epi32(O0h, E6h); + O0h = _mm_add_epi32(O0h, E7h); + + /* Compute O1*/ + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][1]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][1]))); + E1l = 
_mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][1]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][1]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][1]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][1]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][1]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][1]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][1]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][1]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][1]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][1]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][1]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][1]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][1]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][1]))); + + O1l = _mm_add_epi32(E0l, E1l); + O1l = _mm_add_epi32(O1l, E2l); + O1l = _mm_add_epi32(O1l, E3l); + O1l = _mm_add_epi32(O1l, E4l); + O1l = _mm_add_epi32(O1l, E5l); + O1l = _mm_add_epi32(O1l, E6l); + O1l = _mm_add_epi32(O1l, E7l); + + O1h = _mm_add_epi32(E0h, E1h); + O1h = _mm_add_epi32(O1h, E2h); + O1h = _mm_add_epi32(O1h, E3h); + O1h = _mm_add_epi32(O1h, E4h); + O1h = _mm_add_epi32(O1h, E5h); + O1h = _mm_add_epi32(O1h, E6h); + O1h = _mm_add_epi32(O1h, E7h); + /* Compute O2*/ + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][2]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][2]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][2]))); + E1h = 
_mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][2]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][2]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][2]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][2]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][2]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][2]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][2]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][2]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][2]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][2]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][2]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][2]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][2]))); + + O2l = _mm_add_epi32(E0l, E1l); + O2l = _mm_add_epi32(O2l, E2l); + O2l = _mm_add_epi32(O2l, E3l); + O2l = _mm_add_epi32(O2l, E4l); + O2l = _mm_add_epi32(O2l, E5l); + O2l = _mm_add_epi32(O2l, E6l); + O2l = _mm_add_epi32(O2l, E7l); + + O2h = _mm_add_epi32(E0h, E1h); + O2h = _mm_add_epi32(O2h, E2h); + O2h = _mm_add_epi32(O2h, E3h); + O2h = _mm_add_epi32(O2h, E4h); + O2h = _mm_add_epi32(O2h, E5h); + O2h = _mm_add_epi32(O2h, E6h); + O2h = _mm_add_epi32(O2h, E7h); + /* Compute O3*/ + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][3]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][3]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][3]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][3]))); + E2l = 
_mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][3]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][3]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][3]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][3]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][3]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][3]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][3]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][3]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][3]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][3]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][3]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][3]))); + + O3l = _mm_add_epi32(E0l, E1l); + O3l = _mm_add_epi32(O3l, E2l); + O3l = _mm_add_epi32(O3l, E3l); + O3l = _mm_add_epi32(O3l, E4l); + O3l = _mm_add_epi32(O3l, E5l); + O3l = _mm_add_epi32(O3l, E6l); + O3l = _mm_add_epi32(O3l, E7l); + + O3h = _mm_add_epi32(E0h, E1h); + O3h = _mm_add_epi32(O3h, E2h); + O3h = _mm_add_epi32(O3h, E3h); + O3h = _mm_add_epi32(O3h, E4h); + O3h = _mm_add_epi32(O3h, E5h); + O3h = _mm_add_epi32(O3h, E6h); + O3h = _mm_add_epi32(O3h, E7h); + /* Compute O4*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][4]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][4]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][4]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][4]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][4]))); + E2h = 
_mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][4]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][4]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][4]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][4]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][4]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][4]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][4]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][4]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][4]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][4]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][4]))); + + O4l = _mm_add_epi32(E0l, E1l); + O4l = _mm_add_epi32(O4l, E2l); + O4l = _mm_add_epi32(O4l, E3l); + O4l = _mm_add_epi32(O4l, E4l); + O4l = _mm_add_epi32(O4l, E5l); + O4l = _mm_add_epi32(O4l, E6l); + O4l = _mm_add_epi32(O4l, E7l); + + O4h = _mm_add_epi32(E0h, E1h); + O4h = _mm_add_epi32(O4h, E2h); + O4h = _mm_add_epi32(O4h, E3h); + O4h = _mm_add_epi32(O4h, E4h); + O4h = _mm_add_epi32(O4h, E5h); + O4h = _mm_add_epi32(O4h, E6h); + O4h = _mm_add_epi32(O4h, E7h); + + /* Compute O5*/ + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][5]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][5]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][5]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][5]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][5]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][5]))); + E3l = 
_mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][5]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][5]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][5]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][5]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][5]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][5]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][5]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][5]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][5]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][5]))); + + O5l = _mm_add_epi32(E0l, E1l); + O5l = _mm_add_epi32(O5l, E2l); + O5l = _mm_add_epi32(O5l, E3l); + O5l = _mm_add_epi32(O5l, E4l); + O5l = _mm_add_epi32(O5l, E5l); + O5l = _mm_add_epi32(O5l, E6l); + O5l = _mm_add_epi32(O5l, E7l); + + O5h = _mm_add_epi32(E0h, E1h); + O5h = _mm_add_epi32(O5h, E2h); + O5h = _mm_add_epi32(O5h, E3h); + O5h = _mm_add_epi32(O5h, E4h); + O5h = _mm_add_epi32(O5h, E5h); + O5h = _mm_add_epi32(O5h, E6h); + O5h = _mm_add_epi32(O5h, E7h); + + /* Compute O6*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][6]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][6]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][6]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][6]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][6]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][6]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][6]))); + E3h = 
_mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][6]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][6]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][6]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][6]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][6]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][6]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][6]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][6]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][6]))); + + O6l = _mm_add_epi32(E0l, E1l); + O6l = _mm_add_epi32(O6l, E2l); + O6l = _mm_add_epi32(O6l, E3l); + O6l = _mm_add_epi32(O6l, E4l); + O6l = _mm_add_epi32(O6l, E5l); + O6l = _mm_add_epi32(O6l, E6l); + O6l = _mm_add_epi32(O6l, E7l); + + O6h = _mm_add_epi32(E0h, E1h); + O6h = _mm_add_epi32(O6h, E2h); + O6h = _mm_add_epi32(O6h, E3h); + O6h = _mm_add_epi32(O6h, E4h); + O6h = _mm_add_epi32(O6h, E5h); + O6h = _mm_add_epi32(O6h, E6h); + O6h = _mm_add_epi32(O6h, E7h); + + /* Compute O7*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][7]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][7]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][7]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][7]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][7]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][7]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][7]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][7]))); + + E4l = 
_mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][7]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][7]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][7]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][7]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][7]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][7]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][7]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][7]))); + + O7l = _mm_add_epi32(E0l, E1l); + O7l = _mm_add_epi32(O7l, E2l); + O7l = _mm_add_epi32(O7l, E3l); + O7l = _mm_add_epi32(O7l, E4l); + O7l = _mm_add_epi32(O7l, E5l); + O7l = _mm_add_epi32(O7l, E6l); + O7l = _mm_add_epi32(O7l, E7l); + + O7h = _mm_add_epi32(E0h, E1h); + O7h = _mm_add_epi32(O7h, E2h); + O7h = _mm_add_epi32(O7h, E3h); + O7h = _mm_add_epi32(O7h, E4h); + O7h = _mm_add_epi32(O7h, E5h); + O7h = _mm_add_epi32(O7h, E6h); + O7h = _mm_add_epi32(O7h, E7h); + + /* Compute O8*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][8]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][8]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][8]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][8]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][8]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][8]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][8]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][8]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][8]))); + E4h = 
_mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][8]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][8]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][8]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][8]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][8]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][8]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][8]))); + + O8l = _mm_add_epi32(E0l, E1l); + O8l = _mm_add_epi32(O8l, E2l); + O8l = _mm_add_epi32(O8l, E3l); + O8l = _mm_add_epi32(O8l, E4l); + O8l = _mm_add_epi32(O8l, E5l); + O8l = _mm_add_epi32(O8l, E6l); + O8l = _mm_add_epi32(O8l, E7l); + + O8h = _mm_add_epi32(E0h, E1h); + O8h = _mm_add_epi32(O8h, E2h); + O8h = _mm_add_epi32(O8h, E3h); + O8h = _mm_add_epi32(O8h, E4h); + O8h = _mm_add_epi32(O8h, E5h); + O8h = _mm_add_epi32(O8h, E6h); + O8h = _mm_add_epi32(O8h, E7h); + + /* Compute O9*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][9]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][9]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][9]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][9]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][9]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][9]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][9]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][9]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][9]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][9]))); + E5l = 
_mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][9]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][9]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][9]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][9]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][9]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][9]))); + + O9l = _mm_add_epi32(E0l, E1l); + O9l = _mm_add_epi32(O9l, E2l); + O9l = _mm_add_epi32(O9l, E3l); + O9l = _mm_add_epi32(O9l, E4l); + O9l = _mm_add_epi32(O9l, E5l); + O9l = _mm_add_epi32(O9l, E6l); + O9l = _mm_add_epi32(O9l, E7l); + + O9h = _mm_add_epi32(E0h, E1h); + O9h = _mm_add_epi32(O9h, E2h); + O9h = _mm_add_epi32(O9h, E3h); + O9h = _mm_add_epi32(O9h, E4h); + O9h = _mm_add_epi32(O9h, E5h); + O9h = _mm_add_epi32(O9h, E6h); + O9h = _mm_add_epi32(O9h, E7h); + + /* Compute 10*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][10]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][10]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][10]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][10]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][10]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][10]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][10]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][10]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][10]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][10]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][10]))); + 
E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][10]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][10]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][10]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][10]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][10]))); + + O10l = _mm_add_epi32(E0l, E1l); + O10l = _mm_add_epi32(O10l, E2l); + O10l = _mm_add_epi32(O10l, E3l); + O10l = _mm_add_epi32(O10l, E4l); + O10l = _mm_add_epi32(O10l, E5l); + O10l = _mm_add_epi32(O10l, E6l); + O10l = _mm_add_epi32(O10l, E7l); + + O10h = _mm_add_epi32(E0h, E1h); + O10h = _mm_add_epi32(O10h, E2h); + O10h = _mm_add_epi32(O10h, E3h); + O10h = _mm_add_epi32(O10h, E4h); + O10h = _mm_add_epi32(O10h, E5h); + O10h = _mm_add_epi32(O10h, E6h); + O10h = _mm_add_epi32(O10h, E7h); + + /* Compute 11*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][11]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][11]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][11]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][11]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][11]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][11]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][11]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][11]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][11]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][11]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][11]))); + E5h = _mm_madd_epi16(m128Tmp11, + 
_mm_load_si128((__m128i *) (transform32x32[5][11]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][11]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][11]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][11]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][11]))); + + O11l = _mm_add_epi32(E0l, E1l); + O11l = _mm_add_epi32(O11l, E2l); + O11l = _mm_add_epi32(O11l, E3l); + O11l = _mm_add_epi32(O11l, E4l); + O11l = _mm_add_epi32(O11l, E5l); + O11l = _mm_add_epi32(O11l, E6l); + O11l = _mm_add_epi32(O11l, E7l); + + O11h = _mm_add_epi32(E0h, E1h); + O11h = _mm_add_epi32(O11h, E2h); + O11h = _mm_add_epi32(O11h, E3h); + O11h = _mm_add_epi32(O11h, E4h); + O11h = _mm_add_epi32(O11h, E5h); + O11h = _mm_add_epi32(O11h, E6h); + O11h = _mm_add_epi32(O11h, E7h); + + /* Compute 12*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][12]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][12]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][12]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][12]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][12]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][12]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][12]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][12]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][12]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][12]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][12]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][12]))); 
+ E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][12]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][12]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][12]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][12]))); + + O12l = _mm_add_epi32(E0l, E1l); + O12l = _mm_add_epi32(O12l, E2l); + O12l = _mm_add_epi32(O12l, E3l); + O12l = _mm_add_epi32(O12l, E4l); + O12l = _mm_add_epi32(O12l, E5l); + O12l = _mm_add_epi32(O12l, E6l); + O12l = _mm_add_epi32(O12l, E7l); + + O12h = _mm_add_epi32(E0h, E1h); + O12h = _mm_add_epi32(O12h, E2h); + O12h = _mm_add_epi32(O12h, E3h); + O12h = _mm_add_epi32(O12h, E4h); + O12h = _mm_add_epi32(O12h, E5h); + O12h = _mm_add_epi32(O12h, E6h); + O12h = _mm_add_epi32(O12h, E7h); + + /* Compute 13*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][13]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][13]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][13]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][13]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][13]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][13]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][13]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][13]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][13]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][13]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][13]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][13]))); + E6l = _mm_madd_epi16(m128Tmp12, + 
_mm_load_si128((__m128i *) (transform32x32[6][13]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][13]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][13]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][13]))); + + O13l = _mm_add_epi32(E0l, E1l); + O13l = _mm_add_epi32(O13l, E2l); + O13l = _mm_add_epi32(O13l, E3l); + O13l = _mm_add_epi32(O13l, E4l); + O13l = _mm_add_epi32(O13l, E5l); + O13l = _mm_add_epi32(O13l, E6l); + O13l = _mm_add_epi32(O13l, E7l); + + O13h = _mm_add_epi32(E0h, E1h); + O13h = _mm_add_epi32(O13h, E2h); + O13h = _mm_add_epi32(O13h, E3h); + O13h = _mm_add_epi32(O13h, E4h); + O13h = _mm_add_epi32(O13h, E5h); + O13h = _mm_add_epi32(O13h, E6h); + O13h = _mm_add_epi32(O13h, E7h); + + /* Compute O14 */ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][14]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][14]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][14]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][14]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][14]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][14]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][14]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][14]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][14]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][14]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][14]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][14]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) 
(transform32x32[6][14]))); + E6h = _mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][14]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][14]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][14]))); + + O14l = _mm_add_epi32(E0l, E1l); + O14l = _mm_add_epi32(O14l, E2l); + O14l = _mm_add_epi32(O14l, E3l); + O14l = _mm_add_epi32(O14l, E4l); + O14l = _mm_add_epi32(O14l, E5l); + O14l = _mm_add_epi32(O14l, E6l); + O14l = _mm_add_epi32(O14l, E7l); + + O14h = _mm_add_epi32(E0h, E1h); + O14h = _mm_add_epi32(O14h, E2h); + O14h = _mm_add_epi32(O14h, E3h); + O14h = _mm_add_epi32(O14h, E4h); + O14h = _mm_add_epi32(O14h, E5h); + O14h = _mm_add_epi32(O14h, E6h); + O14h = _mm_add_epi32(O14h, E7h); + + /* Compute O15*/ + + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform32x32[0][15]))); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform32x32[0][15]))); + E1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform32x32[1][15]))); + E1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform32x32[1][15]))); + E2l = _mm_madd_epi16(m128Tmp4, + _mm_load_si128((__m128i *) (transform32x32[2][15]))); + E2h = _mm_madd_epi16(m128Tmp5, + _mm_load_si128((__m128i *) (transform32x32[2][15]))); + E3l = _mm_madd_epi16(m128Tmp6, + _mm_load_si128((__m128i *) (transform32x32[3][15]))); + E3h = _mm_madd_epi16(m128Tmp7, + _mm_load_si128((__m128i *) (transform32x32[3][15]))); + + E4l = _mm_madd_epi16(m128Tmp8, + _mm_load_si128((__m128i *) (transform32x32[4][15]))); + E4h = _mm_madd_epi16(m128Tmp9, + _mm_load_si128((__m128i *) (transform32x32[4][15]))); + E5l = _mm_madd_epi16(m128Tmp10, + _mm_load_si128((__m128i *) (transform32x32[5][15]))); + E5h = _mm_madd_epi16(m128Tmp11, + _mm_load_si128((__m128i *) (transform32x32[5][15]))); + E6l = _mm_madd_epi16(m128Tmp12, + _mm_load_si128((__m128i *) (transform32x32[6][15]))); + E6h = 
_mm_madd_epi16(m128Tmp13, + _mm_load_si128((__m128i *) (transform32x32[6][15]))); + E7l = _mm_madd_epi16(m128Tmp14, + _mm_load_si128((__m128i *) (transform32x32[7][15]))); + E7h = _mm_madd_epi16(m128Tmp15, + _mm_load_si128((__m128i *) (transform32x32[7][15]))); + + O15l = _mm_add_epi32(E0l, E1l); + O15l = _mm_add_epi32(O15l, E2l); + O15l = _mm_add_epi32(O15l, E3l); + O15l = _mm_add_epi32(O15l, E4l); + O15l = _mm_add_epi32(O15l, E5l); + O15l = _mm_add_epi32(O15l, E6l); + O15l = _mm_add_epi32(O15l, E7l); + + O15h = _mm_add_epi32(E0h, E1h); + O15h = _mm_add_epi32(O15h, E2h); + O15h = _mm_add_epi32(O15h, E3h); + O15h = _mm_add_epi32(O15h, E4h); + O15h = _mm_add_epi32(O15h, E5h); + O15h = _mm_add_epi32(O15h, E6h); + O15h = _mm_add_epi32(O15h, E7h); + /* Compute E0 */ + + m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6); + E0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][0]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6); + E0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][0]))); + + m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14); + E0l = _mm_add_epi32(E0l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][0])))); + m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14); + E0h = _mm_add_epi32(E0h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][0])))); + + m128Tmp4 = _mm_unpacklo_epi16(m128iS18, m128iS22); + E0l = _mm_add_epi32(E0l, + _mm_madd_epi16(m128Tmp4, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][0])))); + m128Tmp5 = _mm_unpackhi_epi16(m128iS18, m128iS22); + E0h = _mm_add_epi32(E0h, + _mm_madd_epi16(m128Tmp5, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][0])))); + + m128Tmp6 = _mm_unpacklo_epi16(m128iS26, m128iS30); + E0l = _mm_add_epi32(E0l, + _mm_madd_epi16(m128Tmp6, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][0])))); + m128Tmp7 = _mm_unpackhi_epi16(m128iS26, m128iS30); + E0h = _mm_add_epi32(E0h, + 
_mm_madd_epi16(m128Tmp7, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][0])))); + + /* Compute E1 */ + E1l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][1]))); + E1h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][1]))); + E1l = _mm_add_epi32(E1l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][1])))); + E1h = _mm_add_epi32(E1h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][1])))); + E1l = _mm_add_epi32(E1l, + _mm_madd_epi16(m128Tmp4, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][1])))); + E1h = _mm_add_epi32(E1h, + _mm_madd_epi16(m128Tmp5, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][1])))); + E1l = _mm_add_epi32(E1l, + _mm_madd_epi16(m128Tmp6, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][1])))); + E1h = _mm_add_epi32(E1h, + _mm_madd_epi16(m128Tmp7, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][1])))); + + /* Compute E2 */ + E2l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][2]))); + E2h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][2]))); + E2l = _mm_add_epi32(E2l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][2])))); + E2h = _mm_add_epi32(E2h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][2])))); + E2l = _mm_add_epi32(E2l, + _mm_madd_epi16(m128Tmp4, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][2])))); + E2h = _mm_add_epi32(E2h, + _mm_madd_epi16(m128Tmp5, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][2])))); + E2l = _mm_add_epi32(E2l, + _mm_madd_epi16(m128Tmp6, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][2])))); + E2h = _mm_add_epi32(E2h, + _mm_madd_epi16(m128Tmp7, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][2])))); + + /* Compute E3 */ + E3l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][3]))); + E3h = 
_mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][3]))); + E3l = _mm_add_epi32(E3l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][3])))); + E3h = _mm_add_epi32(E3h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][3])))); + E3l = _mm_add_epi32(E3l, + _mm_madd_epi16(m128Tmp4, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][3])))); + E3h = _mm_add_epi32(E3h, + _mm_madd_epi16(m128Tmp5, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][3])))); + E3l = _mm_add_epi32(E3l, + _mm_madd_epi16(m128Tmp6, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][3])))); + E3h = _mm_add_epi32(E3h, + _mm_madd_epi16(m128Tmp7, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][3])))); + + /* Compute E4 */ + E4l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][4]))); + E4h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][4]))); + E4l = _mm_add_epi32(E4l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][4])))); + E4h = _mm_add_epi32(E4h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][4])))); + E4l = _mm_add_epi32(E4l, + _mm_madd_epi16(m128Tmp4, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][4])))); + E4h = _mm_add_epi32(E4h, + _mm_madd_epi16(m128Tmp5, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][4])))); + E4l = _mm_add_epi32(E4l, + _mm_madd_epi16(m128Tmp6, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][4])))); + E4h = _mm_add_epi32(E4h, + _mm_madd_epi16(m128Tmp7, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][4])))); + + /* Compute E5 */ + E5l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][5]))); + E5h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][5]))); + E5l = _mm_add_epi32(E5l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][5])))); + E5h = 
_mm_add_epi32(E5h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][5])))); + E5l = _mm_add_epi32(E5l, + _mm_madd_epi16(m128Tmp4, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][5])))); + E5h = _mm_add_epi32(E5h, + _mm_madd_epi16(m128Tmp5, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][5])))); + E5l = _mm_add_epi32(E5l, + _mm_madd_epi16(m128Tmp6, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][5])))); + E5h = _mm_add_epi32(E5h, + _mm_madd_epi16(m128Tmp7, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][5])))); + + /* Compute E6 */ + E6l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][6]))); + E6h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][6]))); + E6l = _mm_add_epi32(E6l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][6])))); + E6h = _mm_add_epi32(E6h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][6])))); + E6l = _mm_add_epi32(E6l, + _mm_madd_epi16(m128Tmp4, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][6])))); + E6h = _mm_add_epi32(E6h, + _mm_madd_epi16(m128Tmp5, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][6])))); + E6l = _mm_add_epi32(E6l, + _mm_madd_epi16(m128Tmp6, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][6])))); + E6h = _mm_add_epi32(E6h, + _mm_madd_epi16(m128Tmp7, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][6])))); + + /* Compute E7 */ + E7l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_1[0][7]))); + E7h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_1[0][7]))); + E7l = _mm_add_epi32(E7l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][7])))); + E7h = _mm_add_epi32(E7h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_1[1][7])))); + E7l = _mm_add_epi32(E7l, + _mm_madd_epi16(m128Tmp4, + _mm_load_si128( + (__m128i *) 
(transform16x16_1[2][7])))); + E7h = _mm_add_epi32(E7h, + _mm_madd_epi16(m128Tmp5, + _mm_load_si128( + (__m128i *) (transform16x16_1[2][7])))); + E7l = _mm_add_epi32(E7l, + _mm_madd_epi16(m128Tmp6, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][7])))); + E7h = _mm_add_epi32(E7h, + _mm_madd_epi16(m128Tmp7, + _mm_load_si128( + (__m128i *) (transform16x16_1[3][7])))); + + /* Compute E00 to E03 */ + + m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12); + E00l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_2[0][0]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12); + E00h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_2[0][0]))); + + m128Tmp2 = _mm_unpacklo_epi16(m128iS20, m128iS28); + E00l = _mm_add_epi32(E00l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][0])))); + m128Tmp3 = _mm_unpackhi_epi16(m128iS20, m128iS28); + E00h = _mm_add_epi32(E00h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][0])))); + + E01l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_2[0][1]))); + E01h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_2[0][1]))); + E01l = _mm_add_epi32(E01l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][1])))); + E01h = _mm_add_epi32(E01h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][1])))); + + E02l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_2[0][2]))); + E02h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_2[0][2]))); + E02l = _mm_add_epi32(E02l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][2])))); + E02h = _mm_add_epi32(E02h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][2])))); + + E03l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_2[0][3]))); + E03h = _mm_madd_epi16(m128Tmp1, + 
_mm_load_si128((__m128i *) (transform16x16_2[0][3]))); + E03l = _mm_add_epi32(E03l, + _mm_madd_epi16(m128Tmp2, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][3])))); + E03h = _mm_add_epi32(E03h, + _mm_madd_epi16(m128Tmp3, + _mm_load_si128( + (__m128i *) (transform16x16_2[1][3])))); + + /* Compute EE0 and EEE */ + + m128Tmp0 = _mm_unpacklo_epi16(m128iS8, m128iS24); + EE0l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_3[0][0]))); + m128Tmp1 = _mm_unpackhi_epi16(m128iS8, m128iS24); + EE0h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_3[0][0]))); + + m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS16); + EEE0l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform16x16_3[1][0]))); + m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS16); + EEE0h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform16x16_3[1][0]))); + + EE1l = _mm_madd_epi16(m128Tmp0, + _mm_load_si128((__m128i *) (transform16x16_3[0][1]))); + EE1h = _mm_madd_epi16(m128Tmp1, + _mm_load_si128((__m128i *) (transform16x16_3[0][1]))); + + EEE1l = _mm_madd_epi16(m128Tmp2, + _mm_load_si128((__m128i *) (transform16x16_3[1][1]))); + EEE1h = _mm_madd_epi16(m128Tmp3, + _mm_load_si128((__m128i *) (transform16x16_3[1][1]))); + + /* Compute EE */ + + EE2l = _mm_sub_epi32(EEE1l, EE1l); + EE3l = _mm_sub_epi32(EEE0l, EE0l); + EE2h = _mm_sub_epi32(EEE1h, EE1h); + EE3h = _mm_sub_epi32(EEE0h, EE0h); + + EE0l = _mm_add_epi32(EEE0l, EE0l); + EE1l = _mm_add_epi32(EEE1l, EE1l); + EE0h = _mm_add_epi32(EEE0h, EE0h); + EE1h = _mm_add_epi32(EEE1h, EE1h); + /**/ + + EE7l = _mm_sub_epi32(EE0l, E00l); + EE6l = _mm_sub_epi32(EE1l, E01l); + EE5l = _mm_sub_epi32(EE2l, E02l); + EE4l = _mm_sub_epi32(EE3l, E03l); + + EE7h = _mm_sub_epi32(EE0h, E00h); + EE6h = _mm_sub_epi32(EE1h, E01h); + EE5h = _mm_sub_epi32(EE2h, E02h); + EE4h = _mm_sub_epi32(EE3h, E03h); + + EE0l = _mm_add_epi32(EE0l, E00l); + EE1l = _mm_add_epi32(EE1l, E01l); + EE2l = _mm_add_epi32(EE2l, E02l); + 
EE3l = _mm_add_epi32(EE3l, E03l); + + EE0h = _mm_add_epi32(EE0h, E00h); + EE1h = _mm_add_epi32(EE1h, E01h); + EE2h = _mm_add_epi32(EE2h, E02h); + EE3h = _mm_add_epi32(EE3h, E03h); + /* Compute E */ + + E15l = _mm_sub_epi32(EE0l, E0l); + E15l = _mm_add_epi32(E15l, m128iAdd); + E14l = _mm_sub_epi32(EE1l, E1l); + E14l = _mm_add_epi32(E14l, m128iAdd); + E13l = _mm_sub_epi32(EE2l, E2l); + E13l = _mm_add_epi32(E13l, m128iAdd); + E12l = _mm_sub_epi32(EE3l, E3l); + E12l = _mm_add_epi32(E12l, m128iAdd); + E11l = _mm_sub_epi32(EE4l, E4l); + E11l = _mm_add_epi32(E11l, m128iAdd); + E10l = _mm_sub_epi32(EE5l, E5l); + E10l = _mm_add_epi32(E10l, m128iAdd); + E9l = _mm_sub_epi32(EE6l, E6l); + E9l = _mm_add_epi32(E9l, m128iAdd); + E8l = _mm_sub_epi32(EE7l, E7l); + E8l = _mm_add_epi32(E8l, m128iAdd); + + E0l = _mm_add_epi32(EE0l, E0l); + E0l = _mm_add_epi32(E0l, m128iAdd); + E1l = _mm_add_epi32(EE1l, E1l); + E1l = _mm_add_epi32(E1l, m128iAdd); + E2l = _mm_add_epi32(EE2l, E2l); + E2l = _mm_add_epi32(E2l, m128iAdd); + E3l = _mm_add_epi32(EE3l, E3l); + E3l = _mm_add_epi32(E3l, m128iAdd); + E4l = _mm_add_epi32(EE4l, E4l); + E4l = _mm_add_epi32(E4l, m128iAdd); + E5l = _mm_add_epi32(EE5l, E5l); + E5l = _mm_add_epi32(E5l, m128iAdd); + E6l = _mm_add_epi32(EE6l, E6l); + E6l = _mm_add_epi32(E6l, m128iAdd); + E7l = _mm_add_epi32(EE7l, E7l); + E7l = _mm_add_epi32(E7l, m128iAdd); + + E15h = _mm_sub_epi32(EE0h, E0h); + E15h = _mm_add_epi32(E15h, m128iAdd); + E14h = _mm_sub_epi32(EE1h, E1h); + E14h = _mm_add_epi32(E14h, m128iAdd); + E13h = _mm_sub_epi32(EE2h, E2h); + E13h = _mm_add_epi32(E13h, m128iAdd); + E12h = _mm_sub_epi32(EE3h, E3h); + E12h = _mm_add_epi32(E12h, m128iAdd); + E11h = _mm_sub_epi32(EE4h, E4h); + E11h = _mm_add_epi32(E11h, m128iAdd); + E10h = _mm_sub_epi32(EE5h, E5h); + E10h = _mm_add_epi32(E10h, m128iAdd); + E9h = _mm_sub_epi32(EE6h, E6h); + E9h = _mm_add_epi32(E9h, m128iAdd); + E8h = _mm_sub_epi32(EE7h, E7h); + E8h = _mm_add_epi32(E8h, m128iAdd); + + E0h = _mm_add_epi32(EE0h, 
E0h); + E0h = _mm_add_epi32(E0h, m128iAdd); + E1h = _mm_add_epi32(EE1h, E1h); + E1h = _mm_add_epi32(E1h, m128iAdd); + E2h = _mm_add_epi32(EE2h, E2h); + E2h = _mm_add_epi32(E2h, m128iAdd); + E3h = _mm_add_epi32(EE3h, E3h); + E3h = _mm_add_epi32(E3h, m128iAdd); + E4h = _mm_add_epi32(EE4h, E4h); + E4h = _mm_add_epi32(E4h, m128iAdd); + E5h = _mm_add_epi32(EE5h, E5h); + E5h = _mm_add_epi32(E5h, m128iAdd); + E6h = _mm_add_epi32(EE6h, E6h); + E6h = _mm_add_epi32(E6h, m128iAdd); + E7h = _mm_add_epi32(EE7h, E7h); + E7h = _mm_add_epi32(E7h, m128iAdd); + + m128iS0 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift), + _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift)); + m128iS1 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift), + _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift)); + m128iS2 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift), + _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift)); + m128iS3 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift), + _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift)); + m128iS4 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift), + _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift)); + m128iS5 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift), + _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift)); + m128iS6 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift), + _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift)); + m128iS7 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift), + _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift)); + m128iS8 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E8l, O8l), shift), + _mm_srai_epi32(_mm_add_epi32(E8h, O8h), shift)); + m128iS9 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E9l, O9l), shift), + _mm_srai_epi32(_mm_add_epi32(E9h, O9h), shift)); + m128iS10 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E10l, O10l), shift), + _mm_srai_epi32(_mm_add_epi32(E10h, O10h), shift)); + m128iS11 = 
_mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E11l, O11l), shift), + _mm_srai_epi32(_mm_add_epi32(E11h, O11h), shift)); + m128iS12 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E12l, O12l), shift), + _mm_srai_epi32(_mm_add_epi32(E12h, O12h), shift)); + m128iS13 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E13l, O13l), shift), + _mm_srai_epi32(_mm_add_epi32(E13h, O13h), shift)); + m128iS14 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E14l, O14l), shift), + _mm_srai_epi32(_mm_add_epi32(E14h, O14h), shift)); + m128iS15 = _mm_packs_epi32( + _mm_srai_epi32(_mm_add_epi32(E15l, O15l), shift), + _mm_srai_epi32(_mm_add_epi32(E15h, O15h), shift)); + + m128iS31 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift), + _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift)); + m128iS30 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift), + _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift)); + m128iS29 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift), + _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift)); + m128iS28 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift), + _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift)); + m128iS27 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift), + _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift)); + m128iS26 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift), + _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift)); + m128iS25 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift), + _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift)); + m128iS24 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift), + _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift)); + m128iS23 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E8l, O8l), shift), + _mm_srai_epi32(_mm_sub_epi32(E8h, O8h), shift)); + m128iS22 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E9l, O9l), shift), + _mm_srai_epi32(_mm_sub_epi32(E9h, O9h), shift)); + m128iS21 = 
_mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E10l, O10l), shift), + _mm_srai_epi32(_mm_sub_epi32(E10h, O10h), shift)); + m128iS20 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E11l, O11l), shift), + _mm_srai_epi32(_mm_sub_epi32(E11h, O11h), shift)); + m128iS19 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E12l, O12l), shift), + _mm_srai_epi32(_mm_sub_epi32(E12h, O12h), shift)); + m128iS18 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E13l, O13l), shift), + _mm_srai_epi32(_mm_sub_epi32(E13h, O13h), shift)); + m128iS17 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E14l, O14l), shift), + _mm_srai_epi32(_mm_sub_epi32(E14h, O14h), shift)); + m128iS16 = _mm_packs_epi32( + _mm_srai_epi32(_mm_sub_epi32(E15l, O15l), shift), + _mm_srai_epi32(_mm_sub_epi32(E15h, O15h), shift)); + + if (!j) { + /* Inverse the matrix */ + E0l = _mm_unpacklo_epi16(m128iS0, m128iS16); + E1l = _mm_unpacklo_epi16(m128iS1, m128iS17); + E2l = _mm_unpacklo_epi16(m128iS2, m128iS18); + E3l = _mm_unpacklo_epi16(m128iS3, m128iS19); + E4l = _mm_unpacklo_epi16(m128iS4, m128iS20); + E5l = _mm_unpacklo_epi16(m128iS5, m128iS21); + E6l = _mm_unpacklo_epi16(m128iS6, m128iS22); + E7l = _mm_unpacklo_epi16(m128iS7, m128iS23); + E8l = _mm_unpacklo_epi16(m128iS8, m128iS24); + E9l = _mm_unpacklo_epi16(m128iS9, m128iS25); + E10l = _mm_unpacklo_epi16(m128iS10, m128iS26); + E11l = _mm_unpacklo_epi16(m128iS11, m128iS27); + E12l = _mm_unpacklo_epi16(m128iS12, m128iS28); + E13l = _mm_unpacklo_epi16(m128iS13, m128iS29); + E14l = _mm_unpacklo_epi16(m128iS14, m128iS30); + E15l = _mm_unpacklo_epi16(m128iS15, m128iS31); + + O0l = _mm_unpackhi_epi16(m128iS0, m128iS16); + O1l = _mm_unpackhi_epi16(m128iS1, m128iS17); + O2l = _mm_unpackhi_epi16(m128iS2, m128iS18); + O3l = _mm_unpackhi_epi16(m128iS3, m128iS19); + O4l = _mm_unpackhi_epi16(m128iS4, m128iS20); + O5l = _mm_unpackhi_epi16(m128iS5, m128iS21); + O6l = _mm_unpackhi_epi16(m128iS6, m128iS22); + O7l = _mm_unpackhi_epi16(m128iS7, m128iS23); + O8l = 
_mm_unpackhi_epi16(m128iS8, m128iS24); + O9l = _mm_unpackhi_epi16(m128iS9, m128iS25); + O10l = _mm_unpackhi_epi16(m128iS10, m128iS26); + O11l = _mm_unpackhi_epi16(m128iS11, m128iS27); + O12l = _mm_unpackhi_epi16(m128iS12, m128iS28); + O13l = _mm_unpackhi_epi16(m128iS13, m128iS29); + O14l = _mm_unpackhi_epi16(m128iS14, m128iS30); + O15l = _mm_unpackhi_epi16(m128iS15, m128iS31); + + E0h = _mm_unpacklo_epi16(E0l, E8l); + E1h = _mm_unpacklo_epi16(E1l, E9l); + E2h = _mm_unpacklo_epi16(E2l, E10l); + E3h = _mm_unpacklo_epi16(E3l, E11l); + E4h = _mm_unpacklo_epi16(E4l, E12l); + E5h = _mm_unpacklo_epi16(E5l, E13l); + E6h = _mm_unpacklo_epi16(E6l, E14l); + E7h = _mm_unpacklo_epi16(E7l, E15l); + + E8h = _mm_unpackhi_epi16(E0l, E8l); + E9h = _mm_unpackhi_epi16(E1l, E9l); + E10h = _mm_unpackhi_epi16(E2l, E10l); + E11h = _mm_unpackhi_epi16(E3l, E11l); + E12h = _mm_unpackhi_epi16(E4l, E12l); + E13h = _mm_unpackhi_epi16(E5l, E13l); + E14h = _mm_unpackhi_epi16(E6l, E14l); + E15h = _mm_unpackhi_epi16(E7l, E15l); + + m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h); + m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h); + m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h); + m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h); + m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h); + m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h); + m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + 
m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h); + m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h); + m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h); + m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h); + m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h); + m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h); + m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + /* */ + E0h = _mm_unpacklo_epi16(O0l, O8l); + E1h = _mm_unpacklo_epi16(O1l, O9l); + E2h = _mm_unpacklo_epi16(O2l, O10l); + E3h = _mm_unpacklo_epi16(O3l, O11l); + E4h = _mm_unpacklo_epi16(O4l, O12l); + E5h = _mm_unpacklo_epi16(O5l, O13l); + E6h = _mm_unpacklo_epi16(O6l, O14l); + E7h = _mm_unpacklo_epi16(O7l, O15l); + + E8h = _mm_unpackhi_epi16(O0l, O8l); + E9h = _mm_unpackhi_epi16(O1l, O9l); + E10h = _mm_unpackhi_epi16(O2l, O10l); + E11h = _mm_unpackhi_epi16(O3l, O11l); + E12h = _mm_unpackhi_epi16(O4l, O12l); + E13h = _mm_unpackhi_epi16(O5l, O13l); + E14h = 
_mm_unpackhi_epi16(O6l, O14l); + E15h = _mm_unpackhi_epi16(O7l, O15l); + + m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h); + m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h); + m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h); + m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS16 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS17 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS18 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS19 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h); + m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h); + m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h); + m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS20 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS21 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS22 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS23 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h); + m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h); + m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h); + m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS24 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS25 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS26 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS27 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h); + m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h); + m128Tmp2 = 
_mm_unpackhi_epi16(E10h, E14h); + m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h); + + m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); + m128iS28 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS29 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + + m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); + m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); + m128iS30 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); + m128iS31 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); + /* */ + _mm_store_si128((__m128i *) (src + i), m128iS0); + _mm_store_si128((__m128i *) (src + 32 + i), m128iS1); + _mm_store_si128((__m128i *) (src + 64 + i), m128iS2); + _mm_store_si128((__m128i *) (src + 96 + i), m128iS3); + _mm_store_si128((__m128i *) (src + 128 + i), m128iS4); + _mm_store_si128((__m128i *) (src + 160 + i), m128iS5); + _mm_store_si128((__m128i *) (src + 192 + i), m128iS6); + _mm_store_si128((__m128i *) (src + 224 + i), m128iS7); + _mm_store_si128((__m128i *) (src + 256 + i), m128iS8); + _mm_store_si128((__m128i *) (src + 288 + i), m128iS9); + _mm_store_si128((__m128i *) (src + 320 + i), m128iS10); + _mm_store_si128((__m128i *) (src + 352 + i), m128iS11); + _mm_store_si128((__m128i *) (src + 384 + i), m128iS12); + _mm_store_si128((__m128i *) (src + 416 + i), m128iS13); + _mm_store_si128((__m128i *) (src + 448 + i), m128iS14); + _mm_store_si128((__m128i *) (src + 480 + i), m128iS15); + _mm_store_si128((__m128i *) (src + 512 + i), m128iS16); + _mm_store_si128((__m128i *) (src + 544 + i), m128iS17); + _mm_store_si128((__m128i *) (src + 576 + i), m128iS18); + _mm_store_si128((__m128i *) (src + 608 + i), m128iS19); + _mm_store_si128((__m128i *) (src + 640 + i), m128iS20); + _mm_store_si128((__m128i *) (src + 672 + i), m128iS21); + _mm_store_si128((__m128i *) (src + 704 + i), m128iS22); + _mm_store_si128((__m128i *) (src + 736 + i), m128iS23); + _mm_store_si128((__m128i *) (src + 768 + i), m128iS24); + _mm_store_si128((__m128i *) (src + 800 + i), 
m128iS25); + _mm_store_si128((__m128i *) (src + 832 + i), m128iS26); + _mm_store_si128((__m128i *) (src + 864 + i), m128iS27); + _mm_store_si128((__m128i *) (src + 896 + i), m128iS28); + _mm_store_si128((__m128i *) (src + 928 + i), m128iS29); + _mm_store_si128((__m128i *) (src + 960 + i), m128iS30); + _mm_store_si128((__m128i *) (src + 992 + i), m128iS31); + + if (i <= 16) { + int k = i + 8; + m128iS0 = _mm_load_si128((__m128i *) (src + k)); + m128iS1 = _mm_load_si128((__m128i *) (src + 32 + k)); + m128iS2 = _mm_load_si128((__m128i *) (src + 64 + k)); + m128iS3 = _mm_load_si128((__m128i *) (src + 96 + k)); + m128iS4 = _mm_load_si128((__m128i *) (src + 128 + k)); + m128iS5 = _mm_load_si128((__m128i *) (src + 160 + k)); + m128iS6 = _mm_load_si128((__m128i *) (src + 192 + k)); + m128iS7 = _mm_load_si128((__m128i *) (src + 224 + k)); + m128iS8 = _mm_load_si128((__m128i *) (src + 256 + k)); + m128iS9 = _mm_load_si128((__m128i *) (src + 288 + k)); + m128iS10 = _mm_load_si128((__m128i *) (src + 320 + k)); + m128iS11 = _mm_load_si128((__m128i *) (src + 352 + k)); + m128iS12 = _mm_load_si128((__m128i *) (src + 384 + k)); + m128iS13 = _mm_load_si128((__m128i *) (src + 416 + k)); + m128iS14 = _mm_load_si128((__m128i *) (src + 448 + k)); + m128iS15 = _mm_load_si128((__m128i *) (src + 480 + k)); + + m128iS16 = _mm_load_si128((__m128i *) (src + 512 + k)); + m128iS17 = _mm_load_si128((__m128i *) (src + 544 + k)); + m128iS18 = _mm_load_si128((__m128i *) (src + 576 + k)); + m128iS19 = _mm_load_si128((__m128i *) (src + 608 + k)); + m128iS20 = _mm_load_si128((__m128i *) (src + 640 + k)); + m128iS21 = _mm_load_si128((__m128i *) (src + 672 + k)); + m128iS22 = _mm_load_si128((__m128i *) (src + 704 + k)); + m128iS23 = _mm_load_si128((__m128i *) (src + 736 + k)); + m128iS24 = _mm_load_si128((__m128i *) (src + 768 + k)); + m128iS25 = _mm_load_si128((__m128i *) (src + 800 + k)); + m128iS26 = _mm_load_si128((__m128i *) (src + 832 + k)); + m128iS27 = _mm_load_si128((__m128i *) (src + 864 + 
k)); + m128iS28 = _mm_load_si128((__m128i *) (src + 896 + k)); + m128iS29 = _mm_load_si128((__m128i *) (src + 928 + k)); + m128iS30 = _mm_load_si128((__m128i *) (src + 960 + k)); + m128iS31 = _mm_load_si128((__m128i *) (src + 992 + k)); + } else { + m128iS0 = _mm_load_si128((__m128i *) (src)); + m128iS1 = _mm_load_si128((__m128i *) (src + 128)); + m128iS2 = _mm_load_si128((__m128i *) (src + 256)); + m128iS3 = _mm_load_si128((__m128i *) (src + 384)); + m128iS4 = _mm_loadu_si128((__m128i *) (src + 512)); + m128iS5 = _mm_load_si128((__m128i *) (src + 640)); + m128iS6 = _mm_load_si128((__m128i *) (src + 768)); + m128iS7 = _mm_load_si128((__m128i *) (src + 896)); + m128iS8 = _mm_load_si128((__m128i *) (src + 8)); + m128iS9 = _mm_load_si128((__m128i *) (src + 128 + 8)); + m128iS10 = _mm_load_si128((__m128i *) (src + 256 + 8)); + m128iS11 = _mm_load_si128((__m128i *) (src + 384 + 8)); + m128iS12 = _mm_loadu_si128((__m128i *) (src + 512 + 8)); + m128iS13 = _mm_load_si128((__m128i *) (src + 640 + 8)); + m128iS14 = _mm_load_si128((__m128i *) (src + 768 + 8)); + m128iS15 = _mm_load_si128((__m128i *) (src + 896 + 8)); + m128iS16 = _mm_load_si128((__m128i *) (src + 16)); + m128iS17 = _mm_load_si128((__m128i *) (src + 128 + 16)); + m128iS18 = _mm_load_si128((__m128i *) (src + 256 + 16)); + m128iS19 = _mm_load_si128((__m128i *) (src + 384 + 16)); + m128iS20 = _mm_loadu_si128((__m128i *) (src + 512 + 16)); + m128iS21 = _mm_load_si128((__m128i *) (src + 640 + 16)); + m128iS22 = _mm_load_si128((__m128i *) (src + 768 + 16)); + m128iS23 = _mm_load_si128((__m128i *) (src + 896 + 16)); + m128iS24 = _mm_load_si128((__m128i *) (src + 24)); + m128iS25 = _mm_load_si128((__m128i *) (src + 128 + 24)); + m128iS26 = _mm_load_si128((__m128i *) (src + 256 + 24)); + m128iS27 = _mm_load_si128((__m128i *) (src + 384 + 24)); + m128iS28 = _mm_loadu_si128((__m128i *) (src + 512 + 24)); + m128iS29 = _mm_load_si128((__m128i *) (src + 640 + 24)); + m128iS30 = _mm_load_si128((__m128i *) (src + 768 + 24)); 
+ m128iS31 = _mm_load_si128((__m128i *) (src + 896 + 24)); + shift = shift_2nd; + m128iAdd = _mm_set1_epi32(add_2nd); + } + + } else { + int k, m = 0; + _mm_storeu_si128((__m128i *) (src), m128iS0); + _mm_storeu_si128((__m128i *) (src + 8), m128iS1); + _mm_storeu_si128((__m128i *) (src + 16), m128iS2); + _mm_storeu_si128((__m128i *) (src + 24), m128iS3); + _mm_storeu_si128((__m128i *) (src + 128), m128iS4); + _mm_storeu_si128((__m128i *) (src + 128 + 8), m128iS5); + _mm_storeu_si128((__m128i *) (src + 128 + 16), m128iS6); + _mm_storeu_si128((__m128i *) (src + 128 + 24), m128iS7); + _mm_storeu_si128((__m128i *) (src + 256), m128iS8); + _mm_storeu_si128((__m128i *) (src + 256 + 8), m128iS9); + _mm_storeu_si128((__m128i *) (src + 256 + 16), m128iS10); + _mm_storeu_si128((__m128i *) (src + 256 + 24), m128iS11); + _mm_storeu_si128((__m128i *) (src + 384), m128iS12); + _mm_storeu_si128((__m128i *) (src + 384 + 8), m128iS13); + _mm_storeu_si128((__m128i *) (src + 384 + 16), m128iS14); + _mm_storeu_si128((__m128i *) (src + 384 + 24), m128iS15); + + _mm_storeu_si128((__m128i *) (src + 512), m128iS16); + _mm_storeu_si128((__m128i *) (src + 512 + 8), m128iS17); + _mm_storeu_si128((__m128i *) (src + 512 + 16), m128iS18); + _mm_storeu_si128((__m128i *) (src + 512 + 24), m128iS19); + _mm_storeu_si128((__m128i *) (src + 640), m128iS20); + _mm_storeu_si128((__m128i *) (src + 640 + 8), m128iS21); + _mm_storeu_si128((__m128i *) (src + 640 + 16), m128iS22); + _mm_storeu_si128((__m128i *) (src + 640 + 24), m128iS23); + _mm_storeu_si128((__m128i *) (src + 768), m128iS24); + _mm_storeu_si128((__m128i *) (src + 768 + 8), m128iS25); + _mm_storeu_si128((__m128i *) (src + 768 + 16), m128iS26); + _mm_storeu_si128((__m128i *) (src + 768 + 24), m128iS27); + _mm_storeu_si128((__m128i *) (src + 896), m128iS28); + _mm_storeu_si128((__m128i *) (src + 896 + 8), m128iS29); + _mm_storeu_si128((__m128i *) (src + 896 + 16), m128iS30); + _mm_storeu_si128((__m128i *) (src + 896 + 24), m128iS31); + dst = 
(uint16_t*) _dst + (i * stride); + for (k = 0; k < 8; k++) { + dst[0] = av_clip_uintp2(dst[0] + src[m],10); + dst[1] = av_clip_uintp2(dst[1] + src[m + 8],10); + dst[2] = av_clip_uintp2(dst[2] + src[m + 16],10); + dst[3] = av_clip_uintp2(dst[3] + src[m + 24],10); + dst[4] = av_clip_uintp2( + dst[4] + src[m + 128],10); + dst[5] = av_clip_uintp2( + dst[5] + src[m + 128 + 8],10); + dst[6] = av_clip_uintp2( + dst[6] + src[m + 128 + 16],10); + dst[7] = av_clip_uintp2( + dst[7] + src[m + 128 + 24],10); + + dst[8] = av_clip_uintp2( + dst[8] + src[m + 256],10); + dst[9] = av_clip_uintp2( + dst[9] + src[m + 256 + 8],10); + dst[10] = av_clip_uintp2( + dst[10] + src[m + 256 + 16],10); + dst[11] = av_clip_uintp2( + dst[11] + src[m + 256 + 24],10); + dst[12] = av_clip_uintp2( + dst[12] + src[m + 384],10); + dst[13] = av_clip_uintp2( + dst[13] + src[m + 384 + 8],10); + dst[14] = av_clip_uintp2( + dst[14] + src[m + 384 + 16],10); + dst[15] = av_clip_uintp2( + dst[15] + src[m + 384 + 24],10); + + dst[16] = av_clip_uintp2( + dst[16] + src[m + 512],10); + dst[17] = av_clip_uintp2( + dst[17] + src[m + 512 + 8],10); + dst[18] = av_clip_uintp2( + dst[18] + src[m + 512 + 16],10); + dst[19] = av_clip_uintp2( + dst[19] + src[m + 512 + 24],10); + dst[20] = av_clip_uintp2( + dst[20] + src[m + 640],10); + dst[21] = av_clip_uintp2( + dst[21] + src[m + 640 + 8],10); + dst[22] = av_clip_uintp2( + dst[22] + src[m + 640 + 16],10); + dst[23] = av_clip_uintp2( + dst[23] + src[m + 640 + 24],10); + + dst[24] = av_clip_uintp2( + dst[24] + src[m + 768],10); + dst[25] = av_clip_uintp2( + dst[25] + src[m + 768 + 8],10); + dst[26] = av_clip_uintp2( + dst[26] + src[m + 768 + 16],10); + dst[27] = av_clip_uintp2( + dst[27] + src[m + 768 + 24],10); + dst[28] = av_clip_uintp2( + dst[28] + src[m + 896],10); + dst[29] = av_clip_uintp2( + dst[29] + src[m + 896 + 8],10); + dst[30] = av_clip_uintp2( + dst[30] + src[m + 896 + 16],10); + dst[31] = av_clip_uintp2( + dst[31] + src[m + 896 + 24],10); + + m += 1; + dst += 
stride; + } + if (i <= 16) { + int k = (i + 8) * 4; + m128iS0 = _mm_load_si128((__m128i *) (src + k)); + m128iS1 = _mm_load_si128((__m128i *) (src + 128 + k)); + m128iS2 = _mm_load_si128((__m128i *) (src + 256 + k)); + m128iS3 = _mm_load_si128((__m128i *) (src + 384 + k)); + m128iS4 = _mm_loadu_si128((__m128i *) (src + 512 + k)); + m128iS5 = _mm_load_si128((__m128i *) (src + 640 + k)); + m128iS6 = _mm_load_si128((__m128i *) (src + 768 + k)); + m128iS7 = _mm_load_si128((__m128i *) (src + 896 + k)); + m128iS8 = _mm_load_si128((__m128i *) (src + 8 + k)); + m128iS9 = _mm_load_si128((__m128i *) (src + 128 + 8 + k)); + m128iS10 = _mm_load_si128((__m128i *) (src + 256 + 8 + k)); + m128iS11 = _mm_load_si128((__m128i *) (src + 384 + 8 + k)); + m128iS12 = _mm_loadu_si128((__m128i *) (src + 512 + 8 + k)); + m128iS13 = _mm_load_si128((__m128i *) (src + 640 + 8 + k)); + m128iS14 = _mm_load_si128((__m128i *) (src + 768 + 8 + k)); + m128iS15 = _mm_load_si128((__m128i *) (src + 896 + 8 + k)); + m128iS16 = _mm_load_si128((__m128i *) (src + 16 + k)); + m128iS17 = _mm_load_si128((__m128i *) (src + 128 + 16 + k)); + m128iS18 = _mm_load_si128((__m128i *) (src + 256 + 16 + k)); + m128iS19 = _mm_load_si128((__m128i *) (src + 384 + 16 + k)); + m128iS20 = _mm_loadu_si128( + (__m128i *) (src + 512 + 16 + k)); + m128iS21 = _mm_load_si128((__m128i *) (src + 640 + 16 + k)); + m128iS22 = _mm_load_si128((__m128i *) (src + 768 + 16 + k)); + m128iS23 = _mm_load_si128((__m128i *) (src + 896 + 16 + k)); + m128iS24 = _mm_load_si128((__m128i *) (src + 24 + k)); + m128iS25 = _mm_load_si128((__m128i *) (src + 128 + 24 + k)); + m128iS26 = _mm_load_si128((__m128i *) (src + 256 + 24 + k)); + m128iS27 = _mm_load_si128((__m128i *) (src + 384 + 24 + k)); + m128iS28 = _mm_loadu_si128( + (__m128i *) (src + 512 + 24 + k)); + m128iS29 = _mm_load_si128((__m128i *) (src + 640 + 24 + k)); + m128iS30 = _mm_load_si128((__m128i *) (src + 768 + 24 + k)); + m128iS31 = _mm_load_si128((__m128i *) (src + 896 + 24 + k)); + } 
+ } + } + } +} +#endif + + diff --git a/x86/sse-dct.h b/x86/sse-dct.h new file mode 100644 index 0000000..bc50ade --- /dev/null +++ b/x86/sse-dct.h @@ -0,0 +1,35 @@ +/* + * H.265 video codec. + * Copyright (c) 2013 openHEVC contributors + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef SSE_DCT_H +#define SSE_DCT_H + +#include <stddef.h> /* ptrdiff_t */ +#include <stdint.h> /* uint8_t, int16_t */ + +void ff_hevc_transform_skip_8_sse(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t _stride); /* all entry points below share this shape: destination pointer, int16 coefficients, destination stride */ +void ff_hevc_transform_4x4_luma_add_8_sse4(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_transform_4x4_add_8_sse4(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_transform_8x8_add_8_sse4(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_transform_16x16_add_8_sse4(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); +void ff_hevc_transform_32x32_add_8_sse4(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); + +#endif diff --git a/x86/sse-motion.cc b/x86/sse-motion.cc new file mode 100644 index 0000000..c8c7571 --- /dev/null +++ b/x86/sse-motion.cc @@ -0,0 +1,4971 @@ +/* + * H.265 video codec. + * Copyright (c) 2013 openHEVC contributors + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. 
+ * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include +#include // SSSE3 +#if HAVE_SSE4_1 +#include +#endif + +#include "sse-motion.h" +#include "libde265/util.h" + + +ALIGNED_16(const int8_t) epel_filters[7][16] = { + { -2, 58, 10, -2,-2, 58, 10, -2,-2, 58, 10, -2,-2, 58, 10, -2 }, + { -4, 54, 16, -2,-4, 54, 16, -2,-4, 54, 16, -2,-4, 54, 16, -2 }, + { -6, 46, 28, -4,-6, 46, 28, -4,-6, 46, 28, -4,-6, 46, 28, -4 }, + { -4, 36, 36, -4,-4, 36, 36, -4,-4, 36, 36, -4,-4, 36, 36, -4 }, + { -4, 28, 46, -6,-4, 28, 46, -6,-4, 28, 46, -6,-4, 28, 46, -6 }, + { -2, 16, 54, -4,-2, 16, 54, -4,-2, 16, 54, -4,-2, 16, 54, -4 }, + { -2, 10, 58, -2,-2, 10, 58, -2,-2, 10, 58, -2,-2, 10, 58, -2 }, +}; + +static const uint8_t qpel_extra_before[4] = { 0, 3, 3, 2 }; +static const uint8_t qpel_extra_after[4] = { 0, 3, 4, 4 }; +static const uint8_t qpel_extra[4] = { 0, 6, 7, 6 }; + +static const int epel_extra_before = 1; +static const int epel_extra_after = 2; +static const int epel_extra = 3; + +#define MAX_PB_SIZE 64 + +#define MASKMOVE 0 + +void print128(const char* prefix, __m128i r) +{ + unsigned char buf[16]; + + *(__m128i*)buf = r; + + printf("%s ",prefix); + for (int i=0;i<16;i++) + { + if (i>0) { printf(":"); } + printf("%02x", buf[i]); + } + + printf("\n"); +} + + +void printm32(const char* prefix, unsigned char* p) +{ + 
printf("%s ",prefix); + + for (int i=0;i<4;i++) /* print the 4 bytes at p as colon-separated two-digit hex */ + { + if (i>0) { printf(":"); } + printf("%02x", p[i]); + } + + printf("\n"); +} + + +#define BIT_DEPTH 8 + +void ff_hevc_put_unweighted_pred_8_sse(uint8_t *_dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height) { /* Writes dst[x] = (src[x] + 32) >> 6 for a width x height block, using a saturating 16-bit add and unsigned saturation to 8 bits on the pack. Strides are in elements of the respective pointer type. The loop shape is chosen from the low bits of width. */ + int x, y; + uint8_t *dst = (uint8_t*) _dst; + __m128i r0, r1, f0; + + f0 = _mm_set1_epi16(32); /* rounding term: half of 1<<6 */ + + + if(!(width & 15)) /* width is a multiple of 16: two aligned 8 x int16 loads per step, one unaligned 16-byte store */ + { + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 16) { + r0 = _mm_load_si128((__m128i *) (src+x)); + + r1 = _mm_load_si128((__m128i *) (src+x + 8)); + r0 = _mm_adds_epi16(r0, f0); + + r1 = _mm_adds_epi16(r1, f0); + r0 = _mm_srai_epi16(r0, 6); + r1 = _mm_srai_epi16(r1, 6); + r0 = _mm_packus_epi16(r0, r1); + + _mm_storeu_si128((__m128i *) (dst+x), r0); + } + dst += dststride; + src += srcstride; + } + }else if(!(width & 7)) /* width is a multiple of 8: one aligned load per step, low 8 result bytes stored */ + { + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 8) { + r0 = _mm_load_si128((__m128i *) (src+x)); + + r0 = _mm_adds_epi16(r0, f0); + + r0 = _mm_srai_epi16(r0, 6); + r0 = _mm_packus_epi16(r0, r0); + + _mm_storel_epi64((__m128i *) (dst+x), r0); + } + dst += dststride; + src += srcstride; + } + }else if(!(width & 3)){ /* width is a multiple of 4: 8-byte load (4 x int16) per step, low 4 result bytes stored */ + for (y = 0; y < height; y++) { + for(x = 0;x < width; x+=4){ + r0 = _mm_loadl_epi64((__m128i *) (src+x)); + r0 = _mm_adds_epi16(r0, f0); + + r0 = _mm_srai_epi16(r0, 6); + r0 = _mm_packus_epi16(r0, r0); +#if MASKMOVE + _mm_maskmoveu_si128(r0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x)); +#else + //r0 = _mm_shuffle_epi32 (r0, 0x00); + *((uint32_t*)(dst+x)) = _mm_cvtsi128_si32(r0); +#endif + } + dst += dststride; + src += srcstride; + } + }else{ /* all other widths, stepped by 2: the 8-byte load still reads 4 x int16, but only the low 2 result bytes are stored */ + for (y = 0; y < height; y++) { + for(x = 0;x < width; x+=2){ + r0 = _mm_loadl_epi64((__m128i *) (src+x)); + r0 = _mm_adds_epi16(r0, f0); + + r0 = _mm_srai_epi16(r0, 6); + r0 = _mm_packus_epi16(r0, r0); +#if MASKMOVE + _mm_maskmoveu_si128(r0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,-1),(char *) (dst+x)); +#else + 
*((uint16_t*)(dst+x)) = _mm_cvtsi128_si32(r0); +#endif + } + dst += dststride; + src += srcstride; + } + } + +} + +void ff_hevc_put_unweighted_pred_sse(uint8_t *_dst, ptrdiff_t _dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height) { /* Same rounding conversion as the _8_sse variant, but with the shift and rounding term derived from BIT_DEPTH and a single 16-wide loop for every width. */ + int x, y; + uint8_t *dst = (uint8_t*) _dst; + ptrdiff_t dststride = _dststride / sizeof(uint8_t); + __m128i r0, r1, f0; + int shift = 14 - BIT_DEPTH; /* 6 with BIT_DEPTH defined as 8 above */ +#if BIT_DEPTH < 14 + int16_t offset = 1 << (shift - 1); +#else + int16_t offset = 0; + +#endif + f0 = _mm_set1_epi16(offset); + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 16) { /* NOTE(review): each step loads 16 int16 and stores 16 bytes regardless of width, so a width that is not a multiple of 16 reads and writes past it; presumably callers guarantee padded rows - confirm */ + r0 = _mm_load_si128((__m128i *) &src[x]); + + r1 = _mm_load_si128((__m128i *) &src[x + 8]); + r0 = _mm_adds_epi16(r0, f0); + + r1 = _mm_adds_epi16(r1, f0); + r0 = _mm_srai_epi16(r0, shift); + r1 = _mm_srai_epi16(r1, shift); + r0 = _mm_packus_epi16(r0, r1); + + _mm_storeu_si128((__m128i *) &dst[x], r0); + } + dst += dststride; + src += srcstride; + } +} + +void ff_hevc_put_weighted_pred_avg_8_sse(uint8_t *_dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, + ptrdiff_t srcstride, int width, + int height) { /* The visible 16-wide path writes dst[x] = (src1[x] + 64 + src2[x]) >> 7 with saturating 16-bit adds and unsigned 8-bit saturation. NOTE(review): the text of this function is truncated further down (at "for(x=0;x= 1){"); source text between a less-than sign and the next greater-than sign is missing and must be restored from the upstream file. */ + int x, y; + uint8_t *dst = (uint8_t*) _dst; + __m128i r0, r1, f0, r2, r3; + + f0 = _mm_set1_epi16(64); + if(!(width & 15)){ + for (y = 0; y < height; y++) { + + for (x = 0; x < width; x += 16) { + r0 = _mm_load_si128((__m128i *) &src1[x]); + r1 = _mm_load_si128((__m128i *) &src1[x + 8]); + r2 = _mm_load_si128((__m128i *) &src2[x]); + r3 = _mm_load_si128((__m128i *) &src2[x + 8]); + + r0 = _mm_adds_epi16(r0, f0); + r1 = _mm_adds_epi16(r1, f0); + r0 = _mm_adds_epi16(r0, r2); + r1 = _mm_adds_epi16(r1, r3); + r0 = _mm_srai_epi16(r0, 7); + r1 = _mm_srai_epi16(r1, 7); + r0 = _mm_packus_epi16(r0, r1); + + _mm_storeu_si128((__m128i *) (dst + x), r0); + } + dst += dststride; + src1 += srcstride; + src2 += srcstride; + } + }else if(!(width & 7)){ + for (y = 0; y < height; y++) { + for(x=0;x= 1){ + if(!(width & 15)){ + for (y = 0; y < height; y++) { + for 
(x = 0; x < width; x += 16) { + x0 = _mm_load_si128((__m128i *) &src[x]); + x2 = _mm_load_si128((__m128i *) &src[x + 8]); + x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0), + _mm_mulhi_epi16(x0, c0)); + x3 = _mm_unpackhi_epi16(_mm_mullo_epi16(x2, c0), + _mm_mulhi_epi16(x2, c0)); + x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0), + _mm_mulhi_epi16(x0, c0)); + x2 = _mm_unpacklo_epi16(_mm_mullo_epi16(x2, c0), + _mm_mulhi_epi16(x2, c0)); + x0 = _mm_add_epi32(x0, add2); + x1 = _mm_add_epi32(x1, add2); + x2 = _mm_add_epi32(x2, add2); + x3 = _mm_add_epi32(x3, add2); + x0 = _mm_srai_epi32(x0, log2Wd); + x1 = _mm_srai_epi32(x1, log2Wd); + x2 = _mm_srai_epi32(x2, log2Wd); + x3 = _mm_srai_epi32(x3, log2Wd); + x0 = _mm_add_epi32(x0, add); + x1 = _mm_add_epi32(x1, add); + x2 = _mm_add_epi32(x2, add); + x3 = _mm_add_epi32(x3, add); + x0 = _mm_packus_epi32(x0, x1); + x2 = _mm_packus_epi32(x2, x3); + x0 = _mm_packus_epi16(x0, x2); + + _mm_storeu_si128((__m128i *) (dst + x), x0); + + } + dst += dststride; + src += srcstride; + } + }else if(!(width & 7)){ + for (y = 0; y < height; y++) { + for(x=0;x= 1) + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 16) { + x0 = _mm_load_si128((__m128i *) &src[x]); + x2 = _mm_load_si128((__m128i *) &src[x + 8]); + x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0), + _mm_mulhi_epi16(x0, c0)); + x3 = _mm_unpackhi_epi16(_mm_mullo_epi16(x2, c0), + _mm_mulhi_epi16(x2, c0)); + x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0), + _mm_mulhi_epi16(x0, c0)); + x2 = _mm_unpacklo_epi16(_mm_mullo_epi16(x2, c0), + _mm_mulhi_epi16(x2, c0)); + x0 = _mm_add_epi32(x0, add2); + x1 = _mm_add_epi32(x1, add2); + x2 = _mm_add_epi32(x2, add2); + x3 = _mm_add_epi32(x3, add2); + x0 = _mm_srai_epi32(x0, log2Wd); + x1 = _mm_srai_epi32(x1, log2Wd); + x2 = _mm_srai_epi32(x2, log2Wd); + x3 = _mm_srai_epi32(x3, log2Wd); + x0 = _mm_add_epi32(x0, add); + x1 = _mm_add_epi32(x1, add); + x2 = _mm_add_epi32(x2, add); + x3 = _mm_add_epi32(x3, add); + x0 = _mm_packus_epi32(x0, 
x1); + x2 = _mm_packus_epi32(x2, x3); + x0 = _mm_packus_epi16(x0, x2); + + _mm_storeu_si128((__m128i *) (dst + x), x0); + + } + dst += dststride; + src += srcstride; + } + else + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 16) { + + x0 = _mm_load_si128((__m128i *) &src[x]); + x2 = _mm_load_si128((__m128i *) &src[x + 8]); + x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0), + _mm_mulhi_epi16(x0, c0)); + x3 = _mm_unpackhi_epi16(_mm_mullo_epi16(x2, c0), + _mm_mulhi_epi16(x2, c0)); + x0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0), + _mm_mulhi_epi16(x0, c0)); + x2 = _mm_unpacklo_epi16(_mm_mullo_epi16(x2, c0), + _mm_mulhi_epi16(x2, c0)); + + x0 = _mm_add_epi32(x0, add2); + x1 = _mm_add_epi32(x1, add2); + x2 = _mm_add_epi32(x2, add2); + x3 = _mm_add_epi32(x3, add2); + + x0 = _mm_packus_epi32(x0, x1); + x2 = _mm_packus_epi32(x2, x3); + x0 = _mm_packus_epi16(x0, x2); + + _mm_storeu_si128((__m128i *) (dst + x), x0); + + } + dst += dststride; + src += srcstride; + } +} +#endif + +#if HAVE_SSE4_1 +void ff_hevc_weighted_pred_avg_8_sse4(uint8_t denom, int16_t wl0Flag, + int16_t wl1Flag, int16_t ol0Flag, int16_t ol1Flag, + uint8_t *_dst, ptrdiff_t _dststride, + const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride, + int width, int height) { + int shift, shift2; + int log2Wd; + int o0; + int o1; + int x, y; + uint8_t *dst = (uint8_t*) _dst; + ptrdiff_t dststride = _dststride / sizeof(uint8_t); + __m128i x0, x1, x2, x3, r0, r1, r2, r3, c0, c1, c2; + shift = 14 - BIT_DEPTH; + log2Wd = denom + shift; + + o0 = (ol0Flag) * (1 << (BIT_DEPTH - 8)); + o1 = (ol1Flag) * (1 << (BIT_DEPTH - 8)); + shift2 = (log2Wd + 1); + c0 = _mm_set1_epi16(wl0Flag); + c1 = _mm_set1_epi16(wl1Flag); + c2 = _mm_set1_epi32((o0 + o1 + 1) << log2Wd); + + if(!(width & 15)){ + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 16) { + x0 = _mm_load_si128((__m128i *) &src1[x]); + x1 = _mm_load_si128((__m128i *) &src1[x + 8]); + x2 = _mm_load_si128((__m128i *) &src2[x]); + x3 = 
_mm_load_si128((__m128i *) &src2[x + 8]); + + r0 = _mm_unpacklo_epi16(_mm_mullo_epi16(x0, c0), + _mm_mulhi_epi16(x0, c0)); + r1 = _mm_unpacklo_epi16(_mm_mullo_epi16(x1, c0), + _mm_mulhi_epi16(x1, c0)); + r2 = _mm_unpacklo_epi16(_mm_mullo_epi16(x2, c1), + _mm_mulhi_epi16(x2, c1)); + r3 = _mm_unpacklo_epi16(_mm_mullo_epi16(x3, c1), + _mm_mulhi_epi16(x3, c1)); + x0 = _mm_unpackhi_epi16(_mm_mullo_epi16(x0, c0), + _mm_mulhi_epi16(x0, c0)); + x1 = _mm_unpackhi_epi16(_mm_mullo_epi16(x1, c0), + _mm_mulhi_epi16(x1, c0)); + x2 = _mm_unpackhi_epi16(_mm_mullo_epi16(x2, c1), + _mm_mulhi_epi16(x2, c1)); + x3 = _mm_unpackhi_epi16(_mm_mullo_epi16(x3, c1), + _mm_mulhi_epi16(x3, c1)); + r0 = _mm_add_epi32(r0, r2); + r1 = _mm_add_epi32(r1, r3); + r2 = _mm_add_epi32(x0, x2); + r3 = _mm_add_epi32(x1, x3); + + r0 = _mm_add_epi32(r0, c2); + r1 = _mm_add_epi32(r1, c2); + r2 = _mm_add_epi32(r2, c2); + r3 = _mm_add_epi32(r3, c2); + + r0 = _mm_srai_epi32(r0, shift2); + r1 = _mm_srai_epi32(r1, shift2); + r2 = _mm_srai_epi32(r2, shift2); + r3 = _mm_srai_epi32(r3, shift2); + + r0 = _mm_packus_epi32(r0, r2); + r1 = _mm_packus_epi32(r1, r3); + r0 = _mm_packus_epi16(r0, r1); + + _mm_storeu_si128((__m128i *) (dst + x), r0); + + } + dst += dststride; + src1 += srcstride; + src2 += srcstride; + } + }else if(!(width & 7)){ + for (y = 0; y < height; y++) { + for(x=0;x>1; + if(!(width & 7)){ + //x1= _mm_setzero_si128(); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 8) { + + x2 = _mm_loadu_si128((__m128i *) &src[x]); + x2 = _mm_slli_epi16(x2, 4); //shift 14 - BIT LENGTH + _mm_store_si128((__m128i *) &dst[x], x2); + + } + src += srcstride; + dst += dststride; + } + }else if(!(width & 3)){ + //x1= _mm_setzero_si128(); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 4) { + + x2 = _mm_loadl_epi64((__m128i *) &src[x]); + x2 = _mm_slli_epi16(x2, 4); //shift 14 - BIT LENGTH + + _mm_storel_epi64((__m128i *) &dst[x], x2); + + } + src += srcstride; + dst += dststride; + } + }else{ + 
//x1= _mm_setzero_si128(); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 2) { + + x2 = _mm_loadl_epi64((__m128i *) &src[x]); + x2 = _mm_slli_epi16(x2, 4); //shift 14 - BIT LENGTH + _mm_maskmoveu_si128(x2,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x)); + } + src += srcstride; + dst += dststride; + } + } + +} +#endif + +void ff_hevc_put_hevc_epel_h_8_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *_src, ptrdiff_t _srcstride, + int width, int height, int mx, + int my, int16_t* mcbuffer, int bit_depth) { + int x, y; + const uint8_t *src = (const uint8_t*) _src; + ptrdiff_t srcstride = _srcstride; + const int8_t *filter = epel_filters[mx - 1]; + __m128i r0, bshuffle1, bshuffle2, x1, x2, x3; + int8_t filter_0 = filter[0]; + int8_t filter_1 = filter[1]; + int8_t filter_2 = filter[2]; + int8_t filter_3 = filter[3]; + r0 = _mm_set_epi8(filter_3, filter_2, filter_1, filter_0, filter_3, + filter_2, filter_1, filter_0, filter_3, filter_2, filter_1, + filter_0, filter_3, filter_2, filter_1, filter_0); + bshuffle1 = _mm_set_epi8(6, 5, 4, 3, 5, 4, 3, 2, 4, 3, 2, 1, 3, 2, 1, 0); + + + /* + printf("---IN---SSE\n"); + + int extra_top = 1; + int extra_left = 1; + int extra_right = 2; + int extra_bottom = 2; + + for (int y=-extra_top;y>1; + const int8_t *filter = epel_filters[mx - 1]; + __m128i r0, bshuffle1, bshuffle2, x1, x2, x3, r1; + int8_t filter_0 = filter[0]; + int8_t filter_1 = filter[1]; + int8_t filter_2 = filter[2]; + int8_t filter_3 = filter[3]; + r0 = _mm_set_epi16(filter_3, filter_2, filter_1, + filter_0, filter_3, filter_2, filter_1, filter_0); + bshuffle1 = _mm_set_epi8(9,8,7,6,5,4, 3, 2,7,6,5,4, 3, 2, 1, 0); + + if(!(width & 3)){ + bshuffle2 = _mm_set_epi8(13,12,11,10,9,8,7,6,11,10, 9,8,7,6,5, 4); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 4) { + + x1 = _mm_loadu_si128((__m128i *) &src[x-1]); + x2 = _mm_shuffle_epi8(x1, bshuffle1); + x3 = _mm_shuffle_epi8(x1, bshuffle2); + + + x2 = _mm_madd_epi16(x2, 
r0); + x3 = _mm_madd_epi16(x3, r0); + x2 = _mm_hadd_epi32(x2, x3); + x2= _mm_srai_epi32(x2,2); //>> (BIT_DEPTH - 8) + + x2 = _mm_packs_epi32(x2,r0); + //give results back + _mm_storel_epi64((__m128i *) &dst[x], x2); + } + src += srcstride; + dst += dststride; + } + }else{ + r1= _mm_setzero_si128(); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 2) { + /* load data in register */ + x1 = _mm_loadu_si128((__m128i *) &src[x-1]); + x2 = _mm_shuffle_epi8(x1, bshuffle1); + + /* PMADDUBSW then PMADDW */ + x2 = _mm_madd_epi16(x2, r0); + x2 = _mm_hadd_epi32(x2, r1); + x2= _mm_srai_epi32(x2,2); //>> (BIT_DEPTH - 8) + x2 = _mm_packs_epi32(x2, r1); + /* give results back */ + _mm_maskmoveu_si128(x2,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x)); + } + src += srcstride; + dst += dststride; + } + } +} +#endif + + +void ff_hevc_put_hevc_epel_v_8_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx, + int my, int16_t* mcbuffer, int bit_depth) { + int x, y; + __m128i x0, x1, x2, x3, t0, t1, t2, t3, r0, f0, f1, f2, f3, r1; + uint8_t *src = (uint8_t*) _src; + ptrdiff_t srcstride = _srcstride / sizeof(uint8_t); + const int8_t *filter = epel_filters[my - 1]; + int8_t filter_0 = filter[0]; + int8_t filter_1 = filter[1]; + int8_t filter_2 = filter[2]; + int8_t filter_3 = filter[3]; + f0 = _mm_set1_epi16(filter_0); + f1 = _mm_set1_epi16(filter_1); + f2 = _mm_set1_epi16(filter_2); + f3 = _mm_set1_epi16(filter_3); + + if(!(width & 15)){ + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 16) { + /* check if memory needs to be reloaded */ + + x0 = _mm_loadu_si128((__m128i *) &src[x - srcstride]); + x1 = _mm_loadu_si128((__m128i *) &src[x]); + x2 = _mm_loadu_si128((__m128i *) &src[x + srcstride]); + x3 = _mm_loadu_si128((__m128i *) &src[x + 2 * srcstride]); + + t0 = _mm_unpacklo_epi8(x0, _mm_setzero_si128()); + t1 = _mm_unpacklo_epi8(x1, _mm_setzero_si128()); + t2 = 
_mm_unpacklo_epi8(x2, _mm_setzero_si128()); + t3 = _mm_unpacklo_epi8(x3, _mm_setzero_si128()); + + x0 = _mm_unpackhi_epi8(x0, _mm_setzero_si128()); + x1 = _mm_unpackhi_epi8(x1, _mm_setzero_si128()); + x2 = _mm_unpackhi_epi8(x2, _mm_setzero_si128()); + x3 = _mm_unpackhi_epi8(x3, _mm_setzero_si128()); + + /* multiply by correct value : */ + r0 = _mm_mullo_epi16(t0, f0); + r1 = _mm_mullo_epi16(x0, f0); + r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t1, f1)); + r1 = _mm_adds_epi16(r1, _mm_mullo_epi16(x1, f1)); + r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t2, f2)); + r1 = _mm_adds_epi16(r1, _mm_mullo_epi16(x2, f2)); + r0 = _mm_adds_epi16(r0, _mm_mullo_epi16(t3, f3)); + r1 = _mm_adds_epi16(r1, _mm_mullo_epi16(x3, f3)); + /* give results back */ + _mm_store_si128((__m128i *) &dst[x], r0); + _mm_storeu_si128((__m128i *) &dst[x + 8], r1); + } + src += srcstride; + dst += dststride; + } + }else if(!(width & 7)){ + r1= _mm_setzero_si128(); + for (y = 0; y < height; y++) { + for(x=0;x>1; + const int8_t *filter = epel_filters[my - 1]; + int8_t filter_0 = filter[0]; + int8_t filter_1 = filter[1]; + int8_t filter_2 = filter[2]; + int8_t filter_3 = filter[3]; + f0 = _mm_set1_epi16(filter_0); + f1 = _mm_set1_epi16(filter_1); + f2 = _mm_set1_epi16(filter_2); + f3 = _mm_set1_epi16(filter_3); + + if(!(width & 7)){ + r1= _mm_setzero_si128(); + for (y = 0; y < height; y++) { + for(x=0;x> (BIT_DEPTH - 8) + t0= _mm_srai_epi32(t0,2);//>> (BIT_DEPTH - 8) + + r0= _mm_packs_epi32(r0, t0); + // give results back + _mm_storeu_si128((__m128i *) &dst[x], r0); + } + src += srcstride; + dst += dststride; + } + }else if(!(width & 3)){ + r1= _mm_setzero_si128(); + for (y = 0; y < height; y++) { + for(x=0;x> (BIT_DEPTH - 8) + + r0= _mm_packs_epi32(r0, r0); + + // give results back + _mm_storel_epi64((__m128i *) &dst[x], r0); + } + src += srcstride; + dst += dststride; + } + }else{ + r1= _mm_setzero_si128(); + for (y = 0; y < height; y++) { + for(x=0;x> (BIT_DEPTH - 8) + + r0= _mm_packs_epi32(r0, r0); + + /* 
give results back */ + _mm_maskmoveu_si128(r0,_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1),(char *) (dst+x)); + + } + src += srcstride; + dst += dststride; + } + } +} +#endif + +void ff_hevc_put_hevc_epel_hv_8_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx, + int my, int16_t* mcbuffer, int bit_depth) { + int x, y; + uint8_t *src = (uint8_t*) _src; + ptrdiff_t srcstride = _srcstride; + const int8_t *filter_h = epel_filters[mx - 1]; + const int8_t *filter_v = epel_filters[my - 1]; + __m128i r0, bshuffle1, bshuffle2, x0, x1, x2, x3, t0, t1, t2, t3, f0, f1, + f2, f3, r1, r2; + int8_t filter_0 = filter_h[0]; + int8_t filter_1 = filter_h[1]; + int8_t filter_2 = filter_h[2]; + int8_t filter_3 = filter_h[3]; + int16_t *tmp = mcbuffer; + r0 = _mm_set_epi8(filter_3, filter_2, filter_1, filter_0, filter_3, + filter_2, filter_1, filter_0, filter_3, filter_2, filter_1, + filter_0, filter_3, filter_2, filter_1, filter_0); + bshuffle1 = _mm_set_epi8(6, 5, 4, 3, 5, 4, 3, 2, 4, 3, 2, 1, 3, 2, 1, 0); + + src -= epel_extra_before * srcstride; + + f3 = _mm_set1_epi16(filter_v[3]); + f1 = _mm_set1_epi16(filter_v[1]); + f2 = _mm_set1_epi16(filter_v[2]); + f0 = _mm_set1_epi16(filter_v[0]); + + /* horizontal treatment */ + if(!(width & 7)){ + bshuffle2 = _mm_set_epi8(10, 9, 8, 7, 9, 8, 7, 6, 8, 7, 6, 5, 7, 6, 5, + 4); + for (y = 0; y < height + epel_extra; y++) { + for (x = 0; x < width; x += 8) { + + x1 = _mm_loadu_si128((__m128i *) &src[x - 1]); + x2 = _mm_shuffle_epi8(x1, bshuffle1); + x3 = _mm_shuffle_epi8(x1, bshuffle2); + + /* PMADDUBSW then PMADDW */ + x2 = _mm_maddubs_epi16(x2, r0); + x3 = _mm_maddubs_epi16(x3, r0); + x2 = _mm_hadd_epi16(x2, x3); + _mm_store_si128((__m128i *) &tmp[x], x2); + } + src += srcstride; + tmp += MAX_PB_SIZE; + } + tmp = mcbuffer + epel_extra_before * MAX_PB_SIZE; + + /* vertical treatment */ + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 8) { + /* check if memory 
needs to be reloaded */ + x0 = _mm_load_si128((__m128i *) &tmp[x - MAX_PB_SIZE]); + x1 = _mm_load_si128((__m128i *) &tmp[x]); + x2 = _mm_load_si128((__m128i *) &tmp[x + MAX_PB_SIZE]); + x3 = _mm_load_si128((__m128i *) &tmp[x + 2 * MAX_PB_SIZE]); + + r0 = _mm_mullo_epi16(x0, f0); + r1 = _mm_mulhi_epi16(x0, f0); + r2 = _mm_mullo_epi16(x1, f1); + t0 = _mm_unpacklo_epi16(r0, r1); + x0 = _mm_unpackhi_epi16(r0, r1); + r0 = _mm_mulhi_epi16(x1, f1); + r1 = _mm_mullo_epi16(x2, f2); + t1 = _mm_unpacklo_epi16(r2, r0); + x1 = _mm_unpackhi_epi16(r2, r0); + r2 = _mm_mulhi_epi16(x2, f2); + r0 = _mm_mullo_epi16(x3, f3); + t2 = _mm_unpacklo_epi16(r1, r2); + x2 = _mm_unpackhi_epi16(r1, r2); + r1 = _mm_mulhi_epi16(x3, f3); + t3 = _mm_unpacklo_epi16(r0, r1); + x3 = _mm_unpackhi_epi16(r0, r1); + + /* multiply by correct value : */ + r0 = _mm_add_epi32(t0, t1); + r1 = _mm_add_epi32(x0, x1); + r0 = _mm_add_epi32(r0, t2); + r1 = _mm_add_epi32(r1, x2); + r0 = _mm_add_epi32(r0, t3); + r1 = _mm_add_epi32(r1, x3); + r0 = _mm_srai_epi32(r0, 6); + r1 = _mm_srai_epi32(r1, 6); + + /* give results back */ + r0 = _mm_packs_epi32(r0, r1); + _mm_store_si128((__m128i *) &dst[x], r0); + } + tmp += MAX_PB_SIZE; + dst += dststride; + } + }else if(!(width & 3)){ + for (y = 0; y < height + epel_extra; y ++) { + for(x=0;x>1; + const int8_t *filter_h = epel_filters[mx - 1]; + const int8_t *filter_v = epel_filters[my - 1]; + __m128i r0, bshuffle1, bshuffle2, x0, x1, x2, x3, t0, t1, t2, t3, f0, f1, + f2, f3, r1, r2, r3; + int8_t filter_0 = filter_h[0]; + int8_t filter_1 = filter_h[1]; + int8_t filter_2 = filter_h[2]; + int8_t filter_3 = filter_h[3]; + int16_t *tmp = mcbuffer; + + r0 = _mm_set_epi16(filter_3, filter_2, filter_1, + filter_0, filter_3, filter_2, filter_1, filter_0); + bshuffle1 = _mm_set_epi8(9,8,7,6,5,4, 3, 2,7,6,5,4, 3, 2, 1, 0); + + src -= epel_extra_before * srcstride; + + f0 = _mm_set1_epi16(filter_v[0]); + f1 = _mm_set1_epi16(filter_v[1]); + f2 = _mm_set1_epi16(filter_v[2]); + f3 = 
_mm_set1_epi16(filter_v[3]); + + + /* horizontal treatment */ + if(!(width & 3)){ + bshuffle2 = _mm_set_epi8(13,12,11,10,9,8,7,6,11,10, 9,8,7,6,5, 4); + for (y = 0; y < height + epel_extra; y ++) { + for(x=0;x> (BIT_DEPTH - 8) + + x2 = _mm_packs_epi32(x2,r0); + //give results back + _mm_storel_epi64((__m128i *) &tmp[x], x2); + + } + src += srcstride; + tmp += MAX_PB_SIZE; + } + tmp = mcbuffer + epel_extra_before * MAX_PB_SIZE; + + // vertical treatment + + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 4) { + x0 = _mm_loadl_epi64((__m128i *) &tmp[x - MAX_PB_SIZE]); + x1 = _mm_loadl_epi64((__m128i *) &tmp[x]); + x2 = _mm_loadl_epi64((__m128i *) &tmp[x + MAX_PB_SIZE]); + x3 = _mm_loadl_epi64((__m128i *) &tmp[x + 2 * MAX_PB_SIZE]); + + r0 = _mm_mullo_epi16(x0, f0); + r1 = _mm_mulhi_epi16(x0, f0); + r2 = _mm_mullo_epi16(x1, f1); + t0 = _mm_unpacklo_epi16(r0, r1); + + r0 = _mm_mulhi_epi16(x1, f1); + r1 = _mm_mullo_epi16(x2, f2); + t1 = _mm_unpacklo_epi16(r2, r0); + + r2 = _mm_mulhi_epi16(x2, f2); + r0 = _mm_mullo_epi16(x3, f3); + t2 = _mm_unpacklo_epi16(r1, r2); + + r1 = _mm_mulhi_epi16(x3, f3); + t3 = _mm_unpacklo_epi16(r0, r1); + + + + r0 = _mm_add_epi32(t0, t1); + r0 = _mm_add_epi32(r0, t2); + r0 = _mm_add_epi32(r0, t3); + r0 = _mm_srai_epi32(r0, 6); + + // give results back + r0 = _mm_packs_epi32(r0, r0); + _mm_storel_epi64((__m128i *) &dst[x], r0); + } + tmp += MAX_PB_SIZE; + dst += dststride; + } + }else{ + bshuffle2=_mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,-1,-1); + r1= _mm_setzero_si128(); + for (y = 0; y < height + epel_extra; y ++) { + for(x=0;x> (BIT_DEPTH - 8) + x2 = _mm_packs_epi32(x2, r1); + /* give results back */ + _mm_maskmoveu_si128(x2,bshuffle2,(char *) (tmp+x)); + } + src += srcstride; + tmp += MAX_PB_SIZE; + } + + tmp = mcbuffer + epel_extra_before * MAX_PB_SIZE; + + /* vertical treatment */ + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 2) { + /* check if memory needs to be reloaded */ + x0 = 
_mm_loadl_epi64((__m128i *) &tmp[x - MAX_PB_SIZE]); + x1 = _mm_loadl_epi64((__m128i *) &tmp[x]); + x2 = _mm_loadl_epi64((__m128i *) &tmp[x + MAX_PB_SIZE]); + x3 = _mm_loadl_epi64((__m128i *) &tmp[x + 2 * MAX_PB_SIZE]); + + r0 = _mm_mullo_epi16(x0, f0); + t0 = _mm_mulhi_epi16(x0, f0); + + x0= _mm_unpacklo_epi16(r0,t0); + + r1 = _mm_mullo_epi16(x1, f1); + t1 = _mm_mulhi_epi16(x1, f1); + + x1= _mm_unpacklo_epi16(r1,t1); + + r2 = _mm_mullo_epi16(x2, f2); + t2 = _mm_mulhi_epi16(x2, f2); + + x2= _mm_unpacklo_epi16(r2,t2); + + r3 = _mm_mullo_epi16(x3, f3); + t3 = _mm_mulhi_epi16(x3, f3); + + x3= _mm_unpacklo_epi16(r3,t3); + + r0= _mm_add_epi32(x0,x1); + r1= _mm_add_epi32(x2,x3); + r0= _mm_add_epi32(r0,r1); + r0 = _mm_srai_epi32(r0, 6); + /* give results back */ + r0 = _mm_packs_epi32(r0, r0); + _mm_maskmoveu_si128(r0,bshuffle2,(char *) (dst+x)); + } + tmp += MAX_PB_SIZE; + dst += dststride; + } + } +} +#endif + +void ff_hevc_put_hevc_qpel_pixels_8_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *_src, ptrdiff_t _srcstride, int width, int height, + int16_t* mcbuffer) { + int x, y; + __m128i x1, x2, x3, x0; + uint8_t *src = (uint8_t*) _src; + ptrdiff_t srcstride = _srcstride; + x0= _mm_setzero_si128(); + if(!(width & 15)){ + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 16) { + + x1 = _mm_loadu_si128((__m128i *) &src[x]); + x2 = _mm_unpacklo_epi8(x1, x0); + + x3 = _mm_unpackhi_epi8(x1, x0); + + x2 = _mm_slli_epi16(x2, 6); + x3 = _mm_slli_epi16(x3, 6); + _mm_storeu_si128((__m128i *) &dst[x], x2); + _mm_storeu_si128((__m128i *) &dst[x + 8], x3); + + } + src += srcstride; + dst += dststride; + } + }else if(!(width & 7)){ + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 8) { + + x1 = _mm_loadu_si128((__m128i *) &src[x]); + x2 = _mm_unpacklo_epi8(x1, x0); + x2 = _mm_slli_epi16(x2, 6); + _mm_storeu_si128((__m128i *) &dst[x], x2); + + } + src += srcstride; + dst += dststride; + } + }else if(!(width & 3)){ + for (y = 0; y < height; y++) { + 
for(x=0;x>1; + if(!(width & 7)){ + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 8) { + + x1 = _mm_loadu_si128((__m128i *) &src[x]); + x2 = _mm_slli_epi16(x1, 4); //14-BIT DEPTH + _mm_storeu_si128((__m128i *) &dst[x], x2); + + } + src += srcstride; + dst += dststride; + } + }else if(!(width & 3)){ + for (y = 0; y < height; y++) { + for(x=0;x>1; + __m128i x0, x1, x2, x3, r0; + + r0 = _mm_set_epi16(0, 1, -5, 17, 58, -10, 4, -1); + x0= _mm_setzero_si128(); + x3= _mm_set_epi32(0,0,0,-1); + for (y = 0; y < height; y ++) { + for(x=0;x>BIT_DEPTH-8 + x1= _mm_packs_epi32(x1,x0); + // dst[x]= _mm_extract_epi16(x1,0); + _mm_maskmoveu_si128(x1,x3,(char *) (dst+x)); + } + src += srcstride; + dst += dststride; + } + +} +#endif + + +void ff_hevc_put_hevc_qpel_h_2_8_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *_src, ptrdiff_t _srcstride, int width, int height, + int16_t* mcbuffer) { + int x, y; + const uint8_t *src = _src; + ptrdiff_t srcstride = _srcstride / sizeof(uint8_t); + __m128i x1, r0, x2, x3, x4, x5; + + r0 = _mm_set_epi8(-1, 4, -11, 40, 40, -11, 4, -1, -1, 4, -11, 40, 40, -11, + 4, -1); + + /* LOAD src from memory to registers to limit memory bandwidth */ + if(!(width - 15)){ + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 8) { + /* load data in register */ + x1 = _mm_loadu_si128((__m128i *) &src[x - 3]); + x2 = _mm_unpacklo_epi64(x1, _mm_srli_si128(x1, 1)); + x3 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 2), + _mm_srli_si128(x1, 3)); + x4 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 4), + _mm_srli_si128(x1, 5)); + x5 = _mm_unpacklo_epi64(_mm_srli_si128(x1, 6), + _mm_srli_si128(x1, 7)); + + /* PMADDUBSW then PMADDW */ + x2 = _mm_maddubs_epi16(x2, r0); + x3 = _mm_maddubs_epi16(x3, r0); + x4 = _mm_maddubs_epi16(x4, r0); + x5 = _mm_maddubs_epi16(x5, r0); + x2 = _mm_hadd_epi16(x2, x3); + x4 = _mm_hadd_epi16(x4, x5); + x2 = _mm_hadd_epi16(x2, x4); + /* give results back */ + _mm_store_si128((__m128i *) &dst[x],x2); + } + src += srcstride; + 
dst += dststride; + } + + }else{ + + for (y = 0; y < height; y ++) { + for(x=0;x> 1; + __m128i x1, x2, x3, x4, x5, x6, x7, r1; + __m128i t1, t2, t3, t4, t5, t6, t7, t8; + + t7= _mm_set1_epi32(1); + t6= _mm_set1_epi32(-5); + t5= _mm_set1_epi32(17); + t4= _mm_set1_epi32(58); + t3= _mm_set1_epi32(-10); + t2= _mm_set1_epi32(4); + t1= _mm_set1_epi32(-1); + t8= _mm_setzero_si128(); + + for (y = 0; y < height; y ++) { + for(x=0;x> 1; + __m128i x1, x2, x3, x4, x5, x6, x7, x8, r0, r1, r2; + __m128i t1, t2, t3, t4, t5, t6, t7, t8; + r1 = _mm_set_epi16(-1, 4, -11, 40, 40, -11, 4, -1); + + t1= _mm_set1_epi32(-1); + t2= _mm_set1_epi32(4); + t3= _mm_set1_epi32(-11); + t4= _mm_set1_epi32(40); + t5= _mm_set1_epi32(40); + t6= _mm_set1_epi32(-11); + t7= _mm_set1_epi32(4); + t8= _mm_set1_epi32(-1); + + { + x = 0; + r0 = _mm_setzero_si128(); + for (y = 0; y < height; y ++) { + for(x=0;x> 1; + __m128i x1, x2, x3, x4, x5, x6, x7, r0; + __m128i t1, t2, t3, t4, t5, t6, t7, t8; + + t7 = _mm_set1_epi32(-1); + t6 = _mm_set1_epi32(4); + t5 = _mm_set1_epi32(-10); + t4 = _mm_set1_epi32(58); + t3 = _mm_set1_epi32(17); + t2 = _mm_set1_epi32(-5); + t1 = _mm_set1_epi32(1); + t8= _mm_setzero_si128(); + { + + for (y = 0; y < height; y ++) { + for(x=0;x + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . 
+ */ + +#ifndef SSE_MOTION_H +#define SSE_MOTION_H + +#include +#include + /* NOTE(review): both #include directives above have lost their header names in this copy; the prototypes below use ptrdiff_t and the fixed-width integer types, so presumably stddef.h and stdint.h - confirm against upstream. */ + /* Prototypes of the SSE motion-compensation entry points. This one takes one int16_t source (with srcstride) and writes a uint8_t destination (with dststride) over a width x height area. */ +void ff_hevc_put_unweighted_pred_8_sse(uint8_t *_dst, ptrdiff_t dststride, + const int16_t *src, ptrdiff_t srcstride, + int width, int height); + /* Same destination type as above, but takes two int16_t sources that share a single srcstride. */ +void ff_hevc_put_weighted_pred_avg_8_sse(uint8_t *_dst, ptrdiff_t dststride, + const int16_t *src1, const int16_t *src2, + ptrdiff_t srcstride, int width, + int height); + /* EPEL family: uint8_t source in, int16_t destination out. The implementations index epel_filters with mx - 1 and my - 1, and the hv variant uses mcbuffer as its intermediate buffer. NOTE(review): the pixels variant is declared without the trailing bit_depth parameter the other three take - confirm this matches the function-pointer type it is assigned to. */ +void ff_hevc_put_hevc_epel_pixels_8_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *_src, ptrdiff_t srcstride, + int width, int height, + int mx, int my, int16_t* mcbuffer); +void ff_hevc_put_hevc_epel_h_8_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *_src, ptrdiff_t srcstride, + int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth); +void ff_hevc_put_hevc_epel_v_8_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *_src, ptrdiff_t srcstride, + int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth); +void ff_hevc_put_hevc_epel_hv_8_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *_src, ptrdiff_t srcstride, + int width, int height, + int mx, int my, int16_t* mcbuffer, int bit_depth); + /* QPEL family: one entry point per routine name (pixels, v_N, h_N, h_N_v_M); all share one signature and take no mx/my arguments. */ +void ff_hevc_put_hevc_qpel_pixels_8_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int width, int height, int16_t* mcbuffer); +void ff_hevc_put_hevc_qpel_v_1_8_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int width, int height, int16_t* mcbuffer); +void ff_hevc_put_hevc_qpel_v_2_8_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int width, int height, int16_t* mcbuffer); +void ff_hevc_put_hevc_qpel_v_3_8_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int width, int height, int16_t* mcbuffer); +void ff_hevc_put_hevc_qpel_h_1_8_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int width, int height, int16_t* mcbuffer); +void
ff_hevc_put_hevc_qpel_h_1_v_1_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int width, int height, int16_t* mcbuffer); +void ff_hevc_put_hevc_qpel_h_1_v_2_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int width, int height, int16_t* mcbuffer); +void ff_hevc_put_hevc_qpel_h_1_v_3_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int width, int height, int16_t* mcbuffer); +void ff_hevc_put_hevc_qpel_h_2_8_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int width, int height, int16_t* mcbuffer); +void ff_hevc_put_hevc_qpel_h_2_v_1_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int width, int height, int16_t* mcbuffer); +void ff_hevc_put_hevc_qpel_h_2_v_2_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int width, int height, int16_t* mcbuffer); +void ff_hevc_put_hevc_qpel_h_2_v_3_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int width, int height, int16_t* mcbuffer); +void ff_hevc_put_hevc_qpel_h_3_8_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int width, int height, int16_t* mcbuffer); +void ff_hevc_put_hevc_qpel_h_3_v_1_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int width, int height, int16_t* mcbuffer); +void ff_hevc_put_hevc_qpel_h_3_v_2_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int width, int height, int16_t* mcbuffer); +void ff_hevc_put_hevc_qpel_h_3_v_3_sse(int16_t *dst, ptrdiff_t dststride, + const uint8_t *src, ptrdiff_t srcstride, + int width, int height, int16_t* mcbuffer); + /* closes the SSE_MOTION_H include guard */ +#endif diff --git a/x86/sse.cc b/x86/sse.cc new file mode 100644 index 0000000..2ee0f8f --- /dev/null +++ b/x86/sse.cc @@ -0,0 +1,104 @@ +/* + * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265. + * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#ifdef _MSC_VER +#include /* NOTE(review): header name lost in this copy; the __cpuid call below is used only on the _MSC_VER path, so presumably intrin.h - confirm */ +#endif + +#include "x86/sse.h" +#include "x86/sse-motion.h" +#include "x86/sse-dct.h" + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#ifdef __GNUC__ +#include /* NOTE(review): header name lost in this copy; the __get_cpuid call below presumably comes from cpuid.h - confirm */ +#endif + /* Install the SSE4.1 routines into *accel. Fields are written only when the build defines HAVE_SSE4_1 to a nonzero value AND CPUID reports SSE4.1 at run time; otherwise accel is left exactly as the caller passed it. Returns nothing. */ +void init_acceleration_functions_sse(struct acceleration_functions* accel) +{ + uint32_t ecx=0,edx=0; + /* Query CPUID leaf 1; its ECX and EDX words are the ones tested below. */ +#ifdef _MSC_VER + uint32_t regs[4]; + int a = 1; + + __cpuid((int *)regs, (int)a); + + ecx = regs[2]; + edx = regs[3]; +#else + uint32_t eax,ebx; + __get_cpuid(1, &eax,&ebx,&ecx,&edx); /* return value is not checked; ecx/edx were preset to 0 above */ +#endif + + // printf("CPUID EAX=1 -> ECX=%x EDX=%x\n", regs[2], regs[3]); + + //int have_MMX = !!(edx & (1<<23)); + int have_SSE = !!(edx & (1<<25)); /* EDX bit 25 */ + int have_SSE4_1 = !!(ecx & (1<<19)); /* ECX bit 19 */ + + // printf("MMX:%d SSE:%d SSE4_1:%d\n",have_MMX,have_SSE,have_SSE4_1); + + if (have_SSE) { /* empty: nothing is installed for plain SSE */ + } + +#if HAVE_SSE4_1 + if (have_SSE4_1) { /* both the compile-time macro and the run-time flag are required */ + accel->put_unweighted_pred_8 = ff_hevc_put_unweighted_pred_8_sse; + accel->put_weighted_pred_avg_8 = ff_hevc_put_weighted_pred_avg_8_sse; + /* EPEL entry points: pixels, h, v and hv variants */ + accel->put_hevc_epel_8 = ff_hevc_put_hevc_epel_pixels_8_sse; + accel->put_hevc_epel_h_8 = ff_hevc_put_hevc_epel_h_8_sse; + accel->put_hevc_epel_v_8 = ff_hevc_put_hevc_epel_v_8_sse; + accel->put_hevc_epel_hv_8 = ff_hevc_put_hevc_epel_hv_8_sse; + /* QPEL table: the first index follows the h_N part of the routine name and the second the v_N part; [0][0] is the qpel_pixels routine */ + accel->put_hevc_qpel_8[0][0] =
ff_hevc_put_hevc_qpel_pixels_8_sse; + accel->put_hevc_qpel_8[0][1] = ff_hevc_put_hevc_qpel_v_1_8_sse; + accel->put_hevc_qpel_8[0][2] = ff_hevc_put_hevc_qpel_v_2_8_sse; + accel->put_hevc_qpel_8[0][3] = ff_hevc_put_hevc_qpel_v_3_8_sse; + accel->put_hevc_qpel_8[1][0] = ff_hevc_put_hevc_qpel_h_1_8_sse; + accel->put_hevc_qpel_8[1][1] = ff_hevc_put_hevc_qpel_h_1_v_1_sse; + accel->put_hevc_qpel_8[1][2] = ff_hevc_put_hevc_qpel_h_1_v_2_sse; + accel->put_hevc_qpel_8[1][3] = ff_hevc_put_hevc_qpel_h_1_v_3_sse; + accel->put_hevc_qpel_8[2][0] = ff_hevc_put_hevc_qpel_h_2_8_sse; + accel->put_hevc_qpel_8[2][1] = ff_hevc_put_hevc_qpel_h_2_v_1_sse; + accel->put_hevc_qpel_8[2][2] = ff_hevc_put_hevc_qpel_h_2_v_2_sse; + accel->put_hevc_qpel_8[2][3] = ff_hevc_put_hevc_qpel_h_2_v_3_sse; + accel->put_hevc_qpel_8[3][0] = ff_hevc_put_hevc_qpel_h_3_8_sse; + accel->put_hevc_qpel_8[3][1] = ff_hevc_put_hevc_qpel_h_3_v_1_sse; + accel->put_hevc_qpel_8[3][2] = ff_hevc_put_hevc_qpel_h_3_v_2_sse; + accel->put_hevc_qpel_8[3][3] = ff_hevc_put_hevc_qpel_h_3_v_3_sse; + /* transform routines */ + accel->transform_skip_8 = ff_hevc_transform_skip_8_sse; + + // actually, for these two functions, the scalar fallback seems to be faster than the SSE code + //accel->transform_4x4_luma_add_8 = ff_hevc_transform_4x4_luma_add_8_sse4; // SSE-4 only TODO + //accel->transform_4x4_add_8 = ff_hevc_transform_4x4_add_8_sse4; + + accel->transform_add_8[1] = ff_hevc_transform_8x8_add_8_sse4; + accel->transform_add_8[2] = ff_hevc_transform_16x16_add_8_sse4; + accel->transform_add_8[3] = ff_hevc_transform_32x32_add_8_sse4; + } +#endif +} + + diff --git a/x86/sse.h b/x86/sse.h new file mode 100644 index 0000000..d4663d0 --- /dev/null +++ b/x86/sse.h @@ -0,0 +1,28 @@ +/* + * H.265 video codec. + * Copyright (c) 2013-2014 struktur AG, Dirk Farin + * + * This file is part of libde265.
+ * + * libde265 is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * libde265 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with libde265. If not, see . + */ + +#ifndef DE265_SSE_H +#define DE265_SSE_H + +#include "acceleration.h" + /* Installs the SSE4.1 routines into *accel. The definition in x86/sse.cc writes fields only when HAVE_SSE4_1 is defined nonzero at build time and CPUID reports SSE4.1; otherwise accel is left unchanged. */ +void init_acceleration_functions_sse(struct acceleration_functions* accel); + +#endif